mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-23 18:49:08 +08:00
Fixed bidi handling in PDF output
This commit is contained in:
parent
f927728169
commit
d9699c4099
@ -60,15 +60,118 @@ long dist2(int x1, int y1, int x2, int y2) {
|
||||
return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
|
||||
}
|
||||
|
||||
// Viewers like evince can get really confused during copy-paste when
|
||||
// the baseline wanders around. So I've decided to project every word
|
||||
// onto the (straight) line baseline. All numbers are in the native
|
||||
// PDF coordinate system, which has the origin in the bottom left and
|
||||
// the unit is points, which is 1/72 inch. Tesseract reports baselines
|
||||
// left-to-right no matter what the reading order is. We need the
|
||||
// word baseline in reading order, so we do that conversion here. Returns
|
||||
// the word's baseline origin and length.
|
||||
void GetWordBaseline(int writing_direction, int ppi, int height,
|
||||
int word_x1, int word_y1, int word_x2, int word_y2,
|
||||
int line_x1, int line_y1, int line_x2, int line_y2,
|
||||
double *x0, double *y0, double *length) {
|
||||
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
|
||||
Swap(&word_x1, &word_x2);
|
||||
Swap(&word_y1, &word_y2);
|
||||
}
|
||||
double word_length;
|
||||
double x, y;
|
||||
{
|
||||
int px = word_x1;
|
||||
int py = word_y1;
|
||||
double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
|
||||
if (l2 == 0) {
|
||||
x = line_x1;
|
||||
y = line_y1;
|
||||
} else {
|
||||
double t = ((px - line_x2) * (line_x2 - line_x1) +
|
||||
(py - line_y2) * (line_y2 - line_y1)) / l2;
|
||||
x = line_x2 + t * (line_x2 - line_x1);
|
||||
y = line_y2 + t * (line_y2 - line_y1);
|
||||
}
|
||||
word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
|
||||
word_x2, word_y2)));
|
||||
word_length = word_length * 72.0 / ppi;
|
||||
x = x * 72 / ppi;
|
||||
y = height - (y * 72.0 / ppi);
|
||||
}
|
||||
*x0 = x;
|
||||
*y0 = y;
|
||||
*length = word_length;
|
||||
}
|
||||
|
||||
// Compute coefficients for an affine matrix describing the rotation
|
||||
// of the text. If the text is right-to-left such as Arabic or Hebrew,
|
||||
// we reflect over the Y-axis. This matrix will set the coordinate
|
||||
// system for placing text in the PDF file.
|
||||
//
|
||||
// RTL
|
||||
// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
|
||||
// [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ]
|
||||
void AffineMatrix(int writing_direction,
|
||||
int line_x1, int line_y1, int line_x2, int line_y2,
|
||||
double *a, double *b, double *c, double *d) {
|
||||
double theta = atan2(static_cast<double>(line_y1 - line_y2),
|
||||
static_cast<double>(line_x2 - line_x1));
|
||||
*a = cos(theta);
|
||||
*b = sin(theta);
|
||||
*c = -sin(theta);
|
||||
*d = cos(theta);
|
||||
switch(writing_direction) {
|
||||
case WRITING_DIRECTION_RIGHT_TO_LEFT:
|
||||
*a = -*a;
|
||||
*b = -*b;
|
||||
break;
|
||||
case WRITING_DIRECTION_TOP_TO_BOTTOM:
|
||||
// TODO(jbreiden) Consider using the vertical PDF writing mode.
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// There are some really stupid PDF viewers in the wild, such as
// 'Preview' which ships with the Mac. They do a better job with text
// selection and highlighting when given a perfectly flat baseline
// instead of a very slightly tilted one. We clip small tilts to appease
// these viewers. The threshold is large enough to absorb noise, but
// small enough that lines probably won't cross each other if the whole
// page is tilted at almost exactly the clipping threshold.
//
// Copies the baseline (x1, y1)-(x2, y2) to the line_* outputs. When the
// baseline rises less than 2 points while running more than 2 points,
// both output y values are replaced by the midpoint (y1 + y2) / 2.
// ppi is the divisor that turns the pixel inputs into points (1/72 inch).
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
                  int *line_x1, int *line_y1,
                  int *line_x2, int *line_y2) {
  *line_x1 = x1;
  *line_y1 = y1;
  *line_x2 = x2;
  *line_y2 = y2;

  // Compare against 2 points without dividing: "pixels * 72 / ppi < 2"
  // is the same as "pixels * 72 < 2 * ppi". The original expression
  // divided in integer arithmetic, truncating the run before comparing
  // it to 2.0 (so runs between 2 and 3 points were never clipped), and
  // it divided by zero when ppi was 0.
  const double rise = 72.0 * abs(y2 - y1);
  const double run = 72.0 * abs(x2 - x1);
  const double two_points = 2.0 * ppi;
  if (rise < two_points && two_points < run) {
    *line_y1 = *line_y2 = (y1 + y2) / 2;
  }
}
|
||||
|
||||
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
||||
double width, double height) {
|
||||
double ppi = api->GetSourceYResolution();
|
||||
STRING pdf_str("");
|
||||
double old_x = 0.0, old_y = 0.0;
|
||||
int old_pointsize = 0;
|
||||
double ppi = api->GetSourceYResolution();
|
||||
|
||||
// TODO(jbreiden) Slightly cleaner from an abstraction standpoint
|
||||
// if this were to live inside a separate text object.
|
||||
// These initial conditions are all arbitrary and will be overwritten
|
||||
double old_x = 0.0, old_y = 0.0;
|
||||
int old_fontsize = 0;
|
||||
tesseract::WritingDirection old_writing_direction =
|
||||
WRITING_DIRECTION_LEFT_TO_RIGHT;
|
||||
bool new_block = true;
|
||||
int fontsize = 0;
|
||||
double a = 1;
|
||||
double b = 0;
|
||||
double c = 0;
|
||||
double d = 1;
|
||||
|
||||
// TODO(jbreiden) This marries the text and image together.
|
||||
// Slightly cleaner from an abstraction standpoint if this were to
|
||||
// live inside a separate text object.
|
||||
pdf_str += "q ";
|
||||
pdf_str.add_str_double("", prec(width));
|
||||
pdf_str += " 0 0 ";
|
||||
@ -76,28 +179,18 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
||||
pdf_str += " 0 0 cm /Im1 Do Q\n";
|
||||
|
||||
ResultIterator *res_it = api->GetIterator();
|
||||
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink
|
||||
old_pointsize = 0.0; // Every block will declare its font
|
||||
pdf_str += "BT\n3 Tr"; // Begin text object, use invisible ink
|
||||
old_fontsize = 0; // Every block will declare its fontsize
|
||||
new_block = true; // Every block will declare its affine matrix
|
||||
}
|
||||
|
||||
int line_x1, line_y1, line_x2, line_y2;
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
res_it->Baseline(RIL_TEXTLINE,
|
||||
&line_x1, &line_y1, &line_x2, &line_y2);
|
||||
double rise = abs(line_y2 - line_y1) * 72 / ppi;
|
||||
double run = abs(line_x2 - line_x1) * 72 / ppi;
|
||||
// There are some really stupid PDF viewers in the wild, such as
|
||||
// 'Preview' which ships with the Mac. They might do a better
|
||||
// job with text selection and highlighting when given perfectly
|
||||
// straight text instead of very slightly tilted text. I chose
|
||||
// this threshold large enough to absorb noise, but small enough
|
||||
// that lines probably won't cross each other if the whole page
|
||||
// is tilted at almost exactly the clipping threshold.
|
||||
if (rise < 2.0 && 2.0 < run)
|
||||
line_y1 = line_y2 = (line_y1 + line_y2) / 2;
|
||||
int x1, y1, x2, y2;
|
||||
res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
|
||||
ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
|
||||
}
|
||||
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
@ -105,120 +198,78 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
||||
continue;
|
||||
}
|
||||
|
||||
int word_x1, word_y1, word_x2, word_y2;
|
||||
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
|
||||
|
||||
// The critical one is writing_direction
|
||||
tesseract::Orientation orientation;
|
||||
// Writing direction changes at a per-word granularity
|
||||
tesseract::WritingDirection writing_direction;
|
||||
tesseract::TextlineOrder textline_order;
|
||||
float deskew_angle;
|
||||
res_it->Orientation(&orientation, &writing_direction,
|
||||
&textline_order, &deskew_angle);
|
||||
|
||||
// Unlike Tesseract, we always want the word baseline in reading order.
|
||||
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
|
||||
Swap(&word_x1, &word_x2);
|
||||
Swap(&word_y1, &word_y2);
|
||||
}
|
||||
|
||||
// Viewers like evince can get really confused during copy-paste
|
||||
// when the baseline wanders around. I've decided to force every
|
||||
// word to match the (straight) baseline. The math below is just
|
||||
// projecting the word origin onto the baseline. All numbers are
|
||||
// in the native PDF coordinate system, which has the origin in
|
||||
// the bottom left and the unit is points, which is 1/72 inch.
|
||||
double word_length;
|
||||
double x, y;
|
||||
{
|
||||
int px = word_x1;
|
||||
int py = word_y1;
|
||||
double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
|
||||
if (l2 == 0) {
|
||||
x = line_x1;
|
||||
y = line_y1;
|
||||
} else {
|
||||
double t = ((px - line_x2) * (line_x2 - line_x1) +
|
||||
(py - line_y2) * (line_y2 - line_y1)) / l2;
|
||||
x = line_x2 + t * (line_x2 - line_x1);
|
||||
y = line_y2 + t * (line_y2 - line_y1);
|
||||
tesseract::Orientation orientation;
|
||||
tesseract::TextlineOrder textline_order;
|
||||
float deskew_angle;
|
||||
res_it->Orientation(&orientation, &writing_direction,
|
||||
&textline_order, &deskew_angle);
|
||||
if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
|
||||
switch (res_it->WordDirection()) {
|
||||
case DIR_LEFT_TO_RIGHT:
|
||||
writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
|
||||
break;
|
||||
case DIR_RIGHT_TO_LEFT:
|
||||
writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
|
||||
break;
|
||||
default:
|
||||
writing_direction = old_writing_direction;
|
||||
}
|
||||
}
|
||||
word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
|
||||
word_x2, word_y2)));
|
||||
word_length = word_length * 72.0 / ppi;
|
||||
x = x * 72 / ppi;
|
||||
y = height - (y * 72.0 / ppi);
|
||||
}
|
||||
|
||||
int pointsize = 0;
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
// Calculate the rotation angle in the PDF coordinate system,
|
||||
// which has the origin in the bottom left. The Tesseract
|
||||
// coordinate system has the origin in the upper left.
|
||||
//
|
||||
// PDF is kind of like turtle graphics, and we orient the
|
||||
// turtle (errr... initial cursor position) with an affine
|
||||
// transformation.
|
||||
//
|
||||
// Rotate RTL Translate
|
||||
//
|
||||
// [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ]
|
||||
// [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ]
|
||||
// [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ]
|
||||
//
|
||||
double theta = atan2(static_cast<double>(line_y1 - line_y2),
|
||||
static_cast<double>(line_x2 - line_x1));
|
||||
double a, b, c, d;
|
||||
a = cos(theta);
|
||||
b = sin(theta);
|
||||
c = -sin(theta);
|
||||
d = cos(theta);
|
||||
switch(writing_direction) {
|
||||
case WRITING_DIRECTION_RIGHT_TO_LEFT:
|
||||
a = -a;
|
||||
b = -b;
|
||||
c = -c;
|
||||
break;
|
||||
case WRITING_DIRECTION_TOP_TO_BOTTOM:
|
||||
// TODO(jbreiden) Consider switching PDF writing mode to vertical.
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// Where is word origin and how long is it?
|
||||
double x, y, word_length;
|
||||
{
|
||||
int word_x1, word_y1, word_x2, word_y2;
|
||||
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
|
||||
GetWordBaseline(writing_direction, ppi, height,
|
||||
word_x1, word_y1, word_x2, word_y2,
|
||||
line_x1, line_y1, line_x2, line_y2,
|
||||
&x, &y, &word_length);
|
||||
}
|
||||
|
||||
pdf_str.add_str_double("", prec(a)); // . This affine matrix
|
||||
if (writing_direction != old_writing_direction || new_block) {
|
||||
AffineMatrix(writing_direction,
|
||||
line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
|
||||
pdf_str.add_str_double(" ", prec(a)); // . This affine matrix
|
||||
pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate
|
||||
pdf_str.add_str_double(" ", prec(c)); // . system for all
|
||||
pdf_str.add_str_double(" ", prec(d)); // . text in the entire
|
||||
pdf_str.add_str_double(" ", prec(x)); // . line.
|
||||
pdf_str.add_str_double(" ", prec(d)); // . text that follows.
|
||||
pdf_str.add_str_double(" ", prec(x)); // .
|
||||
pdf_str.add_str_double(" ", prec(y)); // .
|
||||
pdf_str += (" Tm "); // Place cursor absolutely
|
||||
new_block = false;
|
||||
} else {
|
||||
double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y)));
|
||||
pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts
|
||||
pdf_str.add_str_double(" ", 0); // Delta y in pts
|
||||
pdf_str += (" Td "); // Relative moveto
|
||||
double dx = x - old_x;
|
||||
double dy = y - old_y;
|
||||
pdf_str.add_str_double(" ", prec(dx * a + dy * b));
|
||||
pdf_str.add_str_double(" ", prec(dx * c + dy * d));
|
||||
pdf_str += (" Td "); // Relative moveto
|
||||
}
|
||||
old_x = x;
|
||||
old_y = y;
|
||||
old_writing_direction = writing_direction;
|
||||
|
||||
// Adjust font size on a per word granularity. Pay attention to
|
||||
// pointsize, old_pointsize, and pdf_str. We've found that
|
||||
// in Arabic, Tesseract will happily return a pointsize of zero,
|
||||
// fontsize, old_fontsize, and pdf_str. We've found that
|
||||
// in Arabic, Tesseract will happily return a fontsize of zero,
|
||||
// so we make up a default number to protect ourselves.
|
||||
{
|
||||
bool bold, italic, underlined, monospace, serif, smallcaps;
|
||||
int font_id;
|
||||
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
|
||||
&serif, &smallcaps, &pointsize, &font_id);
|
||||
const int kDefaultPointSize = 8;
|
||||
if (pointsize <= 0)
|
||||
pointsize = kDefaultPointSize;
|
||||
if (pointsize != old_pointsize) {
|
||||
&serif, &smallcaps, &fontsize, &font_id);
|
||||
const int kDefaultFontsize = 8;
|
||||
if (fontsize <= 0)
|
||||
fontsize = kDefaultFontsize;
|
||||
if (fontsize != old_fontsize) {
|
||||
char textfont[20];
|
||||
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
|
||||
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
|
||||
pdf_str += textfont;
|
||||
old_pointsize = pointsize;
|
||||
old_fontsize = fontsize;
|
||||
}
|
||||
}
|
||||
|
||||
@ -243,9 +294,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
||||
delete []grapheme;
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
|
||||
if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
|
||||
double h_stretch =
|
||||
kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len));
|
||||
kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
|
||||
pdf_str.add_str_double("", h_stretch);
|
||||
pdf_str += " Tz"; // horizontal stretch
|
||||
pdf_str += " [ ";
|
||||
@ -449,7 +500,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
|
||||
|
||||
L_COMP_DATA *cid = NULL;
|
||||
const int kJpegQuality = 85;
|
||||
l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
|
||||
|
||||
// TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain
|
||||
// types of PNG files, especially if there are 2 samples per pixel.
|
||||
// We can get rid of this logic after Leptonica 1.72 is released and
|
||||
@ -747,5 +798,4 @@ bool TessPDFRenderer::EndDocumentHandler() {
|
||||
AppendString(buf);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -195,7 +195,7 @@ private:
|
||||
double width, double height);
|
||||
// Turn an image into a PDF object. Only transcode if we have to.
|
||||
static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
|
||||
char **pdf_object, long int *pdf_object_size);
|
||||
char **pdf_object, long int *pdf_object_size);
|
||||
};
|
||||
|
||||
|
||||
|
BIN
tessdata/pdf.ttf
BIN
tessdata/pdf.ttf
Binary file not shown.
1928
tessdata/pdf.ttx
1928
tessdata/pdf.ttx
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user