diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index 04f7fa93..55232515 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -60,15 +60,118 @@ long dist2(int x1, int y1, int x2, int y2) { return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1); } +// Viewers like evince can get really confused during copy-paste when +// the baseline wanders around. So I've decided to project every word +// onto the (straight) line baseline. All numbers are in the native +// PDF coordinate system, which has the origin in the bottom left and +// the unit is points, which is 1/72 inch. Tesseract reports baselines +// left-to-right no matter what the reading order is. We need the +// word baseline in reading order, so we do that conversion here. Returns +// the word's baseline origin and length. +void GetWordBaseline(int writing_direction, int ppi, int height, + int word_x1, int word_y1, int word_x2, int word_y2, + int line_x1, int line_y1, int line_x2, int line_y2, + double *x0, double *y0, double *length) { + if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) { + Swap(&word_x1, &word_x2); + Swap(&word_y1, &word_y2); + } + double word_length; + double x, y; + { + int px = word_x1; + int py = word_y1; + double l2 = dist2(line_x1, line_y1, line_x2, line_y2); + if (l2 == 0) { + x = line_x1; + y = line_y1; + } else { + double t = ((px - line_x2) * (line_x2 - line_x1) + + (py - line_y2) * (line_y2 - line_y1)) / l2; + x = line_x2 + t * (line_x2 - line_x1); + y = line_y2 + t * (line_y2 - line_y1); + } + word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, + word_x2, word_y2))); + word_length = word_length * 72.0 / ppi; + x = x * 72 / ppi; + y = height - (y * 72.0 / ppi); + } + *x0 = x; + *y0 = y; + *length = word_length; +} + +// Compute coefficients for an affine matrix describing the rotation +// of the text. If the text is right-to-left such as Arabic or Hebrew, +// we reflect over the Y-axis.
This matrix will set the coordinate +// system for placing text in the PDF file. +// +// RTL +// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ] +// [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ] +void AffineMatrix(int writing_direction, + int line_x1, int line_y1, int line_x2, int line_y2, + double *a, double *b, double *c, double *d) { + double theta = atan2(static_cast<double>(line_y1 - line_y2), + static_cast<double>(line_x2 - line_x1)); + *a = cos(theta); + *b = sin(theta); + *c = -sin(theta); + *d = cos(theta); + switch(writing_direction) { + case WRITING_DIRECTION_RIGHT_TO_LEFT: + *a = -*a; + *b = -*b; + break; + case WRITING_DIRECTION_TOP_TO_BOTTOM: + // TODO(jbreiden) Consider using the vertical PDF writing mode. + break; + default: + break; + } +} + +// There are some really stupid PDF viewers in the wild, such as +// 'Preview' which ships with the Mac. They do a better job with text +// selection and highlighting when given perfectly flat baseline +// instead of very slightly tilted. We clip small tilts to appease +// these viewers. I chose this threshold large enough to absorb noise, +// but small enough that lines probably won't cross each other if the +// whole page is tilted at almost exactly the clipping threshold. +void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, + int *line_x1, int *line_y1, + int *line_x2, int *line_y2) { + *line_x1 = x1; + *line_y1 = y1; + *line_x2 = x2; + *line_y2 = y2; + double rise = abs(y2 - y1) * 72 / ppi; + double run = abs(x2 - x1) * 72 / ppi; + if (rise < 2.0 && 2.0 < run) + *line_y1 = *line_y2 = (y1 + y2) / 2; +} + char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, double width, double height) { - double ppi = api->GetSourceYResolution(); STRING pdf_str(""); - double old_x = 0.0, old_y = 0.0; - int old_pointsize = 0; + double ppi = api->GetSourceYResolution(); - // TODO(jbreiden) Slightly cleaner from an abstraction standpoint - // if this were to live inside a separate text object.
+ // These initial conditions are all arbitrary and will be overwritten + double old_x = 0.0, old_y = 0.0; + int old_fontsize = 0; + tesseract::WritingDirection old_writing_direction = + WRITING_DIRECTION_LEFT_TO_RIGHT; + bool new_block = true; + int fontsize = 0; + double a = 1; + double b = 0; + double c = 0; + double d = 1; + + // TODO(jbreiden) This marries the text and image together. + // Slightly cleaner from an abstraction standpoint if this were to + // live inside a separate text object. pdf_str += "q "; pdf_str.add_str_double("", prec(width)); pdf_str += " 0 0 "; @@ -76,28 +179,18 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, pdf_str += " 0 0 cm /Im1 Do Q\n"; ResultIterator *res_it = api->GetIterator(); - while (!res_it->Empty(RIL_BLOCK)) { if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink - old_pointsize = 0.0; // Every block will declare its font + pdf_str += "BT\n3 Tr"; // Begin text object, use invisible ink + old_fontsize = 0; // Every block will declare its fontsize + new_block = true; // Every block will declare its affine matrix } int line_x1, line_y1, line_x2, line_y2; if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { - res_it->Baseline(RIL_TEXTLINE, - &line_x1, &line_y1, &line_x2, &line_y2); - double rise = abs(line_y2 - line_y1) * 72 / ppi; - double run = abs(line_x2 - line_x1) * 72 / ppi; - // There are some really stupid PDF viewers in the wild, such as - // 'Preview' which ships with the Mac. They might do a better - // job with text selection and highlighting when given perfectly - // straight text instead of very slightly tilted text. I chose - // this threshold large enough to absorb noise, but small enough - // that lines probably won't cross each other if the whole page - // is tilted at almost exactly the clipping threshold. 
- if (rise < 2.0 && 2.0 < run) - line_y1 = line_y2 = (line_y1 + line_y2) / 2; + int x1, y1, x2, y2; + res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); + ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2); } if (res_it->Empty(RIL_WORD)) { @@ -105,120 +198,78 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, continue; } - int word_x1, word_y1, word_x2, word_y2; - res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); - - // The critical one is writing_direction - tesseract::Orientation orientation; + // Writing direction changes at a per-word granularity tesseract::WritingDirection writing_direction; - tesseract::TextlineOrder textline_order; - float deskew_angle; - res_it->Orientation(&orientation, &writing_direction, - &textline_order, &deskew_angle); - - // Unlike Tesseract, we always want the word baseline in reading order. - if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) { - Swap(&word_x1, &word_x2); - Swap(&word_y1, &word_y2); - } - - // Viewers like evince can get really confused during copy-paste - // when the baseline wanders around. I've decided to force every - // word to match the (straight) baseline. The math below is just - // projecting the word origin onto the baseline. All numbers are - // in the native PDF coordinate system, which has the origin in - // the bottom left and the unit is points, which is 1/72 inch. 
- double word_length; - double x, y; { - int px = word_x1; - int py = word_y1; - double l2 = dist2(line_x1, line_y1, line_x2, line_y2); - if (l2 == 0) { - x = line_x1; - y = line_y1; - } else { - double t = ((px - line_x2) * (line_x2 - line_x1) + - (py - line_y2) * (line_y2 - line_y1)) / l2; - x = line_x2 + t * (line_x2 - line_x1); - y = line_y2 + t * (line_y2 - line_y1); + tesseract::Orientation orientation; + tesseract::TextlineOrder textline_order; + float deskew_angle; + res_it->Orientation(&orientation, &writing_direction, + &textline_order, &deskew_angle); + if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) { + switch (res_it->WordDirection()) { + case DIR_LEFT_TO_RIGHT: + writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT; + break; + case DIR_RIGHT_TO_LEFT: + writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT; + break; + default: + writing_direction = old_writing_direction; + } } - word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, - word_x2, word_y2))); - word_length = word_length * 72.0 / ppi; - x = x * 72 / ppi; - y = height - (y * 72.0 / ppi); } - int pointsize = 0; - if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { - // Calculate the rotation angle in the PDF cooordinate system, - // which has the origin in the bottom left. The Tesseract - // coordinate system has the origin in the upper left. - // - // PDF is kind of a like turtle graphics, and we orient the - // turtle (errr... initial cursor position) with an affine - // transformation.
- // - // Rotate RTL Translate - // - // [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ] - // [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ] - // [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ] - // - double theta = atan2(static_cast<double>(line_y1 - line_y2), - static_cast<double>(line_x2 - line_x1)); - double a, b, c, d; - a = cos(theta); - b = sin(theta); - c = -sin(theta); - d = cos(theta); - switch(writing_direction) { - case WRITING_DIRECTION_RIGHT_TO_LEFT: - a = -a; - b = -b; - c = -c; - break; - case WRITING_DIRECTION_TOP_TO_BOTTOM: - // TODO(jbreiden) Consider switching PDF writing mode to vertical. - break; - default: - break; - } + // Where is word origin and how long is it? + double x, y, word_length; + { + int word_x1, word_y1, word_x2, word_y2; + res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); + GetWordBaseline(writing_direction, ppi, height, + word_x1, word_y1, word_x2, word_y2, + line_x1, line_y1, line_x2, line_y2, + &x, &y, &word_length); + } - pdf_str.add_str_double("", prec(a)); // . This affine matrix + if (writing_direction != old_writing_direction || new_block) { + AffineMatrix(writing_direction, + line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d); + pdf_str.add_str_double(" ", prec(a)); // . This affine matrix pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate pdf_str.add_str_double(" ", prec(c)); // . system for all - pdf_str.add_str_double(" ", prec(d)); // . text in the entire - pdf_str.add_str_double(" ", prec(x)); // . line. + pdf_str.add_str_double(" ", prec(d)); // . text that follows. + pdf_str.add_str_double(" ", prec(x)); // . pdf_str.add_str_double(" ", prec(y)); // .
pdf_str += (" Tm "); // Place cursor absolutely + new_block = false; } else { - double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y))); - pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts - pdf_str.add_str_double(" ", 0); // Delta y in pts - pdf_str += (" Td "); // Relative moveto + double dx = x - old_x; + double dy = y - old_y; + pdf_str.add_str_double(" ", prec(dx * a + dy * b)); + pdf_str.add_str_double(" ", prec(dx * c + dy * d)); + pdf_str += (" Td "); // Relative moveto } old_x = x; old_y = y; + old_writing_direction = writing_direction; // Adjust font size on a per word granularity. Pay attention to - // pointsize, old_pointsize, and pdf_str. We've found that for - // in Arabic, Tesseract will happily return a pointsize of zero, + // fontsize, old_fontsize, and pdf_str. We've found that for + // in Arabic, Tesseract will happily return a fontsize of zero, // so we make up a default number to protect ourselves. { bool bold, italic, underlined, monospace, serif, smallcaps; int font_id; res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, - &serif, &smallcaps, &pointsize, &font_id); - const int kDefaultPointSize = 8; - if (pointsize <= 0) - pointsize = kDefaultPointSize; - if (pointsize != old_pointsize) { + &serif, &smallcaps, &fontsize, &font_id); + const int kDefaultFontsize = 8; + if (fontsize <= 0) + fontsize = kDefaultFontsize; + if (fontsize != old_fontsize) { char textfont[20]; - snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize); + snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize); pdf_str += textfont; - old_pointsize = pointsize; + old_fontsize = fontsize; } } @@ -243,9 +294,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, delete []grapheme; res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) { + if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) { double
h_stretch = - kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len)); + kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len)); pdf_str.add_str_double("", h_stretch); pdf_str += " Tz"; // horizontal stretch pdf_str += " [ "; @@ -449,7 +500,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix, L_COMP_DATA *cid = NULL; const int kJpegQuality = 85; - l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid); + // TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain // types of PNG files, especially if there are 2 samples per pixel. // We can get rid of this logic after Leptonica 1.72 is released and @@ -747,5 +798,4 @@ bool TessPDFRenderer::EndDocumentHandler() { AppendString(buf); return true; } - } // namespace tesseract diff --git a/api/renderer.h b/api/renderer.h index 6f0c01c3..6d189dad 100644 --- a/api/renderer.h +++ b/api/renderer.h @@ -195,7 +195,7 @@ private: double width, double height); // Turn an image into a PDF object. Only transcode if we have to. 
static bool imageToPDFObj(Pix *pix, char *filename, long int objnum, - char **pdf_object, long int *pdf_object_size); + char **pdf_object, long int *pdf_object_size); }; diff --git a/tessdata/pdf.ttf b/tessdata/pdf.ttf index eb359b31..08fd97ae 100644 Binary files a/tessdata/pdf.ttf and b/tessdata/pdf.ttf differ diff --git a/tessdata/pdf.ttx b/tessdata/pdf.ttx index c6db1c8e..66ac4dfb 100644 --- a/tessdata/pdf.ttx +++ b/tessdata/pdf.ttx @@ -120,38 +120,49 @@ + + + + + + + + + + + - - + + - + - - + + - + - + - - - - + + + + @@ -160,17 +171,17 @@ - + - - - - - + + + + + @@ -179,250 +190,142 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -541,7 +444,148 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -549,245 +593,1155 @@ + - + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +