Fixed bidi handling in PDF output

2025-06-10 20:23:12 +08:00 · 2014-10-09 13:29:01 -07:00 · 2014-10-09 13:29:01 -07:00 · d9699c4099
commit d9699c4099
parent f927728169
4 changed files with 1608 additions and 604 deletions
--- a/api/pdfrenderer.cpp
+++ b/api/pdfrenderer.cpp
@ -60,74 +60,22 @@ long dist2(int x1, int y1, int x2, int y2) {
  return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
 }
-char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
+// Viewers like evince can get really confused during copy-paste when
-                                         double width, double height) {
+// the baseline wanders around. So I've decided to project every word
-  double ppi = api->GetSourceYResolution();
+// onto the (straight) line baseline. All numbers are in the native
-  STRING pdf_str("");
+// PDF coordinate system, which has the origin in the bottom left and
-  double old_x = 0.0, old_y = 0.0;
+// the unit is points, which is 1/72 inch. Tesseract reports baselines
-  int old_pointsize = 0;
+// left-to-right no matter what the reading order is. We need the
-
+// word baseline in reading order, so we do that conversion here. Returns
-  // TODO(jbreiden) Slightly cleaner from an abstraction standpoint
+// the word's baseline origin and length.
-  // if this were to live inside a separate text object.
+void GetWordBaseline(int writing_direction, int ppi, int height,
-  pdf_str += "q ";
+                     int word_x1, int word_y1, int word_x2, int word_y2,
-  pdf_str.add_str_double("", prec(width));
+                     int line_x1, int line_y1, int line_x2, int line_y2,
-  pdf_str += " 0 0 ";
+                     double *x0, double *y0, double *length) {
  pdf_str.add_str_double("", prec(height));
  pdf_str += " 0 0 cm /Im1 Do Q\n";
  ResultIterator *res_it = api->GetIterator();
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      pdf_str += "BT\n3 Tr\n";  // Begin text object, use invisible ink
      old_pointsize = 0.0;      // Every block will declare its font
    }
    int line_x1, line_y1, line_x2, line_y2;
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      res_it->Baseline(RIL_TEXTLINE,
                       &line_x1, &line_y1, &line_x2, &line_y2);
      double rise = abs(line_y2 - line_y1) * 72 / ppi;
      double run = abs(line_x2 - line_x1) * 72 / ppi;
      // There are some really stupid PDF viewers in the wild, such as
      // 'Preview' which ships with the Mac. They might do a better
      // job with text selection and highlighting when given perfectly
      // straight text instead of very slightly tilted text. I chose
      // this threshold large enough to absorb noise, but small enough
      // that lines probably won't cross each other if the whole page
      // is tilted at almost exactly the clipping threshold.
      if (rise < 2.0 && 2.0 < run)
        line_y1 = line_y2 = (line_y1 + line_y2) / 2;
    }
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }
    int word_x1, word_y1, word_x2, word_y2;
    res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
    // The critical one is writing_direction
    tesseract::Orientation orientation;
    tesseract::WritingDirection writing_direction;
    tesseract::TextlineOrder textline_order;
    float deskew_angle;
    res_it->Orientation(&orientation, &writing_direction,
                        &textline_order, &deskew_angle);
    // Unlike Tesseract, we always want the word baseline in reading order.
  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
    Swap(&word_x1, &word_x2);
    Swap(&word_y1, &word_y2);
  }
    // Viewers like evince can get really confused during copy-paste
    // when the baseline wanders around. I've decided to force every
    // word to match the (straight) baseline.  The math below is just
    // projecting the word origin onto the baseline.  All numbers are
    // in the native PDF coordinate system, which has the origin in
    // the bottom left and the unit is points, which is 1/72 inch.
  double word_length;
  double x, y;
  {
@ -149,76 +97,179 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
    x = x * 72 / ppi;
    y = height - (y * 72.0 / ppi);
  }
  *x0 = x;
  *y0 = y;
  *length = word_length;
 }
-    int pointsize = 0;
+// Compute coefficients for an affine matrix describing the rotation
-    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+// of the text. If the text is right-to-left such as Arabic or Hebrew,
-      // Calculate the rotation angle in the PDF cooordinate system,
+// we reflect over the Y-axis. This matrix will set the coordinate
-      // which has the origin in the bottom left. The Tesseract
+// system for placing text in the PDF file.
      // coordinate system has the origin in the upper left.
      //
      // PDF is kind of like turtle graphics, and we orient the
      // turtle (errr... initial cursor position) with an affine
      // transformation.
      //
      //                                Rotate              RTL    Translate
      //
      // [ x' y' 1 ]  = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ]  [ -1 0 0 ] [ 1 0 0 ]
      //                          [ sin𝜃  cos𝜃 0 ]  [  0 1 0 ] [ 0 1 0 ]
      //                          [   0    0   1 ]  [  0 0 1 ] [ x y 1 ]
 //
 //                           RTL
 // [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
 // [ y' ]   [ c d ][ y ]   [ 0 1 ] [-sin cos ][ y ]
 void AffineMatrix(int writing_direction,
                  int line_x1, int line_y1, int line_x2, int line_y2,
                  double *a, double *b, double *c, double *d) {
  double theta = atan2(static_cast<double>(line_y1 - line_y2),
                       static_cast<double>(line_x2 - line_x1));
-      double a, b, c, d;
+  *a = cos(theta);
-      a = cos(theta);
+  *b = sin(theta);
-      b = sin(theta);
+  *c = -sin(theta);
-      c = -sin(theta);
+  *d = cos(theta);
      d = cos(theta);
  switch(writing_direction) {
    case WRITING_DIRECTION_RIGHT_TO_LEFT:
-          a = -a;
+      *a = -*a;
-          b = -b;
+      *b = -*b;
          c = -c;
      break;
    case WRITING_DIRECTION_TOP_TO_BOTTOM:
-          // TODO(jbreiden) Consider switching PDF writing mode to vertical.
+      // TODO(jbreiden) Consider using the vertical PDF writing mode.
      break;
    default:
      break;
  }
 }
 // Flattens a nearly horizontal text line baseline.
 //
 // Some PDF viewers in the wild, such as 'Preview' which ships with the
 // Mac, do a better job with text selection and highlighting when given
 // a perfectly flat baseline instead of a very slightly tilted one. We
 // clip small tilts to appease these viewers. The threshold is large
 // enough to absorb noise, but small enough that lines probably won't
 // cross each other if the whole page is tilted at almost exactly the
 // clipping threshold.
 //
 // Inputs:
 //   ppi             resolution used to convert the coordinate deltas to
 //                   points (1/72 inch).
 //   x1, y1, x2, y2  endpoints of the baseline as reported by the caller.
 // Outputs:
 //   line_x1..line_y2 receive a copy of the endpoints. If the baseline
 //   rises less than 2 points while running more than 2 points, both
 //   output y values are replaced by the integer midpoint (y1 + y2) / 2.
 //
 // The comparison is done on deltas scaled by 72 against a threshold
 // scaled by ppi instead of dividing by ppi. Integer division used to
 // truncate fractional points (a 2.4pt run became 2pt and was never
 // clipped) and divided by zero when ppi was 0. With ppi <= 0 the
 // baseline is now simply left untouched.
 void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
                  int *line_x1, int *line_y1,
                  int *line_x2, int *line_y2) {
  *line_x1 = x1;
  *line_y1 = y1;
  *line_x2 = x2;
  *line_y2 = y2;
  // Work in units of 1/ppi points; widen so the scaling cannot overflow.
  const long long rise = 72LL * abs(y2 - y1);
  const long long run = 72LL * abs(x2 - x1);
  const long long threshold = 2LL * ppi;  // 2 points in the same units
  if (rise < threshold && threshold < run)
    *line_y1 = *line_y2 = (y1 + y2) / 2;
 }
 char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
                                         double width, double height) {
  STRING pdf_str("");
  double ppi = api->GetSourceYResolution();
  // These initial conditions are all arbitrary and will be overwritten
  double old_x = 0.0, old_y = 0.0;
  int old_fontsize = 0;
  tesseract::WritingDirection old_writing_direction =
      WRITING_DIRECTION_LEFT_TO_RIGHT;
  bool new_block = true;
  int fontsize = 0;
  double a = 1;
  double b = 0;
  double c = 0;
  double d = 1;
  // TODO(jbreiden) This marries the text and image together.
  // Slightly cleaner from an abstraction standpoint if this were to
  // live inside a separate text object.
  pdf_str += "q ";
  pdf_str.add_str_double("", prec(width));
  pdf_str += " 0 0 ";
  pdf_str.add_str_double("", prec(height));
  pdf_str += " 0 0 cm /Im1 Do Q\n";
  ResultIterator *res_it = api->GetIterator();
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
      pdf_str += "BT\n3 Tr";     // Begin text object, use invisible ink
      old_fontsize = 0;          // Every block will declare its fontsize
      new_block = true;          // Every block will declare its affine matrix
    }
    int line_x1, line_y1, line_x2, line_y2;
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
      int x1, y1, x2, y2;
      res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
      ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
    }
    if (res_it->Empty(RIL_WORD)) {
      res_it->Next(RIL_WORD);
      continue;
    }
    // Writing direction changes at a per-word granularity
    tesseract::WritingDirection writing_direction;
    {
      tesseract::Orientation orientation;
      tesseract::TextlineOrder textline_order;
      float deskew_angle;
      res_it->Orientation(&orientation, &writing_direction,
                          &textline_order, &deskew_angle);
      if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
        switch (res_it->WordDirection()) {
          case DIR_LEFT_TO_RIGHT:
            writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
            break;
          case DIR_RIGHT_TO_LEFT:
            writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
            break;
          default:
            writing_direction = old_writing_direction;
        }
      }
    }
    // Where is word origin and how long is it?
    double x, y, word_length;
    {
      int word_x1, word_y1, word_x2, word_y2;
      res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
      GetWordBaseline(writing_direction, ppi, height,
                      word_x1, word_y1, word_x2, word_y2,
                      line_x1, line_y1, line_x2, line_y2,
                      &x, &y, &word_length);
    }
    if (writing_direction != old_writing_direction || new_block) {
      AffineMatrix(writing_direction,
                   line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
      pdf_str.add_str_double(" ", prec(a));  // . This affine matrix
      pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
      pdf_str.add_str_double(" ", prec(c));  // . system for all
-      pdf_str.add_str_double(" ", prec(d));  // . text in the entire
+      pdf_str.add_str_double(" ", prec(d));  // . text that follows.
-      pdf_str.add_str_double(" ", prec(x));  // . line.
+      pdf_str.add_str_double(" ", prec(x));  // .
      pdf_str.add_str_double(" ", prec(y));  // .
      pdf_str += (" Tm ");                   // Place cursor absolutely
      new_block = false;
    } else {
-      double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y)));
+      double dx = x - old_x;
-      pdf_str.add_str_double(" ", prec(offset));  // Delta x in pts
+      double dy = y - old_y;
-      pdf_str.add_str_double(" ", 0);             // Delta y in pts
+      pdf_str.add_str_double(" ", prec(dx * a + dy * b));
      pdf_str.add_str_double(" ", prec(dx * c + dy * d));
      pdf_str += (" Td ");                   // Relative moveto
    }
    old_x = x;
    old_y = y;
    old_writing_direction = writing_direction;
    // Adjust font size on a per word granularity. Pay attention to
-    // pointsize, old_pointsize, and pdf_str. We've found that for
+    // fontsize, old_fontsize, and pdf_str. We've found that
-    // in Arabic, Tesseract will happily return a pointsize of zero,
+    // in Arabic, Tesseract will happily return a fontsize of zero,
    // so we make up a default number to protect ourselves.
    {
      bool bold, italic, underlined, monospace, serif, smallcaps;
      int font_id;
      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
-                                 &serif, &smallcaps, &pointsize, &font_id);
+                                 &serif, &smallcaps, &fontsize, &font_id);
-      const int kDefaultPointSize = 8;
+      const int kDefaultFontsize = 8;
-      if (pointsize <= 0)
+      if (fontsize <= 0)
-        pointsize = kDefaultPointSize;
+        fontsize = kDefaultFontsize;
-      if (pointsize != old_pointsize) {
+      if (fontsize != old_fontsize) {
        char textfont[20];
-        snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
+        snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
        pdf_str += textfont;
-        old_pointsize = pointsize;
+        old_fontsize = fontsize;
      }
    }
@ -243,9 +294,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
      delete []grapheme;
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
-    if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
+    if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
      double h_stretch =
-          kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len));
+          kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
      pdf_str.add_str_double("", h_stretch);
      pdf_str += " Tz";          // horizontal stretch
      pdf_str += " [ ";
@ -449,7 +500,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
  L_COMP_DATA *cid = NULL;
  const int kJpegQuality = 85;
-  l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
+
  // TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain
  // types of PNG files, especially if there are 2 samples per pixel.
  // We can get rid of this logic after Leptonica 1.72 is released and
@ -747,5 +798,4 @@ bool TessPDFRenderer::EndDocumentHandler() {
  AppendString(buf);
  return true;
 }
 }  // namespace tesseract
--- a/tessdata/pdf.ttf
+++ b/tessdata/pdf.ttf
--- a/tessdata/pdf.ttx
+++ b/tessdata/pdf.ttx