Fixed bidi handling in PDF output

2025-06-07 09:52:40 +08:00 · 2014-10-09 13:29:01 -07:00 · 2014-10-09 13:29:01 -07:00 · d9699c4099
commit d9699c4099
parent f927728169
4 changed files with 1608 additions and 604 deletions
--- a/api/pdfrenderer.cpp
+++ b/api/pdfrenderer.cpp
@ -60,15 +60,118 @@ long dist2(int x1, int y1, int x2, int y2) {
  return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
 }

+// Projects a word onto its (straight) line baseline and reports the
+// result in the native PDF coordinate system, which has the origin in
+// the bottom left and uses points (1/72 inch) as the unit. Viewers
+// like evince can get really confused during copy-paste when the
+// baseline wanders around, hence the projection. Tesseract reports
+// baselines left-to-right no matter what the reading order is, so the
+// word endpoints are swapped for right-to-left words to obtain the
+// baseline in reading order. Outputs the projected word origin
+// (*x0, *y0) and the word length (*length).
+void GetWordBaseline(int writing_direction, int ppi, int height,
+                     int word_x1, int word_y1, int word_x2, int word_y2,
+                     int line_x1, int line_y1, int line_x2, int line_y2,
+                     double *x0, double *y0, double *length) {
+  if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
+    Swap(&word_x1, &word_x2);
+    Swap(&word_y1, &word_y2);
+  }
+  // Projection of the word origin onto the line, still in pixels. A
+  // degenerate (zero length) line collapses to its first endpoint.
+  double proj_x = line_x1;
+  double proj_y = line_y1;
+  const double line_len2 = dist2(line_x1, line_y1, line_x2, line_y2);
+  if (line_len2 != 0) {
+    const int dir_x = line_x2 - line_x1;
+    const int dir_y = line_y2 - line_y1;
+    const double t = ((word_x1 - line_x2) * dir_x +
+                      (word_y1 - line_y2) * dir_y) / line_len2;
+    proj_x = line_x2 + t * dir_x;
+    proj_y = line_y2 + t * dir_y;
+  }
+  const double word_pixels = sqrt(static_cast<double>(
+      dist2(word_x1, word_y1, word_x2, word_y2)));
+  // Convert pixels to points and flip y so it is measured from the
+  // bottom of the page.
+  *x0 = proj_x * 72 / ppi;
+  *y0 = height - (proj_y * 72.0 / ppi);
+  *length = word_pixels * 72.0 / ppi;
+}
+
+// Computes the coefficients of the affine matrix describing the
+// rotation of the text, derived from the angle of the line baseline.
+// If the text is right-to-left such as Arabic or Hebrew, the rotation
+// is additionally reflected over the Y-axis. This matrix will set the
+// coordinate system for placing text in the PDF file.
+//
+//                           RTL
+// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
+// [ y' ]   [ c d ][ y ]   [ 0 1 ] [-sin cos ][ y ]
+void AffineMatrix(int writing_direction,
+                  int line_x1, int line_y1, int line_x2, int line_y2,
+                  double *a, double *b, double *c, double *d) {
+  const double rise = static_cast<double>(line_y1 - line_y2);
+  const double run = static_cast<double>(line_x2 - line_x1);
+  const double angle = atan2(rise, run);
+  const double cosine = cos(angle);
+  const double sine = sin(angle);
+  // Only right-to-left text is reflected (first matrix row negated).
+  // TODO(jbreiden) Consider using the vertical PDF writing mode for
+  // WRITING_DIRECTION_TOP_TO_BOTTOM; it gets the plain rotation today.
+  const bool reflect =
+      writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT;
+  *a = reflect ? -cosine : cosine;
+  *b = reflect ? -sine : sine;
+  *c = -sine;
+  *d = cosine;
+}
+
+// There are some PDF viewers in the wild, such as 'Preview' which
+// ships with the Mac, that do a better job with text selection and
+// highlighting when given a perfectly flat baseline instead of a very
+// slightly tilted one. We clip small tilts to appease these viewers.
+// The threshold (rise under 2 points while the run exceeds 2 points)
+// was chosen large enough to absorb noise, but small enough that lines
+// probably won't cross each other if the whole page is tilted at
+// almost exactly the clipping threshold.
+//
+// (x1, y1)-(x2, y2) are the baseline endpoints in pixels and ppi is
+// the resolution relating pixels to points (72 per inch). The
+// endpoints are copied to the line_* outputs; when the tilt is clipped
+// both output y values are replaced by the midpoint (y1 + y2) / 2.
+void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
+                  int *line_x1, int *line_y1,
+                  int *line_x2, int *line_y2) {
+  *line_x1 = x1;
+  *line_y1 = y1;
+  *line_x2 = x2;
+  *line_y2 = y2;
+  // Compare in integers with both sides scaled by ppi rather than
+  // dividing: "abs(...) * 72 / ppi" is an int division, so it
+  // truncated before the comparison (a run of 2.9 points counted as
+  // 2) and was undefined for a ppi of zero.
+  const long rise = 72L * abs(y2 - y1);
+  const long run = 72L * abs(x2 - x1);
+  const long threshold = 2L * ppi;
+  if (rise < threshold && threshold < run)
+    *line_y1 = *line_y2 = (y1 + y2) / 2;
+}
+
 char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
                                         double width, double height) {
-  double ppi = api->GetSourceYResolution();
  STRING pdf_str("");
-  double old_x = 0.0, old_y = 0.0;
-  int old_pointsize = 0;
+  double ppi = api->GetSourceYResolution();

-  // TODO(jbreiden) Slightly cleaner from an abstraction standpoint
-  // if this were to live inside a separate text object.
+  // These initial conditions are all arbitrary and will be overwritten
+  double old_x = 0.0, old_y = 0.0;
+  int old_fontsize = 0;
+  tesseract::WritingDirection old_writing_direction =
+      WRITING_DIRECTION_LEFT_TO_RIGHT;
+  bool new_block = true;
+  int fontsize = 0;
+  double a = 1;
+  double b = 0;
+  double c = 0;
+  double d = 1;
+
+  // TODO(jbreiden) This marries the text and image together.
+  // Slightly cleaner from an abstraction standpoint if this were to
+  // live inside a separate text object.
  pdf_str += "q ";
  pdf_str.add_str_double("", prec(width));
  pdf_str += " 0 0 ";
@ -76,28 +179,18 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
  pdf_str += " 0 0 cm /Im1 Do Q\n";

  ResultIterator *res_it = api->GetIterator();
-
  while (!res_it->Empty(RIL_BLOCK)) {
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
-      pdf_str += "BT\n3 Tr\n";  // Begin text object, use invisible ink
-      old_pointsize = 0.0;      // Every block will declare its font
+      pdf_str += "BT\n3 Tr";     // Begin text object, use invisible ink
+      old_fontsize = 0;          // Every block will declare its fontsize
+      new_block = true;          // Every block will declare its affine matrix
    }

    int line_x1, line_y1, line_x2, line_y2;
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
-      res_it->Baseline(RIL_TEXTLINE,
-                       &line_x1, &line_y1, &line_x2, &line_y2);
-      double rise = abs(line_y2 - line_y1) * 72 / ppi;
-      double run = abs(line_x2 - line_x1) * 72 / ppi;
-      // There are some really stupid PDF viewers in the wild, such as
-      // 'Preview' which ships with the Mac. They might do a better
-      // job with text selection and highlighting when given perfectly
-      // straight text instead of very slightly tilted text. I chose
-      // this threshold large enough to absorb noise, but small enough
-      // that lines probably won't cross each other if the whole page
-      // is tilted at almost exactly the clipping threshold.
-      if (rise < 2.0 && 2.0 < run)
-        line_y1 = line_y2 = (line_y1 + line_y2) / 2;
+      int x1, y1, x2, y2;
+      res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
+      ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
    }

    if (res_it->Empty(RIL_WORD)) {
@ -105,120 +198,78 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
      continue;
    }

-    int word_x1, word_y1, word_x2, word_y2;
-    res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
-
-    // The critical one is writing_direction
-    tesseract::Orientation orientation;
+    // Writing direction changes at a per-word granularity
    tesseract::WritingDirection writing_direction;
-    tesseract::TextlineOrder textline_order;
-    float deskew_angle;
-    res_it->Orientation(&orientation, &writing_direction,
-                        &textline_order, &deskew_angle);
-
-    // Unlike Tesseract, we always want the word baseline in reading order.
-    if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
-      Swap(&word_x1, &word_x2);
-      Swap(&word_y1, &word_y2);
-    }
-
-    // Viewers like evince can get really confused during copy-paste
-    // when the baseline wanders around. I've decided to force every
-    // word to match the (straight) baseline.  The math below is just
-    // projecting the word origin onto the baseline.  All numbers are
-    // in the native PDF coordinate system, which has the origin in
-    // the bottom left and the unit is points, which is 1/72 inch.
-    double word_length;
-    double x, y;
    {
-      int px = word_x1;
-      int py = word_y1;
-      double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
-      if (l2 == 0) {
-        x = line_x1;
-        y = line_y1;
-      } else {
-        double t = ((px - line_x2) * (line_x2 - line_x1) +
-                    (py - line_y2) * (line_y2 - line_y1)) / l2;
-        x = line_x2 + t * (line_x2 - line_x1);
-        y = line_y2 + t * (line_y2 - line_y1);
+      tesseract::Orientation orientation;
+      tesseract::TextlineOrder textline_order;
+      float deskew_angle;
+      res_it->Orientation(&orientation, &writing_direction,
+                          &textline_order, &deskew_angle);
+      if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
+        switch (res_it->WordDirection()) {
+          case DIR_LEFT_TO_RIGHT:
+            writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
+            break;
+          case DIR_RIGHT_TO_LEFT:
+            writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
+            break;
+          default:
+            writing_direction = old_writing_direction;
+        }
      }
-      word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
-                                                   word_x2, word_y2)));
-      word_length = word_length * 72.0 / ppi;
-      x = x * 72 / ppi;
-      y = height - (y * 72.0 / ppi);
    }

-    int pointsize = 0;
-    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
-      // Calculate the rotation angle in the PDF cooordinate system,
-      // which has the origin in the bottom left. The Tesseract
-      // coordinate system has the origin in the upper left.
-      //
-      // PDF is kind of a like turtle graphics, and we orient the
-      // turtle (errr... initial cursor position) with an affine
-      // transformation.
-      //
-      //                                Rotate              RTL    Translate
-      //
-      // [ x' y' 1 ]  = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ]  [ -1 0 0 ] [ 1 0 0 ]
-      //                          [ sin𝜃  cos𝜃 0 ]  [  0 1 0 ] [ 0 1 0 ]
-      //                          [   0    0   1 ]  [  0 0 1 ] [ x y 1 ]
-      //
-      double theta = atan2(static_cast<double>(line_y1 - line_y2),
-                           static_cast<double>(line_x2 - line_x1));
-      double a, b, c, d;
-      a = cos(theta);
-      b = sin(theta);
-      c = -sin(theta);
-      d = cos(theta);
-      switch(writing_direction) {
-        case WRITING_DIRECTION_RIGHT_TO_LEFT:
-          a = -a;
-          b = -b;
-          c = -c;
-          break;
-        case WRITING_DIRECTION_TOP_TO_BOTTOM:
-          // TODO(jbreiden) Consider switching PDF writing mode to vertical.
-          break;
-        default:
-          break;
-      }
+    // Where is word origin and how long is it?
+    double x, y, word_length;
+    {
+      int word_x1, word_y1, word_x2, word_y2;
+      res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
+      GetWordBaseline(writing_direction, ppi, height,
+                      word_x1, word_y1, word_x2, word_y2,
+                      line_x1, line_y1, line_x2, line_y2,
+                      &x, &y, &word_length);
+    }

-      pdf_str.add_str_double("",  prec(a));  // . This affine matrix
+    if (writing_direction != old_writing_direction || new_block) {
+      AffineMatrix(writing_direction,
+                   line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
+      pdf_str.add_str_double(" ", prec(a));  // . This affine matrix
      pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
      pdf_str.add_str_double(" ", prec(c));  // . system for all
-      pdf_str.add_str_double(" ", prec(d));  // . text in the entire
-      pdf_str.add_str_double(" ", prec(x));  // . line.
+      pdf_str.add_str_double(" ", prec(d));  // . text that follows.
+      pdf_str.add_str_double(" ", prec(x));  // .
      pdf_str.add_str_double(" ", prec(y));  // .
      pdf_str += (" Tm ");                   // Place cursor absolutely
+      new_block = false;
    } else {
-      double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y)));
-      pdf_str.add_str_double(" ", prec(offset));  // Delta x in pts
-      pdf_str.add_str_double(" ", 0);             // Delta y in pts
-      pdf_str += (" Td ");                        // Relative moveto
+      double dx = x - old_x;
+      double dy = y - old_y;
+      pdf_str.add_str_double(" ", prec(dx * a + dy * b));
+      pdf_str.add_str_double(" ", prec(dx * c + dy * d));
+      pdf_str += (" Td ");                   // Relative moveto
    }
    old_x = x;
    old_y = y;
+    old_writing_direction = writing_direction;

    // Adjust font size on a per word granularity. Pay attention to
-    // pointsize, old_pointsize, and pdf_str. We've found that for
-    // in Arabic, Tesseract will happily return a pointsize of zero,
+    // fontsize, old_fontsize, and pdf_str. We've found that for
+    // Arabic, Tesseract will happily return a fontsize of zero,
    // so we make up a default number to protect ourselves.
    {
      bool bold, italic, underlined, monospace, serif, smallcaps;
      int font_id;
      res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
-                                 &serif, &smallcaps, &pointsize, &font_id);
-      const int kDefaultPointSize = 8;
-      if (pointsize <= 0)
-        pointsize = kDefaultPointSize;
-      if (pointsize != old_pointsize) {
+                                 &serif, &smallcaps, &fontsize, &font_id);
+      const int kDefaultFontsize = 8;
+      if (fontsize <= 0)
+        fontsize = kDefaultFontsize;
+      if (fontsize != old_fontsize) {
        char textfont[20];
-        snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
+        snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
        pdf_str += textfont;
-        old_pointsize = pointsize;
+        old_fontsize = fontsize;
      }
    }

@ -243,9 +294,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
      delete []grapheme;
      res_it->Next(RIL_SYMBOL);
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
-    if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
+    if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
      double h_stretch =
-          kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len));
+          kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
      pdf_str.add_str_double("", h_stretch);
      pdf_str += " Tz";          // horizontal stretch
      pdf_str += " [ ";
@ -449,7 +500,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,

  L_COMP_DATA *cid = NULL;
  const int kJpegQuality = 85;
-  l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
+
  // TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain
  // types of PNG files, especially if there are 2 samples per pixel.
  // We can get rid of this logic after Leptonica 1.72 is released and
@ -747,5 +798,4 @@ bool TessPDFRenderer::EndDocumentHandler() {
  AppendString(buf);
  return true;
 }
-
 }  // namespace tesseract
--- a/api/renderer.h
+++ b/api/renderer.h
@ -195,7 +195,7 @@ private:
                                 double width, double height);
  // Turn an image into a PDF object. Only transcode if we have to.
  static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
-                            char **pdf_object, long int *pdf_object_size);
+                          char **pdf_object, long int *pdf_object_size);
 };


--- a/tessdata/pdf.ttf
+++ b/tessdata/pdf.ttf
--- a/tessdata/pdf.ttx
+++ b/tessdata/pdf.ttx