backport pdfrenderer changes from master (4.0)

2025-01-19 15:03:45 +08:00 · 2017-05-05 20:10:03 +02:00 · 2017-05-05 20:10:03 +02:00 · 90b6f17838
commit 90b6f17838
parent 71712d1073
2 changed files with 50 additions and 27 deletions
--- a/api/pdfrenderer.cpp
+++ b/api/pdfrenderer.cpp
@ -159,7 +159,7 @@ CIDToGIDMap.

 OK there is a small problem there, if I use GID 0 then Acrobat gets
 upset about it and complains it cannot extract the font. If I set the
-CIDToGIDMap so that all the entries are 1 instead, its happy. Totally
+CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally
 mad......

 */
@ -169,10 +169,15 @@ namespace tesseract {
 // Use for PDF object fragments. Must be large enough
 // to hold a colormap with 256 colors in the verbose
 // PDF representation.
-const int kBasicBufSize = 2048;
+static const int kBasicBufSize = 2048;

 // If the font is 10 pts, nominal character width is 5 pts
-const int kCharWidth = 2;
+static const int kCharWidth = 2;
+
+// Used for memory allocation. A codepoint must take no more than this
+// many bytes when written as UTF-16BE hex digits, e.g. "0063" for the
+// letter 'c'.
+static const int kMaxBytesPerCodepoint = 20;

 /**********************************************************************
 * PDF Renderer interface implementation
@ -304,6 +309,23 @@ void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
    *line_y1 = *line_y2 = (y1 + y2) / 2;
 }

+bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) {  // Writes code into utf16 as UTF-16BE hex digits; false if rejected.
+  if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {  // surrogate range U+D800..U+DFFF, or above U+10FFFF
+    tprintf("Dropping invalid codepoint %d\n", code);
+    return false;  // utf16 is left unwritten on this path
+  }
+  if (code < 0x10000) {  // encoded as a single 16-bit unit
+    snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code);  // zero-padded, uppercase hex
+  } else {  // above U+FFFF: encode as a surrogate pair
+    int a = code - 0x010000;  // offset past U+FFFF; at most 20 bits since code <= 0x10FFFF
+    int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;  // upper 10 bits of the offset
+    int low_surrogate = (0x03FF & a) + 0xDC00;  // lower 10 bits of the offset
+    snprintf(utf16, kMaxBytesPerCodepoint,
+             "%04X%04X", high_surrogate, low_surrogate);  // high unit first
+  }
+  return true;
+}
+
 char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
                                         double width, double height) {
  STRING pdf_str("");
@ -442,25 +464,13 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
      if (grapheme && grapheme[0] != '\0') {
        GenericVector<int> unicodes;
        UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
-        char utf16[20];
+        char utf16[kMaxBytesPerCodepoint];
        for (int i = 0; i < unicodes.length(); i++) {
          int code = unicodes[i];
-          // Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16
-          if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
-            tprintf("Dropping invalid codepoint %d\n", code);
-            continue;
+          if (CodepointToUtf16be(code, utf16)) {
+            pdf_word += utf16;
+            pdf_word_len++;
          }
-          if (code < 0x10000) {
-            snprintf(utf16, sizeof(utf16), "<%04X>", code);
-          } else {
-            int a = code - 0x010000;
-            int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
-            int low_surrogate = (0x03FF & a) + 0xDC00;
-            snprintf(utf16, sizeof(utf16), "<%04X%04X>",
-                     high_surrogate, low_surrogate);
-          }
-          pdf_word += utf16;
-          pdf_word_len++;
        }
      }
      delete []grapheme;
@ -471,9 +481,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
          kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
      pdf_str.add_str_double("", h_stretch);
      pdf_str += " Tz";          // horizontal stretch
-      pdf_str += " [ ";
+      pdf_str += " [ <";
      pdf_str += pdf_word;       // UTF-16BE representation
-      pdf_str += " ] TJ";        // show the text
+      pdf_str += "> ] TJ";       // show the text
    }
    if (last_word_in_line) {
      pdf_str += " \n";
@ -706,7 +716,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
  if (!filename)
    return false;

-  L_COMP_DATA *cid = NULL;
+  L_Compressed_Data *cid = NULL;
  const int kJpegQuality = 85;

  int format, sad;
@ -960,15 +970,27 @@ bool TessPDFRenderer::EndDocumentHandler() {
  offsets_.back() += pages_objsize;    // manipulation #2

  // INFO
+  STRING utf16_title = "FEFF";  // byte_order_marker
+  GenericVector<int> unicodes;
+  UNICHAR::UTF8ToUnicode(title(), &unicodes);
+  char utf16[kMaxBytesPerCodepoint];
+  for (int i = 0; i < unicodes.length(); i++) {
+    int code = unicodes[i];
+    if (CodepointToUtf16be(code, utf16)) {
+      utf16_title += utf16;
+    }
+  }
+
  char* datestr = l_getFormattedDate();
  n = snprintf(buf, sizeof(buf),
               "%ld 0 obj\n"
               "<<\n"
               "  /Producer (Tesseract %s)\n"
               "  /CreationDate (D:%s)\n"
-               "  /Title (%s)"
+               "  /Title <%s>\n"
               ">>\n"
-               "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
+               "endobj\n",
+               obj_, TESSERACT_VERSION_STR, datestr, utf16_title.c_str());
  lept_free(datestr);
  if (n >= sizeof(buf)) return false;
  AppendPDFObject(buf);
--- a/api/renderer.h
+++ b/api/renderer.h
@ -15,8 +15,8 @@
 //
 ///////////////////////////////////////////////////////////////////////

-#ifndef TESSERACT_API_RENDERER_H__
-#define TESSERACT_API_RENDERER_H__
+#ifndef TESSERACT_API_RENDERER_H_
+#define TESSERACT_API_RENDERER_H_

 // To avoid collision with other typenames include the ABSOLUTE MINIMUM
 // complexity of includes here. Use forward declarations wherever possible
@ -57,6 +57,7 @@ class TESS_API TessResultRenderer {
    /**
     * Starts a new document with the given title.
     * This clears the contents of the output data.
+     * Title should use UTF-8 encoding.
     */
    bool BeginDocument(const char* title);

@ -251,4 +252,4 @@ class TESS_API TessOsdRenderer : public TessResultRenderer {

 }  // namespace tesseract.

-#endif  // TESSERACT_API_RENDERER_H__
+#endif  // TESSERACT_API_RENDERER_H_