replace CubeUtils::UTF8ToUTF32 in pdfrenderer

2025-06-07 18:02:40 +08:00 · 2015-08-07 22:18:33 +02:00 · 2015-08-07 22:18:33 +02:00 · 300b5246f3
commit 300b5246f3
parent 7cb196d80d
1 changed files with 18 additions and 6 deletions
--- a/api/pdfrenderer.cpp
+++ b/api/pdfrenderer.cpp
@ -413,13 +413,25 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
    do {
      const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
      if (grapheme && grapheme[0] != '\0') {
-        // TODO(jbreiden) Do a real UTF-16BE conversion
-        // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure
-        string_32 utf32;
-        CubeUtils::UTF8ToUTF32(grapheme, &utf32);
+        GenericVector<int> unicodes;
+        UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
        char utf16[20];
-        for (int i = 0; i < static_cast<int>(utf32.length()); i++) {
-          snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]);
+        for (int i = 0; i < unicodes.length(); i++) {
+          int code = unicodes[i];
+          // Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16
+          if ((code > 0xD7FF && code < 0xE0000) || code > 0x10FFFF) {
+                tprintf("Dropping invalid codepoint %d\n", code);
+                continue;
+          }
+          if (code < 0x10000) {
+            snprintf(utf16, sizeof(utf16), "<%04X>", code);
+          } else {
+            int a = code - 0x010000;
+            int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
+            int low_surrogate = (0x03FF & a) + 0xDC00;
+            snprintf(utf16, sizeof(utf16), "<%04X%04X>",
+                     high_surrogate, low_surrogate);
+          }
          pdf_word += utf16;
          pdf_word_len++;
        }