From 0dcc889e8d9b029026d3aa6a6d6b1f1d1b890c6a Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Wed, 15 May 2019 22:43:47 +0200
Subject: [PATCH 1/4] Fix apiexample_test with locale de_DE.UTF-8

The unittest failed with LANG=de_DE.UTF-8:

    $ unittest/apiexample_test
    Running main() from ../../../../unittest/../googletest/googletest/src/gtest_main.cc
    [==========] Running 4 tests from 2 test suites.
    [----------] Global test environment set-up.
    [----------] 1 test from EuroText
    [ RUN      ] EuroText.FastLatinOCR
    contains_unichar_id(unichar_id):Error:Assert failed:in file ../../../../../src/ccutil/unicharset.h, line 874

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/ccutil/unicharset.cpp | 97 +++++++++++++++++++++++++--------------
 1 file changed, 62 insertions(+), 35 deletions(-)
diff --git a/src/ccutil/unicharset.cpp b/src/ccutil/unicharset.cpp
index 6c69ecd2..8585e64f 100644
--- a/src/ccutil/unicharset.cpp
+++ b/src/ccutil/unicharset.cpp
@@ -22,6 +22,9 @@
 #include <cassert>
 #include <cstdio>
 #include <cstring>
+#include <iomanip>    // for std::setw
+#include <locale>     // for std::locale::classic
+#include <sstream>    // for std::istringstream
 
 #include "params.h"
 #include "serialis.h"
@@ -706,6 +709,7 @@ bool UNICHARSET::save_to_string(STRING *str) const {
               this->get_script_from_script_id(this->get_script(id)),
               this->get_other_case(id));
     } else {
+      // FIXME
       snprintf(buffer, kFileBufSize,
               "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
               this->id_to_unichar(id), properties,
@@ -815,41 +819,64 @@ bool UNICHARSET::load_via_fgets(
     float advance = 0.0f;
     float advance_sd = 0.0f;
     // TODO(eger): check that this default it ok
-    // after enabling BiDi iterator for Arabic+Cube.
+    // after enabling BiDi iterator for Arabic.
     int direction = UNICHARSET::U_LEFT_TO_RIGHT;
-    UNICHAR_ID other_case = id;
-    UNICHAR_ID mirror = id;
-    char normed[64];
-    int v = -1;
-    if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr ||
-        ((v = sscanf(buffer,
-                     "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
-                     unichar, &properties,
-                     &min_bottom, &max_bottom, &min_top, &max_top,
-                     &width, &width_sd, &bearing, &bearing_sd,
-                     &advance, &advance_sd, script, &other_case,
-                     &direction, &mirror, normed)) != 17 &&
-         (v = sscanf(buffer,
-                     "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
-                     unichar, &properties,
-                     &min_bottom, &max_bottom, &min_top, &max_top,
-                     &width, &width_sd, &bearing, &bearing_sd,
-                     &advance, &advance_sd, script, &other_case,
-                     &direction, &mirror)) != 16 &&
-          (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
-                      unichar, &properties,
-                      &min_bottom, &max_bottom, &min_top, &max_top,
-                      script, &other_case, &direction, &mirror)) != 10 &&
-          (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
-                      &min_bottom, &max_bottom, &min_top, &max_top,
-                      script, &other_case)) != 8 &&
-          (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
-                      script, &other_case)) != 4 &&
-          (v = sscanf(buffer, "%s %x %63s",
-                      unichar, &properties, script)) != 3 &&
-          (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
+    UNICHAR_ID other_case = unicharset_size;
+    UNICHAR_ID mirror = unicharset_size;
+    if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr) {
       return false;
     }
+    char normed[64];
+    normed[0] = '\0';
+    std::istringstream stream(buffer);
+    stream.imbue(std::locale::classic());
+    // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标  # 标 [6807 ]x
+    //stream.flags(std::ios::hex);
+    stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
+    //stream.flags(std::ios::dec);
+    if (stream.fail()) {
+      fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
+      return false;
+    }
+    auto position = stream.tellg();
+    stream.seekg(position);
+    char c1, c2, c3, c4, c5, c6, c7, c8, c9;
+    stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
+      width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
+      advance >> c9 >> advance_sd >> std::setw(63) >> script >>
+      other_case >> direction >> mirror >> std::setw(63) >> normed;
+    if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
+        c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
+      stream.clear();
+      stream.seekg(position);
+      stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
+      width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
+      advance >> c9 >> advance_sd >> std::setw(63) >> script >>
+      other_case >> direction >> mirror;
+      if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
+          c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
+        stream.clear();
+        stream.seekg(position);
+        stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
+        std::setw(63) >> script >> other_case >> direction >> mirror;
+        if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
+          stream.clear();
+          stream.seekg(position);
+          stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
+          std::setw(63) >> script >> other_case;
+          if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
+            stream.clear();
+            stream.seekg(position);
+            stream >> std::setw(63) >> script >> other_case;
+            if (stream.fail()) {
+              stream.clear();
+              stream.seekg(position);
+              stream >> std::setw(63) >> script;
+            }
+          }
+        }
+      }
+    }
 
     // Skip fragments if needed.
     CHAR_FRAGMENT *frag = nullptr;
@@ -880,9 +907,9 @@ bool UNICHARSET::load_via_fgets(
     this->set_advance_stats(id, advance, advance_sd);
     this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
     this->set_other_case(
-        id, (v > 3 && other_case < unicharset_size) ? other_case : id);
-    this->set_mirror(id, (v > 8 && mirror < unicharset_size) ? mirror : id);
-    this->set_normed(id, (v>16) ? normed : unichar);
+        id, (other_case < unicharset_size) ? other_case : id);
+    this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
+    this->set_normed(id, normed[0] != '\0' ? normed : unichar);
   }
   post_load_setup();
   return true;

From 36ed6da3499c93c2d04de29ee2f02f6d9975a1fe Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 16 May 2019 07:09:27 +0200
Subject: [PATCH 2/4] Fix baseapi_test with locale de_DE.UTF-8

The unittest failed with LANG=de_DE.UTF-8:

    $ unittest/baseapi_test
    Running main() from ../../../../unittest/../googletest/googletest/src/gtest_main.cc
    [==========] Running 12 tests from 2 test suites.
    [----------] Global test environment set-up.
    [----------] 10 tests from TesseractTest
    [ RUN      ] TesseractTest.ArraySizeTest
    [       OK ] TesseractTest.ArraySizeTest (0 ms)
    [ RUN      ] TesseractTest.BasicTesseractTest
    [       OK ] TesseractTest.BasicTesseractTest (1251 ms)
    [ RUN      ] TesseractTest.IteratesParagraphsEvenIfNotDetected
    [       OK ] TesseractTest.IteratesParagraphsEvenIfNotDetected (347 ms)
    [ RUN      ] TesseractTest.HOCRWorksWithoutSetInputName
    [       OK ] TesseractTest.HOCRWorksWithoutSetInputName (403 ms)
    [ RUN      ] TesseractTest.HOCRContainsBaseline
    [       OK ] TesseractTest.HOCRContainsBaseline (389 ms)
    [ RUN      ] TesseractTest.RickSnyderNotFuckSnyder
    [       OK ] TesseractTest.RickSnyderNotFuckSnyder (346 ms)
    [ RUN      ] TesseractTest.AdaptToWordStrTest
    Trying to adapt "136
    " to "1 3 6"
    Trying to adapt "256
    " to "2 5 6"
    Trying to adapt "410
    " to "4 1 0"
    Trying to adapt "432
    " to "4 3 2"
    Trying to adapt "540
    " to "5 4 0"
    Trying to adapt "692
    " to "6 9 2"
    Trying to adapt "779
    " to "7 7 9"
    Trying to adapt "793
    " to "7 9 3"
    Trying to adapt "808
    " to "8 0 8"
    Trying to adapt "815
    " to "8 1 5"
    Trying to adapt "12
    " to "1 2"
    Trying to adapt "12
    " to "1 2"
    [       OK ] TesseractTest.AdaptToWordStrTest (788 ms)
    [ RUN      ] TesseractTest.BasicLSTMTest
    [       OK ] TesseractTest.BasicLSTMTest (4525 ms)
    [ RUN      ] TesseractTest.LSTMGeometryTest
    [       OK ] TesseractTest.LSTMGeometryTest (615 ms)
    [ RUN      ] TesseractTest.InitConfigOnlyTest
    Error: unichar ? in normproto file is not in unichar set.
    Error: unichar 0.232621 in normproto file is not in unichar set.
    Error: unichar 0.000400 in normproto file is not in unichar set.
    Error: unichar 0.231864 in normproto file is not in unichar set.
    [...]
    Error: unichar ? in normproto file is not in unichar set.
    Error: unichar 0.233915 in normproto file is not in unichar set.
    Error: unichar 0.000400 in normproto file is not in unichar set.
    Error: unichar 0.221755 in normproto file is not in unichar set.
    Error: unichar 0.000400 in normproto file is not in unichar set.
    Error: unichar ? in normproto file is not in unichar set.
    baseapi_test(21845,0x1134c45c0) malloc: *** error for object 0x927f96c28005e0: pointer being freed was not allocated
    baseapi_test(21845,0x1134c45c0) malloc: *** set a breakpoint in malloc_error_break to debug
    [INFO]  Lang eng took 327ms in regular init
    [INFO]  Lang chi_tra took 1422ms in regular init
    Abort trap: 6

TesseractTest.InitConfigOnlyTest is fixed by using std::istringstream
instead of sscanf.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/classify/normmatch.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/classify/normmatch.cpp b/src/classify/normmatch.cpp
index 4efaf5cb..2c14c50f 100644
--- a/src/classify/normmatch.cpp
+++ b/src/classify/normmatch.cpp
@@ -21,6 +21,7 @@
 
 #include <cstdio>
 #include <cmath>
+#include <sstream>          // for std::istringstream
 
 #include "classify.h"
 #include "clusttool.h"
@@ -113,7 +114,7 @@ float Classify::ComputeNormMatch(CLASS_ID ClassId,
       feature.Params[CharNormRx] * 8000.0 +
       feature.Params[CharNormRy] *
       feature.Params[CharNormRy] * 8000.0);
-    return (1.0 - NormEvidenceOf (Match));
+    return (1.0 - NormEvidenceOf(Match));
   }
 
   BestMatch = FLT_MAX;
@@ -209,7 +210,11 @@ NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
   const int kMaxLineSize = 100;
   char line[kMaxLineSize];
   while (fp->FGets(line, kMaxLineSize) != nullptr) {
-    if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
+    std::istringstream stream(line);
+    stream >> unichar >> NumProtos;
+    if (stream.fail()) {
+      continue;
+    }
     if (unicharset.contains_unichar(unichar)) {
       unichar_id = unicharset.unichar_to_id(unichar);
       Protos = NormProtos->Protos[unichar_id];

From 77f9bad3c28ebbf703150d356556e90977029c67 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 16 May 2019 11:38:46 +0200
Subject: [PATCH 3/4] Fix UNICHARSET::save_to_string for locale de_DE.UTF-8

That function writes float values which must always use '.' as the
decimal separator, no matter what the current locale setting is.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/ccutil/unicharset.cpp | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/ccutil/unicharset.cpp b/src/ccutil/unicharset.cpp
index 8585e64f..5e7f60aa 100644
--- a/src/ccutil/unicharset.cpp
+++ b/src/ccutil/unicharset.cpp
@@ -24,7 +24,7 @@
 #include <cstring>
 #include <iomanip>    // for std::setw
 #include <locale>     // for std::locale::classic
-#include <sstream>    // for std::istringstream
+#include <sstream>    // for std::istringstream, std::ostringstream
 
 #include "params.h"
 #include "serialis.h"
@@ -708,19 +708,24 @@ bool UNICHARSET::save_to_string(STRING *str) const {
       snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
               this->get_script_from_script_id(this->get_script(id)),
               this->get_other_case(id));
+      *str += buffer;
     } else {
-      // FIXME
-      snprintf(buffer, kFileBufSize,
-              "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
-              this->id_to_unichar(id), properties,
-              min_bottom, max_bottom, min_top, max_top, width, width_sd,
-              bearing, bearing_sd, advance, advance_sd,
-              this->get_script_from_script_id(this->get_script(id)),
-              this->get_other_case(id), this->get_direction(id),
-              this->get_mirror(id), this->get_normed_unichar(id),
-              this->debug_str(id).string());
+      std::ostringstream stream;
+      stream.imbue(std::locale::classic());
+      stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
+              min_bottom << ',' << max_bottom << ',' <<
+              min_top << ',' << max_top << ',' <<
+              width << ',' << width_sd << ',' <<
+              bearing << ',' << bearing_sd << ',' <<
+              advance << ',' << advance_sd << ' ' <<
+              this->get_script_from_script_id(this->get_script(id)) << ' ' <<
+              this->get_other_case(id) << ' ' <<
+              this->get_direction(id) << ' ' <<
+              this->get_mirror(id) << ' ' <<
+              this->get_normed_unichar(id) << "\t# " <<
+              this->debug_str(id).string() << '\n';
+      *str += stream.str().c_str();
     }
-    *str += buffer;
   }
   return true;
 }

From 331cc84d8d790f3d368045a80061793557a201aa Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Thu, 16 May 2019 11:39:38 +0200
Subject: [PATCH 4/4] Remove assertions for unsupported locale settings

The latest code passed all unittests with locale de_DE.UTF-8
and has fixed the locale issues which were reported on GitHub.
Therefore the assertions can be removed.

Any remaining locale issue will be fixed when it is identified.
To help find such remaining issues, debug code now uses the
user's locale settings instead of the default "C" locale for all
executables which use TessBaseAPI.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/api/baseapi.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
index 61b38f8e..5311a17d 100644
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@@ -41,7 +41,6 @@
 #include <unistd.h>
 #endif  // _WIN32
 
-#include <clocale>             // for LC_ALL, LC_CTYPE, LC_NUMERIC
 #include <cmath>               // for round, M_PI
 #include <cstdint>             // for int32_t
 #include <cstring>             // for strcmp, strcpy
@@ -209,13 +208,16 @@ TessBaseAPI::TessBaseAPI()
       rect_height_(0),
       image_width_(0),
       image_height_(0) {
-  const char *locale;
-  locale = std::setlocale(LC_ALL, nullptr);
-  ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
-  locale = std::setlocale(LC_CTYPE, nullptr);
-  ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
-  locale = std::setlocale(LC_NUMERIC, nullptr);
-  ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
+#if defined(DEBUG)
+  // The Tesseract executables would use the "C" locale by default,
+  // but other software which is linked against the Tesseract library
+  // typically uses the locale from the user's environment.
+  // Here the default is overridden to allow debugging of potential
+  // problems caused by the locale settings.
+
+  // Use the current locale if building debug code.
+  std::locale::global(std::locale(""));
+#endif
 }
 
 TessBaseAPI::~TessBaseAPI() {