Merge pull request #2437 from stweil/locale-fix

Fix some unit tests when running with locale de_DE.UTF-8
2024-11-30 23:49:05 +08:00 · 2019-05-16 17:02:02 +02:00 · 2019-05-16 17:02:02 +02:00 · b124a5f6ca
commit b124a5f6ca
parent 4b397c70cc 331cc84d8d
3 changed files with 94 additions and 55 deletions
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@ -41,7 +41,6 @@
 #include <unistd.h>
 #endif  // _WIN32

-#include <clocale>             // for LC_ALL, LC_CTYPE, LC_NUMERIC
 #include <cmath>               // for round, M_PI
 #include <cstdint>             // for int32_t
 #include <cstring>             // for strcmp, strcpy
@ -209,13 +208,16 @@ TessBaseAPI::TessBaseAPI()
      rect_height_(0),
      image_width_(0),
      image_height_(0) {
-  const char *locale;
-  locale = std::setlocale(LC_ALL, nullptr);
-  ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
-  locale = std::setlocale(LC_CTYPE, nullptr);
-  ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
-  locale = std::setlocale(LC_NUMERIC, nullptr);
-  ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
+#if defined(DEBUG)
+  // The Tesseract executables use the "C" locale by default,
+  // but other software that links against the Tesseract library
+  // typically uses the locale from the user's environment.
+  // Here the default is overridden to allow debugging of potential
+  // problems caused by the locale settings.
+
+  // In debug builds, switch the global locale to the user's environment locale.
+  std::locale::global(std::locale(""));
+#endif
 }

 TessBaseAPI::~TessBaseAPI() {
--- a/src/ccutil/unicharset.cpp
+++ b/src/ccutil/unicharset.cpp
@ -22,6 +22,9 @@
 #include <cassert>
 #include <cstdio>
 #include <cstring>
+#include <iomanip>    // for std::setw
+#include <locale>     // for std::locale::classic
+#include <sstream>    // for std::istringstream, std::ostringstream

 #include "params.h"
 #include "serialis.h"
@ -705,18 +708,24 @@ bool UNICHARSET::save_to_string(STRING *str) const {
      snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id));
+      *str += buffer;
    } else {
-      snprintf(buffer, kFileBufSize,
-              "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
-              this->id_to_unichar(id), properties,
-              min_bottom, max_bottom, min_top, max_top, width, width_sd,
-              bearing, bearing_sd, advance, advance_sd,
-              this->get_script_from_script_id(this->get_script(id)),
-              this->get_other_case(id), this->get_direction(id),
-              this->get_mirror(id), this->get_normed_unichar(id),
-              this->debug_str(id).string());
+      std::ostringstream stream;
+      stream.imbue(std::locale::classic());
+      stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
+              min_bottom << ',' << max_bottom << ',' <<
+              min_top << ',' << max_top << ',' <<
+              width << ',' << width_sd << ',' <<
+              bearing << ',' << bearing_sd << ',' <<
+              advance << ',' << advance_sd << ' ' <<
+              this->get_script_from_script_id(this->get_script(id)) << ' ' <<
+              this->get_other_case(id) << ' ' <<
+              this->get_direction(id) << ' ' <<
+              this->get_mirror(id) << ' ' <<
+              this->get_normed_unichar(id) << "\t# " <<
+              this->debug_str(id).string() << '\n';
+      *str += stream.str().c_str();
    }
-    *str += buffer;
  }
  return true;
 }
@ -815,41 +824,64 @@ bool UNICHARSET::load_via_fgets(
    float advance = 0.0f;
    float advance_sd = 0.0f;
    // TODO(eger): check that this default it ok
-    // after enabling BiDi iterator for Arabic+Cube.
+    // after enabling BiDi iterator for Arabic.
    int direction = UNICHARSET::U_LEFT_TO_RIGHT;
-    UNICHAR_ID other_case = id;
-    UNICHAR_ID mirror = id;
-    char normed[64];
-    int v = -1;
-    if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr ||
-        ((v = sscanf(buffer,
-                     "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
-                     unichar, &properties,
-                     &min_bottom, &max_bottom, &min_top, &max_top,
-                     &width, &width_sd, &bearing, &bearing_sd,
-                     &advance, &advance_sd, script, &other_case,
-                     &direction, &mirror, normed)) != 17 &&
-         (v = sscanf(buffer,
-                     "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
-                     unichar, &properties,
-                     &min_bottom, &max_bottom, &min_top, &max_top,
-                     &width, &width_sd, &bearing, &bearing_sd,
-                     &advance, &advance_sd, script, &other_case,
-                     &direction, &mirror)) != 16 &&
-          (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
-                      unichar, &properties,
-                      &min_bottom, &max_bottom, &min_top, &max_top,
-                      script, &other_case, &direction, &mirror)) != 10 &&
-          (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
-                      &min_bottom, &max_bottom, &min_top, &max_top,
-                      script, &other_case)) != 8 &&
-          (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
-                      script, &other_case)) != 4 &&
-          (v = sscanf(buffer, "%s %x %63s",
-                      unichar, &properties, script)) != 3 &&
-          (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
+    UNICHAR_ID other_case = unicharset_size;
+    UNICHAR_ID mirror = unicharset_size;
+    if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr) {
      return false;
    }
+    char normed[64];
+    normed[0] = '\0';
+    std::istringstream stream(buffer);
+    stream.imbue(std::locale::classic());
+    // Example input line: 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标  # 标 [6807 ]x
+    //stream.flags(std::ios::hex);
+    stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
+    //stream.flags(std::ios::dec);
+    if (stream.fail()) {
+      fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
+      return false;
+    }
+    auto position = stream.tellg();
+    stream.seekg(position);
+    char c1, c2, c3, c4, c5, c6, c7, c8, c9;
+    stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
+      width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
+      advance >> c9 >> advance_sd >> std::setw(63) >> script >>
+      other_case >> direction >> mirror >> std::setw(63) >> normed;
+    if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
+        c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
+      stream.clear();
+      stream.seekg(position);
+      stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
+      width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
+      advance >> c9 >> advance_sd >> std::setw(63) >> script >>
+      other_case >> direction >> mirror;
+      if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
+          c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
+        stream.clear();
+        stream.seekg(position);
+        stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
+        std::setw(63) >> script >> other_case >> direction >> mirror;
+        if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
+          stream.clear();
+          stream.seekg(position);
+          stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
+          std::setw(63) >> script >> other_case;
+          if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
+            stream.clear();
+            stream.seekg(position);
+            stream >> std::setw(63) >> script >> other_case;
+            if (stream.fail()) {
+              stream.clear();
+              stream.seekg(position);
+              stream >> std::setw(63) >> script;
+            }
+          }
+        }
+      }
+    }

    // Skip fragments if needed.
    CHAR_FRAGMENT *frag = nullptr;
@ -880,9 +912,9 @@ bool UNICHARSET::load_via_fgets(
    this->set_advance_stats(id, advance, advance_sd);
    this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
    this->set_other_case(
-        id, (v > 3 && other_case < unicharset_size) ? other_case : id);
-    this->set_mirror(id, (v > 8 && mirror < unicharset_size) ? mirror : id);
-    this->set_normed(id, (v>16) ? normed : unichar);
+        id, (other_case < unicharset_size) ? other_case : id);
+    this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
+    this->set_normed(id, normed[0] != '\0' ? normed : unichar);
  }
  post_load_setup();
  return true;
--- a/src/classify/normmatch.cpp
+++ b/src/classify/normmatch.cpp
@ -21,6 +21,7 @@

 #include <cstdio>
 #include <cmath>
+#include <sstream>          // for std::istringstream

 #include "classify.h"
 #include "clusttool.h"
@ -113,7 +114,7 @@ float Classify::ComputeNormMatch(CLASS_ID ClassId,
      feature.Params[CharNormRx] * 8000.0 +
      feature.Params[CharNormRy] *
      feature.Params[CharNormRy] * 8000.0);
-    return (1.0 - NormEvidenceOf (Match));
+    return (1.0 - NormEvidenceOf(Match));
  }

  BestMatch = FLT_MAX;
@ -209,7 +210,11 @@ NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
  const int kMaxLineSize = 100;
  char line[kMaxLineSize];
  while (fp->FGets(line, kMaxLineSize) != nullptr) {
-    if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
+    std::istringstream stream(line);
+    stream >> unichar >> NumProtos;
+    if (stream.fail()) {
+      continue;
+    }
    if (unicharset.contains_unichar(unichar)) {
      unichar_id = unicharset.unichar_to_id(unichar);
      Protos = NormProtos->Protos[unichar_id];