Merge pull request #2437 from stweil/locale-fix

Fix some unittests with locale de_DE.UTF-8
This commit is contained in:
zdenop 2019-05-16 17:02:02 +02:00 committed by GitHub
commit b124a5f6ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 94 additions and 55 deletions

View File

@ -41,7 +41,6 @@
#include <unistd.h>
#endif // _WIN32
#include <clocale> // for LC_ALL, LC_CTYPE, LC_NUMERIC
#include <cmath> // for round, M_PI
#include <cstdint> // for int32_t
#include <cstring> // for strcmp, strcpy
@ -209,13 +208,16 @@ TessBaseAPI::TessBaseAPI()
rect_height_(0),
image_width_(0),
image_height_(0) {
const char *locale;
locale = std::setlocale(LC_ALL, nullptr);
ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
locale = std::setlocale(LC_CTYPE, nullptr);
ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
locale = std::setlocale(LC_NUMERIC, nullptr);
ASSERT_HOST(!strcmp(locale, "C") || !strcmp(locale, "C.UTF-8"));
#if defined(DEBUG)
// The Tesseract executables would use the "C" locale by default,
// but other software which is linked against the Tesseract library
// typically uses the locale from the user's environment.
// Here the default is overridden to allow debugging of potential
// problems caused by the locale settings.
// Use the current locale if building debug code.
std::locale::global(std::locale(""));
#endif
}
TessBaseAPI::~TessBaseAPI() {

View File

@ -22,6 +22,9 @@
#include <cassert>
#include <cstdio>
#include <cstring>
#include <iomanip> // for std::setw
#include <locale> // for std::locale::classic
#include <sstream> // for std::istringstream, std::ostringstream
#include "params.h"
#include "serialis.h"
@ -705,18 +708,24 @@ bool UNICHARSET::save_to_string(STRING *str) const {
snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
this->get_script_from_script_id(this->get_script(id)),
this->get_other_case(id));
*str += buffer;
} else {
snprintf(buffer, kFileBufSize,
"%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
this->id_to_unichar(id), properties,
min_bottom, max_bottom, min_top, max_top, width, width_sd,
bearing, bearing_sd, advance, advance_sd,
this->get_script_from_script_id(this->get_script(id)),
this->get_other_case(id), this->get_direction(id),
this->get_mirror(id), this->get_normed_unichar(id),
this->debug_str(id).string());
std::ostringstream stream;
stream.imbue(std::locale::classic());
stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
min_bottom << ',' << max_bottom << ',' <<
min_top << ',' << max_top << ',' <<
width << ',' << width_sd << ',' <<
bearing << ',' << bearing_sd << ',' <<
advance << ',' << advance_sd << ' ' <<
this->get_script_from_script_id(this->get_script(id)) << ' ' <<
this->get_other_case(id) << ' ' <<
this->get_direction(id) << ' ' <<
this->get_mirror(id) << ' ' <<
this->get_normed_unichar(id) << "\t# " <<
this->debug_str(id).string() << '\n';
*str += stream.str().c_str();
}
*str += buffer;
}
return true;
}
@ -815,41 +824,64 @@ bool UNICHARSET::load_via_fgets(
float advance = 0.0f;
float advance_sd = 0.0f;
// TODO(eger): check that this default is ok
// after enabling BiDi iterator for Arabic+Cube.
// after enabling BiDi iterator for Arabic.
int direction = UNICHARSET::U_LEFT_TO_RIGHT;
UNICHAR_ID other_case = id;
UNICHAR_ID mirror = id;
char normed[64];
int v = -1;
if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr ||
((v = sscanf(buffer,
"%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
unichar, &properties,
&min_bottom, &max_bottom, &min_top, &max_top,
&width, &width_sd, &bearing, &bearing_sd,
&advance, &advance_sd, script, &other_case,
&direction, &mirror, normed)) != 17 &&
(v = sscanf(buffer,
"%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
unichar, &properties,
&min_bottom, &max_bottom, &min_top, &max_top,
&width, &width_sd, &bearing, &bearing_sd,
&advance, &advance_sd, script, &other_case,
&direction, &mirror)) != 16 &&
(v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
unichar, &properties,
&min_bottom, &max_bottom, &min_top, &max_top,
script, &other_case, &direction, &mirror)) != 10 &&
(v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
&min_bottom, &max_bottom, &min_top, &max_top,
script, &other_case)) != 8 &&
(v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
script, &other_case)) != 4 &&
(v = sscanf(buffer, "%s %x %63s",
unichar, &properties, script)) != 3 &&
(v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
UNICHAR_ID other_case = unicharset_size;
UNICHAR_ID mirror = unicharset_size;
if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr) {
return false;
}
char normed[64];
normed[0] = '\0';
std::istringstream stream(buffer);
stream.imbue(std::locale::classic());
// 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标 # 标 [6807 ]x
//stream.flags(std::ios::hex);
stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
//stream.flags(std::ios::dec);
if (stream.fail()) {
fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
return false;
}
auto position = stream.tellg();
stream.seekg(position);
char c1, c2, c3, c4, c5, c6, c7, c8, c9;
stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
advance >> c9 >> advance_sd >> std::setw(63) >> script >>
other_case >> direction >> mirror >> std::setw(63) >> normed;
if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
stream.clear();
stream.seekg(position);
stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
advance >> c9 >> advance_sd >> std::setw(63) >> script >>
other_case >> direction >> mirror;
if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
stream.clear();
stream.seekg(position);
stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
std::setw(63) >> script >> other_case >> direction >> mirror;
if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
stream.clear();
stream.seekg(position);
stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
std::setw(63) >> script >> other_case;
if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
stream.clear();
stream.seekg(position);
stream >> std::setw(63) >> script >> other_case;
if (stream.fail()) {
stream.clear();
stream.seekg(position);
stream >> std::setw(63) >> script;
}
}
}
}
}
// Skip fragments if needed.
CHAR_FRAGMENT *frag = nullptr;
@ -880,9 +912,9 @@ bool UNICHARSET::load_via_fgets(
this->set_advance_stats(id, advance, advance_sd);
this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
this->set_other_case(
id, (v > 3 && other_case < unicharset_size) ? other_case : id);
this->set_mirror(id, (v > 8 && mirror < unicharset_size) ? mirror : id);
this->set_normed(id, (v>16) ? normed : unichar);
id, (other_case < unicharset_size) ? other_case : id);
this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
this->set_normed(id, normed[0] != '\0' ? normed : unichar);
}
post_load_setup();
return true;

View File

@ -21,6 +21,7 @@
#include <cstdio>
#include <cmath>
#include <sstream> // for std::istringstream
#include "classify.h"
#include "clusttool.h"
@ -113,7 +114,7 @@ float Classify::ComputeNormMatch(CLASS_ID ClassId,
feature.Params[CharNormRx] * 8000.0 +
feature.Params[CharNormRy] *
feature.Params[CharNormRy] * 8000.0);
return (1.0 - NormEvidenceOf (Match));
return (1.0 - NormEvidenceOf(Match));
}
BestMatch = FLT_MAX;
@ -209,7 +210,11 @@ NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
const int kMaxLineSize = 100;
char line[kMaxLineSize];
while (fp->FGets(line, kMaxLineSize) != nullptr) {
if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
std::istringstream stream(line);
stream >> unichar >> NumProtos;
if (stream.fail()) {
continue;
}
if (unicharset.contains_unichar(unichar)) {
unichar_id = unicharset.unichar_to_id(unichar);
Protos = NormProtos->Protos[unichar_id];