Changed the way unicharsets are handled to allow support for the ™ character. Can find the issue where it was requested.

2025-06-12 21:53:25 +08:00 · 2017-07-24 11:45:57 -07:00 · 2017-07-24 11:45:57 -07:00 · b0ead95d64
commit b0ead95d64
parent 4efc539f51
9 changed files with 177 additions and 112 deletions
--- a/ccstruct/ratngs.cpp
+++ b/ccstruct/ratngs.cpp
@ -24,6 +24,7 @@

 #include "ratngs.h"

+#include <string>
 #include "blobs.h"
 #include "callcpp.h"
 #include "genericvector.h"
@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
    : unicharset_(&unicharset){
  GenericVector<UNICHAR_ID> encoding;
  GenericVector<char> lengths;
-  if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
+  string cleaned = unicharset.CleanupString(src_string);
+  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
+                               NULL)) {
    lengths.push_back('\0');
    STRING src_lengths = &lengths[0];
-    this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
+    this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
  } else {  // There must have been an invalid unichar in the string.
    this->init(8);
    this->make_bad();
--- a/ccutil/ambigs.cpp
+++ b/ccutil/ambigs.cpp
@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
  // Insert the corresponding correct ngram into the unicharset.
  // Unicharset code assumes that the "base" ngram is inserted into
  // the unicharset before fragments of this ngram are inserted.
-  unicharset->unichar_insert(replacement_string);
+  unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
  ambig_spec->correct_ngram_id =
    unicharset->unichar_to_id(replacement_string);
  if (replacement_ambig_part_size > 1) {
@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
    } else {
      STRING frag_str = CHAR_FRAGMENT::to_string(
          replacement_string, i, test_ambig_part_size, false);
-      unicharset->unichar_insert(frag_str.string());
+      unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
      unichar_id = unicharset->unichar_to_id(frag_str.string());
    }
    ambig_spec->correct_fragments[i] = unichar_id;
--- a/ccutil/unicharcompress.cpp
+++ b/ccutil/unicharcompress.cpp
@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
  direct_set.clear();
  radicals.clear();
  // Always keep space as 0;
-  direct_set.unichar_insert(" ");
+  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
      if (it != radical_map.end()) {
        // This is Han. Convert to radical, stroke, index.
        if (!radicals.contains_unichar(it->second.radical.string())) {
-          radicals.unichar_insert(it->second.radical.string());
+          radicals.unichar_insert(it->second.radical.string(),
+                                  OldUncleanUnichars::kTrue);
        }
        int radical = radicals.unichar_to_id(it->second.radical.string());
        int num_strokes = it->second.num_strokes;
--- a/ccutil/unicharmap.cpp
+++ b/ccutil/unicharmap.cpp
@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
    delete[] nodes;
 }

-// Search the given unichar representation in the tree. Each character in the
-// string is interpreted as an index in an array of nodes.
-UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
-  const char* current_char = unichar_repr;
-  UNICHARMAP_NODE* current_nodes = nodes;
-
-  assert(*unichar_repr != '\0');
-
-  do {
-    if (*(current_char + 1) == '\0')
-      return current_nodes[static_cast<unsigned char>(*current_char)].id;
-    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-  } while (true);
-}
-
 // Search the given unichar representation in the tree, using length characters
 // from it maximum. Each character in the string is interpreted as an index in
 // an array of nodes.
 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
                                     int length) const {
-  const char* current_char = unichar_repr;
  UNICHARMAP_NODE* current_nodes = nodes;

  assert(*unichar_repr != '\0');
  assert(length > 0 && length <= UNICHAR_LEN);

+  int index = 0;
+  if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
  do {
-    if (length == 1 || *(current_char + 1) == '\0')
-      return current_nodes[static_cast<unsigned char>(*current_char)].id;
+    if (index + 1 >= length || unichar_repr[index + 1] == '\0')
+      return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-    --length;
+        current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
+    ++index;
  } while (true);
 }

@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
 // string is interpreted as an index in an array of nodes.
 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
  const char* current_char = unichar_repr;
+  if (*current_char == '\0') return;
  UNICHARMAP_NODE** current_nodes_pointer = &nodes;
-
-  assert(*unichar_repr != '\0');
-  assert(id >= 0);
-
  do {
    if (*current_nodes_pointer == 0)
      *current_nodes_pointer = new UNICHARMAP_NODE[256];
-    if (*(current_char + 1) == '\0') {
+    if (current_char[1] == '\0') {
      (*current_nodes_pointer)
          [static_cast<unsigned char>(*current_char)].id = id;
      return;
@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
  } while (true);
 }

-// Search the given unichar representation in the tree. Each character in the
-// string is interpreted as an index in an array of nodes. Stop once the tree
-// does not have anymore nodes or once we found the right unichar_repr.
-bool UNICHARMAP::contains(const char* const unichar_repr) const {
-  if (unichar_repr == NULL || *unichar_repr == '\0') return false;
-
-  const char* current_char = unichar_repr;
-  UNICHARMAP_NODE* current_nodes = nodes;
-
-  while (current_nodes != 0 && *(current_char + 1) != '\0') {
-    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-  }
-  return current_nodes != 0 && *(current_char + 1) == '\0' &&
-      current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
-}
-
 // Search the given unichar representation in the tree, using length characters
 // from it maximum. Each character in the string is interpreted as an index in
 // an array of nodes. Stop once the tree does not have anymore nodes or once we
@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
                          int length) const {
  if (unichar_repr == NULL || *unichar_repr == '\0') return false;
  if (length <= 0 || length > UNICHAR_LEN) return false;
-
-  const char* current_char = unichar_repr;
+  int index = 0;
+  if (index >= length || unichar_repr[index] == '\0') return false;
  UNICHARMAP_NODE* current_nodes = nodes;

-  while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
+  while (current_nodes != 0 && index + 1 < length &&
+         unichar_repr[index + 1] != '\0') {
    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    --length;
-    ++current_char;
+        current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
+    ++index;
  }
-  return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
-      current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
+  return current_nodes != 0 &&
+         (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
+         current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
 }

 // Return the minimum number of characters that must be used from this string
 // to obtain a match in the UNICHARMAP.
 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
  const char* current_char = unichar_repr;
+  if (*current_char == '\0') return 0;
  UNICHARMAP_NODE* current_nodes = nodes;

  while (current_nodes != NULL && *current_char != '\0') {
--- a/ccutil/unicharmap.h
+++ b/ccutil/unicharmap.h
@ -36,21 +36,12 @@ class UNICHARMAP {
  // with the given id. The length of the representation MUST be non-zero.
  void insert(const char* const unichar_repr, UNICHAR_ID id);

-  // Return the id associated with the given unichar representation,
-  // this representation MUST exist within the UNICHARMAP.
-  // The length of the representation MUST be non-zero.
-  UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
-
  // Return the id associated with the given unichar representation,
  // this representation MUST exist within the UNICHARMAP. The first
  // length characters (maximum) from unichar_repr are used. The length
  // MUST be non-zero.
  UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;

-  // Return true if the given unichar representation is already present in the
-  // UNICHARMAP. The length of the representation MUST be non-zero.
-  bool contains(const char* const unichar_repr) const;
-
  // Return true if the given unichar representation is already present in the
  // UNICHARMAP. The first length characters (maximum) from unichar_repr are
  // used. The length MUST be non-zero.
--- a/ccutil/unicharset.cpp
+++ b/ccutil/unicharset.cpp
@ -67,6 +67,15 @@ const char* UNICHARSET::kCustomLigatures[][2] = {
  {NULL, NULL}
 };

+// List of mappings to make when ingesting strings from the outside.
+// The substitutions clean up text that should exist for rendering of
+// synthetic data, but not in the recognition set.
+const char* UNICHARSET::kCleanupMaps[][2] = {
+    {"\u0640", ""},    // TATWEEL is deleted.
+    {"\ufb01", "fi"},  // fi ligature->fi pair.
+    {"\ufb02", "fl"},  // fl ligature->fl pair.
+    {nullptr, nullptr}};
+
 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
 const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
    " ",
@ -196,15 +205,21 @@ void UNICHARSET::reserve(int unichars_number) {

 UNICHAR_ID
 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
-  return ids.contains(unichar_repr) ?
-    ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
+  string cleaned =
+      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
+  return ids.contains(cleaned.data(), cleaned.size())
+             ? ids.unichar_to_id(cleaned.data(), cleaned.size())
+             : INVALID_UNICHAR_ID;
 }

 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                     int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
-  return ids.contains(unichar_repr, length) ?
-    ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
+  string cleaned(unichar_repr, length);
+  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
+  return ids.contains(cleaned.data(), cleaned.size())
+             ? ids.unichar_to_id(cleaned.data(), cleaned.size())
+             : INVALID_UNICHAR_ID;
 }

 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
@ -235,6 +250,9 @@ bool UNICHARSET::encodable_string(const char *str,
 // the rest of the string is still encoded.
 // If lengths is not NULL, then it is filled with the corresponding
 // byte length of each encoded UNICHAR_ID.
+// WARNING: Caller must guarantee that str has already been cleaned of codes
+// that do not belong in the unicharset, or encoding may fail.
+// Use CleanupString to perform the cleaning.
 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
                               GenericVector<UNICHAR_ID>* encoding,
                               GenericVector<char>* lengths,
@ -429,7 +447,7 @@ void UNICHARSET::CopyFrom(const UNICHARSET& src) {
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
-    unichar_insert(utf8);
+    unichar_insert_backwards_compatible(utf8);
    unichars[ch].properties.ExpandRangesFrom(src_props);
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
@ -445,24 +463,13 @@ void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
-    if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
-      // Only use fully valid entries.
-      tprintf("Bad properties for index %d, char %s: "
-              "%d,%d %d,%d %g,%g %g,%g %g,%g\n",
-              ch, utf8, src_props.min_bottom, src_props.max_bottom,
-              src_props.min_top, src_props.max_top,
-              src_props.width, src_props.width_sd,
-              src_props.bearing, src_props.bearing_sd,
-              src_props.advance, src_props.advance_sd);
-      continue;
-    }
    int id = size_used;
    if (contains_unichar(utf8)) {
      id = unichar_to_id(utf8);
      // Just expand current ranges.
      unichars[id].properties.ExpandRangesFrom(src_props);
    } else {
-      unichar_insert(utf8);
+      unichar_insert_backwards_compatible(utf8);
      unichars[id].properties.SetRangesEmpty();
    }
  }
@ -613,40 +620,55 @@ char UNICHARSET::get_chartype(UNICHAR_ID id) const {
  return 0;
 }

-void UNICHARSET::unichar_insert(const char* const unichar_repr) {
-  if (!ids.contains(unichar_repr)) {
-    if (strlen(unichar_repr) > UNICHAR_LEN) {
-      fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
-              int(strlen(unichar_repr)), unichar_repr);
+// Adds unichar_repr to the set unless it is already present.
+// If old_style is kTrue, the set is switched permanently into old-style mode
+// (old_style_included_) and the string is stored verbatim. Otherwise the
+// string is first passed through CleanupString, and it is NOT added if the
+// cleaned string is empty or can already be encoded by the existing set.
+// Representations longer than UNICHAR_LEN bytes are rejected with a message
+// on stderr, as they were by the strlen check this loop replaces.
+void UNICHARSET::unichar_insert(const char* const unichar_repr,
+                                OldUncleanUnichars old_style) {
+  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
+  string cleaned =
+      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
+  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
+    const char* str = cleaned.c_str();
+    GenericVector<int> encoding;
+    if (!old_style_included_ &&
+        encode_string(str, true, &encoding, nullptr, nullptr))
      return;
-    }
    if (size_used == size_reserved) {
      if (size_used == 0)
        reserve(8);
      else
        reserve(2 * size_used);
    }
-
-    strcpy(unichars[size_used].representation, unichar_repr);
+    int index = 0;
+    do {
+      // Stop at UNICHAR_LEN bytes so that the terminator written after the
+      // loop lands at index UNICHAR_LEN at most. Testing with '>' would let
+      // a string of UNICHAR_LEN + 1 bytes through and push the terminator
+      // one slot further out.
+      if (index >= UNICHAR_LEN) {
+        fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
+                unichar_repr);
+        return;
+      }
+      unichars[size_used].representation[index++] = *str++;
+    } while (*str != '\0');
+    unichars[size_used].representation[index] = '\0';
    this->set_script(size_used, null_script);
    // If the given unichar_repr represents a fragmented character, set
    // fragment property to a pointer to CHAR_FRAGMENT class instance with
    // information parsed from the unichar representation. Use the script
    // of the base unichar for the fragmented character if possible.
-    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
+    CHAR_FRAGMENT* frag =
+        CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
    this->unichars[size_used].properties.fragment = frag;
    if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
      this->unichars[size_used].properties.script_id =
        this->get_script(frag->get_unichar());
    }
    this->unichars[size_used].properties.enabled = true;
-    ids.insert(unichar_repr, size_used);
+    ids.insert(unichars[size_used].representation, size_used);
    ++size_used;
  }
 }

 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
-  return ids.contains(unichar_repr);
+  string cleaned =
+      old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
+  return ids.contains(cleaned.data(), cleaned.size());
 }

 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
@ -654,7 +676,9 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr,
  if (length == 0) {
    return false;
  }
-  return ids.contains(unichar_repr, length);
+  string cleaned(unichar_repr, length);
+  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
+  return ids.contains(cleaned.data(), cleaned.size());
 }

 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
@ -840,7 +864,7 @@ bool UNICHARSET::load_via_fgets(
    if (strcmp(unichar, "NULL") == 0)
      this->unichar_insert(" ");
    else
-      this->unichar_insert(unichar);
+      this->unichar_insert_backwards_compatible(unichar);

    this->set_isalpha(id, properties & ISALPHA_MASK);
    this->set_islower(id, properties & ISLOWER_MASK);
@ -1088,3 +1112,32 @@ int UNICHARSET::get_script_id_from_name(const char* script_name) const {
  }
  return 0;  // 0 is always the null_script
 }
+
+// Removes/replaces content that belongs in rendered text, but not in the
+// unicharset, by applying the kCleanupMaps substitutions.
+// Processes at most length bytes of utf8_str, stopping early at a NUL.
+// A key is matched only if it lies entirely within the remaining length, and
+// a matched key consumes its full byte count from length, so bytes beyond
+// the caller-supplied span are never compared or copied.
+// Returns an empty string if length <= 0.
+/* static */
+string UNICHARSET::CleanupString(const char* utf8_str, int length) {
+  string result;
+  // Guard before reserve(): a negative int would convert to a huge size_t.
+  if (length <= 0) return result;
+  result.reserve(length);
+  while (length > 0 && *utf8_str != '\0') {
+    int key_index = 0;
+    int match = 0;
+    const char* key;
+    while ((key = kCleanupMaps[key_index][0]) != nullptr) {
+      match = 0;
+      while (key[match] != '\0' && match < length &&
+             key[match] == utf8_str[match]) {
+        ++match;
+      }
+      if (key[match] == '\0') break;  // Whole key matched within the span.
+      ++key_index;
+    }
+    if (key == nullptr) {
+      // No substitution applies: copy the byte through unchanged.
+      result.push_back(*utf8_str);
+      ++utf8_str;
+      --length;
+    } else {
+      // Emit the replacement (possibly empty) and skip the whole key.
+      result.append(kCleanupMaps[key_index][1]);
+      utf8_str += match;
+      length -= match;
+    }
+  }
+  return result;
+}
--- a/ccutil/unicharset.h
+++ b/ccutil/unicharset.h
@ -39,6 +39,13 @@ enum SpecialUnicharCodes {
  SPECIAL_UNICHAR_CODES_COUNT
 };

+// Boolean flag for unichar_insert. It's a bit of a double negative to allow
+// the default value to be false.
+enum class OldUncleanUnichars {
+  kFalse,
+  kTrue,
+};
+
 class CHAR_FRAGMENT {
 public:
  // Minimum number of characters used for fragment representation.
@ -190,7 +197,7 @@ class UNICHARSET {
  // Use encode_string in preference to repeatedly calling step.
  int step(const char* str) const;

-  // Return whether the given UTF-8 string is encodable with this UNICHARSET.
+  // Returns true if the given UTF-8 string is encodable with this UNICHARSET.
  // If not encodable, write the first byte offset which cannot be converted
  // into the second (return) argument.
  bool encodable_string(const char *str, int *first_bad_position) const;
@ -207,6 +214,9 @@ class UNICHARSET {
  // If encoded_length is not NULL then on return it contains the length of
  // str that was encoded. (if give_up_on_failure the location of the first
  // failure, otherwise strlen(str).)
+  // WARNING: Caller must guarantee that str has already been cleaned of codes
+  // that do not belong in the unicharset, or encoding may fail.
+  // Use CleanupString to perform the cleaning.
  bool encode_string(const char* str, bool give_up_on_failure,
                     GenericVector<UNICHAR_ID>* encoding,
                     GenericVector<char>* lengths,
@ -226,6 +236,13 @@ class UNICHARSET {
  // by its hex unicodes.
  static STRING debug_utf8_str(const char* str);

+  // Removes/replaces content that belongs in rendered text, but not in the
+  // unicharset.
+  static string CleanupString(const char* utf8_str) {
+    return CleanupString(utf8_str, strlen(utf8_str));
+  }
+  static string CleanupString(const char* utf8_str, int length);
+
  // Return a STRING containing debug information on the unichar, including
  // the id_to_unichar, its hex unicodes and the properties.
  STRING debug_str(UNICHAR_ID id) const;
@ -233,8 +250,29 @@ class UNICHARSET {
    return debug_str(unichar_to_id(unichar_repr));
  }

-  // Add a unichar representation to the set.
-  void unichar_insert(const char* const unichar_repr);
+  // Adds a unichar representation to the set. If old_style is true, then
+  // TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
+  // characters are ignored/skipped as if they don't exist and n-grams that
+  // can already be encoded are not added.
+  void unichar_insert(const char* const unichar_repr,
+                      OldUncleanUnichars old_style);
+  void unichar_insert(const char* const unichar_repr) {
+    unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
+  }
+  // Adds a unichar representation to the set. Avoids setting old_style to true,
+  // unless it is necessary to make the new unichar get added.
+  void unichar_insert_backwards_compatible(const char* const unichar_repr) {
+    string cleaned = CleanupString(unichar_repr);
+    if (cleaned != unichar_repr) {
+      unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
+    } else {
+      int old_size = size();
+      unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
+      if (size() == old_size) {
+        unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
+      }
+    }
+  }

  // Return true if the given unichar id exists within the set.
  // Relies on the fact that unichar ids are contiguous in the unicharset.
@ -282,6 +320,7 @@ class UNICHARSET {
    top_bottom_set_ = false;
    script_has_upper_lower_ = false;
    script_has_xheight_ = false;
+    old_style_included_ = false;
    null_sid_ = 0;
    common_sid_ = 0;
    latin_sid_ = 0;
@ -743,7 +782,7 @@ class UNICHARSET {
  // unichar representation represents a character fragment.
  const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
    if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
-        !ids.contains(unichar_repr)) {
+        !ids.contains(unichar_repr, false)) {
      return NULL;
    }
    return get_fragment(unichar_to_id(unichar_repr));
@ -965,6 +1004,11 @@ class UNICHARSET {
  bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
                      bool skip_fragments);

+  // List of mappings to make when ingesting strings from the outside.
+  // The substitutions clean up text that should exist for rendering of
+  // synthetic data, but not in the recognition set.
+  static const char* kCleanupMaps[][2];
+
  UNICHAR_SLOT* unichars;
  UNICHARMAP ids;
  int size_used;
@ -980,6 +1024,8 @@ class UNICHARSET {
  // True if the unicharset has a significant mean-line with significant
  // ascenders above that.
  bool script_has_xheight_;
+  // True if the set contains chars that would be changed by the cleanup.
+  bool old_style_included_;

  // A few convenient script name-to-id mapping without using hash.
  // These are initialized when unicharset file is loaded.  Anything
--- a/lstm/lstmtrainer.cpp
+++ b/lstm/lstmtrainer.cpp
@ -170,6 +170,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
  tprintf("Training parameters:\n  Debug interval = %d,"
          " weights = %g, learning rate = %g, momentum=%g\n",
          debug_interval_, weight_range_, learning_rate_, momentum_);
+  tprintf("null char=%d\n", null_char_);
  return true;
 }

@ -733,7 +734,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
  GenericVector<int> internal_labels;
  labels->truncate(0);
  if (!simple_text) labels->push_back(null_char);
-  if (unicharset.encode_string(str.string(), true, &internal_labels, NULL,
+  string cleaned = unicharset.CleanupString(str.string());
+  if (unicharset.encode_string(cleaned.c_str(), true, &internal_labels, NULL,
                               &err_index)) {
    bool success = true;
    for (int i = 0; i < internal_labels.size(); ++i) {
@ -759,8 +761,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
    if (success) return true;
  }
  tprintf("Encoding of string failed! Failure bytes:");
-  while (err_index < str.length()) {
-    tprintf(" %x", str[err_index++]);
+  while (err_index < cleaned.size()) {
+    tprintf(" %x", cleaned[err_index++]);
  }
  tprintf("\n");
  return false;
@ -813,8 +815,9 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData* trainingdata,
      training_iteration() % debug_interval_ == 0;
  GenericVector<int> truth_labels;
  if (!EncodeString(trainingdata->transcription(), &truth_labels)) {
-    tprintf("Can't encode transcription: %s\n",
-            trainingdata->transcription().string());
+    tprintf("Can't encode transcription: '%s' in language '%s'\n",
+            trainingdata->transcription().string(),
+            trainingdata->language().string());
    return UNENCODABLE;
  }
  int w = 0;
--- a/training/text2image.cpp
+++ b/training/text2image.cpp
@ -409,9 +409,7 @@ using tesseract::SpanUTF8NotWhitespace;
 using tesseract::SpanUTF8Whitespace;
 using tesseract::StringRenderer;

-int main(int argc, char** argv) {
-  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
-
+int Main() {
  if (FLAGS_list_available_fonts) {
    const std::vector<string>& all_fonts = FontUtils::ListAvailableFonts();
    for (unsigned int i = 0; i < all_fonts.size(); ++i) {
@ -543,8 +541,9 @@ int main(int argc, char** argv) {
      const char *curr_pos = str8 + offsets[i].first;
      int ngram_len = offsets[i].second;
      // Skip words that contain characters not in found in unicharset.
+      string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
      if (!FLAGS_unicharset_file.empty() &&
-          !unicharset.encodable_string(curr_pos, nullptr)) {
+          !unicharset.encodable_string(cleaned.c_str(), nullptr)) {
        continue;
      }
      rand_utf8.append(curr_pos, ngram_len);
@ -665,3 +664,8 @@ int main(int argc, char** argv) {

  return 0;
 }
+
+// Program entry point: parses the command-line flags, then runs Main() and
+// propagates its result as the process exit status.
+int main(int argc, char** argv) {
+  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
+  // Without an explicit return, main() would implicitly return 0 and hide
+  // any failure code reported by Main().
+  return Main();
+}