Removed debug messages, forward compatibility of traineddata files, further bug fix.

2025-01-18 14:41:36 +08:00 · 2015-07-09 14:50:25 -07:00 · 2015-07-09 14:50:25 -07:00 · 44122698d7
commit 44122698d7
parent a303ab9d00
4 changed files with 41 additions and 53 deletions
--- a/ccstruct/pageres.cpp
+++ b/ccstruct/pageres.cpp
@ -1,6 +1,11 @@
 /**********************************************************************
 * File:        pageres.cpp  (Formerly page_res.c)
- * Description: Results classes used by control.c
+ * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
+ *              and an iterator class to iterate over the words.
+ * Main purposes:
+ *              Easy way to iterate over the words without a 3-nested loop.
+ *              Holds data used during word recognition.
+ *              Holds information about alternative spacing paths.
 * Author:      Phil Cheatle
 * Created:     Tue Sep 22 08:42:49 BST 1992
 *
@ -1478,8 +1483,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
  WERD* real_word = word_res->word;
  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
    real_word->set_flag(W_FUZZY_SP, true);
-    tprintf("Made word fuzzy at:");
-    real_word->bounding_box().print();
    if (word_res->combination) {
      // The next word should be the corresponding part of combo, but we have
      // already stepped past it, so find it by search.
@ -1493,8 +1496,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
      ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
                  !real_word->flag(W_FUZZY_NON));
      real_word->set_flag(W_FUZZY_SP, true);
-      tprintf("Made part of combo word fuzzy at:");
-      real_word->bounding_box().print();
    }
  }
 }
--- a/ccutil/tessdatamanager.cpp
+++ b/ccutil/tessdatamanager.cpp
@ -50,7 +50,10 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
    ReverseN(&actual_tessdata_num_entries_,
             sizeof(actual_tessdata_num_entries_));
  }
-  ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
+  if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
+    // For forward compatibility, truncate to the number we can handle.
+    actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
+  }
  fread(offset_table_, sizeof(inT64),
        actual_tessdata_num_entries_, data_file_);
  if (swap_) {
--- a/ccutil/unicharset.cpp
+++ b/ccutil/unicharset.cpp
@ -215,34 +215,6 @@ int UNICHARSET::step(const char* str) const {
  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
  return lengths[0];
 }
-// As step except constraining the search to unichar-ids that are
-// self-normalized. Unlike step, does not encode the whole string, therefore
-// should be used on short strings (like those obtained from
-// get_normed_unichar.)
-int UNICHARSET::normed_step(const char* str) const {
-  // Find the length of the first matching unicharset member.
-  int length = ids.minmatch(str);
-  if (length == 0)
-    return 0;  // Empty string or illegal char.
-
-  while (length <= UNICHAR_LEN) {
-    if (ids.contains(str, length)) {
-      int matched_id = unichar_to_id(str, length);
-      const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id);
-      bool good_start = matched_norms.size() == 1 &&
-                        matched_norms[0] == matched_id;
-      if (str[length] == '\0') {
-        return good_start ? length : 0;
-      }
-      if (normed_step(str + length) > 0)
-        return length;  // This length works!
-    } else if (str[length] == '\0') {
-      return 0;  // Ran out of string.
-    }
-    ++length;
-  }
-  return 0;
-}

 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
 // If not encodable, write the first byte offset which cannot be converted
@ -375,19 +347,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
 // stored in the file, and needs to be set when the UNICHARSET is loaded.
 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
  unichars[unichar_id].properties.normed_ids.truncate(0);
-  int length = unichars[unichar_id].properties.normed.length();
-  const char* normed_str = unichars[unichar_id].properties.normed.string();
-  int step = 0;
-  for (int offset = 0; offset < length; offset+= step) {
-    step = normed_step(normed_str + offset);
-    if (step == 0) {
+  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
+    unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
+  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
+                            true, &unichars[unichar_id].properties.normed_ids,
+                            NULL, NULL)) {
    unichars[unichar_id].properties.normed_ids.truncate(0);
    unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
-      break;
-    }
-    int normed_id = unichar_to_id(normed_str + offset, step);
-    ASSERT_HOST(normed_id >= 0);
-    unichars[unichar_id].properties.normed_ids.push_back(normed_id);
  }
 }

@ -1015,6 +981,24 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
  }
 }

+// Returns true if the normalized text of any unichar-id (special codes
+// excluded) contains the same unicode twice in immediate succession.
+bool UNICHARSET::AnyRepeatedUnicodes() const {
+  int start_id = 0;
+  if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
+  for (int id = start_id; id < size_used; ++id) {
+    // Convert to unicodes.
+    GenericVector<int> unicodes;
+    if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
+        unicodes.size() > 1) {
+      for (int u = 1; u < unicodes.size(); ++u) {
+        if (unicodes[u - 1] == unicodes[u]) return true;
+      }
+    }
+  }
+  return false;
+}
+
 int UNICHARSET::add_script(const char* script) {
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script, script_table[i]) == 0)
--- a/ccutil/unicharset.h
+++ b/ccutil/unicharset.h
@ -190,11 +190,6 @@ class UNICHARSET {
  // WARNING: this function now encodes the whole string for precision.
  // Use encode_string in preference to repeatedly calling step.
  int step(const char* str) const;
-  // As step except constraining the search to unichar-ids that are
-  // self-normalized. Unlike step, does not encode the whole string, therefore
-  // should be used on short strings (like those obtained from
-  // get_normed_unichar.)
-  int normed_step(const char* str) const;

  // Return whether the given UTF-8 string is encodable with this UNICHARSET.
  // If not encodable, write the first byte offset which cannot be converted
@ -678,6 +673,10 @@ class UNICHARSET {
               kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
  }

+  // Returns true if the normalized text of any unichar-id (special codes
+  // excluded) contains the same unicode twice in immediate succession.
+  bool AnyRepeatedUnicodes() const;
+
  // Return a pointer to the CHAR_FRAGMENT class if the given
  // unichar id represents a character fragment.
  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
@ -775,6 +774,7 @@ class UNICHARSET {

  // Returns normalized version of unichar with the given unichar_id.
  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
+    if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " ";
    return unichars[unichar_id].properties.normed.string();
  }
  // Returns a vector of UNICHAR_IDs that represent the ids of the normalized