Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-07 09:52:40 +08:00 · 2012-02-02 03:03:56 +00:00 · 2012-02-02 03:03:56 +00:00 · 73adf693d5
commit 73adf693d5
parent e33ae59f4d
13 changed files with 104 additions and 84 deletions
--- a/cube/beam_search.cpp
+++ b/cube/beam_search.cpp
@ -245,13 +245,11 @@ WordAltList *BeamSearch::CreateWordAltList(SearchObject *srch_obj) {
      int cost = 0;
      // char bigram cost
      int bigram_cost = !bigrams ? 0 :
-          bigrams->Cost(ch_buff, cntxt_->CharacterSet(),
-                        &cntxt_->TesseractObject()->unicharset);
+          bigrams->Cost(ch_buff, cntxt_->CharacterSet());
      // word unigram cost
      int unigram_cost = !word_unigrams ? 0 :
          word_unigrams->Cost(ch_buff, cntxt_->LangMod(),
-                              cntxt_->CharacterSet(),
-                              &cntxt_->TesseractObject()->unicharset);
+                              cntxt_->CharacterSet());
      // overall cost
      cost = static_cast<int>(
          (size_cost * cntxt_->Params()->SizeWgt()) +
--- a/cube/char_bigrams.cpp
+++ b/cube/char_bigrams.cpp
@ -25,7 +25,6 @@
 #include "char_bigrams.h"
 #include "cube_utils.h"
 #include "ndminx.h"
-#include "unicharset.h"
 #include "cube_const.h"

 namespace tesseract {
@ -167,21 +166,20 @@ int CharBigrams::PairCost(char_32 ch1, char_32 ch2) const {
  return bigram_table_.char_bigram[ch1].bigram[ch2].cost;
 }

-int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set,
-                      UNICHARSET *unicharset) const {
+int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set) const {
  if (!char_32_ptr || char_32_ptr[0] == 0) {
    return bigram_table_.worst_cost;
  }
  int cost = MeanCostWithSpaces(char_32_ptr);
  if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant &&
-      CubeUtils::IsCaseInvariant(char_32_ptr, char_set, unicharset)) {
-    char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set, unicharset);
+      CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) {
+    char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set);
    if (lower_32 && lower_32[0] != 0) {
      int cost_lower = MeanCostWithSpaces(lower_32);
      cost = MIN(cost, cost_lower);
      delete [] lower_32;
    }
-    char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set, unicharset);
+    char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set);
    if (upper_32 && upper_32[0] != 0) {
      int cost_upper = MeanCostWithSpaces(upper_32);
      cost = MIN(cost, cost_upper);
--- a/cube/char_bigrams.h
+++ b/cube/char_bigrams.h
@ -61,13 +61,12 @@ class CharBigrams {
  static CharBigrams *Create(const string &data_file_path,
                             const string &lang);
  // Top-level function to return the mean character bigram cost of a
-  // sequence of characters.  If char_set and unicharset are not NULL
-  // and cube and tesseract share the same unicharset, use
+  // sequence of characters.  If char_set is not NULL, use
  // tesseract functions to return a case-invariant cost.
  // This avoids unnecessarily penalizing all-one-case words or
  // capitalized words (first-letter upper-case and remaining letters
  // lower-case).
-  int Cost(const char_32 *str, CharSet *char_set, UNICHARSET *unicharset) const;
+  int Cost(const char_32 *str, CharSet *char_set) const;

 protected:
  // Returns the character bigram cost of two characters.
--- a/cube/char_set.cpp
+++ b/cube/char_set.cpp
@ -78,10 +78,16 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
  // map its unichars to tesseract's; if only one unicharset exists,
  // just load it.
  bool loaded;
-  if (cube_unicharset_exists)
-    loaded = char_set->LoadSupportedCharList(charset_fp, tess_unicharset);
-  else
+  if (cube_unicharset_exists) {
+    char_set->cube_unicharset_.load_from_file(charset_fp);
+    loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET);
+    loaded = loaded && char_set->LoadSupportedCharList(
+        tessdata_manager->GetDataFilePtr(), tess_unicharset);
+    char_set->unicharset_ = &char_set->cube_unicharset_;
+  } else {
    loaded = char_set->LoadSupportedCharList(charset_fp, NULL);
+    char_set->unicharset_ = tess_unicharset;
+  }
  if (!loaded) {
    delete char_set;
    return false;
--- a/cube/char_set.h
+++ b/cube/char_set.h
@ -116,6 +116,10 @@ class CharSet {
  static CharSet *Create(TessdataManager *tessdata_manager,
                         UNICHARSET *tess_unicharset);

+  // Return the UNICHARSET cube is using for recognition internally --
+  // ClassID() returns unichar_ids in this unicharset.
+  UNICHARSET *InternalUnicharset() { return unicharset_; }
+
 private:
  // Hash table configuration params. Determined empirically on
  // the supported languages so far (Eng, Ara, Hin). Might need to be
@ -155,6 +159,13 @@ class CharSet {
  string_32  **class_strings_;
  // map from class id to secondary (tesseract's) unicharset's ids
  int *unicharset_map_;
+  // A unicharset which is filled in with a Tesseract-style UNICHARSET for
+  // cube's data if our unicharset is different from tesseract's.
+  UNICHARSET cube_unicharset_;
+  // This points to either the tess_unicharset we're passed or cube_unicharset_,
+  // depending upon whether there is a single shared unicharset or separate
+  // unicharsets for tesseract and cube.
+  UNICHARSET *unicharset_;
  // has the char set been initialized flag
  bool init_;
 };
--- a/cube/cube_object.cpp
+++ b/cube/cube_object.cpp
@ -242,6 +242,16 @@ int CubeObject::WordCost(const char *str) {
  return cost;
 }

+// Recognizes a single character and returns the list of results.
+CharAltList *CubeObject::RecognizeChar() {
+  if (char_samp_ == NULL) return NULL;
+  CharAltList* alt_list = NULL;
+  CharClassifier *char_classifier = cntxt_->Classifier();
+  ASSERT_HOST(char_classifier != NULL);
+  alt_list = char_classifier->Classify(char_samp_);
+  return alt_list;
+}
+
 // Normalize the input word bitmap to have a minimum aspect ratio
 bool CubeObject::Normalize() {
  // create a cube search object
--- a/cube/cube_object.h
+++ b/cube/cube_object.h
@ -64,7 +64,6 @@
 //
 //         // Call this once you are done recognizing all words
 //         // for the current language
-//         tess_obj->end_tesseract();
 //         delete tess_obj;
 //
 // Note that if the language supports "Italics" (see the CubeRecoContext), the
@ -110,6 +109,8 @@ class CubeObject {
  // recognition of a language model that allows only the specified word.
  // The alternate list(s) will be permanently modified.
  int WordCost(const char *str);
+  // Recognizes a single character and returns the list of results.
+  CharAltList *RecognizeChar();

  // Returns the BeamSearch object that resulted from the last call to
  // RecognizeWord
--- a/cube/cube_utils.cpp
+++ b/cube/cube_utils.cpp
@ -358,8 +358,7 @@ void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
  }
 }

-bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
-                                UNICHARSET *unicharset) {
+bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
  bool all_one_case = true;
  bool capitalized;
  bool prev_upper;
@ -370,20 +369,18 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
  bool cur_lower;

  string str8;
-  if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
-    // If cube char_set or tesseract unicharset are missing, or
-    // unicharset is not shared, then use C-locale-dependent functions
+  if (!char_set) {
+    // If cube char_set is missing, use C-locale-dependent functions
     // directly on the UTF-32 code points to determine case properties.
-    UTF32ToUTF8(str32, &str8);
-    first_upper = isupper(str8[0]);
-    first_lower = islower(str8[0]);
+    first_upper = isupper(str32[0]);
+    first_lower = islower(str32[0]);
    if (first_upper)
      capitalized = true;
    prev_upper = first_upper;
-    prev_lower = islower(str8[0]);
-    for (int c = 1; c < str8.length(); ++c) {
-      cur_upper = isupper(str8[c]);
-      cur_lower = islower(str8[c]);
+    prev_lower = islower(str32[0]);
+    for (int c = 1; str32[c] != 0; ++c) {
+      cur_upper = isupper(str32[c]);
+      cur_lower = islower(str32[c]);
      if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
        all_one_case = false;
      if (cur_upper)
@ -392,17 +389,18 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
      prev_lower = cur_lower;
    }
  } else {
+    UNICHARSET *unicharset = char_set->InternalUnicharset();
    // Use UNICHARSET functions to determine case properties
-    first_upper = unicharset->get_isupper(char_set->UnicharID(str32[0]));
-    first_lower = unicharset->get_islower(char_set->UnicharID(str32[0]));
+    first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
+    first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
    if (first_upper)
      capitalized = true;
    prev_upper = first_upper;
-    prev_lower = unicharset->get_islower(char_set->UnicharID(str32[0]));
+    prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));

    for (int c = 1; c < StrLen(str32); ++c) {
-      cur_upper = unicharset->get_isupper(char_set->UnicharID(str32[c]));
-      cur_lower = unicharset->get_islower(char_set->UnicharID(str32[c]));
+      cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
+      cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
      if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
        all_one_case = false;
      if (cur_upper)
@ -414,11 +412,11 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
  return all_one_case || capitalized;
 }

-char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
-                            UNICHARSET *unicharset) {
-  if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
+char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
+  if (!char_set) {
    return NULL;
  }
+  UNICHARSET *unicharset = char_set->InternalUnicharset();
  int len = StrLen(str32);
  char_32 *lower = new char_32[len + 1];
  if (!lower)
@ -430,9 +428,8 @@ char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
      return NULL;
    }
    // convert upper-case characters to lower-case
-    if (unicharset->get_isupper(char_set->UnicharID(ch))) {
-      UNICHAR_ID uid_lower =
-          unicharset->get_other_case(char_set->UnicharID(ch));
+    if (unicharset->get_isupper(char_set->ClassID(ch))) {
+      UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
      const char_32 *str32_lower = char_set->ClassString(uid_lower);
      // expect lower-case version of character to be a single character
      if (!str32_lower || StrLen(str32_lower) != 1) {
@ -448,11 +445,11 @@ char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
  return lower;
 }

-char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set,
-                            UNICHARSET *unicharset) {
-  if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
+char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
+  if (!char_set) {
    return NULL;
  }
+  UNICHARSET *unicharset = char_set->InternalUnicharset();
  int len = StrLen(str32);
  char_32 *upper = new char_32[len + 1];
  if (!upper)
@ -464,9 +461,8 @@ char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set,
      return NULL;
    }
    // convert lower-case characters to upper-case
-    if (unicharset->get_islower(char_set->UnicharID(ch))) {
-      UNICHAR_ID uid_upper =
-          unicharset->get_other_case(char_set->UnicharID(ch));
+    if (unicharset->get_islower(char_set->ClassID(ch))) {
+      UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
      const char_32 *str32_upper = char_set->ClassString(uid_upper);
      // expect upper-case version of character to be a single character
      if (!str32_upper || StrLen(str32_upper) != 1) {
--- a/cube/cube_utils.h
+++ b/cube/cube_utils.h
@ -69,24 +69,18 @@ class CubeUtils {
  static void UTF32ToUTF8(const char_32 *utf32_str, string *str);
  // Returns true if input word has either 1) all-one-case, or 2)
  // first character upper-case, and remaining characters lower-case.
-  // If char_set and unicharset are not NULL, uses tesseract's unicharset
-  // functions to determine case properties. Otherwise, uses
-  // C-locale-dependent functions, which may be unreliable on
-  // non-ASCII characters.
-  static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set,
-                              UNICHARSET *unicharset);
+  // If char_set is not NULL, uses tesseract's unicharset functions
+  // to determine case properties. Otherwise, uses C-locale-dependent
+  // functions, which may be unreliable on non-ASCII characters.
+  static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set);
  // Returns char_32 pointer to the lower-case-transformed version of
-  // the input string or NULL on error. If char_set or unicharset are
-  // NULL, or tesseract and cube do not share unicharsets, returns
-  // NULL. Return array must be freed by caller.
-  static char_32 *ToLower(const char_32 *str32, CharSet *char_set,
-                          UNICHARSET *unicharset);
+  // the input string or NULL on error. If char_set is NULL returns NULL.
+  // Return array must be freed by caller.
+  static char_32 *ToLower(const char_32 *str32, CharSet *char_set);
  // Returns char_32 pointer to the upper-case-transformed version of
-  // the input string or NULL on error. If char_set or unicharset are
-  // NULL, or tesseract and cube do not share unicharsets, returns
-  // NULL. Return array must be freed by caller.
-  static char_32 *ToUpper(const char_32 *str32, CharSet *char_set,
-                          UNICHARSET *unicharset);
+  // the input string or NULL on error. If char_set is NULL returns NULL.
+  // Return array must be freed by caller.
+  static char_32 *ToUpper(const char_32 *str32, CharSet *char_set);
 private:
  static unsigned char *GetImageData(IMAGE *img,
                                     int left, int top, int wid, int hgt);
--- a/cube/word_list_lang_model.cpp
+++ b/cube/word_list_lang_model.cpp
@ -121,11 +121,11 @@ void WordListLangModel::WordVariants(const CharSet &char_set,
                                     string_32 prefix_str32,
                                     WERD_CHOICE *word_so_far,
                                     string_32 str32,
-                                     vector<WERD_CHOICE> *word_variants) {
+                                     vector<WERD_CHOICE *> *word_variants) {
  int str_len = str32.length();
  if (str_len == 0) {
    if (word_so_far->length() > 0) {
-      word_variants->push_back(*word_so_far);
+      word_variants->push_back(new WERD_CHOICE(*word_so_far));
    }
  } else {
    // Try out all the possible prefixes of the str32.
@ -151,11 +151,15 @@ void WordListLangModel::WordVariants(const CharSet &char_set,
 // Compute all the variants of a 32-bit string in terms of the class-ids
 // This is needed for languages that have ligatures. A word can then have more
 // than one spelling in terms of the class-ids
-void WordListLangModel::WordVariants(const CharSet &char_set, string_32 str32,
-                                     vector<WERD_CHOICE> *word_variants) {
+void WordListLangModel::WordVariants(const CharSet &char_set,
+                                     const UNICHARSET *uchset, string_32 str32,
+                                     vector<WERD_CHOICE *> *word_variants) {
+  for (int i = 0; i < word_variants->size(); i++) {
+    delete (*word_variants)[i];
+  }
  word_variants->clear();
  string_32 prefix_str32;
-  WERD_CHOICE word_so_far;
+  WERD_CHOICE word_so_far(uchset);
  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
 }

@ -179,21 +183,23 @@ bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
    return false;
  }
  // get all the word variants
-  vector<WERD_CHOICE> word_variants;
-  WordVariants(*(cntxt_->CharacterSet()), char_32_ptr, &word_variants);
+  vector<WERD_CHOICE *> word_variants;
+  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
+               char_32_ptr, &word_variants);

  if (word_variants.size() > 0) {
    // find the shortest variant
    int shortest_word = 0;
    for (int word = 1; word < word_variants.size(); word++) {
-      if (word_variants[shortest_word].length() >
-          word_variants[word].length()) {
+      if (word_variants[shortest_word]->length() >
+          word_variants[word]->length()) {
        shortest_word = word;
      }
    }
    // only add the shortest grapheme interpretation of string to the word list
-    dawg_->add_word_to_dawg(word_variants[shortest_word]);
+    dawg_->add_word_to_dawg(*word_variants[shortest_word]);
  }
+  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
  return true;
 }

--- a/cube/word_list_lang_model.h
+++ b/cube/word_list_lang_model.h
@ -63,8 +63,9 @@ class WordListLangModel : public LangModel {
  // Compute all the variants of a 32-bit string in terms of the class-ids.
  // This is needed for languages that have ligatures. A word can then have
  // more than one spelling in terms of the class-ids.
-  static void WordVariants(const CharSet &char_set, string_32 str32,
-                           vector<WERD_CHOICE> *word_variants);
+  static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset,
+                           string_32 str32,
+                           vector<WERD_CHOICE *> *word_variants);
 private:
  // constants needed to configure the language model
  static const int kMaxEdge = 512;
@ -78,9 +79,11 @@ class WordListLangModel : public LangModel {
  // Cleanup
  void Cleanup();
  // Recursive helper function for WordVariants().
-  static void WordVariants(const CharSet &char_set,
-                           string_32 prefix_str32, WERD_CHOICE *word_so_far,
-                           string_32 str32, vector<WERD_CHOICE> *word_variants);
+  static void WordVariants(
+      const CharSet &char_set,
+      string_32 prefix_str32, WERD_CHOICE *word_so_far,
+      string_32 str32,
+      vector<WERD_CHOICE *> *word_variants);
 };
 }  // tesseract

--- a/cube/word_unigrams.cpp
+++ b/cube/word_unigrams.cpp
@ -25,7 +25,6 @@
 #include "const.h"
 #include "cube_utils.h"
 #include "ndminx.h"
-#include "unicharset.h"
 #include "word_unigrams.h"

 namespace tesseract {
@ -150,8 +149,7 @@ WordUnigrams *WordUnigrams::Create(const string &data_file_path,
 // cost.
 int WordUnigrams::Cost(const char_32 *key_str32,
                       LangModel *lang_mod,
-                       CharSet *char_set,
-                       UNICHARSET *unicharset) const {
+                       CharSet *char_set) const {
  if (!key_str32)
    return 0;
  // convert string to UTF8 to split into space-separated words
@ -206,15 +204,15 @@ int WordUnigrams::Cost(const char_32 *key_str32,
    // if case invariant, get costs of all-upper-case and all-lower-case
    // versions and return the min cost
    if (clean_len >= kMinLengthNumOrCaseInvariant &&
-        CubeUtils::IsCaseInvariant(clean_str32, char_set, unicharset)) {
-      char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set, unicharset);
+        CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
+      char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
      if (lower_32) {
        string lower_8;
        CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
        word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
        delete [] lower_32;
      }
-      char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set, unicharset);
+      char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
      if (upper_32) {
        string upper_8;
        CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
--- a/cube/word_unigrams.h
+++ b/cube/word_unigrams.h
@ -49,7 +49,7 @@ class WordUnigrams {
  // case-invariant cost is computed in those cases, assuming the word
  // meets a minimum length.
  int Cost(const char_32 *str32, LangModel *lang_mod,
-           CharSet *char_set, UNICHARSET *unicharset) const;
+           CharSet *char_set) const;
 protected:
  // Compute the word unigram cost of a UTF-8 string with binary
  // search of sorted words_ array.