Merge pull request #1954 from stweil/unicharset

Fix use of wrong UNICHARSET
2025-01-18 06:30:14 +08:00 · 2018-10-06 15:04:31 +02:00 · 2018-10-06 15:04:31 +02:00 · 9efedc15b2
commit 9efedc15b2
parent 76cd80e1d7 8dc9e9fd14
6 changed files with 23 additions and 12 deletions
--- a/src/api/baseapi.h
+++ b/src/api/baseapi.h
@@ -75,6 +75,7 @@ class Trie;
 class Wordrec;

 typedef int (Dict::*DictFunc)(void* void_dawg_args,
+                              const UNICHARSET& unicharset,
                              UNICHAR_ID unichar_id, bool word_end) const;
 typedef double (Dict::*ProbabilityInContextFunc)(const char* lang,
                                                 const char* context,
--- a/src/dict/dict.cpp
+++ b/src/dict/dict.cpp
@@ -361,10 +361,13 @@ void Dict::End() {
 // according to at least one of the dawgs in the dawgs_ vector.
 // See more extensive comments in dict.h where this function is declared.
 int Dict::def_letter_is_okay(void* void_dawg_args,
+                             const UNICHARSET& unicharset,
                             UNICHAR_ID unichar_id,
                             bool word_end) const {
  DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);

+  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
+
  if (dawg_debug_level >= 3) {
    tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
            " num active dawgs=%d\n",
@@ -410,7 +413,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
        for (int s = 0; s < slist.length(); ++s) {
          int sdawg_index = slist[s];
          const Dawg *sdawg = dawgs_[sdawg_index];
-          UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
+          UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
          EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
          if (dawg_edge != NO_EDGE) {
            if (dawg_debug_level >=3) {
@@ -477,7 +480,8 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
    // Find the edge out of the node for the unichar_id.
    NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
    EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
-        : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
+        : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg),
+                             word_end);

    if (dawg_debug_level >= 3) {
      tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
@@ -759,7 +763,8 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
  int last_index = word_ptr->length() - 1;
  // Call letter_is_okay for each letter in the word.
  for (int i = hyphen_base_size(); i <= last_index; ++i) {
-    if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
+    if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
+                                   word_ptr->unichar_id(i),
                                   i == last_index))) break;
    // Swap active_dawgs, constraints with the corresponding updated vector.
    if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
--- a/src/dict/dict.h
+++ b/src/dict/dict.h
@@ -351,15 +351,17 @@ class Dict {
   */

  //
-  int def_letter_is_okay(void* void_dawg_args,
+  int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
                         UNICHAR_ID unichar_id, bool word_end) const;

  int (Dict::*letter_is_okay_)(void* void_dawg_args,
+                               const UNICHARSET& unicharset,
                               UNICHAR_ID unichar_id, bool word_end) const;
  /// Calls letter_is_okay_ member function.
-  int LetterIsOkay(void* void_dawg_args,
+  int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
                   UNICHAR_ID unichar_id, bool word_end) const {
-    return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
+    return (this->*letter_is_okay_)(void_dawg_args,
+                                    unicharset, unichar_id, word_end);
  }


@@ -428,11 +430,12 @@ class Dict {
  // Given a unichar from a string and a given dawg, return the unichar
  // we should use to match in that dawg type.  (for example, in the number
  // dawg, all numbers are transformed to kPatternUnicharId).
-  inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
+  UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
+                           const Dawg *dawg) const {
    if (!dawg) return ch;
    switch (dawg->type()) {
      case DAWG_TYPE_NUMBER:
-        return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
+        return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
      default:
        return ch;
    }
--- a/src/dict/permdawg.cpp
+++ b/src/dict/permdawg.cpp
@@ -88,7 +88,7 @@ void Dict::go_deeper_dawg_fxn(
      ++num_unigrams;
      word->append_unichar_id(uch_id, 1, 0.0, 0.0);
      unigrams_ok = (this->*letter_is_okay_)(
-          &unigram_dawg_args,
+          &unigram_dawg_args, *word->unicharset(),
          word->unichar_id(word_index+num_unigrams-1),
          word_ending && i == encoding.size() - 1);
      (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
@@ -111,7 +111,8 @@ void Dict::go_deeper_dawg_fxn(
  // Check which dawgs from the dawgs_ vector contain the word
  // up to and including the current unichar.
  if (checked_unigrams || (this->*letter_is_okay_)(
-      more_args, word->unichar_id(word_index), word_ending)) {
+      more_args, *word->unicharset(), word->unichar_id(word_index),
+      word_ending)) {
    // Add a new word choice
    if (word_ending) {
      if (dawg_debug_level) {
--- a/src/lstm/recodebeam.cpp
+++ b/src/lstm/recodebeam.cpp
@@ -771,7 +771,8 @@ void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert,
    return;  // Can't continue if not a dict word.
  }
  PermuterType permuter = static_cast<PermuterType>(
-      dict_->def_letter_is_okay(&dawg_args, unichar_id, false));
+      dict_->def_letter_is_okay(&dawg_args,
+                                dict_->getUnicharset(), unichar_id, false));
  if (permuter != NO_PERM) {
    PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false,
                     word_start, dawg_args.valid_end, false, cert, prev,
--- a/src/wordrec/language_model.cpp
+++ b/src/wordrec/language_model.cpp
@@ -853,7 +853,7 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
    if (language_model_debug_level > 2)
      tprintf("Test Letter OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
-    dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
+    dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
                        word_end && i == normed_ids.size() - 1);
    if (dawg_args_.permuter == NO_PERM) {
      break;