From 8dc9e9fd147640b904c764a24708ba6a818b9ef7 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 6 Oct 2018 13:20:43 +0200 Subject: [PATCH] Fix use of wrong UNICHARSET Signed-off-by: Stefan Weil --- src/api/baseapi.h | 1 + src/dict/dict.cpp | 11 ++++++++--- src/dict/dict.h | 13 ++++++++----- src/dict/permdawg.cpp | 5 +++-- src/lstm/recodebeam.cpp | 3 ++- src/wordrec/language_model.cpp | 2 +- 6 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/api/baseapi.h b/src/api/baseapi.h index 291337d3f..b474ad90b 100644 --- a/src/api/baseapi.h +++ b/src/api/baseapi.h @@ -75,6 +75,7 @@ class Trie; class Wordrec; typedef int (Dict::*DictFunc)(void* void_dawg_args, + const UNICHARSET& unicharset, UNICHAR_ID unichar_id, bool word_end) const; typedef double (Dict::*ProbabilityInContextFunc)(const char* lang, const char* context, diff --git a/src/dict/dict.cpp b/src/dict/dict.cpp index 245e74d03..66622b56c 100644 --- a/src/dict/dict.cpp +++ b/src/dict/dict.cpp @@ -361,10 +361,13 @@ void Dict::End() { // according to at least one of the dawgs in the dawgs_ vector. // See more extensive comments in dict.h where this function is declared. 
int Dict::def_letter_is_okay(void* void_dawg_args, + const UNICHARSET& unicharset, UNICHAR_ID unichar_id, bool word_end) const { DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args); + ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); + if (dawg_debug_level >= 3) { tprintf("def_letter_is_okay: current unichar=%s word_end=%d" " num active dawgs=%d\n", @@ -410,7 +413,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args, for (int s = 0; s < slist.length(); ++s) { int sdawg_index = slist[s]; const Dawg *sdawg = dawgs_[sdawg_index]; - UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg); + UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg); EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end); if (dawg_edge != NO_EDGE) { if (dawg_debug_level >=3) { @@ -477,7 +480,8 @@ int Dict::def_letter_is_okay(void* void_dawg_args, // Find the edge out of the node for the unichar_id. NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE - : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end); + : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), + word_end); if (dawg_debug_level >= 3) { tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", @@ -759,7 +763,8 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { int last_index = word_ptr->length() - 1; // Call letter_is_okay for each letter in the word. for (int i = hyphen_base_size(); i <= last_index; ++i) { - if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i), + if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), + word_ptr->unichar_id(i), i == last_index))) break; // Swap active_dawgs, constraints with the corresponding updated vector. 
if (dawg_args.updated_dawgs == &(active_dawgs[1])) { diff --git a/src/dict/dict.h b/src/dict/dict.h index 2eeffdc02..5dc0fad98 100644 --- a/src/dict/dict.h +++ b/src/dict/dict.h @@ -351,15 +351,17 @@ class Dict { */ // - int def_letter_is_okay(void* void_dawg_args, + int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset, UNICHAR_ID unichar_id, bool word_end) const; int (Dict::*letter_is_okay_)(void* void_dawg_args, + const UNICHARSET& unicharset, UNICHAR_ID unichar_id, bool word_end) const; /// Calls letter_is_okay_ member function. - int LetterIsOkay(void* void_dawg_args, + int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset, UNICHAR_ID unichar_id, bool word_end) const { - return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end); + return (this->*letter_is_okay_)(void_dawg_args, + unicharset, unichar_id, word_end); } @@ -428,11 +430,12 @@ class Dict { // Given a unichar from a string and a given dawg, return the unichar // we should use to match in that dawg type. (for example, in the number // dawg, all numbers are transformed to kPatternUnicharId). - inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const { + UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch, + const Dawg *dawg) const { if (!dawg) return ch; switch (dawg->type()) { case DAWG_TYPE_NUMBER: - return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch; + return unicharset.get_isdigit(ch) ? 
Dawg::kPatternUnicharID : ch; default: return ch; } diff --git a/src/dict/permdawg.cpp b/src/dict/permdawg.cpp index 26b488f24..87456fcfa 100644 --- a/src/dict/permdawg.cpp +++ b/src/dict/permdawg.cpp @@ -88,7 +88,7 @@ void Dict::go_deeper_dawg_fxn( ++num_unigrams; word->append_unichar_id(uch_id, 1, 0.0, 0.0); unigrams_ok = (this->*letter_is_okay_)( - &unigram_dawg_args, + &unigram_dawg_args, *word->unicharset(), word->unichar_id(word_index+num_unigrams-1), word_ending && i == encoding.size() - 1); (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); @@ -111,7 +111,8 @@ void Dict::go_deeper_dawg_fxn( // Check which dawgs from the dawgs_ vector contain the word // up to and including the current unichar. if (checked_unigrams || (this->*letter_is_okay_)( - more_args, word->unichar_id(word_index), word_ending)) { + more_args, *word->unicharset(), word->unichar_id(word_index), + word_ending)) { // Add a new word choice if (word_ending) { if (dawg_debug_level) { diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 019cef559..7d76a31f0 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -771,7 +771,8 @@ void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert, return; // Can't continue if not a dict word. 
} PermuterType permuter = static_cast<PermuterType>( - dict_->def_letter_is_okay(&dawg_args, unichar_id, false)); + dict_->def_letter_is_okay(&dawg_args, + dict_->getUnicharset(), unichar_id, false)); if (permuter != NO_PERM) { PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false, word_start, dawg_args.valid_end, false, cert, prev, diff --git a/src/wordrec/language_model.cpp b/src/wordrec/language_model.cpp index cc506a3ee..b1cd48b92 100644 --- a/src/wordrec/language_model.cpp +++ b/src/wordrec/language_model.cpp @@ -853,7 +853,7 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo( if (language_model_debug_level > 2) tprintf("Test Letter OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]); - dict_->LetterIsOkay(&dawg_args_, normed_ids[i], + dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i], word_end && i == normed_ids.size() - 1); if (dawg_args_.permuter == NO_PERM) { break;