Merge pull request #1954 from stweil/unicharset

Fix use of wrong UNICHARSET
This commit is contained in:
zdenop 2018-10-06 15:04:31 +02:00 committed by GitHub
commit 9efedc15b2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 23 additions and 12 deletions

View File

@ -75,6 +75,7 @@ class Trie;
class Wordrec;
// Type of a pointer to a const member function of Dict that takes an opaque
// argument pointer, a UNICHARSET, a unichar id and a word-end flag, and
// returns int. The parameter list matches Dict::def_letter_is_okay.
// NOTE(review): unichar_id is presumably expected to be an id from the
// supplied unicharset - confirm against the implementation.
typedef int (Dict::*DictFunc)(void* void_dawg_args,
const UNICHARSET& unicharset,
UNICHAR_ID unichar_id, bool word_end) const;
typedef double (Dict::*ProbabilityInContextFunc)(const char* lang,
const char* context,

View File

@ -361,10 +361,13 @@ void Dict::End() {
// according to at least one of the dawgs in the dawgs_ vector.
// See more extensive comments in dict.h where this function is declared.
int Dict::def_letter_is_okay(void* void_dawg_args,
const UNICHARSET& unicharset,
UNICHAR_ID unichar_id,
bool word_end) const {
DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
if (dawg_debug_level >= 3) {
tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
" num active dawgs=%d\n",
@ -410,7 +413,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
for (int s = 0; s < slist.length(); ++s) {
int sdawg_index = slist[s];
const Dawg *sdawg = dawgs_[sdawg_index];
UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
if (dawg_edge != NO_EDGE) {
if (dawg_debug_level >=3) {
@ -477,7 +480,8 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
// Find the edge out of the node for the unichar_id.
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
: dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
: dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg),
word_end);
if (dawg_debug_level >= 3) {
tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
@ -759,7 +763,8 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
int last_index = word_ptr->length() - 1;
// Call letter_is_okay for each letter in the word.
for (int i = hyphen_base_size(); i <= last_index; ++i) {
if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
word_ptr->unichar_id(i),
i == last_index))) break;
// Swap active_dawgs, constraints with the corresponding updated vector.
if (dawg_args.updated_dawgs == &(active_dawgs[1])) {

View File

@ -351,15 +351,17 @@ class Dict {
*/
//
int def_letter_is_okay(void* void_dawg_args,
int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
UNICHAR_ID unichar_id, bool word_end) const;
// Pointer to the const Dict member function that performs the letter check.
// Its parameter list is the same as def_letter_is_okay: opaque dawg args,
// the UNICHARSET, the unichar id and the word-end flag.
// NOTE(review): presumably set to &Dict::def_letter_is_okay by default -
// the assignment is not visible here, confirm where it is initialized.
int (Dict::*letter_is_okay_)(void* void_dawg_args,
const UNICHARSET& unicharset,
UNICHAR_ID unichar_id, bool word_end) const;
/// Calls letter_is_okay_ member function.
int LetterIsOkay(void* void_dawg_args,
int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
UNICHAR_ID unichar_id, bool word_end) const {
return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
return (this->*letter_is_okay_)(void_dawg_args,
unicharset, unichar_id, word_end);
}
@ -428,11 +430,12 @@ class Dict {
// Given a unichar from a string and a given dawg, return the unichar
// we should use to match in that dawg type. (for example, in the number
// dawg, all numbers are transformed to kPatternUnicharId).
inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
const Dawg *dawg) const {
if (!dawg) return ch;
switch (dawg->type()) {
case DAWG_TYPE_NUMBER:
return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
default:
return ch;
}

View File

@ -88,7 +88,7 @@ void Dict::go_deeper_dawg_fxn(
++num_unigrams;
word->append_unichar_id(uch_id, 1, 0.0, 0.0);
unigrams_ok = (this->*letter_is_okay_)(
&unigram_dawg_args,
&unigram_dawg_args, *word->unicharset(),
word->unichar_id(word_index+num_unigrams-1),
word_ending && i == encoding.size() - 1);
(*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
@ -111,7 +111,8 @@ void Dict::go_deeper_dawg_fxn(
// Check which dawgs from the dawgs_ vector contain the word
// up to and including the current unichar.
if (checked_unigrams || (this->*letter_is_okay_)(
more_args, word->unichar_id(word_index), word_ending)) {
more_args, *word->unicharset(), word->unichar_id(word_index),
word_ending)) {
// Add a new word choice
if (word_ending) {
if (dawg_debug_level) {

View File

@ -771,7 +771,8 @@ void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert,
return; // Can't continue if not a dict word.
}
PermuterType permuter = static_cast<PermuterType>(
dict_->def_letter_is_okay(&dawg_args, unichar_id, false));
dict_->def_letter_is_okay(&dawg_args,
dict_->getUnicharset(), unichar_id, false));
if (permuter != NO_PERM) {
PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false,
word_start, dawg_args.valid_end, false, cert, prev,

View File

@ -853,7 +853,7 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
if (language_model_debug_level > 2)
tprintf("Test Letter OK for unichar %d, normed %d\n",
b.unichar_id(), normed_ids[i]);
dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
word_end && i == normed_ids.size() - 1);
if (dawg_args_.permuter == NO_PERM) {
break;