mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Fix use of wrong UNICHARSET
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
0e43ae5cf4
commit
8dc9e9fd14
@ -75,6 +75,7 @@ class Trie;
|
||||
class Wordrec;
|
||||
|
||||
typedef int (Dict::*DictFunc)(void* void_dawg_args,
|
||||
const UNICHARSET& unicharset,
|
||||
UNICHAR_ID unichar_id, bool word_end) const;
|
||||
typedef double (Dict::*ProbabilityInContextFunc)(const char* lang,
|
||||
const char* context,
|
||||
|
@ -361,10 +361,13 @@ void Dict::End() {
|
||||
// according to at least one of the dawgs in the dawgs_ vector.
|
||||
// See more extensive comments in dict.h where this function is declared.
|
||||
int Dict::def_letter_is_okay(void* void_dawg_args,
|
||||
const UNICHARSET& unicharset,
|
||||
UNICHAR_ID unichar_id,
|
||||
bool word_end) const {
|
||||
DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
|
||||
|
||||
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
|
||||
|
||||
if (dawg_debug_level >= 3) {
|
||||
tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
|
||||
" num active dawgs=%d\n",
|
||||
@ -410,7 +413,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
|
||||
for (int s = 0; s < slist.length(); ++s) {
|
||||
int sdawg_index = slist[s];
|
||||
const Dawg *sdawg = dawgs_[sdawg_index];
|
||||
UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
|
||||
UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
|
||||
EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
|
||||
if (dawg_edge != NO_EDGE) {
|
||||
if (dawg_debug_level >=3) {
|
||||
@ -477,7 +480,8 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
|
||||
// Find the edge out of the node for the unichar_id.
|
||||
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
|
||||
EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
|
||||
: dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
|
||||
: dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg),
|
||||
word_end);
|
||||
|
||||
if (dawg_debug_level >= 3) {
|
||||
tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
|
||||
@ -759,7 +763,8 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
|
||||
int last_index = word_ptr->length() - 1;
|
||||
// Call letter_is_okay for each letter in the word.
|
||||
for (int i = hyphen_base_size(); i <= last_index; ++i) {
|
||||
if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
|
||||
if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
|
||||
word_ptr->unichar_id(i),
|
||||
i == last_index))) break;
|
||||
// Swap active_dawgs, constraints with the corresponding updated vector.
|
||||
if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
|
||||
|
@ -351,15 +351,17 @@ class Dict {
|
||||
*/
|
||||
|
||||
//
|
||||
int def_letter_is_okay(void* void_dawg_args,
|
||||
int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
|
||||
UNICHAR_ID unichar_id, bool word_end) const;
|
||||
|
||||
int (Dict::*letter_is_okay_)(void* void_dawg_args,
|
||||
const UNICHARSET& unicharset,
|
||||
UNICHAR_ID unichar_id, bool word_end) const;
|
||||
/// Calls letter_is_okay_ member function.
|
||||
int LetterIsOkay(void* void_dawg_args,
|
||||
int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
|
||||
UNICHAR_ID unichar_id, bool word_end) const {
|
||||
return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
|
||||
return (this->*letter_is_okay_)(void_dawg_args,
|
||||
unicharset, unichar_id, word_end);
|
||||
}
|
||||
|
||||
|
||||
@ -428,11 +430,12 @@ class Dict {
|
||||
// Given a unichar from a string and a given dawg, return the unichar
|
||||
// we should use to match in that dawg type. (for example, in the number
|
||||
// dawg, all numbers are transformed to kPatternUnicharId).
|
||||
inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
|
||||
UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
|
||||
const Dawg *dawg) const {
|
||||
if (!dawg) return ch;
|
||||
switch (dawg->type()) {
|
||||
case DAWG_TYPE_NUMBER:
|
||||
return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
|
||||
return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
|
||||
default:
|
||||
return ch;
|
||||
}
|
||||
|
@ -88,7 +88,7 @@ void Dict::go_deeper_dawg_fxn(
|
||||
++num_unigrams;
|
||||
word->append_unichar_id(uch_id, 1, 0.0, 0.0);
|
||||
unigrams_ok = (this->*letter_is_okay_)(
|
||||
&unigram_dawg_args,
|
||||
&unigram_dawg_args, *word->unicharset(),
|
||||
word->unichar_id(word_index+num_unigrams-1),
|
||||
word_ending && i == encoding.size() - 1);
|
||||
(*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
|
||||
@ -111,7 +111,8 @@ void Dict::go_deeper_dawg_fxn(
|
||||
// Check which dawgs from the dawgs_ vector contain the word
|
||||
// up to and including the current unichar.
|
||||
if (checked_unigrams || (this->*letter_is_okay_)(
|
||||
more_args, word->unichar_id(word_index), word_ending)) {
|
||||
more_args, *word->unicharset(), word->unichar_id(word_index),
|
||||
word_ending)) {
|
||||
// Add a new word choice
|
||||
if (word_ending) {
|
||||
if (dawg_debug_level) {
|
||||
|
@ -771,7 +771,8 @@ void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert,
|
||||
return; // Can't continue if not a dict word.
|
||||
}
|
||||
PermuterType permuter = static_cast<PermuterType>(
|
||||
dict_->def_letter_is_okay(&dawg_args, unichar_id, false));
|
||||
dict_->def_letter_is_okay(&dawg_args,
|
||||
dict_->getUnicharset(), unichar_id, false));
|
||||
if (permuter != NO_PERM) {
|
||||
PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false,
|
||||
word_start, dawg_args.valid_end, false, cert, prev,
|
||||
|
@ -853,7 +853,7 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
|
||||
if (language_model_debug_level > 2)
|
||||
tprintf("Test Letter OK for unichar %d, normed %d\n",
|
||||
b.unichar_id(), normed_ids[i]);
|
||||
dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
|
||||
dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
|
||||
word_end && i == normed_ids.size() - 1);
|
||||
if (dawg_args_.permuter == NO_PERM) {
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user