mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Fix use of wrong UNICHARSET
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
0e43ae5cf4
commit
8dc9e9fd14
@ -75,6 +75,7 @@ class Trie;
|
|||||||
class Wordrec;
|
class Wordrec;
|
||||||
|
|
||||||
typedef int (Dict::*DictFunc)(void* void_dawg_args,
|
typedef int (Dict::*DictFunc)(void* void_dawg_args,
|
||||||
|
const UNICHARSET& unicharset,
|
||||||
UNICHAR_ID unichar_id, bool word_end) const;
|
UNICHAR_ID unichar_id, bool word_end) const;
|
||||||
typedef double (Dict::*ProbabilityInContextFunc)(const char* lang,
|
typedef double (Dict::*ProbabilityInContextFunc)(const char* lang,
|
||||||
const char* context,
|
const char* context,
|
||||||
|
@ -361,10 +361,13 @@ void Dict::End() {
|
|||||||
// according to at least one of the dawgs in the dawgs_ vector.
|
// according to at least one of the dawgs in the dawgs_ vector.
|
||||||
// See more extensive comments in dict.h where this function is declared.
|
// See more extensive comments in dict.h where this function is declared.
|
||||||
int Dict::def_letter_is_okay(void* void_dawg_args,
|
int Dict::def_letter_is_okay(void* void_dawg_args,
|
||||||
|
const UNICHARSET& unicharset,
|
||||||
UNICHAR_ID unichar_id,
|
UNICHAR_ID unichar_id,
|
||||||
bool word_end) const {
|
bool word_end) const {
|
||||||
DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
|
DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
|
||||||
|
|
||||||
|
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
|
||||||
|
|
||||||
if (dawg_debug_level >= 3) {
|
if (dawg_debug_level >= 3) {
|
||||||
tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
|
tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
|
||||||
" num active dawgs=%d\n",
|
" num active dawgs=%d\n",
|
||||||
@ -410,7 +413,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
|
|||||||
for (int s = 0; s < slist.length(); ++s) {
|
for (int s = 0; s < slist.length(); ++s) {
|
||||||
int sdawg_index = slist[s];
|
int sdawg_index = slist[s];
|
||||||
const Dawg *sdawg = dawgs_[sdawg_index];
|
const Dawg *sdawg = dawgs_[sdawg_index];
|
||||||
UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
|
UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
|
||||||
EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
|
EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
|
||||||
if (dawg_edge != NO_EDGE) {
|
if (dawg_edge != NO_EDGE) {
|
||||||
if (dawg_debug_level >=3) {
|
if (dawg_debug_level >=3) {
|
||||||
@ -477,7 +480,8 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
|
|||||||
// Find the edge out of the node for the unichar_id.
|
// Find the edge out of the node for the unichar_id.
|
||||||
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
|
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
|
||||||
EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
|
EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
|
||||||
: dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
|
: dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg),
|
||||||
|
word_end);
|
||||||
|
|
||||||
if (dawg_debug_level >= 3) {
|
if (dawg_debug_level >= 3) {
|
||||||
tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
|
tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
|
||||||
@ -759,7 +763,8 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
|
|||||||
int last_index = word_ptr->length() - 1;
|
int last_index = word_ptr->length() - 1;
|
||||||
// Call letter_is_okay for each letter in the word.
|
// Call letter_is_okay for each letter in the word.
|
||||||
for (int i = hyphen_base_size(); i <= last_index; ++i) {
|
for (int i = hyphen_base_size(); i <= last_index; ++i) {
|
||||||
if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
|
if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
|
||||||
|
word_ptr->unichar_id(i),
|
||||||
i == last_index))) break;
|
i == last_index))) break;
|
||||||
// Swap active_dawgs, constraints with the corresponding updated vector.
|
// Swap active_dawgs, constraints with the corresponding updated vector.
|
||||||
if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
|
if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
|
||||||
|
@ -351,15 +351,17 @@ class Dict {
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
//
|
//
|
||||||
int def_letter_is_okay(void* void_dawg_args,
|
int def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
|
||||||
UNICHAR_ID unichar_id, bool word_end) const;
|
UNICHAR_ID unichar_id, bool word_end) const;
|
||||||
|
|
||||||
int (Dict::*letter_is_okay_)(void* void_dawg_args,
|
int (Dict::*letter_is_okay_)(void* void_dawg_args,
|
||||||
|
const UNICHARSET& unicharset,
|
||||||
UNICHAR_ID unichar_id, bool word_end) const;
|
UNICHAR_ID unichar_id, bool word_end) const;
|
||||||
/// Calls letter_is_okay_ member function.
|
/// Calls letter_is_okay_ member function.
|
||||||
int LetterIsOkay(void* void_dawg_args,
|
int LetterIsOkay(void* void_dawg_args, const UNICHARSET& unicharset,
|
||||||
UNICHAR_ID unichar_id, bool word_end) const {
|
UNICHAR_ID unichar_id, bool word_end) const {
|
||||||
return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
|
return (this->*letter_is_okay_)(void_dawg_args,
|
||||||
|
unicharset, unichar_id, word_end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -428,11 +430,12 @@ class Dict {
|
|||||||
// Given a unichar from a string and a given dawg, return the unichar
|
// Given a unichar from a string and a given dawg, return the unichar
|
||||||
// we should use to match in that dawg type. (for example, in the number
|
// we should use to match in that dawg type. (for example, in the number
|
||||||
// dawg, all numbers are transformed to kPatternUnicharId).
|
// dawg, all numbers are transformed to kPatternUnicharId).
|
||||||
inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
|
UNICHAR_ID char_for_dawg(const UNICHARSET& unicharset, UNICHAR_ID ch,
|
||||||
|
const Dawg *dawg) const {
|
||||||
if (!dawg) return ch;
|
if (!dawg) return ch;
|
||||||
switch (dawg->type()) {
|
switch (dawg->type()) {
|
||||||
case DAWG_TYPE_NUMBER:
|
case DAWG_TYPE_NUMBER:
|
||||||
return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
|
return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
|
||||||
default:
|
default:
|
||||||
return ch;
|
return ch;
|
||||||
}
|
}
|
||||||
|
@ -88,7 +88,7 @@ void Dict::go_deeper_dawg_fxn(
|
|||||||
++num_unigrams;
|
++num_unigrams;
|
||||||
word->append_unichar_id(uch_id, 1, 0.0, 0.0);
|
word->append_unichar_id(uch_id, 1, 0.0, 0.0);
|
||||||
unigrams_ok = (this->*letter_is_okay_)(
|
unigrams_ok = (this->*letter_is_okay_)(
|
||||||
&unigram_dawg_args,
|
&unigram_dawg_args, *word->unicharset(),
|
||||||
word->unichar_id(word_index+num_unigrams-1),
|
word->unichar_id(word_index+num_unigrams-1),
|
||||||
word_ending && i == encoding.size() - 1);
|
word_ending && i == encoding.size() - 1);
|
||||||
(*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
|
(*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
|
||||||
@ -111,7 +111,8 @@ void Dict::go_deeper_dawg_fxn(
|
|||||||
// Check which dawgs from the dawgs_ vector contain the word
|
// Check which dawgs from the dawgs_ vector contain the word
|
||||||
// up to and including the current unichar.
|
// up to and including the current unichar.
|
||||||
if (checked_unigrams || (this->*letter_is_okay_)(
|
if (checked_unigrams || (this->*letter_is_okay_)(
|
||||||
more_args, word->unichar_id(word_index), word_ending)) {
|
more_args, *word->unicharset(), word->unichar_id(word_index),
|
||||||
|
word_ending)) {
|
||||||
// Add a new word choice
|
// Add a new word choice
|
||||||
if (word_ending) {
|
if (word_ending) {
|
||||||
if (dawg_debug_level) {
|
if (dawg_debug_level) {
|
||||||
|
@ -771,7 +771,8 @@ void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert,
|
|||||||
return; // Can't continue if not a dict word.
|
return; // Can't continue if not a dict word.
|
||||||
}
|
}
|
||||||
PermuterType permuter = static_cast<PermuterType>(
|
PermuterType permuter = static_cast<PermuterType>(
|
||||||
dict_->def_letter_is_okay(&dawg_args, unichar_id, false));
|
dict_->def_letter_is_okay(&dawg_args,
|
||||||
|
dict_->getUnicharset(), unichar_id, false));
|
||||||
if (permuter != NO_PERM) {
|
if (permuter != NO_PERM) {
|
||||||
PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false,
|
PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false,
|
||||||
word_start, dawg_args.valid_end, false, cert, prev,
|
word_start, dawg_args.valid_end, false, cert, prev,
|
||||||
|
@ -853,7 +853,7 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
|
|||||||
if (language_model_debug_level > 2)
|
if (language_model_debug_level > 2)
|
||||||
tprintf("Test Letter OK for unichar %d, normed %d\n",
|
tprintf("Test Letter OK for unichar %d, normed %d\n",
|
||||||
b.unichar_id(), normed_ids[i]);
|
b.unichar_id(), normed_ids[i]);
|
||||||
dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
|
dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
|
||||||
word_end && i == normed_ids.size() - 1);
|
word_end && i == normed_ids.size() - 1);
|
||||||
if (dawg_args_.permuter == NO_PERM) {
|
if (dawg_args_.permuter == NO_PERM) {
|
||||||
break;
|
break;
|
||||||
|
Loading…
Reference in New Issue
Block a user