mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
language_model.cpp
This commit is contained in:
parent
5d8ede8c89
commit
a58a4e0f7b
@ -171,8 +171,10 @@ void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper scans the collection of predecessors for competing siblings that
|
/**
|
||||||
// have the same letter with the opposite case, setting competing_vse.
|
* Helper scans the collection of predecessors for competing siblings that
|
||||||
|
* have the same letter with the opposite case, setting competing_vse.
|
||||||
|
*/
|
||||||
static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
|
static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
|
||||||
LanguageModelState* parent_node) {
|
LanguageModelState* parent_node) {
|
||||||
if (parent_node == NULL) return;
|
if (parent_node == NULL) return;
|
||||||
@ -200,8 +202,10 @@ static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper returns true if the given choice has a better case variant before
|
/**
|
||||||
// it in the choice_list that is not distinguishable by size.
|
* Helper returns true if the given choice has a better case variant before
|
||||||
|
* it in the choice_list that is not distinguishable by size.
|
||||||
|
*/
|
||||||
static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
|
static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
|
||||||
const BLOB_CHOICE* choice,
|
const BLOB_CHOICE* choice,
|
||||||
BLOB_CHOICE_LIST* choices) {
|
BLOB_CHOICE_LIST* choices) {
|
||||||
@ -222,27 +226,32 @@ static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
|
|||||||
return false; // Should never happen, but just in case.
|
return false; // Should never happen, but just in case.
|
||||||
}
|
}
|
||||||
|
|
||||||
// UpdateState has the job of combining the ViterbiStateEntry lists on each
|
/**
|
||||||
// of the choices on parent_list with each of the blob choices in curr_list,
|
* UpdateState has the job of combining the ViterbiStateEntry lists on each
|
||||||
// making a new ViterbiStateEntry for each sensible path.
|
* of the choices on parent_list with each of the blob choices in curr_list,
|
||||||
// This could be a huge set of combinations, creating a lot of work only to
|
* making a new ViterbiStateEntry for each sensible path.
|
||||||
// be truncated by some beam limit, but only certain kinds of paths will
|
*
|
||||||
// continue at the next step:
|
* This could be a huge set of combinations, creating a lot of work only to
|
||||||
// paths that are liked by the language model: either a DAWG or the n-gram
|
* be truncated by some beam limit, but only certain kinds of paths will
|
||||||
// model, where active.
|
* continue at the next step:
|
||||||
// paths that represent some kind of top choice. The old permuter permuted
|
* - paths that are liked by the language model: either a DAWG or the n-gram
|
||||||
// the top raw classifier score, the top upper case word and the top lower-
|
* model, where active.
|
||||||
// case word. UpdateState now concentrates its top-choice paths on top
|
* - paths that represent some kind of top choice. The old permuter permuted
|
||||||
// lower-case, top upper-case (or caseless alpha), and top digit sequence,
|
* the top raw classifier score, the top upper case word and the top lower-
|
||||||
// with allowance for continuation of these paths through blobs where such
|
* case word. UpdateState now concentrates its top-choice paths on top
|
||||||
// a character does not appear in the choices list.
|
* lower-case, top upper-case (or caseless alpha), and top digit sequence,
|
||||||
// GetNextParentVSE enforces some of these models to minimize the number of
|
* with allowance for continuation of these paths through blobs where such
|
||||||
// calls to AddViterbiStateEntry, even prior to looking at the language model.
|
* a character does not appear in the choices list.
|
||||||
// Thus an n-blob sequence of [l1I] will produce 3n calls to
|
*
|
||||||
// AddViterbiStateEntry instead of 3^n.
|
* GetNextParentVSE enforces some of these models to minimize the number of
|
||||||
// Of course it isn't quite that simple as Title Case is handled by allowing
|
* calls to AddViterbiStateEntry, even prior to looking at the language model.
|
||||||
// lower case to continue an upper case initial, but it has to be detected
|
* Thus an n-blob sequence of [l1I] will produce 3n calls to
|
||||||
// in the combiner so it knows which upper case letters are initial alphas.
|
* AddViterbiStateEntry instead of 3^n.
|
||||||
|
*
|
||||||
|
* Of course it isn't quite that simple as Title Case is handled by allowing
|
||||||
|
* lower case to continue an upper case initial, but it has to be detected
|
||||||
|
* in the combiner so it knows which upper case letters are initial alphas.
|
||||||
|
*/
|
||||||
bool LanguageModel::UpdateState(
|
bool LanguageModel::UpdateState(
|
||||||
bool just_classified,
|
bool just_classified,
|
||||||
int curr_col, int curr_row,
|
int curr_col, int curr_row,
|
||||||
@ -367,10 +376,12 @@ bool LanguageModel::UpdateState(
|
|||||||
return new_changed;
|
return new_changed;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finds the first lower and upper case letter and first digit in curr_list.
|
/**
|
||||||
// For non-upper/lower languages, alpha counts as upper.
|
* Finds the first lower and upper case letter and first digit in curr_list.
|
||||||
// Uses the first character in the list in place of empty results.
|
* For non-upper/lower languages, alpha counts as upper.
|
||||||
// Returns true if both alpha and digits are found.
|
* Uses the first character in the list in place of empty results.
|
||||||
|
* Returns true if both alpha and digits are found.
|
||||||
|
*/
|
||||||
bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
|
bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
|
||||||
BLOB_CHOICE **first_lower,
|
BLOB_CHOICE **first_lower,
|
||||||
BLOB_CHOICE **first_upper,
|
BLOB_CHOICE **first_upper,
|
||||||
@ -402,13 +413,15 @@ bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
|
|||||||
return mixed;
|
return mixed;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Forces there to be at least one entry in the overall set of the
|
/**
|
||||||
// viterbi_state_entries of each element of parent_node that has the
|
* Forces there to be at least one entry in the overall set of the
|
||||||
// top_choice_flag set for lower, upper and digit using the same rules as
|
* viterbi_state_entries of each element of parent_node that has the
|
||||||
// GetTopLowerUpperDigit, setting the flag on the first found suitable
|
* top_choice_flag set for lower, upper and digit using the same rules as
|
||||||
// candidate, whether or not the flag is set on some other parent.
|
* GetTopLowerUpperDigit, setting the flag on the first found suitable
|
||||||
// Returns 1 if both alpha and digits are found among the parents, -1 if no
|
* candidate, whether or not the flag is set on some other parent.
|
||||||
// parents are found at all (a legitimate case), and 0 otherwise.
|
* Returns 1 if both alpha and digits are found among the parents, -1 if no
|
||||||
|
* parents are found at all (a legitimate case), and 0 otherwise.
|
||||||
|
*/
|
||||||
int LanguageModel::SetTopParentLowerUpperDigit(
|
int LanguageModel::SetTopParentLowerUpperDigit(
|
||||||
LanguageModelState *parent_node) const {
|
LanguageModelState *parent_node) const {
|
||||||
if (parent_node == NULL) return -1;
|
if (parent_node == NULL) return -1;
|
||||||
@ -481,9 +494,11 @@ int LanguageModel::SetTopParentLowerUpperDigit(
|
|||||||
return mixed ? 1 : 0;
|
return mixed ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finds the next ViterbiStateEntry with which the given unichar_id can
|
/**
|
||||||
// combine sensibly, taking into account any mixed alnum/mixed case
|
* Finds the next ViterbiStateEntry with which the given unichar_id can
|
||||||
// situation, and whether this combination has been inspected before.
|
* combine sensibly, taking into account any mixed alnum/mixed case
|
||||||
|
* situation, and whether this combination has been inspected before.
|
||||||
|
*/
|
||||||
ViterbiStateEntry* LanguageModel::GetNextParentVSE(
|
ViterbiStateEntry* LanguageModel::GetNextParentVSE(
|
||||||
bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
|
bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
|
||||||
LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
|
LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
|
||||||
|
Loading…
Reference in New Issue
Block a user