language_model.cpp

This commit is contained in:
Jimmy O'Regan 2014-08-11 11:45:13 +01:00 committed by Jim O'Regan
parent 5d8ede8c89
commit a58a4e0f7b

View File

@ -171,8 +171,10 @@ void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
} }
} }
// Helper scans the collection of predecessors for competing siblings that /**
// have the same letter with the opposite case, setting competing_vse. * Helper scans the collection of predecessors for competing siblings that
* have the same letter with the opposite case, setting competing_vse.
*/
static void ScanParentsForCaseMix(const UNICHARSET& unicharset, static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
LanguageModelState* parent_node) { LanguageModelState* parent_node) {
if (parent_node == NULL) return; if (parent_node == NULL) return;
@ -200,8 +202,10 @@ static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
} }
} }
// Helper returns true if the given choice has a better case variant before /**
// it in the choice_list that is not distinguishable by size. * Helper returns true if the given choice has a better case variant before
* it in the choice_list that is not distinguishable by size.
*/
static bool HasBetterCaseVariant(const UNICHARSET& unicharset, static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
const BLOB_CHOICE* choice, const BLOB_CHOICE* choice,
BLOB_CHOICE_LIST* choices) { BLOB_CHOICE_LIST* choices) {
@ -222,27 +226,32 @@ static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
return false; // Should never happen, but just in case. return false; // Should never happen, but just in case.
} }
// UpdateState has the job of combining the ViterbiStateEntry lists on each /**
// of the choices on parent_list with each of the blob choices in curr_list, * UpdateState has the job of combining the ViterbiStateEntry lists on each
// making a new ViterbiStateEntry for each sensible path. * of the choices on parent_list with each of the blob choices in curr_list,
// This could be a huge set of combinations, creating a lot of work only to * making a new ViterbiStateEntry for each sensible path.
// be truncated by some beam limit, but only certain kinds of paths will *
// continue at the next step: * This could be a huge set of combinations, creating a lot of work only to
// paths that are liked by the language model: either a DAWG or the n-gram * be truncated by some beam limit, but only certain kinds of paths will
// model, where active. * continue at the next step:
// paths that represent some kind of top choice. The old permuter permuted * - paths that are liked by the language model: either a DAWG or the n-gram
// the top raw classifier score, the top upper case word and the top lower- * model, where active.
// case word. UpdateState now concentrates its top-choice paths on top * - paths that represent some kind of top choice. The old permuter permuted
// lower-case, top upper-case (or caseless alpha), and top digit sequence, * the top raw classifier score, the top upper case word and the top lower-
// with allowance for continuation of these paths through blobs where such * case word. UpdateState now concentrates its top-choice paths on top
// a character does not appear in the choices list. * lower-case, top upper-case (or caseless alpha), and top digit sequence,
// GetNextParentVSE enforces some of these models to minimize the number of * with allowance for continuation of these paths through blobs where such
// calls to AddViterbiStateEntry, even prior to looking at the language model. * a character does not appear in the choices list.
// Thus an n-blob sequence of [l1I] will produce 3n calls to *
// AddViterbiStateEntry instead of 3^n. * GetNextParentVSE enforces some of these models to minimize the number of
// Of course it isn't quite that simple as Title Case is handled by allowing * calls to AddViterbiStateEntry, even prior to looking at the language model.
// lower case to continue an upper case initial, but it has to be detected * Thus an n-blob sequence of [l1I] will produce 3n calls to
// in the combiner so it knows which upper case letters are initial alphas. * AddViterbiStateEntry instead of 3^n.
*
* Of course it isn't quite that simple as Title Case is handled by allowing
* lower case to continue an upper case initial, but it has to be detected
* in the combiner so it knows which upper case letters are initial alphas.
*/
bool LanguageModel::UpdateState( bool LanguageModel::UpdateState(
bool just_classified, bool just_classified,
int curr_col, int curr_row, int curr_col, int curr_row,
@ -367,10 +376,12 @@ bool LanguageModel::UpdateState(
return new_changed; return new_changed;
} }
// Finds the first lower and upper case letter and first digit in curr_list. /**
// For non-upper/lower languages, alpha counts as upper. * Finds the first lower and upper case letter and first digit in curr_list.
// Uses the first character in the list in place of empty results. * For non-upper/lower languages, alpha counts as upper.
// Returns true if both alpha and digits are found. * Uses the first character in the list in place of empty results.
* Returns true if both alpha and digits are found.
*/
bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
BLOB_CHOICE **first_lower, BLOB_CHOICE **first_lower,
BLOB_CHOICE **first_upper, BLOB_CHOICE **first_upper,
@ -402,13 +413,15 @@ bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
return mixed; return mixed;
} }
// Forces there to be at least one entry in the overall set of the /**
// viterbi_state_entries of each element of parent_node that has the * Forces there to be at least one entry in the overall set of the
// top_choice_flag set for lower, upper and digit using the same rules as * viterbi_state_entries of each element of parent_node that has the
// GetTopLowerUpperDigit, setting the flag on the first found suitable * top_choice_flag set for lower, upper and digit using the same rules as
// candidate, whether or not the flag is set on some other parent. * GetTopLowerUpperDigit, setting the flag on the first found suitable
// Returns 1 if both alpha and digits are found among the parents, -1 if no * candidate, whether or not the flag is set on some other parent.
// parents are found at all (a legitimate case), and 0 otherwise. * Returns 1 if both alpha and digits are found among the parents, -1 if no
* parents are found at all (a legitimate case), and 0 otherwise.
*/
int LanguageModel::SetTopParentLowerUpperDigit( int LanguageModel::SetTopParentLowerUpperDigit(
LanguageModelState *parent_node) const { LanguageModelState *parent_node) const {
if (parent_node == NULL) return -1; if (parent_node == NULL) return -1;
@ -481,9 +494,11 @@ int LanguageModel::SetTopParentLowerUpperDigit(
return mixed ? 1 : 0; return mixed ? 1 : 0;
} }
// Finds the next ViterbiStateEntry with which the given unichar_id can /**
// combine sensibly, taking into account any mixed alnum/mixed case * Finds the next ViterbiStateEntry with which the given unichar_id can
// situation, and whether this combination has been inspected before. * combine sensibly, taking into account any mixed alnum/mixed case
* situation, and whether this combination has been inspected before.
*/
ViterbiStateEntry* LanguageModel::GetNextParentVSE( ViterbiStateEntry* LanguageModel::GetNextParentVSE(
bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc, bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset, LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,