tesseract/wordrec/language_model.cpp
Stefan Weil 58d9593094 wordrec: Replace NULL by nullptr
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-04-22 17:42:36 +02:00

1472 lines
62 KiB
C++

///////////////////////////////////////////////////////////////////////
// File: language_model.cpp
// Description: Functions that utilize the knowledge about the properties,
// structure and statistics of the language to help recognition.
// Author: Daria Antonova
// Created: Mon Nov 11 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <math.h>
#include "language_model.h"
#include "dawg.h"
#include "intproto.h"
#include "helpers.h"
#include "lm_state.h"
#include "lm_pain_points.h"
#include "matrix.h"
#include "params.h"
#include "params_training_featdef.h"
namespace tesseract {
#if (defined(_MSC_VER) && _MSC_VER < 1900) || defined(ANDROID)
static inline double log2(double n) {
return log(n) / log(2.0);
}
#endif // _MSC_VER
const float LanguageModel::kMaxAvgNgramCost = 25.0f;
LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
Dict *dict)
: INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
dict->getCCUtil()->params()),
BOOL_INIT_MEMBER(language_model_ngram_on, false,
"Turn on/off the use of character ngram model",
dict->getCCUtil()->params()),
INT_MEMBER(language_model_ngram_order, 8,
"Maximum order of the character ngram model",
dict->getCCUtil()->params()),
INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
"Maximum number of prunable (those for which"
" PrunablePath() is true) entries in each viterbi list"
" recorded in BLOB_CHOICEs",
dict->getCCUtil()->params()),
INT_MEMBER(language_model_viterbi_list_max_size, 500,
"Maximum size of viterbi lists recorded in BLOB_CHOICEs",
dict->getCCUtil()->params()),
double_MEMBER(language_model_ngram_small_prob, 0.000001,
"To avoid overly small denominators use this as the "
"floor of the probability returned by the ngram model.",
dict->getCCUtil()->params()),
double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
"Average classifier score of a non-matching unichar.",
dict->getCCUtil()->params()),
BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
"Use only the first UTF8 step of the given string"
" when computing log probabilities.",
dict->getCCUtil()->params()),
double_MEMBER(language_model_ngram_scale_factor, 0.03,
"Strength of the character ngram model relative to the"
" character classifier ",
dict->getCCUtil()->params()),
double_MEMBER(language_model_ngram_rating_factor, 16.0,
"Factor to bring log-probs into the same range as ratings"
" when multiplied by outline length ",
dict->getCCUtil()->params()),
BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
"Words are delimited by space", dict->getCCUtil()->params()),
INT_MEMBER(language_model_min_compound_length, 3,
"Minimum length of compound words",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
"Penalty for words not in the frequent word dictionary",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_non_dict_word, 0.15,
"Penalty for non-dictionary words",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_punc, 0.2,
"Penalty for inconsistent punctuation",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_case, 0.1,
"Penalty for inconsistent case",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_script, 0.5,
"Penalty for inconsistent script",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_chartype, 0.3,
"Penalty for inconsistent character type",
dict->getCCUtil()->params()),
// TODO(daria, rays): enable font consistency checking
// after improving font analysis.
double_MEMBER(language_model_penalty_font, 0.00,
"Penalty for inconsistent font",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_spacing, 0.05,
"Penalty for inconsistent spacing",
dict->getCCUtil()->params()),
double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
dict->getCCUtil()->params()),
INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
dict->getCCUtil()->params()),
BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
"Use sigmoidal score for certainty",
dict->getCCUtil()->params()),
dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
fontinfo_table_(fontinfo_table),
dict_(dict),
fixed_pitch_(false),
max_char_wh_ratio_(0.0),
acceptable_choice_found_(false) {
ASSERT_HOST(dict_ != nullptr);
}
LanguageModel::~LanguageModel() { delete dawg_args_.updated_dawgs; }
void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
bool fixed_pitch, float max_char_wh_ratio,
float rating_cert_scale) {
fixed_pitch_ = fixed_pitch;
max_char_wh_ratio_ = max_char_wh_ratio;
rating_cert_scale_ = rating_cert_scale;
acceptable_choice_found_ = false;
correct_segmentation_explored_ = false;
// Initialize vectors with beginning DawgInfos.
very_beginning_active_dawgs_.clear();
dict_->init_active_dawgs(&very_beginning_active_dawgs_, false);
beginning_active_dawgs_.clear();
dict_->default_dawgs(&beginning_active_dawgs_, false);
// Fill prev_word_str_ with the last language_model_ngram_order
// unichars from prev_word.
if (language_model_ngram_on) {
if (prev_word != nullptr && prev_word->unichar_string() != nullptr) {
prev_word_str_ = prev_word->unichar_string();
if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
} else {
prev_word_str_ = " ";
}
const char *str_ptr = prev_word_str_.string();
const char *str_end = str_ptr + prev_word_str_.length();
int step;
prev_word_unichar_step_len_ = 0;
while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
str_ptr += step;
++prev_word_unichar_step_len_;
}
ASSERT_HOST(str_ptr == str_end);
}
}
/**
* Helper scans the collection of predecessors for competing siblings that
* have the same letter with the opposite case, setting competing_vse.
*/
static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
LanguageModelState* parent_node) {
if (parent_node == nullptr) return;
ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
ViterbiStateEntry* vse = vit.data();
vse->competing_vse = nullptr;
UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
if (unicharset.get_isupper(unichar_id) ||
unicharset.get_islower(unichar_id)) {
UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
if (other_case == unichar_id) continue; // Not in unicharset.
// Find other case in same list. There could be multiple entries with
// the same unichar_id, but in theory, they should all point to the
// same BLOB_CHOICE, and that is what we will be using to decide
// which to keep.
ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
vit2.data()->curr_b->unichar_id() != other_case;
vit2.forward()) {}
if (!vit2.cycled_list()) {
vse->competing_vse = vit2.data();
}
}
}
}
/**
* Helper returns true if the given choice has a better case variant before
* it in the choice_list that is not distinguishable by size.
*/
static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
const BLOB_CHOICE* choice,
BLOB_CHOICE_LIST* choices) {
UNICHAR_ID choice_id = choice->unichar_id();
UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
return false; // Not upper or lower or not in unicharset.
if (unicharset.SizesDistinct(choice_id, other_case))
return false; // Can be separated by size.
BLOB_CHOICE_IT bc_it(choices);
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
BLOB_CHOICE* better_choice = bc_it.data();
if (better_choice->unichar_id() == other_case)
return true; // Found an earlier instance of other_case.
else if (better_choice == choice)
return false; // Reached the original choice.
}
return false; // Should never happen, but just in case.
}
/**
* UpdateState has the job of combining the ViterbiStateEntry lists on each
* of the choices on parent_list with each of the blob choices in curr_list,
* making a new ViterbiStateEntry for each sensible path.
*
* This could be a huge set of combinations, creating a lot of work only to
* be truncated by some beam limit, but only certain kinds of paths will
* continue at the next step:
* - paths that are liked by the language model: either a DAWG or the n-gram
* model, where active.
* - paths that represent some kind of top choice. The old permuter permuted
* the top raw classifier score, the top upper case word and the top lower-
* case word. UpdateState now concentrates its top-choice paths on top
* lower-case, top upper-case (or caseless alpha), and top digit sequence,
* with allowance for continuation of these paths through blobs where such
* a character does not appear in the choices list.
*
* GetNextParentVSE enforces some of these models to minimize the number of
* calls to AddViterbiStateEntry, even prior to looking at the language model.
* Thus an n-blob sequence of [l1I] will produce 3n calls to
* AddViterbiStateEntry instead of 3^n.
*
* Of course it isn't quite that simple as Title Case is handled by allowing
* lower case to continue an upper case initial, but it has to be detected
* in the combiner so it knows which upper case letters are initial alphas.
*/
bool LanguageModel::UpdateState(
bool just_classified,
int curr_col, int curr_row,
BLOB_CHOICE_LIST *curr_list,
LanguageModelState *parent_node,
LMPainPoints *pain_points,
WERD_RES *word_res,
BestChoiceBundle *best_choice_bundle,
BlamerBundle *blamer_bundle) {
if (language_model_debug_level > 0) {
tprintf("\nUpdateState: col=%d row=%d %s",
curr_col, curr_row, just_classified ? "just_classified" : "");
if (language_model_debug_level > 5)
tprintf("(parent=%p)\n", parent_node);
else
tprintf("\n");
}
// Initialize helper variables.
bool word_end = (curr_row+1 >= word_res->ratings->dimension());
bool new_changed = false;
float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
const UNICHARSET& unicharset = dict_->getUnicharset();
BLOB_CHOICE *first_lower = nullptr;
BLOB_CHOICE *first_upper = nullptr;
BLOB_CHOICE *first_digit = nullptr;
bool has_alnum_mix = false;
if (parent_node != nullptr) {
int result = SetTopParentLowerUpperDigit(parent_node);
if (result < 0) {
if (language_model_debug_level > 0)
tprintf("No parents found to process\n");
return false;
}
if (result > 0)
has_alnum_mix = true;
}
if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
&first_digit))
has_alnum_mix = false;;
ScanParentsForCaseMix(unicharset, parent_node);
if (language_model_debug_level > 3 && parent_node != nullptr) {
parent_node->Print("Parent viterbi list");
}
LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
// Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
ViterbiStateEntry_IT vit;
BLOB_CHOICE_IT c_it(curr_list);
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
BLOB_CHOICE* choice = c_it.data();
// TODO(antonova): make sure commenting this out if ok for ngram
// model scoring (I think this was introduced to fix ngram model quirks).
// Skip nullptr unichars unless it is the only choice.
//if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
UNICHAR_ID unichar_id = choice->unichar_id();
if (unicharset.get_fragment(unichar_id)) {
continue; // Skip fragments.
}
// Set top choice flags.
LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
if (c_it.at_first() || !new_changed)
blob_choice_flags |= kSmallestRatingFlag;
if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
if (first_digit == choice) blob_choice_flags |= kDigitFlag;
if (parent_node == nullptr) {
// Process the beginning of a word.
// If there is a better case variant that is not distinguished by size,
// skip this blob choice, as we have no choice but to accept the result
// of the character classifier to distinguish between them, even if
// followed by an upper case.
// With words like iPoc, and other CamelBackWords, the lower-upper
// transition can only be achieved if the classifier has the correct case
// as the top choice, and leaving an initial I lower down the list
// increases the chances of choosing IPoc simply because it doesn't
// include such a transition. iPoc will beat iPOC and ipoc because
// the other words are baseline/x-height inconsistent.
if (HasBetterCaseVariant(unicharset, choice, curr_list))
continue;
// Upper counts as lower at the beginning of a word.
if (blob_choice_flags & kUpperCaseFlag)
blob_choice_flags |= kLowerCaseFlag;
new_changed |= AddViterbiStateEntry(
blob_choice_flags, denom, word_end, curr_col, curr_row,
choice, curr_state, nullptr, pain_points,
word_res, best_choice_bundle, blamer_bundle);
} else {
// Get viterbi entries from each parent ViterbiStateEntry.
vit.set_to_list(&parent_node->viterbi_state_entries);
int vit_counter = 0;
vit.mark_cycle_pt();
ViterbiStateEntry* parent_vse = nullptr;
LanguageModelFlagsType top_choice_flags;
while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
c_it.data(), blob_choice_flags,
unicharset, word_res, &vit,
&top_choice_flags)) != nullptr) {
// Skip pruned entries and do not look at prunable entries if already
// examined language_model_viterbi_list_max_num_prunable of those.
if (PrunablePath(*parent_vse) &&
(++vit_counter > language_model_viterbi_list_max_num_prunable ||
(language_model_ngram_on && parent_vse->ngram_info->pruned))) {
continue;
}
// If the parent has no alnum choice, (ie choice is the first in a
// string of alnum), and there is a better case variant that is not
// distinguished by size, skip this blob choice/parent, as with the
// initial blob treatment above.
if (!parent_vse->HasAlnumChoice(unicharset) &&
HasBetterCaseVariant(unicharset, choice, curr_list))
continue;
// Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
// looks good according to the Dawgs or character ngram model.
new_changed |= AddViterbiStateEntry(
top_choice_flags, denom, word_end, curr_col, curr_row,
c_it.data(), curr_state, parent_vse, pain_points,
word_res, best_choice_bundle, blamer_bundle);
}
}
}
return new_changed;
}
/**
* Finds the first lower and upper case letter and first digit in curr_list.
* For non-upper/lower languages, alpha counts as upper.
* Uses the first character in the list in place of empty results.
* Returns true if both alpha and digits are found.
*/
bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
BLOB_CHOICE **first_lower,
BLOB_CHOICE **first_upper,
BLOB_CHOICE **first_digit) const {
BLOB_CHOICE_IT c_it(curr_list);
const UNICHARSET &unicharset = dict_->getUnicharset();
BLOB_CHOICE *first_unichar = nullptr;
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
UNICHAR_ID unichar_id = c_it.data()->unichar_id();
if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
if (first_unichar == nullptr) first_unichar = c_it.data();
if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
*first_lower = c_it.data();
}
if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
!unicharset.get_islower(unichar_id)) {
*first_upper = c_it.data();
}
if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
*first_digit = c_it.data();
}
}
ASSERT_HOST(first_unichar != nullptr);
bool mixed = (*first_lower != nullptr || *first_upper != nullptr) &&
*first_digit != nullptr;
if (*first_lower == nullptr) *first_lower = first_unichar;
if (*first_upper == nullptr) *first_upper = first_unichar;
if (*first_digit == nullptr) *first_digit = first_unichar;
return mixed;
}
/**
* Forces there to be at least one entry in the overall set of the
* viterbi_state_entries of each element of parent_node that has the
* top_choice_flag set for lower, upper and digit using the same rules as
* GetTopLowerUpperDigit, setting the flag on the first found suitable
* candidate, whether or not the flag is set on some other parent.
* Returns 1 if both alpha and digits are found among the parents, -1 if no
* parents are found at all (a legitimate case), and 0 otherwise.
*/
int LanguageModel::SetTopParentLowerUpperDigit(
LanguageModelState *parent_node) const {
if (parent_node == nullptr) return -1;
UNICHAR_ID top_id = INVALID_UNICHAR_ID;
ViterbiStateEntry* top_lower = nullptr;
ViterbiStateEntry* top_upper = nullptr;
ViterbiStateEntry* top_digit = nullptr;
ViterbiStateEntry* top_choice = nullptr;
float lower_rating = 0.0f;
float upper_rating = 0.0f;
float digit_rating = 0.0f;
float top_rating = 0.0f;
const UNICHARSET &unicharset = dict_->getUnicharset();
ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
ViterbiStateEntry* vse = vit.data();
// INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
// back to the real character if needed.
ViterbiStateEntry* unichar_vse = vse;
UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
float rating = unichar_vse->curr_b->rating();
while (unichar_id == INVALID_UNICHAR_ID &&
unichar_vse->parent_vse != nullptr) {
unichar_vse = unichar_vse->parent_vse;
unichar_id = unichar_vse->curr_b->unichar_id();
rating = unichar_vse->curr_b->rating();
}
if (unichar_id != INVALID_UNICHAR_ID) {
if (unicharset.get_islower(unichar_id)) {
if (top_lower == nullptr || lower_rating > rating) {
top_lower = vse;
lower_rating = rating;
}
} else if (unicharset.get_isalpha(unichar_id)) {
if (top_upper == nullptr || upper_rating > rating) {
top_upper = vse;
upper_rating = rating;
}
} else if (unicharset.get_isdigit(unichar_id)) {
if (top_digit == nullptr || digit_rating > rating) {
top_digit = vse;
digit_rating = rating;
}
}
}
if (top_choice == nullptr || top_rating > rating) {
top_choice = vse;
top_rating = rating;
top_id = unichar_id;
}
}
if (top_choice == nullptr) return -1;
bool mixed = (top_lower != nullptr || top_upper != nullptr) &&
top_digit != nullptr;
if (top_lower == nullptr) top_lower = top_choice;
top_lower->top_choice_flags |= kLowerCaseFlag;
if (top_upper == nullptr) top_upper = top_choice;
top_upper->top_choice_flags |= kUpperCaseFlag;
if (top_digit == nullptr) top_digit = top_choice;
top_digit->top_choice_flags |= kDigitFlag;
top_choice->top_choice_flags |= kSmallestRatingFlag;
if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
(top_choice->top_choice_flags &
(kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
// If the compound marker top choice carries any of the top alnum flags,
// then give it all of them, allowing words like I-295 to be chosen.
top_choice->top_choice_flags |=
kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
}
return mixed ? 1 : 0;
}
/**
* Finds the next ViterbiStateEntry with which the given unichar_id can
* combine sensibly, taking into account any mixed alnum/mixed case
* situation, and whether this combination has been inspected before.
*/
ViterbiStateEntry* LanguageModel::GetNextParentVSE(
bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
LanguageModelFlagsType* top_choice_flags) const {
for (; !vse_it->cycled_list(); vse_it->forward()) {
ViterbiStateEntry* parent_vse = vse_it->data();
// Only consider the parent if it has been updated or
// if the current ratings cell has just been classified.
if (!just_classified && !parent_vse->updated) continue;
if (language_model_debug_level > 2)
parent_vse->Print("Considering");
// If the parent is non-alnum, then upper counts as lower.
*top_choice_flags = blob_choice_flags;
if ((blob_choice_flags & kUpperCaseFlag) &&
!parent_vse->HasAlnumChoice(unicharset)) {
*top_choice_flags |= kLowerCaseFlag;
}
*top_choice_flags &= parent_vse->top_choice_flags;
UNICHAR_ID unichar_id = bc->unichar_id();
const BLOB_CHOICE* parent_b = parent_vse->curr_b;
UNICHAR_ID parent_id = parent_b->unichar_id();
// Digits do not bind to alphas if there is a mix in both parent and current
// or if the alpha is not the top choice.
if (unicharset.get_isdigit(unichar_id) &&
unicharset.get_isalpha(parent_id) &&
(mixed_alnum || *top_choice_flags == 0))
continue; // Digits don't bind to alphas.
// Likewise alphas do not bind to digits if there is a mix in both or if
// the digit is not the top choice.
if (unicharset.get_isalpha(unichar_id) &&
unicharset.get_isdigit(parent_id) &&
(mixed_alnum || *top_choice_flags == 0))
continue; // Alphas don't bind to digits.
// If there is a case mix of the same alpha in the parent list, then
// competing_vse is non-null and will be used to determine whether
// or not to bind the current blob choice.
if (parent_vse->competing_vse != nullptr) {
const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
UNICHAR_ID other_id = competing_b->unichar_id();
if (language_model_debug_level >= 5) {
tprintf("Parent %s has competition %s\n",
unicharset.id_to_unichar(parent_id),
unicharset.id_to_unichar(other_id));
}
if (unicharset.SizesDistinct(parent_id, other_id)) {
// If other_id matches bc wrt position and size, and parent_id, doesn't,
// don't bind to the current parent.
if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
language_model_debug_level >= 5) &&
!bc->PosAndSizeAgree(*parent_b, word_res->x_height,
language_model_debug_level >= 5))
continue; // Competing blobchoice has a better vertical match.
}
}
vse_it->forward();
return parent_vse; // This one is good!
}
return nullptr; // Ran out of possibilities.
}
bool LanguageModel::AddViterbiStateEntry(
LanguageModelFlagsType top_choice_flags,
float denom,
bool word_end,
int curr_col, int curr_row,
BLOB_CHOICE *b,
LanguageModelState *curr_state,
ViterbiStateEntry *parent_vse,
LMPainPoints *pain_points,
WERD_RES *word_res,
BestChoiceBundle *best_choice_bundle,
BlamerBundle *blamer_bundle) {
ViterbiStateEntry_IT vit;
if (language_model_debug_level > 1) {
tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
" certainty=%.4f top_choice_flags=0x%x",
dict_->getUnicharset().id_to_unichar(b->unichar_id()),
b->rating(), b->certainty(), top_choice_flags);
if (language_model_debug_level > 5)
tprintf(" parent_vse=%p\n", parent_vse);
else
tprintf("\n");
}
// Check whether the list is full.
if (curr_state != nullptr &&
curr_state->viterbi_state_entries_length >=
language_model_viterbi_list_max_size) {
if (language_model_debug_level > 1) {
tprintf("AddViterbiStateEntry: viterbi list is full!\n");
}
return false;
}
// Invoke Dawg language model component.
LanguageModelDawgInfo *dawg_info =
GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
float outline_length =
AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
// Invoke Ngram language model component.
LanguageModelNgramInfo *ngram_info = nullptr;
if (language_model_ngram_on) {
ngram_info = GenerateNgramInfo(
dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
denom, curr_col, curr_row, outline_length, parent_vse);
ASSERT_HOST(ngram_info != nullptr);
}
bool liked_by_language_model = dawg_info != nullptr ||
(ngram_info != nullptr && !ngram_info->pruned);
// Quick escape if not liked by the language model, can't be consistent
// xheight, and not top choice.
if (!liked_by_language_model && top_choice_flags == 0) {
if (language_model_debug_level > 1) {
tprintf("Language model components very early pruned this entry\n");
}
delete ngram_info;
delete dawg_info;
return false;
}
// Check consistency of the path and set the relevant consistency_info.
LMConsistencyInfo consistency_info(
parent_vse != nullptr ? &parent_vse->consistency_info : nullptr);
// Start with just the x-height consistency, as it provides significant
// pruning opportunity.
consistency_info.ComputeXheightConsistency(
b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
// Turn off xheight consistent flag if not consistent.
if (consistency_info.InconsistentXHeight()) {
top_choice_flags &= ~kXhtConsistentFlag;
}
// Quick escape if not liked by the language model, not consistent xheight,
// and not top choice.
if (!liked_by_language_model && top_choice_flags == 0) {
if (language_model_debug_level > 1) {
tprintf("Language model components early pruned this entry\n");
}
delete ngram_info;
delete dawg_info;
return false;
}
// Compute the rest of the consistency info.
FillConsistencyInfo(curr_col, word_end, b, parent_vse,
word_res, &consistency_info);
if (dawg_info != nullptr && consistency_info.invalid_punc) {
consistency_info.invalid_punc = false; // do not penalize dict words
}
// Compute cost of associating the blobs that represent the current unichar.
AssociateStats associate_stats;
ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
parent_vse, word_res, &associate_stats);
if (parent_vse != nullptr) {
associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
}
// Create the new ViterbiStateEntry compute the adjusted cost of the path.
ViterbiStateEntry *new_vse = new ViterbiStateEntry(
parent_vse, b, 0.0, outline_length,
consistency_info, associate_stats, top_choice_flags, dawg_info,
ngram_info, (language_model_debug_level > 0) ?
dict_->getUnicharset().id_to_unichar(b->unichar_id()) : nullptr);
new_vse->cost = ComputeAdjustedPathCost(new_vse);
if (language_model_debug_level >= 3)
tprintf("Adjusted cost = %g\n", new_vse->cost);
// Invoke Top Choice language model component to make the final adjustments
// to new_vse->top_choice_flags.
if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
}
// If language model components did not like this unichar - return.
bool keep = new_vse->top_choice_flags || liked_by_language_model;
if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
consistency_info.inconsistent_script) { // with inconsistent script
keep = false;
}
if (!keep) {
if (language_model_debug_level > 1) {
tprintf("Language model components did not like this entry\n");
}
delete new_vse;
return false;
}
// Discard this entry if it represents a prunable path and
// language_model_viterbi_list_max_num_prunable such entries with a lower
// cost have already been recorded.
if (PrunablePath(*new_vse) &&
(curr_state->viterbi_state_entries_prunable_length >=
language_model_viterbi_list_max_num_prunable) &&
new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
if (language_model_debug_level > 1) {
tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
new_vse->cost,
curr_state->viterbi_state_entries_prunable_max_cost);
}
delete new_vse;
return false;
}
// Update best choice if needed.
if (word_end) {
UpdateBestChoice(new_vse, pain_points, word_res,
best_choice_bundle, blamer_bundle);
// Discard the entry if UpdateBestChoice() found flaws in it.
if (new_vse->cost >= WERD_CHOICE::kBadRating &&
new_vse != best_choice_bundle->best_vse) {
if (language_model_debug_level > 1) {
tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
}
delete new_vse;
return false;
}
}
// Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
false, new_vse);
curr_state->viterbi_state_entries_length++;
if (PrunablePath(*new_vse)) {
curr_state->viterbi_state_entries_prunable_length++;
}
// Update lms->viterbi_state_entries_prunable_max_cost and clear
// top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.
if ((curr_state->viterbi_state_entries_prunable_length >=
language_model_viterbi_list_max_num_prunable) ||
new_vse->top_choice_flags) {
ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
int prunable_counter = language_model_viterbi_list_max_num_prunable;
vit.set_to_list(&(curr_state->viterbi_state_entries));
for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
ViterbiStateEntry *curr_vse = vit.data();
// Clear the appropriate top choice flags of the entries in the
// list that have cost higher thank new_entry->cost
// (since they will not be top choices any more).
if (curr_vse->top_choice_flags && curr_vse != new_vse &&
curr_vse->cost > new_vse->cost) {
curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
}
if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
// Update curr_state->viterbi_state_entries_prunable_max_cost.
if (prunable_counter == 0) {
curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
if (language_model_debug_level > 1) {
tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
curr_state->viterbi_state_entries_prunable_max_cost);
}
prunable_counter = -1; // stop counting
}
}
}
// Print the newly created ViterbiStateEntry.
if (language_model_debug_level > 2) {
new_vse->Print("New");
if (language_model_debug_level > 5)
curr_state->Print("Updated viterbi list");
}
return true;
}
void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
const ViterbiStateEntry *parent_vse,
LanguageModelState *lms) {
ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
new_vse->cost >= vit.data()->cost; vit.forward()) {
// Clear the appropriate flags if the list already contains
// a top choice entry with a lower cost.
new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
}
if (language_model_debug_level > 2) {
tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
new_vse->top_choice_flags);
}
}
LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
bool word_end,
int curr_col, int curr_row,
const BLOB_CHOICE &b,
const ViterbiStateEntry *parent_vse) {
// Initialize active_dawgs from parent_vse if it is not nullptr.
// Otherwise use very_beginning_active_dawgs_.
if (parent_vse == nullptr) {
dawg_args_.active_dawgs = &very_beginning_active_dawgs_;
dawg_args_.permuter = NO_PERM;
} else {
if (parent_vse->dawg_info == nullptr) return nullptr; // not a dict word path
dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
dawg_args_.permuter = parent_vse->dawg_info->permuter;
}
// Deal with hyphenated words.
if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
}
// Deal with compound words.
if (dict_->compound_marker(b.unichar_id()) &&
(parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
if (language_model_debug_level > 0) tprintf("Found compound marker\n");
// Do not allow compound operators at the beginning and end of the word.
// Do not allow more than one compound operator per word.
// Do not allow compounding of words with lengths shorter than
// language_model_min_compound_length
if (parent_vse == nullptr || word_end ||
dawg_args_.permuter == COMPOUND_PERM ||
parent_vse->length < language_model_min_compound_length)
return nullptr;
int i;
// Check a that the path terminated before the current character is a word.
bool has_word_ending = false;
for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
const Dawg *pdawg = pos.dawg_index < 0
? nullptr : dict_->GetDawg(pos.dawg_index);
if (pdawg == nullptr || pos.back_to_punc) continue;;
if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
pdawg->end_of_word(pos.dawg_ref)) {
has_word_ending = true;
break;
}
}
if (!has_word_ending) return nullptr;
if (language_model_debug_level > 0) tprintf("Compound word found\n");
return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
} // done dealing with compound words
LanguageModelDawgInfo *dawg_info = nullptr;
// Call LetterIsOkay().
// Use the normalized IDs so that all shapes of ' can be allowed in words
// like don't.
const GenericVector<UNICHAR_ID>& normed_ids =
dict_->getUnicharset().normed_ids(b.unichar_id());
DawgPositionVector tmp_active_dawgs;
for (int i = 0; i < normed_ids.size(); ++i) {
if (language_model_debug_level > 2)
tprintf("Test Letter OK for unichar %d, normed %d\n",
b.unichar_id(), normed_ids[i]);
dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
word_end && i == normed_ids.size() - 1);
if (dawg_args_.permuter == NO_PERM) {
break;
} else if (i < normed_ids.size() - 1) {
tmp_active_dawgs = *dawg_args_.updated_dawgs;
dawg_args_.active_dawgs = &tmp_active_dawgs;
}
if (language_model_debug_level > 2)
tprintf("Letter was OK for unichar %d, normed %d\n",
b.unichar_id(), normed_ids[i]);
}
dawg_args_.active_dawgs = nullptr;
if (dawg_args_.permuter != NO_PERM) {
dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
dawg_args_.permuter);
} else if (language_model_debug_level > 3) {
tprintf("Letter %s not OK!\n",
dict_->getUnicharset().id_to_unichar(b.unichar_id()));
}
return dawg_info;
}
LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
const char *unichar, float certainty, float denom,
int curr_col, int curr_row, float outline_length,
const ViterbiStateEntry *parent_vse) {
// Initialize parent context.
const char *pcontext_ptr = "";
int pcontext_unichar_step_len = 0;
if (parent_vse == nullptr) {
pcontext_ptr = prev_word_str_.string();
pcontext_unichar_step_len = prev_word_unichar_step_len_;
} else {
pcontext_ptr = parent_vse->ngram_info->context.string();
pcontext_unichar_step_len =
parent_vse->ngram_info->context_unichar_step_len;
}
// Compute p(unichar | parent context).
int unichar_step_len = 0;
bool pruned = false;
float ngram_cost;
float ngram_and_classifier_cost =
ComputeNgramCost(unichar, certainty, denom,
pcontext_ptr, &unichar_step_len,
&pruned, &ngram_cost);
// Normalize just the ngram_and_classifier_cost by outline_length.
// The ngram_cost is used by the params_model, so it needs to be left as-is,
// and the params model cost will be normalized by outline_length.
ngram_and_classifier_cost *=
outline_length / language_model_ngram_rating_factor;
// Add the ngram_cost of the parent.
if (parent_vse != nullptr) {
ngram_and_classifier_cost +=
parent_vse->ngram_info->ngram_and_classifier_cost;
ngram_cost += parent_vse->ngram_info->ngram_cost;
}
// Shorten parent context string by unichar_step_len unichars.
int num_remove = (unichar_step_len + pcontext_unichar_step_len -
language_model_ngram_order);
if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
while (num_remove > 0 && *pcontext_ptr != '\0') {
pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
--num_remove;
}
// Decide whether to prune this ngram path and update changed accordingly.
if (parent_vse != nullptr && parent_vse->ngram_info->pruned) pruned = true;
// Construct and return the new LanguageModelNgramInfo.
LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
ngram_and_classifier_cost);
ngram_info->context += unichar;
ngram_info->context_unichar_step_len += unichar_step_len;
assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
return ngram_info;
}
float LanguageModel::ComputeNgramCost(const char *unichar,
float certainty,
float denom,
const char *context,
int *unichar_step_len,
bool *found_small_prob,
float *ngram_cost) {
const char *context_ptr = context;
char *modified_context = nullptr;
char *modified_context_end = nullptr;
const char *unichar_ptr = unichar;
const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
float prob = 0.0f;
int step = 0;
while (unichar_ptr < unichar_end &&
(step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
if (language_model_debug_level > 1) {
tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
}
prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
++(*unichar_step_len);
if (language_model_ngram_use_only_first_uft8_step) break;
unichar_ptr += step;
// If there are multiple UTF8 characters present in unichar, context is
// updated to include the previously examined characters from str,
// unless use_only_first_uft8_step is true.
if (unichar_ptr < unichar_end) {
if (modified_context == nullptr) {
size_t context_len = strlen(context);
modified_context =
new char[context_len + strlen(unichar_ptr) + step + 1];
memcpy(modified_context, context, context_len);
modified_context_end = modified_context + context_len;
context_ptr = modified_context;
}
strncpy(modified_context_end, unichar_ptr - step, step);
modified_context_end += step;
*modified_context_end = '\0';
}
}
prob /= static_cast<float>(*unichar_step_len); // normalize
if (prob < language_model_ngram_small_prob) {
if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
*found_small_prob = true;
prob = language_model_ngram_small_prob;
}
*ngram_cost = -1.0*log2(prob);
float ngram_and_classifier_cost =
-1.0*log2(CertaintyScore(certainty)/denom) +
*ngram_cost * language_model_ngram_scale_factor;
if (language_model_debug_level > 1) {
tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
ngram_and_classifier_cost);
}
delete[] modified_context;
return ngram_and_classifier_cost;
}
float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
if (curr_list->empty()) return 1.0f;
float denom = 0.0f;
int len = 0;
BLOB_CHOICE_IT c_it(curr_list);
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
ASSERT_HOST(c_it.data() != nullptr);
++len;
denom += CertaintyScore(c_it.data()->certainty());
}
assert(len != 0);
// The ideal situation would be to have the classifier scores for
// classifying each position as each of the characters in the unicharset.
// Since we can not do this because of speed, we add a very crude estimate
// of what these scores for the "missing" classifications would sum up to.
denom += (dict_->getUnicharset().size() - len) *
CertaintyScore(language_model_ngram_nonmatch_score);
return denom;
}
void LanguageModel::FillConsistencyInfo(
int curr_col,
bool word_end,
BLOB_CHOICE *b,
ViterbiStateEntry *parent_vse,
WERD_RES *word_res,
LMConsistencyInfo *consistency_info) {
const UNICHARSET &unicharset = dict_->getUnicharset();
UNICHAR_ID unichar_id = b->unichar_id();
BLOB_CHOICE* parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;
// Check punctuation validity.
if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
(unicharset.get_isalpha(parent_b->unichar_id()) ||
unicharset.get_isdigit(parent_b->unichar_id()))) {
// reset punc_ref for compound words
consistency_info->punc_ref = NO_EDGE;
} else {
bool is_apos = dict_->is_apostrophe(unichar_id);
bool prev_is_numalpha = (parent_b != nullptr &&
(unicharset.get_isalpha(parent_b->unichar_id()) ||
unicharset.get_isdigit(parent_b->unichar_id())));
UNICHAR_ID pattern_unichar_id =
(unicharset.get_isalpha(unichar_id) ||
unicharset.get_isdigit(unichar_id) ||
(is_apos && prev_is_numalpha)) ?
Dawg::kPatternUnicharID : unichar_id;
if (consistency_info->punc_ref == NO_EDGE ||
pattern_unichar_id != Dawg::kPatternUnicharID ||
dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
Dawg::kPatternUnicharID) {
NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
consistency_info->punc_ref);
consistency_info->punc_ref =
(node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
node, pattern_unichar_id, word_end) : NO_EDGE;
if (consistency_info->punc_ref == NO_EDGE) {
consistency_info->invalid_punc = true;
}
}
}
}
// Update case related counters.
if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
// Reset counters if we are dealing with a compound word.
consistency_info->num_lower = 0;
consistency_info->num_non_first_upper = 0;
}
else if (unicharset.get_islower(unichar_id)) {
consistency_info->num_lower++;
} else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
if (unicharset.get_isupper(parent_b->unichar_id()) ||
consistency_info->num_lower > 0 ||
consistency_info->num_non_first_upper > 0) {
consistency_info->num_non_first_upper++;
}
}
// Initialize consistency_info->script_id (use script of unichar_id
// if it is not Common, use script id recorded by the parent otherwise).
// Set inconsistent_script to true if the script of the current unichar
// is not consistent with that of the parent.
consistency_info->script_id = unicharset.get_script(unichar_id);
// Hiragana and Katakana can mix with Han.
if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
consistency_info->script_id == unicharset.hiragana_sid()) ||
(unicharset.katakana_sid() != unicharset.null_sid() &&
consistency_info->script_id == unicharset.katakana_sid())) {
consistency_info->script_id = dict_->getUnicharset().han_sid();
}
}
if (parent_vse != nullptr &&
(parent_vse->consistency_info.script_id !=
dict_->getUnicharset().common_sid())) {
int parent_script_id = parent_vse->consistency_info.script_id;
// If script_id is Common, use script id of the parent instead.
if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
consistency_info->script_id = parent_script_id;
}
if (consistency_info->script_id != parent_script_id) {
consistency_info->inconsistent_script = true;
}
}
// Update chartype related counters.
if (unicharset.get_isalpha(unichar_id)) {
consistency_info->num_alphas++;
} else if (unicharset.get_isdigit(unichar_id)) {
consistency_info->num_digits++;
} else if (!unicharset.get_ispunctuation(unichar_id)) {
consistency_info->num_other++;
}
// Check font and spacing consistency.
if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
int fontinfo_id = -1;
if (parent_b->fontinfo_id() == b->fontinfo_id() ||
parent_b->fontinfo_id2() == b->fontinfo_id()) {
fontinfo_id = b->fontinfo_id();
} else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
parent_b->fontinfo_id2() == b->fontinfo_id2()) {
fontinfo_id = b->fontinfo_id2();
}
if(language_model_debug_level > 1) {
tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
(parent_b->fontinfo_id() >= 0) ?
fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
(parent_b->fontinfo_id2() >= 0) ?
fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
(b->fontinfo_id() >= 0) ?
fontinfo_table_->get(b->fontinfo_id()).name : "",
(fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
(fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
fontinfo_id);
}
if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
bool expected_gap_found = false;
float expected_gap;
int temp_gap;
if (fontinfo_id >= 0) { // found a common font
ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
if (fontinfo_table_->get(fontinfo_id).get_spacing(
parent_b->unichar_id(), unichar_id, &temp_gap)) {
expected_gap = temp_gap;
expected_gap_found = true;
}
} else {
consistency_info->inconsistent_font = true;
// Get an average of the expected gaps in each font
int num_addends = 0;
expected_gap = 0;
int temp_fid;
for (int i = 0; i < 4; ++i) {
if (i == 0) {
temp_fid = parent_b->fontinfo_id();
} else if (i == 1) {
temp_fid = parent_b->fontinfo_id2();
} else if (i == 2) {
temp_fid = b->fontinfo_id();
} else {
temp_fid = b->fontinfo_id2();
}
ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
parent_b->unichar_id(), unichar_id, &temp_gap)) {
expected_gap += temp_gap;
num_addends++;
}
}
expected_gap_found = (num_addends > 0);
if (num_addends > 0) {
expected_gap /= static_cast<float>(num_addends);
}
}
if (expected_gap_found) {
float actual_gap =
static_cast<float>(word_res->GetBlobsGap(curr_col-1));
float gap_ratio = expected_gap / actual_gap;
// TODO(rays) The gaps seem to be way off most of the time, saved by
// the error here that the ratio was compared to 1/2, when it should
// have been 0.5f. Find the source of the gaps discrepancy and put
// the 0.5f here in place of 0.0f.
// Test on 2476595.sj, pages 0 to 6. (In French.)
if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
consistency_info->num_inconsistent_spaces++;
}
if (language_model_debug_level > 1) {
tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
unicharset.id_to_unichar(parent_b->unichar_id()),
parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
unichar_id, curr_col, expected_gap, actual_gap);
}
}
}
}
}
float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {
ASSERT_HOST(vse != nullptr);
if (params_model_.Initialized()) {
float features[PTRAIN_NUM_FEATURE_TYPES];
ExtractFeaturesFromPath(*vse, features);
float cost = params_model_.ComputeCost(features);
if (language_model_debug_level > 3) {
tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
if (language_model_debug_level >= 5) {
for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
}
}
}
return cost * vse->outline_length;
} else {
float adjustment = 1.0f;
if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
adjustment += language_model_penalty_non_freq_dict_word;
}
if (vse->dawg_info == nullptr) {
adjustment += language_model_penalty_non_dict_word;
if (vse->length > language_model_min_compound_length) {
adjustment += ((vse->length - language_model_min_compound_length) *
language_model_penalty_increment);
}
}
if (vse->associate_stats.shape_cost > 0) {
adjustment += vse->associate_stats.shape_cost /
static_cast<float>(vse->length);
}
if (language_model_ngram_on) {
ASSERT_HOST(vse->ngram_info != nullptr);
return vse->ngram_info->ngram_and_classifier_cost * adjustment;
} else {
adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
vse->consistency_info);
return vse->ratings_sum * adjustment;
}
}
}
void LanguageModel::UpdateBestChoice(
ViterbiStateEntry *vse,
LMPainPoints *pain_points,
WERD_RES *word_res,
BestChoiceBundle *best_choice_bundle,
BlamerBundle *blamer_bundle) {
bool truth_path;
WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
blamer_bundle, &truth_path);
ASSERT_HOST(word != nullptr);
if (dict_->stopper_debug_level >= 1) {
STRING word_str;
word->string_and_lengths(&word_str, nullptr);
vse->Print(word_str.string());
}
if (language_model_debug_level > 0) {
word->print("UpdateBestChoice() constructed word");
}
// Record features from the current path if necessary.
ParamsTrainingHypothesis curr_hyp;
if (blamer_bundle != nullptr) {
if (vse->dawg_info != nullptr) vse->dawg_info->permuter =
static_cast<PermuterType>(word->permuter());
ExtractFeaturesFromPath(*vse, curr_hyp.features);
word->string_and_lengths(&(curr_hyp.str), nullptr);
curr_hyp.cost = vse->cost; // record cost for error rate computations
if (language_model_debug_level > 0) {
tprintf("Raw features extracted from %s (cost=%g) [ ",
curr_hyp.str.string(), curr_hyp.cost);
for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
tprintf("%g ", curr_hyp.features[deb_i]);
}
tprintf("]\n");
}
// Record the current hypothesis in params_training_bundle.
blamer_bundle->AddHypothesis(curr_hyp);
if (truth_path)
blamer_bundle->UpdateBestRating(word->rating());
}
if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
// The word was constructed solely for blamer_bundle->AddHypothesis, so
// we no longer need it.
delete word;
return;
}
if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty())
word->SetScriptPositions(false, word_res->chopped_word);
// Update and log new raw_choice if needed.
if (word_res->raw_choice == nullptr ||
word->rating() < word_res->raw_choice->rating()) {
if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
tprintf("Updated raw choice\n");
}
// Set the modified rating for best choice to vse->cost and log best choice.
word->set_rating(vse->cost);
// Call LogNewChoice() for best choice from Dict::adjust_word() since it
// computes adjust_factor that is used by the adaption code (e.g. by
// ClassifyAdaptableWord() to compute adaption acceptance thresholds).
// Note: the rating of the word is not adjusted.
dict_->adjust_word(word, vse->dawg_info == nullptr,
vse->consistency_info.xht_decision, 0.0,
false, language_model_debug_level > 0);
// Hand ownership of the word over to the word_res.
if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
dict_->stopper_debug_level >= 1, word)) {
// The word was so bad that it was deleted.
return;
}
if (word_res->best_choice == word) {
// Word was the new best.
if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
AcceptablePath(*vse)) {
acceptable_choice_found_ = true;
}
// Update best_choice_bundle.
best_choice_bundle->updated = true;
best_choice_bundle->best_vse = vse;
if (language_model_debug_level > 0) {
tprintf("Updated best choice\n");
word->print_state("New state ");
}
// Update hyphen state if we are dealing with a dictionary word.
if (vse->dawg_info != nullptr) {
if (dict_->has_hyphen_end(*word)) {
dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
} else {
dict_->reset_hyphen_vars(true);
}
}
if (blamer_bundle != nullptr) {
blamer_bundle->set_best_choice_is_dict_and_top_choice(
vse->dawg_info != nullptr && vse->top_choice_flags);
}
}
if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
word->DisplaySegmentation(word_res->chopped_word);
}
}
void LanguageModel::ExtractFeaturesFromPath(
const ViterbiStateEntry &vse, float features[]) {
memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
// Record dictionary match info.
int len = vse.length <= kMaxSmallWordUnichars ? 0 :
vse.length <= kMaxMediumWordUnichars ? 1 : 2;
if (vse.dawg_info != nullptr) {
int permuter = vse.dawg_info->permuter;
if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
if (vse.consistency_info.num_digits == vse.length) {
features[PTRAIN_DIGITS_SHORT+len] = 1.0;
} else {
features[PTRAIN_NUM_SHORT+len] = 1.0;
}
} else if (permuter == DOC_DAWG_PERM) {
features[PTRAIN_DOC_SHORT+len] = 1.0;
} else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
permuter == COMPOUND_PERM) {
features[PTRAIN_DICT_SHORT+len] = 1.0;
} else if (permuter == FREQ_DAWG_PERM) {
features[PTRAIN_FREQ_SHORT+len] = 1.0;
}
}
// Record shape cost feature (normalized by path length).
features[PTRAIN_SHAPE_COST_PER_CHAR] =
vse.associate_stats.shape_cost / static_cast<float>(vse.length);
// Record ngram cost. (normalized by the path length).
features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
if (vse.ngram_info != nullptr) {
features[PTRAIN_NGRAM_COST_PER_CHAR] =
vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
}
// Record consistency-related features.
// Disabled this feature for due to its poor performance.
// features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == nullptr ?
vse.consistency_info.NumInconsistentChartype() : 0.0;
features[PTRAIN_NUM_BAD_SPACING] =
vse.consistency_info.NumInconsistentSpaces();
// Disabled this feature for now due to its poor performance.
// features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
// Classifier-related features.
features[PTRAIN_RATING_PER_CHAR] =
vse.ratings_sum / static_cast<float>(vse.outline_length);
}
WERD_CHOICE *LanguageModel::ConstructWord(
ViterbiStateEntry *vse,
WERD_RES *word_res,
DANGERR *fixpt,
BlamerBundle *blamer_bundle,
bool *truth_path) {
if (truth_path != nullptr) {
*truth_path =
(blamer_bundle != nullptr &&
vse->length == blamer_bundle->correct_segmentation_length());
}
BLOB_CHOICE *curr_b = vse->curr_b;
ViterbiStateEntry *curr_vse = vse;
int i;
bool compound = dict_->hyphenated(); // treat hyphenated words as compound
// Re-compute the variance of the width-to-height ratios (since we now
// can compute the mean over the whole word).
float full_wh_ratio_mean = 0.0f;
if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
static_cast<float>(vse->length));
vse->associate_stats.full_wh_ratio_var = 0.0f;
}
// Construct a WERD_CHOICE by tracing parent pointers.
WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
word->set_length(vse->length);
int total_blobs = 0;
for (i = (vse->length-1); i >= 0; --i) {
if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
!blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
*truth_path = false;
}
// The number of blobs used for this choice is row - col + 1.
int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
total_blobs += num_blobs;
word->set_blob_choice(i, num_blobs, curr_b);
// Update the width-to-height ratio variance. Useful non-space delimited
// languages to ensure that the blobs are of uniform width.
// Skip leading and trailing punctuation when computing the variance.
if ((full_wh_ratio_mean != 0.0f &&
((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
!dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
vse->associate_stats.full_wh_ratio_var +=
pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
if (language_model_debug_level > 2) {
tprintf("full_wh_ratio_var += (%g-%g)^2\n",
full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
}
}
// Mark the word as compound if compound permuter was set for any of
// the unichars on the path (usually this will happen for unichars
// that are compounding operators, like "-" and "/").
if (!compound && curr_vse->dawg_info &&
curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
// Update curr_* pointers.
curr_vse = curr_vse->parent_vse;
if (curr_vse == nullptr) break;
curr_b = curr_vse->curr_b;
}
ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
ASSERT_HOST(total_blobs == word_res->ratings->dimension());
// Re-adjust shape cost to include the updated width-to-height variance.
if (full_wh_ratio_mean != 0.0f) {
vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
}
word->set_rating(vse->ratings_sum);
word->set_certainty(vse->min_certainty);
word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
vse->consistency_info.BodyMaxXHeight());
if (vse->dawg_info != nullptr) {
word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
} else if (language_model_ngram_on && !vse->ngram_info->pruned) {
word->set_permuter(NGRAM_PERM);
} else if (vse->top_choice_flags) {
word->set_permuter(TOP_CHOICE_PERM);
} else {
word->set_permuter(NO_PERM);
}
word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
word_res->ratings));
return word;
}
} // namespace tesseract