mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-22 09:53:03 +08:00
790 lines
35 KiB
C
790 lines
35 KiB
C
|
///////////////////////////////////////////////////////////////////////
|
||
|
// File: language_model.h
|
||
|
// Description: Functions that utilize the knowledge about the properties,
|
||
|
// structure and statistics of the language to help recognition.
|
||
|
// Author: Daria Antonova
|
||
|
// Created: Mon Nov 11 11:26:43 PST 2009
|
||
|
//
|
||
|
// (C) Copyright 2009, Google Inc.
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
//
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
|
||
|
#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
|
||
|
#define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
|
||
|
|
||
|
#include "associate.h"
|
||
|
#include "dawg.h"
|
||
|
#include "dict.h"
|
||
|
#include "matrix.h"
|
||
|
#include "oldheap.h"
|
||
|
#include "params.h"
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
// Used for expressing various language model flags.
|
||
|
typedef unsigned char LanguageModelFlagsType;
|
||
|
|
||
|
// Struct for keeping track of the consistency of the path.
|
||
|
struct LanguageModelConsistencyInfo {
|
||
|
LanguageModelConsistencyInfo()
|
||
|
: punc_ref(NO_EDGE), num_punc(0), invalid_punc(false),
|
||
|
num_non_first_upper(0), num_lower(0),
|
||
|
script_id(0), inconsistent_script(false),
|
||
|
num_alphas(0), num_digits(0), num_other(0) {}
|
||
|
inline int NumInconsistentPunc() const {
|
||
|
return invalid_punc ? num_punc : 0;
|
||
|
}
|
||
|
inline int NumInconsistentCase() const {
|
||
|
return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper;
|
||
|
}
|
||
|
inline int NumInconsistentChartype() const {
|
||
|
return (NumInconsistentPunc() + num_other +
|
||
|
((num_alphas > num_digits) ? num_digits : num_alphas));
|
||
|
}
|
||
|
inline bool Consistent() const {
|
||
|
return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&
|
||
|
NumInconsistentChartype() == 0 && !inconsistent_script);
|
||
|
}
|
||
|
|
||
|
EDGE_REF punc_ref;
|
||
|
int num_punc;
|
||
|
bool invalid_punc;
|
||
|
int num_non_first_upper;
|
||
|
int num_lower;
|
||
|
int script_id;
|
||
|
bool inconsistent_script;
|
||
|
int num_alphas;
|
||
|
int num_digits;
|
||
|
int num_other;
|
||
|
};
|
||
|
|
||
|
|
||
|
// The following structs are used for storing the state of the language model
|
||
|
// in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
|
||
|
// and the links are the relationships between the underlying blobs (see
|
||
|
// segsearch.h for a more detailed description).
|
||
|
// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
|
||
|
// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
|
||
|
// search leading up to and including this BLOB_CHOICE.
|
||
|
// Each ViterbiStateEntry contains information from various components of the
|
||
|
// language model: dawgs in which the path is found, character ngram model
|
||
|
// probability of the path, script/chartype/font consistency info, state for
|
||
|
// language-specific heuristics (e.g. hyphenated and compound words, lower/upper
|
||
|
// case preferences, etc).
|
||
|
// Each ViterbiStateEntry also contains the parent pointer, so that the path
|
||
|
// that it represents (WERD_CHOICE) can be constructed by following these
|
||
|
// parent pointers.
|
||
|
|
||
|
// Struct for storing additional information used by Dawg language model
|
||
|
// component. It stores the set of active dawgs in which the sequence of
|
||
|
// letters on a path can be found and the constraints that have to be
|
||
|
// satisfied at the end of the word (e.g. beginning/ending punctuation).
|
||
|
struct LanguageModelDawgInfo {
|
||
|
LanguageModelDawgInfo(DawgInfoVector *a, DawgInfoVector *c,
|
||
|
PermuterType pt) : permuter(pt) {
|
||
|
active_dawgs = new DawgInfoVector(*a);
|
||
|
constraints = new DawgInfoVector(*c);
|
||
|
}
|
||
|
~LanguageModelDawgInfo() {
|
||
|
delete active_dawgs;
|
||
|
delete constraints;
|
||
|
}
|
||
|
DawgInfoVector *active_dawgs;
|
||
|
DawgInfoVector *constraints;
|
||
|
PermuterType permuter;
|
||
|
};
|
||
|
|
||
|
// Struct for storing additional information used by Ngram language model
|
||
|
// component.
|
||
|
struct LanguageModelNgramInfo {
|
||
|
LanguageModelNgramInfo(const char *c, int l, bool p, float nc)
|
||
|
: context(c), context_unichar_step_len(l), pruned(p), ngram_cost(nc) {}
|
||
|
STRING context; // context string
|
||
|
// Length of the context measured by advancing using UNICHAR::utf8_step()
|
||
|
// (should be at most the order of the character ngram model used).
|
||
|
int context_unichar_step_len;
|
||
|
// The paths with pruned set are pruned out from the perspective of the
|
||
|
// character ngram model. They are explored further because they represent
|
||
|
// a dictionary match or a top choice. Thus ngram_info is still computed
|
||
|
// for them in order to calculate the combined cost.
|
||
|
bool pruned;
|
||
|
// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path))
|
||
|
float ngram_cost;
|
||
|
};
|
||
|
|
||
|
// Struct for storing the information about a path in the segmentation graph
|
||
|
// explored by Viterbi search.
|
||
|
struct ViterbiStateEntry : public ELIST_LINK {
|
||
|
ViterbiStateEntry(BLOB_CHOICE *pb, ViterbiStateEntry *pe,
|
||
|
BLOB_CHOICE *b, float c,
|
||
|
const LanguageModelConsistencyInfo &ci,
|
||
|
const AssociateStats &as,
|
||
|
LanguageModelFlagsType tcf,
|
||
|
LanguageModelDawgInfo *d, LanguageModelNgramInfo *n)
|
||
|
: cost(c), parent_b(pb), parent_vse(pe), ratings_sum(b->rating()),
|
||
|
min_certainty(b->certainty()), length(1), consistency_info(ci),
|
||
|
associate_stats(as), top_choice_flags(tcf), dawg_info(d), ngram_info(n),
|
||
|
updated(true) {
|
||
|
if (pe != NULL) {
|
||
|
ratings_sum += pe->ratings_sum;
|
||
|
if (pe->min_certainty < min_certainty) {
|
||
|
min_certainty = pe->min_certainty;
|
||
|
}
|
||
|
length += pe->length;
|
||
|
}
|
||
|
}
|
||
|
~ViterbiStateEntry() {
|
||
|
delete dawg_info;
|
||
|
delete ngram_info;
|
||
|
}
|
||
|
// Comparator function for sorting ViterbiStateEntry_LISTs in
|
||
|
// non-increasing order of costs.
|
||
|
static int Compare(const void *e1, const void *e2) {
|
||
|
const ViterbiStateEntry *ve1 =
|
||
|
*reinterpret_cast<const ViterbiStateEntry * const *>(e1);
|
||
|
const ViterbiStateEntry *ve2 =
|
||
|
*reinterpret_cast<const ViterbiStateEntry * const *>(e2);
|
||
|
return (ve1->cost < ve2->cost) ? -1 : 1;
|
||
|
}
|
||
|
inline bool Consistent() const {
|
||
|
if (dawg_info != NULL && consistency_info.NumInconsistentCase() == 0) {
|
||
|
return true;
|
||
|
}
|
||
|
return consistency_info.Consistent();
|
||
|
}
|
||
|
|
||
|
// The cost is an adjusted ratings sum, that is adjusted by all the language
|
||
|
// model components that use Viterbi search.
|
||
|
float cost;
|
||
|
|
||
|
// Pointers to parent BLOB_CHOICE and ViterbiStateEntry (not owned by this).
|
||
|
BLOB_CHOICE *parent_b;
|
||
|
ViterbiStateEntry *parent_vse;
|
||
|
|
||
|
// Various information about the characters on the path represented
|
||
|
// by this ViterbiStateEntry.
|
||
|
float ratings_sum; // sum of ratings of character on the path
|
||
|
float min_certainty; // minimum certainty on the path
|
||
|
int length; // number of characters on the path
|
||
|
LanguageModelConsistencyInfo consistency_info; // path consistency info
|
||
|
AssociateStats associate_stats; // character widths/gaps/seams
|
||
|
|
||
|
// Flags for marking the entry as a top choice path with
|
||
|
// the smallest rating or lower/upper case letters).
|
||
|
LanguageModelFlagsType top_choice_flags;
|
||
|
|
||
|
// Extra information maintained by Dawg laguage model component
|
||
|
// (owned by ViterbiStateEntry).
|
||
|
LanguageModelDawgInfo *dawg_info;
|
||
|
|
||
|
// Extra information maintained by Ngram laguage model component
|
||
|
// (owned by ViterbiStateEntry).
|
||
|
LanguageModelNgramInfo *ngram_info;
|
||
|
|
||
|
bool updated; // set to true if the entry has just been created/updated
|
||
|
};
|
||
|
|
||
|
ELISTIZEH(ViterbiStateEntry);
|
||
|
|
||
|
// Struct to store information maintained by various language model components.
|
||
|
struct LanguageModelState {
|
||
|
LanguageModelState(int col, int row) : contained_in_col(col),
|
||
|
contained_in_row(row), viterbi_state_entries_prunable_length(0),
|
||
|
viterbi_state_entries_prunable_max_cost(MAX_FLOAT32) {}
|
||
|
~LanguageModelState() {}
|
||
|
|
||
|
// Ratings matrix cell that holds this LanguageModelState
|
||
|
// (needed to construct best STATE for rebuild_current_state()
|
||
|
// and best BLOB_CHOICE_LIST_VECTOR for AcceptableChoice()).
|
||
|
int contained_in_col;
|
||
|
int contained_in_row;
|
||
|
|
||
|
// Storage for the Viterbi state.
|
||
|
ViterbiStateEntry_LIST viterbi_state_entries;
|
||
|
// Number and max cost of prunable paths in viterbi_state_entries.
|
||
|
int viterbi_state_entries_prunable_length;
|
||
|
float viterbi_state_entries_prunable_max_cost;
|
||
|
|
||
|
// TODO(daria): add font consistency checking.
|
||
|
};
|
||
|
|
||
|
// Bundle together all the things pertaining to the best choice/state.
|
||
|
struct BestChoiceBundle {
|
||
|
BestChoiceBundle(STATE *s, WERD_CHOICE *bc, WERD_CHOICE *rc,
|
||
|
BLOB_CHOICE_LIST_VECTOR *bcc)
|
||
|
: best_state(s), best_choice(bc), raw_choice(rc),
|
||
|
best_char_choices(bcc), updated(false), best_vse(NULL), best_b(NULL) {}
|
||
|
|
||
|
STATE *best_state;
|
||
|
WERD_CHOICE *best_choice;
|
||
|
WERD_CHOICE *raw_choice;
|
||
|
BLOB_CHOICE_LIST_VECTOR *best_char_choices;
|
||
|
bool updated;
|
||
|
DANGERR fixpt;
|
||
|
ViterbiStateEntry *best_vse; // best ViterbiStateEntry and BLOB_CHOICE
|
||
|
BLOB_CHOICE *best_b; // at the end of the best choice path
|
||
|
};
|
||
|
|
||
|
struct BestPathByColumn {
|
||
|
float avg_cost;
|
||
|
ViterbiStateEntry *best_vse;
|
||
|
BLOB_CHOICE *best_b;
|
||
|
};
|
||
|
|
||
|
// This class contains the data structures and functions necessary
|
||
|
// to represent and use the knowledge about the language.
|
||
|
class LanguageModel {
|
||
|
public:
|
||
|
// Adjustments to pain point priority.
|
||
|
static const float kInitialPainPointPriorityAdjustment;
|
||
|
static const float kDefaultPainPointPriorityAdjustment;
|
||
|
static const float kBestChoicePainPointPriorityAdjustment;
|
||
|
static const float kCriticalPainPointPriorityAdjustment;
|
||
|
|
||
|
// Denominator for normalizing per-letter ngram cost when deriving
|
||
|
// penalty adjustments.
|
||
|
static const float kMaxAvgNgramCost;
|
||
|
// Minimum word length for fixed length dawgs.
|
||
|
// TODO(daria): check in the new chi/jpn.traineddata without the
|
||
|
// fixed length dawg of length 1 and delete this variable.
|
||
|
static const int kMinFixedLengthDawgLength;
|
||
|
// If there is a significant drop in character ngram probability or a
|
||
|
// dangerous ambiguity make the thresholds on what blob combinations
|
||
|
// can be classified looser.
|
||
|
static const float kLooseMaxCharWhRatio;
|
||
|
|
||
|
// Masks for interpreting which language model components
|
||
|
// were changed by the call to UpdateState().
|
||
|
static const LanguageModelFlagsType kSmallestRatingFlag = 0x1;
|
||
|
static const LanguageModelFlagsType kLowerCaseFlag = 0x2;
|
||
|
static const LanguageModelFlagsType kUpperCaseFlag = 0x4;
|
||
|
static const LanguageModelFlagsType kConsistentFlag = 0x8;
|
||
|
static const LanguageModelFlagsType kDawgFlag = 0x10;
|
||
|
static const LanguageModelFlagsType kNgramFlag = 0x20;
|
||
|
static const LanguageModelFlagsType kJustClassifiedFlag = 0x80;
|
||
|
static const LanguageModelFlagsType kAllChangedFlag = 0xff;
|
||
|
|
||
|
LanguageModel(Dict *dict, WERD_CHOICE **prev_word_best_choice);
|
||
|
~LanguageModel();
|
||
|
|
||
|
// Updates data structures that are used for the duration of the segmentation
|
||
|
// search on the current word;
|
||
|
void InitForWord(const WERD_CHOICE *prev_word, const DENORM *denorm,
|
||
|
bool fixed_pitch, float best_choice_cert,
|
||
|
float max_char_wh_ratio,
|
||
|
HEAP *pain_points, CHUNKS_RECORD *chunks_record);
|
||
|
// Resets all the "updated" flags used by the Viterbi search that were
|
||
|
// "registered" during the update of the ratings matrix.
|
||
|
void CleanUp();
|
||
|
// Deletes and sets to NULL language model states of each of the
|
||
|
// BLOB_CHOICEs in the given BLOB_CHOICE_LIST.
|
||
|
void DeleteState(BLOB_CHOICE_LIST *choices);
|
||
|
|
||
|
// Updates language model state of the given BLOB_CHOICE_LIST (from
|
||
|
// the ratings matrix) and its parent. Updates pain_points if new
|
||
|
// problematic points are found in the segmentation graph.
|
||
|
//
|
||
|
// At most language_model_max_viterbi_list_size are kept in each
|
||
|
// LanguageModelState.viterbi_state_entries list.
|
||
|
// The entries that represent dictionary word paths are kept at the
|
||
|
// front of the list and do not count towards the size limit.
|
||
|
// The list ordered by cost that is computed collectively by several
|
||
|
// language model components (currently dawg and ngram components).
|
||
|
//
|
||
|
// best_path_by_column records the lowest cost path found so far for each
|
||
|
// column of the chunks_record->ratings matrix over all the rows. This
|
||
|
// array is updated if a lower cost ViterbiStateEntry is created in curr_col.
|
||
|
LanguageModelFlagsType UpdateState(
|
||
|
LanguageModelFlagsType changed,
|
||
|
int curr_col, int curr_row,
|
||
|
BLOB_CHOICE_LIST *curr_list,
|
||
|
BLOB_CHOICE_LIST *parent_list,
|
||
|
HEAP *pain_points,
|
||
|
BestPathByColumn *best_path_by_column[],
|
||
|
CHUNKS_RECORD *chunks_record,
|
||
|
BestChoiceBundle *best_choice_bundle);
|
||
|
|
||
|
// Generates pain points from the problematic top choice paths when the
|
||
|
// segmentation search is guided by the character ngram model.
|
||
|
// It is necessary to consider problematic the top choice paths instead of
|
||
|
// the problematic lowest cost paths because the character ngram model
|
||
|
// might assign a very high cost to very improbable paths. For example,
|
||
|
// "liot" might have a much lower cost than "llot", and the character ngram
|
||
|
// model might detect a dip in probability for p(t|lio) at the end of the
|
||
|
// word, but not at the beginning (p(i|l) would be ok). However, looking at
|
||
|
// the dips in character ngram probability of the top choices we would be
|
||
|
// able to stop the problematic points (p(l| l) would be low).
|
||
|
void GenerateNgramModelPainPointsFromColumn(int col, int row,
|
||
|
HEAP *pain_points,
|
||
|
CHUNKS_RECORD *chunks_record);
|
||
|
|
||
|
// Generates pain points from the problematic lowest cost paths that are
|
||
|
// "promising" (i.e. would have the cost lower than the one recorded in
|
||
|
// best_path_by_column if the problematic ending of the path is removed
|
||
|
// and after being combined with another blob the certainty of the last
|
||
|
// blob is improved).
|
||
|
void GenerateProblematicPathPainPointsFromColumn(
|
||
|
int col, int row, float best_choice_cert,
|
||
|
HEAP *pain_points, BestPathByColumn *best_path_by_column[],
|
||
|
CHUNKS_RECORD *chunks_record);
|
||
|
|
||
|
// This function can be called after processing column col of the
|
||
|
// chunks_record->ratings matrix in order to find the promising paths
|
||
|
// that were terminated or made inconsistent by the character choices
|
||
|
// in column col. If such paths are identified, this function generates
|
||
|
// pain points to combine the problematic cells of the matrix.
|
||
|
void GeneratePainPointsFromColumn(
|
||
|
int col,
|
||
|
const GenericVector<int> &non_empty_rows,
|
||
|
float best_choice_cert,
|
||
|
HEAP *pain_points,
|
||
|
BestPathByColumn *best_path_by_column[],
|
||
|
CHUNKS_RECORD *chunks_record);
|
||
|
|
||
|
// Generates a pain point for each problematic point on the best choice
|
||
|
// path. Such problematic points could be a termination of a dictionary
|
||
|
// word, dip in ngram probability, invalid punctuation, inconsistent
|
||
|
// case/chartype/script or punctuation in the middle of a word.
|
||
|
void GeneratePainPointsFromBestChoice(
|
||
|
HEAP *pain_points,
|
||
|
CHUNKS_RECORD *chunks_record,
|
||
|
BestChoiceBundle *best_choice_bundle);
|
||
|
|
||
|
// Adds a pain point to the given pain_points queue that will cause
|
||
|
// the entry at chunks_record->ratings(col, row) to be classified.
|
||
|
// The priority of the pain point is set to be:
|
||
|
//
|
||
|
// priority_adjustment * sqrt(avg_parent_cost)
|
||
|
// ----------------------------------------------------
|
||
|
// sqrt(dict_parent_path_length) * |worst_piece_cert|
|
||
|
//
|
||
|
// The priority is further lowered if fragmented is true.
|
||
|
//
|
||
|
void GeneratePainPoint(int col, int row, bool ok_to_extend,
|
||
|
float priority_adjustment,
|
||
|
float worst_piece_cert,
|
||
|
bool fragmented,
|
||
|
float best_choice_cert,
|
||
|
float max_char_wh_ratio,
|
||
|
BLOB_CHOICE *parent_b,
|
||
|
ViterbiStateEntry *parent_vse,
|
||
|
CHUNKS_RECORD *chunks_record,
|
||
|
HEAP *pain_points);
|
||
|
|
||
|
// Returns true if an acceptable best choice was discovered.
|
||
|
inline bool AcceptableChoiceFound() { return acceptable_choice_found_; }
|
||
|
|
||
|
// Fills cert with the worst certainty of the top non-fragmented choice
|
||
|
// of the left and right neighbor of the given col,row.
|
||
|
// Sets fragmented if any of the neighbors have a fragmented character
|
||
|
// as the top choice.
|
||
|
inline void GetWorstPieceCertainty(int col, int row, MATRIX *ratings,
|
||
|
float *cert, bool *fragmented) {
|
||
|
*cert = 0.0f;
|
||
|
*fragmented = false;
|
||
|
if (row > 0) {
|
||
|
GetPieceCertainty(ratings->get(col, row-1), cert, fragmented);
|
||
|
}
|
||
|
if (col+1 < ratings->dimension()) {
|
||
|
GetPieceCertainty(ratings->get(col+1, row), cert, fragmented);
|
||
|
}
|
||
|
ASSERT_HOST(*cert < 0.0f);
|
||
|
}
|
||
|
|
||
|
protected:
|
||
|
|
||
|
// Maps a certainty to the score -1 / cert. No check is made for cert == 0.
inline static float CertaintyScore(float cert) {
  const float score = -1.0f / cert;
  return score;
}
|
||
|
|
||
|
inline bool NonAlphaOrDigitMiddle(int col, int row, int dimension,
|
||
|
UNICHAR_ID unichar_id) {
|
||
|
return (!dict_->getUnicharset().get_isalpha(unichar_id) &&
|
||
|
!dict_->getUnicharset().get_isdigit(unichar_id) &&
|
||
|
col > 0 && row+1 < dimension);
|
||
|
}
|
||
|
|
||
|
inline bool IsFragment(BLOB_CHOICE *b) {
  // The unicharset's get_fragment() result for b's unichar id, converted
  // to bool.
  const bool is_fragment =
      dict_->getUnicharset().get_fragment(b->unichar_id());
  return is_fragment;
}
|
||
|
|
||
|
inline bool IsHan(int script_id) {
|
||
|
return ((dict_->getUnicharset().han_sid() !=
|
||
|
dict_->getUnicharset().null_sid()) &&
|
||
|
(script_id == dict_->getUnicharset().han_sid()));
|
||
|
}
|
||
|
|
||
|
// Finds the first non-fragmented character in the given BLOB_CHOICE_LIST
|
||
|
// and update cert if its certainty is less than the one recorded in cert.
|
||
|
// Sets fragmented if the first choice in BLOB_CHOICE_LIST is a fragment.
|
||
|
inline void GetPieceCertainty(BLOB_CHOICE_LIST *blist,
                              float *cert, bool *fragmented) {
  // Unclassified or empty cells contribute nothing.
  if (blist == NOT_CLASSIFIED || blist->empty()) return;

  // Step over leading fragments, remembering that at least one was seen.
  BLOB_CHOICE_IT bit(blist);
  for (; !bit.at_last() && IsFragment(bit.data()); bit.forward()) {
    *fragmented = true;
  }

  // Each classification must have at least one non-fragmented choice.
  ASSERT_HOST(!IsFragment(bit.data()));

  // Keep the lower of the recorded certainty and this choice's certainty.
  const float choice_cert = bit.data()->certainty();
  if (choice_cert < *cert) {
    *cert = choice_cert;
  }
}
|
||
|
|
||
|
inline float ComputeAdjustment(int num_problems, float penalty) {
|
||
|
if (num_problems == 0) return 0.0f;
|
||
|
if (num_problems == 1) return penalty;
|
||
|
return (penalty + (language_model_penalty_increment *
|
||
|
static_cast<float>(num_problems-1)));
|
||
|
}
|
||
|
|
||
|
// Computes the adjustment to the ratings sum based on the given
|
||
|
// consistency_info. The paths with invalid punctuation, inconsistent
|
||
|
// case and character type are penalized proportionally to the number
|
||
|
// of inconsistencies on the path.
|
||
|
inline float ComputeConsistencyAdjustment(
|
||
|
const LanguageModelDawgInfo *dawg_info,
|
||
|
const LanguageModelConsistencyInfo &consistency_info) {
|
||
|
if (dawg_info != NULL) {
|
||
|
return ComputeAdjustment(consistency_info.NumInconsistentCase(),
|
||
|
language_model_penalty_case);
|
||
|
}
|
||
|
return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
|
||
|
language_model_penalty_punc) +
|
||
|
ComputeAdjustment(consistency_info.NumInconsistentCase(),
|
||
|
language_model_penalty_case) +
|
||
|
ComputeAdjustment(consistency_info.NumInconsistentChartype(),
|
||
|
language_model_penalty_chartype) +
|
||
|
(consistency_info.inconsistent_script ?
|
||
|
language_model_penalty_script : 0.0f));
|
||
|
}
|
||
|
|
||
|
// Returns an adjusted ratings sum that includes inconsistency penalties.
|
||
|
inline float ComputeConsistencyAdjustedRatingsSum(
|
||
|
float ratings_sum,
|
||
|
const LanguageModelDawgInfo *dawg_info,
|
||
|
const LanguageModelConsistencyInfo &consistency_info) {
|
||
|
return (ratings_sum * (1.0f + ComputeConsistencyAdjustment(
|
||
|
dawg_info, consistency_info)));
|
||
|
}
|
||
|
|
||
|
// Returns an adjusted ratings sum that includes inconsistency penalties,
|
||
|
// penalties for non-dictionary paths and paths with dips in ngram
|
||
|
// probability.
|
||
|
float ComputeAdjustedPathCost(
|
||
|
float ratings_sum, int length, float dawg_score,
|
||
|
const LanguageModelDawgInfo *dawg_info,
|
||
|
const LanguageModelNgramInfo *ngram_info,
|
||
|
const LanguageModelConsistencyInfo &consistency_info,
|
||
|
const AssociateStats &associate_stats,
|
||
|
ViterbiStateEntry *parent_vse);
|
||
|
|
||
|
// Returns true if the given ViterbiStateEntry represents a problematic
|
||
|
// path. A path is considered problematic if the last unichar makes it
|
||
|
// inconsistent, introduces a dip in ngram probability or transforms a
|
||
|
// dictionary path into a non-dictionary one.
|
||
|
bool ProblematicPath(const ViterbiStateEntry &vse,
|
||
|
UNICHAR_ID unichar_id, bool word_end);
|
||
|
|
||
|
// Finds the first lower and upper case character in curr_list.
|
||
|
// If none found, chooses the first character in the list.
|
||
|
void GetTopChoiceLowerUpper(LanguageModelFlagsType changed,
|
||
|
BLOB_CHOICE_LIST *curr_list,
|
||
|
BLOB_CHOICE **first_lower,
|
||
|
BLOB_CHOICE **first_upper);
|
||
|
|
||
|
// Helper function that computes the cost of the path composed of the
|
||
|
// path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
|
||
|
// Adds a new ViterbiStateEntry to the list of viterbi entries
|
||
|
// in the given BLOB_CHOICE if the new path looks good enough.
|
||
|
// Returns LanguageModelFlagsType that indicates which language
|
||
|
// model components were involved in creating the new entry.
|
||
|
LanguageModelFlagsType AddViterbiStateEntry(
|
||
|
LanguageModelFlagsType top_choice_flags,
|
||
|
float denom,
|
||
|
bool word_end,
|
||
|
int curr_col, int curr_row,
|
||
|
BLOB_CHOICE *b,
|
||
|
BLOB_CHOICE *parent_b,
|
||
|
ViterbiStateEntry *parent_vse,
|
||
|
HEAP *pain_points,
|
||
|
BestPathByColumn *best_path_by_column[],
|
||
|
CHUNKS_RECORD *chunks_record,
|
||
|
BestChoiceBundle *best_choice_bundle);
|
||
|
|
||
|
// Pretty print information in the given ViterbiStateEntry.
|
||
|
void PrintViterbiStateEntry(const char *msg,
|
||
|
ViterbiStateEntry *vse,
|
||
|
BLOB_CHOICE *b,
|
||
|
CHUNKS_RECORD *chunks_record);
|
||
|
|
||
|
// Determines whether a potential entry is a true top choice and
|
||
|
// updates changed accordingly.
|
||
|
//
|
||
|
// Note: The function assumes that b, top_choice_flags and changed
|
||
|
// are not NULL.
|
||
|
void GenerateTopChoiceInfo(
|
||
|
float ratings_sum,
|
||
|
const LanguageModelDawgInfo *dawg_info,
|
||
|
const LanguageModelConsistencyInfo &consistency_info,
|
||
|
const ViterbiStateEntry *parent_vse,
|
||
|
BLOB_CHOICE *b,
|
||
|
LanguageModelFlagsType *top_choice_flags,
|
||
|
LanguageModelFlagsType *changed);
|
||
|
|
||
|
// Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
|
||
|
// unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
|
||
|
// with updated active dawgs, constraints and permuter.
|
||
|
//
|
||
|
// Note: the caller is responsible for deleting the returned pointer.
|
||
|
LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int script_id,
|
||
|
int curr_col, int curr_row,
|
||
|
const BLOB_CHOICE &b,
|
||
|
const ViterbiStateEntry *parent_vse,
|
||
|
LanguageModelFlagsType *changed);
|
||
|
|
||
|
// Computes p(unichar | parent context) and records it in ngram_cost.
|
||
|
// If b.unichar_id() is an unlikely continuation of the parent context
|
||
|
// sets found_small_prob to true and returns NULL.
|
||
|
// Otherwise creates a new LanguageModelNgramInfo entry containing the
|
||
|
// updated context (that includes b.unichar_id() at the end) and returns it.
|
||
|
//
|
||
|
// Note: the caller is responsible for deleting the returned pointer.
|
||
|
LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar,
|
||
|
float certainty, float denom,
|
||
|
int curr_col, int curr_row,
|
||
|
const ViterbiStateEntry *parent_vse,
|
||
|
BLOB_CHOICE *parent_b,
|
||
|
LanguageModelFlagsType *changed);
|
||
|
|
||
|
// Computes -(log(prob(classifier)) + log(prob(ngram model)))
|
||
|
// for the given unichar in the given context. If there are multiple
|
||
|
// unichars at one position - takes the average of their probabilities.
|
||
|
// UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
|
||
|
// since probability_in_context() can only handle one at a time (while
|
||
|
// unicharset might contain ngrams and glyphs composed from multiple UTF8
|
||
|
// characters).
|
||
|
float ComputeNgramCost(const char *unichar, float certainty, float denom,
|
||
|
const char *context,
|
||
|
int *unichar_step_len, bool *found_small_prob);
|
||
|
|
||
|
// Computes the normalization factors for the classifier confidences
|
||
|
// (used by ComputeNgramCost()).
|
||
|
float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
|
||
|
|
||
|
// Fills the given consistency_info based on parent_vse.consistency_info
|
||
|
// and on the consistency of the given unichar_id with parent_vse.
|
||
|
void FillConsistencyInfo(
|
||
|
bool word_end, UNICHAR_ID unichar_id,
|
||
|
ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b,
|
||
|
LanguageModelConsistencyInfo *consistency_info);
|
||
|
|
||
|
// Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
|
||
|
// on the path represented by the given BLOB_CHOICE and language model
|
||
|
// state entries (lmse, dse). The path is re-constructed by following
|
||
|
// the parent pointers in the lang model state entries. If the
|
||
|
// constructed WERD_CHOICE is better than the best/raw choice recorded
|
||
|
// in the best_choice_bundle, this function updates the corresponding
|
||
|
// fields and sets best_choice_bundle->updated to true.
|
||
|
void UpdateBestChoice(BLOB_CHOICE *b,
|
||
|
ViterbiStateEntry *vse,
|
||
|
HEAP *pain_points,
|
||
|
CHUNKS_RECORD *chunks_record,
|
||
|
BestChoiceBundle *best_choice_bundle);
|
||
|
|
||
|
// Constructs a WERD_CHOICE by tracing parent pointers starting with
|
||
|
// the given LanguageModelStateEntry. Returns the constructed word.
|
||
|
// Updates best_char_choices, certainties and state if they are not
|
||
|
// NULL (best_char_choices and certainties are assumed to have the
|
||
|
// length equal to lmse->length).
|
||
|
// The caller is responsible for freeing memory associated with the
|
||
|
// returned WERD_CHOICE.
|
||
|
WERD_CHOICE *ConstructWord(BLOB_CHOICE *b,
|
||
|
ViterbiStateEntry *vse,
|
||
|
CHUNKS_RECORD *chunks_record,
|
||
|
BLOB_CHOICE_LIST_VECTOR *best_char_choices,
|
||
|
float certainties[],
|
||
|
float *dawg_score,
|
||
|
STATE *state);
|
||
|
|
||
|
// This function is used for non-space delimited languages when looking
|
||
|
// for word endings recorded while trying to separate the path into words.
|
||
|
//
|
||
|
// The function increments covered if a valid word ending is found in
|
||
|
// active_dawgs (if covered is incremented, skip is set to the number
|
||
|
// of unichars that should be skipped because they are covered by the
|
||
|
// word whose ending was just discovered).
|
||
|
//
|
||
|
// dawg_score and dawg_score_done are updated if:
|
||
|
// -- at the end of the path we discover a valid word ending from a
|
||
|
// non-fixed length dawg (this means that the whole word is a
|
||
|
// valid word, so dawg_score is set to 1.0f)
|
||
|
// -- word_start is true (dawg_score is set to covered / word length)
|
||
|
//
|
||
|
// Note: this function assumes that skip, covered, dawg_score and
|
||
|
// dawg_score_done are not NULL.
|
||
|
void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs,
|
||
|
int word_index, int word_length,
|
||
|
int *skip, int *covered,
|
||
|
float *dawg_score,
|
||
|
bool *dawg_score_done);
|
||
|
|
||
|
// Wrapper around AssociateUtils::ComputeStats().
|
||
|
inline void ComputeAssociateStats(int col, int row,
|
||
|
float max_char_wh_ratio,
|
||
|
ViterbiStateEntry *parent_vse,
|
||
|
CHUNKS_RECORD *chunks_record,
|
||
|
AssociateStats *associate_stats) {
|
||
|
AssociateUtils::ComputeStats(
|
||
|
col, row,
|
||
|
(parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
|
||
|
(parent_vse != NULL) ? parent_vse->length : 0,
|
||
|
fixed_pitch_, max_char_wh_ratio, denorm_,
|
||
|
chunks_record, language_model_debug_level, associate_stats);
|
||
|
}
|
||
|
|
||
|
// Returns true if the path with such top_choice_flags and dawg_info
|
||
|
// could be pruned out (i.e. is neither a dictionary nor a top choice path).
|
||
|
// In non-space delimited languages all paths can be "somewhat" dictionary
|
||
|
// words. In such languages we can not do dictionary-driven path pruning,
|
||
|
// so paths with non-empty dawg_info are considered prunable.
|
||
|
inline bool PrunablePath(LanguageModelFlagsType top_choice_flags,
                         const LanguageModelDawgInfo *dawg_info) {
  // Not prunable if any top choice flag is set, or if the path has
  // dawg_info while no fixed length dawgs are in use
  // (GetMaxFixedLengthDawgIndex() < 0). Short-circuit order is kept so the
  // dictionary is only consulted when needed.
  return (!top_choice_flags &&
          !(dawg_info != NULL &&
            dict_->GetMaxFixedLengthDawgIndex() < 0));
}
|
||
|
|
||
|
// Returns true if the given script id indicates a path that might consist
|
||
|
// of non-space delimited words (e.g. when dealing with Chinese and Japanese
|
||
|
// languages), and fixed length dawgs were loaded.
|
||
|
//
|
||
|
// TODO(daria): generate fixed length dawgs for Thai.
|
||
|
inline bool UseFixedLengthDawgs(int script_id) {
|
||
|
if (dict_->GetMaxFixedLengthDawgIndex() < 0) return false;
|
||
|
if ((dict_->getUnicharset().han_sid() !=
|
||
|
dict_->getUnicharset().null_sid()) &&
|
||
|
script_id == dict_->getUnicharset().han_sid()) return true;
|
||
|
if ((dict_->getUnicharset().hiragana_sid() !=
|
||
|
dict_->getUnicharset().null_sid()) &&
|
||
|
script_id == dict_->getUnicharset().hiragana_sid()) return true;
|
||
|
if ((dict_->getUnicharset().katakana_sid() !=
|
||
|
dict_->getUnicharset().null_sid()) &&
|
||
|
script_id == dict_->getUnicharset().katakana_sid()) return true;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Returns true if the given ViterbiStateEntry represents an acceptable path.
|
||
|
inline bool AcceptablePath(const ViterbiStateEntry &vse) {
|
||
|
return (vse.dawg_info != NULL || vse.Consistent() ||
|
||
|
(vse.ngram_info != NULL && !vse.ngram_info->pruned));
|
||
|
}
|
||
|
|
||
|
public:
|
||
|
// Parameters.
|
||
|
INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
|
||
|
BOOL_VAR_H(language_model_ngram_on, false,
|
||
|
"Turn on/off the use of character ngram model");
|
||
|
INT_VAR_H(language_model_ngram_order, 8,
|
||
|
"Maximum order of the character ngram model");
|
||
|
// Upper bound on the number of entries kept in a viterbi list (default 10).
// The description is assembled from adjacent string literals, which the
// compiler joins with nothing in between, so the separating space has to be
// written explicitly at the start of the continuation literal.
INT_VAR_H(language_model_max_viterbi_list_size, 10,
          "Maximum size of viterbi lists recorded in BLOB_CHOICEs"
          " (excluding entries that represent dictionary word paths)");
|
||
|
double_VAR_H(language_model_ngram_small_prob, 0.000001,
|
||
|
"To avoid overly small denominators use this as the floor"
|
||
|
" of the probability returned by the ngram model");
|
||
|
double_VAR_H(language_model_ngram_nonmatch_score, -40.0,
|
||
|
"Average classifier score of a non-matching unichar.");
|
||
|
BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false,
|
||
|
"Use only the first UTF8 step of the given string"
|
||
|
" when computing log probabilities");
|
||
|
double_VAR_H(language_model_ngram_scale_factor, 0.03,
|
||
|
"Strength of the character ngram model relative to the"
|
||
|
" character classifier ");
|
||
|
INT_VAR_H(language_model_min_compound_length, 3,
|
||
|
"Minimum length of compound words");
|
||
|
INT_VAR_H(language_model_fixed_length_choices_depth, 3,
|
||
|
"Depth of blob choice lists to explore"
|
||
|
" when fixed length dawgs are on");
|
||
|
// Penalties used for adjusting path costs and final word rating.
|
||
|
double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
|
||
|
"Penalty for words not in the frequent word dictionary");
|
||
|
double_VAR_H(language_model_penalty_non_dict_word, 0.15,
|
||
|
"Penalty for non-dictionary words");
|
||
|
double_VAR_H(language_model_penalty_punc, 0.2,
|
||
|
"Penalty for inconsistent punctuation");
|
||
|
double_VAR_H(language_model_penalty_case, 0.1,
|
||
|
"Penalty for inconsistent case");
|
||
|
double_VAR_H(language_model_penalty_script, 0.5,
|
||
|
"Penalty for inconsistent script");
|
||
|
double_VAR_H(language_model_penalty_chartype, 0.3,
|
||
|
"Penalty for inconsistent character type");
|
||
|
double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
|
||
|
|
||
|
protected:
|
||
|
// Member Variables.
|
||
|
|
||
|
// Temporary DawgArgs struct that is re-used across different words to
|
||
|
// avoid dynamic memory re-allocation (should be cleared before each use).
|
||
|
DawgArgs *dawg_args_;
|
||
|
// List of pointers to updated flags used by Viterbi search to mark
|
||
|
// recently updated ViterbiStateEntries.
|
||
|
GenericVector<bool *> updated_flags_;
|
||
|
|
||
|
// The following variables are set at construction time.
|
||
|
|
||
|
// Pointer to Dict class, that is used for querying the dictionaries
|
||
|
// (the pointer is not owned by LanguageModel).
|
||
|
Dict *dict_;
|
||
|
// DENORM computed by Tesseract (not owned by LanguageModel).
|
||
|
const DENORM *denorm_;
|
||
|
// TODO(daria): the following variables should become LanguageModel params
|
||
|
// when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
|
||
|
//
|
||
|
// Set to true if we are dealing with fixed pitch text
|
||
|
// (set to assume_fixed_pitch_char_segment).
|
||
|
bool fixed_pitch_;
|
||
|
// Max char width-to-height ratio allowed
|
||
|
// (set to segsearch_max_char_wh_ratio).
|
||
|
float max_char_wh_ratio_;
|
||
|
|
||
|
// The following variables are initialized with InitForWord().
|
||
|
|
||
|
// String representation of the classification of the previous word
|
||
|
// (since this is only used by the character ngram model component,
|
||
|
// only the last language_model_ngram_order of the word are stored).
|
||
|
STRING prev_word_str_;
|
||
|
int prev_word_unichar_step_len_;
|
||
|
// Active dawg and constraints vector.
|
||
|
DawgInfoVector *beginning_active_dawgs_;
|
||
|
DawgInfoVector *beginning_constraints_;
|
||
|
DawgInfoVector *fixed_length_beginning_active_dawgs_;
|
||
|
DawgInfoVector *empty_dawg_info_vec_;
|
||
|
// Maximum adjustment factor for character ngram choices.
|
||
|
float max_penalty_adjust_;
|
||
|
// Set to true if acceptable choice was discovered.
|
||
|
// Note: it would be nice to use this to terminate the search once an
|
||
|
// acceptable choice is found. However we do not do that and once an
|
||
|
// acceptable choice is found we finish looking for alternative choices
|
||
|
// in the current segmentation graph and then exit the search (no more
|
||
|
// classifications are done after an acceptable choice is found).
|
||
|
// This is needed in order to let the search find the words very close to
|
||
|
// the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
|
||
|
// choices. This way the stopper will know that the best choice is not
|
||
|
// ambiguous (i.e. there are best choices in the best choice list that have
|
||
|
// ratings close to the very best one) and will be less likely to mis-adapt.
|
||
|
bool acceptable_choice_found_;
|
||
|
|
||
|
};
|
||
|
|
||
|
} // namespace tesseract
|
||
|
|
||
|
#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
|