mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
483 lines
21 KiB
C
483 lines
21 KiB
C
|
///////////////////////////////////////////////////////////////////////
|
||
|
// File: dict.h
|
||
|
// Description: dict class.
|
||
|
// Author: Samuel Charron
|
||
|
//
|
||
|
// (C) Copyright 2006, Google Inc.
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
//
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
|
||
|
#ifndef TESSERACT_DICT_DICT_H_
|
||
|
#define TESSERACT_DICT_DICT_H_
|
||
|
|
||
|
#include "ambigs.h"
|
||
|
#include "choices.h"
|
||
|
#include "choicearr.h"
|
||
|
#include "dawg.h"
|
||
|
#include "image.h"
|
||
|
#include "ratngs.h"
|
||
|
#include "stopper.h"
|
||
|
#include "trie.h"
|
||
|
#include "unicharset.h"
|
||
|
|
||
|
extern STRING_VAR_H(global_user_words_suffix, "user-words",
|
||
|
"A list of user-provided words.");
|
||
|
extern INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
|
||
|
|
||
|
#define MAX_WERD_LENGTH (inT64) 40
|
||
|
#define NO_RATING -1
|
||
|
#define FREQ_WERD 1.0
|
||
|
#define GOOD_WERD 1.1
|
||
|
#define OK_WERD 1.3125
|
||
|
|
||
|
// Struct used to hold temporary information about fragments.
|
||
|
struct CHAR_FRAGMENT_INFO {
|
||
|
UNICHAR_ID unichar_id;
|
||
|
const CHAR_FRAGMENT *fragment;
|
||
|
int num_fragments;
|
||
|
float rating;
|
||
|
float certainty;
|
||
|
};
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
typedef GenericVector<Dawg *> DawgVector;
|
||
|
|
||
|
struct DawgArgs {
|
||
|
DawgArgs(DawgInfoVector *d, DawgInfoVector *c,
|
||
|
DawgInfoVector *ud, DawgInfoVector *uc, float r) :
|
||
|
active_dawgs(d), constraints(c), updated_active_dawgs(ud),
|
||
|
updated_constraints(uc), rating_margin(r) {
|
||
|
for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
|
||
|
rating_array[i] = NO_RATING;
|
||
|
}
|
||
|
permuter = NO_PERM;
|
||
|
}
|
||
|
DawgInfoVector *active_dawgs;
|
||
|
DawgInfoVector *constraints;
|
||
|
DawgInfoVector *updated_active_dawgs;
|
||
|
DawgInfoVector *updated_constraints;
|
||
|
PermuterType permuter;
|
||
|
float rating_margin; // prunning margin ratio
|
||
|
float rating_array[MAX_WERD_LENGTH];
|
||
|
};
|
||
|
|
||
|
class Dict {
|
||
|
public:
|
||
|
Dict(Image* image_ptr);
|
||
|
~Dict();
|
||
|
Image* getImage() {
|
||
|
return image_ptr_;
|
||
|
}
|
||
|
UNICHARSET& getUnicharset() {
|
||
|
return getImage()->getCCUtil()->unicharset;
|
||
|
}
|
||
|
const UnicharAmbigs &getUnicharAmbigs() {
|
||
|
return getImage()->getCCUtil()->unichar_ambigs;
|
||
|
}
|
||
|
|
||
|
/* hyphen.cpp ************************************************************/
|
||
|
|
||
|
// Returns true if we've recorded the beginning of a hyphenated word.
|
||
|
inline bool hyphenated() { return !last_word_on_line_ && hyphen_word_; }
|
||
|
// Size of the base word (the part on the line before) of a hyphenated word.
|
||
|
inline int hyphen_base_size() {
|
||
|
return this->hyphenated() ? hyphen_word_->length() : 0;
|
||
|
}
|
||
|
// If this word is hyphenated copy the base word (the part on
|
||
|
// the line before) of a hyphenated word into the given word.
|
||
|
// This function assumes that word is not NULL.
|
||
|
inline void copy_hyphen_info(WERD_CHOICE *word) {
|
||
|
if (this->hyphenated()) {
|
||
|
*word = *hyphen_word_;
|
||
|
if (hyphen_debug_level) word->print("copy_hyphen_info: ");
|
||
|
}
|
||
|
}
|
||
|
// Erase the unichar ids corresponding to the portion of the word
|
||
|
// from the previous line. The word is not changed if it is not
|
||
|
// split between lines and hyphenated.
|
||
|
inline void remove_hyphen_head(WERD_CHOICE *word) {
|
||
|
if (this->hyphenated()) {
|
||
|
word->remove_unichar_ids(0, hyphen_word_->length());
|
||
|
if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
|
||
|
}
|
||
|
}
|
||
|
// Check whether the word has a hyphen at the end.
|
||
|
inline bool has_hyphen_end(const WERD_CHOICE &word) {
|
||
|
int word_index = word.length() - 1;
|
||
|
return (last_word_on_line_ && word_index > 0 &&
|
||
|
word.unichar_id(word_index) == hyphen_unichar_id_);
|
||
|
}
|
||
|
// Unless the previous word was the last one on the line, and the current
|
||
|
// one is not (thus it is the first one on the line), erase hyphen_word_,
|
||
|
// clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_.
|
||
|
void reset_hyphen_vars(bool last_word_on_line);
|
||
|
// Update hyphen_word_, and copy the given DawgInfoVectors into
|
||
|
// hyphen_active_dawgs_ and hyphen_constraints_.
|
||
|
void set_hyphen_word(const WERD_CHOICE &word,
|
||
|
const DawgInfoVector &active_dawgs,
|
||
|
const DawgInfoVector &constraints);
|
||
|
|
||
|
/* permdawg.cpp ************************************************************/
|
||
|
// If new_rating < best_choice->rating(), copy word int best_choice
|
||
|
// and update rating and permuter of best_choice to the new given values.
|
||
|
inline void update_best_choice(
|
||
|
const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
|
||
|
if (word.rating() < best_choice->rating()) {
|
||
|
*best_choice = word;
|
||
|
}
|
||
|
}
|
||
|
// Fill the given active_dawgs vector with dawgs that could contain the
|
||
|
// beginning of the word. If hyphenated() returns true, copy the entries
|
||
|
// from hyphen_active_dawgs_ instead.
|
||
|
void init_active_dawgs(DawgInfoVector *active_dawgs);
|
||
|
// If hyphenated() returns true, copy the entries from hyphen_constraints_
|
||
|
// into the given constraints vector.
|
||
|
void init_constraints(DawgInfoVector *constraints);
|
||
|
// Recursively explore all the possible character combinations in
|
||
|
// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
|
||
|
// dawgs in the dawgs_ vector in parallel and discard invalid words.
|
||
|
//
|
||
|
// Allocate and return a WERD_CHOICE with the best valid word found.
|
||
|
WERD_CHOICE *dawg_permute_and_select(
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
|
||
|
void adjust_word(WERD_CHOICE *best_choice,
|
||
|
float *certainty_array);
|
||
|
// If the choice being composed so far could be a dictionary word
|
||
|
// and we have not reached the end of the word keep exploring the
|
||
|
// char_choices further.
|
||
|
// Also:
|
||
|
// -- set hyphen word if needed
|
||
|
// -- if word_ending is true and word is better than best_choice
|
||
|
// copy word to best_choice log new word choice
|
||
|
void go_deeper_dawg_fxn(
|
||
|
const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
|
||
|
bool word_ending, WERD_CHOICE *word, float certainties[],
|
||
|
float *limit, WERD_CHOICE *best_choice, void *void_more_args);
|
||
|
|
||
|
/* permute.cpp *************************************************************/
|
||
|
void add_document_word(const WERD_CHOICE &best_choice);
|
||
|
void init_permute();
|
||
|
WERD_CHOICE *permute_top_choice(
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
float* rating_limit,
|
||
|
WERD_CHOICE *raw_choice,
|
||
|
BOOL8 *any_alpha);
|
||
|
const char* choose_il1(const char *first_char, //first choice
|
||
|
const char *second_char, //second choice
|
||
|
const char *third_char, //third choice
|
||
|
const char *prev_char, //prev in word
|
||
|
const char *next_char, //next in word
|
||
|
const char *next_next_char); //after next next in word
|
||
|
int valid_word(const WERD_CHOICE &word) {
|
||
|
return valid_word(word, false); // return NO_PERM for words with digits
|
||
|
}
|
||
|
int valid_word_or_number(const WERD_CHOICE &word) {
|
||
|
return valid_word(word, true); // return NUMBER_PERM for valid numbers
|
||
|
}
|
||
|
int valid_word(const WERD_CHOICE &word, bool numbers_ok);
|
||
|
bool valid_punctuation(const WERD_CHOICE &word);
|
||
|
WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
float rating_limit,
|
||
|
WERD_CHOICE *raw_choice);
|
||
|
void end_permute();
|
||
|
void adjust_non_word(WERD_CHOICE *word, float *adjust_factor);
|
||
|
void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
float rating_limit,
|
||
|
int start,
|
||
|
int end,
|
||
|
WERD_CHOICE *current_word);
|
||
|
void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
float limit,
|
||
|
WERD_CHOICE *best_choice,
|
||
|
WERD_CHOICE *raw_choice);
|
||
|
WERD_CHOICE *permute_compound_words(
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
float rating_limit);
|
||
|
// checks if the dominant word script, if there is one, is same as target.
|
||
|
bool word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
int target_script_id);
|
||
|
// Incoporate segmentation cost into word rating
|
||
|
void incorporate_segcost(WERD_CHOICE* word);
|
||
|
// checks for script-consistent permutations
|
||
|
WERD_CHOICE *permute_script_words(
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices);
|
||
|
|
||
|
WERD_CHOICE *top_fragments_permute_and_select(
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
float rating_limit);
|
||
|
// If the choice being composed so far could be better
|
||
|
// than best_choice keep exploring char_choices.
|
||
|
// If we have reached the end of the word and word is better than
|
||
|
// best_choice, copy word to best_choice and log a new word choice.
|
||
|
void go_deeper_top_fragments_fxn(
|
||
|
const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
|
||
|
bool word_ending, WERD_CHOICE *word, float certainties[],
|
||
|
float *limit, WERD_CHOICE *best_choice, void *more_args);
|
||
|
|
||
|
// Semi-generic functions used by multiple permuters.
|
||
|
bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
|
||
|
float curr_rating, float curr_certainty,
|
||
|
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
|
||
|
const char *debug, int word_ending,
|
||
|
CHAR_FRAGMENT_INFO *char_frag_info);
|
||
|
void permute_choices(
|
||
|
const char *debug,
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
int char_choice_index,
|
||
|
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
|
||
|
WERD_CHOICE *word,
|
||
|
float certainties[],
|
||
|
float *limit,
|
||
|
WERD_CHOICE *best_choice,
|
||
|
void *more_args);
|
||
|
|
||
|
void append_choices(
|
||
|
const char *debug,
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
const BLOB_CHOICE &blob_choice,
|
||
|
int char_choice_index,
|
||
|
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
|
||
|
WERD_CHOICE *word,
|
||
|
float certainties[],
|
||
|
float *limit,
|
||
|
WERD_CHOICE *best_choice,
|
||
|
void *more_args);
|
||
|
// Pointer to go_deeper function that will be modified by various permuters.
|
||
|
void (Dict::*go_deeper_fxn_)(const char *debug,
|
||
|
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||
|
int char_choice_index,
|
||
|
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
|
||
|
bool word_ending, WERD_CHOICE *word,
|
||
|
float certainties[], float *limit,
|
||
|
WERD_CHOICE *best_choice, void *void_more_args);
|
||
|
/* stopper.cpp *************************************************************/
|
||
|
int NoDangerousAmbig(WERD_CHOICE *BestChoice,
|
||
|
DANGERR *fixpt,
|
||
|
bool fix_replaceable,
|
||
|
BLOB_CHOICE_LIST_VECTOR *Choices,
|
||
|
bool *modified_blobs);
|
||
|
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
|
||
|
UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
|
||
|
BLOB_CHOICE_LIST_VECTOR *blob_choices,
|
||
|
bool *modified_blobs);
|
||
|
|
||
|
inline void DisableChoiceAccum() { keep_word_choices_ = FALSE; }
|
||
|
inline void EnableChoiceAccum() { keep_word_choices_ = TRUE; }
|
||
|
|
||
|
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
|
||
|
VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
|
||
|
FLOAT32 AdjustFactor,
|
||
|
const float Certainties[]);
|
||
|
void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
|
||
|
int StringSameAs(const char *String,
|
||
|
const char *String_lengths,
|
||
|
VIABLE_CHOICE ViableChoice);
|
||
|
bool StringSameAs(const WERD_CHOICE &WordChoice,
|
||
|
VIABLE_CHOICE ViableChoice);
|
||
|
int AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
|
||
|
WERD_CHOICE *BestChoice,
|
||
|
const WERD_CHOICE &RawChoice,
|
||
|
DANGERR *fixpt,
|
||
|
ACCEPTABLE_CHOICE_CALLER caller,
|
||
|
bool *modified_blobs);
|
||
|
int AcceptableResult(const WERD_CHOICE &BestChoice,
|
||
|
const WERD_CHOICE &RawChoice);
|
||
|
int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
|
||
|
void LogNewChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor,
|
||
|
const float Certainties[], bool raw_choice);
|
||
|
void EndDangerousAmbigs();
|
||
|
int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
|
||
|
FLOAT32 CurrentBestChoiceAdjustFactor();
|
||
|
int CurrentWordAmbig();
|
||
|
void DebugWordChoices();
|
||
|
void PrintAmbigAlternatives(FILE *file, const char *label,
|
||
|
int label_num_unichars);
|
||
|
void FillViableChoice(const WERD_CHOICE &WordChoice,
|
||
|
FLOAT32 AdjustFactor, const float Certainties[],
|
||
|
bool SameString, VIABLE_CHOICE ViableChoice);
|
||
|
int AlternativeChoicesWorseThan(FLOAT32 Threshold);
|
||
|
void FilterWordChoices();
|
||
|
void FindClassifierErrors(FLOAT32 MinRating,
|
||
|
FLOAT32 MaxRating,
|
||
|
FLOAT32 RatingMargin,
|
||
|
FLOAT32 Thresholds[]);
|
||
|
void InitChoiceAccum();
|
||
|
void LogNewSegmentation(PIECES_STATE BlobWidth);
|
||
|
void LogNewSplit(int Blob);
|
||
|
void SettupStopperPass1();
|
||
|
void SettupStopperPass2();
|
||
|
/* choices.cpp *************************************************************/
|
||
|
void print_word_string(const char* str);
|
||
|
void print_word_choice(const char *label, A_CHOICE* choice);
|
||
|
void print_choices(const char *label,
|
||
|
CHOICES rating); // List of (A_CHOICE*).
|
||
|
/* permngram.cpp ***********************************************************/
|
||
|
A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
|
||
|
float rating_limit,
|
||
|
const Dawg *dawg);
|
||
|
/* dawg.cpp ****************************************************************/
|
||
|
|
||
|
// Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
|
||
|
// of the current state the letter at word_index in the given word
|
||
|
// is allowed according to at least one of the dawgs in dawgs_,
|
||
|
// otherwise returns NO_PERM.
|
||
|
//
|
||
|
// The state is described by void_dawg_args, which are interpreted as
|
||
|
// DawgArgs and contain two relevant input vectors: active_dawgs and
|
||
|
// constraints. Each entry in the active_dawgs vector contains an index
|
||
|
// into the dawgs_ vector and an EDGE_REF that indicates the last edge
|
||
|
// followed in the dawg. Each entry in the constraints vector contains
|
||
|
// an index into the dawgs_ vector and an EDGE_REF that indicates an edge
|
||
|
// in a pattern dawg followed to match a pattern. Currently constraints
|
||
|
// are used to save the state of punctuation dawgs after leading
|
||
|
// punctuation was found.
|
||
|
//
|
||
|
// Input:
|
||
|
// At word_index 0 dawg_args->active_dawgs should contain an entry for each
|
||
|
// dawg whose type has a bit set in kBeginningDawgsType,
|
||
|
// dawg_args->constraints should be empty. EDGE_REFs in active_dawgs and
|
||
|
// constraints vectors should be initialized to NO_EDGE. If hyphen state
|
||
|
// needs to be applied, initial dawg_args->active_dawgs and
|
||
|
// dawg_args->constrains can be copied from the saved hyphen state
|
||
|
// (maintained by Dict).
|
||
|
// For word_index > 0 the corresponding state (active_dawgs and constraints)
|
||
|
// can be obtained from dawg_args->updated_* passed to def_letter_is_okay
|
||
|
// for word_index-1.
|
||
|
// Note: the function assumes that active_dags, constraints and updated_*
|
||
|
// member variables of dawg_args are not NULL.
|
||
|
//
|
||
|
// Output:
|
||
|
// The function fills in dawg_args->updated_active_dawgs vector with the
|
||
|
// entries for dawgs that contain the word up to the letter at word_index.
|
||
|
// The new constraints (if any) are added to dawg_args->updated_constraints,
|
||
|
// the constraints from dawg_args->constraints are also copied into it.
|
||
|
//
|
||
|
// Detailed description:
|
||
|
// In order to determine whether the word is still valid after considering
|
||
|
// all the letters up to the one at word_index the following is done for
|
||
|
// each entry in dawg_args->active_dawgs:
|
||
|
//
|
||
|
// -- next starting node is obtained from entry.ref and edge_char_of() is
|
||
|
// called to obtain the next edge
|
||
|
// -- if a valid edge is found, the function returns the updated permuter
|
||
|
// code true and an entry [entry.dawg_index, edge] is inserted in
|
||
|
// dawg_args->updated_active_dawgs
|
||
|
// otherwise:
|
||
|
// -- if we are dealing with dawg of type DAWG_TYPE_PUNCTUATION,
|
||
|
// edge_char_of() is called again, but now with kPatternUnicharID
|
||
|
// as unichar_id; if a valid edge is found it is recorded in
|
||
|
// dawg_args->updated_constraints
|
||
|
// -- the function checks whether the word can end with the previous
|
||
|
// letter
|
||
|
// -- each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD
|
||
|
// could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the
|
||
|
// successors are defined by successors_ vector) is explored and if
|
||
|
// a letter is found in the successor dawg, a new entry is inserted
|
||
|
// into dawg_args->updated_active_dawgs with EDGE_REF being either
|
||
|
// NO_EDGE or an EDGE_REF recorded in constraints vector for the
|
||
|
// corresponding dawg index
|
||
|
|
||
|
//
|
||
|
int def_letter_is_okay(void* void_dawg_args, int word_index,
|
||
|
const void* word, bool word_end);
|
||
|
|
||
|
int new_letter_is_okay(void* void_dawg_args, int word_index,
|
||
|
const void* word, bool word_end);
|
||
|
int (Dict::*letter_is_okay_)(void* void_dawg_args, int word_index,
|
||
|
const void *word, bool word_end);
|
||
|
// Return the number of dawgs in the dawgs_ vector.
|
||
|
inline const int NumDawgs() const { return dawgs_.size(); }
|
||
|
// Return i-th dawg pointer recorded in the dawgs_ vector.
|
||
|
inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
|
||
|
// At word ending make sure all the recorded constraints are satisfied.
|
||
|
// Each constraint signifies that we found a beginning pattern in a
|
||
|
// pattern dawg. Check that this pattern can end here (e.g. if some
|
||
|
// leading punctuation is found this would ensure that we are not
|
||
|
// expecting any particular trailing punctuation after the word).
|
||
|
inline bool ConstraintsOk(const DawgInfoVector &constraints,
|
||
|
int word_end, DawgType current_dawg_type) {
|
||
|
if (!word_end) return true;
|
||
|
if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
|
||
|
for (int c = 0; c < constraints.length(); ++c) {
|
||
|
const DawgInfo &cinfo = constraints[c];
|
||
|
Dawg *cdawg = dawgs_[cinfo.dawg_index];
|
||
|
if (!cdawg->end_of_word(cinfo.ref)) {
|
||
|
if (dawg_debug_level >= 3) {
|
||
|
tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
|
||
|
cinfo.dawg_index, cinfo.ref);
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
// Record the maximum of the two permuters in permuter.
|
||
|
static inline void UpdatePermuter(PermuterType new_permuter,
|
||
|
PermuterType *permuter) {
|
||
|
if (dawg_debug_level >= 3) tprintf("Letter found\n");
|
||
|
if (new_permuter > *permuter) *permuter = new_permuter;
|
||
|
}
|
||
|
|
||
|
/* conversion.cpp **********************************************************/
|
||
|
// TODO(daria): remove these function when conversion.cpp is deprecated
|
||
|
// and all the code is converted to work with unichar ids.
|
||
|
void LogNewWordChoice(A_CHOICE *a_choice,
|
||
|
FLOAT32 adjust_factor,
|
||
|
const float certainties[],
|
||
|
const UNICHARSET &unicharset);
|
||
|
int valid_word(const char *string);
|
||
|
|
||
|
private:
|
||
|
// Private member variables.
|
||
|
Image* image_ptr_;
|
||
|
// Table that stores ambiguities computed during training
|
||
|
// (loaded when NoDangerousAmbigs() is called for the first time).
|
||
|
// Each entry i in the table stores a set of amibiguities whose
|
||
|
// wrong ngram starts with unichar id i.
|
||
|
UnicharAmbigs *dang_ambigs_table_;
|
||
|
// Same as above, but for ambiguities with replace flag set.
|
||
|
UnicharAmbigs *replace_ambigs_table_;
|
||
|
// Flag used to disable accumulation of word choices
|
||
|
// during compound word permutation.
|
||
|
BOOL8 keep_word_choices_;
|
||
|
// Additional certainty padding allowed before a word is rejected.
|
||
|
FLOAT32 reject_offset_;
|
||
|
// Current word segmentation.
|
||
|
PIECES_STATE current_segmentation_;
|
||
|
// Variables to keep track of best/raw word choices.
|
||
|
VIABLE_CHOICE best_raw_choice_;
|
||
|
LIST raw_choices_;
|
||
|
LIST best_choices_;
|
||
|
// Hyphen-related variables.
|
||
|
UNICHAR_ID hyphen_unichar_id_;
|
||
|
WERD_CHOICE *hyphen_word_;
|
||
|
DawgInfoVector hyphen_active_dawgs_;
|
||
|
DawgInfoVector hyphen_constraints_;
|
||
|
bool last_word_on_line_;
|
||
|
// Dawgs.
|
||
|
DawgVector dawgs_;
|
||
|
SuccessorListsVector successors_;
|
||
|
Dawg *freq_dawg_;
|
||
|
Trie *pending_words_;
|
||
|
// The following pointers are only cached for convenience.
|
||
|
// The dawgs will be deleted when dawgs_ vector is destroyed.
|
||
|
// TODO(daria): need to support multiple languages in the future,
|
||
|
// so maybe will need to maintain a list of dawgs of each kind.
|
||
|
Trie *document_words_;
|
||
|
};
|
||
|
} // namespace tesseract
|
||
|
|
||
|
#endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_
|