/////////////////////////////////////////////////////////////////////// // File: dict.h // Description: dict class. // Author: Samuel Charron // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_DICT_DICT_H_ #define TESSERACT_DICT_DICT_H_ #include "ambigs.h" #include "choices.h" #include "choicearr.h" #include "dawg.h" #include "image.h" #include "ratngs.h" #include "stopper.h" #include "trie.h" #include "unicharset.h" extern STRING_VAR_H(global_user_words_suffix, "user-words", "A list of user-provided words."); extern INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); #define MAX_WERD_LENGTH (inT64) 40 #define NO_RATING -1 #define FREQ_WERD 1.0 #define GOOD_WERD 1.1 #define OK_WERD 1.3125 // Struct used to hold temporary information about fragments. struct CHAR_FRAGMENT_INFO { UNICHAR_ID unichar_id; const CHAR_FRAGMENT *fragment; int num_fragments; float rating; float certainty; }; namespace tesseract { typedef GenericVector DawgVector; struct DawgArgs { DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud, DawgInfoVector *uc, float r) : active_dawgs(d), constraints(c), updated_active_dawgs(ud), updated_constraints(uc), rating_margin(r) { for (int i = 0; i < MAX_WERD_LENGTH; ++i) { rating_array[i] = NO_RATING; } permuter = NO_PERM; } DawgInfoVector *active_dawgs; DawgInfoVector *constraints; DawgInfoVector *updated_active_dawgs; DawgInfoVector *updated_constraints; PermuterType permuter; float rating_margin; // prunning margin ratio float rating_array[MAX_WERD_LENGTH]; }; class Dict { public: Dict(Image* image_ptr); ~Dict(); Image* getImage() { return image_ptr_; } UNICHARSET& getUnicharset() { return getImage()->getCCUtil()->unicharset; } const UnicharAmbigs &getUnicharAmbigs() { return getImage()->getCCUtil()->unichar_ambigs; } /* hyphen.cpp ************************************************************/ // Returns true if we've recorded the beginning of a hyphenated word. inline bool hyphenated() { return !last_word_on_line_ && hyphen_word_; } // Size of the base word (the part on the line before) of a hyphenated word. inline int hyphen_base_size() { return this->hyphenated() ? hyphen_word_->length() : 0; } // If this word is hyphenated copy the base word (the part on // the line before) of a hyphenated word into the given word. // This function assumes that word is not NULL. inline void copy_hyphen_info(WERD_CHOICE *word) { if (this->hyphenated()) { *word = *hyphen_word_; if (hyphen_debug_level) word->print("copy_hyphen_info: "); } } // Erase the unichar ids corresponding to the portion of the word // from the previous line. The word is not changed if it is not // split between lines and hyphenated. inline void remove_hyphen_head(WERD_CHOICE *word) { if (this->hyphenated()) { word->remove_unichar_ids(0, hyphen_word_->length()); if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: "); } } // Check whether the word has a hyphen at the end. inline bool has_hyphen_end(const WERD_CHOICE &word) { int word_index = word.length() - 1; return (last_word_on_line_ && word_index > 0 && word.unichar_id(word_index) == hyphen_unichar_id_); } // Unless the previous word was the last one on the line, and the current // one is not (thus it is the first one on the line), erase hyphen_word_, // clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_. void reset_hyphen_vars(bool last_word_on_line); // Update hyphen_word_, and copy the given DawgInfoVectors into // hyphen_active_dawgs_ and hyphen_constraints_. void set_hyphen_word(const WERD_CHOICE &word, const DawgInfoVector &active_dawgs, const DawgInfoVector &constraints); /* permdawg.cpp ************************************************************/ // If new_rating < best_choice->rating(), copy word int best_choice // and update rating and permuter of best_choice to the new given values. inline void update_best_choice( const WERD_CHOICE &word, WERD_CHOICE *best_choice) { if (word.rating() < best_choice->rating()) { *best_choice = word; } } // Fill the given active_dawgs vector with dawgs that could contain the // beginning of the word. If hyphenated() returns true, copy the entries // from hyphen_active_dawgs_ instead. void init_active_dawgs(DawgInfoVector *active_dawgs); // If hyphenated() returns true, copy the entries from hyphen_constraints_ // into the given constraints vector. void init_constraints(DawgInfoVector *constraints); // Recursively explore all the possible character combinations in // the given char_choices. Use go_deeper_dawg_fxn() to explore all the // dawgs in the dawgs_ vector in parallel and discard invalid words. // // Allocate and return a WERD_CHOICE with the best valid word found. WERD_CHOICE *dawg_permute_and_select( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); void adjust_word(WERD_CHOICE *best_choice, float *certainty_array); // If the choice being composed so far could be a dictionary word // and we have not reached the end of the word keep exploring the // char_choices further. // Also: // -- set hyphen word if needed // -- if word_ending is true and word is better than best_choice // copy word to best_choice log new word choice void go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *void_more_args); /* permute.cpp *************************************************************/ void add_document_word(const WERD_CHOICE &best_choice); void init_permute(); WERD_CHOICE *permute_top_choice( const BLOB_CHOICE_LIST_VECTOR &char_choices, float* rating_limit, WERD_CHOICE *raw_choice, BOOL8 *any_alpha); const char* choose_il1(const char *first_char, //first choice const char *second_char, //second choice const char *third_char, //third choice const char *prev_char, //prev in word const char *next_char, //next in word const char *next_next_char); //after next next in word int valid_word(const WERD_CHOICE &word) { return valid_word(word, false); // return NO_PERM for words with digits } int valid_word_or_number(const WERD_CHOICE &word) { return valid_word(word, true); // return NUMBER_PERM for valid numbers } int valid_word(const WERD_CHOICE &word, bool numbers_ok); bool valid_punctuation(const WERD_CHOICE &word); WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, WERD_CHOICE *raw_choice); void end_permute(); void adjust_non_word(WERD_CHOICE *word, float *adjust_factor); void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int start, int end, WERD_CHOICE *current_word); void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, float limit, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice); WERD_CHOICE *permute_compound_words( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); // checks if the dominant word script, if there is one, is same as target. bool word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices, int target_script_id); // Incoporate segmentation cost into word rating void incorporate_segcost(WERD_CHOICE* word); // checks for script-consistent permutations WERD_CHOICE *permute_script_words( const BLOB_CHOICE_LIST_VECTOR &char_choices); WERD_CHOICE *top_fragments_permute_and_select( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); // If the choice being composed so far could be better // than best_choice keep exploring char_choices. // If we have reached the end of the word and word is better than // best_choice, copy word to best_choice and log a new word choice. void go_deeper_top_fragments_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *more_args); // Semi-generic functions used by multiple permuters. bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info); void permute_choices( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *more_args); void append_choices( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *more_args); // Pointer to go_deeper function that will be modified by various permuters. void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *void_more_args); /* stopper.cpp *************************************************************/ int NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs); void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, BLOB_CHOICE_LIST_VECTOR *blob_choices, bool *modified_blobs); inline void DisableChoiceAccum() { keep_word_choices_ = FALSE; } inline void EnableChoiceAccum() { keep_word_choices_ = TRUE; } int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice); VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[]); void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice); int StringSameAs(const char *String, const char *String_lengths, VIABLE_CHOICE ViableChoice); bool StringSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice); int AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, WERD_CHOICE *BestChoice, const WERD_CHOICE &RawChoice, DANGERR *fixpt, ACCEPTABLE_CHOICE_CALLER caller, bool *modified_blobs); int AcceptableResult(const WERD_CHOICE &BestChoice, const WERD_CHOICE &RawChoice); int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice); void LogNewChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice); void EndDangerousAmbigs(); int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice); FLOAT32 CurrentBestChoiceAdjustFactor(); int CurrentWordAmbig(); void DebugWordChoices(); void PrintAmbigAlternatives(FILE *file, const char *label, int label_num_unichars); void FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], bool SameString, VIABLE_CHOICE ViableChoice); int AlternativeChoicesWorseThan(FLOAT32 Threshold); void FilterWordChoices(); void FindClassifierErrors(FLOAT32 MinRating, FLOAT32 MaxRating, FLOAT32 RatingMargin, FLOAT32 Thresholds[]); void InitChoiceAccum(); void LogNewSegmentation(PIECES_STATE BlobWidth); void LogNewSplit(int Blob); void SettupStopperPass1(); void SettupStopperPass2(); /* choices.cpp *************************************************************/ void print_word_string(const char* str); void print_word_choice(const char *label, A_CHOICE* choice); void print_choices(const char *label, CHOICES rating); // List of (A_CHOICE*). /* permngram.cpp ***********************************************************/ A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices, float rating_limit, const Dawg *dawg); /* dawg.cpp ****************************************************************/ // Returns the maximal permuter code (from ccstruct/ratngs.h) if in light // of the current state the letter at word_index in the given word // is allowed according to at least one of the dawgs in dawgs_, // otherwise returns NO_PERM. // // The state is described by void_dawg_args, which are interpreted as // DawgArgs and contain two relevant input vectors: active_dawgs and // constraints. Each entry in the active_dawgs vector contains an index // into the dawgs_ vector and an EDGE_REF that indicates the last edge // followed in the dawg. Each entry in the constraints vector contains // an index into the dawgs_ vector and an EDGE_REF that indicates an edge // in a pattern dawg followed to match a pattern. Currently constraints // are used to save the state of punctuation dawgs after leading // punctuation was found. // // Input: // At word_index 0 dawg_args->active_dawgs should contain an entry for each // dawg whose type has a bit set in kBeginningDawgsType, // dawg_args->constraints should be empty. EDGE_REFs in active_dawgs and // constraints vectors should be initialized to NO_EDGE. If hyphen state // needs to be applied, initial dawg_args->active_dawgs and // dawg_args->constrains can be copied from the saved hyphen state // (maintained by Dict). // For word_index > 0 the corresponding state (active_dawgs and constraints) // can be obtained from dawg_args->updated_* passed to def_letter_is_okay // for word_index-1. // Note: the function assumes that active_dags, constraints and updated_* // member variables of dawg_args are not NULL. // // Output: // The function fills in dawg_args->updated_active_dawgs vector with the // entries for dawgs that contain the word up to the letter at word_index. // The new constraints (if any) are added to dawg_args->updated_constraints, // the constraints from dawg_args->constraints are also copied into it. // // Detailed description: // In order to determine whether the word is still valid after considering // all the letters up to the one at word_index the following is done for // each entry in dawg_args->active_dawgs: // // -- next starting node is obtained from entry.ref and edge_char_of() is // called to obtain the next edge // -- if a valid edge is found, the function returns the updated permuter // code true and an entry [entry.dawg_index, edge] is inserted in // dawg_args->updated_active_dawgs // otherwise: // -- if we are dealing with dawg of type DAWG_TYPE_PUNCTUATION, // edge_char_of() is called again, but now with kPatternUnicharID // as unichar_id; if a valid edge is found it is recorded in // dawg_args->updated_constraints // -- the function checks whether the word can end with the previous // letter // -- each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD // could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the // successors are defined by successors_ vector) is explored and if // a letter is found in the successor dawg, a new entry is inserted // into dawg_args->updated_active_dawgs with EDGE_REF being either // NO_EDGE or an EDGE_REF recorded in constraints vector for the // corresponding dawg index // int def_letter_is_okay(void* void_dawg_args, int word_index, const void* word, bool word_end); int new_letter_is_okay(void* void_dawg_args, int word_index, const void* word, bool word_end); int (Dict::*letter_is_okay_)(void* void_dawg_args, int word_index, const void *word, bool word_end); // Return the number of dawgs in the dawgs_ vector. inline const int NumDawgs() const { return dawgs_.size(); } // Return i-th dawg pointer recorded in the dawgs_ vector. inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } // At word ending make sure all the recorded constraints are satisfied. // Each constraint signifies that we found a beginning pattern in a // pattern dawg. Check that this pattern can end here (e.g. if some // leading punctuation is found this would ensure that we are not // expecting any particular trailing punctuation after the word). inline bool ConstraintsOk(const DawgInfoVector &constraints, int word_end, DawgType current_dawg_type) { if (!word_end) return true; if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true; for (int c = 0; c < constraints.length(); ++c) { const DawgInfo &cinfo = constraints[c]; Dawg *cdawg = dawgs_[cinfo.dawg_index]; if (!cdawg->end_of_word(cinfo.ref)) { if (dawg_debug_level >= 3) { tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n", cinfo.dawg_index, cinfo.ref); } return false; } } return true; } // Record the maximum of the two permuters in permuter. static inline void UpdatePermuter(PermuterType new_permuter, PermuterType *permuter) { if (dawg_debug_level >= 3) tprintf("Letter found\n"); if (new_permuter > *permuter) *permuter = new_permuter; } /* conversion.cpp **********************************************************/ // TODO(daria): remove these function when conversion.cpp is deprecated // and all the code is converted to work with unichar ids. void LogNewWordChoice(A_CHOICE *a_choice, FLOAT32 adjust_factor, const float certainties[], const UNICHARSET &unicharset); int valid_word(const char *string); private: // Private member variables. Image* image_ptr_; // Table that stores ambiguities computed during training // (loaded when NoDangerousAmbigs() is called for the first time). // Each entry i in the table stores a set of amibiguities whose // wrong ngram starts with unichar id i. UnicharAmbigs *dang_ambigs_table_; // Same as above, but for ambiguities with replace flag set. UnicharAmbigs *replace_ambigs_table_; // Flag used to disable accumulation of word choices // during compound word permutation. BOOL8 keep_word_choices_; // Additional certainty padding allowed before a word is rejected. FLOAT32 reject_offset_; // Current word segmentation. PIECES_STATE current_segmentation_; // Variables to keep track of best/raw word choices. VIABLE_CHOICE best_raw_choice_; LIST raw_choices_; LIST best_choices_; // Hyphen-related variables. UNICHAR_ID hyphen_unichar_id_; WERD_CHOICE *hyphen_word_; DawgInfoVector hyphen_active_dawgs_; DawgInfoVector hyphen_constraints_; bool last_word_on_line_; // Dawgs. DawgVector dawgs_; SuccessorListsVector successors_; Dawg *freq_dawg_; Trie *pending_words_; // The following pointers are only cached for convenience. // The dawgs will be deleted when dawgs_ vector is destroyed. // TODO(daria): need to support multiple languages in the future, // so maybe will need to maintain a list of dawgs of each kind. Trie *document_words_; }; } // namespace tesseract #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_