/////////////////////////////////////////////////////////////////////// // File: dict.h // Description: dict class. // Author: Samuel Charron // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_DICT_DICT_H_ #define TESSERACT_DICT_DICT_H_ #include "ambigs.h" #include "dawg.h" #include "host.h" #include "image.h" #include "oldlist.h" #include "ratngs.h" #include "stopper.h" #include "trie.h" #include "unicharset.h" #include "permute.h" #define MAX_WERD_LENGTH (inT64) 128 #define NO_RATING -1 /** Struct used to hold temporary information about fragments. */ struct CHAR_FRAGMENT_INFO { UNICHAR_ID unichar_id; const CHAR_FRAGMENT *fragment; int num_fragments; float rating; float certainty; }; namespace tesseract { typedef GenericVector DawgVector; // // Constants // static const int kAnyWordLength = -1; static const int kRatingPad = 4; // TODO(daria): If hyphens are different in different languages and can be // inferred from training data we should load their values dynamically. static const char kHyphenSymbol[] = "-"; static const int kMaxNumDawgEdgees = 2000000; static const int kMaxDocDawgEdges = 250000; static const int kMaxUserDawgEdges = 50000; static const float kSimCertaintyScale = -10.0; // similarity matcher scaling static const float kSimCertaintyOffset = -10.0; // similarity matcher offset static const float kSimilarityFloor = 100.0; // worst E*L product to stop on static const int kDocDictMaxRepChars = 4; struct DawgArgs { DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud, DawgInfoVector *uc, float r, PermuterType p, int len, int e) : active_dawgs(d), constraints(c), updated_active_dawgs(ud), updated_constraints(uc), rating_margin(r) { for (int i = 0; i < MAX_WERD_LENGTH; ++i) { rating_array[i] = NO_RATING; } permuter = p; sought_word_length = len; end_char_choice_index = e; } DawgInfoVector *active_dawgs; DawgInfoVector *constraints; DawgInfoVector *updated_active_dawgs; DawgInfoVector *updated_constraints; PermuterType permuter; int sought_word_length; // TODO(daria): remove these fields when permdawg is deprecated. float rating_margin; /**< pruning margin ratio */ float rating_array[MAX_WERD_LENGTH]; int end_char_choice_index; }; class Dict { public: Dict(Image* image_ptr); ~Dict(); const Image* getImage() const { return image_ptr_; } Image* getImage() { return image_ptr_; } const UNICHARSET& getUnicharset() const { return getImage()->getCCUtil()->unicharset; } UNICHARSET& getUnicharset() { return getImage()->getCCUtil()->unicharset; } const UnicharAmbigs &getUnicharAmbigs() { return getImage()->getCCUtil()->unichar_ambigs; } inline bool compound_marker(UNICHAR_ID unichar_id) { return (unichar_id == getUnicharset().unichar_to_id("-") || unichar_id == getUnicharset().unichar_to_id("/")); } /* hyphen.cpp ************************************************************/ /// Returns true if we've recorded the beginning of a hyphenated word. inline bool hyphenated() const { return !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0; } /// Size of the base word (the part on the line before) of a hyphenated word. inline int hyphen_base_size() const { return this->hyphenated() ? hyphen_word_->length() : 0; } /// If this word is hyphenated copy the base word (the part on /// the line before) of a hyphenated word into the given word. /// This function assumes that word is not NULL. inline void copy_hyphen_info(WERD_CHOICE *word) const { if (this->hyphenated()) { *word = *hyphen_word_; if (hyphen_debug_level) word->print("copy_hyphen_info: "); } } /// Erase the unichar ids corresponding to the portion of the word /// from the previous line. The word is not changed if it is not /// split between lines and hyphenated. inline void remove_hyphen_head(WERD_CHOICE *word) const { if (this->hyphenated()) { word->remove_unichar_ids(0, hyphen_word_->length()); if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: "); } } /// Check whether the word has a hyphen at the end. inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const { return (last_word_on_line_ && !first_pos && unichar_id == hyphen_unichar_id_); } /// Same as above, but check the unichar at the end of the word. inline bool has_hyphen_end(const WERD_CHOICE &word) const { int word_index = word.length() - 1; return has_hyphen_end(word.unichar_id(word_index), word_index == 0); } /// Unless the previous word was the last one on the line, and the current /// one is not (thus it is the first one on the line), erase hyphen_word_, /// clear hyphen_active_dawgs_, hyphen_constraints_ update last_word_on_line_. void reset_hyphen_vars(bool last_word_on_line); /// Update hyphen_word_, and copy the given DawgInfoVectors into /// hyphen_active_dawgs_ and hyphen_constraints_. void set_hyphen_word(const WERD_CHOICE &word, const DawgInfoVector &active_dawgs, const DawgInfoVector &constraints); /* permdawg.cpp ************************************************************/ /// Copies word into best_choice if its rating is smaller /// than that of best_choice. inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) { if (word.rating() < best_choice->rating()) *best_choice = word; } /// Fill the given active_dawgs vector with dawgs that could contain the /// beginning of the word. If hyphenated() returns true, copy the entries /// from hyphen_active_dawgs_ instead. void init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, bool ambigs_mode) const; /// If hyphenated() returns true, copy the entries from hyphen_constraints_ /// into the given constraints vector. void init_constraints(DawgInfoVector *constraints) const; /// Returns true if we are operating in ambigs mode. inline bool ambigs_mode(float rating_limit) { return rating_limit <= 0.0; } /// Recursively explore all the possible character combinations in /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the /// dawgs in the dawgs_ vector in parallel and discard invalid words. /// /// Allocate and return a WERD_CHOICE with the best valid word found. WERD_CHOICE *dawg_permute_and_select( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int end_char_choice_index); WERD_CHOICE *dawg_permute_and_select( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) { return dawg_permute_and_select(char_choices, rating_limit, kAnyWordLength, 0); } /// If the choice being composed so far could be a dictionary word /// and we have not reached the end of the word keep exploring the /// char_choices further. /// Also: /// -- sets hyphen word if needed /// -- if word_ending is true and the word is better than best_choice, /// copies word to best_choice and logs new word choice void go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args); /* permute.cpp *************************************************************/ WERD_CHOICE *get_top_choice_word( const BLOB_CHOICE_LIST_VECTOR &char_choices); WERD_CHOICE *permute_top_choice( const BLOB_CHOICE_LIST_VECTOR &char_choices, float* rating_limit, WERD_CHOICE *raw_choice, BOOL8 *any_alpha); const char* choose_il1(const char *first_char, //first choice const char *second_char, //second choice const char *third_char, //third choice const char *prev_char, //prev in word const char *next_char, //next in word const char *next_next_char); //after next next in word WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, const WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice); void end_permute(); void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int start, int end, WERD_CHOICE *current_word); bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice); WERD_CHOICE *permute_compound_words( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); /// Find permutations matching a list of fixed-char-length dawgs /// The bestchoice based on this permuter alone is returned. Alternatively, /// non-conflicting changes can be combined through permuter_state. WERD_CHOICE *permute_fixed_length_words( const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state); /// Incoporate segmentation cost into word rating void incorporate_segcost(WERD_CHOICE* word); /// Checks for script-consistent permutations. Similar to fixed-length /// permuter, the best choice is returned by the function, but the combined /// changes are also recorded into permuter_state. WERD_CHOICE *permute_script_words( const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state); /// checks for consistency in character property (eg. alpah, digit, punct) WERD_CHOICE *permute_chartype_words( const BLOB_CHOICE_LIST_VECTOR &char_choices, PermuterState *permuter_state); /// Look up the main chartype for each character position and store it in /// the given array. Also returns the dominant type from unambiguous top /// choices. char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices, char* pos_chartypes); WERD_CHOICE *top_fragments_permute_and_select( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit); /// While the choice being composed so far could be better /// than best_choice keeps exploring char_choices. /// If the end of the word is reached and the word is better than /// best_choice, copies word to best_choice and logs the new word choice. void go_deeper_top_fragments_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args); /// Semi-generic functions used by multiple permuters. bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info); void permute_choices( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args); void append_choices( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args); /// Pointer to go_deeper function that will be modified by various permuters. void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args); /* stopper.cpp *************************************************************/ bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs); double StopperAmbigThreshold(double f1, double f2) { return (f2 - f1) * stopper_ambiguity_threshold_gain - stopper_ambiguity_threshold_offset; } // If the certainty of any chunk in Choice (item1) is not ambiguous with the // corresponding chunk in the best choice (item2), frees Choice and // returns true. int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice void *item2); // EXPANDED_CHOICE *BestChoice /// Replaces the corresponding wrong ngram in werd_choice with the correct /// one. We indicate that this newly inserted ngram unichar is composed from /// several fragments and modify the corresponding entries in blob_choices to /// contain fragments of the correct ngram unichar instead of the original /// unichars. Ratings and certainties of entries in blob_choices and /// werd_choice are unichaged. E.g. for werd_choice mystring'' and ambiguity /// ''->": werd_choice becomes mystring", first ' in blob_choices becomes /// |"|0|2, second one is set to |"|1|2. void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, BLOB_CHOICE_LIST_VECTOR *blob_choices, bool *modified_blobs); inline void DisableChoiceAccum() { keep_word_choices_ = false; } inline void EnableChoiceAccum() { keep_word_choices_ = true; } inline bool ChoiceAccumEnabled() { return keep_word_choices_; } /// Returns the length of the shortest alpha run in WordChoice. int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice); /// Allocates a new viable choice data structure, copies WordChoice, /// Certainties, and current_segmentation_ into it, returns a pointer to /// the newly created VIABLE_CHOICE. /// WordChoice is a choice to be converted to a viable choice. /// AdjustFactor is a factor used to adjust ratings for WordChoice. /// Certainties contain certainty for each character in WordChoice. VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[]); /// Dumps a text representation of the specified Choice to File. void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice); /// Compares unichar ids in word_choice to those in viable_choice, /// returns true if they are the same. bool StringSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice); /// Compares String to ViableChoice and returns true if they are the same. bool StringSameAs(const char *String, const char *String_lengths, VIABLE_CHOICE ViableChoice); /// Returns true if the certainty of the BestChoice word is within a /// reasonable range of the average certainties for the best choices for /// each character in the segmentation. This test is used to catch words /// in which one character is much worse than the other characters in the /// word (i.e. false will be returned in that case). The algorithm computes /// the mean and std deviation of the certainties in the word with the worst /// certainty thrown out. int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, const WERD_CHOICE &BestChoice); /// Returns true if the given best_choice is good enough to stop. bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, WERD_CHOICE *BestChoice, DANGERR *fixpt, ACCEPTABLE_CHOICE_CALLER caller, bool *modified_blobs); /// Returns false if the best choice for the current word is questionable /// and should be tried again on the second pass or should be flagged to /// the user. bool AcceptableResult(const WERD_CHOICE &BestChoice); /// Compares the corresponding strings of WordChoice and ViableChoice and /// returns true if they are the same. int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice); /// Adds Choice to ChoicesList if the adjusted certainty for Choice is within /// a reasonable range of the best choice in ChoicesList. The ChoicesList list /// is kept in sorted order by rating. Duplicates are removed. /// WordChoice is the new choice for current word. /// AdjustFactor is an adjustment factor which was applied to choice. /// Certainties are certainties for each char in new choice. /// raw_choice indicates whether WordChoice is a raw or best choice. void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice); void EndDangerousAmbigs(); /// Returns true if WordChoice is the same as the current best choice. bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice); /// Returns the adjustment factor for the best choice for the current word. FLOAT32 CurrentBestChoiceAdjustFactor(); /// Returns true if there are multiple good choices for the current word. bool CurrentWordAmbig(); /// Prints the current choices for this word to stdout. void DebugWordChoices(); /// Print all the choices in raw_choices_ list for non 1-1 ambiguities. void PrintAmbigAlternatives(FILE *file, const char *label, int label_num_unichars); /// Fill ViableChoice with information from WordChoice, AChoice, AdjustFactor, /// and Certainties. void FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], VIABLE_CHOICE ViableChoice); /// Returns true if there are no alternative choices for the current word /// or if all alternatives have an adjust factor worse than Threshold. bool AlternativeChoicesWorseThan(FLOAT32 Threshold); /// Removes from best_choices_ all choices which are not within a reasonable /// range of the best choice. void FilterWordChoices(); /// Compares the best choice for the current word to the best raw choice /// to determine which characters were classified incorrectly by the /// classifier. Then places a separate threshold into Thresholds for each /// character in the word. If the classifier was correct, MaxRating is placed /// into Thresholds. If the classifier was incorrect, the avg. match rating /// (error percentage) of the classifier's incorrect choice minus some margin /// is placed into thresholds.This can then be used by the caller to try to /// create a new template for the desired class that will classify the /// character with a rating better than the threshold value. The match rating /// placed into Thresholds is never allowed to be below MinRating in order to /// prevent trying to make overly tight templates. /// MinRating limits how tight to make a template. /// MaxRating limits how loose to make a template. /// RatingMargin denotes the amount of margin to put in template. void FindClassifierErrors(FLOAT32 MinRating, FLOAT32 MaxRating, FLOAT32 RatingMargin, FLOAT32 Thresholds[]); /// Initializes the data structures used to keep track the good word choices /// found for a word. void InitChoiceAccum(); /// Clears best_choices_ list accumulated by the stopper. void ClearBestChoiceAccum(); /// Updates the blob widths in current_segmentation_ to be the same as /// provided in BlobWidth. BlobWidth[] contains the number of chunks in each /// blob in the current segmentation. void LogNewSegmentation(PIECES_STATE BlobWidth); /// Given Blob (the index of the blob that was split), adds 1 chunk to the /// specified blob for each choice in best_choices_ and for best_raw_choice_. void LogNewSplit(int Blob); /// Increments the chunk count of the character in Choice which corresponds /// to Blob (index of the blob being split). void AddNewChunk(VIABLE_CHOICE Choice, int Blob); /// Sets up stopper variables in preparation for the first pass. void SettupStopperPass1(); /// Sets up stopper variables in preparation for the second pass. void SettupStopperPass2(); /* context.cpp *************************************************************/ /// Check a string to see if it matches a set of lexical rules. int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset); /// Returns true if the word looks like an absolute garbage /// (e.g. image mistakenly recognized as text). bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset); /* dict.cpp ****************************************************************/ /// Initialize Dict class - load dawgs from [lang].traineddata and /// user-specified wordlist and parttern list. void Load(); void End(); // Resets the document dictionary analogous to ResetAdaptiveClassifier. void ResetDocumentDictionary() { if (pending_words_ != NULL) pending_words_->clear(); if (document_words_ != NULL) document_words_->clear(); } // Create unicharset adaptations of known, short lists of UTF-8 equivalent // characters (think all hyphen-like symbols). The first version of the // list is taken as equivalent for matching against the dictionary. void LoadEquivalenceList(const char *unichar_strings[]); // Normalize all hyphen and apostrophes to the canonicalized one for // matching; pass everything else through as is. See LoadEquivalenceList(). UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const; /** * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light * of the current state the letter at word_index in the given word * is allowed according to at least one of the dawgs in dawgs_, * otherwise returns NO_PERM. * * The state is described by void_dawg_args, which are interpreted as * DawgArgs and contain two relevant input vectors: active_dawgs and * constraints. Each entry in the active_dawgs vector contains an index * into the dawgs_ vector and an EDGE_REF that indicates the last edge * followed in the dawg. Each entry in the constraints vector contains * an index into the dawgs_ vector and an EDGE_REF that indicates an edge * in a pattern dawg followed to match a pattern. Currently constraints * are used to save the state of punctuation dawgs after leading * punctuation was found. * * Input: * At word_index 0 dawg_args->active_dawgs should contain an entry for each * dawg whose type has a bit set in kBeginningDawgsType, * dawg_args->constraints should be empty. EDGE_REFs in active_dawgs and * constraints vectors should be initialized to NO_EDGE. If hyphen state * needs to be applied, initial dawg_args->active_dawgs and * dawg_args->constrains can be copied from the saved hyphen state * (maintained by Dict). * For word_index > 0 the corresponding state (active_dawgs and constraints) * can be obtained from dawg_args->updated_* passed to def_letter_is_okay * for word_index-1. * Note: the function assumes that active_dags, constraints and updated_* * member variables of dawg_args are not NULL. * * Output: * The function fills in dawg_args->updated_active_dawgs vector with the * entries for dawgs that contain the word up to the letter at word_index. * The new constraints (if any) are added to dawg_args->updated_constraints, * the constraints from dawg_args->constraints are also copied into it. * * Detailed description: * In order to determine whether the word is still valid after considering * all the letters up to the one at word_index the following is done for * each entry in dawg_args->active_dawgs: * * - next starting node is obtained from entry.ref and edge_char_of() is * called to obtain the next edge * - if a valid edge is found, the function returns the updated permuter * code true and an entry [entry.dawg_index, edge] is inserted in * dawg_args->updated_active_dawgs * otherwise: * - if we are dealing with dawg of type DAWG_TYPE_PUNCTUATION, * edge_char_of() is called again, but now with kPatternUnicharID * as unichar_id; if a valid edge is found it is recorded in * dawg_args->updated_constraints * - the function checks whether the word can end with the previous * letter * - each successor of the dawg (e.g. dawgs with type DAWG_TYPE_WORD * could be successors to dawgs with type DAWG_TYPE_PUNCTUATION; the * successors are defined by successors_ vector) is explored and if * a letter is found in the successor dawg, a new entry is inserted * into dawg_args->updated_active_dawgs with EDGE_REF being either * NO_EDGE or an EDGE_REF recorded in constraints vector for the * corresponding dawg index */ // int def_letter_is_okay(void* void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const; int (Dict::*letter_is_okay_)(void* void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const; /// Calls letter_is_okay_ member function. int LetterIsOkay(void* void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const { return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end); } /// Probability in context function used by the ngram permuter. double (Dict::*probability_in_context_)(const char* lang, const char* context, int context_bytes, const char* character, int character_bytes); /// Calls probability_in_context_ member function. double ProbabilityInContext(const char* context, int context_bytes, const char* character, int character_bytes) { return (this->*probability_in_context_)( getImage()->getCCUtil()->lang.string(), context, context_bytes, character, character_bytes); } /// Default (no-op) implementation of probability in context function. double def_probability_in_context( const char* lang, const char* context, int context_bytes, const char* character, int character_bytes) { (void) context; (void) context_bytes; (void) character; (void) character_bytes; return 0.0; } double ngram_probability_in_context(const char* lang, const char* context, int context_bytes, const char* character, int character_bytes); /// Return the number of dawgs in the dawgs_ vector. inline const int NumDawgs() const { return dawgs_.size(); } /// Return i-th dawg pointer recorded in the dawgs_ vector. inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } /// Return the points to the punctuation dawg. inline const Dawg *GetPuncDawg() const { return punc_dawg_; } /// Return the points to the unambiguous words dawg. inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; } /// Return the pointer to the Dawg that contains words of length word_length. inline const Dawg *GetFixedLengthDawg(int word_length) const { if (word_length > max_fixed_length_dawgs_wdlen_) return NULL; assert(dawgs_.size() > word_length); return dawgs_[word_length]; } inline const int GetMaxFixedLengthDawgIndex() const { return max_fixed_length_dawgs_wdlen_; } /// Returns the appropriate next node given the EDGE_REF. static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) { if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg NODE_REF node = dawg->next_node(edge_ref); if (node == 0) node = NO_EDGE; // end of word return node; } /// At word ending make sure all the recorded constraints are satisfied. /// Each constraint signifies that we found a beginning pattern in a /// pattern dawg. Check that this pattern can end here (e.g. if some /// leading punctuation is found this would ensure that we are not /// expecting any particular trailing punctuation after the word). inline bool ConstraintsOk(const DawgInfoVector &constraints, int word_end, DawgType current_dawg_type) const { if (!word_end) return true; if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true; for (int c = 0; c < constraints.length(); ++c) { const DawgInfo &cinfo = constraints[c]; Dawg *cdawg = dawgs_[cinfo.dawg_index]; if (!cdawg->end_of_word(cinfo.ref)) { if (dawg_debug_level >= 3) { tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n", cinfo.dawg_index, cinfo.ref); } return false; } } return true; } /// For each of the character classes of the given unichar_id (and the /// unichar_id itself) finds the corresponding outgoing node or self-loop /// in the given dawg and (after checking that it is valid) records it in /// dawg_args->updated_ative_dawgs. Updates current_permuter if any valid /// edges were found. void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const; /// Read/Write/Access special purpose dawgs which contain words /// only of a certain length (used for phrase search for /// non-space-delimited languages). /// Reads a sequence of dawgs from the given file. /// Appends the constructed dawgs to the given dawg_vec. /// Fills the given table with indices of the dawgs in the /// dawg_vec corresponding to the dawgs with words /// of a particular length. static void ReadFixedLengthDawgs(DawgType type, const STRING &lang, PermuterType perm, int debug_level, FILE *file, DawgVector *dawg_vec, int *max_wdlen); /// Writes the dawgs in the dawgs_vec to a file. Updates the given table with /// the indices of dawgs in the dawg_vec for the corresponding word lengths. static void WriteFixedLengthDawgs( const GenericVector &dawg_vec, int num_dawgs, int debug_level, FILE *output_file); /// Check all the DAWGs to see if this word is in any of them. inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) { return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM)); } int valid_word(const WERD_CHOICE &word, bool numbers_ok) const; int valid_word(const WERD_CHOICE &word) const { return valid_word(word, false); // return NO_PERM for words with digits } int valid_word_or_number(const WERD_CHOICE &word) const { return valid_word(word, true); // return NUMBER_PERM for valid numbers } /// This function is used by api/tesseract_cube_combiner.cpp int valid_word(const char *string) const { WERD_CHOICE word(string, getUnicharset()); return valid_word(word); } // Do the two WERD_CHOICEs form a meaningful bigram? bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const; /// Returns true if the word contains a valid punctuation pattern. /// Note: Since the domains of punctuation symbols and symblos /// used in numbers are not disjoint, a valid number might contain /// an invalid punctuation pattern (e.g. .99). bool valid_punctuation(const WERD_CHOICE &word); /// Returns true if a good answer is found for the unknown blob rating. int good_choice(const WERD_CHOICE &choice); /// Adds a word found on this document to the document specific dictionary. void add_document_word(const WERD_CHOICE &best_choice); int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET &unicharset); /// Adjusts the rating of the given word. void adjust_word(WERD_CHOICE *word, float *certainty_array, const BLOB_CHOICE_LIST_VECTOR *char_choices, bool nonword, float additional_adjust, bool debug); void adjust_word(WERD_CHOICE *word, float *certainty_array, bool debug) { adjust_word(word, certainty_array, NULL, false, 0.0f, debug); } void adjust_non_word(WERD_CHOICE *word, float *certainty_array, bool debug) { adjust_word(word, certainty_array, NULL, true, 0.0f, debug); } /// Set wordseg_rating_adjust_factor_ to the given value. inline void SetWordsegRatingAdjustFactor(float f) { wordseg_rating_adjust_factor_ = f; } // Accessor for best_choices_. const LIST &getBestChoices() { return best_choices_; } private: /** Private member variables. */ Image* image_ptr_; /** * Table that stores ambiguities computed during training * (loaded when NoDangerousAmbigs() is called for the first time). * Each entry i in the table stores a set of amibiguities whose * wrong ngram starts with unichar id i. */ UnicharAmbigs *dang_ambigs_table_; /** Same as above, but for ambiguities with replace flag set. */ UnicharAmbigs *replace_ambigs_table_; /** * Flag used to disable accumulation of word choices * during compound word permutation. */ bool keep_word_choices_; /** Additional certainty padding allowed before a word is rejected. */ FLOAT32 reject_offset_; /** Current word segmentation. */ PIECES_STATE current_segmentation_; /** Variables to keep track of best/raw word choices. */ VIABLE_CHOICE best_raw_choice_; LIST raw_choices_; LIST best_choices_; // Hyphen-related variables. UNICHAR_ID hyphen_unichar_id_; WERD_CHOICE *hyphen_word_; DawgInfoVector hyphen_active_dawgs_; DawgInfoVector hyphen_constraints_; bool last_word_on_line_; // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary // matching. The first member of each list is taken as canonical. For // example, the first list contains hyphens and dashes with the first symbol // being the ASCII hyphen minus. GenericVector > equivalent_symbols_; // Dawgs. DawgVector dawgs_; SuccessorListsVector successors_; Trie *pending_words_; // bigram_dawg_ points to a dawg of two-word bigrams which always supercede if // any of them are present on the best choices list for a word pair. // the bigrams are stored as space-separated words where: // (1) leading and trailing punctuation has been removed from each word and // (2) any digits have been replaced with '?' marks. Dawg *bigram_dawg_; /// The following pointers are only cached for convenience. /// The dawgs will be deleted when dawgs_ vector is destroyed. // TODO(daria): need to support multiple languages in the future, // so maybe will need to maintain a list of dawgs of each kind. Dawg *freq_dawg_; Dawg *unambig_dawg_; Dawg *punc_dawg_; Trie *document_words_; /// Maximum word length of fixed-length word dawgs. /// A value < 1 indicates that no fixed-length dawgs are loaded. int max_fixed_length_dawgs_wdlen_; /// Current segmentation cost adjust factor for word rating. /// See comments in incorporate_segcost. float wordseg_rating_adjust_factor_; // File for recording ambiguities discovered during dictionary search. FILE *output_ambig_words_file_; public: /// Variable members. /// These have to be declared and initialized after image_ptr_, which contains /// the pointer to the params vector - the member of its base CCUtil class. STRING_VAR_H(user_words_suffix, "", "A list of user-provided words."); STRING_VAR_H(user_patterns_suffix, "", "A list of user-provided patterns."); BOOL_VAR_H(load_system_dawg, true, "Load system word dawg."); BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg."); BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg."); BOOL_VAR_H(load_punc_dawg, true, "Load dawg with punctuation patterns."); BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns."); BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length" " dawgs (e.g. for non-space delimited languages)"); BOOL_VAR_H(load_bigram_dawg, false, "Load dawg with special word bigrams."); double_VAR_H(segment_penalty_dict_frequent_word, 1.0, "Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better)."); double_VAR_H(segment_penalty_dict_case_ok, 1.1, "Score multiplier for word matches that have good case " "(lower is better)."); double_VAR_H(segment_penalty_dict_case_bad, 1.3125, "Default score multiplier for word matches, which may have " "case issues (lower is better)."); // TODO(daria): remove this param when ngram permuter is deprecated. double_VAR_H(segment_penalty_ngram_best_choice, 1.24, "Multipler to for the best choice from the ngram model."); double_VAR_H(segment_penalty_dict_nonword, 1.25, "Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."); double_VAR_H(segment_penalty_garbage, 1.50, "Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."); STRING_VAR_H(output_ambig_words_file, "", "Output file for ambiguities found in the dictionary"); INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"); INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list."); BOOL_VAR_H(use_only_first_uft8_step, false, "Use only the first UTF8 step of the given string" " when computing log probabilities."); double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); double_VAR_H(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words"); double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset"); INT_VAR_H(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word"); double_VAR_H(stopper_certainty_per_char, -0.50, "Certainty to add for each dict char above small word size."); double_VAR_H(stopper_allowable_character_badness, 3.0, "Max certaintly variation allowed in a word (in sigma)"); INT_VAR_H(stopper_debug_level, 0, "Stopper debug level"); BOOL_VAR_H(stopper_no_acceptable_choices, false, "Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"); double_VAR_H(stopper_ambiguity_threshold_gain, 8.0, "Gain factor for ambiguity threshold."); double_VAR_H(stopper_ambiguity_threshold_offset, 1.5, "Certainty offset for ambiguity threshold."); BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices"); INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" " should be printed to stdout"); STRING_VAR_H(word_to_debug_lengths, "", "Lengths of unichars in word_to_debug"); INT_VAR_H(fragments_debug, 0, "Debug character fragments"); INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process"); BOOL_VAR_H(permute_debug, 0, "Debug char permutation process"); double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of" " current best rate to prune other hypotheses"); BOOL_VAR_H(permute_script_word, 0, "Turn on word script consistency permuter"); BOOL_VAR_H(segment_segcost_rating, 0, "incorporate segmentation cost in word rating?"); BOOL_VAR_H(segment_nonalphabetic_script, false, "Don't use any alphabetic-specific tricks." "Set to true in the traineddata config file for" " scripts that are cursive or inherently fixed-pitch"); double_VAR_H(segment_reward_script, 0.95, "Score multipler for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward."); BOOL_VAR_H(permute_fixed_length_dawg, 0, "Turn on fixed-length phrasebook search permuter"); BOOL_VAR_H(permute_chartype_word, 0, "Turn on character type (property) consistency permuter"); double_VAR_H(segment_reward_chartype, 0.97, "Score multipler for char type consistency within a word. "); // TODO(daria): remove this param when ngram permuter is deprecated. double_VAR_H(segment_reward_ngram_best_choice, 0.99, "Score multipler for ngram permuter's best choice" " (only used in the Han script path)."); BOOL_VAR_H(save_doc_words, 0, "Save Document Words"); BOOL_VAR_H(doc_dict_enable, 1, "Enable Document Dictionary "); double_VAR_H(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary"); double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty" " for words that can be inserted into the document dictionary"); BOOL_VAR_H(ngram_permuter_activated, false, "Activate character-level n-gram-based permuter"); INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."); BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter"); }; } // namespace tesseract #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_