diff --git a/dict/dawg.cpp b/dict/dawg.cpp index 21da8e26e..ed304245d 100644 --- a/dict/dawg.cpp +++ b/dict/dawg.cpp @@ -98,6 +98,32 @@ int Dawg::check_for_words(const char *filename, return misses; } +void Dawg::iterate_words(const UNICHARSET &unicharset, + TessCallback1 *cb) const { + WERD_CHOICE word(&unicharset); + iterate_words_rec(word, 0, cb); +} + +void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far, + NODE_REF to_explore, + TessCallback1 *cb) const { + NodeChildVector children; + this->unichar_ids_of(to_explore, &children); + for (int i = 0; i < children.size(); i++) { + WERD_CHOICE next_word(word_so_far); + next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0); + if (this->end_of_word(children[i].edge_ref)) { + STRING s; + next_word.string_and_lengths(&s, NULL); + cb->Run(s.string()); + } + NODE_REF next = next_node(children[i].edge_ref); + if (next != 0) { + iterate_words_rec(next_word, next, cb); + } + } +} + bool Dawg::match_words(WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const { EDGE_REF edge; @@ -286,12 +312,12 @@ void SquishedDawg::read_squished_dawg(FILE *file, int unicharset_size; fread(&unicharset_size, sizeof(inT32), 1, file); fread(&num_edges_, sizeof(inT32), 1, file); - ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty if (swap) { unicharset_size = reverse32(unicharset_size); num_edges_ = reverse32(num_edges_); } + ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty Dawg::init(type, lang, perm, unicharset_size, debug_level); edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_); @@ -318,13 +344,13 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const { node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_); - for (edge=0; edge < num_edges_; edge++) // init all slots + for (edge = 0; edge < num_edges_; edge++) // init all slots node_map [edge] = -1; node_counter = num_forward_edges(0); *num_nodes = 0; - for (edge=0; edge < num_edges_; edge++) { // search all 
slots + for (edge = 0; edge < num_edges_; edge++) { // search all slots if (forward_edge(edge)) { (*num_nodes)++; // count nodes links @@ -332,6 +358,7 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const { num_edges = num_forward_edges(edge); if (edge != 0) node_counter += num_edges; edge += num_edges; + if (edge >= num_edges_) break; if (backward_edge(edge)) while (!last_edge(edge++)); edge--; } @@ -369,7 +396,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) { tprintf("%d edges in DAWG\n", num_edges); } - for (edge=0; edge= num_edges_) break; if (backward_edge(edge)) // skip back links while (!last_edge(edge++)); diff --git a/dict/dawg.h b/dict/dawg.h index 2606f6edf..81c213863 100644 --- a/dict/dawg.h +++ b/dict/dawg.h @@ -34,6 +34,7 @@ #include "elst.h" #include "ratngs.h" #include "params.h" +#include "tesscallback.h" #ifndef __GNUC__ #ifdef __MSW32__ @@ -142,6 +143,11 @@ class Dawg { const UNICHARSET &unicharset, bool enable_wildcard) const; + // For each word in the Dawg, call the given (permanent) callback with the + // text (UTF-8) version of the word. + void iterate_words(const UNICHARSET &unicharset, + TessCallback1 *cb) const; + // Pure virtual function that should be implemented by the derived classes. /// Returns the edge that corresponds to the letter out of this node. @@ -268,6 +274,11 @@ class Dawg { bool match_words(WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const; + // Recursively iterate over all words in a dawg (see public iterate_words). + void iterate_words_rec(const WERD_CHOICE &word_so_far, + NODE_REF to_explore, + TessCallback1 *cb) const; + // Member Variables. 
DawgType type_; STRING lang_; diff --git a/dict/dict.cpp b/dict/dict.cpp index add041aa3..5ad88cbac 100644 --- a/dict/dict.cpp +++ b/dict/dict.cpp @@ -16,7 +16,10 @@ // /////////////////////////////////////////////////////////////////////// +#include + #include "dict.h" +#include "unicodes.h" #ifdef _MSC_VER #pragma warning(disable:4244) // Conversion warnings @@ -41,6 +44,8 @@ Dict::Dict(Image* image_ptr) getImage()->getCCUtil()->params()), BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getImage()->getCCUtil()->params()), + BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", + getImage()->getCCUtil()->params()), BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation" " patterns.", getImage()->getCCUtil()->params()), BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number" @@ -48,6 +53,8 @@ Dict::Dict(Image* image_ptr) BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs" " (e.g. for non-space delimited languages)", getImage()->getCCUtil()->params()), + BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word " + "bigrams.", getImage()->getCCUtil()->params()), double_MEMBER(segment_penalty_dict_frequent_word, 1.0, "Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better).", @@ -70,6 +77,9 @@ Dict::Dict(Image* image_ptr) "Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better).", getImage()->getCCUtil()->params()), + STRING_MEMBER(output_ambig_words_file, "", + "Output file for ambiguities found in the dictionary", + getImage()->getCCUtil()->params()), INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages", getImage()->getCCUtil()->params()), @@ -104,6 +114,12 @@ Dict::Dict(Image* image_ptr) "Make AcceptableChoice() always return false. 
Useful" " when there is a need to explore all segmentations", getImage()->getCCUtil()->params()), + double_MEMBER(stopper_ambiguity_threshold_gain, 8.0, + "Gain factor for ambiguity threshold.", + getImage()->getCCUtil()->params()), + double_MEMBER(stopper_ambiguity_threshold_offset, 1.5, + "Certainty offset for ambiguity threshold.", + getImage()->getCCUtil()->params()), BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices", getImage()->getCCUtil()->params()), INT_MEMBER(tessedit_truncate_wordchoice_log, 10, @@ -130,6 +146,11 @@ Dict::Dict(Image* image_ptr) BOOL_MEMBER(segment_segcost_rating, 0, "incorporate segmentation cost in word rating?", getImage()->getCCUtil()->params()), + BOOL_MEMBER(segment_nonalphabetic_script, false, + "Don't use any alphabetic-specific tricks." + "Set to true in the traineddata config file for" + " scripts that are cursive or inherently fixed-pitch", + getImage()->getCCUtil()->params()), double_MEMBER(segment_reward_script, 0.95, "Score multipler for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " @@ -144,10 +165,10 @@ Dict::Dict(Image* image_ptr) double_MEMBER(segment_reward_chartype, 0.97, "Score multipler for char type consistency within a word. 
", getImage()->getCCUtil()->params()), - double_MEMBER(segment_reward_ngram_best_choice, 0.99, - "Score multipler for ngram permuter's best choice" - " (only used in the Han script path).", - getImage()->getCCUtil()->params()), + double_MEMBER(segment_reward_ngram_best_choice, 0.99, + "Score multipler for ngram permuter's best choice" + " (only used in the Han script path).", + getImage()->getCCUtil()->params()), BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getImage()->getCCUtil()->params()), BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ", @@ -182,14 +203,17 @@ Dict::Dict(Image* image_ptr) hyphen_unichar_id_ = INVALID_UNICHAR_ID; document_words_ = NULL; pending_words_ = NULL; + bigram_dawg_ = NULL; freq_dawg_ = NULL; punc_dawg_ = NULL; max_fixed_length_dawgs_wdlen_ = -1; wordseg_rating_adjust_factor_ = -1.0f; + output_ambig_words_file_ = NULL; } Dict::~Dict() { if (hyphen_word_ != NULL) delete hyphen_word_; + if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_); } void Dict::Load() { @@ -199,6 +223,10 @@ void Dict::Load() { if (dawgs_.length() != 0) this->End(); hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); + + LoadEquivalenceList(kHyphenLikeUTF8); + LoadEquivalenceList(kApostropheLikeUTF8); + TessdataManager &tessdata_manager = getImage()->getCCUtil()->tessdata_manager; @@ -219,12 +247,26 @@ void Dict::Load() { new SquishedDawg(tessdata_manager.GetDataFilePtr(), DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level); } - if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) { + if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) { + bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), + DAWG_TYPE_WORD, // doesn't actually matter. + lang, + COMPOUND_PERM, // doesn't actually matter. 
+ dawg_debug_level); + } + if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) { freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM, dawg_debug_level); dawgs_ += freq_dawg_; } + if (load_unambig_dawg && + tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) { + unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), + DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM, + dawg_debug_level); + dawgs_ += unambig_dawg_; + } if (((STRING &)user_words_suffix).length() > 0) { Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, @@ -232,7 +274,8 @@ void Dict::Load() { dawg_debug_level); name = getImage()->getCCUtil()->language_data_path_prefix; name += user_words_suffix; - if (!trie_ptr->read_word_list(name.string(), getUnicharset())) { + if (!trie_ptr->read_word_list(name.string(), getUnicharset(), + Trie::RRP_REVERSE_IF_HAS_RTL)) { tprintf("Error: failed to load %s\n", name.string()); exit(1); } @@ -295,6 +338,7 @@ void Dict::End() { dawgs_.delete_data_pointers(); successors_.delete_data_pointers(); dawgs_.clear(); + delete bigram_dawg_; successors_.clear(); document_words_ = NULL; max_fixed_length_dawgs_wdlen_ = -1; @@ -304,12 +348,38 @@ void Dict::End() { } } +// Create unicharset adaptations of known, short lists of UTF-8 equivalent +// characters (think all hyphen-like symbols). The first version of the +// list is taken as equivalent for matching against the dictionary. 
+void Dict::LoadEquivalenceList(const char *unichar_strings[]) { + equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>()); + const UNICHARSET &unicharset = getUnicharset(); + GenericVectorEqEq<UNICHAR_ID> *equiv_list = &equivalent_symbols_.back(); + for (int i = 0; unichar_strings[i] != 0; i++) { + UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar_strings[i]); + if (unichar_id != INVALID_UNICHAR_ID) { + equiv_list->push_back(unichar_id); + } + } +} + +// Normalize all hyphen and apostrophes to the canonicalized one for +// matching; pass everything else through as is. +UNICHAR_ID Dict::NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const { + for (int i = 0; i < equivalent_symbols_.size(); i++) { + if (equivalent_symbols_[i].contains(unichar_id)) { + return equivalent_symbols_[i][0]; + } + } + return unichar_id; +} + // Returns true if in light of the current state unichar_id is allowed // according to at least one of the dawgs in the dawgs_ vector. // See more extensive comments in dict.h where this function is declared. int Dict::def_letter_is_okay(void* void_dawg_args, UNICHAR_ID unichar_id, - bool word_end) { + bool word_end) const { DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args); if (dawg_debug_level >= 3) { @@ -484,7 +554,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args, void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, - PermuterType *curr_perm) { + PermuterType *curr_perm) const { NODE_REF node = GetStartingNode(dawg, info.ref); // Try to find the edge corresponding to the exact unichar_id and to all the // edges corresponding to the character class of unichar_id. @@ -572,7 +642,7 @@ void Dict::WriteFixedLengthDawgs( // from hyphen_active_dawgs_ instead.
void Dict::init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, - bool ambigs_mode) { + bool ambigs_mode) const { int i; if (sought_word_length != kAnyWordLength) { // Only search one fixed word length dawg. @@ -604,7 +674,7 @@ void Dict::init_active_dawgs(int sought_word_length, // If hyphenated() returns true, copy the entries from hyphen_constraints_ // into the given constraints vector. -void Dict::init_constraints(DawgInfoVector *constraints) { +void Dict::init_constraints(DawgInfoVector *constraints) const { if (hyphenated()) { *constraints = hyphen_constraints_; if (dawg_debug_level >= 3) { @@ -670,7 +740,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) { strcat(filename, ".doc"); doc_word_file = open_file (filename, "a"); fprintf(doc_word_file, "%s\n", - best_choice.debug_string(getUnicharset()).string()); + best_choice.debug_string().string()); fclose(doc_word_file); } document_words_->add_word_to_dawg(best_choice); @@ -693,7 +763,7 @@ void Dict::adjust_word(WERD_CHOICE *word, float new_rating = word->rating(); if (debug) { tprintf("%sWord: %s %4.2f ", nonword ? 
"Non-" : "", - word->debug_string(getUnicharset()).string(), word->rating()); + word->debug_string().string(), word->rating()); } new_rating += kRatingPad; if (nonword) { // non-dictionary word @@ -733,9 +803,9 @@ void Dict::adjust_word(WERD_CHOICE *word, LogNewChoice(adjust_factor, certainty_array, false, word); } -int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) { +int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { const WERD_CHOICE *word_ptr = &word; - WERD_CHOICE temp_word; + WERD_CHOICE temp_word(word.unicharset()); if (hyphenated()) { copy_hyphen_info(&temp_word); temp_word += word; @@ -775,10 +845,40 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) { dawg_args.permuter : NO_PERM; } +bool Dict::valid_bigram(const WERD_CHOICE &word1, + const WERD_CHOICE &word2) const { + if (bigram_dawg_ == NULL) return false; + + // Extract the core word from the middle of each word with any digits + // replaced with question marks. + int w1start, w1end, w2start, w2end; + word1.punct_stripped(&w1start, &w1end); + word2.punct_stripped(&w2start, &w2end); + + // We don't want to penalize a single guillemet, hyphen, etc. + // But our bigram list doesn't have any information about punctuation. + if (w1start >= w1end) return word1.length() < 3; + if (w2start >= w2end) return word2.length() < 3; + + const UNICHARSET& uchset = getUnicharset(); + STRING bigram_string; + for (int i = w1start; i < w1end; i++) { + UNICHAR_ID ch = NormalizeUnicharIdForMatch(word1.unichar_id(i)); + bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch); + } + bigram_string += " "; + for (int i = w2start; i < w2end; i++) { + UNICHAR_ID ch = NormalizeUnicharIdForMatch(word2.unichar_id(i)); + bigram_string += uchset.get_isdigit(ch) ? "?" 
: uchset.id_to_unichar(ch); + } + WERD_CHOICE normalized_word(bigram_string.string(), uchset); + return bigram_dawg_->word_in_dawg(normalized_word); +} + bool Dict::valid_punctuation(const WERD_CHOICE &word) { if (word.length() == 0) return NO_PERM; int i; - WERD_CHOICE new_word; + WERD_CHOICE new_word(word.unicharset()); int last_index = word.length() - 1; int new_len = 0; for (i = 0; i <= last_index; ++i) { diff --git a/dict/dict.h b/dict/dict.h index 45cafd9c7..9ced54ee0 100644 --- a/dict/dict.h +++ b/dict/dict.h @@ -89,16 +89,17 @@ struct DawgArgs { class Dict { public: - // Gain factor for ambiguity threshold. - static const float kStopperAmbiguityThresholdGain; - // Certainty offset for ambiguity threshold. - static const float kStopperAmbiguityThresholdOffset; - Dict(Image* image_ptr); ~Dict(); + const Image* getImage() const { + return image_ptr_; + } Image* getImage() { return image_ptr_; } + const UNICHARSET& getUnicharset() const { + return getImage()->getCCUtil()->unicharset; + } UNICHARSET& getUnicharset() { return getImage()->getCCUtil()->unicharset; } @@ -114,17 +115,17 @@ class Dict { /* hyphen.cpp ************************************************************/ /// Returns true if we've recorded the beginning of a hyphenated word. - inline bool hyphenated() { return + inline bool hyphenated() const { return !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0; } /// Size of the base word (the part on the line before) of a hyphenated word. - inline int hyphen_base_size() { + inline int hyphen_base_size() const { return this->hyphenated() ? hyphen_word_->length() : 0; } /// If this word is hyphenated copy the base word (the part on /// the line before) of a hyphenated word into the given word. /// This function assumes that word is not NULL. 
- inline void copy_hyphen_info(WERD_CHOICE *word) { + inline void copy_hyphen_info(WERD_CHOICE *word) const { if (this->hyphenated()) { *word = *hyphen_word_; if (hyphen_debug_level) word->print("copy_hyphen_info: "); @@ -133,19 +134,19 @@ class Dict { /// Erase the unichar ids corresponding to the portion of the word /// from the previous line. The word is not changed if it is not /// split between lines and hyphenated. - inline void remove_hyphen_head(WERD_CHOICE *word) { + inline void remove_hyphen_head(WERD_CHOICE *word) const { if (this->hyphenated()) { word->remove_unichar_ids(0, hyphen_word_->length()); if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: "); } } /// Check whether the word has a hyphen at the end. - inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) { + inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const { return (last_word_on_line_ && !first_pos && unichar_id == hyphen_unichar_id_); } /// Same as above, but check the unichar at the end of the word. - inline bool has_hyphen_end(const WERD_CHOICE &word) { + inline bool has_hyphen_end(const WERD_CHOICE &word) const { int word_index = word.length() - 1; return has_hyphen_end(word.unichar_id(word_index), word_index == 0); } @@ -171,12 +172,14 @@ class Dict { /// from hyphen_active_dawgs_ instead. void init_active_dawgs(int sought_word_length, DawgInfoVector *active_dawgs, - bool ambigs_mode); + bool ambigs_mode) const; /// If hyphenated() returns true, copy the entries from hyphen_constraints_ /// into the given constraints vector. - void init_constraints(DawgInfoVector *constraints); + void init_constraints(DawgInfoVector *constraints) const; /// Returns true if we are operating in ambigs mode. - inline bool ambigs_mode(float rating_limit) { return rating_limit <= 0.0; } + inline bool ambigs_mode(float rating_limit) { + return rating_limit <= 0.0; + } /// Recursively explore all the possible character combinations in /// the given char_choices. 
Use go_deeper_dawg_fxn() to explore all the /// dawgs in the dawgs_ vector in parallel and discard invalid words. @@ -316,6 +319,15 @@ class Dict { bool fix_replaceable, BLOB_CHOICE_LIST_VECTOR *Choices, bool *modified_blobs); + double StopperAmbigThreshold(double f1, double f2) { + return (f2 - f1) * stopper_ambiguity_threshold_gain - + stopper_ambiguity_threshold_offset; + } + // If the certainty of any chunk in Choice (item1) is not ambiguous with the + // corresponding chunk in the best choice (item2), frees Choice and + // returns true. + int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice + void *item2); // EXPANDED_CHOICE *BestChoice /// Replaces the corresponding wrong ngram in werd_choice with the correct /// one. We indicate that this newly inserted ngram unichar is composed from /// several fragments and modify the corresponding entries in blob_choices to @@ -401,7 +413,7 @@ class Dict { /// and Certainties. void FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], - bool SameString, VIABLE_CHOICE ViableChoice); + VIABLE_CHOICE ViableChoice); /// Returns true if there are no alternative choices for the current word /// or if all alternatives have an adjust factor worse than Threshold. bool AlternativeChoicesWorseThan(FLOAT32 Threshold); @@ -467,6 +479,15 @@ class Dict { document_words_->clear(); } + // Create unicharset adaptations of known, short lists of UTF-8 equivalent + // characters (think all hyphen-like symbols). The first version of the + // list is taken as equivalent for matching against the dictionary. + void LoadEquivalenceList(const char *unichar_strings[]); + + // Normalize all hyphen and apostrophes to the canonicalized one for + // matching; pass everything else through as is. See LoadEquivalenceList(). 
+ UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const; + /** * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light * of the current state the letter at word_index in the given word @@ -531,13 +552,13 @@ class Dict { // int def_letter_is_okay(void* void_dawg_args, - UNICHAR_ID unichar_id, bool word_end); + UNICHAR_ID unichar_id, bool word_end) const; int (Dict::*letter_is_okay_)(void* void_dawg_args, - UNICHAR_ID unichar_id, bool word_end); + UNICHAR_ID unichar_id, bool word_end) const; /// Calls letter_is_okay_ member function. int LetterIsOkay(void* void_dawg_args, - UNICHAR_ID unichar_id, bool word_end) { + UNICHAR_ID unichar_id, bool word_end) const { return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end); } @@ -581,6 +602,8 @@ class Dict { inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } /// Return the points to the punctuation dawg. inline const Dawg *GetPuncDawg() const { return punc_dawg_; } + /// Return the points to the unambiguous words dawg. + inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; } /// Return the pointer to the Dawg that contains words of length word_length. inline const Dawg *GetFixedLengthDawg(int word_length) const { if (word_length > max_fixed_length_dawgs_wdlen_) return NULL; @@ -603,7 +626,7 @@ class Dict { /// leading punctuation is found this would ensure that we are not /// expecting any particular trailing punctuation after the word). inline bool ConstraintsOk(const DawgInfoVector &constraints, - int word_end, DawgType current_dawg_type) { + int word_end, DawgType current_dawg_type) const { if (!word_end) return true; if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true; for (int c = 0; c < constraints.length(); ++c) { @@ -627,7 +650,8 @@ class Dict { /// edges were found. 
void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, UNICHAR_ID unichar_id, bool word_end, - DawgArgs *dawg_args, PermuterType *current_permuter); + DawgArgs *dawg_args, + PermuterType *current_permuter) const; /// Read/Write/Access special purpose dawgs which contain words /// only of a certain length (used for phrase search for @@ -649,23 +673,25 @@ class Dict { int num_dawgs, int debug_level, FILE *output_file); /// Check all the DAWGs to see if this word is in any of them. - inline bool valid_word_permuter(uinT8 perm, bool numbers_ok) { + inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) { return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM)); } - int valid_word(const WERD_CHOICE &word, bool numbers_ok); - int valid_word(const WERD_CHOICE &word) { + int valid_word(const WERD_CHOICE &word, bool numbers_ok) const; + int valid_word(const WERD_CHOICE &word) const { return valid_word(word, false); // return NO_PERM for words with digits } - int valid_word_or_number(const WERD_CHOICE &word) { + int valid_word_or_number(const WERD_CHOICE &word) const { return valid_word(word, true); // return NUMBER_PERM for valid numbers } /// This function is used by api/tesseract_cube_combiner.cpp - int valid_word(const char *string) { + int valid_word(const char *string) const { WERD_CHOICE word(string, getUnicharset()); return valid_word(word); } + // Do the two WERD_CHOICEs form a meaningful bigram? + bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const; /// Returns true if the word contains a valid punctuation pattern. /// Note: Since the domains of punctuation symbols and symblos /// used in numbers are not disjoint, a valid number might contain @@ -691,6 +717,8 @@ class Dict { inline void SetWordsegRatingAdjustFactor(float f) { wordseg_rating_adjust_factor_ = f; } + // Accessor for best_choices_. 
+ const LIST &getBestChoices() { return best_choices_; } private: /** Private member variables. */ @@ -723,15 +751,27 @@ class Dict { DawgInfoVector hyphen_active_dawgs_; DawgInfoVector hyphen_constraints_; bool last_word_on_line_; + // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary + // matching. The first member of each list is taken as canonical. For + // example, the first list contains hyphens and dashes with the first symbol + // being the ASCII hyphen minus. + GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_; // Dawgs. DawgVector dawgs_; SuccessorListsVector successors_; Trie *pending_words_; + // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if + // any of them are present on the best choices list for a word pair. + // the bigrams are stored as space-separated words where: + // (1) leading and trailing punctuation has been removed from each word and + // (2) any digits have been replaced with '?' marks. + Dawg *bigram_dawg_; /// The following pointers are only cached for convenience. /// The dawgs will be deleted when dawgs_ vector is destroyed. // TODO(daria): need to support multiple languages in the future, // so maybe will need to maintain a list of dawgs of each kind. Dawg *freq_dawg_; + Dawg *unambig_dawg_; Dawg *punc_dawg_; Trie *document_words_; /// Maximum word length of fixed-length word dawgs. @@ -740,6 +780,8 @@ class Dict { /// Current segmentation cost adjust factor for word rating. /// See comments in incorporate_segcost. float wordseg_rating_adjust_factor_; + // File for recording ambiguities discovered during dictionary search. + FILE *output_ambig_words_file_; public: /// Variable members.
@@ -750,11 +792,14 @@ class Dict { "A list of user-provided patterns."); BOOL_VAR_H(load_system_dawg, true, "Load system word dawg."); BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg."); + BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg."); BOOL_VAR_H(load_punc_dawg, true, "Load dawg with punctuation patterns."); BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns."); BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length" " dawgs (e.g. for non-space delimited languages)"); + BOOL_VAR_H(load_bigram_dawg, false, + "Load dawg with special word bigrams."); double_VAR_H(segment_penalty_dict_frequent_word, 1.0, "Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better)."); @@ -779,6 +824,8 @@ class Dict { "Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."); + STRING_VAR_H(output_ambig_words_file, "", + "Output file for ambiguities found in the dictionary"); INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"); INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); @@ -801,6 +848,10 @@ class Dict { BOOL_VAR_H(stopper_no_acceptable_choices, false, "Make AcceptableChoice() always return false. 
Useful" " when there is a need to explore all segmentations"); + double_VAR_H(stopper_ambiguity_threshold_gain, 8.0, + "Gain factor for ambiguity threshold."); + double_VAR_H(stopper_ambiguity_threshold_offset, 1.5, + "Certainty offset for ambiguity threshold."); BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices"); INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" @@ -816,6 +867,10 @@ class Dict { "Turn on word script consistency permuter"); BOOL_VAR_H(segment_segcost_rating, 0, "incorporate segmentation cost in word rating?"); + BOOL_VAR_H(segment_nonalphabetic_script, false, + "Don't use any alphabetic-specific tricks." + "Set to true in the traineddata config file for" + " scripts that are cursive or inherently fixed-pitch"); double_VAR_H(segment_reward_script, 0.95, "Score multipler for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " diff --git a/dict/hyphen.cpp b/dict/hyphen.cpp index 6b5d5fba0..1f39afdc5 100644 --- a/dict/hyphen.cpp +++ b/dict/hyphen.cpp @@ -51,7 +51,7 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word, const DawgInfoVector &active_dawgs, const DawgInfoVector &constraints) { if (hyphen_word_ == NULL) { - hyphen_word_ = new WERD_CHOICE(); + hyphen_word_ = new WERD_CHOICE(word.unicharset()); hyphen_word_->make_bad(); } if (hyphen_word_->rating() > word.rating()) { diff --git a/dict/matchdefs.h b/dict/matchdefs.h index c2b321fe5..bfab1e6be 100644 --- a/dict/matchdefs.h +++ b/dict/matchdefs.h @@ -28,7 +28,7 @@ /* define the maximum number of classes defined for any matcher and the maximum class id for any matcher. 
This must be changed if more different classes need to be classified */ -#define MAX_NUM_CLASSES 8192 +#define MAX_NUM_CLASSES 12288 #define MAX_CLASS_ID (MAX_NUM_CLASSES - 1) /** a CLASS_ID is the ascii character to be associated with a class */ diff --git a/dict/permdawg.cpp b/dict/permdawg.cpp index 1852c2e0d..20fb5792f 100644 --- a/dict/permdawg.cpp +++ b/dict/permdawg.cpp @@ -86,7 +86,7 @@ void Dict::go_deeper_dawg_fxn( if (permute_debug && dawg_debug_level) { tprintf("early pruned word rating=%4.2f," " permdawg_limit=%4.2f, word=%s\n", word->rating(), - permdawg_limit, word->debug_string(getUnicharset()).string()); + permdawg_limit, word->debug_string().string()); } return; } @@ -106,8 +106,7 @@ void Dict::go_deeper_dawg_fxn( } if (clean_active_dawgs.size() > 0) { if (permute_debug && dawg_debug_level) - tprintf("new hyphen choice = %s\n", - word->debug_string(getUnicharset()).string()); + tprintf("new hyphen choice = %s\n", word->debug_string().string()); word->set_permuter(more_args->permuter); adjust_word(word, certainties, permute_debug); set_hyphen_word(*word, *(more_args->active_dawgs), @@ -190,11 +189,26 @@ void Dict::go_deeper_dawg_fxn( // Add a new word choice if (word_ending) { if (permute_debug && dawg_debug_level) { - tprintf("found word = %s\n", - word->debug_string(getUnicharset()).string()); + tprintf("found word = %s\n", word->debug_string().string()); + } + if (ambigs_mode(*limit) && + strcmp(output_ambig_words_file.string(), "") != 0) { + if (output_ambig_words_file_ == NULL) { + output_ambig_words_file_ = + fopen(output_ambig_words_file.string(), "w+"); + if (output_ambig_words_file_ == NULL) { + tprintf("Failed to open output_ambig_words_file %s\n", + output_ambig_words_file.string()); + exit(1); + } + } + STRING word_str; + word->string_and_lengths(&word_str, NULL); + word_str += " "; + fprintf(output_ambig_words_file_, "%s", word_str.string()); } WERD_CHOICE *adjusted_word = word; - WERD_CHOICE
hyphen_tail_word(&getUnicharset()); if (hyphen_base_size() > 0) { hyphen_tail_word = *word; remove_hyphen_head(&hyphen_tail_word); @@ -226,7 +240,7 @@ void Dict::go_deeper_dawg_fxn( } else { if (permute_debug && dawg_debug_level) { tprintf("last unichar not OK at index %d in %s\n", - word_index, word->debug_string(getUnicharset()).string()); + word_index, word->debug_string().string()); } } } @@ -249,7 +263,7 @@ void Dict::go_deeper_dawg_fxn( WERD_CHOICE *Dict::dawg_permute_and_select( const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, int sought_word_length, int start_char_choice_index) { - WERD_CHOICE *best_choice = new WERD_CHOICE(); + WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset()); best_choice->make_bad(); best_choice->set_rating(rating_limit); if (char_choices.length() == 0) return best_choice; @@ -272,7 +286,7 @@ WERD_CHOICE *Dict::dawg_permute_and_select( (segment_penalty_dict_case_bad / segment_penalty_dict_case_ok), NO_PERM, sought_word_length, end_char_choice_index); - WERD_CHOICE word(MAX_WERD_LENGTH); + WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH); copy_hyphen_info(&word); // Discard rating and certainty of the hyphen base (if any). word.set_rating(0.0); diff --git a/dict/permute.cpp b/dict/permute.cpp index 2fa0cce93..e41116698 100644 --- a/dict/permute.cpp +++ b/dict/permute.cpp @@ -126,12 +126,13 @@ int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) { * 1st choice of char 3, 2nd choice of char 4, 3rd choice of char 5, 2nd choice * of char 6. If n > number of choice, the closest (last) one is used. 
*/ -WERD_CHOICE* get_choice_from_posstr(const BLOB_CHOICE_LIST_VECTOR &char_choices, +WERD_CHOICE* get_choice_from_posstr(const UNICHARSET *unicharset, + const BLOB_CHOICE_LIST_VECTOR &char_choices, int start_pos, const char* pos_str, float *certainties) { int pos_str_len = strlen(pos_str); - WERD_CHOICE* wchoice = new WERD_CHOICE(); + WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset); if (start_pos + pos_str_len > char_choices.length()) { wchoice->make_bad(); return wchoice; @@ -228,6 +229,7 @@ BLOB_CHOICE* find_choice_by_script( PermuterState::PermuterState() { + unicharset_ = NULL; char_choices_ = NULL; adjust_factor_ = 1.0f; allow_collision_ = false; @@ -240,6 +242,7 @@ void PermuterState::Init(const BLOB_CHOICE_LIST_VECTOR& char_choices, float default_bias, bool debug) { ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH); + unicharset_ = &unicharset; char_choices_ = &char_choices; word_length_ = char_choices.length(); for (int i = 0; i < word_length_; ++i) @@ -300,9 +303,8 @@ void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice, WERD_CHOICE* PermuterState::GetPermutedWord(float *certainties, float *adjust_factor) { ASSERT_HOST(char_choices_ != NULL); - WERD_CHOICE *word_choice = get_choice_from_posstr(*char_choices_, - 0, perm_state_, - certainties); + WERD_CHOICE *word_choice = get_choice_from_posstr( + unicharset_, *char_choices_, 0, perm_state_, certainties); float rating = word_choice->rating() * adjust_factor_; word_choice->set_rating(rating); *adjust_factor = adjust_factor_; @@ -431,7 +433,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words( if (permute_debug) print_char_choices_list("\n\nPermute FixedLength Word", char_choices, getUnicharset(), false); - WERD_CHOICE* best_choice = new WERD_CHOICE(char_choices.length()); + WERD_CHOICE* best_choice = + new WERD_CHOICE(&getUnicharset(), char_choices.length()); const int max_dict_len = max_fixed_length_dawgs_wdlen_; const int min_dict_len = 2; char posstr[256]; @@ -461,7 +464,7 @@ 
WERD_CHOICE* Dict::permute_fixed_length_words( } if (part_choice && step > 1) { // found lexicon match - part_choice->populate_unichars(getUnicharset()); + part_choice->populate_unichars(); get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr); float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length()); if (permuter_state) @@ -472,8 +475,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words( part_choice->unichar_string().string()); } else { // no lexicon match step = 1; - part_choice = - get_choice_from_posstr(char_choices, anchor_pos, "0", NULL); + part_choice = get_choice_from_posstr(&getUnicharset(), char_choices, + anchor_pos, "0", NULL); if (permute_debug) tprintf("Single char %d %s\n", anchor_pos, part_choice->unichar_string().string()); @@ -493,7 +496,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words( best_choice->rating(), match_score, adjusted_score); best_choice->set_rating(adjusted_score); } - best_choice->populate_unichars(getUnicharset()); + best_choice->populate_unichars(); if (permute_debug) tprintf("Found Best CJK word %f: %s\n", best_choice->rating(), best_choice->unichar_string().string()); @@ -554,11 +557,12 @@ WERD_CHOICE* Dict::permute_chartype_words( print_char_choices_list("", char_choices, getUnicharset(), true); } - WERD_CHOICE *current_word = new WERD_CHOICE(); + WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset()); BLOB_CHOICE_IT blob_choice_it; const UNICHARSET& unicharset = getUnicharset(); bool replaced = false; // has any character choice been replaced int prev_unambig_type = 0; // the last chartype of an unambiguous char + float certainties[MAX_PERM_LENGTH + 1]; for (int x = 0; x < char_choices.length(); ++x) { BLOB_CHOICE_LIST* pos_choice = char_choices.get(x); UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice); @@ -640,12 +644,12 @@ WERD_CHOICE* Dict::permute_chartype_words( current_word->append_unichar_id(first_choice->unichar_id(), 1, first_choice->rating(), first_choice->certainty()); + 
certainties[x] = first_choice->certainty(); } // All permuter choices should go through adjust_non_word so the choice // rating would be adjusted on the same scale. - float certainties[MAX_PERM_LENGTH + 1]; adjust_non_word(current_word, certainties, permute_debug); - current_word->populate_unichars(unicharset); + current_word->populate_unichars(); if (replaced) { // Apply a reward multiplier on rating if an chartype permutation is made. float rating = current_word->rating(); @@ -682,10 +686,11 @@ WERD_CHOICE* Dict::permute_script_words( permute_debug > 1); } - WERD_CHOICE *current_word = new WERD_CHOICE(); + WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset()); BLOB_CHOICE_IT blob_choice_it; bool replaced = false; bool prev_is_consistent = false; + float certainties[MAX_PERM_LENGTH + 1]; for (int x = 0; x < char_choices.length(); ++x) { blob_choice_it.set_to_list(char_choices.get(x)); BLOB_CHOICE *first_choice = blob_choice_it.data(); @@ -737,13 +742,13 @@ WERD_CHOICE* Dict::permute_script_words( current_word->append_unichar_id(first_choice->unichar_id(), 1, first_choice->rating(), first_choice->certainty()); + certainties[x] = first_choice->certainty(); prev_is_consistent = sid_consistent; } // All permuter choices should go through adjust_non_word so the choice // rating would be adjusted on the same scale. - float certainties[MAX_PERM_LENGTH + 1]; adjust_non_word(current_word, certainties, permute_debug); - current_word->populate_unichars(getUnicharset()); + current_word->populate_unichars(); if (replaced) { // Apply a reward multiplier on rating if an script permutation is made. float rating = current_word->rating(); @@ -780,19 +785,19 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, // Populate unichars_ and unichar_lengths_ of raw_choice. This is // needed for various components that still work with unichars rather // than unichar ids (e.g. LearnWord). 
- raw_choice->populate_unichars(getUnicharset()); + raw_choice->populate_unichars(); } if (this_choice && this_choice->rating() < best_choice->rating()) { *best_choice = *this_choice; // Populate unichars_ and unichar_lengths_ of best_choice. This is // needed for various components that still work with unichars rather // than unichar ids (dawg, *_ok functions, various hard-coded hacks). - best_choice->populate_unichars(getUnicharset()); + best_choice->populate_unichars(); if (permute_debug) { best_choice->print("\n**** Populate BestChoice"); cprintf("populate best_choice\n\t%s\n", - best_choice->debug_string(getUnicharset()).string()); + best_choice->debug_string().string()); } delete this_choice; return true; @@ -811,13 +816,13 @@ WERD_CHOICE *Dict::permute_compound_words( float rating_limit) { BLOB_CHOICE *first_choice; WERD_CHOICE *best_choice = NULL; - WERD_CHOICE current_word(MAX_WERD_LENGTH); + WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH); int first_index = 0; int x; BLOB_CHOICE_IT blob_choice_it; if (char_choices.length() > MAX_WERD_LENGTH) { - WERD_CHOICE *bad_word_choice = new WERD_CHOICE(); + WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset()); bad_word_choice->make_bad(); return bad_word_choice; } @@ -874,7 +879,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, int x; BLOB_CHOICE_LIST_VECTOR subchoices; WERD_CHOICE *best_choice = NULL; - WERD_CHOICE raw_choice; + WERD_CHOICE raw_choice(&getUnicharset()); raw_choice.make_bad(); DisableChoiceAccum(); @@ -886,7 +891,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, } if (!subchoices.empty()) { - WERD_CHOICE initial_choice; + WERD_CHOICE initial_choice(&getUnicharset()); initial_choice.make_bad(); initial_choice.set_rating(rating_limit); @@ -906,10 +911,10 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, if (segment_debug && current_word->rating() < MAX_FLOAT32) { cprintf ("Subword permuted = %s, 
%5.2f, %5.2f\n\n", - current_word->debug_string(getUnicharset()).string(), + current_word->debug_string().string(), current_word->rating(), current_word->certainty()); } - current_word->populate_unichars(getUnicharset()); + current_word->populate_unichars(); EnableChoiceAccum(); } @@ -919,7 +924,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, */ WERD_CHOICE *Dict::get_top_choice_word( const BLOB_CHOICE_LIST_VECTOR &char_choices) { - WERD_CHOICE *top_word = new WERD_CHOICE(MAX_PERM_LENGTH); + WERD_CHOICE *top_word = new WERD_CHOICE(&getUnicharset(), MAX_PERM_LENGTH); float certainties[MAX_PERM_LENGTH]; top_word->set_permuter(TOP_CHOICE_PERM); for (int x = 0; x < char_choices.length(); x++) { @@ -956,11 +961,11 @@ WERD_CHOICE *Dict::permute_top_choice( const char *next_char = ""; //next in word const char *next_next_char = ""; //after next next in word - WERD_CHOICE word(MAX_PERM_LENGTH); + WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH); word.set_permuter(TOP_CHOICE_PERM); - WERD_CHOICE capital_word(MAX_PERM_LENGTH); + WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH); capital_word.set_permuter(UPPER_CASE_PERM); - WERD_CHOICE lower_word(MAX_PERM_LENGTH); + WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH); lower_word.set_permuter(LOWER_CASE_PERM); int x; @@ -1023,7 +1028,7 @@ WERD_CHOICE *Dict::permute_top_choice( if (first_choice == NULL) { cprintf("Permuter found only fragments for" " character at position %d; word=%s\n", - x, word.debug_string(getUnicharset()).string()); + x, word.debug_string().string()); } ASSERT_HOST(first_choice != NULL); @@ -1132,7 +1137,7 @@ WERD_CHOICE *Dict::permute_top_choice( } } - if (word.rating() < raw_choice->rating()) { + if (raw_choice != NULL && word.rating() < raw_choice->rating()) { *raw_choice = word; LogNewChoice(1.0, certainties, true, raw_choice); } @@ -1423,9 +1428,9 @@ WERD_CHOICE *Dict::top_fragments_permute_and_select( frag_char_choices += frag_choices; } - WERD_CHOICE 
*best_choice = new WERD_CHOICE(); + WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset()); best_choice->make_bad(); - WERD_CHOICE word(MAX_PERM_LENGTH); + WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH); word.set_permuter(TOP_CHOICE_PERM); float certainties[MAX_PERM_LENGTH]; this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn; @@ -1459,7 +1464,7 @@ void Dict::permute_choices( tprintf("%s permute_choices: char_choice_index=%d" " limit=%g rating=%g, certainty=%g word=%s\n", debug, char_choice_index, *limit, word->rating(), - word->certainty(), word->debug_string(getUnicharset()).string()); + word->certainty(), word->debug_string().string()); } if (char_choice_index < char_choices.length()) { BLOB_CHOICE_IT blob_choice_it; @@ -1554,7 +1559,7 @@ void Dict::go_deeper_top_fragments_fxn( if (word_ending) { if (fragments_debug > 1) { tprintf("fragments_debug new choice = %s\n", - word->debug_string(getUnicharset()).string()); + word->debug_string().string()); } *limit = word->rating(); adjust_non_word(word, certainties, permute_debug); @@ -1567,8 +1572,7 @@ void Dict::go_deeper_top_fragments_fxn( } else { if (fragments_debug > 1) { tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n", - word->debug_string(getUnicharset()).string(), - word->rating(), *limit); + word->debug_string().string(), word->rating(), *limit); } } } diff --git a/dict/permute.h b/dict/permute.h index ca66d6748..f7ff6cad9 100644 --- a/dict/permute.h +++ b/dict/permute.h @@ -133,6 +133,8 @@ class PermuterState { private: static const char kPosFree = '.'; + const UNICHARSET *unicharset_; + const BLOB_CHOICE_LIST_VECTOR *char_choices_; // reference pointer only // does not need to be allocated or freed char perm_state_[MAX_PERM_LENGTH]; // handles upto MAX_PERM_LENGTH-1 states diff --git a/dict/states.cpp b/dict/states.cpp index 35a06477e..0a5393f80 100644 --- a/dict/states.cpp +++ b/dict/states.cpp @@ -241,6 +241,19 @@ void print_state(const char *label, 
STATE *state, int num_joints) { new_line(); } +// Prints out the number of fragments in each segment in a state to +// toappend. +void print_state(STATE *state, int num_joints, STRING *toappend) { + PIECES_STATE pieces; + bin_to_pieces(state, num_joints, pieces); + for (int i = 0; pieces[i] > 0; i++) { + if (i > 0) { + toappend->add_str_int(" ", pieces[i]); + } else { + toappend->add_str_int("", pieces[i]); + } + } +} /** * set_n_ones diff --git a/dict/states.h b/dict/states.h index a478c39ba..ef0640171 100644 --- a/dict/states.h +++ b/dict/states.h @@ -29,6 +29,7 @@ I n c l u d e s ----------------------------------------------------------------------*/ #include "host.h" +#include "strngs.h" /*---------------------------------------------------------------------- T y p e s @@ -64,6 +65,8 @@ int ones_in_state(STATE *state, int num_joints); void print_state(const char *label, STATE *state, int num_joints); +void print_state(STATE *state, int num_joints, STRING *toappend); + void set_n_ones(STATE *state, int n); extern void free_state(STATE *); diff --git a/dict/stopper.cpp b/dict/stopper.cpp index 319da6712..01d99f09d 100644 --- a/dict/stopper.cpp +++ b/dict/stopper.cpp @@ -17,13 +17,11 @@ ******************************************************************************/ #include "stopper.h" -#include "emalloc.h" #include "matchdefs.h" #include "callcpp.h" #include "permute.h" #include "danerror.h" #include "const.h" -#include "freelist.h" #include "efio.h" #include "scanutils.h" #include "unichar.h" @@ -58,6 +56,10 @@ typedef struct UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS]; } EXPANDED_CHOICE; +void DeleteViableChoiceStruct(void *vcs) { + delete (static_cast(vcs)); +} + #define BestCertainty(Choices) \ (((VIABLE_CHOICE) first_node (Choices))->Certainty) @@ -66,10 +68,6 @@ typedef struct #define BestFactor(Choices) \ (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor) -#define AmbigThreshold(F1,F2) \ - (((F2) - (F1)) * tesseract::Dict::kStopperAmbiguityThresholdGain - \ 
- tesseract::Dict::kStopperAmbiguityThresholdOffset) - /**---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------**/ @@ -100,23 +98,72 @@ static void ExpandChoice(VIABLE_CHOICE Choice, } } +VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT(int length) + : Length(length) { + Blob = new CHAR_CHOICE[length]; + segmentation_state = new uinT8[length]; +} + +VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT() : Length(0) { + Blob = NULL; + segmentation_state = NULL; +} + +VIABLE_CHOICE_STRUCT::~VIABLE_CHOICE_STRUCT() { + delete []Blob; + delete []segmentation_state; +} + +void VIABLE_CHOICE_STRUCT::Init( + const WERD_CHOICE &word_choice, + const PIECES_STATE &pieces_state, + const float certainties[], + FLOAT32 adjust_factor) { + this->Rating = word_choice.rating(); + this->Certainty = word_choice.certainty(); + this->AdjustFactor = adjust_factor; + this->ComposedFromCharFragments = false; + ASSERT_HOST(this->Length == word_choice.length()); + + for (int i = 0, bw_idx = 0; i < word_choice.length(); i++, bw_idx++) { + int blob_width = pieces_state[bw_idx]; + CHAR_CHOICE *blob_choice = &this->Blob[i]; + blob_choice->Class = word_choice.unichar_id(i); + blob_choice->NumChunks = blob_width; + blob_choice->Certainty = certainties[i]; + for (int f = 1; f < word_choice.fragment_length(i); ++f) { + blob_width = pieces_state[++bw_idx]; + assert(blob_width > 0); + blob_choice->NumChunks += blob_width; + this->ComposedFromCharFragments = true; + } + this->segmentation_state[i] = blob_choice->NumChunks; + } +} + + +namespace tesseract { + // If the certainty of any chunk in Choice (item1) is not ambiguous with the // corresponding chunk in the best choice (item2), frees Choice and // returns true. 
-static int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice, - void *item2) { // EXPANDED_CHOICE *BestChoice +int Dict::FreeBadChoice( + void *item1, // VIABLE_CHOICE Choice, + void *item2) { // EXPANDED_CHOICE *BestChoice int i, j, Chunk; FLOAT32 Threshold; VIABLE_CHOICE Choice = reinterpret_cast(item1); EXPANDED_CHOICE *BestChoice = reinterpret_cast(item2); - Threshold = AmbigThreshold(BestChoice->Choice->AdjustFactor, - Choice->AdjustFactor); + Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor, + Choice->AdjustFactor); for (i = 0, Chunk = 0; i < Choice->Length; i++) { - for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++){ + for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) { if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] && Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] < Threshold) { - memfree(Choice); + if (stopper_debug_level >= 2) + PrintViableChoice(stderr, "\nDiscarding bad choice: ", Choice); + delete Choice; return true; } } @@ -124,11 +171,6 @@ static int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice, return false; } -namespace tesseract { - -const float Dict::kStopperAmbiguityThresholdGain = 8.0; -const float Dict::kStopperAmbiguityThresholdOffset = 1.5; - bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, WERD_CHOICE *BestChoice, DANGERR *fixpt, @@ -158,7 +200,7 @@ bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, if (stopper_debug_level >= 1) tprintf("\nStopper: %s (word=%c, case=%c)\n", - BestChoice->debug_string(getUnicharset()).string(), + BestChoice->debug_string().string(), (is_valid_word ? 'y' : 'n'), (is_case_ok ? 'y' : 'n')); @@ -198,7 +240,7 @@ bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) { if (stopper_debug_level >= 1) { tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n", - BestChoice.debug_string(getUnicharset()).string(), + BestChoice.debug_string().string(), (valid_word(BestChoice) ? 
'y' : 'n'), (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'), ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y')); @@ -320,10 +362,16 @@ void Dict::FilterWordChoices() { return; // Compute certainties and class for each chunk in best choice. - ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice); - - set_rest (best_choices_, delete_d(list_rest (best_choices_), - &BestChoice, FreeBadChoice)); + VIABLE_CHOICE_STRUCT *best_choice = + (VIABLE_CHOICE_STRUCT *)first_node(best_choices_); + ExpandChoice(best_choice, &BestChoice); + if (stopper_debug_level >= 2) + PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice); + TessResultCallback2* is_bad = + NewPermanentTessCallback(this, &Dict::FreeBadChoice); + set_rest(best_choices_, delete_d(list_rest(best_choices_), + &BestChoice, is_bad)); + delete is_bad; } void Dict::FindClassifierErrors(FLOAT32 MinRating, @@ -371,15 +419,15 @@ void Dict::InitChoiceAccum() { BLOB_WIDTH *BlobWidth, *End; if (best_raw_choice_) - memfree(best_raw_choice_); + delete best_raw_choice_; best_raw_choice_ = NULL; if (best_choices_) - destroy_nodes(best_choices_, memfree); + destroy_nodes(best_choices_, DeleteViableChoiceStruct); best_choices_ = NIL_LIST; if (raw_choices_) - destroy_nodes(raw_choices_, memfree); + destroy_nodes(raw_choices_, DeleteViableChoiceStruct); raw_choices_ = NIL_LIST; EnableChoiceAccum(); @@ -391,7 +439,7 @@ void Dict::InitChoiceAccum() { } void Dict::ClearBestChoiceAccum() { - if (best_choices_) destroy_nodes(best_choices_, memfree); + if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct); best_choices_ = NIL_LIST; } @@ -420,7 +468,6 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], bool raw_choice, WERD_CHOICE *WordChoice) { - VIABLE_CHOICE NewChoice; LIST ChoicesList; LIST Choices; FLOAT32 Threshold; @@ -429,14 +476,15 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor, return; if (raw_choice) { - if (!best_raw_choice_) - 
best_raw_choice_ = NewViableChoice(*WordChoice, AdjustFactor, Certainties); - else if (WordChoice->rating() < best_raw_choice_->Rating) { - if (ChoiceSameAs(*WordChoice, best_raw_choice_)) - FillViableChoice(*WordChoice, AdjustFactor, Certainties, true, + if (!best_raw_choice_) { + best_raw_choice_ = + NewViableChoice(*WordChoice, AdjustFactor, Certainties); + } else if (WordChoice->rating() < best_raw_choice_->Rating) { + if (ChoiceSameAs(*WordChoice, best_raw_choice_)) { + FillViableChoice(*WordChoice, AdjustFactor, Certainties, best_raw_choice_); - else { - memfree(best_raw_choice_); + } else { + delete best_raw_choice_; best_raw_choice_ = NewViableChoice(*WordChoice, AdjustFactor, Certainties); } @@ -449,16 +497,20 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor, // Throw out obviously bad choices to save some work. if (ChoicesList != NIL_LIST) { - Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor); - if (Threshold > -kStopperAmbiguityThresholdOffset) - Threshold = -kStopperAmbiguityThresholdOffset; + Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor); + if (Threshold > -stopper_ambiguity_threshold_offset) + Threshold = -stopper_ambiguity_threshold_offset; if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) { // Set the rating of the word to be terrible, so that it does not // get chosen as the best choice. 
if (stopper_debug_level >= 2) { - tprintf("Discarding a choice with an overly low certainty" - " %.4f vs best choice certainty %.4f\n", - WordChoice->certainty(), BestCertainty(ChoicesList)); + STRING bad_string; + WordChoice->string_and_lengths(&bad_string, NULL); + tprintf("Discarding choice \"%s\" with an overly low certainty" + " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n", + bad_string.string(), WordChoice->certainty(), + BestCertainty(ChoicesList), + Threshold + BestCertainty(ChoicesList)); } WordChoice->set_rating(WERD_CHOICE::kBadRating); return; @@ -466,7 +518,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor, } // See if a choice with the same text string has already been found. - NewChoice = NULL; + VIABLE_CHOICE NewChoice = NULL; Choices = ChoicesList; iterate(Choices) { @@ -480,11 +532,10 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor, } if (NewChoice) { - FillViableChoice(*WordChoice, AdjustFactor, Certainties, true, NewChoice); + FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice); ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node); - } - else { - NewChoice = NewViableChoice (*WordChoice, AdjustFactor, Certainties); + } else { + NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties); } ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings); @@ -494,7 +545,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor, if (count (ChoicesList) > tessedit_truncate_wordchoice_log) { Choices = (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log); - destroy_nodes (list_rest (Choices), Efree); + destroy_nodes(list_rest (Choices), DeleteViableChoiceStruct); set_rest(Choices, NIL_LIST); } @@ -513,7 +564,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, bool *modified_blobs) { if (stopper_debug_level > 2) { tprintf("\nRunning NoDangerousAmbig() for %s\n", - best_choice->debug_string(getUnicharset()).string()); + best_choice->debug_string().string()); } // Construct BLOB_CHOICE_LIST_VECTOR 
with ambiguities @@ -549,8 +600,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, for (i = 0; i < best_choice->length(); ++i) { BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST(); BLOB_CHOICE_IT lst_it(lst); + // TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights + // or are these fake ones good enough? lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i), - 0.0, 0.0, -1, -1, -1)); + 0.0, 0.0, -1, -1, -1, 0, 1, false)); ambig_blob_choices.push_back(lst); } } @@ -630,7 +683,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]); bc_it.add_to_end(new BLOB_CHOICE( ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, - -1, -1, -1)); + -1, -1, -1, 0, 1, false)); } } spec_it.forward(); @@ -650,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, } // end searching AmbigSpec_LIST } // end searching best_choice } // end searching replace and dangerous ambigs - if (modified_best_choice) best_choice->populate_unichars(getUnicharset()); + if (modified_best_choice) best_choice->populate_unichars(); // If any ambiguities were found permute the constructed ambig_blob_choices // to see if an alternative dictionary word can be found. 
if (ambigs_found) { @@ -666,7 +719,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, if (ambigs_found) { if (stopper_debug_level >= 1) { tprintf ("Stopper: Possible ambiguous word = %s\n", - alt_word->debug_string(getUnicharset()).string()); + alt_word->debug_string().string()); } if (fixpt != NULL) { // Note: Currently character choices combined from fragments can only @@ -691,6 +744,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, } delete alt_word; } + if (output_ambig_words_file_ != NULL) { + fprintf(output_ambig_words_file_, "\n"); + } + ambig_blob_choices.delete_data_pointers(); return !ambigs_found; } @@ -714,7 +771,6 @@ void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) { return; } } - mem_tidy (1); cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n", Choice->Length, LastChunk, Blob); assert(false); // this should never get executed @@ -748,7 +804,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, for (i = 0; i < fraglen; ++i) { if (fraglen > 1) { STRING frag_str = - CHAR_FRAGMENT::to_string(temp_uch, i, fraglen); + CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false); getUnicharset().unichar_insert(frag_str.string()); uch_id = getUnicharset().unichar_to_id(frag_str.string()); } @@ -756,7 +812,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, STRING correct_frag_uch = CHAR_FRAGMENT::to_string(correct_ngram_str, temp_blob_index - begin_blob_index, - num_blobs_to_replace); + num_blobs_to_replace, false); getUnicharset().unichar_insert(correct_frag_uch.string()); UNICHAR_ID correct_frag_uch_id = getUnicharset().unichar_to_id(correct_frag_uch.string()); @@ -825,10 +881,9 @@ VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice, const float Certainties[]) { int Length = WordChoice.length(); assert (Length <= MAX_NUM_CHUNKS && Length > 0); - VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc ( - sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof 
(CHAR_CHOICE)); - FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice); - return (NewChoice); + VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length); + FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice); + return NewChoice; } void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) { @@ -864,35 +919,10 @@ void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice void Dict::FillViableChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor, const float Certainties[], - bool SameString, VIABLE_CHOICE ViableChoice) { - CHAR_CHOICE *NewChar; - BLOB_WIDTH *BlobWidth; - int x; + VIABLE_CHOICE ViableChoice) { + ViableChoice->Init(WordChoice, current_segmentation_, Certainties, + AdjustFactor); - ViableChoice->Rating = WordChoice.rating(); - ViableChoice->Certainty = WordChoice.certainty(); - ViableChoice->AdjustFactor = AdjustFactor; - ViableChoice->ComposedFromCharFragments = false; - if (!SameString) { - ViableChoice->Length = WordChoice.length(); - } - for (x = 0, - NewChar = &(ViableChoice->Blob[0]), - BlobWidth = current_segmentation_; - x < WordChoice.length(); - x++, NewChar++, Certainties++, BlobWidth++) { - if (!SameString) { - NewChar->Class = WordChoice.unichar_id(x); - } - NewChar->NumChunks = *BlobWidth; - NewChar->Certainty = *Certainties; - for (int i = 1; i < WordChoice.fragment_length(x); ++i) { - BlobWidth++; - assert(*BlobWidth > 0); - NewChar->NumChunks += *BlobWidth; - ViableChoice->ComposedFromCharFragments = true; - } - } } bool Dict::StringSameAs(const WERD_CHOICE &WordChoice, diff --git a/dict/stopper.h b/dict/stopper.h index d9993c4be..6ff597be9 100644 --- a/dict/stopper.h +++ b/dict/stopper.h @@ -27,6 +27,8 @@ #include "states.h" #include "unichar.h" +class WERD_CHOICE; + typedef uinT8 BLOB_WIDTH; struct DANGERR_INFO { @@ -50,13 +52,36 @@ struct CHAR_CHOICE { float Certainty; }; -struct VIABLE_CHOICE_STRUCT { +class VIABLE_CHOICE_STRUCT { + public: + 
VIABLE_CHOICE_STRUCT(); + explicit VIABLE_CHOICE_STRUCT(int length); + ~VIABLE_CHOICE_STRUCT(); + + // Fill in the data with these values. + void Init(const WERD_CHOICE& word_choice, + const PIECES_STATE& pieces_state, + const float certainties[], + FLOAT32 adjust_factor); + + int Length; float Rating; float Certainty; FLOAT32 AdjustFactor; - int Length; bool ComposedFromCharFragments; - CHAR_CHOICE Blob[1]; + CHAR_CHOICE *Blob; + + // segmentation_state: for each choice, how many consecutive blobs + // does it use? + uinT8 *segmentation_state; + + private: + // Disallow assignment and copy construction + VIABLE_CHOICE_STRUCT(const VIABLE_CHOICE_STRUCT &other) + : Length(0), Blob(NULL), segmentation_state(NULL) {} + VIABLE_CHOICE_STRUCT &operator=(const VIABLE_CHOICE_STRUCT &other) { + return *this; + } }; typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE; diff --git a/dict/trie.cpp b/dict/trie.cpp index a981d7e95..ededbaf12 100644 --- a/dict/trie.cpp +++ b/dict/trie.cpp @@ -40,6 +40,16 @@ namespace tesseract { +const char kDoNotReverse[] = "RRP_DO_NO_REVERSE"; +const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"; +const char kForceReverse[] = "RRP_FORCE_REVERSE"; + +const char * const RTLReversePolicyNames[] = { + kDoNotReverse, + kReverseIfHasRTL, + kForceReverse +}; + const char Trie::kAlphaPatternUnicode[] = "\u2000"; const char Trie::kDigitPatternUnicode[] = "\u2001"; const char Trie::kAlphanumPatternUnicode[] = "\u2002"; @@ -47,6 +57,10 @@ const char Trie::kPuncPatternUnicode[] = "\u2003"; const char Trie::kLowerPatternUnicode[] = "\u2004"; const char Trie::kUpperPatternUnicode[] = "\u2005"; +const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { + return RTLReversePolicyNames[reverse_policy]; +} + // Reset the Trie to empty. 
void Trie::clear() { nodes_.delete_data_pointers(); @@ -156,10 +170,15 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); } -void Trie::add_word_to_dawg(const WERD_CHOICE &word, +bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector *repetitions) { - if (word.length() <= 0) return; // can't add empty words + if (word.length() <= 0) return false; // can't add empty words if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length()); + // Make sure the word does not contain invalid unchar ids. + for (int i = 0; i < word.length(); ++i) { + if (word.unichar_id(i) < 0 || + word.unichar_id(i) >= unicharset_size_) return false; + } EDGE_RECORD *edge_ptr; NODE_REF last_node = 0; @@ -233,6 +252,9 @@ void Trie::add_word_to_dawg(const WERD_CHOICE &word, if (add_failed) { tprintf("Re-initializing document dictionary...\n"); clear(); + return false; + } else { + return true; } } @@ -244,7 +266,8 @@ NODE_REF Trie::new_dawg_node() { } bool Trie::read_word_list(const char *filename, - const UNICHARSET &unicharset) { + const UNICHARSET &unicharset, + Trie::RTLReversePolicy reverse_policy) { FILE *word_file; char string[CHARS_PER_LINE]; int word_count = 0; @@ -254,6 +277,11 @@ bool Trie::read_word_list(const char *filename, while (fgets(string, CHARS_PER_LINE, word_file) != NULL) { chomp_string(string); // remove newline WERD_CHOICE word(string, unicharset); + if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && + word.has_rtl_unichar_id()) || + reverse_policy == RRP_FORCE_REVERSE) { + word.reverse_and_mirror_unichar_ids(); + } ++word_count; if (debug_level_ && word_count % 10000 == 0) tprintf("Read %d words so far\n", word_count); @@ -290,6 +318,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) { unicharset->unichar_insert(kUpperPatternUnicode); upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode); initialized_patterns_ = true; + unicharset_size_ = unicharset->size(); } void 
Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, @@ -351,7 +380,7 @@ bool Trie::read_pattern_list(const char *filename, chomp_string(string); // remove newline // Parse the pattern and construct a unichar id vector. // Record the number of repetitions of each unichar in the parallel vector. - WERD_CHOICE word; + WERD_CHOICE word(&unicharset); GenericVector repetitions_vec; const char *str_ptr = string; int step = unicharset.step(str_ptr); @@ -397,7 +426,7 @@ bool Trie::read_pattern_list(const char *filename, // Insert the pattern into the trie. if (debug_level_ > 2) { tprintf("Inserting expanded user pattern %s\n", - word.debug_string(unicharset).string()); + word.debug_string().string()); } if (!this->word_in_dawg(word)) { this->add_word_to_dawg(word, &repetitions_vec); diff --git a/dict/trie.h b/dict/trie.h index 2196e28d1..bf1bfb83b 100644 --- a/dict/trie.h +++ b/dict/trie.h @@ -61,6 +61,12 @@ namespace tesseract { */ class Trie : public Dawg { public: + enum RTLReversePolicy { + RRP_DO_NO_REVERSE, + RRP_REVERSE_IF_HAS_RTL, + RRP_FORCE_REVERSE, + }; + // Minimum number of concrete characters at the beginning of user patterns. static const int kSaneNumConcreteChars = 4; // Various unicode whitespace characters are used to denote unichar patterns, @@ -73,6 +79,9 @@ class Trie : public Dawg { static const char kLowerPatternUnicode[]; static const char kUpperPatternUnicode[]; + static const char *get_reverse_policy_name( + RTLReversePolicy reverse_policy); + // max_num_edges argument allows limiting the amount of memory this // Trie can consume (if a new word insert would cause the Trie to // contain more edges than max_num_edges, all the edges are cleared @@ -86,7 +95,7 @@ class Trie : public Dawg { new_dawg_node(); // need to allocate node 0 initialized_patterns_ = false; } - ~Trie() { nodes_.delete_data_pointers(); } + virtual ~Trie() { nodes_.delete_data_pointers(); } // Reset the Trie to empty. 
void clear(); @@ -149,8 +158,11 @@ class Trie : public Dawg { SquishedDawg *trie_to_dawg(); // Inserts the list of words from the given file into the Trie. + // If reverse is true, calls WERD_CHOICE::reverse_unichar_ids_if_rtl() + // on each word before inserting it into the Trie. bool read_word_list(const char *filename, - const UNICHARSET &unicharset); + const UNICHARSET &unicharset, + Trie::RTLReversePolicy reverse); // Inserts the list of patterns from the given file into the Trie. // The pattern list file should contain one pattern per line in UTF-8 format. @@ -225,10 +237,13 @@ class Trie : public Dawg { // whether the unichar id with the corresponding index in the word is allowed // to repeat an unlimited number of times. For each entry that is true, MARKER // flag of the corresponding edge created for this unichar id is set to true). - void add_word_to_dawg(const WERD_CHOICE &word, + // + // Return true if add succeeded, false otherwise (e.g. when a word contained + // an invalid unichar id or the trie was getting too large and was cleared). 
+ bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector *repetitions); - void add_word_to_dawg(const WERD_CHOICE &word) { - add_word_to_dawg(word, NULL); + bool add_word_to_dawg(const WERD_CHOICE &word) { + return add_word_to_dawg(word, NULL); } protected: @@ -377,11 +392,11 @@ class Trie : public Dawg { UNICHAR_ID character_class_to_pattern(char ch); // Member variables - TRIE_NODES nodes_; ///< vector of nodes in the Trie - uinT64 num_edges_; ///< sum of all edges (forward and backward) - uinT64 max_num_edges_; ///< maximum number of edges allowed - uinT64 deref_direction_mask_; ///< mask for EDGE_REF to extract direction - uinT64 deref_node_index_mask_; ///< mask for EDGE_REF to extract node index + TRIE_NODES nodes_; // vector of nodes in the Trie + uinT64 num_edges_; // sum of all edges (forward and backward) + uinT64 max_num_edges_; // maximum number of edges allowed + uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction + uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index // Variables for translating character class codes denoted in user patterns // file to the unichar ids used to represent them in a Trie. bool initialized_patterns_;