/////////////////////////////////////////////////////////////////////// // File: dict.cpp // Description: dict class. // Author: Samuel Charron // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #include "dict.h" namespace tesseract { class Image; Dict::Dict(Image* image_ptr) : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), image_ptr_(image_ptr) { dang_ambigs_table_ = NULL; replace_ambigs_table_ = NULL; keep_word_choices_ = false; reject_offset_ = 0.0; best_raw_choice_ = NULL; best_choices_ = NIL; raw_choices_ = NIL; go_deeper_fxn_ = NULL; hyphen_word_ = NULL; last_word_on_line_ = false; hyphen_unichar_id_ = INVALID_UNICHAR_ID; document_words_ = NULL; pending_words_ = NULL; freq_dawg_ = NULL; } Dict::~Dict() { if (hyphen_word_ != NULL) delete hyphen_word_; } // Returns true if in light of the current state the letter at word_index // in the given word is allowed according to at least one of the dawgs in // dawgs_. // // See more extensive comments in dict.h where this function is declared. // int Dict::def_letter_is_okay(void* void_dawg_args, int word_index, const void *void_word, bool word_end) { DawgArgs *dawg_args = reinterpret_cast(void_dawg_args); const WERD_CHOICE *word = reinterpret_cast(void_word); if (dawg_debug_level >= 3) { tprintf("def_letter_is_okay: word_index=%d word_end=%d" " word=%s num active dawgs=%d num constraints=%d\n", word_index, word_end, word->debug_string(getUnicharset()).string(), dawg_args->active_dawgs->length(), dawg_args->constraints->length()); } // Do not accept words that contain kPatternUnicharID. // (otherwise pattern dawgs would not function correctly). // Do not accept words containing INVALID_UNICHAR_IDs. UNICHAR_ID unichar_id = word->unichar_id(word_index); if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) { dawg_args->permuter = NO_PERM; return NO_PERM; } // Initialization. PermuterType current_permuter = NO_PERM; dawg_args->updated_active_dawgs->clear(); const DawgInfoVector &constraints = *(dawg_args->constraints); *dawg_args->updated_constraints = constraints; // Go over the active_dawgs vector and insert DawgInfo records with the // updated ref (an edge with the corresponding unichar id) into // dawg_args->updated_active_dawgs. for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { const DawgInfo &info = (*dawg_args->active_dawgs)[a]; const Dawg *dawg = dawgs_[info.dawg_index]; // Obtain unichar_id at this position (could be changed later, so this // needs to be inside the loop over all active dawgs). unichar_id = word->unichar_id(word_index); // The number dawg generalizes all digits to be kPatternUnicharID, // so try to match kPatternUnicharID if the current unichar is a digit. if (dawg->type() == DAWG_TYPE_NUMBER && getUnicharset().get_isdigit(unichar_id)) { unichar_id = Dawg::kPatternUnicharID; } // Get the starting node for this letter. NODE_REF node; if (info.ref == NO_EDGE) { node = 0; // beginning to explore this dawg } else { node = dawg->next_node(info.ref); if (node == 0) node = NO_EDGE; // end of word } // Find the edge out of the node for the curent unichar_id. EDGE_REF edge = (node != NO_EDGE) ? dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE; if (dawg_debug_level >= 3) { tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", info.dawg_index, node, edge); } if (edge != NO_EDGE) { // the unichar was found in the current dawg if (ConstraintsOk(*(dawg_args->updated_constraints), word_end, dawg->type())) { UpdatePermuter(dawg->permuter(), ¤t_permuter); dawg_args->updated_active_dawgs->add_unique( DawgInfo(info.dawg_index, edge), "Append current dawg to updated active dawgs: "); } } else { // the unichar was not found in the current dawg // Handle leading/trailing punctuation dawgs that denote a word pattern // as an edge with kPatternUnicharID. If such an edge is found we add a // constraint denoting the state of the dawg before the word pattern. // This constraint will be applied later when this dawg is found among // successor dawgs as well potentially at the end of the word. if (dawg->type() == DAWG_TYPE_PUNCTUATION) { edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end); if (edge != NO_EDGE) { dawg_args->updated_constraints->add_unique( DawgInfo(info.dawg_index, edge), "Recording constraint: "); } else { // Do not explore successors of this dawg, since this // must be invalid leading or trailing punctuation. if (dawg_debug_level >= 3) { tprintf("Invalid punctuation from dawg %d\n", info.dawg_index); } continue; } } if (info.ref == NO_EDGE) { if (dawg_debug_level >= 3) { tprintf("No letters matched in dawg %d\n", info.dawg_index); } continue; } // Discard the dawg if the pattern can not end at previous letter. if (edge == NO_EDGE && // previous part is not leading punctuation !dawg->end_of_word(info.ref)) { if (dawg_debug_level >= 3) { tprintf("No valid pattern end in dawg %d\n", info.dawg_index); } continue; } // Look for the unichar in each of this dawg's successors // and append those in which it is found to active_dawgs. const SuccessorList &slist = *(successors_[info.dawg_index]); for (int s = 0; s < slist.length(); ++s) { int sdawg_index = slist[s]; const Dawg *sdawg = dawgs_[sdawg_index]; NODE_REF snode = 0; // Apply constraints to the successor dawg. for (int c = 0; c < constraints.length(); ++c) { // If the successor dawg is described in the constraints change // the start ref from 0 to the one recorded as the constraint. const DawgInfo &cinfo = constraints[c]; if (cinfo.dawg_index == sdawg_index) { snode = sdawg->next_node(cinfo.ref); // Make sure we do not search the successor dawg if after // applying the saved constraint we are at the end of the word. if (snode == 0) snode = NO_EDGE; if (dawg_debug_level >= 3) { tprintf("Applying constraint [%d, " REFFORMAT "]\n", sdawg_index, snode); } } } // Look for the letter in this successor dawg. EDGE_REF sedge = sdawg->edge_char_of( snode, word->unichar_id(word_index), word_end); // If we found the letter append sdawg to the active_dawgs list. if (sedge != NO_EDGE && ConstraintsOk(*(dawg_args->updated_constraints), word_end, dawgs_[sdawg_index]->type())) { UpdatePermuter(sdawg->permuter(), ¤t_permuter); if (sdawg->next_node(sedge) != 0) { // if not word end dawg_args->updated_active_dawgs->add_unique( DawgInfo(sdawg_index, sedge), "Append successor to updated active dawgs: "); } } } // end successors loop } // end if/else } // end for // Update dawg_args->permuter if it used to be NO_PERM or if we found // the current letter in a non-punctuation dawg. This allows preserving // information on which dawg the "core" word came from. if ((current_permuter == PUNC_PERM && current_permuter > dawg_args->permuter) || current_permuter != PUNC_PERM) { dawg_args->permuter = current_permuter; } return dawg_args->permuter; } } // namespace tesseract