tesseract/dict/dict.cpp
theraysmith 3a13d80d24 Changes to dict for 3.00
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2009-07-11 02:20:33 +00:00

213 lines
8.5 KiB
C++

///////////////////////////////////////////////////////////////////////
// File: dict.cpp
// Description: dict class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "dict.h"
namespace tesseract {
// Forward declaration; the Dict constructor below takes and stores an
// Image pointer.
class Image;
// Builds a Dict that keeps the given image_ptr and installs
// def_letter_is_okay as the letter_is_okay_ callback. Every other member
// touched here is reset to NULL, NIL, false, 0.0 or INVALID_UNICHAR_ID.
Dict::Dict(Image* image_ptr)
    : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
      image_ptr_(image_ptr) {
  // Members that start out as NULL.
  dang_ambigs_table_ = NULL;
  replace_ambigs_table_ = NULL;
  best_raw_choice_ = NULL;
  go_deeper_fxn_ = NULL;
  hyphen_word_ = NULL;
  document_words_ = NULL;
  pending_words_ = NULL;
  freq_dawg_ = NULL;

  // Members that start out as NIL.
  best_choices_ = NIL;
  raw_choices_ = NIL;

  // Boolean flags start out false.
  keep_word_choices_ = false;
  last_word_on_line_ = false;

  // Remaining scalar members.
  reject_offset_ = 0.0;
  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
}
// Deletes hyphen_word_ when one is set. No other member is released here.
Dict::~Dict() {
  // The explicit null check is kept on purpose so that the delete
  // expression is only ever evaluated for a non-null pointer.
  if (hyphen_word_ == NULL) return;
  delete hyphen_word_;
}
// Returns true if in light of the current state the letter at word_index
// in the given word is allowed according to at least one of the dawgs in
// dawgs_.
//
// See more extensive comments in dict.h where this function is declared.
//
int Dict::def_letter_is_okay(void* void_dawg_args, int word_index,
const void *void_word, bool word_end) {
DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
const WERD_CHOICE *word = reinterpret_cast<const WERD_CHOICE*>(void_word);
if (dawg_debug_level >= 3) {
tprintf("def_letter_is_okay: word_index=%d word_end=%d"
" word=%s num active dawgs=%d num constraints=%d\n",
word_index, word_end,
word->debug_string(getUnicharset()).string(),
dawg_args->active_dawgs->length(),
dawg_args->constraints->length());
}
// Do not accept words that contain kPatternUnicharID.
// (otherwise pattern dawgs would not function correctly).
// Do not accept words containing INVALID_UNICHAR_IDs.
UNICHAR_ID unichar_id = word->unichar_id(word_index);
if (unichar_id == Dawg::kPatternUnicharID ||
unichar_id == INVALID_UNICHAR_ID) {
dawg_args->permuter = NO_PERM;
return NO_PERM;
}
// Initialization.
PermuterType current_permuter = NO_PERM;
dawg_args->updated_active_dawgs->clear();
const DawgInfoVector &constraints = *(dawg_args->constraints);
*dawg_args->updated_constraints = constraints;
// Go over the active_dawgs vector and insert DawgInfo records with the
// updated ref (an edge with the corresponding unichar id) into
// dawg_args->updated_active_dawgs.
for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
const DawgInfo &info = (*dawg_args->active_dawgs)[a];
const Dawg *dawg = dawgs_[info.dawg_index];
// Obtain unichar_id at this position (could be changed later, so this
// needs to be inside the loop over all active dawgs).
unichar_id = word->unichar_id(word_index);
// The number dawg generalizes all digits to be kPatternUnicharID,
// so try to match kPatternUnicharID if the current unichar is a digit.
if (dawg->type() == DAWG_TYPE_NUMBER &&
getUnicharset().get_isdigit(unichar_id)) {
unichar_id = Dawg::kPatternUnicharID;
}
// Get the starting node for this letter.
NODE_REF node;
if (info.ref == NO_EDGE) {
node = 0; // beginning to explore this dawg
} else {
node = dawg->next_node(info.ref);
if (node == 0) node = NO_EDGE; // end of word
}
// Find the edge out of the node for the curent unichar_id.
EDGE_REF edge = (node != NO_EDGE) ?
dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE;
if (dawg_debug_level >= 3) {
tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
info.dawg_index, node, edge);
}
if (edge != NO_EDGE) { // the unichar was found in the current dawg
if (ConstraintsOk(*(dawg_args->updated_constraints),
word_end, dawg->type())) {
UpdatePermuter(dawg->permuter(), &current_permuter);
dawg_args->updated_active_dawgs->add_unique(
DawgInfo(info.dawg_index, edge),
"Append current dawg to updated active dawgs: ");
}
} else { // the unichar was not found in the current dawg
// Handle leading/trailing punctuation dawgs that denote a word pattern
// as an edge with kPatternUnicharID. If such an edge is found we add a
// constraint denoting the state of the dawg before the word pattern.
// This constraint will be applied later when this dawg is found among
// successor dawgs as well potentially at the end of the word.
if (dawg->type() == DAWG_TYPE_PUNCTUATION) {
edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end);
if (edge != NO_EDGE) {
dawg_args->updated_constraints->add_unique(
DawgInfo(info.dawg_index, edge), "Recording constraint: ");
} else {
// Do not explore successors of this dawg, since this
// must be invalid leading or trailing punctuation.
if (dawg_debug_level >= 3) {
tprintf("Invalid punctuation from dawg %d\n", info.dawg_index);
}
continue;
}
}
if (info.ref == NO_EDGE) {
if (dawg_debug_level >= 3) {
tprintf("No letters matched in dawg %d\n", info.dawg_index);
}
continue;
}
// Discard the dawg if the pattern can not end at previous letter.
if (edge == NO_EDGE && // previous part is not leading punctuation
!dawg->end_of_word(info.ref)) {
if (dawg_debug_level >= 3) {
tprintf("No valid pattern end in dawg %d\n", info.dawg_index);
}
continue;
}
// Look for the unichar in each of this dawg's successors
// and append those in which it is found to active_dawgs.
const SuccessorList &slist = *(successors_[info.dawg_index]);
for (int s = 0; s < slist.length(); ++s) {
int sdawg_index = slist[s];
const Dawg *sdawg = dawgs_[sdawg_index];
NODE_REF snode = 0;
// Apply constraints to the successor dawg.
for (int c = 0; c < constraints.length(); ++c) {
// If the successor dawg is described in the constraints change
// the start ref from 0 to the one recorded as the constraint.
const DawgInfo &cinfo = constraints[c];
if (cinfo.dawg_index == sdawg_index) {
snode = sdawg->next_node(cinfo.ref);
// Make sure we do not search the successor dawg if after
// applying the saved constraint we are at the end of the word.
if (snode == 0) snode = NO_EDGE;
if (dawg_debug_level >= 3) {
tprintf("Applying constraint [%d, " REFFORMAT "]\n",
sdawg_index, snode);
}
}
}
// Look for the letter in this successor dawg.
EDGE_REF sedge = sdawg->edge_char_of(
snode, word->unichar_id(word_index), word_end);
// If we found the letter append sdawg to the active_dawgs list.
if (sedge != NO_EDGE &&
ConstraintsOk(*(dawg_args->updated_constraints), word_end,
dawgs_[sdawg_index]->type())) {
UpdatePermuter(sdawg->permuter(), &current_permuter);
if (sdawg->next_node(sedge) != 0) { // if not word end
dawg_args->updated_active_dawgs->add_unique(
DawgInfo(sdawg_index, sedge),
"Append successor to updated active dawgs: ");
}
}
} // end successors loop
} // end if/else
} // end for
// Update dawg_args->permuter if it used to be NO_PERM or if we found
// the current letter in a non-punctuation dawg. This allows preserving
// information on which dawg the "core" word came from.
if ((current_permuter == PUNC_PERM &&
current_permuter > dawg_args->permuter) ||
current_permuter != PUNC_PERM) {
dawg_args->permuter = current_permuter;
}
return dawg_args->permuter;
}
} // namespace tesseract