mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-05 02:47:00 +08:00
3a13d80d24
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20
213 lines
8.5 KiB
C++
213 lines
8.5 KiB
C++
///////////////////////////////////////////////////////////////////////
|
|
// File: dict.cpp
|
|
// Description: dict class.
|
|
// Author: Samuel Charron
|
|
//
|
|
// (C) Copyright 2006, Google Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
#include "dict.h"
|
|
|
|
namespace tesseract {
|
|
|
|
// Forward declaration: the code in this file only handles Image via pointer.
class Image;
|
|
|
|
// Builds a Dict attached to image_ptr. The letter_is_okay_ callback is
// pointed at def_letter_is_okay; every other member touched here is put
// into its "unset" state (NULL / NIL / false / 0 / INVALID_UNICHAR_ID).
Dict::Dict(Image* image_ptr)
    : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
      image_ptr_(image_ptr) {
  // Members that start out as NULL; nothing is allocated by the constructor.
  freq_dawg_ = NULL;
  pending_words_ = NULL;
  document_words_ = NULL;
  hyphen_word_ = NULL;
  go_deeper_fxn_ = NULL;
  best_raw_choice_ = NULL;
  replace_ambigs_table_ = NULL;
  dang_ambigs_table_ = NULL;

  // Choice lists start out as NIL.
  raw_choices_ = NIL;
  best_choices_ = NIL;

  // Flags and scalar state.
  last_word_on_line_ = false;
  keep_word_choices_ = false;
  reject_offset_ = 0.0;
  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
}
|
|
|
|
// Releases hyphen_word_ if one is currently held. No other member is freed
// by this destructor.
Dict::~Dict() {
  if (hyphen_word_ == NULL) {
    return;  // nothing to release
  }
  delete hyphen_word_;
}
|
|
|
|
// Returns true if in light of the current state the letter at word_index
|
|
// in the given word is allowed according to at least one of the dawgs in
|
|
// dawgs_.
|
|
//
|
|
// See more extensive comments in dict.h where this function is declared.
|
|
//
|
|
int Dict::def_letter_is_okay(void* void_dawg_args, int word_index,
|
|
const void *void_word, bool word_end) {
|
|
DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
|
|
const WERD_CHOICE *word = reinterpret_cast<const WERD_CHOICE*>(void_word);
|
|
|
|
if (dawg_debug_level >= 3) {
|
|
tprintf("def_letter_is_okay: word_index=%d word_end=%d"
|
|
" word=%s num active dawgs=%d num constraints=%d\n",
|
|
word_index, word_end,
|
|
word->debug_string(getUnicharset()).string(),
|
|
dawg_args->active_dawgs->length(),
|
|
dawg_args->constraints->length());
|
|
}
|
|
|
|
// Do not accept words that contain kPatternUnicharID.
|
|
// (otherwise pattern dawgs would not function correctly).
|
|
// Do not accept words containing INVALID_UNICHAR_IDs.
|
|
UNICHAR_ID unichar_id = word->unichar_id(word_index);
|
|
if (unichar_id == Dawg::kPatternUnicharID ||
|
|
unichar_id == INVALID_UNICHAR_ID) {
|
|
dawg_args->permuter = NO_PERM;
|
|
return NO_PERM;
|
|
}
|
|
|
|
// Initialization.
|
|
PermuterType current_permuter = NO_PERM;
|
|
dawg_args->updated_active_dawgs->clear();
|
|
const DawgInfoVector &constraints = *(dawg_args->constraints);
|
|
*dawg_args->updated_constraints = constraints;
|
|
|
|
// Go over the active_dawgs vector and insert DawgInfo records with the
|
|
// updated ref (an edge with the corresponding unichar id) into
|
|
// dawg_args->updated_active_dawgs.
|
|
for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
|
|
const DawgInfo &info = (*dawg_args->active_dawgs)[a];
|
|
const Dawg *dawg = dawgs_[info.dawg_index];
|
|
// Obtain unichar_id at this position (could be changed later, so this
|
|
// needs to be inside the loop over all active dawgs).
|
|
unichar_id = word->unichar_id(word_index);
|
|
// The number dawg generalizes all digits to be kPatternUnicharID,
|
|
// so try to match kPatternUnicharID if the current unichar is a digit.
|
|
if (dawg->type() == DAWG_TYPE_NUMBER &&
|
|
getUnicharset().get_isdigit(unichar_id)) {
|
|
unichar_id = Dawg::kPatternUnicharID;
|
|
}
|
|
// Get the starting node for this letter.
|
|
NODE_REF node;
|
|
if (info.ref == NO_EDGE) {
|
|
node = 0; // beginning to explore this dawg
|
|
} else {
|
|
node = dawg->next_node(info.ref);
|
|
if (node == 0) node = NO_EDGE; // end of word
|
|
}
|
|
// Find the edge out of the node for the curent unichar_id.
|
|
EDGE_REF edge = (node != NO_EDGE) ?
|
|
dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE;
|
|
|
|
if (dawg_debug_level >= 3) {
|
|
tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
|
|
info.dawg_index, node, edge);
|
|
}
|
|
|
|
if (edge != NO_EDGE) { // the unichar was found in the current dawg
|
|
if (ConstraintsOk(*(dawg_args->updated_constraints),
|
|
word_end, dawg->type())) {
|
|
UpdatePermuter(dawg->permuter(), ¤t_permuter);
|
|
dawg_args->updated_active_dawgs->add_unique(
|
|
DawgInfo(info.dawg_index, edge),
|
|
"Append current dawg to updated active dawgs: ");
|
|
}
|
|
} else { // the unichar was not found in the current dawg
|
|
// Handle leading/trailing punctuation dawgs that denote a word pattern
|
|
// as an edge with kPatternUnicharID. If such an edge is found we add a
|
|
// constraint denoting the state of the dawg before the word pattern.
|
|
// This constraint will be applied later when this dawg is found among
|
|
// successor dawgs as well potentially at the end of the word.
|
|
if (dawg->type() == DAWG_TYPE_PUNCTUATION) {
|
|
edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end);
|
|
if (edge != NO_EDGE) {
|
|
dawg_args->updated_constraints->add_unique(
|
|
DawgInfo(info.dawg_index, edge), "Recording constraint: ");
|
|
} else {
|
|
// Do not explore successors of this dawg, since this
|
|
// must be invalid leading or trailing punctuation.
|
|
if (dawg_debug_level >= 3) {
|
|
tprintf("Invalid punctuation from dawg %d\n", info.dawg_index);
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (info.ref == NO_EDGE) {
|
|
if (dawg_debug_level >= 3) {
|
|
tprintf("No letters matched in dawg %d\n", info.dawg_index);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Discard the dawg if the pattern can not end at previous letter.
|
|
if (edge == NO_EDGE && // previous part is not leading punctuation
|
|
!dawg->end_of_word(info.ref)) {
|
|
if (dawg_debug_level >= 3) {
|
|
tprintf("No valid pattern end in dawg %d\n", info.dawg_index);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Look for the unichar in each of this dawg's successors
|
|
// and append those in which it is found to active_dawgs.
|
|
const SuccessorList &slist = *(successors_[info.dawg_index]);
|
|
for (int s = 0; s < slist.length(); ++s) {
|
|
int sdawg_index = slist[s];
|
|
const Dawg *sdawg = dawgs_[sdawg_index];
|
|
NODE_REF snode = 0;
|
|
// Apply constraints to the successor dawg.
|
|
for (int c = 0; c < constraints.length(); ++c) {
|
|
// If the successor dawg is described in the constraints change
|
|
// the start ref from 0 to the one recorded as the constraint.
|
|
const DawgInfo &cinfo = constraints[c];
|
|
if (cinfo.dawg_index == sdawg_index) {
|
|
snode = sdawg->next_node(cinfo.ref);
|
|
// Make sure we do not search the successor dawg if after
|
|
// applying the saved constraint we are at the end of the word.
|
|
if (snode == 0) snode = NO_EDGE;
|
|
if (dawg_debug_level >= 3) {
|
|
tprintf("Applying constraint [%d, " REFFORMAT "]\n",
|
|
sdawg_index, snode);
|
|
}
|
|
}
|
|
}
|
|
// Look for the letter in this successor dawg.
|
|
EDGE_REF sedge = sdawg->edge_char_of(
|
|
snode, word->unichar_id(word_index), word_end);
|
|
// If we found the letter append sdawg to the active_dawgs list.
|
|
if (sedge != NO_EDGE &&
|
|
ConstraintsOk(*(dawg_args->updated_constraints), word_end,
|
|
dawgs_[sdawg_index]->type())) {
|
|
UpdatePermuter(sdawg->permuter(), ¤t_permuter);
|
|
if (sdawg->next_node(sedge) != 0) { // if not word end
|
|
dawg_args->updated_active_dawgs->add_unique(
|
|
DawgInfo(sdawg_index, sedge),
|
|
"Append successor to updated active dawgs: ");
|
|
}
|
|
}
|
|
} // end successors loop
|
|
} // end if/else
|
|
} // end for
|
|
// Update dawg_args->permuter if it used to be NO_PERM or if we found
|
|
// the current letter in a non-punctuation dawg. This allows preserving
|
|
// information on which dawg the "core" word came from.
|
|
if ((current_permuter == PUNC_PERM &&
|
|
current_permuter > dawg_args->permuter) ||
|
|
current_permuter != PUNC_PERM) {
|
|
dawg_args->permuter = current_permuter;
|
|
}
|
|
return dawg_args->permuter;
|
|
}
|
|
|
|
} // namespace tesseract
|