mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
aea5be1995
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@335 d0cd1f9f-072b-0410-8dd7-cf729c803f20
370 lines
15 KiB
C++
370 lines
15 KiB
C++
/* -*-C-*-
|
|
********************************************************************************
|
|
*
|
|
* File: permdawg.c (Formerly permdawg.c)
|
|
* Description: Scale word choices by a dictionary
|
|
* Author: Mark Seaman, OCR Technology
|
|
* Created: Fri Oct 16 14:37:00 1987
|
|
* Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
|
|
* Language: C
|
|
* Package: N/A
|
|
* Status: Reusable Software Component
|
|
*
|
|
* (c) Copyright 1987, Hewlett-Packard Company.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
*********************************************************************************/
|
|
/*----------------------------------------------------------------------
|
|
I n c l u d e s
|
|
----------------------------------------------------------------------*/
|
|
|
|
#include "context.h"
|
|
#include "conversion.h"
|
|
#include "cutil.h"
|
|
#include "dawg.h"
|
|
#include "freelist.h"
|
|
#include "globals.h"
|
|
#include "ndminx.h"
|
|
#include "permdawg.h"
|
|
#include "permute.h"
|
|
#include "stopper.h"
|
|
#include "tordvars.h"
|
|
#include "tprintf.h"
|
|
#include "varable.h"
|
|
|
|
#include <ctype.h>
|
|
#include "dict.h"
|
|
#include "image.h"
|
|
|
|
/*----------------------------------------------------------------------
|
|
V a r i a b l e s
|
|
----------------------------------------------------------------------*/
|
|
// When set, the functions in this file trace their decisions with tprintf().
BOOL_VAR(segment_dawg_debug, 0, "Debug mode for word segmentation");

// Multiplier applied by Dict::adjust_word() when Context::case_ok() rejects
// the word. Also the numerator of the rating margin that
// Dict::dawg_permute_and_select() hands to DawgArgs.
double_VAR(segment_penalty_dict_case_bad, OK_WERD,
           "Default score multiplier for word matches, which may have "
           "case issues (lower is better).");

// Multiplier applied by Dict::adjust_word() when the case is acceptable but
// the word was not found in the frequent-words dawg. Also the denominator of
// the rating margin that Dict::dawg_permute_and_select() hands to DawgArgs.
double_VAR(segment_penalty_dict_case_ok, GOOD_WERD,
           "Score multiplier for word matches that have good case "
           "(lower is better).");

// Multiplier applied by Dict::adjust_word() when the case is acceptable and
// the word is present in the frequent-words dawg.
double_VAR(segment_penalty_dict_frequent_word, FREQ_WERD,
           "Score multiplier for word matches which have good case and are "
           "frequent in the given language (lower is better).");
|
|
|
|
/*----------------------------------------------------------------------
|
|
F u n c t i o n s
|
|
----------------------------------------------------------------------*/
|
|
namespace tesseract {
|
|
|
|
// Constant slack added to the per-position pruning limit that
// Dict::go_deeper_dawg_fxn() computes in regular (non-ambigs) mode.
static const float kPermDawgRatingPad = 5.0f;
|
|
|
|
/**********************************************************************
|
|
* adjust_word
|
|
*
|
|
* Assign an adjusted value to a string that is a word. The value
|
|
* that this word choice has is based on case and punctuation rules.
|
|
**********************************************************************/
|
|
void Dict::adjust_word(WERD_CHOICE *word,
|
|
float *certainty_array) {
|
|
float adjust_factor;
|
|
float new_rating = word->rating();
|
|
|
|
if (segment_dawg_debug) {
|
|
tprintf("Word: %s %4.2f ",
|
|
word->debug_string(getUnicharset()).string(), word->rating());
|
|
}
|
|
|
|
new_rating += RATING_PAD;
|
|
if (Context::case_ok(*word, getUnicharset())) {
|
|
if (freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
|
|
word->set_permuter(FREQ_DAWG_PERM);
|
|
new_rating *= segment_penalty_dict_frequent_word;
|
|
adjust_factor = segment_penalty_dict_frequent_word;
|
|
if (segment_dawg_debug)
|
|
tprintf(", F, %4.2f ", (double)segment_penalty_dict_frequent_word);
|
|
} else {
|
|
new_rating *= segment_penalty_dict_case_ok;
|
|
adjust_factor = segment_penalty_dict_case_ok;
|
|
if (segment_dawg_debug)
|
|
tprintf(", %4.2f ", (double)segment_penalty_dict_case_ok);
|
|
}
|
|
} else {
|
|
new_rating *= segment_penalty_dict_case_bad;
|
|
adjust_factor = segment_penalty_dict_case_bad;
|
|
if (segment_dawg_debug) {
|
|
tprintf(", C %4.2f ", (double)segment_penalty_dict_case_bad);
|
|
}
|
|
}
|
|
new_rating -= RATING_PAD;
|
|
word->set_rating(new_rating);
|
|
|
|
LogNewChoice(*word, adjust_factor, certainty_array, false);
|
|
|
|
if (segment_dawg_debug)
|
|
tprintf(" --> %4.2f\n", new_rating);
|
|
}
|
|
|
|
/**********************************************************************
|
|
* go_deeper_dawg_fxn
|
|
*
|
|
* If the choice being composed so far could be a dictionary word
|
|
* keep exploring choices.
|
|
**********************************************************************/
|
|
void Dict::go_deeper_dawg_fxn(
|
|
const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
|
int char_choice_index,
|
|
const CHAR_FRAGMENT_INFO *prev_char_frag_info,
|
|
bool word_ending, WERD_CHOICE *word, float certainties[],
|
|
float *limit, WERD_CHOICE *best_choice, void *void_more_args) {
|
|
DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
|
|
int word_index = word->length() - 1;
|
|
|
|
// There are two modes for deciding whether to go deeper: regular dawg
|
|
// permuter mode and the special ambigs mode. If *limit is <= 0.0 the
|
|
// function switches to the ambigs mode (this is the case when
|
|
// dawg_permute_and_select() function is called from NoDangerousAmbigs()) and
|
|
// only searches for the first choice that has a rating better than *limit
|
|
// (in this case ratings are fake, since the real ratings can not be < 0).
|
|
// Modification of the hyphen state is turned off in the ambigs mode.
|
|
// When in the regular dawg permuter mode, the function explores all the
|
|
// possible words and chooses the one with the best rating. The letters with
|
|
// ratings that are far worse than the ones seen so far are pruned out.
|
|
bool ambigs_mode = (*limit <= 0.0);
|
|
if (ambigs_mode) {
|
|
if (best_choice->rating() < *limit) return;
|
|
} else {
|
|
// Prune bad subwords
|
|
if (more_args->rating_array[word_index] == NO_RATING) {
|
|
more_args->rating_array[word_index] = word->rating();
|
|
} else {
|
|
float permdawg_limit = more_args->rating_array[word_index] *
|
|
more_args->rating_margin + kPermDawgRatingPad;
|
|
if (permdawg_limit < word->rating()) {
|
|
if (segment_dawg_debug) {
|
|
tprintf("early pruned word rating=%4.2f,"
|
|
" permdawg_limit=%4.2f, word=%s\n", word->rating(),
|
|
permdawg_limit, word->debug_string(getUnicharset()).string());
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
// Deal with hyphens
|
|
if (word_ending && has_hyphen_end(*word) && !ambigs_mode) {
|
|
if (segment_dawg_debug)
|
|
tprintf("new hyphen choice = %s\n",
|
|
word->debug_string(getUnicharset()).string());
|
|
word->set_permuter(more_args->permuter);
|
|
adjust_word(word, certainties);
|
|
set_hyphen_word(*word, *(more_args->active_dawgs),
|
|
*(more_args->constraints));
|
|
update_best_choice(*word, best_choice);
|
|
} else { // Look up char in DAWG
|
|
// TODO(daria): update the rest of the code that specifies alternative
|
|
// letter_is_okay_ functions (e.g. TessCharNgram class) to work with
|
|
// multi-byte unichars and/or unichar ids.
|
|
|
|
// If the current unichar is an ngram first try calling
|
|
// letter_is_okay() for each unigram it contains separately.
|
|
UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
|
|
bool checked_unigrams = false;
|
|
if (getUnicharset().get_isngram(orig_uch_id)) {
|
|
if (segment_dawg_debug) {
|
|
tprintf("checking unigrams in an ngram %s\n",
|
|
getUnicharset().debug_str(orig_uch_id).string());
|
|
}
|
|
int orig_num_fragments = word->fragment_length(word_index);
|
|
int num_unigrams = 0;
|
|
word->remove_last_unichar_id();
|
|
const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
|
|
const char *ngram_str_end = ngram_str + strlen(ngram_str);
|
|
const char *ngram_ptr = ngram_str;
|
|
bool unigrams_ok = true;
|
|
// Construct DawgArgs that reflect the current state.
|
|
DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
|
|
DawgInfoVector unigram_constraints = *(more_args->constraints);
|
|
DawgInfoVector unigram_updated_active_dawgs;
|
|
DawgInfoVector unigram_updated_constraints;
|
|
DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints,
|
|
&unigram_updated_active_dawgs,
|
|
&unigram_updated_constraints, 0.0);
|
|
unigram_dawg_args.permuter = more_args->permuter;
|
|
// Check unigrams in the ngram with letter_is_okay().
|
|
while (unigrams_ok && ngram_ptr < ngram_str_end) {
|
|
int step = getUnicharset().step(ngram_ptr);
|
|
UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
|
|
getUnicharset().unichar_to_id(ngram_ptr, step);
|
|
ngram_ptr += step;
|
|
++num_unigrams;
|
|
word->append_unichar_id(uch_id, 1, 0.0, 0.0);
|
|
unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
|
|
&unigram_dawg_args, word_index+num_unigrams-1, word,
|
|
word_ending && (ngram_ptr == ngram_str_end));
|
|
(*unigram_dawg_args.active_dawgs) =
|
|
*(unigram_dawg_args.updated_active_dawgs);
|
|
(*unigram_dawg_args.constraints) =
|
|
*(unigram_dawg_args.updated_constraints);
|
|
if (segment_dawg_debug) {
|
|
tprintf("unigram %s is %s\n",
|
|
getUnicharset().debug_str(uch_id).string(),
|
|
unigrams_ok ? "OK" : "not OK");
|
|
}
|
|
}
|
|
// Restore the word and copy the updated dawg state if needed.
|
|
while (num_unigrams-- > 0) word->remove_last_unichar_id();
|
|
word->append_unichar_id_space_allocated(
|
|
orig_uch_id, orig_num_fragments, 0.0, 0.0);
|
|
if (unigrams_ok) {
|
|
checked_unigrams = true;
|
|
more_args->permuter = unigram_dawg_args.permuter;
|
|
*(more_args->updated_active_dawgs) =
|
|
*(unigram_dawg_args.updated_active_dawgs);
|
|
*(more_args->updated_constraints) =
|
|
*(unigram_dawg_args.updated_constraints);
|
|
}
|
|
}
|
|
|
|
// Check which dawgs from dawgs_ vector contain the word
|
|
// up to and including the current unichar.
|
|
if (checked_unigrams ||
|
|
(this->*letter_is_okay_)(more_args, word_index, word, word_ending)) {
|
|
// Add a new word choice
|
|
if (word_ending) {
|
|
if (segment_dawg_debug) {
|
|
tprintf("found word = %s\n",
|
|
word->debug_string(getUnicharset()).string());
|
|
}
|
|
WERD_CHOICE *adjusted_word = word;
|
|
WERD_CHOICE hyphen_tail_word;
|
|
if (!ambigs_mode && hyphen_base_size() > 0) {
|
|
hyphen_tail_word = *word;
|
|
remove_hyphen_head(&hyphen_tail_word);
|
|
adjusted_word = &hyphen_tail_word;
|
|
}
|
|
adjusted_word->set_permuter(more_args->permuter);
|
|
if (!ambigs_mode) {
|
|
adjust_word(adjusted_word, &certainties[hyphen_base_size()]);
|
|
}
|
|
update_best_choice(*adjusted_word, best_choice);
|
|
} else { // search the next letter
|
|
// Make updated_* point to the next entries in the DawgInfoVector
|
|
// arrays (that were originally created in dawg_permute_and_select)
|
|
++(more_args->updated_active_dawgs);
|
|
++(more_args->updated_constraints);
|
|
// Make active_dawgs and constraints point to the updated ones.
|
|
++(more_args->active_dawgs);
|
|
++(more_args->constraints);
|
|
permute_choices(debug, char_choices, char_choice_index + 1,
|
|
prev_char_frag_info, word, certainties, limit,
|
|
best_choice, more_args);
|
|
// Restore previous state to explore another letter in this position.
|
|
--(more_args->updated_active_dawgs);
|
|
--(more_args->updated_constraints);
|
|
--(more_args->active_dawgs);
|
|
--(more_args->constraints);
|
|
}
|
|
} else {
|
|
if (segment_dawg_debug) {
|
|
tprintf("last unichar not OK at index %d in %s\n",
|
|
word_index, word->debug_string(getUnicharset()).string());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**********************************************************************
|
|
* dawg_permute_and_select
|
|
*
|
|
* Recursively explore all the possible character combinations in
|
|
* the given char_choices. Use go_deeper_dawg_fxn() to search all the
|
|
* dawgs in the dawgs_ vector in parallel and discard invalid words.
|
|
*
|
|
* Allocate and return a WERD_CHOICE with the best valid word found.
|
|
* **********************************************************************/
|
|
WERD_CHOICE *Dict::dawg_permute_and_select(
|
|
const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
|
|
WERD_CHOICE *best_choice = new WERD_CHOICE();
|
|
best_choice->make_bad();
|
|
best_choice->set_rating(rating_limit);
|
|
if (char_choices.length() == 0) return best_choice;
|
|
DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
|
|
DawgInfoVector *constraints = new DawgInfoVector[char_choices.length() + 1];
|
|
init_active_dawgs(&(active_dawgs[0]));
|
|
init_constraints(&(constraints[0]));
|
|
DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
|
|
&(active_dawgs[1]), &(constraints[1]),
|
|
(segment_penalty_dict_case_bad /
|
|
segment_penalty_dict_case_ok));
|
|
WERD_CHOICE word(MAX_WERD_LENGTH);
|
|
copy_hyphen_info(&word);
|
|
// Discard rating and certainty of the hyphen base (if any).
|
|
word.set_rating(0.0);
|
|
word.set_certainty(0.0);
|
|
if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
|
|
delete[] active_dawgs;
|
|
delete[] constraints;
|
|
return best_choice; // the word is too long to permute
|
|
}
|
|
float certainties[MAX_WERD_LENGTH];
|
|
this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
|
|
permute_choices(segment_dawg_debug ? "segment_dawg_debug" : NULL,
|
|
char_choices, 0, NULL, &word, certainties,
|
|
&rating_limit, best_choice, &dawg_args);
|
|
delete[] active_dawgs;
|
|
delete[] constraints;
|
|
return best_choice;
|
|
}
|
|
|
|
// Fill the given active_dawgs vector with dawgs that could contain the
|
|
// beginning of the word. If hyphenated() returns true, copy the entries
|
|
// from hyphen_active_dawgs_ instead.
|
|
void Dict::init_active_dawgs(DawgInfoVector *active_dawgs) {
|
|
int i;
|
|
if (hyphenated()) {
|
|
*active_dawgs = hyphen_active_dawgs_;
|
|
if (dawg_debug_level >= 3) {
|
|
for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
|
|
tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
|
|
hyphen_active_dawgs_[i].dawg_index,
|
|
hyphen_active_dawgs_[i].ref);
|
|
}
|
|
}
|
|
} else {
|
|
for (i = 0; i < dawgs_.length(); ++i) {
|
|
if (kBeginningDawgsType[(dawgs_[i])->type()]) {
|
|
*active_dawgs += DawgInfo(i, NO_EDGE);
|
|
if (dawg_debug_level >= 3) {
|
|
tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// If hyphenated() returns true, copy the entries from hyphen_constraints_
|
|
// into the given constraints vector.
|
|
void Dict::init_constraints(DawgInfoVector *constraints) {
|
|
if (hyphenated()) {
|
|
*constraints = hyphen_constraints_;
|
|
if (dawg_debug_level >= 3) {
|
|
for (int i = 0; i < hyphen_constraints_.size(); ++i) {
|
|
tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n",
|
|
hyphen_constraints_[i].dawg_index,
|
|
hyphen_constraints_[i].ref);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace tesseract
|