2007-03-08 04:03:40 +08:00
|
|
|
/******************************************************************************
|
|
|
|
** Filename: stopper.c
|
|
|
|
** Purpose: Stopping criteria for word classifier.
|
|
|
|
** Author: Dan Johnson
|
|
|
|
** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
|
|
|
|
**
|
|
|
|
** (c) Copyright Hewlett-Packard Company, 1988.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
******************************************************************************/
|
2010-11-24 02:34:14 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <math.h>
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "stopper.h"
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "ambigs.h"
|
|
|
|
#include "ccutil.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "const.h"
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "danerror.h"
|
2009-07-11 10:20:33 +08:00
|
|
|
#include "dict.h"
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "efio.h"
|
|
|
|
#include "helpers.h"
|
|
|
|
#include "matchdefs.h"
|
|
|
|
#include "pageres.h"
|
|
|
|
#include "params.h"
|
2009-07-11 10:20:33 +08:00
|
|
|
#include "ratngs.h"
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "scanutils.h"
|
|
|
|
#include "unichar.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-07-21 23:11:19 +08:00
|
|
|
#ifdef _MSC_VER
|
|
|
|
#pragma warning(disable:4244) // Conversion warnings
|
|
|
|
#pragma warning(disable:4800) // int/bool warnings
|
|
|
|
#endif
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
using tesseract::ScriptPos;
|
2014-09-13 04:41:19 +08:00
|
|
|
/*----------------------------------------------------------------------------
|
2010-11-24 02:34:14 +08:00
|
|
|
Private Code
|
2014-09-13 04:41:19 +08:00
|
|
|
----------------------------------------------------------------------------*/
|
2012-02-02 10:56:18 +08:00
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
|
|
|
|
XHeightConsistencyEnum xheight_consistency) {
|
2009-07-11 10:20:33 +08:00
|
|
|
float CertaintyThreshold = stopper_nondict_certainty_base;
|
2007-03-08 04:03:40 +08:00
|
|
|
int WordSize;
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
if (stopper_no_acceptable_choices) return false;
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
if (best_choice.length() == 0) return false;
|
2009-07-11 10:20:33 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
|
|
|
|
bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
|
|
|
|
bool is_case_ok = case_ok(best_choice, getUnicharset());
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
if (stopper_debug_level >= 1) {
|
|
|
|
const char *xht = "UNKNOWN";
|
|
|
|
switch (xheight_consistency) {
|
|
|
|
case XH_GOOD: xht = "NORMAL"; break;
|
|
|
|
case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
|
|
|
|
case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
|
|
|
|
default: xht = "UNKNOWN";
|
|
|
|
}
|
|
|
|
tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
|
|
|
|
best_choice.unichar_string().string(),
|
2010-11-24 02:34:14 +08:00
|
|
|
(is_valid_word ? 'y' : 'n'),
|
2013-09-23 23:26:50 +08:00
|
|
|
(is_case_ok ? 'y' : 'n'),
|
|
|
|
xht,
|
|
|
|
best_choice.min_x_height(),
|
|
|
|
best_choice.max_x_height());
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
// Do not accept invalid words in PASS1.
|
|
|
|
if (reject_offset_ <= 0.0f && !is_valid_word) return false;
|
|
|
|
if (is_valid_word && is_case_ok) {
|
2013-09-23 23:26:50 +08:00
|
|
|
WordSize = LengthOfShortestAlphaRun(best_choice);
|
2009-07-11 10:20:33 +08:00
|
|
|
WordSize -= stopper_smallword_size;
|
2007-03-08 04:03:40 +08:00
|
|
|
if (WordSize < 0)
|
|
|
|
WordSize = 0;
|
2009-07-11 10:20:33 +08:00
|
|
|
CertaintyThreshold += WordSize * stopper_certainty_per_char;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
if (stopper_debug_level >= 1)
|
2013-09-23 23:26:50 +08:00
|
|
|
tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
|
|
|
|
best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
if (no_dang_ambigs &&
|
2013-09-23 23:26:50 +08:00
|
|
|
best_choice.certainty() > CertaintyThreshold &&
|
|
|
|
xheight_consistency < XH_INCONSISTENT &&
|
|
|
|
UniformCertainties(best_choice)) {
|
2010-11-24 02:34:14 +08:00
|
|
|
return true;
|
2009-07-11 10:20:33 +08:00
|
|
|
} else {
|
2013-09-23 23:26:50 +08:00
|
|
|
if (stopper_debug_level >= 1) {
|
2010-11-24 02:34:14 +08:00
|
|
|
tprintf("AcceptableChoice() returned false"
|
2013-09-23 23:26:50 +08:00
|
|
|
" (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
|
|
|
|
no_dang_ambigs, best_choice.certainty(),
|
2010-11-24 02:34:14 +08:00
|
|
|
CertaintyThreshold,
|
2013-09-23 23:26:50 +08:00
|
|
|
UniformCertainties(best_choice));
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
return false;
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
// Returns true if the best choice of word should be kept rather than
// rejected.
//
// The word is rejected outright when it has no best choice, when the best
// choice is empty, or when word->best_choices holds more than one entry.
// Otherwise its certainty must exceed a threshold that starts at
// stopper_nondict_certainty_base - reject_offset_ and, for valid, well-cased
// words, is adjusted by stopper_certainty_per_char for every character of
// the shortest alpha run beyond stopper_smallword_size.
// stopper_no_acceptable_choices forces rejection.
bool Dict::AcceptableResult(WERD_RES* word) {
  if (word->best_choice == NULL) return false;

  WERD_CHOICE &best = *word->best_choice;
  float threshold = stopper_nondict_certainty_base - reject_offset_;

  if (stopper_debug_level >= 1) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
            best.debug_string().string(),
            (valid_word(best) ? 'y' : 'n'),
            (case_ok(best, getUnicharset()) ? 'y' : 'n'),
            best.dangerous_ambig_found() ? 'n' : 'y',
            word->best_choices.singleton() ? 'n' : 'y');
  }

  if (best.length() == 0 || !word->best_choices.singleton()) {
    return false;
  }

  if (valid_word(best) && case_ok(best, getUnicharset())) {
    // Only characters beyond the "small word" size move the threshold.
    int excess_chars = LengthOfShortestAlphaRun(best);
    excess_chars -= stopper_smallword_size;
    const int counted_chars = excess_chars > 0 ? excess_chars : 0;
    threshold += counted_chars * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1) {
    tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
            best.certainty(), threshold);
  }

  const bool accepted =
      best.certainty() > threshold && !stopper_no_acceptable_choices;
  if (stopper_debug_level >= 1) {
    if (accepted) {
      tprintf("ACCEPTED\n");
    } else {
      tprintf("REJECTED\n");
    }
  }
  return accepted;
}
|
|
|
|
|
|
|
|
bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
|
|
|
|
DANGERR *fixpt,
|
|
|
|
bool fix_replaceable,
|
2013-09-23 23:26:50 +08:00
|
|
|
MATRIX *ratings) {
|
2009-07-11 10:20:33 +08:00
|
|
|
if (stopper_debug_level > 2) {
|
|
|
|
tprintf("\nRunning NoDangerousAmbig() for %s\n",
|
2012-02-02 10:56:18 +08:00
|
|
|
best_choice->debug_string().string());
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
// Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
|
|
|
|
// for each unichar id in BestChoice.
|
|
|
|
BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
|
|
|
|
int i;
|
|
|
|
bool ambigs_found = false;
|
|
|
|
// For each position in best_choice:
|
|
|
|
// -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
|
|
|
|
// -- initialize wrong_ngram with a single unichar_id at best_choice[i]
|
|
|
|
// -- look for ambiguities corresponding to wrong_ngram in the list while
|
|
|
|
// adding the following unichar_ids from best_choice to wrong_ngram
|
|
|
|
//
|
|
|
|
// Repeat the above procedure twice: first time look through
|
|
|
|
// ambigs to be replaced and replace all the ambiguities found;
|
|
|
|
// second time look through dangerous ambiguities and construct
|
|
|
|
// ambig_blob_choices with fake a blob choice for each ambiguity
|
|
|
|
// and pass them to dawg_permute_and_select() to search for
|
|
|
|
// ambiguous words in the dictionaries.
|
|
|
|
//
|
|
|
|
// Note that during the execution of the for loop (on the first pass)
|
|
|
|
// if replacements are made the length of best_choice might change.
|
2010-11-24 02:34:14 +08:00
|
|
|
for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
|
|
|
|
bool replace = (fix_replaceable && pass == 0);
|
2009-07-11 10:20:33 +08:00
|
|
|
const UnicharAmbigsVector &table = replace ?
|
|
|
|
getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
|
|
|
|
if (!replace) {
|
|
|
|
// Initialize ambig_blob_choices with lists containing a single
|
|
|
|
// unichar id for the correspoding position in best_choice.
|
|
|
|
// best_choice consisting from only the original letters will
|
|
|
|
// have a rating of 0.0.
|
|
|
|
for (i = 0; i < best_choice->length(); ++i) {
|
|
|
|
BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
|
|
|
|
BLOB_CHOICE_IT lst_it(lst);
|
2013-09-23 23:26:50 +08:00
|
|
|
// TODO(rays/antonova) Put real xheights and y shifts here.
|
2009-07-11 10:20:33 +08:00
|
|
|
lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
|
2015-05-13 08:24:34 +08:00
|
|
|
0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
|
2009-07-11 10:20:33 +08:00
|
|
|
ambig_blob_choices.push_back(lst);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:20:33 +08:00
|
|
|
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
|
|
|
|
int wrong_ngram_index;
|
|
|
|
int next_index;
|
2010-11-24 02:34:14 +08:00
|
|
|
int blob_index = 0;
|
2013-09-23 23:26:50 +08:00
|
|
|
for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
|
|
|
|
++i) {
|
2009-07-11 10:20:33 +08:00
|
|
|
UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
|
|
|
|
if (stopper_debug_level > 2) {
|
|
|
|
tprintf("Looking for %s ngrams starting with %s:\n",
|
|
|
|
replace ? "replaceable" : "ambiguous",
|
|
|
|
getUnicharset().debug_str(curr_unichar_id).string());
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
int num_wrong_blobs = best_choice->state(i);
|
2009-07-11 10:20:33 +08:00
|
|
|
wrong_ngram_index = 0;
|
|
|
|
wrong_ngram[wrong_ngram_index] = curr_unichar_id;
|
|
|
|
if (curr_unichar_id == INVALID_UNICHAR_ID ||
|
|
|
|
curr_unichar_id >= table.size() ||
|
|
|
|
table[curr_unichar_id] == NULL) {
|
|
|
|
continue; // there is no ambig spec for this unichar id
|
|
|
|
}
|
|
|
|
AmbigSpec_IT spec_it(table[curr_unichar_id]);
|
|
|
|
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
|
|
|
|
const AmbigSpec *ambig_spec = spec_it.data();
|
|
|
|
wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
|
|
|
|
int compare = UnicharIdArrayUtils::compare(wrong_ngram,
|
|
|
|
ambig_spec->wrong_ngram);
|
|
|
|
if (stopper_debug_level > 2) {
|
|
|
|
tprintf("candidate ngram: ");
|
|
|
|
UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
|
|
|
|
tprintf("current ngram from spec: ");
|
|
|
|
UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
|
|
|
|
tprintf("comparison result: %d\n", compare);
|
|
|
|
}
|
|
|
|
if (compare == 0) {
|
2010-11-24 02:34:14 +08:00
|
|
|
// Record the place where we found an ambiguity.
|
|
|
|
if (fixpt != NULL) {
|
2013-09-23 23:26:50 +08:00
|
|
|
UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
|
2010-11-24 02:34:14 +08:00
|
|
|
fixpt->push_back(DANGERR_INFO(
|
2013-09-23 23:26:50 +08:00
|
|
|
blob_index, blob_index + num_wrong_blobs, replace,
|
|
|
|
getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
|
|
|
|
leftmost_id));
|
2010-11-24 02:34:14 +08:00
|
|
|
if (stopper_debug_level > 1) {
|
2013-09-23 23:26:50 +08:00
|
|
|
tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
|
|
|
|
blob_index + num_wrong_blobs, false,
|
2010-11-24 02:34:14 +08:00
|
|
|
getUnicharset().get_isngram(
|
2013-09-23 23:26:50 +08:00
|
|
|
ambig_spec->correct_ngram_id),
|
|
|
|
getUnicharset().id_to_unichar(leftmost_id));
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
if (replace) {
|
|
|
|
if (stopper_debug_level > 2) {
|
2013-09-23 23:26:50 +08:00
|
|
|
tprintf("replace ambiguity with %s : ",
|
|
|
|
getUnicharset().id_to_unichar(
|
|
|
|
ambig_spec->correct_ngram_id));
|
2009-07-11 10:20:33 +08:00
|
|
|
UnicharIdArrayUtils::print(
|
|
|
|
ambig_spec->correct_fragments, getUnicharset());
|
|
|
|
}
|
|
|
|
ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
|
|
|
|
ambig_spec->correct_ngram_id,
|
2013-09-23 23:26:50 +08:00
|
|
|
best_choice, ratings);
|
2009-07-11 10:20:33 +08:00
|
|
|
} else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
|
|
|
|
// We found dang ambig - update ambig_blob_choices.
|
|
|
|
if (stopper_debug_level > 2) {
|
|
|
|
tprintf("found ambiguity: ");
|
|
|
|
UnicharIdArrayUtils::print(
|
|
|
|
ambig_spec->correct_fragments, getUnicharset());
|
|
|
|
}
|
|
|
|
ambigs_found = true;
|
|
|
|
for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
|
|
|
|
++tmp_index) {
|
|
|
|
// Add a blob choice for the corresponding fragment of the
|
|
|
|
// ambiguity. These fake blob choices are initialized with
|
|
|
|
// negative ratings (which are not possible for real blob
|
|
|
|
// choices), so that dawg_permute_and_select() considers any
|
|
|
|
// word not consisting of only the original letters a better
|
|
|
|
// choice and stops searching for alternatives once such a
|
|
|
|
// choice is found.
|
|
|
|
BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
|
|
|
|
bc_it.add_to_end(new BLOB_CHOICE(
|
2010-11-24 02:34:14 +08:00
|
|
|
ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
|
2015-05-13 08:24:34 +08:00
|
|
|
-1, 0, 1, 0, BCC_AMBIG));
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
spec_it.forward();
|
|
|
|
} else if (compare == -1) {
|
|
|
|
if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
|
|
|
|
((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
|
|
|
|
// Add the next unichar id to wrong_ngram and keep looking for
|
|
|
|
// more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
|
|
|
|
wrong_ngram[++wrong_ngram_index] =
|
|
|
|
best_choice->unichar_id(next_index);
|
2013-09-23 23:26:50 +08:00
|
|
|
num_wrong_blobs += best_choice->state(next_index);
|
2009-07-11 10:20:33 +08:00
|
|
|
} else {
|
|
|
|
break; // no more matching ambigs in this AMBIG_SPEC_LIST
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
spec_it.forward();
|
|
|
|
}
|
|
|
|
} // end searching AmbigSpec_LIST
|
|
|
|
} // end searching best_choice
|
|
|
|
} // end searching replace and dangerous ambigs
|
2012-02-15 09:37:00 +08:00
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
// If any ambiguities were found permute the constructed ambig_blob_choices
|
|
|
|
// to see if an alternative dictionary word can be found.
|
|
|
|
if (ambigs_found) {
|
|
|
|
if (stopper_debug_level > 2) {
|
|
|
|
tprintf("\nResulting ambig_blob_choices:\n");
|
|
|
|
for (i = 0; i < ambig_blob_choices.length(); ++i) {
|
|
|
|
print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
|
|
|
|
tprintf("\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
|
|
|
|
ambigs_found = (alt_word->rating() < 0.0);
|
2010-11-24 02:34:14 +08:00
|
|
|
if (ambigs_found) {
|
|
|
|
if (stopper_debug_level >= 1) {
|
|
|
|
tprintf ("Stopper: Possible ambiguous word = %s\n",
|
2012-02-02 10:56:18 +08:00
|
|
|
alt_word->debug_string().string());
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
if (fixpt != NULL) {
|
|
|
|
// Note: Currently character choices combined from fragments can only
|
|
|
|
// be generated by NoDangrousAmbigs(). This code should be updated if
|
|
|
|
// the capability to produce classifications combined from character
|
|
|
|
// fragments is added to other functions.
|
|
|
|
int orig_i = 0;
|
|
|
|
for (i = 0; i < alt_word->length(); ++i) {
|
2013-09-23 23:26:50 +08:00
|
|
|
const UNICHARSET &uchset = getUnicharset();
|
2012-03-29 05:02:54 +08:00
|
|
|
bool replacement_is_ngram =
|
2013-09-23 23:26:50 +08:00
|
|
|
uchset.get_isngram(alt_word->unichar_id(i));
|
|
|
|
UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
|
|
|
|
if (replacement_is_ngram) {
|
|
|
|
// we have to extract the leftmost unichar from the ngram.
|
|
|
|
const char *str = uchset.id_to_unichar(leftmost_id);
|
|
|
|
int step = uchset.step(str);
|
|
|
|
if (step) leftmost_id = uchset.unichar_to_id(str, step);
|
2012-03-29 14:15:33 +08:00
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
int end_i = orig_i + alt_word->state(i);
|
|
|
|
if (alt_word->state(i) > 1 ||
|
|
|
|
(orig_i + 1 == end_i && replacement_is_ngram)) {
|
|
|
|
// Compute proper blob indices.
|
|
|
|
int blob_start = 0;
|
|
|
|
for (int j = 0; j < orig_i; ++j)
|
|
|
|
blob_start += best_choice->state(j);
|
|
|
|
int blob_end = blob_start;
|
|
|
|
for (int j = orig_i; j < end_i; ++j)
|
|
|
|
blob_end += best_choice->state(j);
|
|
|
|
fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
|
|
|
|
replacement_is_ngram, leftmost_id));
|
|
|
|
if (stopper_debug_level > 1) {
|
|
|
|
tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
|
|
|
|
true, replacement_is_ngram,
|
|
|
|
uchset.id_to_unichar(leftmost_id));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
orig_i += alt_word->state(i);
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
}
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
|
|
|
delete alt_word;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2012-02-02 10:56:18 +08:00
|
|
|
if (output_ambig_words_file_ != NULL) {
|
|
|
|
fprintf(output_ambig_words_file_, "\n");
|
|
|
|
}
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
ambig_blob_choices.delete_data_pointers();
|
|
|
|
return !ambigs_found;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
// No-op: the body is empty, so calling this has no effect.
// NOTE(review): presumably retained so existing callers keep compiling --
// confirm before removing.
void Dict::EndDangerousAmbigs() {}
|
|
|
|
|
|
|
|
// Prepares the stopper for pass 1 by zeroing reject_offset_. With a zero
// offset AcceptableChoice() refuses every word that is not a valid
// dictionary word, and AcceptableResult() uses the unshifted threshold.
void Dict::SettupStopperPass1() {
  reject_offset_ = 0.0f;
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
void Dict::SettupStopperPass2() {
|
|
|
|
reject_offset_ = stopper_phase2_certainty_rejection_offset;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
|
|
|
|
UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
|
2013-09-23 23:26:50 +08:00
|
|
|
MATRIX *ratings) {
|
2009-07-11 10:20:33 +08:00
|
|
|
int num_blobs_to_replace = 0;
|
|
|
|
int begin_blob_index = 0;
|
|
|
|
int i;
|
2013-09-23 23:26:50 +08:00
|
|
|
// Rating and certainty for the new BLOB_CHOICE are derived from the
|
|
|
|
// replaced choices.
|
|
|
|
float new_rating = 0.0f;
|
|
|
|
float new_certainty = 0.0f;
|
|
|
|
BLOB_CHOICE* old_choice = NULL;
|
2009-07-11 10:20:33 +08:00
|
|
|
for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
|
|
|
|
if (i >= wrong_ngram_begin_index) {
|
2013-09-23 23:26:50 +08:00
|
|
|
int num_blobs = werd_choice->state(i);
|
|
|
|
int col = begin_blob_index + num_blobs_to_replace;
|
|
|
|
int row = col + num_blobs - 1;
|
|
|
|
BLOB_CHOICE_LIST* choices = ratings->get(col, row);
|
|
|
|
ASSERT_HOST(choices != NULL);
|
|
|
|
old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
|
|
|
|
ASSERT_HOST(old_choice != NULL);
|
|
|
|
new_rating += old_choice->rating();
|
|
|
|
new_certainty += old_choice->certainty();
|
|
|
|
num_blobs_to_replace += num_blobs;
|
2009-07-11 10:20:33 +08:00
|
|
|
} else {
|
2013-09-23 23:26:50 +08:00
|
|
|
begin_blob_index += werd_choice->state(i);
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
new_certainty /= wrong_ngram_size;
|
|
|
|
// If there is no entry in the ratings matrix, add it.
|
|
|
|
MATRIX_COORD coord(begin_blob_index,
|
|
|
|
begin_blob_index + num_blobs_to_replace - 1);
|
|
|
|
if (!coord.Valid(*ratings)) {
|
|
|
|
ratings->IncreaseBandSize(coord.row - coord.col + 1);
|
|
|
|
}
|
|
|
|
if (ratings->get(coord.col, coord.row) == NULL)
|
|
|
|
ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
|
|
|
|
BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
|
|
|
|
BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
|
|
|
|
if (choice != NULL) {
|
|
|
|
// Already there. Upgrade if new rating better.
|
|
|
|
if (new_rating < choice->rating())
|
|
|
|
choice->set_rating(new_rating);
|
|
|
|
if (new_certainty < choice->certainty())
|
|
|
|
choice->set_certainty(new_certainty);
|
2013-10-10 10:07:26 +08:00
|
|
|
// DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
|
2013-09-23 23:26:50 +08:00
|
|
|
} else {
|
|
|
|
// Need a new choice with the correct_ngram_id.
|
|
|
|
choice = new BLOB_CHOICE(*old_choice);
|
|
|
|
choice->set_unichar_id(correct_ngram_id);
|
|
|
|
choice->set_rating(new_rating);
|
|
|
|
choice->set_certainty(new_certainty);
|
|
|
|
choice->set_classifier(BCC_AMBIG);
|
|
|
|
choice->set_matrix_cell(coord.col, coord.row);
|
2013-10-10 10:07:26 +08:00
|
|
|
BLOB_CHOICE_IT it (new_choices);
|
|
|
|
it.add_to_end(choice);
|
2013-09-23 23:26:50 +08:00
|
|
|
}
|
|
|
|
// Remove current unichar from werd_choice. On the last iteration
|
|
|
|
// set the correct replacement unichar instead of removing a unichar.
|
2009-07-11 10:20:33 +08:00
|
|
|
for (int replaced_count = 0; replaced_count < wrong_ngram_size;
|
|
|
|
++replaced_count) {
|
|
|
|
if (replaced_count + 1 == wrong_ngram_size) {
|
2013-09-23 23:26:50 +08:00
|
|
|
werd_choice->set_blob_choice(wrong_ngram_begin_index,
|
|
|
|
num_blobs_to_replace, choice);
|
2009-07-11 10:20:33 +08:00
|
|
|
} else {
|
2013-09-23 23:26:50 +08:00
|
|
|
werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
if (stopper_debug_level >= 1) {
|
2010-11-24 02:34:14 +08:00
|
|
|
werd_choice->print("ReplaceAmbig() ");
|
2009-07-11 10:20:33 +08:00
|
|
|
tprintf("Modified blob_choices: ");
|
2013-09-23 23:26:50 +08:00
|
|
|
print_ratings_list("\n", new_choices, getUnicharset());
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
|
2010-11-24 02:34:14 +08:00
|
|
|
int shortest = MAX_INT32;
|
|
|
|
int curr_len = 0;
|
|
|
|
for (int w = 0; w < WordChoice.length(); ++w) {
|
|
|
|
if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
|
|
|
|
curr_len++;
|
|
|
|
} else if (curr_len > 0) {
|
|
|
|
if (curr_len < shortest) shortest = curr_len;
|
|
|
|
curr_len = 0;
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (curr_len > 0 && curr_len < shortest) {
|
|
|
|
shortest = curr_len;
|
|
|
|
} else if (shortest == MAX_INT32) {
|
|
|
|
shortest = 0;
|
|
|
|
}
|
|
|
|
return shortest;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
int Dict::UniformCertainties(const WERD_CHOICE& word) {
|
2007-03-08 04:03:40 +08:00
|
|
|
float Certainty;
|
|
|
|
float WorstCertainty = MAX_FLOAT32;
|
|
|
|
float CertaintyThreshold;
|
|
|
|
FLOAT64 TotalCertainty;
|
|
|
|
FLOAT64 TotalCertaintySquared;
|
|
|
|
FLOAT64 Variance;
|
|
|
|
FLOAT32 Mean, StdDev;
|
2013-09-23 23:26:50 +08:00
|
|
|
int word_length = word.length();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
if (word_length < 3)
|
2010-11-24 02:34:14 +08:00
|
|
|
return true;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
TotalCertainty = TotalCertaintySquared = 0.0;
|
2013-09-23 23:26:50 +08:00
|
|
|
for (int i = 0; i < word_length; ++i) {
|
|
|
|
Certainty = word.certainty(i);
|
2007-03-08 04:03:40 +08:00
|
|
|
TotalCertainty += Certainty;
|
|
|
|
TotalCertaintySquared += Certainty * Certainty;
|
|
|
|
if (Certainty < WorstCertainty)
|
|
|
|
WorstCertainty = Certainty;
|
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Subtract off worst certainty from statistics.
|
2013-09-23 23:26:50 +08:00
|
|
|
word_length--;
|
2007-03-08 04:03:40 +08:00
|
|
|
TotalCertainty -= WorstCertainty;
|
|
|
|
TotalCertaintySquared -= WorstCertainty * WorstCertainty;
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
Mean = TotalCertainty / word_length;
|
|
|
|
Variance = ((word_length * TotalCertaintySquared -
|
2007-03-08 04:03:40 +08:00
|
|
|
TotalCertainty * TotalCertainty) /
|
2013-09-23 23:26:50 +08:00
|
|
|
(word_length * (word_length - 1)));
|
2007-03-08 04:03:40 +08:00
|
|
|
if (Variance < 0.0)
|
|
|
|
Variance = 0.0;
|
2013-10-10 10:07:26 +08:00
|
|
|
StdDev = sqrt(Variance);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
|
|
|
|
if (CertaintyThreshold > stopper_nondict_certainty_base)
|
|
|
|
CertaintyThreshold = stopper_nondict_certainty_base;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
if (word.certainty() < CertaintyThreshold) {
|
2009-07-11 10:20:33 +08:00
|
|
|
if (stopper_debug_level >= 1)
|
2013-09-23 23:26:50 +08:00
|
|
|
tprintf("Stopper: Non-uniform certainty = %4.1f"
|
2009-07-11 10:20:33 +08:00
|
|
|
" (m=%4.1f, s=%4.1f, t=%4.1f)\n",
|
2013-09-23 23:26:50 +08:00
|
|
|
word.certainty(), Mean, StdDev, CertaintyThreshold);
|
2010-11-24 02:34:14 +08:00
|
|
|
return false;
|
2009-07-11 10:20:33 +08:00
|
|
|
} else {
|
2010-11-24 02:34:14 +08:00
|
|
|
return true;
|
2009-07-11 10:20:33 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace tesseract
|