2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* File: reject.cpp (Formerly reject.c)
|
|
|
|
* Description: Rejection functions used in tessedit
|
|
|
|
* Author: Phil Cheatle
|
|
|
|
* Created: Wed Sep 23 16:50:21 BST 1992
|
|
|
|
*
|
|
|
|
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
2010-05-28 20:03:45 +08:00
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
#include "config_auto.h"
|
|
|
|
#endif
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "mfcpch.h"
|
|
|
|
#include "tessvars.h"
|
|
|
|
#ifdef __UNIX__
|
|
|
|
#include <assert.h>
|
|
|
|
#include <errno.h>
|
|
|
|
#endif
|
|
|
|
#include "scanutils.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <string.h>
|
|
|
|
//#include "tessbox.h"
|
|
|
|
#include "memry.h"
|
|
|
|
#include "reject.h"
|
|
|
|
#include "tfacep.h"
|
|
|
|
#include "mainblk.h"
|
|
|
|
#include "charcut.h"
|
|
|
|
#include "imgs.h"
|
|
|
|
#include "scaleimg.h"
|
|
|
|
#include "control.h"
|
|
|
|
#include "docqual.h"
|
|
|
|
#include "secname.h"
|
2007-07-18 09:15:07 +08:00
|
|
|
#include "globals.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/* #define SECURE_NAMES done in secnames.h when necessary */
|
|
|
|
|
|
|
|
//extern "C" {
|
|
|
|
#include "callnet.h"
|
|
|
|
//}
|
2009-07-11 10:03:51 +08:00
|
|
|
#include "tesseractclass.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "notdll.h"
|
|
|
|
|
|
|
|
CLISTIZEH (STRING)
CLISTIZE (STRING)

#define EXTERN

/* Top-level selection of the rejection / acceptance algorithms. */
EXTERN INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");
EXTERN INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");
EXTERN BOOL_VAR (tessedit_use_nn, FALSE, "");
EXTERN BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");
EXTERN BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
EXTERN BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
EXTERN double_VAR (tessedit_lower_flip_hyphen, 1.5,
                   "Aspect ratio dot/hyphen test");
EXTERN double_VAR (tessedit_upper_flip_hyphen, 1.8,
                   "Aspect ratio dot/hyphen test");

/* Controls for the 1/I/l conflict detector. */
EXTERN BOOL_VAR (rej_trust_doc_dawg, FALSE,
                 "Use DOC dawg in 11l conf. detector");
EXTERN BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
EXTERN BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");

/* Neural-net (NN) related switches. */
EXTERN BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
EXTERN BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");
EXTERN BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");
EXTERN BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
EXTERN BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
EXTERN BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
EXTERN BOOL_VAR (nn_conf_double_check_dict, TRUE,
                 "Double check for confusions");
EXTERN BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
EXTERN BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
EXTERN BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
EXTERN BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
EXTERN BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
EXTERN BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
EXTERN BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
                 "Require stronger NN match");
EXTERN double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
EXTERN INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
EXTERN INT_VAR (nn_conf_initial_i_level, 3,
                "NN accept initial Ii match level ");

/* Un-rejection limits and individual rejection controls. */
EXTERN BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
EXTERN BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
EXTERN BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
EXTERN BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
EXTERN BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
EXTERN BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
EXTERN BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
EXTERN BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");

EXTERN double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
                   "if >this fract");
EXTERN INT_VAR (rej_mostly_reject_mode, 1,
                "0-never, 1-afterNN, 2-after new xht");
EXTERN double_VAR (tessed_fullstop_aspect_ratio, 1.2,
                   "if >this fract then reject");

/* Geometry of the NN input image and the image-edge rejection margin. */
EXTERN INT_VAR (net_image_width, 40, "NN input image width");
EXTERN INT_VAR (net_image_height, 36, "NN input image height");
EXTERN INT_VAR (net_image_x_height, 22, "NN input image x_height");
EXTERN INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");

/*
  Net input is assumed to have (net_image_width * net_image_height) input
  units of image pixels, followed by 0, 1, or N units representing the
  baseline position. 0 implies no baseline information. 1 implies a floating
  point value. N implies a "gauge" of N units. For any char an initial set
  of these are ON, the remainder OFF to indicate the "level" of the
  baseline.

  HOWEVER!!! NOTE THAT EACH NEW INPUT LAYER FORMAT EXPECTS TO BE RUN WITH A
  DIFFERENT tessed/netmatch/nmatch.c MODULE. - These are classic C modules
  generated by aspirin with HARD CODED CONSTANTS
*/

EXTERN INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");

EXTERN double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
EXTERN double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");

/* NOTE - ctoh doesn't handle "=" properly, hence \075 */
EXTERN STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
                   "Allow NN to unrej");
EXTERN STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
                   "Allow NN to unrej");

/* Character sets treated as mutually confusable or unreliable. */
EXTERN STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
EXTERN STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
EXTERN STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
EXTERN STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
                   "Unreliable chars");
EXTERN STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
                   "Unreliable chars");

EXTERN INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* set_done()
|
|
|
|
*
|
|
|
|
* Set the done flag based on the word acceptability criteria
|
|
|
|
*************************************************************************/
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
namespace tesseract {
|
|
|
|
void Tesseract::set_done(  // set done flag
                         WERD_RES *word,
                         inT16 pass) {
  /*
    tessedit_ok_mode selects the acceptance rule:
      0: Original heuristic used in Tesseract and Ray's prototype Resaljet
      1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
      2: as 1 + only accept dict words or numerics in pass 1
      3: as 2 + only accept dict words or numerics in pass 2 as well
      4: as 2 + reject dict ambigs in pass 1
      5: as 3 + reject dict ambigs in both passes
    Any other value is reported and handed to err_exit().
  */
  const int ok_mode = tessedit_ok_mode;

  if (ok_mode < 0 || ok_mode > 5) {
    tprintf ("BAD tessedit_ok_mode\n");
    err_exit();
    return;
  }

  if (ok_mode == 0) {
    /* NOTE - done even if word contains some or all spaces !!! */
    word->done = word->tess_accepted;
    return;
  }

  // Modes 1-5 share this base: accepted by tess and free of blanks ...
  word->done = word->tess_accepted &&
    (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);

  // ... and, on the first pass only, free of I/l/1 conflicts.
  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
    word->done = FALSE;

  if (ok_mode == 1)
    return;

  // Modes 2 and 4 apply the permuter veto on pass 1 only; 3 and 5 on every
  // pass. Modes 4 and 5 additionally veto ambiguous dictionary words.
  const bool veto_pass1_only = (ok_mode == 2) || (ok_mode == 4);
  const bool veto_ambigs = (ok_mode == 4) || (ok_mode == 5);

  if (!word->done)
    return;
  if (veto_pass1_only && (pass != 1))
    return;

  const int permuter = word->best_choice->permuter ();
  const bool untrusted_permuter = (permuter != SYSTEM_DAWG_PERM) &&
                                  (permuter != FREQ_DAWG_PERM) &&
                                  (permuter != USER_DAWG_PERM) &&
                                  (permuter != NUMBER_PERM);

  // test_ambig_word() is only consulted when the permuter itself is trusted.
  if (untrusted_permuter || (veto_ambigs && test_ambig_word (word))) {
#ifndef SECURE_NAMES
    if (tessedit_rejection_debug)
      tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
               word->best_choice->unichar_string().string ());
#endif
    word->done = FALSE;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* make_reject_map()
|
|
|
|
*
|
|
|
|
* Sets the done flag to indicate whether the result is acceptable.
|
|
|
|
*
|
|
|
|
* Sets a reject map for the word.
|
|
|
|
*************************************************************************/
|
2009-07-11 10:03:51 +08:00
|
|
|
void Tesseract::make_reject_map(  // make rej map for wd
                                WERD_RES *word,
                                BLOB_CHOICE_LIST_CLIST *blob_choices,  // detailed results
                                ROW *row,
                                inT16 pass  // 1st or 2nd?
                               ) {
  flip_0O(word);
  check_debug_pt (word, -1);  // For trap only
  set_done(word, pass);       // Set acceptance

  // One reject-map entry per character of the best choice.
  word->reject_map.initialise (word->best_choice->unichar_lengths().length ());
  reject_blanks(word);

  const int reject_mode = tessedit_reject_mode;
  switch (reject_mode) {
    case 0:
      /*
        0: Rays original heuristic - the baseline
      */
      if (!word->done)
        reject_poor_matches(word, blob_choices);
      break;

    case 5:
      /*
        5: Reject I/1/l from words where there is no strong contextual
           confirmation; the whole of any unacceptable words (incl PERM rej
           of dubious 1/I/ls); and the whole of any words which are very small
      */
      if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels) {
        word->reject_map.rej_word_small_xht ();
      }
      else {
        one_ell_conflict(word, TRUE);
        /*
          Originally the code here just used the done flag. Now I have
          duplicated and unpacked the conditions for setting the done flag so
          that each mechanism can be turned on or off independently. This
          works WITHOUT affecting the done flag setting.
        */
        if (rej_use_tess_accepted && !word->tess_accepted)
          word->reject_map.rej_word_not_tess_accepted ();

        if (rej_use_tess_blanks &&
            (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
          word->reject_map.rej_word_contains_blanks ();

        if (rej_use_good_perm) {
          const int permuter = word->best_choice->permuter ();
          const bool dawg_permuter = (permuter == SYSTEM_DAWG_PERM) ||
                                     (permuter == FREQ_DAWG_PERM) ||
                                     (permuter == USER_DAWG_PERM);

          if (dawg_permuter &&
              (!rej_use_sensible_wd ||
               (acceptable_word_string
                  (word->best_choice->unichar_string().string (),
                   word->best_choice->unichar_lengths().string ()) !=
                AC_UNACCEPTABLE))) {
            // PASSED TEST
          }
          else if (permuter == NUMBER_PERM) {
            if (rej_alphas_in_number_perm) {
              // Walk the characters of the best choice; lengths[n] is the
              // byte length of character n within text.
              const char *text = word->best_choice->unichar_string().string ();
              const char *lengths =
                word->best_choice->unichar_lengths().string ();
              int ch_index = 0;
              int byte_offset = 0;
              while (text[byte_offset] != '\0') {
                if (word->reject_map[ch_index].accepted () &&
                    unicharset.get_isalpha(text + byte_offset,
                                           lengths[ch_index]))
                  word->reject_map[ch_index].setrej_bad_permuter ();
                  // rej alpha
                byte_offset += lengths[ch_index];
                ++ch_index;
              }
            }
          }
          else {
            word->reject_map.rej_word_bad_permuter ();
          }
        }

        /* Ambig word rejection was here once !!*/
      }
      break;

    default:
      tprintf ("BAD tessedit_reject_mode\n");
      err_exit();
      break;
  }

  if (tessedit_image_border > -1)
    reject_edge_blobs(word);

  check_debug_pt (word, 10);
  if (tessedit_rejection_debug) {
    tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
    tprintf ("Certainty: %f Rating: %f\n",
             word->best_choice->certainty (), word->best_choice->rating ());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  /* Un-reject any rejected characters if NN permits */
  if (tessedit_use_nn && (pass == 2) &&
      word->reject_map.recoverable_rejects ())
    nn_recover_rejects(word, row);

  flip_hyphens(word);
  check_debug_pt (word, 20);
}
|
2009-07-11 10:03:51 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
 * reject_blanks()
 *
 * Flag as a tess failure every character of the best choice whose first
 * byte is a space.
 *************************************************************************/
void reject_blanks(WERD_RES *word) {
  inT16 ch_index = 0;     // index into the reject map / lengths string
  inT16 byte_offset = 0;  // byte position of that character in the text

  while (word->best_choice->unichar_string()[byte_offset] != '\0') {
    const bool is_blank =
      word->best_choice->unichar_string()[byte_offset] == ' ';
    if (is_blank) {
      // rej unrecognised blobs
      word->reject_map[ch_index].setrej_tess_failure ();
    }
    byte_offset += word->best_choice->unichar_lengths()[ch_index];
    ch_index += 1;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
 * reject_I_1_L()
 *
 * Flag as a 1Il conflict every character of the best choice whose first
 * byte belongs to conflict_set_I_l_1.
 *************************************************************************/
void reject_I_1_L(WERD_RES *word) {
  inT16 ch_index = 0;     // index into the reject map / lengths string
  inT16 byte_offset = 0;  // byte position of that character in the text

  while (word->best_choice->unichar_string()[byte_offset] != '\0') {
    const char first_byte = word->best_choice->unichar_string()[byte_offset];
    if (STRING (conflict_set_I_l_1).contains (first_byte)) {
      // rej 1Il conflict
      word->reject_map[ch_index].setrej_1Il_conflict ();
    }
    byte_offset += word->best_choice->unichar_lengths()[ch_index];
    ch_index += 1;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
 * reject_poor_matches()
 *
 * Walk the per-blob choice lists alongside the characters of the best
 * choice and reject:
 *   - blobs whose character is a blank or whose choice list is empty
 *     (tess failure), and
 *   - blobs whose top choice certainty is below the threshold returned by
 *     compute_reject_threshold() (poor match).
 *
 * word          word whose reject_map is updated.
 * blob_choices  one choice list per blob; must have as many entries as the
 *               best choice has characters and as the outword has blobs
 *               (both enforced with ASSERT_HOST).
 *************************************************************************/
void reject_poor_matches(  // detailed results
                         WERD_RES *word,
                         BLOB_CHOICE_LIST_CLIST *blob_choices) {
  float threshold;
  inT16 i = 0;       // character / blob index
  inT16 offset = 0;  // byte offset of character i in the unichar string
  // super iterator
  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
  BLOB_CHOICE_IT choice_it;  // real iterator

  // Number of characters in the best choice. Held as an int so it can be
  // printed with %d and compared with the list lengths without mixing
  // signed and unsigned operands.
  const int num_chars =
    static_cast<int>(strlen (word->best_choice->unichar_lengths().string ()));

#ifndef SECURE_NAMES
  if (num_chars != list_it.length ()) {
    tprintf
      ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
       word->best_choice->unichar_string().string (),
       num_chars, list_it.length (),
       word->outword->blob_list ()->length ());
  }
#endif
  ASSERT_HOST (num_chars == list_it.length ());
  ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
  threshold = compute_reject_threshold (blob_choices);

  // The offset must advance by the length of the character just processed,
  // so it is updated BEFORE the index is incremented. (Incrementing first
  // would add the next character's length and mis-place every later
  // character as soon as the word mixes characters of different byte
  // lengths.)
  for (list_it.mark_cycle_pt ();
       !list_it.cycled_list ();
       list_it.forward (),
       offset += word->best_choice->unichar_lengths()[i], i++) {
    /* NB - only compares the threshold against the TOP choice char in the
      choices list for a blob !! - the selected one may be below the threshold
    */
    choice_it.set_to_list (list_it.data ());
    if ((word->best_choice->unichar_string()[offset] == ' ') ||
        (choice_it.length () == 0))
      // rej unrecognised blobs
      word->reject_map[i].setrej_tess_failure ();
    else if (choice_it.data ()->certainty () < threshold)
      // rej poor score blob
      word->reject_map[i].setrej_poor_match ();
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* compute_reject_threshold
|
|
|
|
*
|
|
|
|
* Set a rejection threshold for this word.
|
|
|
|
* Initially this is a trivial function which looks for the largest
|
|
|
|
* gap in the certainty value.
|
|
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
float compute_reject_threshold(  // compute threshold
                               BLOB_CHOICE_LIST_CLIST *blob_choices  // detailed results
                              ) {
  inT16 index;              // to ratings
  inT16 blob_count;         // no of blobs in word
  inT16 ok_blob_count = 0;  // non TESS rej blobs in word
  float *ratings;           // array of confidences
  float threshold;          // rejection threshold
  float bestgap;            // biggest gap
  float gapstart;           // bottom of gap
  // super iterator
  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
  BLOB_CHOICE_IT choice_it;  // real iterator

  blob_count = blob_choices->length ();
  if (blob_count <= 0) {
    // Nothing to examine: avoid a zero-sized allocation and the read of
    // ratings[0] below. The returned value is a placeholder.
    return 0.0f;
  }

  // Collect the top-choice certainty of every blob that has any choices.
  ratings = (float *) alloc_mem (blob_count * sizeof (float));
  for (list_it.mark_cycle_pt (), index = 0;
       !list_it.cycled_list (); list_it.forward (), index++) {
    choice_it.set_to_list (list_it.data ());
    if (choice_it.length () > 0) {
      ratings[ok_blob_count] = choice_it.data ()->certainty ();
      // get in an array
      // tprintf("Rating[%d]=%c %g %g\n",
      //         index,choice_it.data()->char_class(),
      //         choice_it.data()->rating(),choice_it.data()->certainty());
      ok_blob_count++;
    }
  }
  ASSERT_HOST (index == blob_count);

  if (ok_blob_count == 0) {
    // No blob had a choice, so ratings[] was never written; reading
    // ratings[0] would use uninitialised memory. reject_poor_matches() only
    // compares the threshold against blobs that do have choices, so the
    // placeholder is never consulted there.
    // NOTE(review): confirm that any other callers behave the same way.
    free_mem(ratings);
    return 0.0f;
  }

  qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
  // sort them

  bestgap = 0;
  gapstart = ratings[0] - 1;  // all reject if none better
  if (ok_blob_count >= 3) {
    // With three or more usable blobs, place the threshold in the middle of
    // the largest gap between adjacent sorted certainties.
    for (index = 0; index < ok_blob_count - 1; index++) {
      if (ratings[index + 1] - ratings[index] > bestgap) {
        bestgap = ratings[index + 1] - ratings[index];
        // find biggest
        gapstart = ratings[index];
      }
    }
  }
  threshold = gapstart + bestgap / 2;
  // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
  //         ratings[0],ratings[index],bestgap,threshold);

  free_mem(ratings);
  return threshold;
}
|
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* reject_edge_blobs()
|
|
|
|
*
|
|
|
|
* If the word is perilously close to the edge of the image, reject those blobs
|
|
|
|
* in the word which are too close to the edge as they could be clipped.
|
|
|
|
*************************************************************************/
|
|
|
|
|
|
|
|
void reject_edge_blobs(WERD_RES *word) {
  TBOX word_box = word->word->bounding_box ();
  PBLOB_IT blob_it = word->outword->blob_list ();  // blobs

  // Is any side of the word within tessedit_image_border of the page edge?
  const bool word_near_edge =
    (word_box.left () < tessedit_image_border) ||
    (word_box.bottom () < tessedit_image_border) ||
    (word_box.right () + tessedit_image_border >
     page_image.get_xsize () - 1) ||
    (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1);
  if (!word_near_edge)
    return;

  ASSERT_HOST (word->reject_map.length () == blob_it.length ());

  int blobindex = 0;
  for (blob_it.mark_cycle_pt ();
       !blob_it.cycled_list (); blob_it.forward (), ++blobindex) {
    TBOX blob_box = blob_it.data ()->bounding_box ();
    // Horizontal centre of the blob, passed to denorm.y() with each y value.
    const float centre = (blob_box.left () + blob_box.right ()) / 2.0;

    // Apply the same four-sided test to the denormalised blob box.
    const bool blob_near_edge =
      (word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
      (word->denorm.y (blob_box.bottom (), centre) <
       tessedit_image_border) ||
      (word->denorm.x (blob_box.right ()) + tessedit_image_border >
       page_image.get_xsize () - 1) ||
      (word->denorm.y (blob_box.top (), centre)
       + tessedit_image_border > page_image.get_ysize () - 1);

    if (blob_near_edge) {
      // close to edge
      word->reject_map[blobindex].setrej_edge_char ();
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* one_ell_conflict()
|
|
|
|
*
|
|
|
|
* Identify words where there is a potential I/l/1 error.
|
|
|
|
* - A bundle of contextual heuristics!
|
|
|
|
**********************************************************************/
|
2009-07-11 10:03:51 +08:00
|
|
|
namespace tesseract {
|
|
|
|
BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
|
2007-03-08 04:03:40 +08:00
|
|
|
const char *word;
|
2007-07-18 09:15:07 +08:00
|
|
|
const char *lengths;
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 word_len; //its length
|
|
|
|
inT16 first_alphanum_index_;
|
|
|
|
inT16 first_alphanum_offset_;
|
|
|
|
inT16 i;
|
|
|
|
inT16 offset;
|
2007-03-08 04:03:40 +08:00
|
|
|
BOOL8 non_conflict_set_char; //non conf set a/n?
|
|
|
|
BOOL8 conflict = FALSE;
|
|
|
|
BOOL8 allow_1s;
|
|
|
|
ACCEPTABLE_WERD_TYPE word_type;
|
|
|
|
BOOL8 dict_perm_type;
|
|
|
|
BOOL8 dict_word_ok;
|
|
|
|
int dict_word_type;
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
word = word_res->best_choice->unichar_string().string ();
|
|
|
|
lengths = word_res->best_choice->unichar_lengths().string();
|
2007-07-18 09:15:07 +08:00
|
|
|
word_len = strlen (lengths);
|
2007-03-08 04:03:40 +08:00
|
|
|
/*
|
|
|
|
If there are no occurrences of the conflict set characters then the word
|
|
|
|
is OK.
|
|
|
|
*/
|
|
|
|
if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
|
|
|
|
return FALSE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
There is a conflict if there are NO other (confirmed) alphanumerics apart
|
|
|
|
from those in the conflict set.
|
|
|
|
*/
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
for (i = 0, offset = 0, non_conflict_set_char = FALSE;
|
|
|
|
(i < word_len) && !non_conflict_set_char; offset += lengths[i++])
|
|
|
|
non_conflict_set_char =
|
|
|
|
(unicharset.get_isalpha(word + offset, lengths[i]) ||
|
|
|
|
unicharset.get_isdigit(word + offset, lengths[i])) &&
|
|
|
|
!STRING (conflict_set_I_l_1).contains (word[offset]);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (!non_conflict_set_char) {
|
|
|
|
if (update_map)
|
|
|
|
reject_I_1_L(word_res);
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
If the word is accepted by a dawg permuter, and the first alpha character
|
|
|
|
is "I" or "l", check to see if the alternative is also a dawg word. If it
|
|
|
|
is, then there is a potential error otherwise the word is ok.
|
|
|
|
*/
|
|
|
|
|
|
|
|
dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
|
|
|
|
(word_res->best_choice->permuter () == USER_DAWG_PERM) ||
|
|
|
|
(rej_trust_doc_dawg &&
|
|
|
|
(word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
|
|
|
|
(word_res->best_choice->permuter () == FREQ_DAWG_PERM);
|
2009-07-11 10:03:51 +08:00
|
|
|
dict_word_type = dict_word(*(word_res->best_choice));
|
2007-03-08 04:03:40 +08:00
|
|
|
dict_word_ok = (dict_word_type > 0) &&
|
|
|
|
(rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
|
|
|
|
|
|
|
|
if ((rej_1Il_use_dict_word && dict_word_ok) ||
|
|
|
|
(rej_1Il_trust_permuter_type && dict_perm_type) ||
|
|
|
|
(dict_perm_type && dict_word_ok)) {
|
2007-07-18 09:15:07 +08:00
|
|
|
first_alphanum_index_ = first_alphanum_index (word, lengths);
|
|
|
|
first_alphanum_offset_ = first_alphanum_offset (word, lengths);
|
|
|
|
if (lengths[first_alphanum_index_] == 1 &&
|
|
|
|
word[first_alphanum_offset_] == 'I') {
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
|
|
|
if (safe_dict_word(*(word_res->best_choice)) > 0) {
|
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
2007-03-08 04:03:40 +08:00
|
|
|
if (update_map)
|
2007-07-18 09:15:07 +08:00
|
|
|
word_res->reject_map[first_alphanum_index_].
|
2007-03-08 04:03:40 +08:00
|
|
|
setrej_1Il_conflict();
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
else {
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
2007-03-08 04:03:40 +08:00
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
if (lengths[first_alphanum_index_] == 1 &&
|
|
|
|
word[first_alphanum_offset_] == 'l') {
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
|
|
|
if (safe_dict_word(*(word_res->best_choice)) > 0) {
|
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
2007-03-08 04:03:40 +08:00
|
|
|
if (update_map)
|
2007-07-18 09:15:07 +08:00
|
|
|
word_res->reject_map[first_alphanum_index_].
|
2007-03-08 04:03:40 +08:00
|
|
|
setrej_1Il_conflict();
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
else {
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
2007-03-08 04:03:40 +08:00
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
NEW 1Il code. The old code relied on permuter types too much. In fact,
|
|
|
|
tess will use TOP_CHOICE permute for good things like "palette".
|
|
|
|
In this code the string is examined independently to see if it looks like
|
|
|
|
a well formed word.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
|
|
|
|
dictionary word.
|
|
|
|
*/
|
2007-07-18 09:15:07 +08:00
|
|
|
first_alphanum_index_ = first_alphanum_index (word, lengths);
|
|
|
|
first_alphanum_offset_ = first_alphanum_offset (word, lengths);
|
|
|
|
if (lengths[first_alphanum_index_] == 1 &&
|
|
|
|
word[first_alphanum_offset_] == 'l') {
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
|
|
|
if (safe_dict_word(*(word_res->best_choice)) > 0)
|
2007-03-08 04:03:40 +08:00
|
|
|
return FALSE;
|
|
|
|
else
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2007-07-18 09:15:07 +08:00
|
|
|
else if (lengths[first_alphanum_index_] == 1 &&
|
|
|
|
word[first_alphanum_offset_] == 'I') {
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
|
|
|
|
if (safe_dict_word(*(word_res->best_choice)) > 0)
|
2007-03-08 04:03:40 +08:00
|
|
|
return FALSE;
|
|
|
|
else
|
2009-07-11 10:03:51 +08:00
|
|
|
word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
For strings containing digits:
|
|
|
|
If there are no alphas OR the numeric permuter liked the word,
|
|
|
|
reject any non 1 conflict chs
|
|
|
|
Else reject all conflict chs
|
|
|
|
*/
|
2007-07-18 09:15:07 +08:00
|
|
|
if (word_contains_non_1_digit (word, lengths)) {
|
|
|
|
allow_1s = (alpha_count (word, lengths) == 0) ||
|
2007-03-08 04:03:40 +08:00
|
|
|
(word_res->best_choice->permuter () == NUMBER_PERM);
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 offset;
|
2007-03-08 04:03:40 +08:00
|
|
|
conflict = FALSE;
|
2007-07-18 09:15:07 +08:00
|
|
|
for (i = 0, offset = 0; word[offset] != '\0';
|
2009-07-11 10:03:51 +08:00
|
|
|
offset += word_res->best_choice->unichar_lengths()[i++]) {
|
2007-07-18 09:15:07 +08:00
|
|
|
if ((!allow_1s || (word[offset] != '1')) &&
|
|
|
|
STRING (conflict_set_I_l_1).contains (word[offset])) {
|
2007-03-08 04:03:40 +08:00
|
|
|
if (update_map)
|
|
|
|
word_res->reject_map[i].setrej_1Il_conflict ();
|
|
|
|
conflict = TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return conflict;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
For anything else. See if it conforms to an acceptable word type. If so,
|
|
|
|
treat accordingly.
|
|
|
|
*/
|
2007-07-18 09:15:07 +08:00
|
|
|
word_type = acceptable_word_string (word, lengths);
|
2007-03-08 04:03:40 +08:00
|
|
|
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
|
2007-07-18 09:15:07 +08:00
|
|
|
first_alphanum_index_ = first_alphanum_index (word, lengths);
|
|
|
|
first_alphanum_offset_ = first_alphanum_offset (word, lengths);
|
|
|
|
if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
|
2007-03-08 04:03:40 +08:00
|
|
|
if (update_map)
|
2007-07-18 09:15:07 +08:00
|
|
|
word_res->reject_map[first_alphanum_index_].
|
|
|
|
setrej_1Il_conflict ();
|
2007-03-08 04:03:40 +08:00
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
else if (word_type == AC_UPPER_CASE) {
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (update_map)
|
|
|
|
reject_I_1_L(word_res);
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Returns the character index (position in word_lengths) of the first
// character that unicharset reports as alphabetic or a digit, or -1 if the
// word has none. word_lengths[n] is the byte length of character n in word.
inT16 Tesseract::first_alphanum_index(const char *word,
                                      const char *word_lengths) {
  inT16 char_index = 0;
  inT16 byte_offset = 0;

  while (word[byte_offset] != '\0') {
    const char *ch = word + byte_offset;
    const int ch_len = word_lengths[char_index];
    if (unicharset.get_isalpha(ch, ch_len) ||
        unicharset.get_isdigit(ch, ch_len))
      return char_index;
    byte_offset += ch_len;
    ++char_index;
  }
  return -1;
}
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Returns the byte offset into |word| of the first unichar that the
// unicharset reports as alphabetic or as a digit, or -1 if there is none.
// |word_lengths| gives the byte length of each successive unichar in |word|.
inT16 Tesseract::first_alphanum_offset(const char *word,
                                       const char *word_lengths) {
  inT16 char_index = 0;          // position counted in unichars
  inT16 byte_pos = 0;            // position counted in bytes

  while (word[byte_pos] != '\0') {
    const char *unichar = word + byte_pos;
    if (unicharset.get_isalpha(unichar, word_lengths[char_index]) ||
        unicharset.get_isdigit(unichar, word_lengths[char_index])) {
      return byte_pos;
    }
    byte_pos += word_lengths[char_index];
    char_index++;
  }
  return -1;                     // no alphanumeric unichar found
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Counts the unichars of |word| that the unicharset reports as alphabetic.
// |word_lengths| gives the byte length of each successive unichar in |word|.
inT16 Tesseract::alpha_count(const char *word,
                             const char *word_lengths) {
  inT16 total = 0;
  inT16 byte_pos = 0;            // byte offset of the current unichar

  for (inT16 char_index = 0; word[byte_pos] != '\0'; char_index++) {
    if (unicharset.get_isalpha(word + byte_pos, word_lengths[char_index])) {
      total++;
    }
    byte_pos += word_lengths[char_index];
  }
  return total;
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Returns TRUE if |word| contains at least one unichar that the unicharset
// reports as a digit and that is not the single-byte character '1'.
// |word_lengths| gives the byte length of each successive unichar in |word|.
BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
                                           const char *word_lengths) {
  inT16 char_index = 0;          // position counted in unichars
  inT16 byte_pos = 0;            // position counted in bytes

  while (word[byte_pos] != '\0') {
    if (unicharset.get_isdigit(word + byte_pos, word_lengths[char_index])) {
      // A digit only counts if it is something other than a plain '1'.
      if (word_lengths[char_index] != 1) {
        return TRUE;
      }
      if (word[byte_pos] != '1') {
        return TRUE;
      }
    }
    byte_pos += word_lengths[char_index];
    char_index++;
  }
  return FALSE;
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Tests a word for ambiguity. Only words whose best choice came from the
// system, frequent-word or user dawg permuters are checked; for those the
// result is the negation of Dict::NoDangerousAmbig(). Any other word is
// reported as not ambiguous.
BOOL8 Tesseract::test_ambig_word(WERD_RES *word) {
  switch (word->best_choice->permuter ()) {
    case SYSTEM_DAWG_PERM:
    case FREQ_DAWG_PERM:
    case USER_DAWG_PERM:
      return !getDict().NoDangerousAmbig(
          word->best_choice, NULL, false, NULL, NULL);
    default:
      return FALSE;
  }
}
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* char_ambiguities()
|
|
|
|
*
|
|
|
|
* Return a pointer to a string containing the full conflict set of characters
|
|
|
|
* which includes the specified character, if there is one. If the specified
|
|
|
|
* character is not a member of a conflict set, return NULL.
|
|
|
|
* (NOTE that a character is assumed to be a member of only ONE conflict set.)
|
|
|
|
*************************************************************************/
|
2009-07-11 10:03:51 +08:00
|
|
|
const char *Tesseract::char_ambiguities(char c) {
|
2007-03-08 04:03:40 +08:00
|
|
|
static STRING_CLIST conflict_sets;
|
|
|
|
static BOOL8 read_conflict_sets = FALSE;
|
|
|
|
STRING_C_IT cs_it(&conflict_sets);
|
|
|
|
const char *cs;
|
|
|
|
STRING cs_file_name;
|
|
|
|
FILE *cs_file;
|
|
|
|
char buff[1024];
|
|
|
|
|
|
|
|
if (!read_conflict_sets) {
|
|
|
|
cs_file_name = datadir + "confsets";
|
|
|
|
if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
|
|
|
|
CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
|
|
|
|
cs_file_name.string (), errno);
|
|
|
|
}
|
|
|
|
while (fscanf (cs_file, "%s", buff) == 1) {
|
|
|
|
cs_it.add_after_then_move (new STRING (buff));
|
|
|
|
}
|
|
|
|
read_conflict_sets = TRUE;
|
|
|
|
cs_it.move_to_first ();
|
|
|
|
if (tessedit_rejection_debug) {
|
|
|
|
for (cs_it.mark_cycle_pt ();
|
|
|
|
!cs_it.cycled_list (); cs_it.forward ()) {
|
|
|
|
tprintf ("\"%s\"\n", cs_it.data ()->string ());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cs_it.move_to_first ();
|
|
|
|
for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
|
|
|
|
cs = cs_it.data ()->string ();
|
|
|
|
if (strchr (cs, c) != NULL)
|
|
|
|
return cs;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* nn_recover_rejects()
|
|
|
|
* Generate the nn_reject_map - a copy of the current reject map, but dont
|
|
|
|
* reject previously rejected chars if the NN matcher agrees with the best
|
|
|
|
* choice.
|
|
|
|
*************************************************************************/
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
void Tesseract::nn_recover_rejects(WERD_RES *word, ROW *row) {
|
2007-03-08 04:03:40 +08:00
|
|
|
//copy for debug
|
|
|
|
REJMAP old_map = word->reject_map;
|
|
|
|
/*
|
|
|
|
NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS
|
|
|
|
MATCHED BY THE NN MATCHER. IF COULD EASILY BE RESTRICTED TO JUST THE
|
|
|
|
REJECT CHARACTERS (Though initial use is when words are total rejects
|
|
|
|
anyway).
|
|
|
|
*/
|
|
|
|
|
|
|
|
set_global_subsubloc_code(SUBSUBLOC_NN);
|
|
|
|
nn_match_word(word, row);
|
|
|
|
|
|
|
|
if (no_unrej_1Il)
|
|
|
|
dont_allow_1Il(word);
|
|
|
|
if (no_unrej_dubious_chars)
|
|
|
|
dont_allow_dubious_chars(word);
|
|
|
|
|
|
|
|
if (rej_mostly_reject_mode == 1)
|
|
|
|
reject_mostly_rejects(word);
|
|
|
|
/*
|
|
|
|
IF there are no unrejected alphanumerics AND
|
|
|
|
The word is not an acceptable single non alphanum char word AND
|
|
|
|
The word is not an acceptable repeated non alphanum char word
|
|
|
|
THEN Reject whole word
|
|
|
|
*/
|
|
|
|
if (no_unrej_no_alphanum_wds &&
|
|
|
|
(count_alphanums (word) < 1) &&
|
2009-07-11 10:03:51 +08:00
|
|
|
!((word->best_choice->unichar_lengths().length () == 1) &&
|
|
|
|
STRING(ok_single_ch_non_alphanum_wds).contains(
|
|
|
|
word->best_choice->unichar_string()[0]))
|
2007-03-08 04:03:40 +08:00
|
|
|
&& !repeated_nonalphanum_wd (word, row))
|
|
|
|
|
|
|
|
word->reject_map.rej_word_no_alphanums ();
|
|
|
|
|
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
|
|
|
|
if (nn_debug) {
|
2009-07-11 10:03:51 +08:00
|
|
|
tprintf ("\nTess: \"%s\" MAP ",
|
|
|
|
word->best_choice->unichar_string().string());
|
2007-03-08 04:03:40 +08:00
|
|
|
old_map.print (stdout);
|
|
|
|
tprintf ("->");
|
|
|
|
word->reject_map.print (stdout);
|
|
|
|
tprintf ("\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
set_global_subsubloc_code(SUBSUBLOC_OTHER);
|
|
|
|
}
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
void Tesseract::nn_match_word( //Match a word
|
|
|
|
WERD_RES *word,
|
|
|
|
ROW *row) {
|
2007-03-08 04:03:40 +08:00
|
|
|
PIXROW_LIST *pixrow_list;
|
|
|
|
PIXROW_IT pixrow_it;
|
|
|
|
IMAGELINE *imlines; //lines of the image
|
2008-04-22 08:32:14 +08:00
|
|
|
TBOX pix_box; //box of imlines extent
|
2007-03-08 04:03:40 +08:00
|
|
|
#ifndef GRAPHICS_DISABLED
|
2008-02-01 08:33:18 +08:00
|
|
|
ScrollView* win = NULL;
|
2007-03-08 04:03:40 +08:00
|
|
|
#endif
|
|
|
|
IMAGE clip_image;
|
|
|
|
IMAGE scaled_image;
|
|
|
|
float baseline_pos;
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 net_image_size;
|
|
|
|
inT16 clip_image_size;
|
2007-03-08 04:03:40 +08:00
|
|
|
WERD copy_outword; // copy to denorm
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 i;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
const char *word_string;
|
2007-07-18 09:15:07 +08:00
|
|
|
const char *word_string_lengths;
|
2007-03-08 04:03:40 +08:00
|
|
|
BOOL8 word_in_dict; //Tess wd in dict
|
|
|
|
BOOL8 checked_dict_word; //Tess wd definitely in dict
|
|
|
|
BOOL8 sensible_word; //OK char string
|
|
|
|
BOOL8 centre; //Not at word end chs
|
|
|
|
BOOL8 good_quality_word;
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 char_quality;
|
|
|
|
inT16 accepted_char_quality;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 conf_level; //0:REJECT
|
2007-03-08 04:03:40 +08:00
|
|
|
//1:DODGY ACCEPT
|
|
|
|
//2:DICT ACCEPT
|
|
|
|
//3:CLEAR ACCEPT
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 first_alphanum_index_;
|
|
|
|
inT16 first_alphanum_offset_;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
word_string = word->best_choice->unichar_string().string();
|
|
|
|
word_string_lengths = word->best_choice->unichar_lengths().string();
|
2007-07-18 09:15:07 +08:00
|
|
|
first_alphanum_index_ = first_alphanum_index (word_string,
|
|
|
|
word_string_lengths);
|
|
|
|
first_alphanum_offset_ = first_alphanum_offset (word_string,
|
|
|
|
word_string_lengths);
|
2007-03-08 04:03:40 +08:00
|
|
|
word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
|
|
|
|
(word->best_choice->permuter () == FREQ_DAWG_PERM) ||
|
|
|
|
(word->best_choice->permuter () == USER_DAWG_PERM));
|
2009-07-11 10:03:51 +08:00
|
|
|
checked_dict_word = word_in_dict &&
|
|
|
|
(safe_dict_word(*(word->best_choice)) > 0);
|
2007-07-18 09:15:07 +08:00
|
|
|
sensible_word = acceptable_word_string (word_string, word_string_lengths) !=
|
|
|
|
AC_UNACCEPTABLE;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
word_char_quality(word, row, &char_quality, &accepted_char_quality);
|
2009-07-11 10:03:51 +08:00
|
|
|
good_quality_word =
|
|
|
|
word->best_choice->unichar_lengths().length () == char_quality;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
if (nn_reject_debug) {
|
|
|
|
tprintf ("Dict: %c Checked Dict: %c Sensible: %c Quality: %c\n",
|
|
|
|
word_in_dict ? 'T' : 'F',
|
|
|
|
checked_dict_word ? 'T' : 'F',
|
|
|
|
sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
if (word->best_choice->unichar_lengths().length () !=
|
2007-03-08 04:03:40 +08:00
|
|
|
word->outword->blob_list ()->length ()) {
|
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
tprintf ("nn_match_word ASSERT FAIL String:\"%s\"; #Blobs=%d\n",
|
2009-07-11 10:03:51 +08:00
|
|
|
word->best_choice->unichar_string().string (),
|
2007-03-08 04:03:40 +08:00
|
|
|
word->outword->blob_list ()->length ());
|
|
|
|
#endif
|
|
|
|
err_exit();
|
|
|
|
}
|
|
|
|
|
|
|
|
copy_outword = *(word->outword);
|
|
|
|
copy_outword.baseline_denormalise (&word->denorm);
|
|
|
|
/*
|
|
|
|
For each character, generate and match a new image, containing JUST the
|
|
|
|
character we have clipped, centered in the image, on a white background.
|
|
|
|
Note that we MUST have a square image so that we can scale it uniformly in
|
|
|
|
x and y. We base the size on x_height as this can be found fairly reliably.
|
|
|
|
*/
|
|
|
|
net_image_size = (net_image_width > net_image_height) ?
|
|
|
|
net_image_width : net_image_height;
|
2008-04-22 08:32:14 +08:00
|
|
|
clip_image_size = (inT16) floor (0.5 +
|
2007-03-08 04:03:40 +08:00
|
|
|
net_image_size * word->x_height /
|
|
|
|
net_image_x_height);
|
|
|
|
if ((clip_image_size <= 1) || (net_image_size <= 1)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
Get the image of the word and the pix positions of each char
|
|
|
|
*/
|
|
|
|
char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
|
|
if (show_char_clipping) {
|
|
|
|
win = display_clip_image (©_outword, page_image,
|
|
|
|
pixrow_list, pix_box);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
pixrow_it.set_to_list (pixrow_list);
|
|
|
|
pixrow_it.move_to_first ();
|
|
|
|
for (pixrow_it.mark_cycle_pt (), i = 0;
|
|
|
|
!pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
|
|
|
|
if (pixrow_it.data ()->
|
|
|
|
bad_box (page_image.get_xsize (), page_image.get_ysize ()))
|
|
|
|
continue;
|
|
|
|
clip_image.create (clip_image_size, clip_image_size, 1);
|
|
|
|
//make bin imge
|
|
|
|
if (!copy_outword.flag (W_INVERSE))
|
|
|
|
invert_image(&clip_image); //white background for black on white
|
|
|
|
pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
|
|
|
|
clip_image, baseline_pos);
|
|
|
|
if (copy_outword.flag (W_INVERSE))
|
|
|
|
invert_image(&clip_image); //invert white on black for scaling &NN
|
|
|
|
scaled_image.create (net_image_size, net_image_size, 1);
|
|
|
|
scale_image(clip_image, scaled_image);
|
|
|
|
baseline_pos *= net_image_size / clip_image_size;
|
|
|
|
//scale with im
|
|
|
|
centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
|
|
|
|
|
|
|
|
conf_level = nn_match_char (scaled_image, baseline_pos,
|
|
|
|
word_in_dict, checked_dict_word,
|
|
|
|
sensible_word, centre,
|
|
|
|
good_quality_word, word_string[i]);
|
|
|
|
if (word->reject_map[i].recoverable ()) {
|
2007-07-18 09:15:07 +08:00
|
|
|
if ((i == first_alphanum_index_) &&
|
|
|
|
word_string_lengths[first_alphanum_index_] == 1 &&
|
|
|
|
((word_string[first_alphanum_offset_] == 'I') ||
|
|
|
|
(word_string[first_alphanum_offset_] == 'i'))) {
|
2007-03-08 04:03:40 +08:00
|
|
|
if (conf_level >= nn_conf_initial_i_level)
|
|
|
|
word->reject_map[i].setrej_nn_accept ();
|
|
|
|
//un-reject char
|
|
|
|
}
|
|
|
|
else if (conf_level > 0)
|
|
|
|
//un-reject char
|
|
|
|
word->reject_map[i].setrej_nn_accept ();
|
|
|
|
}
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
|
|
if (show_char_clipping)
|
|
|
|
display_images(clip_image, scaled_image);
|
|
|
|
#endif
|
2008-02-01 08:33:18 +08:00
|
|
|
clip_image.destroy();
|
|
|
|
scaled_image.destroy();
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
delete[]imlines; // Free array of imlines
|
|
|
|
delete pixrow_list;
|
|
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
|
|
if (show_char_clipping) {
|
2008-02-01 08:33:18 +08:00
|
|
|
// destroy_window(win);
|
|
|
|
// win->Destroy();
|
|
|
|
delete win;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* nn_match_char()
|
|
|
|
* Call Neural Net matcher to match a single character, given a scaled,
|
|
|
|
* square image
|
|
|
|
*************************************************************************/
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 nn_match_char( //of character
|
2007-03-08 04:03:40 +08:00
|
|
|
IMAGE &scaled_image,
|
|
|
|
float baseline_pos, //rel to scaled_image
|
|
|
|
BOOL8 dict_word, //part of dict wd?
|
|
|
|
BOOL8 checked_dict_word, //part of dict wd?
|
|
|
|
BOOL8 sensible_word, //part acceptable str?
|
|
|
|
BOOL8 centre, //not at word ends?
|
|
|
|
BOOL8 good_quality_word, //initial segmentation
|
|
|
|
char tess_ch //confirm this?
|
|
|
|
) {
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 conf_level; //0..2
|
|
|
|
inT32 row;
|
|
|
|
inT32 col;
|
|
|
|
inT32 y_size = scaled_image.get_ysize ();
|
|
|
|
inT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
|
|
|
|
inT32 end_y = start_y - net_image_height + 1;
|
2007-03-08 04:03:40 +08:00
|
|
|
IMAGELINE imline;
|
|
|
|
float *input_vector;
|
|
|
|
float *input_vec_ptr;
|
|
|
|
char top;
|
|
|
|
float top_score;
|
|
|
|
char next;
|
|
|
|
float next_score;
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
|
|
|
|
inT16 j;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
|
|
|
|
input_vec_ptr = input_vector;
|
|
|
|
|
|
|
|
invert_image(&scaled_image); //cos nns work better
|
|
|
|
for (row = start_y; row >= end_y; row--) {
|
|
|
|
scaled_image.fast_get_line (0, row, net_image_width, &imline);
|
|
|
|
for (col = 0; col < net_image_width; col++)
|
|
|
|
*input_vec_ptr++ = imline.pixels[col];
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
The bit map presented to the net may be shorter than the image, so shift
|
|
|
|
the coord to be relative to the bitmap portion.
|
|
|
|
*/
|
|
|
|
baseline_pos -= (y_size - net_image_height) / 2.0;
|
|
|
|
/*
|
|
|
|
Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise.
|
|
|
|
This is represented to the net as a set of bl_nodes, an initial proportion
|
|
|
|
of which are set to 1.0, indicating the level of the baseline. The
|
|
|
|
remainder are 0.0
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (baseline_pos < 0)
|
|
|
|
baseline_pos = 0;
|
|
|
|
else if (baseline_pos >= net_image_height)
|
|
|
|
baseline_pos = net_image_height + 1;
|
|
|
|
else
|
|
|
|
baseline_pos = baseline_pos + 1;
|
|
|
|
baseline_pos = baseline_pos / (net_image_height + 1);
|
|
|
|
|
|
|
|
if (net_bl_nodes > 0) {
|
|
|
|
baseline_pos *= 1.7; //Use a wider range
|
|
|
|
if (net_bl_nodes > 1) {
|
|
|
|
/* Multi-node baseline representation */
|
|
|
|
for (j = 0; j < net_bl_nodes; j++) {
|
|
|
|
if (baseline_pos > ((float) j / net_bl_nodes))
|
|
|
|
*input_vec_ptr++ = 1.0;
|
|
|
|
else
|
|
|
|
*input_vec_ptr++ = 0.0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* Single node baseline */
|
|
|
|
*input_vec_ptr++ = baseline_pos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
callnet(input_vector, &top, &top_score, &next, &next_score);
|
|
|
|
conf_level = evaluate_net_match (top, top_score, next, next_score,
|
|
|
|
tess_ch, dict_word, checked_dict_word,
|
|
|
|
sensible_word, centre, good_quality_word);
|
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
if (nn_reject_debug) {
|
|
|
|
tprintf ("top:\"%c\" %4.2f next:\"%c\" %4.2f TESS:\"%c\" Conf: %d\n",
|
|
|
|
top, top_score, next, next_score, tess_ch, conf_level);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
free_mem(input_vector);
|
|
|
|
return conf_level;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 evaluate_net_match(char top,
|
2007-03-08 04:03:40 +08:00
|
|
|
float top_score,
|
|
|
|
char next,
|
|
|
|
float next_score,
|
|
|
|
char tess_ch,
|
|
|
|
BOOL8 dict_word,
|
|
|
|
BOOL8 checked_dict_word,
|
|
|
|
BOOL8 sensible_word,
|
|
|
|
BOOL8 centre,
|
|
|
|
BOOL8 good_quality_word) {
|
2008-04-22 08:32:14 +08:00
|
|
|
inT16 accept_level; //0 Very clearly matched
|
2007-03-08 04:03:40 +08:00
|
|
|
//1 Clearly top
|
|
|
|
//2 Top but poor match
|
|
|
|
//3 Next & poor top match
|
|
|
|
//4 Next but good top match
|
|
|
|
//5 No chance
|
|
|
|
BOOL8 good_top_choice;
|
|
|
|
BOOL8 excellent_top_choice;
|
|
|
|
BOOL8 confusion_match = FALSE;
|
|
|
|
BOOL8 dodgy_char = !isalnum (tess_ch);
|
|
|
|
|
|
|
|
good_top_choice = (top_score > nn_reject_threshold) &&
|
|
|
|
(nn_reject_head_and_shoulders * top_score > next_score);
|
|
|
|
|
|
|
|
excellent_top_choice = good_top_choice &&
|
|
|
|
(top_score > nn_dodgy_char_threshold);
|
|
|
|
|
|
|
|
if (top == tess_ch) {
|
|
|
|
if (excellent_top_choice)
|
|
|
|
accept_level = 0;
|
|
|
|
else if (good_top_choice)
|
|
|
|
accept_level = 1; //Top correct and well matched
|
|
|
|
else
|
|
|
|
accept_level = 2; //Top correct but poor match
|
|
|
|
}
|
|
|
|
else if ((nn_conf_1Il &&
|
|
|
|
STRING (conflict_set_I_l_1).contains (tess_ch) &&
|
|
|
|
STRING (conflict_set_I_l_1).contains (top)) ||
|
|
|
|
(nn_conf_hyphen &&
|
|
|
|
STRING (conflict_set_hyphen).contains (tess_ch) &&
|
|
|
|
STRING (conflict_set_hyphen).contains (top)) ||
|
|
|
|
(nn_conf_Ss &&
|
|
|
|
STRING (conflict_set_S_s).contains (tess_ch) &&
|
|
|
|
STRING (conflict_set_S_s).contains (top))) {
|
|
|
|
confusion_match = TRUE;
|
|
|
|
if (good_top_choice)
|
|
|
|
accept_level = 1; //Good top confusion
|
|
|
|
else
|
|
|
|
accept_level = 2; //Poor top confusion
|
|
|
|
}
|
|
|
|
else if ((nn_conf_1Il &&
|
|
|
|
STRING (conflict_set_I_l_1).contains (tess_ch) &&
|
|
|
|
STRING (conflict_set_I_l_1).contains (next)) ||
|
|
|
|
(nn_conf_hyphen &&
|
|
|
|
STRING (conflict_set_hyphen).contains (tess_ch) &&
|
|
|
|
STRING (conflict_set_hyphen).contains (next)) ||
|
|
|
|
(nn_conf_Ss &&
|
|
|
|
STRING (conflict_set_S_s).contains (tess_ch) &&
|
|
|
|
STRING (conflict_set_S_s).contains (next))) {
|
|
|
|
confusion_match = TRUE;
|
|
|
|
if (!good_top_choice)
|
|
|
|
accept_level = 3; //Next confusion and top match dodgy
|
|
|
|
else
|
|
|
|
accept_level = 4; //Next confusion and good top match
|
|
|
|
}
|
|
|
|
else if (next == tess_ch) {
|
|
|
|
if (!good_top_choice)
|
|
|
|
accept_level = 3; //Next match and top match dodgy
|
|
|
|
else
|
|
|
|
accept_level = 4; //Next match and good top match
|
|
|
|
}
|
|
|
|
else
|
|
|
|
accept_level = 5;
|
|
|
|
|
|
|
|
/* Could allow some match flexibility here sS$ etc */
|
|
|
|
|
|
|
|
/* Now set confirmation level according to how much we can believe the tess
|
|
|
|
char. */
|
|
|
|
|
|
|
|
if ((accept_level == 0) && !confusion_match)
|
|
|
|
return 3;
|
|
|
|
|
|
|
|
if ((accept_level <= 1) &&
|
|
|
|
(!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
|
|
|
|
return 3;
|
|
|
|
|
|
|
|
if ((accept_level == 2) &&
|
|
|
|
!confusion_match && !dodgy_char &&
|
|
|
|
good_quality_word &&
|
|
|
|
dict_word &&
|
|
|
|
(checked_dict_word || !nn_double_check_dict) && sensible_word)
|
|
|
|
return 2;
|
|
|
|
|
|
|
|
if (confusion_match &&
|
|
|
|
(accept_level <= nn_conf_accept_level) &&
|
|
|
|
(good_quality_word ||
|
|
|
|
(!nn_conf_test_good_qual &&
|
|
|
|
!STRING (conflict_set_I_l_1).contains (tess_ch))) &&
|
|
|
|
(dict_word || !nn_conf_test_dict) &&
|
|
|
|
(checked_dict_word || !nn_conf_double_check_dict) &&
|
|
|
|
(sensible_word || !nn_conf_test_sensible))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (!confusion_match &&
|
|
|
|
nn_lax &&
|
|
|
|
(accept_level == 3) &&
|
|
|
|
(good_quality_word || !nn_conf_test_good_qual) &&
|
|
|
|
(dict_word || !nn_conf_test_dict) &&
|
|
|
|
(sensible_word || !nn_conf_test_sensible))
|
|
|
|
return 1;
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* dont_allow_dubious_chars()
|
|
|
|
* Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong
|
|
|
|
* if adjacent to a reject.
|
|
|
|
*************************************************************************/
|
|
|
|
void dont_allow_dubious_chars(WERD_RES *word) {
|
|
|
|
int i = 0;
|
2007-07-18 09:15:07 +08:00
|
|
|
int offset = 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
int rej_pos;
|
|
|
|
int word_len = word->reject_map.length ();
|
|
|
|
|
|
|
|
while (i < word_len) {
|
|
|
|
/* Find next reject */
|
|
|
|
|
|
|
|
while ((i < word_len) && (word->reject_map[i].accepted ()))
|
2007-07-18 09:15:07 +08:00
|
|
|
{
|
2009-07-11 10:03:51 +08:00
|
|
|
offset += word->best_choice->unichar_lengths()[i];
|
2007-03-08 04:03:40 +08:00
|
|
|
i++;
|
2007-07-18 09:15:07 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (i < word_len) {
|
|
|
|
rej_pos = i;
|
|
|
|
|
|
|
|
/* Reject dubious chars to the left */
|
|
|
|
i--;
|
2009-07-11 10:03:51 +08:00
|
|
|
offset -= word->best_choice->unichar_lengths()[i];
|
2007-03-08 04:03:40 +08:00
|
|
|
while ((i >= 0) &&
|
2009-07-11 10:03:51 +08:00
|
|
|
STRING(dubious_chars_left_of_reject).contains(
|
|
|
|
word->best_choice->unichar_string()[offset])) {
|
2007-03-08 04:03:40 +08:00
|
|
|
word->reject_map[i--].setrej_dubious ();
|
2009-07-11 10:03:51 +08:00
|
|
|
offset -= word->best_choice->unichar_lengths()[i];
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Skip adjacent rejects */
|
|
|
|
|
|
|
|
for (i = rej_pos;
|
2007-07-18 09:15:07 +08:00
|
|
|
(i < word_len) && (word->reject_map[i].rejected ());
|
2009-07-11 10:03:51 +08:00
|
|
|
offset += word->best_choice->unichar_lengths()[i++]);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/* Reject dubious chars to the right */
|
|
|
|
|
|
|
|
while ((i < word_len) &&
|
2009-07-11 10:03:51 +08:00
|
|
|
STRING(dubious_chars_right_of_reject).contains(
|
|
|
|
word->best_choice->unichar_string()[offset])) {
|
|
|
|
offset += word->best_choice->unichar_lengths()[i];
|
2007-03-08 04:03:40 +08:00
|
|
|
word->reject_map[i++].setrej_dubious ();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*************************************************************************
|
|
|
|
* dont_allow_1Il()
|
|
|
|
* Dont unreject LONE accepted 1Il conflict set chars
|
|
|
|
*************************************************************************/
|
2009-07-11 10:03:51 +08:00
|
|
|
namespace tesseract {
|
|
|
|
// Rejects accepted chars from the 1Il conflict set when they are the only
// accepted alphanumerics in the word. If any accepted alphabetic or digit
// char lies outside the conflict set, or no conflict-set char is accepted,
// the reject map is left unchanged.
void Tesseract::dont_allow_1Il(WERD_RES *word) {
  const int num_chars = word->reject_map.length ();
  const char *text = word->best_choice->unichar_string().string ();
  const char *char_lens = word->best_choice->unichar_lengths().string ();
  BOOL8 found_accepted_1Il = FALSE;
  int idx;                       //unichar index
  int byte_pos;                  //byte offset of unichar idx

  // Pass 1: look for accepted 1Il chars; bail out as soon as some other
  // accepted alphanumeric is seen.
  idx = 0;
  byte_pos = 0;
  while (idx < num_chars) {
    if (word->reject_map[idx].accepted ()) {
      if (STRING (conflict_set_I_l_1).contains (text[byte_pos])) {
        found_accepted_1Il = TRUE;
      }
      else if (unicharset.get_isalpha (text + byte_pos, char_lens[idx]) ||
               unicharset.get_isdigit (text + byte_pos, char_lens[idx])) {
        return;                  // >=1 non 1Il ch accepted
      }
    }
    byte_pos += word->best_choice->unichar_lengths()[idx];
    idx++;
  }
  if (!found_accepted_1Il) {
    return;                      //Nothing to worry about
  }

  // Pass 2: reject every accepted 1Il char.
  idx = 0;
  byte_pos = 0;
  while (idx < num_chars) {
    if (STRING (conflict_set_I_l_1).contains (text[byte_pos]) &&
        word->reject_map[idx].accepted ()) {
      word->reject_map[idx].setrej_postNN_1Il ();
    }
    byte_pos += word->best_choice->unichar_lengths()[idx];
    idx++;
  }
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Returns how many chars of the word are both accepted in the reject map and
// reported by the unicharset as alphabetic or as a digit.
inT16 Tesseract::count_alphanums(WERD_RES *word_res) {
  const WERD_CHOICE *choice = word_res->best_choice;
  int total = 0;
  int idx = 0;

  while (idx < word_res->reject_map.length()) {
    if (word_res->reject_map[idx].accepted()) {
      if (unicharset.get_isalpha(choice->unichar_id(idx)) ||
          unicharset.get_isdigit(choice->unichar_id(idx))) {
        total++;
      }
    }
    ++idx;
  }
  return total;
}
|
2009-07-11 10:03:51 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
// Rejects the whole of the word if the fraction of rejected chars in its
// reject map is at least rej_whole_of_mostly_reject_word_fract.
void reject_mostly_rejects(WERD_RES *word) {
  if (!((float) word->reject_map.reject_count () /
        word->reject_map.length () >=
        rej_whole_of_mostly_reject_word_fract)) {
    return;                      //not enough rejects
  }
  word->reject_map.rej_word_mostly_rej ();
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
namespace tesseract {
|
|
|
|
// Returns TRUE if the word is an acceptable repeated non-alphanumeric word:
// more than one char, starting with a char from
// ok_repeated_ch_non_alphanum_wds, made of one repeated char, and with every
// char counted by word_char_quality() both as good quality and as accepted.
BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
  inT16 good_chars;
  inT16 good_accepted_chars;

  // Cheap structural tests first; evaluated left to right.
  if ((word->best_choice->unichar_lengths().length () <= 1) ||
      !STRING (ok_repeated_ch_non_alphanum_wds).
          contains (word->best_choice->unichar_string()[0]) ||
      !repeated_ch_string (word->best_choice->unichar_string().string (),
                           word->best_choice->unichar_lengths().string ())) {
    return FALSE;
  }

  word_char_quality(word, row, &good_chars, &good_accepted_chars);

  if (word->best_choice->unichar_lengths().length () != good_chars) {
    return FALSE;
  }
  if (good_chars != good_accepted_chars) {
    return FALSE;
  }
  return TRUE;
}
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Returns TRUE if rep_ch_str is a non-empty string made up of one unichar
// repeated one or more times. lengths holds the byte length of each
// successive unichar in rep_ch_str. A NULL or empty string yields FALSE.
BOOL8 Tesseract::repeated_ch_string(const char *rep_ch_str,
                                    const char *lengths) {
  UNICHAR_ID c;

  if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
    return FALSE;
  }

  c = unicharset.unichar_to_id(rep_ch_str, *lengths);
  rep_ch_str += *(lengths++);
  // Step a whole unichar at a time, taking each step size from lengths, so
  // that multi-byte unichars are compared on character boundaries. The
  // length test stops the scan if lengths runs out before the string does.
  while (*rep_ch_str != '\0' && *lengths > 0 &&
         unicharset.unichar_to_id(rep_ch_str, *lengths) == c) {
    rep_ch_str += *(lengths++);
  }
  // Reaching the terminator means every unichar matched the first one.
  if (*rep_ch_str == '\0')
    return TRUE;
  return FALSE;
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Returns the value of dict_word() for the given word, except that
// DOC_DAWG_PERM is mapped to 0.
inT16 Tesseract::safe_dict_word(const WERD_CHOICE &word) {
  int word_type = dict_word(word);
  if (word_type == DOC_DAWG_PERM) {
    return 0;
  }
  return word_type;
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
// Uses blob aspect ratios to sort out '.' / '-' confusions in the best
// choice. Does nothing if tessedit_lower_flip_hyphen <= 1. Only blobs wider
// than 8 * denorm.scale() that do not overlap horizontally with either
// neighbour are considered:
//   '.' with aspect ratio >= tessedit_upper_flip_hyphen is changed to '-'
//       (when '-' is present and enabled in the unicharset) and un-rejected;
//   '.' with aspect ratio >  tessedit_lower_flip_hyphen, if still accepted,
//       is rejected as a suspected hyphen;
//   '-' with aspect ratio >= tessedit_upper_flip_hyphen is un-rejected;
//   '-' with aspect ratio <= tessedit_lower_flip_hyphen, if accepted, is
//       rejected as a suspected hyphen.
// If any unichar was changed the best choice strings are repopulated.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
  WERD_CHOICE *choice = word_res->best_choice;
  PBLOB_IT blob_it;
  TBOX box;
  int left_neighbour_right = -9999;  //right edge of previous blob
  int right_neighbour_left;          //left edge of next blob
  float width_to_height;
  int idx;

  if (tessedit_lower_flip_hyphen <= 1)
    return;

  blob_it.set_to_list(word_res->outword->blob_list());
  UNICHAR_ID dash_id = unicharset.unichar_to_id("-");
  bool changed = false;

  idx = 0;
  blob_it.mark_cycle_pt();
  while (idx < choice->length() && !blob_it.cycled_list()) {
    box = blob_it.data()->bounding_box();
    right_neighbour_left = blob_it.at_last() ?
      9999 : blob_it.data_relative(1)->bounding_box().left();

    // Dont touch small or touching blobs - it is too dangerous.
    if ((box.width() > 8 * word_res->denorm.scale()) &&
        (box.left() > left_neighbour_right) &&
        (box.right() < right_neighbour_left)) {
      width_to_height = box.width() / (float) box.height();
      if (unicharset.eq(choice->unichar_id(idx), ".")) {
        if (width_to_height >= tessedit_upper_flip_hyphen &&
            unicharset.contains_unichar_id(dash_id) &&
            unicharset.get_enabled(dash_id)) {
          /* Certain HYPHEN */
          choice->set_unichar_id(dash_id, idx);
          changed = true;
          if (word_res->reject_map[idx].rejected()) {
            word_res->reject_map[idx].setrej_hyphen_accept();
          }
        }
        if ((width_to_height > tessedit_lower_flip_hyphen) &&
            word_res->reject_map[idx].accepted()) {
          //Suspected HYPHEN
          word_res->reject_map[idx].setrej_hyphen ();
        }
      }
      else if (choice->unichar_id(idx) == dash_id) {
        if ((width_to_height >= tessedit_upper_flip_hyphen) &&
            (word_res->reject_map[idx].rejected())) {
          //Certain HYPHEN
          word_res->reject_map[idx].setrej_hyphen_accept();
        }
        if ((width_to_height <= tessedit_lower_flip_hyphen) &&
            (word_res->reject_map[idx].accepted())) {
          //Suspected HYPHEN
          word_res->reject_map[idx].setrej_hyphen();
        }
      }
    }
    left_neighbour_right = box.right();

    ++idx;
    blob_it.forward();
  }

  if (changed) {
    choice->populate_unichars(unicharset);
  }
}
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
/**
 * flip_0O
 *
 * Rewrites ambiguous "0" (digit zero) / "O" (capital letter) characters in
 * word_res->best_choice according to their neighbours: a character flanked
 * by non-O upper-case letters becomes "O", one flanked by non-zero digits
 * becomes "0", and so on (each rule is labelled in the body).
 *
 * Does nothing when:
 *  - tessedit_flip_0O is off,
 *  - any upper-case letter or digit blob fails the box-height test below
 *    (possible sub/superscript), or
 *  - "0" or "O" is missing from, or disabled in, the unicharset.
 *
 * @param word_res  Word whose best_choice is edited in place; its outword
 *                  blob list is only read for bounding boxes.
 */
void Tesseract::flip_0O(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  PBLOB_IT outword_it;
  TBOX out_box;

  if (!tessedit_flip_0O)
    return;

  outword_it.set_to_list(word_res->outword->blob_list());

  // Walk characters and blobs in step. Every upper-case letter or digit must
  // reach at least baseline + x-height at the top and come down to at most
  // baseline + x-height/4 at the bottom, otherwise the word is left alone.
  for (i = 0, outword_it.mark_cycle_pt();
       i < best_choice->length() && !outword_it.cycled_list();
       ++i, outword_it.forward()) {
    if (unicharset.get_isupper(best_choice->unichar_id(i)) ||
        unicharset.get_isdigit(best_choice->unichar_id(i))) {
      out_box = outword_it.data()->bounding_box();
      if ((out_box.top() < bln_baseline_offset + bln_x_height) ||
          (out_box.bottom() > bln_baseline_offset + bln_x_height / 4))
        return;  // Beware words with sub/superscripts
    }
  }

  UNICHAR_ID unichar_0 = unicharset.unichar_to_id("0");
  UNICHAR_ID unichar_O = unicharset.unichar_to_id("O");
  if (unichar_0 == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_0) ||
      unichar_O == INVALID_UNICHAR_ID || !unicharset.get_enabled(unichar_O)) {
    return;  // 0 or O are not present/enabled in unicharset
  }

  // Set whenever any unichar id is rewritten, so the choice can be
  // repopulated once at the end.
  bool modified = false;

  // Start at 1: every rule looks at the preceding character (i-1).
  // The blob iterator is not needed in this pass, so it is not advanced.
  for (i = 1; i < best_choice->length(); ++i) {
    if (best_choice->unichar_id(i) == unichar_0 ||
        best_choice->unichar_id(i) == unichar_O) {
      /* A0A */
      if ((i + 1) < best_choice->length() &&
          non_O_upper(best_choice->unichar_id(i - 1)) &&
          non_O_upper(best_choice->unichar_id(i + 1))) {
        best_choice->set_unichar_id(unichar_O, i);
        modified = true;
      }
      /* A00A */
      if (non_O_upper(best_choice->unichar_id(i - 1)) &&
          (i + 1) < best_choice->length() &&
          (best_choice->unichar_id(i + 1) == unichar_0 ||
           best_choice->unichar_id(i + 1) == unichar_O) &&
          (i + 2) < best_choice->length() &&
          non_O_upper(best_choice->unichar_id(i + 2))) {
        best_choice->set_unichar_id(unichar_O, i);
        modified = true;
        i++;  // Later rules in this iteration now test the second 0/O.
      }
      /* AA0<non digit or end of word> */
      if ((i > 1) &&
          non_O_upper(best_choice->unichar_id(i - 2)) &&
          non_O_upper(best_choice->unichar_id(i - 1)) &&
          (((i + 1) < best_choice->length() &&
            !unicharset.get_isdigit(best_choice->unichar_id(i + 1)) &&
            !unicharset.eq(best_choice->unichar_id(i + 1), "l") &&
            !unicharset.eq(best_choice->unichar_id(i + 1), "I")) ||
           (i == best_choice->length() - 1))) {
        best_choice->set_unichar_id(unichar_O, i);
        modified = true;
      }
      /* 9O9 */
      if (non_0_digit(best_choice->unichar_id(i - 1)) &&
          (i + 1) < best_choice->length() &&
          non_0_digit(best_choice->unichar_id(i + 1))) {
        best_choice->set_unichar_id(unichar_0, i);
        modified = true;
      }
      /* 9OOO */
      if (non_0_digit(best_choice->unichar_id(i - 1)) &&
          (i + 2) < best_choice->length() &&
          (best_choice->unichar_id(i + 1) == unichar_0 ||
           best_choice->unichar_id(i + 1) == unichar_O) &&
          (best_choice->unichar_id(i + 2) == unichar_0 ||
           best_choice->unichar_id(i + 2) == unichar_O)) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i + 1);
        best_choice->set_unichar_id(unichar_0, i + 2);
        modified = true;
        i += 2;
      }
      /* 9OO<non upper> */
      if (non_0_digit(best_choice->unichar_id(i - 1)) &&
          (i + 2) < best_choice->length() &&
          (best_choice->unichar_id(i + 1) == unichar_0 ||
           best_choice->unichar_id(i + 1) == unichar_O) &&
          !unicharset.get_isupper(best_choice->unichar_id(i + 2))) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i + 1);
        modified = true;
        i++;
      }
      /* 9O<non upper> */
      if (non_0_digit(best_choice->unichar_id(i - 1)) &&
          (i + 1) < best_choice->length() &&
          !unicharset.get_isupper(best_choice->unichar_id(i + 1))) {
        best_choice->set_unichar_id(unichar_0, i);
        // Record the rewrite like every other rule does; without this the
        // final populate_unichars() is skipped when only this rule fires.
        modified = true;
      }
      /* 9[.,]OOO.. */
      if ((i > 1) &&
          (unicharset.eq(best_choice->unichar_id(i - 1), ".") ||
           unicharset.eq(best_choice->unichar_id(i - 1), ",")) &&
          (unicharset.get_isdigit(best_choice->unichar_id(i - 2)) ||
           best_choice->unichar_id(i - 2) == unichar_O)) {
        if (best_choice->unichar_id(i - 2) == unichar_O) {
          best_choice->set_unichar_id(unichar_0, i - 2);
          modified = true;
        }
        // Turn the whole run of 0/O after the separator into zeros.
        while (i < best_choice->length() &&
               (best_choice->unichar_id(i) == unichar_O ||
                best_choice->unichar_id(i) == unichar_0)) {
          best_choice->set_unichar_id(unichar_0, i);
          modified = true;
          i++;
        }
        i--;  // Compensate for the loop's own ++i.
      }
    }
  }

  if (modified) {
    // NOTE(review): presumably regenerates the choice's string form from the
    // edited unichar ids - confirm against WERD_CHOICE.
    best_choice->populate_unichars(unicharset);
  }
}
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
/**
 * non_O_upper
 *
 * @return true when unichar_id is flagged upper-case in the unicharset and
 *         is not the letter "O".
 */
BOOL8 Tesseract::non_O_upper(UNICHAR_ID unichar_id) {
  // Anything that is not upper-case is rejected before the "O" comparison.
  if (!unicharset.get_isupper(unichar_id)) {
    return false;
  }
  return !unicharset.eq(unichar_id, "O");
}
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
/**
 * non_0_digit
 *
 * @return true when unichar_id is flagged as a digit in the unicharset and
 *         is not "0".
 */
BOOL8 Tesseract::non_0_digit(UNICHAR_ID unichar_id) {
  // Anything that is not a digit is rejected before the "0" comparison.
  if (!unicharset.get_isdigit(unichar_id)) {
    return false;
  }
  return !unicharset.eq(unichar_id, "0");
}
|
2009-07-11 10:03:51 +08:00
|
|
|
} // namespace tesseract
|