mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-06 01:07:49 +08:00
793 lines
28 KiB
C++
793 lines
28 KiB
C++
/**********************************************************************
|
|
* File: reject.cpp (Formerly reject.c)
|
|
* Description: Rejection functions used in tessedit
|
|
* Author: Phil Cheatle
|
|
* Created: Wed Sep 23 16:50:21 BST 1992
|
|
*
|
|
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#ifdef _MSC_VER
|
|
#pragma warning(disable:4244) // Conversion warnings
|
|
#pragma warning(disable:4305) // int/float warnings
|
|
#endif
|
|
|
|
#include "tessvars.h"
|
|
#ifdef __UNIX__
|
|
#include <assert.h>
|
|
#include <errno.h>
|
|
#endif
|
|
#include "scanutils.h"
|
|
#include <ctype.h>
|
|
#include <string.h>
|
|
#include "genericvector.h"
|
|
#include "reject.h"
|
|
#include "control.h"
|
|
#include "docqual.h"
|
|
#include "globaloc.h" // For err_exit.
|
|
#include "globals.h"
|
|
#include "helpers.h"
|
|
|
|
#include "tesseractclass.h"
|
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
|
|
// Expands the CLIST helper macros for the STRING type.
// NOTE(review): presumably CLISTIZEH declares and CLISTIZE defines a list
// class holding STRING elements - confirm against the CLIST macro header.
CLISTIZEH (STRING) CLISTIZE (STRING)
|
|
|
|
/*************************************************************************
|
|
* set_done()
|
|
*
|
|
* Set the done flag based on the word acceptability criteria
|
|
*************************************************************************/
|
|
|
|
namespace tesseract {
|
|
// Decides whether the word can be flagged as "done".
//
// The word starts out done only when it was tess_accepted and its best
// choice text contains no blank. That verdict is then withdrawn when:
//  - on pass 1, a word that is ambiguous or not from a dictionary permuter
//    shows a 1/I/l conflict; or
//  - on any pass, the word is dangerously ambiguous, or comes from neither a
//    dictionary permuter nor the number permuter.
//
// word: word result whose done flag is written.
// pass: recognition pass number; the 1/I/l check only runs when it is 1.
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
  // Only look for blanks when the word was accepted in the first place.
  if (word->tess_accepted) {
    word->done =
        (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
  } else {
    word->done = FALSE;
  }

  const bool ambiguous = word->best_choice->dangerous_ambig_found();
  const int permuter = word->best_choice->permuter();
  const bool from_dict = permuter == SYSTEM_DAWG_PERM ||
                         permuter == FREQ_DAWG_PERM ||
                         permuter == USER_DAWG_PERM;
  const bool untrusted = !from_dict || ambiguous;

  // one_ell_conflict() is deliberately the last operand so that it is only
  // called when every cheaper condition already holds.
  if (word->done && (pass == 1) && untrusted &&
      one_ell_conflict(word, FALSE)) {
    if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
    word->done = FALSE;
  }

  const bool non_dict_non_number = !from_dict && permuter != NUMBER_PERM;
  if (word->done && (non_dict_non_number || ambiguous)) {
    if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
    word->done = FALSE;
  }

  if (tessedit_rejection_debug) {
    tprintf("set_done(): done=%d\n", word->done);
    word->best_choice->print("");
  }
}
|
|
|
|
|
|
/*************************************************************************
|
|
* make_reject_map()
|
|
*
|
|
* Sets the done flag to indicate whether the result is acceptable.
|
|
*
|
|
* Sets a reject map for the word.
|
|
*************************************************************************/
|
|
// Builds the reject map for a word and sets its done flag.
//
// Steps, in order: flip 0/O, set the done flag, size the reject map to the
// number of characters in the best choice, reject blanks, apply the
// rejection heuristics selected by tessedit_reject_mode (0 or 5; anything
// else prints an error and calls err_exit()), optionally reject blobs near
// the image edge, and finally flip hyphens.
//
// word: word result to process; its reject_map and done flag are updated.
// row:  not used by this function.
// pass: recognition pass number, forwarded to set_done().
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
  flip_0O(word);
  check_debug_pt(word, -1);  // For trap only
  set_done(word, pass);      // Set acceptance
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);

  if (tessedit_reject_mode == 0) {
    // Mode 0: Ray's original heuristic - the baseline.
    if (!word->done) {
      reject_poor_matches(word);
    }
  } else if (tessedit_reject_mode == 5) {
    // Mode 5: Reject I/1/l from words where there is no strong contextual
    // confirmation; the whole of any unacceptable words (incl PERM rej of
    // dubious 1/I/ls); and the whole of any words which are very small.
    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
      word->reject_map.rej_word_small_xht();
    } else {
      // Called for its effect on the reject map; the result is not needed.
      one_ell_conflict(word, TRUE);

      // The conditions that set the done flag are repeated here, unpacked,
      // so that each mechanism can be switched on or off independently.
      // None of this affects the done flag itself.
      if (rej_use_tess_accepted && !word->tess_accepted) {
        word->reject_map.rej_word_not_tess_accepted();
      }

      if (rej_use_tess_blanks &&
          (strchr(word->best_choice->unichar_string().string(), ' ') !=
           NULL)) {
        word->reject_map.rej_word_contains_blanks();
      }

      WERD_CHOICE *choice = word->best_choice;
      if (rej_use_good_perm) {
        if ((choice->permuter() == SYSTEM_DAWG_PERM ||
             choice->permuter() == FREQ_DAWG_PERM ||
             choice->permuter() == USER_DAWG_PERM) &&
            (!rej_use_sensible_wd ||
             acceptable_word_string(*word->uch_set,
                                    choice->unichar_string().string(),
                                    choice->unichar_lengths().string()) !=
                 AC_UNACCEPTABLE)) {
          // Dictionary word that passed the sensible-word test: keep it.
        } else if (choice->permuter() == NUMBER_PERM) {
          if (rej_alphas_in_number_perm) {
            // Walk the characters, rejecting every accepted alpha.
            int blob = 0;
            int offset = 0;
            while (choice->unichar_string()[offset] != '\0') {
              const int char_len = choice->unichar_lengths()[blob];
              if (word->reject_map[blob].accepted() &&
                  word->uch_set->get_isalpha(
                      choice->unichar_string().string() + offset, char_len)) {
                word->reject_map[blob].setrej_bad_permuter();
              }
              offset += char_len;
              ++blob;
            }
          }
        } else {
          word->reject_map.rej_word_bad_permuter();
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
    err_exit();
  }

  if (tessedit_image_border > -1) {
    reject_edge_blobs(word);
  }

  check_debug_pt(word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter());
    tprintf("Certainty: %f Rating: %f\n",
            word->best_choice->certainty(), word->best_choice->rating());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}
|
|
} // namespace tesseract
|
|
|
|
|
|
// Marks every character of the best choice whose text starts with a space
// as a tess failure in the word's reject map.
//
// The text is walked with two cursors: `blob` indexes the reject map and the
// per-character length table, `offset` indexes the character's first byte in
// the string.
void reject_blanks(WERD_RES *word) {
  inT16 blob = 0;
  inT16 offset = 0;

  while (word->best_choice->unichar_string()[offset] != '\0') {
    if (word->best_choice->unichar_string()[offset] == ' ') {
      // rej unrecognised blobs
      word->reject_map[blob].setrej_tess_failure();
    }
    offset += word->best_choice->unichar_lengths()[blob];
    blob += 1;
  }
}
|
|
|
|
namespace tesseract {
|
|
// Flags every character of the best choice whose first byte belongs to
// conflict_set_I_l_1 as a 1/I/l conflict in the word's reject map.
void Tesseract::reject_I_1_L(WERD_RES *word) {
  // Build the conflict set once instead of once per character.
  STRING conflict_chars = STRING(conflict_set_I_l_1);
  inT16 blob = 0;
  inT16 offset = 0;

  while (word->best_choice->unichar_string()[offset] != '\0') {
    if (conflict_chars.contains(
            word->best_choice->unichar_string()[offset])) {
      // rej 1Il conflict
      word->reject_map[blob].setrej_1Il_conflict();
    }
    offset += word->best_choice->unichar_lengths()[blob];
    blob += 1;
  }
}
|
|
} // namespace tesseract
|
|
|
|
|
|
// Rejects individual characters of the word's best choice.
//
// A space character is marked as a tess failure; any other character whose
// certainty is below the threshold from compute_reject_threshold() is marked
// as a poor match.
void reject_poor_matches(WERD_RES *word) {
  const float threshold = compute_reject_threshold(word->best_choice);
  const int blob_count = word->best_choice->length();

  for (int blob = 0; blob < blob_count; ++blob) {
    if (word->best_choice->unichar_id(blob) == UNICHAR_SPACE) {
      word->reject_map[blob].setrej_tess_failure();
      continue;
    }
    if (word->best_choice->certainty(blob) < threshold) {
      word->reject_map[blob].setrej_poor_match();
    }
  }
}
|
|
|
|
|
|
/**********************************************************************
|
|
* compute_reject_threshold
|
|
*
|
|
* Set a rejection threshold for this word.
|
|
* Initially this is a trivial function which looks for the largest
|
|
* gap in the certainty value.
|
|
**********************************************************************/
|
|
|
|
// Computes the certainty threshold below which a character of `word` is
// rejected.
//
// The per-character certainties are sorted in ascending order. For words
// with at least three characters the threshold is the midpoint of the
// largest gap between neighbouring certainties. For shorter words no gap is
// searched and the threshold is the lowest certainty minus one.
//
// An empty word has no certainties to inspect, so 0.0f is returned without
// touching the (empty) ratings vector.
//
// word: choice whose per-character certainties are examined.
// Returns the rejection threshold.
float compute_reject_threshold(WERD_CHOICE* word) {
  const int blob_count = word->length();
  if (blob_count <= 0) {
    // Nothing to rank; reading ratings[0] below would be out of range.
    return 0.0f;
  }

  GenericVector<float> ratings;
  ratings.init_to_size(blob_count, 0.0f);
  for (int i = 0; i < blob_count; ++i) {
    ratings[i] = word->certainty(i);
  }
  ratings.sort();

  float bestgap = 0.0f;             // biggest gap
  float gapstart = ratings[0] - 1;  // bottom of gap; all reject if none better
  if (blob_count >= 3) {
    for (int index = 0; index < blob_count - 1; index++) {
      const float gap = ratings[index + 1] - ratings[index];
      if (gap > bestgap) {
        // find biggest
        bestgap = gap;
        gapstart = ratings[index];
      }
    }
  }

  return gapstart + bestgap / 2;
}
|
|
|
|
|
|
/*************************************************************************
|
|
* reject_edge_blobs()
|
|
*
|
|
* If the word is perilously close to the edge of the image, reject those blobs
|
|
* in the word which are too close to the edge as they could be clipped.
|
|
*************************************************************************/
|
|
namespace tesseract {
|
|
// Rejects blobs that lie within tessedit_image_border of the image edge.
//
// The word's bounding box is tested first; blobs are only examined when the
// word itself reaches into the border. Each blob whose box reaches into the
// border is marked as an edge character in the reject map.
void Tesseract::reject_edge_blobs(WERD_RES *word) {
  TBOX word_box = word->word->bounding_box();
  // Use the box_word as it is already denormed back to image coordinates.
  const int blobcount = word->box_word->length();

  const bool word_near_edge =
      word_box.left() < tessedit_image_border ||
      word_box.bottom() < tessedit_image_border ||
      word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
      word_box.top() + tessedit_image_border > ImageHeight() - 1;
  if (!word_near_edge) {
    return;
  }

  ASSERT_HOST(word->reject_map.length() == blobcount);
  for (int blobindex = 0; blobindex < blobcount; ++blobindex) {
    TBOX blob_box = word->box_word->BlobBox(blobindex);
    const bool blob_near_edge =
        blob_box.left() < tessedit_image_border ||
        blob_box.bottom() < tessedit_image_border ||
        blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
        blob_box.top() + tessedit_image_border > ImageHeight() - 1;
    if (blob_near_edge) {
      // Close to edge
      word->reject_map[blobindex].setrej_edge_char();
    }
  }
}
|
|
|
|
/**********************************************************************
|
|
* one_ell_conflict()
|
|
*
|
|
* Identify words where there is a potential I/l/1 error.
|
|
* - A bundle of contextual heuristics!
|
|
**********************************************************************/
|
|
// Reports whether the word's best choice has a potential 1/I/l confusion.
//
// word_res:   word to examine. Its best choice text is temporarily edited
//             while alternative spellings are tested (see stage 4).
// update_map: when TRUE, the offending characters are also marked in
//             word_res->reject_map.
// Returns TRUE when a conflict is detected, FALSE otherwise.
//
// Stages, in order:
//  1. No conflict-set character in the text: no conflict.
//  2. No alphanumeric outside the conflict set: conflict on every
//     conflict-set character.
//  3. Word trusted as a dictionary word: conflict only if swapping a leading
//     single-byte 'I'/'l' also gives a dictionary word.
//  4. Otherwise, if swapping a leading 'I'/'l' gives a dictionary word: no
//     conflict, and the swapped character is left in the text.
//  5. Word contains a digit other than '1': conflict on conflict-set
//     characters, sparing '1' when there are no alphas or the number
//     permuter produced the word.
//  6. Anything else is judged by its acceptable-word type.
BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
  WERD_CHOICE *choice = word_res->best_choice;
  const char *word = choice->unichar_string().string();
  const char *lengths = choice->unichar_lengths().string();
  const inT16 word_len = strlen(lengths);

  // Stage 1: if there are no occurrences of the conflict set characters
  // then the word is OK.
  if (strpbrk(word, conflict_set_I_l_1.string()) == NULL) {
    return FALSE;
  }

  // One copy of the conflict set serves every membership test below.
  STRING conflict_chars = STRING(conflict_set_I_l_1);

  // Stage 2: there is a conflict if there are NO other (confirmed)
  // alphanumerics apart from those in the conflict set.
  BOOL8 has_other_alphanum = FALSE;
  inT16 offset = 0;
  for (inT16 i = 0; (i < word_len) && !has_other_alphanum; ++i) {
    const char *ch = word + offset;
    const int ch_len = lengths[i];
    has_other_alphanum =
        (word_res->uch_set->get_isalpha(ch, ch_len) ||
         word_res->uch_set->get_isdigit(ch, ch_len)) &&
        !conflict_chars.contains(*ch);
    offset += ch_len;
  }
  if (!has_other_alphanum) {
    if (update_map) {
      reject_I_1_L(word_res);
    }
    return TRUE;
  }

  // Stage 3: if the word is accepted by a dawg permuter, and the first alpha
  // character is "I" or "l", check to see if the alternative is also a dawg
  // word. If it is, then there is a potential error otherwise the word is ok.
  const int permuter = choice->permuter();
  const bool dict_perm_type =
      (permuter == SYSTEM_DAWG_PERM) ||
      (permuter == USER_DAWG_PERM) ||
      (rej_trust_doc_dawg && (permuter == DOC_DAWG_PERM)) ||
      (permuter == FREQ_DAWG_PERM);
  const int dict_word_type = dict_word(*choice);
  const bool dict_word_ok =
      (dict_word_type > 0) &&
      (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

  // Position of the first alphanumeric. A leading 'I' and 'l' are both
  // alphanumeric, so the swaps below cannot move it and it is computed once.
  const inT16 first_index = first_alphanum_index(word, lengths);
  const inT16 first_offset = first_alphanum_offset(word, lengths);

  if ((rej_1Il_use_dict_word && dict_word_ok) ||
      (rej_1Il_trust_permuter_type && dict_perm_type) ||
      (dict_perm_type && dict_word_ok)) {
    if (lengths[first_index] == 1 &&
        (word[first_offset] == 'I' || word[first_offset] == 'l')) {
      const char original = word[first_offset];
      const char swapped = (original == 'I') ? 'l' : 'I';
      // Try the alternative spelling, then always put the original back.
      word_res->best_choice->unichar_string()[first_offset] = swapped;
      const bool alternative_in_dict = safe_dict_word(word_res) > 0;
      word_res->best_choice->unichar_string()[first_offset] = original;
      if (!alternative_in_dict) {
        return FALSE;
      }
      if (update_map) {
        word_res->reject_map[first_index].setrej_1Il_conflict();
      }
      return TRUE;
    }
    return FALSE;
  }

  // NEW 1Il code. The old code relied on permuter types too much. In fact,
  // tess will use TOP_CHOICE permute for good things like "palette".
  // In this code the string is examined independently to see if it looks
  // like a well formed word.

  // Stage 4: REGARDLESS OF PERMUTER, see if flipping a leading I/l generates
  // a dictionary word.
  if (lengths[first_index] == 1 &&
      (word[first_offset] == 'l' || word[first_offset] == 'I')) {
    const char original = word[first_offset];
    const char swapped = (original == 'l') ? 'I' : 'l';
    word_res->best_choice->unichar_string()[first_offset] = swapped;
    if (safe_dict_word(word_res) > 0) {
      // The swapped character is left in place on this path.
      return FALSE;
    }
    word_res->best_choice->unichar_string()[first_offset] = original;
  }

  // Stage 5: for strings containing digits:
  //   If there are no alphas OR the numeric permuter liked the word,
  //     reject any non 1 conflict chs
  //   Else reject all conflict chs
  if (word_contains_non_1_digit(word, lengths)) {
    const bool allow_1s = (alpha_count(word, lengths) == 0) ||
                          (permuter == NUMBER_PERM);
    BOOL8 conflict = FALSE;
    inT16 blob = 0;
    inT16 pos = 0;
    while (word[pos] != '\0') {
      if ((!allow_1s || (word[pos] != '1')) &&
          conflict_chars.contains(word[pos])) {
        if (update_map) {
          word_res->reject_map[blob].setrej_1Il_conflict();
        }
        conflict = TRUE;
      }
      pos += word_res->best_choice->unichar_lengths()[blob];
      blob += 1;
    }
    return conflict;
  }

  // Stage 6: for anything else, see if it conforms to an acceptable word
  // type. If so, treat accordingly.
  const ACCEPTABLE_WERD_TYPE word_type =
      acceptable_word_string(*word_res->uch_set, word, lengths);
  switch (word_type) {
    case AC_LOWER_CASE:
    case AC_INITIAL_CAP:
      // Only the leading alphanumeric is suspect.
      if (!conflict_chars.contains(word[first_offset])) {
        return FALSE;
      }
      if (update_map) {
        word_res->reject_map[first_index].setrej_1Il_conflict();
      }
      return TRUE;

    case AC_UPPER_CASE:
      return FALSE;

    default:
      if (update_map) {
        reject_I_1_L(word_res);
      }
      return TRUE;
  }
}
|
|
|
|
|
|
// Returns the character index (not the byte offset) of the first character
// in `word` that unicharset classifies as alpha or digit, or -1 when there
// is none.
//
// word:         NUL-terminated text of the word.
// word_lengths: byte length of each successive character of `word`.
inT16 Tesseract::first_alphanum_index(const char *word,
                                      const char *word_lengths) {
  inT16 offset = 0;
  for (inT16 index = 0; word[offset] != '\0'; ++index) {
    const int char_len = word_lengths[index];
    if (unicharset.get_isalpha(word + offset, char_len) ||
        unicharset.get_isdigit(word + offset, char_len)) {
      return index;
    }
    offset += char_len;
  }
  return -1;
}
|
|
|
|
// Returns the byte offset (not the character index) of the first character
// in `word` that unicharset classifies as alpha or digit, or -1 when there
// is none.
//
// word:         NUL-terminated text of the word.
// word_lengths: byte length of each successive character of `word`.
inT16 Tesseract::first_alphanum_offset(const char *word,
                                       const char *word_lengths) {
  inT16 offset = 0;
  for (inT16 index = 0; word[offset] != '\0'; ++index) {
    const int char_len = word_lengths[index];
    if (unicharset.get_isalpha(word + offset, char_len) ||
        unicharset.get_isdigit(word + offset, char_len)) {
      return offset;
    }
    offset += char_len;
  }
  return -1;
}
|
|
|
|
// Counts the characters of `word` that unicharset classifies as alpha.
//
// word:         NUL-terminated text of the word.
// word_lengths: byte length of each successive character of `word`.
inT16 Tesseract::alpha_count(const char *word,
                             const char *word_lengths) {
  inT16 alphas = 0;
  inT16 offset = 0;
  for (inT16 index = 0; word[offset] != '\0'; ++index) {
    const int char_len = word_lengths[index];
    if (unicharset.get_isalpha(word + offset, char_len)) {
      alphas++;
    }
    offset += char_len;
  }
  return alphas;
}
|
|
|
|
|
|
// Returns TRUE when `word` contains a character that unicharset classifies
// as a digit and that is not the single-byte character '1'.
//
// word:         NUL-terminated text of the word.
// word_lengths: byte length of each successive character of `word`.
BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
                                           const char *word_lengths) {
  inT16 offset = 0;
  for (inT16 index = 0; word[offset] != '\0'; ++index) {
    const int char_len = word_lengths[index];
    const bool is_plain_one = (word_lengths[index] == 1) &&
                              (word[offset] == '1');
    if (unicharset.get_isdigit(word + offset, char_len) && !is_plain_one) {
      return TRUE;
    }
    offset += char_len;
  }
  return FALSE;
}
|
|
|
|
/*************************************************************************
|
|
* dont_allow_1Il()
|
|
* Don't unreject LONE accepted 1Il conflict set chars
|
|
*************************************************************************/
|
|
// Rejects accepted 1/I/l characters when they are the only accepted
// alphanumerics in the word.
//
// First pass: return as soon as an accepted alpha or digit outside
// conflict_set_I_l_1 is found, and note whether any accepted conflict-set
// character exists. Second pass (only when one exists): mark every accepted
// conflict-set character with setrej_postNN_1Il().
void Tesseract::dont_allow_1Il(WERD_RES *word) {
  const int word_len = word->reject_map.length();
  const char *s = word->best_choice->unichar_string().string();
  const char *lengths = word->best_choice->unichar_lengths().string();
  // Build the conflict set once instead of once per character.
  STRING conflict_chars = STRING(conflict_set_I_l_1);

  BOOL8 accepted_1Il = FALSE;
  int offset = 0;
  for (int i = 0; i < word_len; ++i) {
    if (word->reject_map[i].accepted()) {
      if (conflict_chars.contains(s[offset])) {
        accepted_1Il = TRUE;
      } else if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
                 word->uch_set->get_isdigit(s + offset, lengths[i])) {
        return;  // >=1 non 1Il ch accepted
      }
    }
    offset += word->best_choice->unichar_lengths()[i];
  }

  if (!accepted_1Il) {
    return;  // Nothing to worry about
  }

  offset = 0;
  for (int i = 0; i < word_len; ++i) {
    if (conflict_chars.contains(s[offset]) &&
        word->reject_map[i].accepted()) {
      word->reject_map[i].setrej_postNN_1Il();
    }
    offset += word->best_choice->unichar_lengths()[i];
  }
}
|
|
|
|
|
|
// Counts the characters of the word that are accepted in the reject map and
// that the word's unicharset classifies as alpha or digit.
inT16 Tesseract::count_alphanums(WERD_RES *word_res) {
  const WERD_CHOICE *best_choice = word_res->best_choice;
  const int map_length = word_res->reject_map.length();
  int alphanums = 0;

  for (int i = 0; i < map_length; ++i) {
    if (!word_res->reject_map[i].accepted()) {
      continue;
    }
    if (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
      ++alphanums;
    }
  }
  return alphanums;
}
|
|
|
|
|
|
// reject all if most rejected.
|
|
// Rejects the whole word when the fraction of rejected characters in its
// reject map is at least rej_whole_of_mostly_reject_word_fract.
void Tesseract::reject_mostly_rejects(WERD_RES *word) {
  const bool mostly_rejected =
      static_cast<float>(word->reject_map.reject_count()) /
          word->reject_map.length() >=
      rej_whole_of_mostly_reject_word_fract;
  if (!mostly_rejected) {
    return;
  }
  word->reject_map.rej_word_mostly_rej();
}
|
|
|
|
|
|
// Returns TRUE when the word is one character repeated, that character is
// in ok_repeated_ch_non_alphanum_wds, and the two quality counts returned by
// word_char_quality() both equal the number of characters in the word.
//
// word: word result to test.
// row:  row passed through to word_char_quality().
BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
  // Words of zero or one character cannot be a repeat.
  if (word->best_choice->unichar_lengths().length() <= 1) {
    return FALSE;
  }

  if (!STRING(ok_repeated_ch_non_alphanum_wds)
           .contains(word->best_choice->unichar_string()[0])) {
    return FALSE;
  }

  // Every character must match the first one.
  const UNICHAR_ID first_id = word->best_choice->unichar_id(0);
  for (int i = 1; i < word->best_choice->length(); ++i) {
    if (word->best_choice->unichar_id(i) != first_id) {
      return FALSE;
    }
  }

  inT16 char_quality;
  inT16 accepted_char_quality;
  word_char_quality(word, row, &char_quality, &accepted_char_quality);

  const bool all_good =
      (word->best_choice->unichar_lengths().length() == char_quality) &&
      (char_quality == accepted_char_quality);
  return all_good ? TRUE : FALSE;
}
|
|
|
|
// Looks up the word's best choice with dict_word() on the word's own
// Tesseract instance and returns the resulting type, except that
// DOC_DAWG_PERM is reported as 0.
inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
  const int dict_word_type =
      werd_res->tesseract->dict_word(*werd_res->best_choice);
  if (dict_word_type == DOC_DAWG_PERM) {
    return 0;
  }
  return dict_word_type;
}
|
|
|
|
// Note: After running this function word_res->ratings
|
|
// might not contain the right BLOB_CHOICE corresponding to each character
|
|
// in word_res->best_choice.
|
|
// Adjusts "." and "-" characters according to the width/height ratio of
// their blobs. Does nothing when tessedit_lower_flip_hyphen <= 1.
//
// Only blobs wider than 8 * denorm.x_scale() that do not touch or overlap
// their horizontal neighbours are considered. For such a blob:
//  - "." with ratio >= tessedit_upper_flip_hyphen becomes "-" (when "-" is
//    present and enabled in the unicharset) and a rejection is lifted;
//    "." with ratio > tessedit_lower_flip_hyphen that is accepted is
//    rejected as a suspected hyphen.
//  - "-" with ratio >= tessedit_upper_flip_hyphen has a rejection lifted;
//    "-" with ratio <= tessedit_lower_flip_hyphen that is accepted is
//    rejected as a suspected hyphen.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;

  if (tessedit_lower_flip_hyphen <= 1) {
    return;
  }

  const int num_blobs = word_res->rebuild_word->NumBlobs();
  const UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
  int prev_right = -9999;

  for (int i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBOX out_box = word_res->rebuild_word->blobs[i]->bounding_box();
    // The last blob has no right-hand neighbour to collide with.
    int next_left = 9999;
    if (i + 1 != num_blobs) {
      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
    }

    // Don't touch small or touching blobs - it is too dangerous.
    const bool wide_and_isolated =
        (out_box.width() > 8 * word_res->denorm.x_scale()) &&
        (out_box.left() > prev_right) && (out_box.right() < next_left);

    if (wide_and_isolated) {
      const float aspect_ratio =
          out_box.width() / static_cast<float>(out_box.height());

      if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
            word_res->uch_set->contains_unichar_id(unichar_dash) &&
            word_res->uch_set->get_enabled(unichar_dash)) {
          /* Certain HYPHEN */
          best_choice->set_unichar_id(unichar_dash, i);
          if (word_res->reject_map[i].rejected()) {
            word_res->reject_map[i].setrej_hyphen_accept();
          }
        }
        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
            word_res->reject_map[i].accepted()) {
          // Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen();
        }
      } else if (best_choice->unichar_id(i) == unichar_dash) {
        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
            (word_res->reject_map[i].rejected())) {
          // Certain HYPHEN
          word_res->reject_map[i].setrej_hyphen_accept();
        }
        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
            (word_res->reject_map[i].accepted())) {
          // Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen();
        }
      }
    }
    prev_right = out_box.right();
  }
}
|
|
|
|
// Note: After running this function word_res->ratings
|
|
// might not contain the right BLOB_CHOICE corresponding to each character
|
|
// in word_res->best_choice.
|
|
// Rewrites "0" (zero) and "O" (capital oh) in the best choice according to
// the characters around them. Does nothing when tessedit_flip_0O is off,
// when an upper-case or digit blob looks like a sub/superscript, or when
// "0" or "O" is missing from or disabled in the unicharset.
//
// The patterns are applied in a fixed order to each 0/O found from index 1
// onwards. Several of them advance the index `i` themselves, and later
// patterns in the same iteration see the advanced index, so the order of
// the tests below matters.
void Tesseract::flip_0O(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;

  if (!tessedit_flip_0O) {
    return;
  }

  const int num_blobs = word_res->rebuild_word->NumBlobs();
  const int len = best_choice->length();

  // Beware words with sub/superscripts: give up if any upper-case or digit
  // blob does not span the expected vertical range.
  for (int b = 0; b < len && b < num_blobs; ++b) {
    TBLOB* blob = word_res->rebuild_word->blobs[b];
    if (!word_res->uch_set->get_isupper(best_choice->unichar_id(b)) &&
        !word_res->uch_set->get_isdigit(best_choice->unichar_id(b))) {
      continue;
    }
    TBOX out_box = blob->bounding_box();
    if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
        (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
      return;
    }
  }

  const UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
  const UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
  if (unichar_0 == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_0) ||
      unichar_O == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_O)) {
    return;  // 0 or O are not present/enabled in unicharset
  }

  for (int i = 1; i < len; ++i) {
    if (best_choice->unichar_id(i) != unichar_0 &&
        best_choice->unichar_id(i) != unichar_O) {
      continue;
    }

    /* A0A */
    if ((i + 1) < len &&
        non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
        non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
      best_choice->set_unichar_id(unichar_O, i);
    }

    /* A00A */
    if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
        (i + 1) < len &&
        (best_choice->unichar_id(i + 1) == unichar_0 ||
         best_choice->unichar_id(i + 1) == unichar_O) &&
        (i + 2) < len &&
        non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
      best_choice->set_unichar_id(unichar_O, i);
      i++;
    }

    /* AA0<non digit or end of word> */
    if ((i > 1) &&
        non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
        non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
        (((i + 1) < len &&
          !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
          !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
          !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
         (i == len - 1))) {
      best_choice->set_unichar_id(unichar_O, i);
    }

    /* 9O9 */
    if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
        (i + 1) < len &&
        non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
      best_choice->set_unichar_id(unichar_0, i);
    }

    /* 9OOO */
    if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
        (i + 2) < len &&
        (best_choice->unichar_id(i + 1) == unichar_0 ||
         best_choice->unichar_id(i + 1) == unichar_O) &&
        (best_choice->unichar_id(i + 2) == unichar_0 ||
         best_choice->unichar_id(i + 2) == unichar_O)) {
      best_choice->set_unichar_id(unichar_0, i);
      best_choice->set_unichar_id(unichar_0, i + 1);
      best_choice->set_unichar_id(unichar_0, i + 2);
      i += 2;
    }

    /* 9OO<non upper> */
    if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
        (i + 2) < len &&
        (best_choice->unichar_id(i + 1) == unichar_0 ||
         best_choice->unichar_id(i + 1) == unichar_O) &&
        !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
      best_choice->set_unichar_id(unichar_0, i);
      best_choice->set_unichar_id(unichar_0, i + 1);
      i++;
    }

    /* 9O<non upper> */
    if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
        (i + 1) < len &&
        !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
      best_choice->set_unichar_id(unichar_0, i);
    }

    /* 9[.,]OOO.. */
    if ((i > 1) &&
        (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
         word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
        (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
         best_choice->unichar_id(i - 2) == unichar_O)) {
      if (best_choice->unichar_id(i - 2) == unichar_O) {
        best_choice->set_unichar_id(unichar_0, i - 2);
      }
      // Turn the whole run of 0/O after the separator into zeros.
      while (i < len &&
             (best_choice->unichar_id(i) == unichar_O ||
              best_choice->unichar_id(i) == unichar_0)) {
        best_choice->set_unichar_id(unichar_0, i);
        i++;
      }
      // Step back so the loop increment lands just after the run.
      i--;
    }
  }
}
|
|
|
|
// Returns TRUE when unichar_id is upper case in ch_set and is not "O".
BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
  if (!ch_set.get_isupper(unichar_id)) {
    return FALSE;
  }
  return !ch_set.eq(unichar_id, "O");
}
|
|
|
|
// Returns TRUE when unichar_id is a digit in ch_set and is not "0".
BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
  if (!ch_set.get_isdigit(unichar_id)) {
    return FALSE;
  }
  return !ch_set.eq(unichar_id, "0");
}
|
|
} // namespace tesseract
|