tesseract/ccmain/docqual.cpp

1072 lines
35 KiB
C++

/******************************************************************
* File: docqual.cpp (Formerly docqual.c)
* Description: Document Quality Metrics
* Author: Phil Cheatle
* Created: Mon May 9 11:27:28 BST 1994
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#include "mfcpch.h"
#include <ctype.h>
#include "docqual.h"
#include "tfacep.h"
#include "reject.h"
#include "tesscallback.h"
#include "tessvars.h"
#include "secname.h"
#include "globals.h"
#include "tesseractclass.h"
namespace tesseract{
// Holds the per-word state needed by the blob-matching callbacks below; the
// callback mechanism offers no pre-bound arguments, so the word and the
// running counters live here instead.
struct DocQualCallbacks {
  explicit DocQualCallbacks(WERD_RES* word0)
      : word(word0), match_count(0), accepted_match_count(0) {}

  // Tallies one matched blob; the index is not needed.
  void CountMatchingBlobs(int index) {
    match_count++;
  }

  // Tallies one matched blob and, separately, whether the reject_map entry
  // at index is currently accepted.
  void CountAcceptedBlobs(int index) {
    match_count++;
    if (word->reject_map[index].accepted())
      accepted_match_count++;
  }

  // Marks the reject_map entry at index as quality-accepted when the entry
  // itself reports accept_if_good_quality().
  void AcceptIfGoodQuality(int index) {
    if (!word->reject_map[index].accept_if_good_quality())
      return;
    word->reject_map[index].setrej_quality_accept();
  }

  WERD_RES* word;              // Word whose reject_map is consulted/updated.
  inT16 match_count;           // Number of callback invocations counted.
  inT16 accepted_match_count;  // Of those, how many were accepted entries.
};
/*************************************************************************
 * word_blob_quality()
 * How many blobs in the box_word are identical to those of the inword?
 * ASSUME blobs in both initial word and box_word are in ascending order of
 * left hand blob edge.
 *
 * Returns 0 when the word has no bln_boxes, no rebuild_word or no blobs;
 * otherwise the number of times ProcessMatchedBlobs ran the counting
 * callback. The row argument is not used.
 *************************************************************************/
inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
  const bool has_blob_data = word->bln_boxes != NULL &&
                             word->rebuild_word != NULL &&
                             word->rebuild_word->blobs != NULL;
  if (!has_blob_data)
    return 0;

  DocQualCallbacks counter(word);
  word->bln_boxes->ProcessMatchedBlobs(
      *word->rebuild_word,
      NewPermanentTessCallback(&counter,
                               &DocQualCallbacks::CountMatchingBlobs));
  return counter.match_count;
}
// Sums count_outline_errs() over the blobs of word->rebuild_word, pairing
// the i-th blob with element i of the best choice's unichar_string.
// Returns 0 when the word has no rebuild_word, matching the guards used by
// the other quality metrics in this file.
// NOTE(review): unichar_string()[i] is indexed by blob number; if a unichar
// can occupy more than one char of that string this may not be the blob's
// own character - confirm.
inT16 Tesseract::word_outline_errs(WERD_RES *word) {
  // Without this guard a word that was never rebuilt would be dereferenced
  // through a NULL pointer.
  if (word->rebuild_word == NULL)
    return 0;

  inT16 i = 0;
  inT16 err_count = 0;
  for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
       blob = blob->next) {
    err_count += count_outline_errs(word->best_choice->unichar_string()[i],
                                    blob->NumOutlines());
    i++;
  }
  return err_count;
}
/*************************************************************************
 * word_char_quality()
 * Combination of blob quality and outline quality - how many good chars are
 * there? - I.e chars which pass the blob AND outline tests.
 *
 * Outputs:
 *   *match_count          - number of matched blobs reported by
 *                           ProcessMatchedBlobs.
 *   *accepted_match_count - of those, how many have an accepted reject_map
 *                           entry.
 * Both outputs are set to 0 when the word has no bln_boxes, no rebuild_word
 * or no blobs, so callers never read a stale or uninitialised value.
 * The row argument is not used.
 *************************************************************************/
void Tesseract::word_char_quality(WERD_RES *word,
                                  ROW *row,
                                  inT16 *match_count,
                                  inT16 *accepted_match_count) {
  if (word->bln_boxes == NULL ||
      word->rebuild_word == NULL || word->rebuild_word->blobs == NULL) {
    // Nothing to match against: report zero rather than leaving the
    // caller's variables untouched.
    *match_count = 0;
    *accepted_match_count = 0;
    return;
  }
  DocQualCallbacks cb(word);
  word->bln_boxes->ProcessMatchedBlobs(
      *word->rebuild_word,
      NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
  *match_count = cb.match_count;
  *accepted_match_count = cb.accepted_match_count;
}
/*************************************************************************
 * unrej_good_chs()
 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks.
 *
 * Does nothing when the word has no bln_boxes, no rebuild_word or no blobs.
 * Otherwise every blob matched by ProcessMatchedBlobs has its reject_map
 * entry passed to DocQualCallbacks::AcceptIfGoodQuality.
 * The row argument is not used.
 *************************************************************************/
void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
  const bool has_blob_data = word->bln_boxes != NULL &&
                             word->rebuild_word != NULL &&
                             word->rebuild_word->blobs != NULL;
  if (!has_blob_data)
    return;

  DocQualCallbacks acceptor(word);
  word->bln_boxes->ProcessMatchedBlobs(
      *word->rebuild_word,
      NewPermanentTessCallback(&acceptor,
                               &DocQualCallbacks::AcceptIfGoodQuality));
}
// Returns how far outline_count is from the number of outlines expected for
// character c: 0 for any character listed in outlines_odd (not checked),
// otherwise the absolute difference from 2 (characters in outlines_2) or
// from 1 (everything else).
inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
  if (STRING(outlines_odd).contains(c))
    return 0;  // Dont use this char

  const int expected_outline_count = STRING(outlines_2).contains(c) ? 2 : 1;
  return abs(outline_count - expected_outline_count);
}
// Runs the quality-driven rejection passes over the page in a fixed order:
//   1. unrej_good_quality_words(), only when tessedit_good_quality_unrej is
//      set and the caller says the document is of good quality;
//   2. doc_and_block_rejection(), always;
//   3. tilde_crunch() followed by tilde_delete(), only when
//      unlv_tilde_crunching is set.
void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
                                        BOOL8 good_quality_doc) {
  const bool unreject_first = tessedit_good_quality_unrej && good_quality_doc;
  if (unreject_first)
    unrej_good_quality_words(page_res_it);

  doc_and_block_rejection(page_res_it, good_quality_doc);

  if (!unlv_tilde_crunching)
    return;
  tilde_crunch(page_res_it);
  tilde_delete(page_res_it);
}
/*************************************************************************
 * unrej_good_quality_words()
 * Accept potential rejects in words which pass the following checks:
 *    - Contains a potential reject
 *    - Word looks like a sensible alpha word.
 *    - Word segmentation is the same as the original image
 *    - All characters have the expected number of outlines
 * NOTE - the rejection counts are recalculated after unrejection
 *      - CANT do it in a single pass without a bit of fiddling
 *      - keep it simple but inefficient
 *
 * With bland_unrej set, every reject_map entry that reports
 * accept_if_good_quality() is accepted without any further checks.
 *************************************************************************/
void Tesseract::unrej_good_quality_words(  // unreject potential
    PAGE_RES_IT &page_res_it) {
  // Pass 1: walk every word and unreject where allowed.
  page_res_it.restart_page();
  while (page_res_it.word() != NULL) {
    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
      WERD_RES *word = page_res_it.word();
      for (int ch = 0; ch < word->reject_map.length(); ++ch) {
        if (word->reject_map[ch].accept_if_good_quality())
          word->reject_map[ch].setrej_quality_accept();
      }
      page_res_it.forward();
    } else if ((page_res_it.row()->char_count > 0) &&
               ((page_res_it.row()->rej_count /
                 static_cast<float>(page_res_it.row()->char_count)) <=
                quality_rowrej_pc)) {
      // Row reject fraction is within quality_rowrej_pc: consider the word.
      WERD_RES *word = page_res_it.word();
      if (word->reject_map.quality_recoverable_rejects() &&
          (tessedit_unrej_any_wd ||
           acceptable_word_string(
               word->best_choice->unichar_string().string(),
               word->best_choice->unichar_lengths().string()) !=
               AC_UNACCEPTABLE)) {
        unrej_good_chs(word, page_res_it.row()->row);
      }
      page_res_it.forward();
    } else {
      // Skip to end of dodgy row
      ROW_RES *dodgy_row = page_res_it.row();
      while ((page_res_it.word() != NULL) &&
             (page_res_it.row() == dodgy_row)) {
        page_res_it.forward();
      }
    }
    check_debug_pt(page_res_it.word(), 110);
  }

  // Pass 2: zero the page/block/row counters and rebuild them word by word.
  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  BLOCK_RES *current_block = NULL;
  ROW_RES *current_row = NULL;
  for (; page_res_it.word() != NULL; page_res_it.forward()) {
    if (page_res_it.block() != current_block) {
      current_block = page_res_it.block();
      current_block->char_count = 0;
      current_block->rej_count = 0;
    }
    if (page_res_it.row() != current_row) {
      current_row = page_res_it.row();
      current_row->char_count = 0;
      current_row->rej_count = 0;
      current_row->whole_word_rej_count = 0;
    }
    page_res_it.rej_stat_word();
  }
}
/*************************************************************************
* doc_and_block_rejection()
*
* If the page has too many rejects - reject all of it.
* If any block has too many rejects - reject all words in the block
* Otherwise, within each block, test every row and reject the words of rows
* that have too many rejects.
*
* The tests use the char_count / rej_count / whole_word_rej_count values
* already stored on the page, block and row results; this function does not
* recompute them.
*
* page_res_it      - iterator over the page; restarted and advanced here.
* good_quality_doc - only consulted in the row-rejection path, where together
*                    with tessedit_row_rej_good_docs it selects a per-word
*                    reject-fraction test instead of rejecting outright.
*************************************************************************/
void Tesseract::doc_and_block_rejection( //reject big chunks
PAGE_RES_IT &page_res_it,
BOOL8 good_quality_doc) {
inT16 block_no = 0;
// Counter of rows visited within the current block; used only in debug
// output.
inT16 row_no = 0;
BLOCK_RES *current_block;
ROW_RES *current_row;
BOOL8 rej_word;
BOOL8 prev_word_rejected;
inT16 char_quality = 0;
// Out-param required by word_char_quality; its value is not read here.
inT16 accepted_char_quality;
// Page test: percentage of rejected chars against
// tessedit_reject_doc_percent.
// NOTE(review): unlike the block and row tests below, char_count is not
// checked for > 0 here; the division is done in floating point (100.0).
if ((page_res_it.page_res->rej_count * 100.0 /
page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
reject_whole_page(page_res_it);
#ifndef SECURE_NAMES
if (tessedit_debug_doc_rejection) {
tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
page_res_it.page_res->char_count,
page_res_it.page_res->rej_count);
}
#endif
}
else {
#ifndef SECURE_NAMES
if (tessedit_debug_doc_rejection)
tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
page_res_it.page_res->char_count,
page_res_it.page_res->rej_count);
#endif
/* Walk blocks testing for block rejection */
page_res_it.restart_page ();
while (page_res_it.word () != NULL) {
// Remember the block so the inner loops can tell when the iterator has
// moved past it.
current_block = page_res_it.block();
block_no = current_block->block->index();
// Block test: percentage of rejected chars in the block against
// tessedit_reject_block_percent.
if ((page_res_it.block ()->char_count > 0) &&
((page_res_it.block ()->rej_count * 100.0 /
page_res_it.block ()->char_count) >
tessedit_reject_block_percent)) {
#ifndef SECURE_NAMES
if (tessedit_debug_block_rejection)
tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
block_no,
page_res_it.block ()->char_count,
page_res_it.block ()->rej_count);
#endif
prev_word_rejected = FALSE;
while ((page_res_it.word () != NULL) &&
(page_res_it.block () == current_block)) {
if (tessedit_preserve_blk_rej_perfect_wds) {
// Preservation enabled: a word is a candidate for rejection only if
// it already has rejects or is shorter than
// tessedit_preserve_min_wd_len.
rej_word =
(page_res_it.word ()->reject_map.reject_count () > 0)
|| (page_res_it.word ()->reject_map.length () <
tessedit_preserve_min_wd_len);
// Second chance for long-enough words with an acceptable string:
// keep the word only if char_quality equals the reject_map length.
if (rej_word && tessedit_dont_blkrej_good_wds
&& !(page_res_it.word ()->reject_map.length () <
tessedit_preserve_min_wd_len)
&&
(acceptable_word_string
(page_res_it.word()->best_choice->unichar_string().string(),
page_res_it.word ()->best_choice->unichar_lengths().string()) !=
AC_UNACCEPTABLE)) {
word_char_quality (page_res_it.word (),
page_res_it.row ()->row,
&char_quality,
&accepted_char_quality);
rej_word = char_quality !=
page_res_it.word ()->reject_map.length ();
}
}
else
// No preservation: every word in the block is rejected.
rej_word = TRUE;
if (rej_word) {
/*
Reject spacing if both current and prev words are rejected.
NOTE - this is NOT restricted to FUZZY spaces. - When tried this
generated more space errors.
*/
if (tessedit_use_reject_spaces &&
prev_word_rejected &&
(page_res_it.prev_row () == page_res_it.row ()) &&
(page_res_it.word ()->word->space () == 1))
page_res_it.word ()->reject_spaces = TRUE;
page_res_it.word ()->reject_map.rej_word_block_rej ();
}
prev_word_rejected = rej_word;
page_res_it.forward ();
}
}
else {
#ifndef SECURE_NAMES
if (tessedit_debug_block_rejection)
tprintf
("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
block_no, page_res_it.block ()->char_count,
page_res_it.block ()->rej_count);
#endif
/* Walk rows in block testing for row rejection */
row_no = 0;
while ((page_res_it.word () != NULL) &&
(page_res_it.block () == current_block)) {
current_row = page_res_it.row ();
row_no++;
/* Reject whole row if:
fraction of chars on row which are rejected exceed a limit AND
fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
limit
*/
if ((page_res_it.row ()->char_count > 0) &&
((page_res_it.row ()->rej_count * 100.0 /
page_res_it.row ()->char_count) >
tessedit_reject_row_percent) &&
((page_res_it.row ()->whole_word_rej_count * 100.0 /
page_res_it.row ()->rej_count) <
tessedit_whole_wd_rej_row_percent)) {
#ifndef SECURE_NAMES
if (tessedit_debug_block_rejection)
tprintf
("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
row_no, page_res_it.row ()->char_count,
page_res_it.row ()->rej_count);
#endif
prev_word_rejected = FALSE;
while ((page_res_it.word () != NULL) &&
(page_res_it.row () == current_row)) {
/* Preserve words on good docs unless they are mostly rejected*/
if (!tessedit_row_rej_good_docs && good_quality_doc) {
// Reject only when the word's rejected fraction exceeds
// tessedit_good_doc_still_rowrej_wd.
rej_word =
page_res_it.word ()->reject_map.
reject_count () /
(float) page_res_it.word ()->reject_map.
length () > tessedit_good_doc_still_rowrej_wd;
}
/* Preserve perfect words anyway */
else if (tessedit_preserve_row_rej_perfect_wds) {
// Same two-step test as in the block-rejection path above, using
// the row-specific tessedit_dont_rowrej_good_wds switch.
rej_word =
(page_res_it.word ()->reject_map.
reject_count () > 0)
|| (page_res_it.word ()->reject_map.
length () < tessedit_preserve_min_wd_len);
if (rej_word && tessedit_dont_rowrej_good_wds
&& !(page_res_it.word ()->reject_map.
length () <
tessedit_preserve_min_wd_len)
&&
(acceptable_word_string
(page_res_it.word ()->best_choice->
unichar_string().string(),
page_res_it.word ()->best_choice->
unichar_lengths().string()) != AC_UNACCEPTABLE)) {
word_char_quality (page_res_it.word (),
page_res_it.row ()->row,
&char_quality,
&accepted_char_quality);
rej_word = char_quality !=
page_res_it.word ()->reject_map.length ();
}
}
else
// No preservation: every word in the row is rejected.
rej_word = TRUE;
if (rej_word) {
/*
Reject spacing if both current and prev words are rejected.
NOTE - this is NOT restricted to FUZZY spaces. - When tried
this generated more space errors.
*/
if (tessedit_use_reject_spaces &&
prev_word_rejected &&
(page_res_it.prev_row () ==
page_res_it.row ())
&& (page_res_it.word ()->word->space () ==
1))
page_res_it.word ()->reject_spaces = TRUE;
page_res_it.word ()->reject_map.
rej_word_row_rej();
}
prev_word_rejected = rej_word;
page_res_it.forward ();
}
}
else {
#ifndef SECURE_NAMES
if (tessedit_debug_block_rejection)
tprintf
("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
row_no, page_res_it.row ()->char_count,
page_res_it.row ()->rej_count);
#endif
// Row is kept: step over its words without changing them.
while ((page_res_it.word () != NULL) &&
(page_res_it.row () == current_row))
page_res_it.forward ();
}
}
}
}
}
}
} // namespace tesseract
/*************************************************************************
 * reject_whole_page()
 * Dont believe any of it - set the reject map to 00..00 in all words
 *
 * Calls rej_word_doc_rej() on the reject map of every word reachable through
 * page_res_it (restarting the iterator first), then flags the page result
 * itself as rejected.
 *************************************************************************/
void reject_whole_page(PAGE_RES_IT &page_res_it) {
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    page_res_it.word()->reject_map.rej_word_doc_rej();
  }
  // whole page is rejected
  page_res_it.page_res->rejected = TRUE;
}
namespace tesseract {
// Walks every word on the page and sets unlv_crunch_mode = CR_KEEP_SPACE on
// words judged to be garbage. Only words with no accepted characters are
// considered.
//   - A "terrible" word is crunched immediately, together with any run of
//     "potential" words recorded just before it.
//   - A "potential" word is crunched immediately if it follows a terrible
//     word; otherwise the start of the run is remembered in case a terrible
//     word turns up later.
//   - Any other word (or one with accepted characters) ends both the
//     remembered run and the "after a terrible word" state.
// When the corresponding crunch_early_* switches are set, each word is first
// passed through convert_bad_unlv_chs() and merge_tess_fails().
void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;             // First word of the potential run.
  BOOL8 potential_run_open = FALSE;     // run_start_it is meaningful.
  BOOL8 after_terrible_word = FALSE;    // Previous candidate was terrible.

  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();

    if (crunch_early_convert_bad_unlv_chs)
      convert_bad_unlv_chs(word);
    if (crunch_early_merge_tess_fails)
      merge_tess_fails(word);

    if (word->reject_map.accept_count() != 0) {
      after_terrible_word = FALSE;
      // Forget earlier potential crunches
      potential_run_open = FALSE;
      continue;
    }

    const BOOL8 ok_dict_word = safe_dict_word(*(word->best_choice));
    const GARBAGE_LEVEL garbage_level = garbage_word(word, ok_dict_word);
    const bool may_crunch = garbage_level != G_NEVER_CRUNCH;

    if (may_crunch && terrible_word_crunch(word, garbage_level)) {
      if (crunch_debug > 0) {
        tprintf("T CRUNCHING: \"%s\"\n",
                word->best_choice->unichar_string().string());
      }
      word->unlv_crunch_mode = CR_KEEP_SPACE;
      if (potential_run_open) {
        // Crunch the remembered run of potential words leading up to here.
        for (; run_start_it.word() != word; run_start_it.forward()) {
          if (crunch_debug > 0) {
            tprintf("P1 CRUNCHING: \"%s\"\n",
                    run_start_it.word()->best_choice->
                        unichar_string().string());
          }
          run_start_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        potential_run_open = FALSE;
      }
      after_terrible_word = TRUE;
    } else if (may_crunch &&
               potential_word_crunch(word, garbage_level, ok_dict_word)) {
      if (after_terrible_word) {
        if (crunch_debug > 0) {
          tprintf("P2 CRUNCHING: \"%s\"\n",
                  word->best_choice->unichar_string().string());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
      } else if (!potential_run_open) {
        run_start_it = page_res_it;
        potential_run_open = TRUE;
        if (crunch_debug > 1) {
          tprintf("P3 CRUNCHING: \"%s\"\n",
                  word->best_choice->unichar_string().string());
        }
      }
    } else {
      after_terrible_word = FALSE;
      // Forget earlier potential crunches
      potential_run_open = FALSE;
      if (crunch_debug > 2) {
        tprintf("NO CRUNCH: \"%s\"\n",
                word->best_choice->unichar_string().string());
      }
    }
  }
}
// Decides whether word is bad enough to be crunched outright.
// Returns TRUE when one of the following holds (the number is the mode shown
// in the debug output):
//   1 - the best choice string is empty or consists only of spaces;
//   2 - rating per character exceeds crunch_terrible_rating;
//   3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE;
//   4 - certainty is below crunch_poor_garbage_cert and garbage_level is
//       not G_OK;
//   5 - rating per character exceeds crunch_poor_garbage_rate and
//       garbage_level is not G_OK.
// The character count used for the per-character rating is the reject_map
// length capped at crunch_rating_max.
BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
                                      GARBAGE_LEVEL garbage_level) {
  int crunch_mode = 0;

  const bool blank_word =
      (word->best_choice->unichar_string().length() == 0) ||
      (strspn(word->best_choice->unichar_string().string(), " ") ==
       word->best_choice->unichar_string().length());

  if (blank_word) {
    crunch_mode = 1;
  } else {
    int adjusted_len = word->reject_map.length();
    if (adjusted_len > crunch_rating_max)
      adjusted_len = crunch_rating_max;
    const float rating_per_ch = word->best_choice->rating() / adjusted_len;
    const bool not_ok = garbage_level != G_OK;

    if (rating_per_ch > crunch_terrible_rating) {
      crunch_mode = 2;
    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
      crunch_mode = 3;
    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
               not_ok) {
      crunch_mode = 4;
    } else if ((rating_per_ch > crunch_poor_garbage_rate) && not_ok) {
      crunch_mode = 5;
    }
  }

  if (crunch_mode <= 0)
    return FALSE;

  if (crunch_debug > 2) {
    tprintf("Terrible_word_crunch (%d) on \"%s\"\n",
            crunch_mode, word->best_choice->unichar_string().string());
  }
  return TRUE;
}
// Counts up to three "poor word" indicators and returns whether the count
// reaches crunch_pot_indicators:
//   - rating per character (reject_map length capped at 10) above
//     crunch_pot_poor_rate;
//   - certainty below crunch_pot_poor_cert, but only for words considered
//     crunchable;
//   - garbage_level other than G_OK.
// A word is crunchable unless crunch_leave_accept_strings is set, it has at
// least 3 reject_map entries, and it is either an acceptable word string or
// an ok dictionary word.
BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
                                       GARBAGE_LEVEL garbage_level,
                                       BOOL8 ok_dict_word) {
  const char *str = word->best_choice->unichar_string().string();
  const char *lengths = word->best_choice->unichar_lengths().string();

  BOOL8 word_crunchable = TRUE;
  if (crunch_leave_accept_strings && !(word->reject_map.length() < 3)) {
    word_crunchable =
        (acceptable_word_string(str, lengths) == AC_UNACCEPTABLE) &&
        !ok_dict_word;
  }

  const int map_len = word->reject_map.length();
  const int adjusted_len = (map_len > 10) ? 10 : map_len;
  const float rating_per_ch = word->best_choice->rating() / adjusted_len;

  int poor_indicator_count = 0;

  if (rating_per_ch > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf("Potential poor rating on \"%s\"\n",
              word->best_choice->unichar_string().string());
    }
    ++poor_indicator_count;
  }

  if (word_crunchable &&
      (word->best_choice->certainty() < crunch_pot_poor_cert)) {
    if (crunch_debug > 2) {
      tprintf("Potential poor cert on \"%s\"\n",
              word->best_choice->unichar_string().string());
    }
    ++poor_indicator_count;
  }

  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf("Potential garbage on \"%s\"\n",
              word->best_choice->unichar_string().string());
    }
    ++poor_indicator_count;
  }

  return (poor_indicator_count >= crunch_pot_indicators);
}
} // namespace tesseract
namespace tesseract {
// Walks every word on the page and replaces unlv_crunch_mode with the mode
// returned by word_deletable() for deletable words at the ends of lines:
//   - from a W_BOL word onwards, for as long as consecutive words remain
//     deletable;
//   - at a W_EOL word, together with the run of deletable words remembered
//     immediately before it.
// A non-deletable word ends both the beginning-of-line run and any
// remembered run. When crunch_early_merge_tess_fails is off,
// merge_tess_fails() is applied to each word after it has been examined.
void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;            // First word of the remembered run.
  BOOL8 deleting_from_bol = FALSE;
  BOOL8 marked_delete_point = FALSE;   // run_start_it is meaningful.
  inT16 debug_delete_mode;
  inT16 x_debug_delete_mode;

  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();
    const CRUNCH_MODE delete_mode = word_deletable(word, debug_delete_mode);

    if (delete_mode == CR_NONE) {
      deleting_from_bol = FALSE;
      // Forget earlier potential crunches
      marked_delete_point = FALSE;
    } else if (word->word->flag(W_BOL) || deleting_from_bol) {
      if (crunch_debug > 0) {
        tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n",
                debug_delete_mode,
                word->best_choice->unichar_string().string());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = TRUE;
    } else if (word->word->flag(W_EOL)) {
      if (marked_delete_point) {
        // Apply the deletion to every remembered word before this one.
        for (; run_start_it.word() != word; run_start_it.forward()) {
          const CRUNCH_MODE x_delete_mode =
              word_deletable(run_start_it.word(), x_debug_delete_mode);
          if (crunch_debug > 0) {
            tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
                    x_debug_delete_mode,
                    run_start_it.word()->best_choice->
                        unichar_string().string());
          }
          run_start_it.word()->unlv_crunch_mode = x_delete_mode;
        }
      }
      if (crunch_debug > 0) {
        tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
                debug_delete_mode,
                word->best_choice->unichar_string().string());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = FALSE;
      marked_delete_point = FALSE;
    } else if (!marked_delete_point) {
      // Mid-line deletable word: remember where the run begins.
      run_start_it = page_res_it;
      marked_delete_point = TRUE;
    }

    /*
      The following step has been left till now as the tess fails are used to
      determine if the word is deletable.
    */
    if (!crunch_early_merge_tess_fails)
      merge_tess_fails(word);
  }
}
// Rewrites two characters in the word's best choice: "~" becomes "-" and
// "^" becomes " ". Any rewritten position whose reject_map entry was
// accepted is marked with setrej_unlv_rej(). If anything changed, the best
// choice's unichars are repopulated from the unicharset.
void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  const UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-");
  const UNICHAR_ID unichar_space = unicharset.unichar_to_id(" ");
  const UNICHAR_ID unichar_tilde = unicharset.unichar_to_id("~");
  const UNICHAR_ID unichar_pow = unicharset.unichar_to_id("^");

  // Substitutions are tried in this order for every position. The second is
  // tested after the first has been applied, exactly as two consecutive
  // independent tests would be.
  const UNICHAR_ID from_ids[2] = { unichar_tilde, unichar_pow };
  const UNICHAR_ID to_ids[2] = { unichar_dash, unichar_space };

  bool modified = false;
  for (int i = 0; i < word_res->reject_map.length(); ++i) {
    for (int sub = 0; sub < 2; ++sub) {
      if (word_res->best_choice->unichar_id(i) != from_ids[sub])
        continue;
      word_res->best_choice->set_unichar_id(to_ids[sub], i);
      modified = true;
      if (word_res->reject_map[i].accepted())
        word_res->reject_map[i].setrej_unlv_rej();
    }
  }

  if (!modified)
    return;
  word_res->best_choice->populate_unichars(unicharset);
}
// Callback helper for merge_tess_fails: yields the space unichar id when
// both arguments are the space id, and INVALID_UNICHAR_ID in every other
// case.
UNICHAR_ID Tesseract::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
  if (id1 != id2)
    return INVALID_UNICHAR_ID;
  if (id1 != unicharset.unichar_to_id(" "))
    return INVALID_UNICHAR_ID;
  return id1;
}
// Change pairs of tess failures to a single one.
// Delegates to ConditionalBlobMerge with BothSpaces as the merge test. If it
// reports that something was merged, the lengths of the best choice, reject
// map and box word are asserted to agree (and printed first when
// crunch_debug is on).
void Tesseract::merge_tess_fails(WERD_RES *word_res) {
  const bool merged = word_res->ConditionalBlobMerge(
      unicharset,
      NewPermanentTessCallback(this, &Tesseract::BothSpaces), NULL,
      word_res->best_choice->blob_choices());
  if (!merged)
    return;

  if (crunch_debug) {
    tprintf("Post:bc len=%d, rejmap=%d, boxword=%d, chopword=%d,"
            " rebuild=%d\n",
            word_res->best_choice->length(),
            word_res->reject_map.length(),
            word_res->box_word->length(),
            word_res->chopped_word->NumBlobs(),
            word_res->rebuild_word->NumBlobs());
  }
  const int len = word_res->best_choice->length();
  ASSERT_HOST(word_res->reject_map.length() == len);
  ASSERT_HOST(word_res->box_word->length() == len);
}
// Classifies the best choice string of word as G_NEVER_CRUNCH, G_OK,
// G_DODGY or G_TERRIBLE.
//
// The string is scanned one unichar at a time (byte lengths come from
// unichar_lengths) with a small state machine that tracks runs of upper
// case, lower case and digits. The scan collects:
//   - isolated digits / alphas (a single digit or letter not followed by
//     another of the same class);
//   - tess rejects (single-byte ' ') and other non-alphanumeric characters;
//   - the longest upper/lower case runs and the longest repetition of one
//     letter within a same-case run.
// The verdict is then derived from those counts, the word's permuter,
// acceptable_word_string() and ok_dict_word.
//
// The scan uses its own cursors so that the checks after the loop still see
// the whole word. Advancing the original pointers would leave them on the
// terminating NUL, making acceptable_word_string() and strpbrk() below
// operate on empty strings.
GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
  enum STATES
  {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  // Start of the word and of its per-unichar byte lengths; never advanced.
  const char *const str = word->best_choice->unichar_string().string();
  const char *const lengths = word->best_choice->unichar_lengths().string();
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  // ch walks the unichars, ch_len walks the matching byte lengths.
  for (const char *ch = str, *ch_len = lengths; *ch != '\0';
       ch += *(ch_len++)) {
    len++;
    if (unicharset.get_isupper(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
        case FIRST_UPPER:
          state = SUBSEQUENT_UPPER;
          upper_string_count++;
          if (longest_upper_run_len < upper_string_count)
            longest_upper_run_len = upper_string_count;
          if (last_char == unicharset.unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          }
          else {
            last_char = unicharset.unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // Intentional fall through: a new upper case run starts here.
        default:
          state = FIRST_UPPER;
          last_char = unicharset.unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          upper_string_count = 1;
          break;
      }
    }
    else if (unicharset.get_islower(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
        case FIRST_LOWER:
          state = SUBSEQUENT_LOWER;
          lower_string_count++;
          if (longest_lower_run_len < lower_string_count)
            longest_lower_run_len = lower_string_count;
          if (last_char == unicharset.unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          }
          else {
            last_char = unicharset.unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // Intentional fall through: a new lower case run starts here.
        default:
          state = FIRST_LOWER;
          last_char = unicharset.unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          lower_string_count = 1;
          break;
      }
    }
    else if (unicharset.get_isdigit(ch, *ch_len)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          // Intentional fall through.
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // Intentional fall through: a new digit run starts here.
        default:
          state = FIRST_NUM;
          break;
      }
    }
    else {
      if (*ch_len == 1 && *ch == ' ')
        tess_rejs++;
      else
        bad_char_count++;
      switch (state) {
        case FIRST_NUM:
          isolated_digits++;
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // Intentional fall through.
        default:
          break;
      }
      state = JUNK;
    }
  }

  // Account for a run of length one that ends the word.
  switch (state) {
    case FIRST_NUM:
      isolated_digits++;
      break;
    case FIRST_UPPER:
    case FIRST_LOWER:
      isolated_alphas++;
      // Intentional fall through.
    default:
      break;
  }

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  if (crunch_leave_ok_strings &&
      (len >= 4) &&
      (2 * (total_alpha_count - isolated_alphas) > len) &&
      (longest_alpha_repetition_count < crunch_long_repetitions)) {
    if ((crunch_accept_ok &&
         (acceptable_word_string(str, lengths) != AC_UNACCEPTABLE)) ||
        (longest_lower_run_len > crunch_leave_lc_strings) ||
        (longest_upper_run_len > crunch_leave_uc_strings))
      return G_NEVER_CRUNCH;
  }
  // A multi-character word containing no space byte is OK if it came from
  // one of the listed permuters, is an acceptable word string, or is an ok
  // dictionary word.
  if ((word->reject_map.length() > 1) &&
      (strpbrk(str, " ") == NULL) &&
      ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
       (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
       (word->best_choice->permuter() == USER_DAWG_PERM) ||
       (word->best_choice->permuter() == NUMBER_PERM) ||
       (acceptable_word_string(str, lengths) != AC_UNACCEPTABLE) ||
       ok_dict_word))
    return G_OK;

  ok_chars = len - bad_char_count - isolated_digits -
    isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf("garbage_word: \"%s\"\n",
            word->best_choice->unichar_string().string());
    tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
            len,
            bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
  }
  if ((bad_char_count == 0) &&
      (tess_rejs == 0) &&
      ((len > isolated_digits + isolated_alphas) || (len <= 2)))
    return G_OK;

  if ((tess_rejs > ok_chars) ||
      ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
    return G_TERRIBLE;

  if (len > 4) {
    dodgy_chars = 2 * tess_rejs + bad_char_count +
      isolated_digits + isolated_alphas;
    if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
      return G_DODGY;
    else
      return G_OK;
  }
  else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if (((len == 4) && (dodgy_chars > 2)) ||
        ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
      return G_DODGY;
    else
      return G_OK;
  }
}
/*************************************************************************
 * word_deletable()
 * Decides whether a crunched word may be deleted, and how.
 *
 * Returns CR_NONE (delete_mode 0) if the word is not crunched
 * (unlv_crunch_mode == CR_NONE) or if none of the tests below fires.
 * Otherwise the first test that fires, in this order, decides the result;
 * delete_mode receives the number shown, for debug output:
 *   CR_DELETE
 *      1  reject_map is empty
 *      4  box height < crunch_del_min_ht * kBlnXHeight
 *      5  noise_outlines() reports the word as noise
 *   CR_LOOSE_SPACE
 *      2  failure_count * 1.5 > number of chars
 *      7  certainty < crunch_del_cert
 *      8  rating per char > crunch_del_rating
 *      9  box top < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight
 *     10  box bottom > kBlnBaselineOffset +
 *                      crunch_del_high_word * kBlnXHeight
 *     11  box height > crunch_del_max_ht * kBlnXHeight
 *      3  box width < crunch_del_min_width * kBlnXHeight
 * The box is the bounding box of word->rebuild_word.
 *************************************************************************/
CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) {
  const int word_len = word->reject_map.length();

  if (word->unlv_crunch_mode == CR_NONE) {
    delete_mode = 0;
    return CR_NONE;
  }
  if (word_len == 0) {
    delete_mode = 1;
    return CR_DELETE;
  }

  const TBOX box = word->rebuild_word->bounding_box();  // BB of word
  const float rating_per_ch = word->best_choice->rating() / word_len;

  CRUNCH_MODE verdict = CR_LOOSE_SPACE;
  if (box.height() < crunch_del_min_ht * kBlnXHeight) {
    delete_mode = 4;
    verdict = CR_DELETE;
  } else if (noise_outlines(word->rebuild_word)) {
    delete_mode = 5;
    verdict = CR_DELETE;
  } else if ((failure_count(word) * 1.5) > word_len) {
    delete_mode = 2;
  } else if (word->best_choice->certainty() < crunch_del_cert) {
    delete_mode = 7;
  } else if (rating_per_ch > crunch_del_rating) {
    delete_mode = 8;
  } else if (box.top() <
             kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    delete_mode = 9;
  } else if (box.bottom() >
             kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    delete_mode = 10;
  } else if (box.height() > crunch_del_max_ht * kBlnXHeight) {
    delete_mode = 11;
  } else if (box.width() < crunch_del_min_width * kBlnXHeight) {
    delete_mode = 3;
  } else {
    delete_mode = 0;
    verdict = CR_NONE;
  }
  return verdict;
}
// Returns the number of ' ' characters in the word's best choice string.
// Elsewhere in this file such spaces are counted as tess rejects.
inT16 Tesseract::failure_count(WERD_RES *word) {
  int tess_rejs = 0;
  const char *cursor = word->best_choice->unichar_string().string();
  while (*cursor != '\0') {
    if (*cursor == ' ')
      ++tess_rejs;
    ++cursor;
  }
  return tess_rejs;
}
// Returns TRUE when every outline of every blob in word is "small", i.e. the
// longer side of its bounding box is below
// kBlnXHeight * crunch_small_outlines_size. A word with no outlines at all
// also yields TRUE, since the small count then equals the total.
BOOL8 Tesseract::noise_outlines(TWERD *word) {
  const float small_limit = kBlnXHeight * crunch_small_outlines_size;
  inT16 outline_count = 0;
  inT16 small_outline_count = 0;

  for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
    for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
      ++outline_count;
      const TBOX box = ol->bounding_box();  // BB of outline
      const inT16 max_dimension =
          (box.height() > box.width()) ? box.height() : box.width();
      if (max_dimension < small_limit)
        ++small_outline_count;
    }
  }
  return (small_outline_count >= outline_count);
}
} // namespace tesseract