tesseract/ccmain/adaptions.cpp
2010-05-28 12:03:45 +00:00

1133 lines
37 KiB
C++

/**********************************************************************
* File: adaptions.cpp (Formerly adaptions.c)
* Description: Functions used to adapt to blobs already confidently
* identified
* Author: Chris Newton
* Created: Thu Oct 7 10:17:28 BST 1993
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include "mfcpch.h"
#ifdef __UNIX__
#include <assert.h>
#endif
#include <ctype.h>
#include <string.h>
#include "tessbox.h"
#include "tessvars.h"
#include "memry.h"
#include "mainblk.h"
#include "charcut.h"
#include "imgs.h"
#include "scaleimg.h"
#include "reject.h"
#include "control.h"
#include "adaptions.h"
#include "stopper.h"
#include "charsample.h"
#include "matmatch.h"
#include "secname.h"
#include "tesseractclass.h"
inT32 demo_word = 0;
#define WINDOWNAMESIZE 13 /*max size of name */
#define EXTERN
EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");
EXTERN double_VAR (tessedit_cluster_t1, 0.20,
"t1 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t2, 0.40,
"t2 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t3, 0.12,
"Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
"Largest fraction of characters in cluster for it to be used for adaption");
EXTERN INT_VAR (tessedit_cluster_min_size, 3,
"Smallest number of samples in a cluster for it to be used for adaption");
EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
"Generate and print debug information for adaption by clustering");
EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
"Use best sample from cluster when adapting");
EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
"Set reject map to enable cluster input to be measured");
EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
"Don't try to adapt to characters on this list");
EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
"Characters to be avoided when adapting");
EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
"Use prototypes when adapting");
EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
"Use prototypes as clusters are built");
EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
"Adapt to characters using reject map");
EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
"Adapt to all characters using, matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
"Only match samples against clusters for the same character");
EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");
EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
"Display cut images and matrix match for demo purposes");
EXTERN INT_VAR (tessedit_demo_word1, 62,
"Word number of first word to display");
EXTERN INT_VAR (tessedit_demo_word2, 64,
"Word number of second word to display");
EXTERN STRING_VAR (tessedit_demo_file, "academe",
"Name of document containing demo words");
EXTERN BOOL_VAR(tessedit_adapt_to_char_fragments, TRUE,
"Adapt to words that contain "
" a character composed form fragments");
namespace tesseract {
BOOL8 Tesseract::word_adaptable( //should we adapt?
WERD_RES *word,
uinT16 mode) {
if (tessedit_adaption_debug) {
tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
word->best_choice == NULL ? "" :
word->best_choice->unichar_string().string(),
word->best_choice->rating(), word->best_choice->certainty());
}
BOOL8 status = FALSE;
BITS16 flags(mode);
enum MODES
{
ADAPTABLE_WERD,
ACCEPTABLE_WERD,
CHECK_DAWGS,
CHECK_SPACES,
CHECK_ONE_ELL_CONFLICT,
CHECK_AMBIG_WERD
};
/*
0: NO adaption
*/
if (mode == 0) {
if (tessedit_adaption_debug) tprintf("adaption disabled\n");
return FALSE;
}
if (flags.bit (ADAPTABLE_WERD)) {
status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
if (tessedit_adaption_debug && !status) {
tprintf("tess_would_adapt bit is false\n");
}
}
if (flags.bit (ACCEPTABLE_WERD)) {
status |= word->tess_accepted;
if (tessedit_adaption_debug && !status) {
tprintf("tess_accepted bit is false\n");
}
}
if (!status) { // If not set then
return FALSE; // ignore other checks
}
if (flags.bit (CHECK_DAWGS) &&
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
(word->best_choice->permuter () != USER_DAWG_PERM) &&
(word->best_choice->permuter () != NUMBER_PERM)) {
if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
return FALSE;
}
if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
return FALSE;
}
if (flags.bit (CHECK_SPACES) &&
(strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
if (tessedit_adaption_debug) tprintf("word contains spaces\n");
return FALSE;
}
// if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
if (flags.bit (CHECK_AMBIG_WERD) &&
!getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
return FALSE;
}
// Do not adapt to words that are composed from fragments if
// tessedit_adapt_to_char_fragments is false.
if (!tessedit_adapt_to_char_fragments) {
const char *fragment_lengths = word->best_choice->fragment_lengths();
if (fragment_lengths != NULL && *fragment_lengths != '\0') {
for (int i = 0; i < word->best_choice->length(); ++i) {
if (fragment_lengths[i] > 1) {
if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
return false; // found a character composed from fragments
}
}
}
}
if (tessedit_adaption_debug) {
tprintf("returning status %d\n", status);
}
return status;
}
void Tesseract::collect_ems_for_adaption(WERD_RES *word,
CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting) {
PBLOB_LIST *blobs = word->outword->blob_list ();
PBLOB_IT blob_it(blobs);
inT16 i;
CHAR_SAMPLE *sample;
PIXROW_LIST *pixrow_list;
PIXROW_IT pixrow_it;
IMAGELINE *imlines; // lines of the image
TBOX pix_box; // box of imlines
// extent
WERD copy_outword; // copy to denorm
PBLOB_IT copy_blob_it;
OUTLINE_IT copy_outline_it;
inT32 resolution = page_image.get_res ();
if (tessedit_reject_ems || tessedit_reject_suspect_ems)
return; // Do nothing
if (word->word->bounding_box ().height () > resolution / 3)
return;
if (tessedit_demo_adaption)
// Make sure not set
tessedit_display_mm.set_value (FALSE);
if (word_adaptable (word, tessedit_em_adaption_mode)
&& word->reject_map.reject_count () == 0
&& (strchr (word->best_choice->unichar_string().string (), 'm') != NULL
|| (tessedit_process_rns
&& strstr (word->best_choice->unichar_string().string (),
"rn") != NULL))) {
if (tessedit_process_rns
&& strstr (word->best_choice->unichar_string().string (), "rn") != NULL) {
copy_outword = *(word->outword);
copy_blob_it.set_to_list (copy_outword.blob_list ());
i = 0;
while (word->best_choice->unichar_string()[i] != '\0') {
if (word->best_choice->unichar_string()[i] == 'r'
&& word->best_choice->unichar_string()[i + 1] == 'n') {
copy_outline_it.set_to_list (copy_blob_it.data ()->
out_list ());
copy_outline_it.add_list_after (copy_blob_it.
data_relative (1)->
out_list ());
copy_blob_it.forward ();
delete (copy_blob_it.extract ());
i++;
}
copy_blob_it.forward ();
i++;
}
}
else
copy_outword = *(word->outword);
copy_outword.baseline_denormalise (&word->denorm);
char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
pixrow_it.set_to_list (pixrow_list);
pixrow_it.move_to_first ();
blob_it.move_to_first ();
for (i = 0;
word->best_choice->unichar_string()[i] != '\0';
i++, pixrow_it.forward (), blob_it.forward ()) {
if (word->best_choice->unichar_string()[i] == 'm'
|| (word->best_choice->unichar_string()[i] == 'r'
&& word->best_choice->unichar_string()[i + 1] == 'n')) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample %c for adaption found in %s, index %d\n",
word->best_choice->unichar_string()[i],
word->best_choice->unichar_string().string (), i);
#endif
if (tessedit_matrix_match) {
sample = clip_sample (pixrow_it.data (),
imlines,
pix_box,
copy_outword.flag (W_INVERSE),
word->best_choice->unichar_string()[i]);
if (sample == NULL) { //Clip failed
#ifndef SECURE_NAMES
tprintf ("Unable to clip sample from %s, index %d\n",
word->best_choice->unichar_string().string (), i);
#endif
if (word->best_choice->unichar_string()[i] == 'r')
i++;
continue;
}
}
else
sample = new CHAR_SAMPLE (blob_it.data (),
&word->denorm,
word->best_choice->unichar_string()[i]);
cluster_sample(sample, char_clusters, chars_waiting);
if (word->best_choice->unichar_string()[i] == 'r')
i++; // Skip next character
}
}
delete[]imlines; // Free array of imlines
delete pixrow_list;
}
}
void Tesseract::collect_characters_for_adaption(
WERD_RES *word,
CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting) {
PBLOB_LIST *blobs = word->outword->blob_list ();
PBLOB_IT blob_it(blobs);
inT16 i;
CHAR_SAMPLE *sample;
PIXROW_LIST *pixrow_list;
PIXROW_IT pixrow_it;
IMAGELINE *imlines; // lines of the image
TBOX pix_box; // box of imlines
// extent
WERD copy_outword; // copy to denorm
inT32 resolution = page_image.get_res ();
if (word->word->bounding_box ().height () > resolution / 3)
return;
if (tessedit_demo_adaption)
// Make sure not set
tessedit_display_mm.set_value (FALSE);
if ((word_adaptable (word, tessedit_cluster_adaption_mode)
&& word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
return; // Reject map set to acceptable
/* Collect information about good matches */
copy_outword = *(word->outword);
copy_outword.baseline_denormalise (&word->denorm);
char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
pixrow_it.set_to_list (pixrow_list);
pixrow_it.move_to_first ();
blob_it.move_to_first ();
for (i = 0;
word->best_choice->unichar_string()[i] != '\0';
i++, pixrow_it.forward (), blob_it.forward ()) {
if (!(tessedit_mm_use_non_adaption_set
&& STRING(tessedit_non_adaption_set).contains(
word->best_choice->unichar_string()[i]))
|| (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample %c for adaption found in %s, index %d\n",
word->best_choice->unichar_string()[i],
word->best_choice->unichar_string().string (), i);
#endif
sample = clip_sample (pixrow_it.data (),
imlines,
pix_box,
copy_outword.flag (W_INVERSE),
word->best_choice->unichar_string()[i]);
if (sample == NULL) { //Clip failed
#ifndef SECURE_NAMES
tprintf ("Unable to clip sample from %s, index %d\n",
word->best_choice->unichar_string().string (), i);
#endif
continue;
}
cluster_sample(sample, char_clusters, chars_waiting);
}
}
delete[]imlines; // Free array of imlines
delete pixrow_list;
}
else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
// Set word to all rejects
word->reject_map.rej_word_tess_failure ();
}
void Tesseract::cluster_sample(CHAR_SAMPLE *sample,
CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting) {
CHAR_SAMPLES *best_cluster = NULL;
CHAR_SAMPLES_IT c_it = char_clusters;
CHAR_SAMPLE_IT cw_it = chars_waiting;
float score;
float best_score = MAX_INT32;
if (c_it.empty ())
c_it.add_to_end (new CHAR_SAMPLES (sample));
else {
for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
score = c_it.data ()->match_score (sample, this);
if (score < best_score) {
best_score = score;
best_cluster = c_it.data ();
}
}
if (tessedit_cluster_debug)
tprintf ("Sample's best score %f\n", best_score);
if (best_score < tessedit_cluster_t1) {
if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
best_cluster->add_sample (sample, this);
check_wait_list(chars_waiting, sample, best_cluster);
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample added to an existing cluster\n");
#endif
}
else {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf
("Sample dropped, good match to an existing cluster\n");
#endif
}
}
else if (best_score > tessedit_cluster_t2) {
c_it.add_to_end (new CHAR_SAMPLES (sample));
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("New cluster created for this sample\n");
#endif
}
else {
cw_it.add_to_end (sample);
if (tessedit_cluster_debug)
tprintf ("Sample added to the wait list\n");
}
}
}
void Tesseract::check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
CHAR_SAMPLE *sample,
CHAR_SAMPLES *best_cluster) {
CHAR_SAMPLE *wait_sample;
CHAR_SAMPLE *test_sample = sample;
CHAR_SAMPLE_IT cw_it = chars_waiting;
CHAR_SAMPLE_LIST add_list; //Samples added to best cluster
CHAR_SAMPLE_IT add_it = &add_list;
float score;
add_list.clear ();
if (!cw_it.empty ()) {
do {
if (!add_list.empty ()) {
add_it.forward ();
test_sample = add_it.extract ();
best_cluster->add_sample (test_sample, this);
}
for (cw_it.mark_cycle_pt ();
!cw_it.cycled_list (); cw_it.forward ()) {
wait_sample = cw_it.data ();
if (tessedit_mm_use_prototypes)
score = best_cluster->match_score (wait_sample, this);
else
score = sample->match_sample (wait_sample, FALSE, this);
if (score < tessedit_cluster_t1) {
if (score > tessedit_cluster_t3
|| tessedit_mm_use_prototypes) {
add_it.add_after_stay_put (cw_it.extract ());
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf
("Wait sample added to an existing cluster\n");
#endif
}
else {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf
("Wait sample dropped, good match to an existing cluster\n");
#endif
}
}
}
}
while (!add_list.empty ());
}
}
void Tesseract::complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting) {
CHAR_SAMPLES *best_cluster;
CHAR_SAMPLES_IT c_it = char_clusters;
CHAR_SAMPLE_IT cw_it = chars_waiting;
CHAR_SAMPLE *sample;
inT32 total_sample_count = 0;
while (!cw_it.empty ()) {
cw_it.move_to_first ();
sample = cw_it.extract ();
best_cluster = new CHAR_SAMPLES (sample);
c_it.add_to_end (best_cluster);
check_wait_list(chars_waiting, sample, best_cluster);
}
for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
c_it.data ()->assign_to_char ();
if (tessedit_use_best_sample)
c_it.data ()->find_best_sample ();
else if (tessedit_mm_adapt_using_prototypes)
c_it.data ()->build_prototype ();
if (tessedit_cluster_debug)
total_sample_count += c_it.data ()->n_samples ();
}
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Clustering completed, %d samples in all\n", total_sample_count);
#endif
#ifndef GRAPHICS_DISABLED
if (tessedit_demo_adaption)
display_cluster_prototypes(char_clusters);
#endif
}
void Tesseract::adapt_to_good_ems(WERD_RES *word,
CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting) {
PBLOB_LIST *blobs = word->outword->blob_list ();
PBLOB_IT blob_it(blobs);
inT16 i;
CHAR_SAMPLE *sample;
CHAR_SAMPLES_IT c_it = char_clusters;
CHAR_SAMPLE_IT cw_it = chars_waiting;
float score;
float best_score;
char best_char;
CHAR_SAMPLES *best_cluster;
PIXROW_LIST *pixrow_list;
PIXROW_IT pixrow_it;
IMAGELINE *imlines; // lines of the image
TBOX pix_box; // box of imlines
// extent
WERD copy_outword; // copy to denorm
TBOX b_box;
PBLOB_IT copy_blob_it;
OUTLINE_IT copy_outline_it;
PIXROW *pixrow = NULL;
static inT32 word_number = 0;
#ifndef GRAPHICS_DISABLED
ScrollView* demo_win = NULL;
#endif
inT32 resolution = page_image.get_res ();
if (word->word->bounding_box ().height () > resolution / 3)
return;
word_number++;
if (strchr (word->best_choice->unichar_string().string (), 'm') == NULL
&& (tessedit_process_rns
&& strstr (word->best_choice->unichar_string().string (), "rn") == NULL))
return;
if (tessedit_reject_ems)
reject_all_ems(word);
else if (tessedit_reject_suspect_ems)
reject_suspect_ems(word);
else {
if (char_clusters->length () == 0) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("No clusters to use for em adaption\n");
#endif
return;
}
if (!cw_it.empty ()) {
complete_clustering(char_clusters, chars_waiting);
print_em_stats(char_clusters, chars_waiting);
}
if ((!word_adaptable (word, tessedit_em_adaption_mode) ||
word->reject_map.reject_count () != 0)
&& (strchr (word->best_choice->unichar_string().string (), 'm') != NULL
|| (tessedit_process_rns
&& strstr (word->best_choice->unichar_string().string (),
"rn") != NULL))) {
if (tessedit_process_rns
&& strstr (word->best_choice->unichar_string().string (),
"rn") != NULL) {
copy_outword = *(word->outword);
copy_blob_it.set_to_list (copy_outword.blob_list ());
i = 0;
while (word->best_choice->unichar_string()[i] != '\0') {
if (word->best_choice->unichar_string()[i] == 'r'
&& word->best_choice->unichar_string()[i + 1] == 'n') {
copy_outline_it.set_to_list (copy_blob_it.data ()->
out_list ());
copy_outline_it.add_list_after (copy_blob_it.
data_relative (1)->
out_list ());
copy_blob_it.forward ();
delete (copy_blob_it.extract ());
i++;
}
copy_blob_it.forward ();
i++;
}
}
else
copy_outword = *(word->outword);
copy_outword.baseline_denormalise (&word->denorm);
copy_blob_it.set_to_list (copy_outword.blob_list ());
char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
pixrow_it.set_to_list (pixrow_list);
pixrow_it.move_to_first ();
// For debugging only
b_box = copy_outword.bounding_box ();
pixrow = pixrow_it.data ();
blob_it.move_to_first ();
copy_blob_it.move_to_first ();
for (i = 0;
word->best_choice->unichar_string()[i] != '\0';
i++, pixrow_it.forward (), blob_it.forward (),
copy_blob_it.forward ()) {
if ((word->best_choice->unichar_string()[i] == 'm'
|| (word->best_choice->unichar_string()[i] == 'r'
&& word->best_choice->unichar_string()[i + 1] == 'n'))
&& !word->reject_map[i].perm_rejected ()) {
if (tessedit_cluster_debug)
tprintf ("Sample %c to check found in %s, index %d\n",
word->best_choice->unichar_string()[i],
word->best_choice->unichar_string().string (), i);
if (tessedit_demo_adaption)
tprintf
("Sample %c to check found in %s (%d), index %d\n",
word->best_choice->unichar_string()[i],
word->best_choice->unichar_string().string (), word_number,
i);
if (tessedit_matrix_match) {
TBOX copy_box = copy_blob_it.data ()->bounding_box ();
sample = clip_sample (pixrow_it.data (),
imlines,
pix_box,
copy_outword.flag (W_INVERSE),
word->best_choice->unichar_string()[i]);
//Clip failed
if (sample == NULL) {
tprintf
("Unable to clip sample from %s, index %d\n",
word->best_choice->unichar_string().string (), i);
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample rejected (no sample)\n");
#endif
word->reject_map[i].setrej_mm_reject ();
if (word->best_choice->unichar_string()[i] == 'r') {
word->reject_map[i + 1].setrej_mm_reject ();
i++;
}
continue;
}
}
else
sample = new CHAR_SAMPLE(blob_it.data(),
&word->denorm,
word->best_choice->unichar_string()[i]);
best_score = MAX_INT32;
best_char = '\0';
best_cluster = NULL;
for (c_it.mark_cycle_pt ();
!c_it.cycled_list (); c_it.forward ()) {
if (c_it.data ()->character () != '\0') {
score = c_it.data ()->match_score (sample, this);
if (score < best_score) {
best_cluster = c_it.data ();
best_score = score;
best_char = c_it.data ()->character ();
}
}
}
if (best_score > tessedit_cluster_t1) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample rejected (score %f)\n", best_score);
if (tessedit_demo_adaption)
tprintf ("Sample rejected (score %f)\n", best_score);
#endif
word->reject_map[i].setrej_mm_reject ();
if (word->best_choice->unichar_string()[i] == 'r')
word->reject_map[i + 1].setrej_mm_reject ();
}
else {
if (word->best_choice->unichar_string()[i] == best_char) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample accepted (score %f)\n",
best_score);
if (tessedit_demo_adaption)
tprintf ("Sample accepted (score %f)\n",
best_score);
#endif
word->reject_map[i].setrej_mm_accept ();
if (word->best_choice->unichar_string()[i] == 'r')
word->reject_map[i + 1].setrej_mm_accept ();
}
else {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample rejected (char %c, score %f)\n",
best_char, best_score);
if (tessedit_demo_adaption)
tprintf ("Sample rejected (char %c, score %f)\n",
best_char, best_score);
#endif
word->reject_map[i].setrej_mm_reject ();
if (word->best_choice->unichar_string()[i] == 'r')
word->reject_map[i + 1].setrej_mm_reject ();
}
}
if (tessedit_demo_adaption) {
if (strcmp (imagebasename.string (),
tessedit_demo_file.string ()) != 0
|| word_number == tessedit_demo_word1
|| word_number == tessedit_demo_word2) {
#ifndef GRAPHICS_DISABLED
demo_win =
display_clip_image(&copy_outword,
page_image,
pixrow_list,
pix_box);
#endif
demo_word = word_number;
best_cluster->match_score (sample, this);
demo_word = 0;
}
}
if (word->best_choice->unichar_string()[i] == 'r')
i++; // Skip next character
}
}
delete[]imlines; // Free array of imlines
delete pixrow_list;
}
}
}
void Tesseract::adapt_to_good_samples(WERD_RES *word,
CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting) {
PBLOB_LIST *blobs = word->outword->blob_list ();
PBLOB_IT blob_it(blobs);
inT16 i;
CHAR_SAMPLE *sample;
CHAR_SAMPLES_IT c_it = char_clusters;
CHAR_SAMPLE_IT cw_it = chars_waiting;
float score;
float best_score;
char best_char;
CHAR_SAMPLES *best_cluster;
PIXROW_LIST *pixrow_list;
PIXROW_IT pixrow_it;
IMAGELINE *imlines; // lines of the image
TBOX pix_box; // box of imlines
// extent
WERD copy_outword; // copy to denorm
TBOX b_box;
PBLOB_IT copy_blob_it;
PIXROW *pixrow = NULL;
static inT32 word_number = 0;
#ifndef GRAPHICS_DISABLED
ScrollView* demo_win = NULL;
#endif
inT32 resolution = page_image.get_res ();
word_number++;
if (tessedit_test_cluster_input)
return;
if (word->word->bounding_box ().height () > resolution / 3)
return;
if (char_clusters->length () == 0) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("No clusters to use for adaption\n");
#endif
return;
}
if (!cw_it.empty ()) {
complete_clustering(char_clusters, chars_waiting);
print_em_stats(char_clusters, chars_waiting);
}
if ((!word_adaptable (word, tessedit_cluster_adaption_mode)
&& word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) {
if (tessedit_cluster_debug) {
tprintf ("\nChecking: \"%s\" MAP ",
word->best_choice->unichar_string().string ());
word->reject_map.print (debug_fp);
tprintf ("\n");
}
copy_outword = *(word->outword);
copy_outword.baseline_denormalise (&word->denorm);
copy_blob_it.set_to_list (copy_outword.blob_list ());
char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
pixrow_it.set_to_list (pixrow_list);
pixrow_it.move_to_first ();
// For debugging only
b_box = copy_outword.bounding_box ();
pixrow = pixrow_it.data ();
blob_it.move_to_first ();
copy_blob_it.move_to_first ();
for (i = 0;
word->best_choice->unichar_string()[i] != '\0';
i++, pixrow_it.forward (), blob_it.forward (),
copy_blob_it.forward ()) {
if (word->reject_map[i].recoverable ()
|| (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) {
TBOX copy_box = copy_blob_it.data ()->bounding_box ();
if (tessedit_cluster_debug)
tprintf ("Sample %c to check found in %s, index %d\n",
word->best_choice->unichar_string()[i],
word->best_choice->unichar_string().string (), i);
if (tessedit_demo_adaption)
tprintf ("Sample %c to check found in %s (%d), index %d\n",
word->best_choice->unichar_string()[i],
word->best_choice->unichar_string().string (),
word_number, i);
sample = clip_sample (pixrow_it.data (),
imlines,
pix_box,
copy_outword.flag (W_INVERSE),
word->best_choice->unichar_string()[i]);
if (sample == NULL) { //Clip failed
tprintf ("Unable to clip sample from %s, index %d\n",
word->best_choice->unichar_string().string (), i);
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample rejected (no sample)\n");
#endif
word->reject_map[i].setrej_mm_reject ();
continue;
}
best_score = MAX_INT32;
best_char = '\0';
best_cluster = NULL;
for (c_it.mark_cycle_pt ();
!c_it.cycled_list (); c_it.forward ()) {
if (c_it.data ()->character () != '\0') {
score = c_it.data ()->match_score (sample, this);
if (score < best_score) {
best_cluster = c_it.data ();
best_score = score;
best_char = c_it.data ()->character ();
}
}
}
if (best_score > tessedit_cluster_t1) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample rejected (score %f)\n", best_score);
if (tessedit_demo_adaption)
tprintf ("Sample rejected (score %f)\n", best_score);
#endif
word->reject_map[i].setrej_mm_reject ();
}
else {
if (word->best_choice->unichar_string()[i] == best_char) {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample accepted (score %f)\n", best_score);
if (tessedit_demo_adaption)
tprintf ("Sample accepted (score %f)\n", best_score);
#endif
if (tessedit_test_adaption)
word->reject_map[i].setrej_minimal_rej_accept ();
else
word->reject_map[i].setrej_mm_accept ();
}
else {
#ifndef SECURE_NAMES
if (tessedit_cluster_debug)
tprintf ("Sample rejected (char %c, score %f)\n",
best_char, best_score);
if (tessedit_demo_adaption)
tprintf ("Sample rejected (char %c, score %f)\n",
best_char, best_score);
#endif
word->reject_map[i].setrej_mm_reject ();
}
}
if (tessedit_demo_adaption) {
if (strcmp (imagebasename.string (),
tessedit_demo_file.string ()) != 0
|| word_number == tessedit_demo_word1
|| word_number == tessedit_demo_word2) {
#ifndef GRAPHICS_DISABLED
demo_win =
display_clip_image(&copy_outword,
page_image,
pixrow_list,
pix_box);
#endif
demo_word = word_number;
best_cluster->match_score (sample, this);
demo_word = 0;
}
}
}
}
delete[]imlines; // Free array of imlines
delete pixrow_list;
if (tessedit_cluster_debug) {
tprintf ("\nFinal: \"%s\" MAP ",
word->best_choice->unichar_string().string ());
word->reject_map.print (debug_fp);
tprintf ("\n");
}
}
}
} // namespace tesseract
void print_em_stats(CHAR_SAMPLES_LIST *char_clusters,
CHAR_SAMPLE_LIST *chars_waiting) {
CHAR_SAMPLES_IT c_it = char_clusters;
if (!tessedit_cluster_debug)
return;
#ifndef SECURE_NAMES
tprintf ("There are %d clusters and %d samples waiting\n",
char_clusters->length (), chars_waiting->length ());
for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ())
c_it.data ()->print (debug_fp);
#endif
tprintf ("\n");
}
CHAR_SAMPLE *clip_sample( //lines of the image
PIXROW *pixrow,
IMAGELINE *imlines,
TBOX pix_box, //box of imlines extent
BOOL8 white_on_black,
char c) {
TBOX b_box = pixrow->bounding_box ();
float baseline_pos = 0;
inT32 resolution = page_image.get_res ();
if (!b_box.null_box ()) {
ASSERT_HOST (b_box.width () < page_image.get_xsize () &&
b_box.height () < page_image.get_ysize ());
if (b_box.width () > resolution || b_box.height () > resolution) {
tprintf ("clip sample: sample too big (%d x %d)\n",
b_box.width (), b_box.height ());
return NULL;
}
IMAGE *image = new (IMAGE);
if (image->create (b_box.width (), b_box.height (), 1) == -1) {
tprintf ("clip sample: create image failed (%d x %d)\n",
b_box.width (), b_box.height ());
delete image;
return NULL;
}
if (!white_on_black)
invert_image(image); // Set background to white
pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos);
if (white_on_black)
invert_image(image); //invert white on black for scaling &NN
return new CHAR_SAMPLE (image, c);
}
else
return NULL;
}
#ifndef GRAPHICS_DISABLED
void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters) {
inT16 proto_number = 0;
CHAR_SAMPLES_IT c_it = char_clusters;
char title[WINDOWNAMESIZE];
for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
proto_number++;
#ifndef SECURE_NAMES
tprintf ("Displaying proto number %d\n", proto_number);
#endif
if (c_it.data ()->prototype () != NULL) {
sprintf (title, "Proto - %d", proto_number);
display_image (c_it.data ()->prototype ()->make_image (),
title, (proto_number - 1) * 400, 0, FALSE);
}
}
}
#endif
// *********************************************************************
// Simplistic routines to test the effect of rejecting ems and fullstops
// *********************************************************************
void reject_all_ems(WERD_RES *word) {
inT16 i;
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
if (word->best_choice->unichar_string()[i] == 'm')
// reject all ems
word->reject_map[i].setrej_mm_reject ();
}
}
void reject_all_fullstops(WERD_RES *word) {
inT16 i;
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
if (word->best_choice->unichar_string()[i] == '.')
// reject all fullstops
word->reject_map[i].setrej_mm_reject ();
}
}
namespace tesseract {
void Tesseract::reject_suspect_ems(WERD_RES *word) {
inT16 i;
if (!word_adaptable (word, tessedit_cluster_adaption_mode))
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
if (word->best_choice->unichar_string()[i] == 'm' && suspect_em (word, i))
// reject all ems
word->reject_map[i].setrej_mm_reject ();
}
}
} // namespace tesseract
void reject_suspect_fullstops(WERD_RES *word) {
inT16 i;
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
if (word->best_choice->unichar_string()[i] == '.'
&& suspect_fullstop (word, i))
// reject all commas
word->reject_map[i].setrej_mm_reject ();
}
}
BOOL8 suspect_em(WERD_RES *word, inT16 index) {
PBLOB_LIST *blobs = word->outword->blob_list ();
PBLOB_IT blob_it(blobs);
inT16 j;
for (j = 0; j < index; j++)
blob_it.forward ();
return (blob_it.data ()->out_list ()->length () != 1);
}
BOOL8 suspect_fullstop(WERD_RES *word, inT16 i) {
float aspect_ratio;
PBLOB_LIST *blobs = word->outword->blob_list ();
PBLOB_IT blob_it(blobs);
inT16 j;
TBOX box;
inT16 width;
inT16 height;
for (j = 0; j < i; j++)
blob_it.forward ();
box = blob_it.data ()->bounding_box ();
width = box.width ();
height = box.height ();
aspect_ratio = ((width > height) ? ((float) width) / height :
((float) height) / width);
return (aspect_ratio > tessed_fullstop_aspect_ratio);
}