/********************************************************************** * File: applybox.cpp (Formerly applybox.c) * Description: Re segment rows according to box file data * Author: Phil Cheatle * Created: Wed Nov 24 09:11:23 GMT 1993 * * (C) Copyright 1993, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #include "mfcpch.h" // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif #ifdef _MSC_VER #pragma warning(disable:4244) // Conversion warnings #endif #ifdef HAVE_LIBLEPT // Include leptonica library only if autoconf (or makefile etc) tell us to. #include "allheaders.h" #endif #include "applybox.h" #include #include #ifdef __UNIX__ #include #include #endif #include "boxread.h" #include "control.h" #include "genblob.h" #include "globals.h" #include "fixxht.h" #include "mainblk.h" #include "matchdefs.h" #include "secname.h" #include "tessbox.h" #include "unichar.h" #include "unicharset.h" #include "matchdefs.h" #include "tesseractclass.h" #define SECURE_NAMES #ifndef SECURE_NAMES #include "wordstats.h" #endif #define EXTERN EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead"); EXTERN INT_VAR (applybox_debug, 5, "Debug level"); EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from"); EXTERN STRING_VAR (applybox_test_exclusions, "", "Chars ignored for testing"); EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht"); EXTERN STRING_VAR(exposure_pattern, ".exp", "Exposure value follows this pattern in the image" " filename. The name of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"); EXTERN BOOL_VAR(learn_chars_and_char_frags_mode, FALSE, "Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."); extern IMAGE page_image; // The unicharset used during box training static UNICHARSET unicharset_boxes; /************************************************************************* * The code re-assigns outlines to form words each with ONE labelled blob. * Noise is left in UNLABELLED words. The chars on the page are checked crudely * for sensible position relative to baseline and xht. Failed boxes are * compensated for by duplicating other believable instances of the character. * * The box file is assumed to contain box definitions, one per line, of the * following format: * ... arbitrary trailing fields unused * * The approach taken is to search the WHOLE page for stuff overlapping each box. * - This is not too inefficient and is SAFE. * - We can detect overlapping blobs as we will be attempting to put a blob * from a LABELLED word into the current word. * - When all the boxes have been processed we can detect any stuff which is * being ignored - it is the unlabelled words left on the page. * * A box should only overlap one row. * * A warning is given if the box is on the same row as the previous box, but NOT * on the same row as the previous blob. * * Any OUTLINE which overlaps the box is put into the new word. * * ascender chars must ascend above xht significantly * xht chars must not rise above row xht significantly * bl chars must not descend below baseline significantly * descender chars must descend below baseline significantly * * ?? Certain chars are DROPPED - to limit the training data. * *************************************************************************/ namespace tesseract { void Tesseract::apply_boxes(const STRING& fname, BLOCK_LIST *block_list //real blocks ) { inT16 boxfile_lineno = 0; inT16 boxfile_charno = 0; TBOX box; //boxfile box UNICHAR_ID uch_id; //correct ch from boxfile ROW *row; ROW *prev_row = NULL; inT16 prev_box_right = MAX_INT16; inT16 block_id; inT16 row_id; inT16 box_count = 0; inT16 box_failures = 0; inT16 labels_ok; inT16 rows_ok; inT16 bad_blobs; inT16 *tgt_char_counts = NULL; // No. of box samples inT16 i; inT16 rebalance_count = 0; UNICHAR_ID min_uch_id = INVALID_UNICHAR_ID; inT16 min_samples; inT16 final_labelled_blob_count; bool low_exposure = false; // Clean the unichar set unicharset_boxes.clear(); // Space character needed to represent NIL classification unicharset_boxes.unichar_insert(" "); // Figure out whether this image file's exposure is less than 1, in which // case when learning we will only pay attention to character fragments. const char *ptr = strstr(imagefile.string(), exposure_pattern.string()); if (ptr != NULL && strtol(ptr += strlen(exposure_pattern.string()), NULL, 10) < 0) { low_exposure = true; } FILE* box_file; STRING filename = fname; const char *lastdot; //of name lastdot = strrchr (filename.string (), '.'); if (lastdot != NULL) filename[lastdot - filename.string()] = '\0'; filename += ".box"; if (!(box_file = fopen (filename.string(), "r"))) { CANTOPENFILE.error ("read_next_box", EXIT, "Cant open box file %s %d", filename.string(), errno); } tgt_char_counts = new inT16[MAX_NUM_CLASSES]; for (i = 0; i < MAX_NUM_CLASSES; i++) tgt_char_counts[i] = 0; clear_any_old_text(block_list); while (read_next_box(applybox_page, box_file, &box, &uch_id)) { box_count++; if (!low_exposure || learn_chars_and_char_frags_mode) { tgt_char_counts[uch_id]++; } row = find_row_of_box (block_list, box, block_id, row_id); if (box.left () < prev_box_right) { boxfile_lineno++; boxfile_charno = 1; } else boxfile_charno++; if (row == NULL) { box_failures++; report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! box overlaps no blobs or blobs in multiple rows"); } else { if ((box.left () >= prev_box_right) && (row != prev_row)) report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "WARNING! false row break"); box_failures += resegment_box (row, box, uch_id, block_id, row_id, boxfile_lineno, boxfile_charno, tgt_char_counts, low_exposure, true); prev_row = row; } prev_box_right = box.right (); } tidy_up(block_list, labels_ok, rows_ok, bad_blobs, tgt_char_counts, rebalance_count, &min_uch_id, min_samples, final_labelled_blob_count, low_exposure, true); tprintf ("APPLY_BOXES:\n"); tprintf (" Boxes read from boxfile: %6d\n", box_count); tprintf (" Initially labelled blobs: %6d in %d rows\n", labels_ok, rows_ok); tprintf (" Box failures detected: %6d\n", box_failures); tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); tprintf (" \"%s\" has fewest samples:%6d\n", unicharset_boxes.id_to_unichar(min_uch_id), min_samples); tprintf (" Total unlabelled words: %6d\n", bad_blobs); tprintf (" Final labelled words: %6d\n", final_labelled_blob_count); // Clean up. delete[] tgt_char_counts; } int Tesseract::Boxes2BlockList(int box_cnt, TBOX *boxes, BLOCK_LIST *block_list, bool right2left) { inT16 boxfile_lineno = 0; inT16 boxfile_charno = 0; TBOX box; ROW *row; ROW *prev_row = NULL; inT16 prev_box_right = MAX_INT16; inT16 prev_box_left = 0; inT16 block_id; inT16 row_id; inT16 box_failures = 0; inT16 labels_ok; inT16 rows_ok; inT16 bad_blobs; inT16 rebalance_count = 0; UNICHAR_ID min_uch_id; inT16 min_samples; inT16 final_labelled_blob_count; clear_any_old_text(block_list); for (int box_idx = 0; box_idx < box_cnt; box_idx++) { box = boxes[box_idx]; row = find_row_of_box(block_list, box, block_id, row_id); // check for a new row if ((right2left && box.right () > prev_box_left) || (!right2left && box.left () < prev_box_right)) { boxfile_lineno++; boxfile_charno = 1; } else { boxfile_charno++; } if (row == NULL) { box_failures++; } else { box_failures += resegment_box(row, box, 0, block_id, row_id, boxfile_lineno, boxfile_charno, NULL, false, false); prev_row = row; } prev_box_right = box.right (); prev_box_left = box.left (); } tidy_up(block_list, labels_ok, rows_ok, bad_blobs, NULL, rebalance_count, &min_uch_id, min_samples, final_labelled_blob_count, false, false); return box_failures; } } // namespace tesseract void clear_any_old_text( //remove correct text BLOCK_LIST *block_list //real blocks ) { BLOCK_IT block_it(block_list); ROW_IT row_it; WERD_IT word_it; for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { word_it.set_to_list (row_it.data ()->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word_it.data ()->set_text (""); } } } } UNICHAR_ID register_char(const char *uch) { if (!unicharset_boxes.contains_unichar(uch)) { unicharset_boxes.unichar_insert(uch); if (unicharset_boxes.size() > MAX_NUM_CLASSES) { tprintf("Error: Size of unicharset of boxes is " "greater than MAX_NUM_CLASSES (%d)\n", MAX_NUM_CLASSES); exit(1); } } return unicharset_boxes.unichar_to_id(uch); } BOOL8 read_next_box(int page, FILE* box_file, TBOX *box, UNICHAR_ID *uch_id) { int x_min; int y_min; int x_max; int y_max; char uch[kBoxReadBufSize]; if (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) { *uch_id = register_char(uch); *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max)); return TRUE; // read a box ok } else { return FALSE; // EOF } } ROW *find_row_of_box( // BLOCK_LIST *block_list, //real blocks const TBOX &box, //from boxfile inT16 &block_id, inT16 &row_id_to_process) { BLOCK_IT block_it(block_list); BLOCK *block; ROW_IT row_it; ROW *row; ROW *row_to_process = NULL; inT16 row_id; WERD_IT word_it; WERD *word; BOOL8 polyg; PBLOB_IT blob_it; PBLOB *blob; OUTLINE_IT outline_it; OUTLINE *outline; /* Find row to process - error if box REALLY overlaps more than one row. (I.e it overlaps blobs in the row - not just overlaps the bounding box of the whole row.) */ block_id = 0; for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { block_id++; row_id = 0; block = block_it.data (); if (block->bounding_box ().overlap (box)) { row_it.set_to_list (block->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row_id++; row = row_it.data (); if (row->bounding_box ().overlap (box)) { word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); polyg = word->flag (W_POLYGON); if (word->bounding_box ().overlap (box)) { blob_it.set_to_list (word->gblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (gblob_bounding_box (blob, polyg). overlap (box)) { outline_it. set_to_list (gblob_out_list (blob, polyg)); for (outline_it.mark_cycle_pt (); !outline_it.cycled_list (); outline_it.forward ()) { outline = outline_it.data (); if (goutline_bounding_box (outline, polyg).major_overlap (box)) { if ((row_to_process == NULL) || (row_to_process == row)) { row_to_process = row; row_id_to_process = row_id; } else /* RETURN ERROR Box overlaps blobs in more than one row */ return NULL; } } } } } } } } } } return row_to_process; } inT16 resegment_box( // ROW *row, TBOX &box, UNICHAR_ID uch_id, inT16 block_id, inT16 row_id, inT16 boxfile_lineno, inT16 boxfile_charno, inT16 *tgt_char_counts, bool learn_char_fragments, bool learning) { WERD_LIST new_word_list; WERD_IT word_it; WERD_IT new_word_it(&new_word_list); WERD *word = NULL; WERD *new_word = NULL; BOOL8 polyg = false; PBLOB_IT blob_it; PBLOB_IT new_blob_it; PBLOB *blob; PBLOB *new_blob; OUTLINE_IT outline_it; OUTLINE_LIST dummy; // Just to initialize new_outline_it. OUTLINE_IT new_outline_it = &dummy; OUTLINE *outline; TBOX new_word_box; TBOX curr_outline_box; TBOX prev_outline_box; float word_x_centre; float baseline; inT16 error_count = 0; //number of chars lost STRING label; UNICHAR_ID fragment_uch_id; int fragment_index; int new_word_it_len; if (learning && applybox_debug > 6) { tprintf("\nAPPLY_BOX: in resegment_box() for %s(%d)\n", unicharset_boxes.id_to_unichar(uch_id), uch_id); } word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); polyg = word->flag (W_POLYGON); if (word->bounding_box ().overlap (box)) { blob_it.set_to_list (word->gblob_list ()); prev_outline_box = TBOX(); // clear prev_outline_box curr_outline_box = TBOX(); // clear curr_outline_box for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (gblob_bounding_box (blob, polyg).overlap (box)) { outline_it.set_to_list (gblob_out_list (blob, polyg)); for (outline_it.mark_cycle_pt (); !outline_it.cycled_list (); outline_it.forward ()) { outline = outline_it.data (); prev_outline_box += curr_outline_box; curr_outline_box = goutline_bounding_box(outline, polyg); if (curr_outline_box.major_overlap (box)) { if (strlen (word->text ()) > 0) { if (error_count == 0) { error_count = 1; if (learning && applybox_debug > 4) report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! box overlaps blob in labelled word"); } if (learning && applybox_debug > 4) tprintf ("APPLY_BOXES: ALSO ignoring corrupted char" " blk:%d row:%d \"%s\"\n", block_id, row_id, word_it.data()->text()); word_it.data ()->set_text (""); // UN label it error_count++; } // Do not learn from fragments of characters that are broken // into very small pieces to avoid picking up noise. if ((learn_char_fragments || learn_chars_and_char_frags_mode) && ((C_OUTLINE *)outline)->area() < kMinFragmentOutlineArea) { if (applybox_debug > 6) { tprintf("APPLY_BOX: fragment outline area %d is too small" " - not recording fragments of this character\n", ((C_OUTLINE *)outline)->area()); } error_count++; } if (error_count == 0) { if (applybox_debug > 6 ) { tprintf("APPLY_BOX: Previous "); prev_outline_box.print(); tprintf("APPLY_BOX: Current area: %d ", ((C_OUTLINE *)outline)->area()); curr_outline_box.print(); } // When learning character fragments is enabled, we put // outlines that do not overlap on x axis in separate WERDs. bool start_new_word = (learn_char_fragments || learn_chars_and_char_frags_mode) && !curr_outline_box.major_x_overlap(prev_outline_box); if (new_word == NULL || start_new_word) { if (new_word != NULL) { // add prev new_word to new_word_list new_word_it.add_to_end(new_word); } // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_flag(W_FUZZY_NON, false); new_word->set_flag(W_FUZZY_SP, false); if (polyg){ new_blob = new PBLOB; } else { new_blob = (PBLOB *) new C_BLOB; } new_blob_it.set_to_list(new_word->gblob_list()); new_blob_it.add_to_end(new_blob); new_outline_it.set_to_list( gblob_out_list(new_blob, polyg)); } new_outline_it.add_to_end(outline_it.extract()); // move blob } } } if (outline_it.empty()) // no outlines in blob delete blob_it.extract(); // so delete blob } } if (blob_it.empty()) // no blobs in word delete word_it.extract(); // so delete word } } if (new_word != NULL) { // add prev new_word to new_word_list new_word_it.add_to_end(new_word); } new_word_it_len = new_word_it.length(); // Check for failures. if (error_count > 0) return error_count; if (learning && new_word_it_len <= 0) { report_failed_box(boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! Couldn't find any blobs"); return 1; // failure } if (learning && new_word_it_len > CHAR_FRAGMENT::kMaxChunks) { tprintf("APPLY_BOXES: too many fragments (%d) for char %s\n", new_word_it_len, unicharset_boxes.id_to_unichar(uch_id)); return 1; // failure } // Add labelled character or character fragments to the word list. fragment_index = 0; new_word_it.move_to_first(); for (new_word_it.mark_cycle_pt(); !new_word_it.cycled_list(); new_word_it.forward()) { new_word = new_word_it.extract(); if (new_word_it_len > 1) { // deal with a fragment if (learning) { label = CHAR_FRAGMENT::to_string(unicharset_boxes.id_to_unichar(uch_id), fragment_index, new_word_it_len); fragment_uch_id = register_char(label.string()); new_word->set_text(label.string()); ++fragment_index; // For now we cheat by setting the expected number of char fragments // to the number of char fragments actually parsed and labelled. // TODO(daria): find out whether this can be improved. tgt_char_counts[fragment_uch_id]++; } else { // No learning involved, Just stick a place-holder string new_word->set_text("*"); } if (applybox_debug > 5) { tprintf("APPLY_BOX: adding char fragment %s\n", label.string()); } } else { // deal with a regular character if (learning) { if (!learn_char_fragments || learn_chars_and_char_frags_mode) { new_word->set_text(unicharset_boxes.id_to_unichar(uch_id)); } else { // not interested in non-fragmented chars if learning fragments, so // unlabel it. new_word->set_text(""); } } else { // No learning involved here. Just stick a place holder string new_word->set_text("*"); } } gblob_sort_list(new_word->gblob_list(), polyg); word_it.add_to_end(new_word); new_word_box = new_word->bounding_box(); word_x_centre = (new_word_box.left() + new_word_box.right()) / 2.0f; baseline = row->base_line(word_x_centre); } // All done. Now check if the EOL, BOL flags are set correctly. word_it.move_to_first(); for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); word->set_flag(W_BOL, false); word->set_flag(W_EOL, false); } word->set_flag(W_EOL, true); word_it.move_to_first(); word_it.data()->set_flag(W_BOL, true); return 0; //success #if 0 if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) { if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && (new_word_box.top () < baseline + (1 + applybox_error_band) * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! caps-ht char didn't ascend"); new_word->set_text (""); return 1; } if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && (new_word_box.top () < baseline + (1 - applybox_error_band) * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! Odd top char below xht"); new_word->set_text (""); return 1; } if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && ((new_word_box.top () > baseline + (1 + applybox_error_band) * row->x_height ()) || (new_word_box.top () < baseline + (1 - applybox_error_band) * row->x_height ()))) { report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! x-ht char didn't have top near xht"); new_word->set_text (""); return 1; } if (STRING (chs_non_ambig_bl).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && ((new_word_box.bottom () < baseline - applybox_error_band * row->x_height ()) || (new_word_box.bottom () > baseline + applybox_error_band * row->x_height ()))) { report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! non ambig BL char didnt have bottom near baseline"); new_word->set_text (""); return 1; } if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && (new_word_box.bottom () > baseline + applybox_error_band * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! Odd bottom char above baseline"); new_word->set_text (""); return 1; } if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && (new_word_box.bottom () > baseline - applybox_error_band * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! Descender doesn't descend"); new_word->set_text (""); return 1; } } #endif } /************************************************************************* * tidy_up() * - report >1 block * - sort the words in each row. * - report any rows with no labelled words. * - report any remaining unlabelled words * - report total labelled words * *************************************************************************/ void tidy_up( // BLOCK_LIST *block_list, //real blocks inT16 &ok_char_count, inT16 &ok_row_count, inT16 &unlabelled_words, inT16 *tgt_char_counts, inT16 &rebalance_count, UNICHAR_ID *min_uch_id, inT16 &min_samples, inT16 &final_labelled_blob_count, bool learn_character_fragments, bool learning) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; WERD_IT word_it; WERD *word; WERD *duplicate_word; inT16 block_idx = 0; inT16 row_idx; inT16 all_row_idx = 0; BOOL8 row_ok; BOOL8 rebalance_needed = FALSE; inT16 *labelled_char_counts = NULL; // num unique labelled samples inT16 i; UNICHAR_ID uch_id; UNICHAR_ID prev_uch_id = -1; BOOL8 at_dupe_of_prev_word; ROW *prev_row = NULL; inT16 left; inT16 prev_left = -1; labelled_char_counts = new inT16[MAX_NUM_CLASSES]; for (i = 0; i < MAX_NUM_CLASSES; i++) labelled_char_counts[i] = 0; ok_char_count = 0; ok_row_count = 0; unlabelled_words = 0; if (learning && (applybox_debug > 4) && (block_it.length () != 1)) { if (block_it.length() > 1) { tprintf("APPLY_BOXES: More than one block??\n"); } else { tprintf("APPLY_BOXES: No blocks identified.\n"); } } for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { block_idx++; row_idx = 0; row_ok = FALSE; row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row_idx++; all_row_idx++; row = row_it.data (); word_it.set_to_list (row->word_list ()); word_it.sort (word_comparator); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if (strlen (word->text ()) == 0 || unicharset_boxes.unichar_to_id(word->text()) < 0) { unlabelled_words++; if (learning && applybox_debug > 4 && !learn_character_fragments) { tprintf("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); } } else { if (word->gblob_list ()->length () != 1) tprintf ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d" " row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); ok_char_count++; ++labelled_char_counts[unicharset_boxes.unichar_to_id(word->text())]; row_ok = TRUE; } } if ((applybox_debug > 6) && (!row_ok)) { tprintf("APPLY_BOXES: Row with no labelled words blk:%d row:%d" " allrows:%d\n", block_idx, row_idx, all_row_idx); } else ok_row_count++; } } min_samples = 9999; for (i = 0; i < unicharset_boxes.size(); i++) { if (tgt_char_counts[i] > labelled_char_counts[i]) { if (labelled_char_counts[i] <= 1) { tprintf("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" -" " target is %d:\n", labelled_char_counts[i], unicharset_boxes.debug_str(i).string(), tgt_char_counts[i]); } else { rebalance_needed = TRUE; if (applybox_debug > 0) tprintf("APPLY_BOXES: REBALANCE REQD \"%s\" - target of" " %d from %d labelled samples\n", unicharset_boxes.debug_str(i).string(), tgt_char_counts[i], labelled_char_counts[i]); } } if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { min_samples = labelled_char_counts[i]; *min_uch_id = i; } } while (applybox_rebalance && rebalance_needed) { block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); left = word->bounding_box ().left (); if (*word->text () != '\0') uch_id = unicharset_boxes.unichar_to_id(word->text ()); else uch_id = -1; at_dupe_of_prev_word = ((row == prev_row) && (left = prev_left) && (uch_id == prev_uch_id)); if ((uch_id != -1) && (labelled_char_counts[uch_id] > 1) && (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) && (!at_dupe_of_prev_word)) { /* Duplicate the word to rebalance the labelled samples */ if (applybox_debug > 9) { tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id)); word->bounding_box ().print (); } duplicate_word = new WERD; *duplicate_word = *word; word_it.add_after_then_move (duplicate_word); rebalance_count++; labelled_char_counts[uch_id]++; } prev_row = row; prev_left = left; prev_uch_id = uch_id; } } } rebalance_needed = FALSE; for (i = 0; i < unicharset_boxes.size(); i++) { if ((tgt_char_counts[i] > labelled_char_counts[i]) && (labelled_char_counts[i] > 1)) { rebalance_needed = TRUE; break; } } } /* Now final check - count labeled blobs */ final_labelled_blob_count = 0; block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); word_it.sort (word_comparator); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen(word->text ()) > 0) && (word->gblob_list()->length() == 1)) { final_labelled_blob_count++; } else { delete word_it.extract(); } } // delete the row if empty if (row->word_list()->empty()) { delete row_it.extract(); } } } // Clean up. delete[] labelled_char_counts; } void report_failed_box(inT16 boxfile_lineno, inT16 boxfile_charno, TBOX box, const char *box_ch, const char *err_msg) { if (applybox_debug > 4) tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n", boxfile_lineno, boxfile_charno, box_ch, box.left (), box.bottom (), box.right (), box.top (), err_msg); } void apply_box_training(const STRING& filename, BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; WERD_IT word_it; WERD *word; WERD *bln_word; WERD copy_outword; // copy to denorm PBLOB_IT blob_it; DENORM denorm; inT16 count = 0; char unichar[UNICHAR_LEN + 1]; unichar[UNICHAR_LEN] = '\0'; tprintf ("Generating training data\n"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen (word->text ()) > 0) && (word->gblob_list ()->length () == 1)) { // Here is a word with a single unichar label and a single blob so train on it. bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); strncpy(unichar, word->text (), UNICHAR_LEN); tess_training_tester (filename, blob_it.data (), //single blob &denorm, TRUE, //correct unichar, //correct character strlen(unichar), //character length NULL); copy_outword = *(bln_word); copy_outword.baseline_denormalise (&denorm); blob_it.set_to_list (copy_outword.blob_list ()); delete bln_word; count++; } } } } tprintf ("Generated training data for %d blobs\n", count); } namespace tesseract { void Tesseract::apply_box_testing(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; inT16 row_count = 0; WERD_IT word_it; WERD *word; WERD *bln_word; inT16 word_count = 0; PBLOB_IT blob_it; DENORM denorm; inT16 count = 0; char ch[2]; WERD *outword; //bln best choice //segmentation WERD_CHOICE *best_choice; //tess output WERD_CHOICE *raw_choice; //top choice permuter //detailed results BLOB_CHOICE_LIST_CLIST blob_choices; inT16 char_count = 0; inT16 correct_count = 0; inT16 err_count = 0; inT16 rej_count = 0; #ifndef SECURE_NAMES WERDSTATS wordstats; //As from newdiff #endif char tess_rej_str[3]; char tess_long_str[3]; ch[1] = '\0'; strcpy (tess_rej_str, "|A"); strcpy (tess_long_str, "|B"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); row_count++; word_count = 0; word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); word_count++; if ((strlen (word->text ()) == 1) && !STRING (applybox_test_exclusions).contains (*word->text ()) && (word->gblob_list ()->length () == 1)) { // Here is a word with a single char label and a single blob so test it. bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); ch[0] = *word->text (); char_count++; best_choice = tess_segment_pass1 (bln_word, &denorm, &Tesseract::tess_default_matcher, raw_choice, &blob_choices, outword); /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. */ if ((best_choice->length() == 0) || (strspn(best_choice->unichar_string().string(), " ") == best_choice->unichar_string().length())) { rej_count++; tprintf ("%d:%d: \"%s\" -> TESS FAILED\n", row_count, word_count, ch); #ifndef SECURE_NAMES wordstats.word (tess_rej_str, 2, ch, 1); #endif } else { if ((best_choice->length() != outword->blob_list()->length()) || (best_choice->length() != blob_choices.length())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", best_choice->unichar_string().string(), best_choice->length(), outword->blob_list ()->length(), blob_choices.length()); } ASSERT_HOST(best_choice->length() == outword->blob_list()->length()); ASSERT_HOST(best_choice->length() == blob_choices.length()); fix_quotes (best_choice, //turn to double outword, &blob_choices); if (strcmp (best_choice->unichar_string().string(), ch) != 0) { err_count++; tprintf ("%d:%d: \"%s\" -> \"%s\"\n", row_count, word_count, ch, best_choice->unichar_string().string()); } else correct_count++; #ifndef SECURE_NAMES if (best_choice->unichar_string().length() > 2) wordstats.word(tess_long_str, 2, ch, 1); else wordstats.word(best_choice->unichar_string().string(), best_choice->unichar_string().length(), ch, 1); #endif } delete bln_word; delete outword; delete best_choice; delete raw_choice; blob_choices.deep_clear (); count++; } } } } #ifndef SECURE_NAMES wordstats.print (1, 100.0); wordstats.conf_matrix (); tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n", char_count, correct_count, rej_count, err_count); #endif } } // namespace tesseract