tesseract/ccmain/applybox.cpp
Jim O'Regan 524a61452d Doxygen
Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen
closes #14

Commits:
6317305  doxygen
9f42f69  doxygen
0fc4d52  doxygen
37b4b55  fix typo
bded8f1  some more doxy
020eb00  slight tweak
524666d  doxygenify
2a36a3e  doxygenify
229d218  doxygenify
7fd28ae  doxygenify
a8c64bc  doxygenify
f5d21b6  fix
5d8ede8  doxygenify
a58a4e0  language_model.cpp
fa85709  lm_pain_points.cpp lm_state.cpp
6418da3  merge
06190ba  Merge branch 'old_doxygen_merge' into more-doxygen
84acf08  Merge branch 'master' into more-doxygen
50fe1ff  pagewalk.cpp cube_reco_context.cpp
2982583  change to relative
192a24a  applybox.cpp, take one
8eeb053  delete docs for obsolete params
52e4c77  modernise classify/ocrfeatures.cpp
2a1cba6  modernise cutil/emalloc.cpp
773e006  silence doxygen warning
aeb1731  silence doxygen warning
f18387f  silence doxygen; new params are unused?
15ad6bd  doxygenify cutil/efio.cpp
c8b5dad  doxygenify cutil/danerror.cpp
784450f  the globals and exceptions parts are obsolete; remove
8bca324  doxygen classify/normfeat.cpp
9bcbe16  doxygen classify/normmatch.cpp
aa9a971  doxygen ccmain/cube_control.cpp
c083ff2  doxygen ccmain/cube_reco_context.cpp
f842850  params changed
5c94f12  doxygen ccmain/cubeclassifier.cpp
15ba750  case sensitive
f5c71d4  case sensitive
f85655b  doxygen classify/intproto.cpp
4bbc7aa  partial doxygen classify/mfx.cpp
dbb6041  partial doxygen classify/intproto.cpp
2aa72db  finish doxygen classify/intproto.cpp
0b8de99  doxygen training/mftraining.cpp
0b5b35c  partial doxygen ccstruct/coutln.cpp
b81c766  partial doxygen ccstruct/coutln.cpp
40fc415  finished? doxygen ccstruct/coutln.cpp
6e4165c  doxygen classify/clusttool.cpp
0267dec  doxygen classify/cutoffs.cpp
7f0c70c  doxygen classify/fpoint.cpp
512f3bd  ignore ~ files
5668a52  doxygen classify/intmatcher.cpp
84788d4  doxygen classify/kdtree.cpp
29f36ca  doxygen classify/mfoutline.cpp
40b94b1  silence doxygen warnings
6c511b9  doxygen classify/mfx.cpp
f9b4080  doxygen classify/outfeat.cpp
aa1df05  doxygen classify/picofeat.cpp
cc5f466  doxygen training/cntraining.cpp
cce044f  doxygen training/commontraining.cpp
167e216  missing param
9498383  renamed params
37eeac2  renamed param
d87b5dd  case
c8ee174  renamed params
b858db8  typo
4c2a838  h2 context?
81a2c0c  fix some param names; add some missing params, no docs
bcf8a4c  add some missing params, no docs
af77f86  add some missing params, no docs; fix some param names
01df24e  fix some params
6161056  fix some params
68508b6  fix some params
285aeb6  doxygen complains here no matter what
529bcfa  rm some missing params, typos
cd21226  rm some missing params, add some new ones
48a4bc2  fix params
c844628  missing param
312ce37  missing param; rename one
ec2fdec  missing param
05e15e0  missing params
d515858  change "<" to &lt; to make doxygen happy
b476a28  wrong place
2015-07-20 18:48:00 +01:00

809 lines
33 KiB
C++

/**********************************************************************
* File: applybox.cpp (Formerly applybox.c)
* Description: Re-segment rows according to box file data
* Author: Phil Cheatle
* Created: Wed Nov 24 09:11:23 GMT 1993
*
* (C) Copyright 1993, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#include <ctype.h>
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "allheaders.h"
#include "boxread.h"
#include "chopper.h"
#include "pageres.h"
#include "unichar.h"
#include "unicharset.h"
#include "tesseractclass.h"
#include "genericvector.h"
/** Max number of blobs to classify together in FindSegmentation. */
const int kMaxGroupSize = 4;
/// Max fraction of median allowed as deviation in xheight before switching
/// to median.
const double kMaxXHeightDeviationFraction = 0.125;
/**
* The box file is assumed to contain box definitions, one per line, of the
* following format for blob-level boxes:
* @verbatim
* <UTF8 str> <left> <bottom> <right> <top> <page id>
* @endverbatim
* and for word/line-level boxes:
* @verbatim
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
* @endverbatim
* NOTES:
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
*
* <page id> is 0-based, and the page number is used for multipage input (tiff).
*
* In the blob-level form, each line represents a recognizable unit, which may
* be several UTF-8 bytes, but there is a bounding box around each recognizable
* unit, and no classifier is needed to train in this mode (bootstrapping.)
*
* In the word/line-level form, the line begins with the literal "WordStr", and
* the bounding box bounds either a whole line or a whole word. The recognizable
* units in the word/line are listed after the # at the end of the line and
* are space delimited, ignoring any original spaces on the line.
* Eg.
* @verbatim
* word -> #w o r d
* multi word line -> #m u l t i w o r d l i n e
* @endverbatim
* The recognizable units must be space-delimited in order to allow multiple
* unicodes to be used for a single recognizable unit, eg Hindi.
*
* In this mode, the classifier must have been pre-trained with the desired
* character set, or it will not be able to find the character segmentations.
*/
namespace tesseract {
/// Sets the text of every word, in every row of every block of block_list,
/// to the empty string.
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT blocks(block_list);
  blocks.mark_cycle_pt();
  while (!blocks.cycled_list()) {
    ROW_IT rows(blocks.data()->row_list());
    rows.mark_cycle_pt();
    while (!rows.cycled_list()) {
      WERD_IT words(rows.data()->word_list());
      words.mark_cycle_pt();
      while (!words.cycled_list()) {
        words.data()->set_text("");
        words.forward();
      }
      rows.forward();
    }
    blocks.forward();
  }
}
/// Applies the box file based on the image name fname, and resegments
/// the words in the block_list (page), with:
/// - blob-mode: one blob per line in the box file, words as input.
/// - word/line-mode: one blob per space-delimited unit after the #, and one
///   word per line in the box file. (See the file-level comment for the box
///   file format.)
///
/// If find_segmentation is true, (word/line mode) then the classifier is used
/// to re-segment words/lines to match the space-delimited truth string for
/// each box. In this case, the input box may be for a word or even a whole
/// text line, and the output words will contain multiple blobs corresponding
/// to the space-delimited input string.
/// With find_segmentation false, no classifier is needed, but the chopper
/// can still be used to correctly segment touching characters with the help
/// of the input boxes.
///
/// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
/// from normal classification, ie. with a word, chopped_word, rebuild_word,
/// seam_array, denorm, box_word, and best_state, but NO best_choice or
/// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
/// Instead, the correct_text member of WERD_RES is set, and this may be later
/// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
/// is not required before calling ApplyBoxTraining.
/// @param fname image name used to locate the box file.
/// @param find_segmentation selects word/line mode (true) or blob mode.
/// @param block_list the page to resegment; modified in place.
/// @return the new PAGE_RES, or NULL if the boxes could not be read.
PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
                                bool find_segmentation,
                                BLOCK_LIST *block_list) {
  GenericVector<TBOX> boxes;
  GenericVector<STRING> texts, full_texts;
  const bool boxes_read = ReadAllBoxes(applybox_page, true, fname, &boxes,
                                       &texts, &full_texts, NULL);
  if (!boxes_read)
    return NULL;  // Can't do it.

  const int box_count = boxes.size();
  int box_failures = 0;
  // Append an empty sentinel to each vector so that element i + 1 is always
  // valid while processing the real boxes.
  boxes.push_back(TBOX());
  texts.push_back(STRING());
  full_texts.push_back(STRING());

  // In blob mode the existing words are maximally chopped up front; in word
  // mode the boxes are used to make a word for each box, and page_res stays
  // NULL until all boxes have been applied.
  PAGE_RES* page_res = NULL;
  if (!find_segmentation)
    page_res = SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  for (int i = 0; i < box_count; i++) {
    bool foundit;
    if (page_res == NULL) {
      foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
                                 texts[i].string());
    } else {
      // The first box has no predecessor.
      const TBOX* prev_box = (i > 0) ? &boxes[i - 1] : NULL;
      foundit = ResegmentCharBox(page_res, prev_box, boxes[i], boxes[i + 1],
                                 full_texts[i].string());
    }
    if (foundit)
      continue;
    ++box_failures;
    ReportFailedBox(i, boxes[i], texts[i].string(),
                    "FAILURE! Couldn't find a matching blob");
  }

  if (page_res == NULL) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }
  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf(" Boxes read from boxfile: %6d\n", box_count);
    if (box_failures > 0)
      tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
  }
  TidyUp(page_res);
  return page_res;
}
/// Helper computes the median xheight over all rows of all blocks.
/// The upper limit of the histogram is the bounding box height of the block
/// that the iterator refers to immediately after construction.
static double MedianXHeight(BLOCK_LIST *block_list) {
  BLOCK_IT blocks(block_list);
  STATS xheights(0, blocks.data()->bounding_box().height());
  blocks.mark_cycle_pt();
  while (!blocks.cycled_list()) {
    ROW_IT rows(blocks.data()->row_list());
    rows.mark_cycle_pt();
    while (!rows.cycled_list()) {
      // Each row contributes its rounded xheight with a count of 1.
      xheights.add(IntCastRounded(rows.data()->x_height()), 1);
      rows.forward();
    }
    blocks.forward();
  }
  return xheights.median();
}
/// Any row xheight that differs from the median xheight by more than
/// #kMaxXHeightDeviationFraction of the median is replaced by the median.
/// @param block_list blocks whose rows are checked and possibly modified.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  const double median_xheight = MedianXHeight(block_list);
  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      ROW* row = rows.data();
      const float diff = fabs(row->x_height() - median_xheight);
      if (!(diff > max_deviation))
        continue;  // Close enough to the median; leave the row alone.
      if (applybox_debug) {
        tprintf("row xheight=%g, but median xheight = %g\n",
                row->x_height(), median_xheight);
      }
      row->set_x_height(static_cast<float>(median_xheight));
    }
  }
}
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
/// row xheights are preened, words with no blobs are deleted, the fuzzy space
/// flags are cleared on the remaining words, and every word is maximally
/// chopped.
/// @param boxes the box file boxes, passed on to MaximallyChopWord.
/// @param block_list the page; modified in place.
/// @return a newly allocated PAGE_RES built on block_list.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
                                     BLOCK_LIST *block_list) {
  PreenXHeights(block_list);
  // Strip all fuzzy space markers to simplify the PAGE_RES, discarding any
  // word that has no blobs at all.
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        WERD* word = words.data();
        if (!word->cblob_list()->empty()) {
          word->set_flag(W_FUZZY_SP, false);
          word->set_flag(W_FUZZY_NON, false);
          continue;
        }
        delete words.extract();
      }
    }
  }
  PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES* word_res = NULL; (word_res = pr_it.word()) != NULL;
       pr_it.forward()) {
    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row,
                      word_res);
  }
  return page_res;
}
/// Tests the chopper by exhaustively running chop_one_blob, i.e. calling it
/// repeatedly until it returns NULL.
/// The word_res will contain filled chopped_word, seam_array, denorm,
/// box_word and best_state for the maximally chopped word.
/// @param boxes passed through to chop_one_blob.
/// @param block passed to WERD_RES::SetupForRecognition.
/// @param row passed to WERD_RES::SetupForRecognition.
/// @param word_res the word to chop; modified in place.
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
BLOCK* block, ROW* row,
WERD_RES* word_res) {
// If the word cannot be set up for recognition, no chopping is attempted:
// the chopped word is just cloned to the rebuild word.
if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
tessedit_ocr_engine_mode, NULL,
classify_bln_numeric_mode,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
row, block)) {
word_res->CloneChoppedToRebuild();
return;
}
if (chop_debug) {
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
// One fake BLOB_CHOICE per blob, kept in step with the blobs as they are
// chopped.
GenericVector<BLOB_CHOICE*> blob_choices;
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
float rating = static_cast<float>(MAX_INT8);
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
// The rating and certainty are not quite arbitrary. Since
// select_blob_to_chop uses the worst certainty to choose, they all have
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
// in here, and then divide by e each time they are chopped, which
// should guarantee a set of unequal values for the whole tree of blobs
// produced, however much chopping is required. The chops are thus only
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
BLOB_CHOICE* choice =
new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
blob_choices.push_back(choice);
rating -= 0.125f;
}
const double e = exp(1.0); // The base of natural logs.
int blob_number;
int right_chop_index = 0;
if (!assume_fixed_pitch_char_segment) {
// We only chop if the language is not fixed pitch like CJK.
SEAM* seam = NULL;
while ((seam = chop_one_blob(boxes, blob_choices, word_res,
&blob_number)) != NULL) {
word_res->InsertSeam(blob_number, seam);
// The choice at blob_number now stands for the left piece of the chop:
// scale its rating down by e and keep certainty = -rating.
BLOB_CHOICE* left_choice = blob_choices[blob_number];
rating = left_choice->rating() / e;
left_choice->set_rating(rating);
left_choice->set_certainty(-rating);
// combine confidence w/ serial #
// The new right piece gets its own fake choice, inserted directly after
// the left one so blob_choices stays parallel to the blobs.
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
rating - 0.125f, -rating, -1,
0.0f, 0.0f, 0.0f, BCC_FAKE);
blob_choices.insert(right_choice, blob_number + 1);
}
}
word_res->CloneChoppedToRebuild();
// blob_choices is non-empty here (see the ASSERT_HOST above), so taking the
// address of element 0 is valid.
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
/// Helper to compute the dispute resolution metric.
/// Disputed blob resolution. The aim is to give the blob to the most
/// appropriate boxfile box. Most of the time it is obvious, but if
/// two boxfile boxes overlap significantly it is not. If a small boxfile
/// box takes most of the blob, and a large boxfile box does too, then
/// we want the small boxfile box to get it, but if the small box
/// is much smaller than the blob, we don't want it to get it.
/// Details of the disputed blob resolution:
/// Given a box with area A, and a blob with area B, with overlap area C,
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
/// miss metric gets the blob.
///
/// If either box has no area the metric is undefined (division by zero), so
/// 1.0 is returned, which is the largest value the formula can produce
/// (a complete miss). This matters because ApplyBoxes appends an empty
/// TBOX() as the "next box" after the last real box; presumably that box
/// reports an area of 0 (TODO confirm against TBOX::area).
/// @param box1 the first box (callers in this file pass the blob box).
/// @param box2 the second box (callers in this file pass a boxfile box).
/// @return the miss metric, in the range [0, 1] for non-empty boxes.
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
  const int area1 = box1.area();
  const int area2 = box2.area();
  if (area1 <= 0 || area2 <= 0)
    return 1.0;  // Nothing can overlap an empty box: complete miss.
  const int overlap_area = box1.intersection(box2).area();
  // Same operation order as the formula above: ((A-C)/A) * (B-C) / B.
  double miss_metric = area1 - overlap_area;
  miss_metric /= area1;
  miss_metric *= area2 - overlap_area;
  miss_metric /= area2;
  return miss_metric;
}
/// Gather consecutive blobs that match the given box into the best_state
/// and corresponding correct_text.
///
/// Fights over which box owns which blobs are settled by pre-chopping and
/// applying the blobs to box or next_box with the least non-overlap.
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
/// @param page_res the page whose words are searched and updated in place.
/// @param prev_box the boxfile box before box, or NULL if there is none.
/// @param box the boxfile box to find blobs for.
/// @param next_box the boxfile box after box, which competes for blobs.
/// @param correct_text the label stored in correct_text for the merged blobs.
/// @return false if the box was in error, which can only be caused by
/// failing to find an appropriate blob for a box.
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
}
PAGE_RES_IT page_res_it(page_res);
WERD_RES* word_res;
for (word_res = page_res_it.word(); word_res != NULL;
word_res = page_res_it.forward()) {
// Only words whose bounding box has a major overlap with the box are
// candidates.
if (!word_res->box_word->bounding_box().major_overlap(box))
continue;
if (applybox_debug > 1) {
tprintf("Checking word box:");
word_res->box_word->bounding_box().print();
}
int word_len = word_res->box_word->length();
// Try each blob index i as the start of a run of consecutive blobs that
// belong to the box.
for (int i = 0; i < word_len; ++i) {
// char_box accumulates the bounding box of the blobs accepted so far.
TBOX char_box = TBOX();
int blob_count = 0;
for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
if (!blob_box.major_overlap(box))
break;
if (word_res->correct_text[i + blob_count].length() > 0)
break; // Blob is claimed already.
double current_box_miss_metric = BoxMissMetric(blob_box, box);
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
break; // Blob is a better match for next box.
char_box += blob_box;
}
if (blob_count > 0) {
if (applybox_debug > 1) {
tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
}
// Reject the match if the merged blobs are not almost equal to the box
// (tolerance 3) and the box has an x_gap of less than -3 to either
// neighbour.
// NOTE(review): a negative x_gap presumably means the boxes overlap
// horizontally - confirm against TBOX::x_gap.
if (!char_box.almost_equal(box, 3) &&
(box.x_gap(next_box) < -3 ||
(prev_box != NULL && prev_box->x_gap(box) < -3))) {
return false;
}
// We refine just the box_word, best_state and correct_text here.
// The rebuild_word is made in TidyUp.
// blob_count blobs are put together to match the box. Merge the
// box_word boxes, save the blob_count in the state and the text.
word_res->box_word->MergeBoxes(i, i + blob_count);
word_res->best_state[i] = blob_count;
word_res->correct_text[i] = correct_text;
if (applybox_debug > 2) {
tprintf("%d Blobs match: blob box:", blob_count);
word_res->box_word->BlobBox(i).print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
}
// Eliminate best_state and correct_text entries for the consumed
// blobs. Index i + 1 is removed each time because every removal shifts
// the remaining entries down by one.
for (int j = 1; j < blob_count; ++j) {
word_res->best_state.remove(i + 1);
word_res->correct_text.remove(i + 1);
}
// Assume that no box spans multiple source words, so we are done with
// this box.
if (applybox_debug > 1) {
tprintf("Best state = ");
for (int j = 0; j < word_res->best_state.size(); ++j) {
tprintf("%d ", word_res->best_state[j]);
}
tprintf("\n");
tprintf("Correct text = [[ ");
for (int j = 0; j < word_res->correct_text.size(); ++j) {
tprintf("%s ", word_res->correct_text[j].string());
}
tprintf("]]\n");
}
return true;
}
}
}
if (applybox_debug > 0) {
tprintf("FAIL!\n");
}
return false; // Failure.
}
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @param block_list the page to search; blobs are moved between its words.
/// @param box the boxfile box to collect blobs for.
/// @param next_box the boxfile box after box, which competes for blobs.
/// @param correct_text the text set on the new word.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
}
// The word that receives the matching blobs. It is created on the first
// match and then reused for all later matches, in whichever word, row or
// block they are found.
WERD* new_word = NULL;
BLOCK_IT b_it(block_list);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
if (!box.major_overlap(block->bounding_box()))
continue;
ROW_IT r_it(block->row_list());
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
ROW* row = r_it.data();
if (!box.major_overlap(row->bounding_box()))
continue;
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
if (applybox_debug > 2) {
tprintf("Checking word:");
word->bounding_box().print();
}
// Words that already have text were created for an earlier box (or for
// this one), so their blobs are not available.
if (word->text() != NULL && word->text()[0] != '\0')
continue; // Ignore words that are already done.
if (!box.major_overlap(word->bounding_box()))
continue;
C_BLOB_IT blob_it(word->cblob_list());
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
blob_it.forward()) {
C_BLOB* blob = blob_it.data();
TBOX blob_box = blob->bounding_box();
if (!blob_box.major_overlap(box))
continue;
double current_box_miss_metric = BoxMissMetric(blob_box, box);
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
continue; // Blob is a better match for next box.
if (applybox_debug > 2) {
tprintf("Blob match: blob:");
blob_box.print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
}
if (new_word == NULL) {
// Make a new word with a single blob.
// The new word is appended to the word list of the row where the
// first matching blob was found.
new_word = word->shallow_copy();
new_word->set_text(correct_text);
w_it.add_to_end(new_word);
}
// Move the blob out of its source word and into the new word.
C_BLOB_IT new_blob_it(new_word->cblob_list());
new_blob_it.add_to_end(blob_it.extract());
}
}
}
}
if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
return new_word != NULL;
}
/// Resegments the words by running the classifier in an attempt to find the
/// correct segmentation that produces the required string.
/// Words without text are left untouched. A word whose text cannot be
/// converted to unichar ids, or for which no segmentation is found, is
/// reported and deleted from page_res.
/// @param page_res the page to resegment; modified in place.
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res = pr_it.word();
  while (word_res != NULL) {
    const char* truth = word_res->word->text();
    if (truth != NULL && truth[0] != '\0') {
      // Convert the correct text to a vector of UNICHAR_ID
      GenericVector<UNICHAR_ID> target_text;
      if (!ConvertStringToUnichars(truth, &target_text)) {
        tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", truth);
        pr_it.DeleteCurrentWord();
      } else if (!FindSegmentation(target_text, word_res)) {
        tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
                truth);
        pr_it.DeleteCurrentWord();
      }
    }
    pr_it.forward();
    word_res = pr_it.word();
  }
}
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
/// @return false if an invalid UNICHAR_ID is encountered.
bool Tesseract::ConvertStringToUnichars(const char* utf8,
GenericVector<UNICHAR_ID>* class_ids) {
for (int step = 0; *utf8 != '\0'; utf8 += step) {
const char* next_space = strchr(utf8, ' ');
if (next_space == NULL)
next_space = utf8 + strlen(utf8);
step = next_space - utf8;
UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
if (class_id == INVALID_UNICHAR_ID) {
return false;
}
while (utf8[step] == ' ')
++step;
class_ids->push_back(class_id);
}
return true;
}
/// Resegments the word to achieve the target_text from the classifier.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
/// If the search finds nothing, the original segmentation (derived from the
/// seam_array) is used instead, provided it has the same number of entries
/// as target_text.
/// @param target_text the unichar ids the word should be segmented into.
/// @param word_res the word to resegment. On success its best_state holds
/// the segmentation and its correct_text holds one string per target id.
/// @return false if the re-segmentation fails.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
WERD_RES* word_res) {
// Classify all required combinations of blobs and save results in choices.
// choices[i][j - 1] holds the result for the j blobs starting at blob i.
int word_length = word_res->box_word->length();
GenericVector<BLOB_CHOICE_LIST*>* choices =
new GenericVector<BLOB_CHOICE_LIST*>[word_length];
for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST* match_result = classify_piece(
word_res->seam_array, i, i + j - 1, "Applybox",
word_res->chopped_word, word_res->blamer_bundle);
if (applybox_debug > 2) {
tprintf("%d+%d:", i, j);
print_ratings_list("Segment:", match_result, unicharset);
}
choices[i].push_back(match_result);
}
}
// Search the segmentation graph for the target text. Must be an exact
// match. Using wildcards makes it difficult to find the correct
// segmentation even when it is there.
word_res->best_state.clear();
GenericVector<int> search_segmentation;
float best_rating = 0.0f;
// NOTE(review): SearchForText indexes choices[0] and target_text[0]
// unconditionally, so this presumably relies on word_length > 0 and a
// non-empty target_text - confirm against callers.
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
&search_segmentation, &best_rating, &word_res->best_state);
// The classifier results are no longer needed: free the lists and then the
// array of vectors itself.
for (int i = 0; i < word_length; ++i)
choices[i].delete_data_pointers();
delete [] choices;
if (word_res->best_state.empty()) {
// Build the original segmentation and if it is the same length as the
// truth, assume it will do.
// A seam without any splits closes the current group of blobs; a seam
// with splits extends the group by one.
int blob_count = 1;
for (int s = 0; s < word_res->seam_array.size(); ++s) {
SEAM* seam = word_res->seam_array[s];
if (!seam->HasAnySplits()) {
word_res->best_state.push_back(blob_count);
blob_count = 1;
} else {
++blob_count;
}
}
word_res->best_state.push_back(blob_count);
if (word_res->best_state.size() != target_text.size()) {
word_res->best_state.clear(); // No good. Original segmentation bad size.
return false;
}
}
// Store the truth as one string per target unichar id.
word_res->correct_text.clear();
for (int i = 0; i < target_text.size(); ++i) {
word_res->correct_text.push_back(
STRING(unicharset.id_to_unichar(target_text[i])));
}
return true;
}
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos index into choices of the blob position at which this
/// level of the search starts.
/// @param choices_length number of entries in choices. A match is complete
/// only when it ends exactly at this position.
/// @param target_text the unichar ids to be matched.
/// @param text_index index into target_text of the id to match at this level.
/// @param rating sum of the ratings of the choices matched on the path so far.
/// @param segmentation working list of the group lengths on the current path.
/// Each level pushes its length and removes it again before returning.
/// @param best_rating in/out: total rating of the best complete match found
/// so far. Only meaningful when best_segmentation is not empty.
/// @param best_segmentation out: copy of segmentation for the best complete
/// match found so far. Left empty if there is no complete match.
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
int choices_pos, int choices_length,
const GenericVector<UNICHAR_ID>& target_text,
int text_index,
float rating, GenericVector<int>* segmentation,
float* best_rating,
GenericVector<int>* best_segmentation) {
const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
// Try every available group length starting at choices_pos.
for (int length = 1; length <= choices[choices_pos].size(); ++length) {
// Rating of matching choice or worst choice if no match.
float choice_rating = 0.0f;
// Find the corresponding best BLOB_CHOICE.
BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
choice_it.forward()) {
BLOB_CHOICE* choice = choice_it.data();
choice_rating = choice->rating();
UNICHAR_ID class_id = choice->unichar_id();
if (class_id == target_text[text_index]) {
break;
}
// Search ambigs table.
if (class_id < table.size() && table[class_id] != NULL) {
AmbigSpec_IT spec_it(table[class_id]);
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
spec_it.forward()) {
const AmbigSpec *ambig_spec = spec_it.data();
// We'll only do 1-1.
if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
ambig_spec->correct_ngram_id == target_text[text_index])
break;
}
// Breaking out of the loop above early leaves spec_it short of a full
// cycle, which is how a found ambiguity is detected here.
if (!spec_it.cycled_list())
break; // Found an ambig.
}
}
// Likewise, a full cycle of choice_it means that no choice matched.
if (choice_it.cycled_list())
continue; // No match.
segmentation->push_back(length);
if (choices_pos + length == choices_length &&
text_index + 1 == target_text.size()) {
// This is a complete match. If the rating is good record a new best.
if (applybox_debug > 2) {
tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
rating + choice_rating, *best_rating, segmentation->size(),
best_segmentation->size());
}
// Lower total rating wins; the first complete match is always taken.
if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
*best_segmentation = *segmentation;
*best_rating = rating + choice_rating;
}
} else if (choices_pos + length < choices_length &&
text_index + 1 < target_text.size()) {
// Both blobs and target text remain, so search for the next unichar.
if (applybox_debug > 3) {
tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]),
choice_it.data()->unichar_id() == target_text[text_index]
? "Match" : "Ambig",
choices_pos, length);
}
SearchForText(choices, choices_pos + length, choices_length, target_text,
text_index + 1, rating + choice_rating, segmentation,
best_rating, best_segmentation);
if (applybox_debug > 3) {
tprintf("End recursion for %d=%s\n", target_text[text_index],
unicharset.id_to_unichar(target_text[text_index]));
}
}
// Undo the push_back above before trying the next length.
segmentation->truncate(segmentation->size() - 1);
}
}
/// - Counts up the labelled words and the blobs within.
/// - Deletes all unused or emptied words, counting the unused ones.
/// - Resets W_BOL and W_EOL flags correctly.
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
/// @param page_res the page to tidy; modified in place.
void Tesseract::TidyUp(PAGE_RES* page_res) {
  int ok_blob_count = 0;
  int bad_blob_count = 0;
  int ok_word_count = 0;
  int unlabelled_words = 0;
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
    int ok_in_word = 0;
    int blob_count = word_res->correct_text.size();
    WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
    word_choice->set_permuter(TOP_CHOICE_PERM);
    for (int c = 0; c < blob_count; ++c) {
      // A blob counts as labelled if it was given a non-empty correct_text.
      if (word_res->correct_text[c].length() > 0) {
        ++ok_in_word;
      }
      // Since we only need a fake word_res->best_choice, the actual
      // unichar_ids do not matter. Which is fortunate, since TidyUp()
      // can be called while training Tesseract, at the stage where
      // unicharset is not meaningful yet.
      word_choice->append_unichar_id_space_allocated(
          INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
    }
    if (ok_in_word > 0) {
      // The word is kept. Count it so that the debug summary below reports
      // how many words the unlabelled blobs were left in.
      ++ok_word_count;
      ok_blob_count += ok_in_word;
      bad_blob_count += blob_count - ok_in_word;
      word_res->LogNewRawChoice(word_choice);
      word_res->LogNewCookedChoice(1, false, word_choice);
    } else {
      // No blob in the word was labelled: delete the word and the fake
      // choice, which was never handed over to the word.
      ++unlabelled_words;
      if (applybox_debug > 0) {
        tprintf("APPLY_BOXES: Unlabelled word at :");
        word_res->word->bounding_box().print();
      }
      pr_it.DeleteCurrentWord();
      delete word_choice;
    }
  }
  pr_it.restart_page();
  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
    // Denormalize back to a BoxWord.
    word_res->RebuildBestState();
    word_res->SetupBoxWord();
    // A word begins/ends a line when its neighbour is in a different row.
    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
  }
  if (applybox_debug > 0) {
    tprintf(" Found %d good blobs.\n", ok_blob_count);
    if (bad_blob_count > 0) {
      tprintf(" Leaving %d unlabelled blobs in %d words.\n",
              bad_blob_count, ok_word_count);
    }
    if (unlabelled_words > 0)
      tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
  }
}
/**
 * Logs a bad box by line in the box file and box coords.
 * @param boxfile_lineno 0-based index of the box; printed 1-based.
 * @param box the box that failed.
 * @param box_ch the text of the box.
 * @param err_msg description of the failure.
 */
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
                                const char *box_ch, const char *err_msg) {
  const int line_number = boxfile_lineno + 1;
  const int left = box.left();
  const int bottom = box.bottom();
  const int right = box.right();
  const int top = box.top();
  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
          line_number, box_ch, left, bottom, right, top, err_msg);
}
/**
 * Creates a fake best_choice entry in each WERD_RES with the correct text.
 * For every word, a new WERD_CHOICE is built from correct_text and
 * best_state, the existing word choices are cleared, and the new choice is
 * logged as both the raw and the cooked choice.
 * @param page_res the page whose words are updated in place.
 */
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
       word_res = pr_it.forward()) {
    WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
                                          word_res->correct_text.size());
    for (int i = 0; i < word_res->correct_text.size(); ++i) {
      // The part before the first space is the real ground truth, and the
      // rest is the bounding box location and page number.
      GenericVector<STRING> tokens;
      word_res->correct_text[i].split(' ', &tokens);
      // A blob that was never labelled has an empty correct_text, for which
      // there may be no tokens at all. Indexing tokens[0] would then be out
      // of range, so such blobs get INVALID_UNICHAR_ID instead.
      UNICHAR_ID char_id = tokens.empty()
          ? INVALID_UNICHAR_ID
          : unicharset.unichar_to_id(tokens[0].string());
      choice->append_unichar_id_space_allocated(char_id,
                                                word_res->best_state[i],
                                                0.0f, 0.0f);
    }
    word_res->ClearWordChoices();
    word_res->LogNewRawChoice(choice);
    word_res->LogNewCookedChoice(1, false, choice);
  }
}
/// Calls #LearnWord on every word of the page to extract features for the
/// labelled blobs within each word, then prints the number of words processed.
/// Features are stored in an internal buffer.
/// @param fontname name of the font, passed to #LearnWord.
/// @param page_res the page whose words are learned.
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  int word_count = 0;
  WERD_RES* word_res = pr_it.word();
  while (word_res != NULL) {
    LearnWord(fontname.string(), word_res);
    ++word_count;
    word_res = pr_it.forward();
  }
  tprintf("Generated training data for %d words\n", word_count);
}
} // namespace tesseract