2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* File: applybox.cpp (Formerly applybox.c)
|
|
|
|
* Description: Re segment rows according to box file data
|
2009-07-11 10:03:51 +08:00
|
|
|
* Author: Phil Cheatle
|
|
|
|
* Created: Wed Nov 24 09:11:23 GMT 1993
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
|
|
|
* (C) Copyright 1993, Hewlett-Packard Ltd.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
#include "mfcpch.h"
|
2009-07-11 10:03:51 +08:00
|
|
|
|
2010-05-26 18:22:27 +08:00
|
|
|
#ifdef _MSC_VER
|
|
|
|
#pragma warning(disable:4244) // Conversion warnings
|
|
|
|
#endif
|
|
|
|
|
2007-07-18 09:11:18 +08:00
|
|
|
#include <ctype.h>
|
|
|
|
#include <string.h>
|
2007-03-08 04:03:40 +08:00
|
|
|
#ifdef __UNIX__
|
2007-07-18 09:11:18 +08:00
|
|
|
#include <assert.h>
|
|
|
|
#include <errno.h>
|
2007-03-08 04:03:40 +08:00
|
|
|
#endif
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "allheaders.h"
|
2007-08-31 02:18:35 +08:00
|
|
|
#include "boxread.h"
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "chopper.h"
|
|
|
|
#include "pageres.h"
|
2007-07-18 09:11:18 +08:00
|
|
|
#include "unichar.h"
|
2009-07-11 10:03:51 +08:00
|
|
|
#include "unicharset.h"
|
|
|
|
#include "tesseractclass.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Max number of blobs to classify together in FindSegmentation.
|
|
|
|
const int kMaxGroupSize = 4;
|
2010-07-27 02:21:10 +08:00
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
/*************************************************************************
|
|
|
|
* The box file is assumed to contain box definitions, one per line, of the
|
2010-11-24 02:34:14 +08:00
|
|
|
* following format for blob-level boxes:
|
|
|
|
* <UTF8 str> <left> <bottom> <right> <top> <page id>
|
|
|
|
* and for word/line-level boxes:
|
|
|
|
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
|
|
|
* NOTES:
|
|
|
|
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
2010-11-24 02:34:14 +08:00
|
|
|
* <page id> is 0-based, and the page number is used for multipage input (tiff).
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
2010-11-24 02:34:14 +08:00
|
|
|
* In the blob-level form, each line represents a recognizable unit, which may
|
|
|
|
* be several UTF-8 bytes, but there is a bounding box around each recognizable
|
|
|
|
* unit, and no classifier is needed to train in this mode (bootstrapping.)
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
2010-11-24 02:34:14 +08:00
|
|
|
* In the word/line-level form, the line begins with the literal "WordStr", and
|
|
|
|
* the bounding box bounds either a whole line or a whole word. The recognizable
|
|
|
|
* units in the word/line are listed after the # at the end of the line and
|
|
|
|
* are space delimited, ignoring any original spaces on the line.
|
|
|
|
* Eg.
|
|
|
|
* word -> #w o r d
|
|
|
|
* multi word line -> #m u l t i w o r d l i n e
|
|
|
|
* The recognizable units must be space-delimited in order to allow multiple
|
|
|
|
* unicodes to be used for a single recognizable unit, eg Hindi.
|
|
|
|
* In this mode, the classifier must have been pre-trained with the desired
|
|
|
|
* character set, or it will not be able to find the character segmentations.
|
2007-03-08 04:03:40 +08:00
|
|
|
*************************************************************************/
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
namespace tesseract {
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Blanks the text of every word on the page: walks each block in block_list,
// each row of the block and each word of the row, setting the word's text to
// the empty string.
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        WERD* word = words.data();
        word->set_text("");
      }
    }
  }
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Applies the box file based on the image name fname, and resegments
|
|
|
|
// the words in the block_list (page), with:
|
|
|
|
// blob-mode: one blob per line in the box file, words as input.
|
|
|
|
// word/line-mode: one blob per space-delimited unit after the #, and one word
|
|
|
|
// per line in the box file. (See comment above for box file format.)
|
|
|
|
// If find_segmentation is true, (word/line mode) then the classifier is used
|
|
|
|
// to re-segment words/lines to match the space-delimited truth string for
|
|
|
|
// each box. In this case, the input box may be for a word or even a whole
|
|
|
|
// text line, and the output words will contain multiple blobs corresponding
|
|
|
|
// to the space-delimited input string.
|
|
|
|
// With find_segmentation false, no classifier is needed, but the chopper
|
|
|
|
// can still be used to correctly segment touching characters with the help
|
|
|
|
// of the input boxes.
|
|
|
|
// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
|
|
|
|
// from normal classification, ie. with a word, chopped_word, rebuild_word,
|
|
|
|
// seam_array, denorm, box_word, and best_state, but NO best_choice or
|
|
|
|
// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
|
|
|
|
// Instead, the correct_text member of WERD_RES is set, and this may be later
|
|
|
|
// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
|
|
|
|
// is not required before calling ApplyBoxTraining.
|
|
|
|
PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
|
|
|
|
bool find_segmentation,
|
|
|
|
BLOCK_LIST *block_list) {
|
|
|
|
// In word mode, we use the boxes to make a word for each box, but
|
|
|
|
// in blob mode we use the existing words and maximally chop them first.
|
|
|
|
PAGE_RES* page_res = find_segmentation ? NULL : SetupApplyBoxes(block_list);
|
|
|
|
int box_count = 0;
|
|
|
|
int box_failures = 0;
|
|
|
|
|
|
|
|
FILE* box_file = OpenBoxFile(fname);
|
2009-07-11 10:03:51 +08:00
|
|
|
clear_any_old_text(block_list);
|
2010-11-24 02:34:14 +08:00
|
|
|
TBOX prev_box, box, next_box;
|
|
|
|
bool found_box = false;
|
|
|
|
char text[kBoxReadBufSize];
|
|
|
|
do {
|
|
|
|
prev_box = box;
|
|
|
|
box = next_box;
|
|
|
|
int line_number = 0; // Line number of the box file.
|
|
|
|
int x_min;
|
|
|
|
int y_min;
|
|
|
|
int x_max;
|
|
|
|
int y_max;
|
|
|
|
char next_text[kBoxReadBufSize];
|
|
|
|
// Keep a look-ahead box, so we can pass the next box into the resegment
|
|
|
|
// functions.
|
|
|
|
found_box = read_next_box(applybox_page, &line_number, box_file, next_text,
|
|
|
|
&x_min, &y_min, &x_max, &y_max);
|
|
|
|
if (found_box) {
|
|
|
|
next_box = TBOX(ICOORD(x_min, y_min), ICOORD (x_max, y_max));
|
|
|
|
++box_count;
|
|
|
|
} else {
|
|
|
|
next_box = TBOX();
|
|
|
|
next_text[0] = '\0';
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (!box.null_box()) {
|
|
|
|
bool foundit = false;
|
|
|
|
if (page_res != NULL)
|
|
|
|
foundit = ResegmentCharBox(page_res, box, next_box, text);
|
|
|
|
else
|
|
|
|
foundit = ResegmentWordBox(block_list, box, next_box, text);
|
|
|
|
if (!foundit) {
|
|
|
|
box_failures++;
|
|
|
|
ReportFailedBox(box_count, box, text,
|
|
|
|
"FAILURE! Couldn't find a matching blob");
|
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
strcpy(text, next_text);
|
|
|
|
} while (found_box);
|
|
|
|
if (page_res == NULL) {
|
|
|
|
// In word/line mode, we now maximally chop all the words and resegment
|
|
|
|
// them with the classifier.
|
|
|
|
page_res = SetupApplyBoxes(block_list);
|
|
|
|
ReSegmentByClassification(page_res);
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (applybox_debug > 0) {
|
|
|
|
tprintf("APPLY_BOXES:\n");
|
|
|
|
tprintf(" Boxes read from boxfile: %6d\n", box_count);
|
|
|
|
tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
|
|
|
|
}
|
|
|
|
TidyUp(page_res);
|
|
|
|
return page_res;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
|
|
|
|
// All fuzzy spaces are removed, and all the words are maximally chopped.
|
|
|
|
PAGE_RES* Tesseract::SetupApplyBoxes(BLOCK_LIST *block_list) {
|
|
|
|
// Strip all fuzzy space markers to simplify the PAGE_RES.
|
|
|
|
BLOCK_IT b_it(block_list);
|
|
|
|
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
|
|
|
BLOCK* block = b_it.data();
|
|
|
|
ROW_IT r_it(block->row_list());
|
|
|
|
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
|
|
|
|
ROW* row = r_it.data();
|
|
|
|
WERD_IT w_it(row->word_list());
|
|
|
|
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
|
|
|
WERD* word = w_it.data();
|
|
|
|
if (word->cblob_list()->empty()) {
|
|
|
|
delete w_it.extract();
|
|
|
|
} else {
|
|
|
|
word->set_flag(W_FUZZY_SP, false);
|
|
|
|
word->set_flag(W_FUZZY_NON, false);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
|
|
|
|
PAGE_RES_IT pr_it(page_res);
|
|
|
|
WERD_RES* word_res;
|
|
|
|
while ((word_res = pr_it.word()) != NULL) {
|
|
|
|
MaximallyChopWord(pr_it.block()->block, pr_it.row()->row, word_res);
|
|
|
|
pr_it.forward();
|
|
|
|
}
|
|
|
|
return page_res;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only
|
|
|
|
// the top choices. Avoids problems with very long words.
|
|
|
|
static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
|
|
|
|
const UNICHARSET& unicharset,
|
|
|
|
WERD_CHOICE* word_choice) {
|
|
|
|
word_choice->make_bad();
|
|
|
|
for (int i = 0; i < char_choices.size(); ++i) {
|
|
|
|
BLOB_CHOICE_IT it(char_choices[i]);
|
|
|
|
BLOB_CHOICE* bc = it.data();
|
|
|
|
word_choice->append_unichar_id(bc->unichar_id(), 1,
|
|
|
|
bc->rating(), bc->certainty());
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
word_choice->populate_unichars(unicharset);
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Tests the chopper by exhaustively running chop_one_blob.
|
|
|
|
// The word_res will contain filled chopped_word, seam_array, denorm,
|
|
|
|
// box_word and best_state for the maximally chopped word.
|
|
|
|
void Tesseract::MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
|
|
|
if (!word_res->SetupForRecognition(unicharset, false, row, block))
|
|
|
|
return;
|
|
|
|
if (chop_debug) {
|
|
|
|
tprintf("Maximally chopping word at:");
|
|
|
|
word_res->word->bounding_box().print();
|
|
|
|
}
|
|
|
|
blob_match_table.init_match_table();
|
|
|
|
BLOB_CHOICE_LIST *match_result;
|
|
|
|
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
|
|
|
|
set_denorm(&word_res->denorm);
|
|
|
|
ASSERT_HOST(word_res->chopped_word->blobs != NULL);
|
|
|
|
float rating = static_cast<float>(MAX_INT8);
|
|
|
|
for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
|
|
|
|
blob = blob->next) {
|
|
|
|
// The rating and certainty are not quite arbitrary. Since
|
|
|
|
// select_blob_to_chop uses the worst certainty to choose, they all have
|
|
|
|
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
|
|
|
|
// in here, and then divide by e each time they are chopped, which
|
|
|
|
// should guarantee a set of unequal values for the whole tree of blobs
|
|
|
|
// produced, however much chopping is required. The chops are thus only
|
|
|
|
// limited by the ability of the chopper to find suitable chop points,
|
|
|
|
// and not by the value of the certainties.
|
|
|
|
match_result = fake_classify_blob(0, rating, -rating);
|
|
|
|
modify_blob_choice(match_result, 0);
|
|
|
|
ASSERT_HOST(!match_result->empty());
|
|
|
|
*char_choices += match_result;
|
|
|
|
rating -= 0.125f;
|
|
|
|
}
|
|
|
|
inT32 blob_number;
|
|
|
|
int right_chop_index = 0;
|
|
|
|
while (chop_one_blob(word_res->chopped_word, char_choices,
|
|
|
|
&blob_number, &word_res->seam_array, &right_chop_index));
|
|
|
|
MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
|
|
|
|
MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
|
|
|
|
word_res->CloneChoppedToRebuild();
|
|
|
|
blob_match_table.end_match_table();
|
|
|
|
if (char_choices != NULL) {
|
|
|
|
char_choices->delete_data_pointers();
|
|
|
|
delete char_choices;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Helper to compute the dispute resolution metric.
|
|
|
|
// Disputed blob resolution. The aim is to give the blob to the most
|
|
|
|
// appropriate boxfile box. Most of the time it is obvious, but if
|
|
|
|
// two boxfile boxes overlap significantly it is not. If a small boxfile
|
|
|
|
// box takes most of the blob, and a large boxfile box does too, then
|
|
|
|
// we want the small boxfile box to get it, but if the small box
|
|
|
|
// is much smaller than the blob, we don't want it to get it.
|
|
|
|
// Details of the disputed blob resolution:
|
|
|
|
// Given a box with area A, and a blob with area B, with overlap area C,
|
|
|
|
// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
|
|
|
|
// miss metric gets the blob.
|
|
|
|
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
|
|
|
|
int overlap_area = box1.intersection(box2).area();
|
|
|
|
double miss_metric = box1.area()- overlap_area;
|
|
|
|
miss_metric /= box1.area();
|
|
|
|
miss_metric *= box2.area() - overlap_area;
|
|
|
|
miss_metric /= box2.area();
|
|
|
|
return miss_metric;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Gather consecutive blobs that match the given box into the best_state
|
|
|
|
// and corresponding correct_text.
|
|
|
|
// Fights over which box owns which blobs are settled by pre-chopping and
|
|
|
|
// applying the blobs to box or next_box with the least non-overlap.
|
|
|
|
// Returns false if the box was in error, which can only be caused by
|
|
|
|
// failing to find an appropriate blob for a box.
|
|
|
|
// This means that occasionally, blobs may be incorrectly segmented if the
|
|
|
|
// chopper fails to find a suitable chop point.
|
|
|
|
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res,
|
|
|
|
const TBOX& box, const TBOX& next_box,
|
|
|
|
const char* correct_text) {
|
|
|
|
if (applybox_debug > 1) {
|
|
|
|
tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
|
|
|
|
}
|
|
|
|
PAGE_RES_IT page_res_it(page_res);
|
|
|
|
WERD_RES* word_res;
|
|
|
|
for (word_res = page_res_it.word(); word_res != NULL;
|
|
|
|
word_res = page_res_it.forward()) {
|
|
|
|
if (!word_res->box_word->bounding_box().major_overlap(box))
|
|
|
|
continue;
|
|
|
|
if (applybox_debug > 1) {
|
|
|
|
tprintf("Checking word box:");
|
|
|
|
word_res->box_word->bounding_box().print();
|
|
|
|
}
|
|
|
|
int word_len = word_res->box_word->length();
|
|
|
|
for (int i = 0; i < word_len; ++i) {
|
|
|
|
int blob_count = 0;
|
|
|
|
for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
|
|
|
|
TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
|
|
|
|
if (!blob_box.major_overlap(box))
|
|
|
|
break;
|
|
|
|
if (word_res->correct_text[i + blob_count].length() > 0)
|
|
|
|
break; // Blob is claimed already.
|
|
|
|
double current_box_miss_metric = BoxMissMetric(blob_box, box);
|
|
|
|
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
|
|
|
|
if (applybox_debug > 2) {
|
|
|
|
tprintf("Checking blob:");
|
|
|
|
blob_box.print();
|
|
|
|
tprintf("Current miss metric = %g, next = %g\n",
|
|
|
|
current_box_miss_metric, next_box_miss_metric);
|
|
|
|
}
|
|
|
|
if (current_box_miss_metric > next_box_miss_metric)
|
|
|
|
break; // Blob is a better match for next box.
|
|
|
|
}
|
|
|
|
if (blob_count > 0) {
|
|
|
|
// We refine just the box_word, best_state and correct_text here.
|
|
|
|
// The rebuild_word is made in TidyUp.
|
|
|
|
// blob_count blobs are put together to match the box. Merge the
|
|
|
|
// box_word boxes, save the blob_count in the state and the text.
|
|
|
|
word_res->box_word->MergeBoxes(i, i + blob_count);
|
|
|
|
word_res->best_state[i] = blob_count;
|
|
|
|
word_res->correct_text[i] = correct_text;
|
|
|
|
if (applybox_debug > 2) {
|
|
|
|
tprintf("%d Blobs match: blob box:", blob_count);
|
|
|
|
word_res->box_word->BlobBox(i).print();
|
|
|
|
tprintf("Matches box:");
|
|
|
|
box.print();
|
|
|
|
tprintf("With next box:");
|
|
|
|
next_box.print();
|
|
|
|
}
|
|
|
|
// Eliminated best_state and correct_text entries for the consumed
|
|
|
|
// blobs.
|
|
|
|
for (int j = 1; j < blob_count; ++j) {
|
|
|
|
word_res->best_state.remove(i + 1);
|
|
|
|
word_res->correct_text.remove(i + 1);
|
|
|
|
}
|
|
|
|
// Assume that no box spans multiple source words, so we are done with
|
|
|
|
// this box.
|
|
|
|
if (applybox_debug > 1) {
|
|
|
|
tprintf("Best state = ");
|
|
|
|
for (int j = 0; j < word_res->best_state.size(); ++j) {
|
|
|
|
tprintf("%d ", word_res->best_state[j]);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
tprintf("\n");
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return true;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return false; // Failure.
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Consume all source blobs that strongly overlap the given box,
|
|
|
|
// putting them into a new word, with the correct_text label.
|
|
|
|
// Fights over which box owns which blobs are settled by
|
|
|
|
// applying the blobs to box or next_box with the least non-overlap.
|
|
|
|
// Returns false if the box was in error, which can only be caused by
|
|
|
|
// failing to find an overlapping blob for a box.
|
|
|
|
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
|
|
|
|
const TBOX& box, const TBOX& next_box,
|
|
|
|
const char* correct_text) {
|
|
|
|
if (applybox_debug > 1) {
|
|
|
|
tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
WERD* new_word = NULL;
|
|
|
|
BLOCK_IT b_it(block_list);
|
|
|
|
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
|
|
|
BLOCK* block = b_it.data();
|
|
|
|
if (!box.major_overlap(block->bounding_box()))
|
|
|
|
continue;
|
|
|
|
ROW_IT r_it(block->row_list());
|
|
|
|
for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
|
|
|
|
ROW* row = r_it.data();
|
|
|
|
if (!box.major_overlap(row->bounding_box()))
|
|
|
|
continue;
|
|
|
|
WERD_IT w_it(row->word_list());
|
|
|
|
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
|
|
|
WERD* word = w_it.data();
|
|
|
|
if (applybox_debug > 2) {
|
|
|
|
tprintf("Checking word:");
|
|
|
|
word->bounding_box().print();
|
|
|
|
}
|
|
|
|
if (word->text() != NULL && word->text()[0] != '\0')
|
|
|
|
continue; // Ignore words that are already done.
|
|
|
|
if (!box.major_overlap(word->bounding_box()))
|
|
|
|
continue;
|
|
|
|
C_BLOB_IT blob_it(word->cblob_list());
|
|
|
|
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
|
|
|
|
blob_it.forward()) {
|
|
|
|
C_BLOB* blob = blob_it.data();
|
|
|
|
TBOX blob_box = blob->bounding_box();
|
|
|
|
if (!blob_box.major_overlap(box))
|
|
|
|
continue;
|
|
|
|
double current_box_miss_metric = BoxMissMetric(blob_box, box);
|
|
|
|
double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
|
|
|
|
if (applybox_debug > 2) {
|
|
|
|
tprintf("Checking blob:");
|
|
|
|
blob_box.print();
|
|
|
|
tprintf("Current miss metric = %g, next = %g\n",
|
|
|
|
current_box_miss_metric, next_box_miss_metric);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (current_box_miss_metric > next_box_miss_metric)
|
|
|
|
continue; // Blob is a better match for next box.
|
|
|
|
if (applybox_debug > 2) {
|
|
|
|
tprintf("Blob match: blob:");
|
|
|
|
blob_box.print();
|
|
|
|
tprintf("Matches box:");
|
|
|
|
box.print();
|
|
|
|
tprintf("With next box:");
|
|
|
|
next_box.print();
|
|
|
|
}
|
|
|
|
if (new_word == NULL) {
|
|
|
|
// Make a new word with a single blob.
|
|
|
|
new_word = word->shallow_copy();
|
|
|
|
new_word->set_text(correct_text);
|
|
|
|
w_it.add_to_end(new_word);
|
|
|
|
}
|
|
|
|
C_BLOB_IT new_blob_it(new_word->cblob_list());
|
|
|
|
new_blob_it.add_to_end(blob_it.extract());
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return new_word != NULL;
|
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Resegments the words by running the classifier in an attempt to find the
|
|
|
|
// correct segmentation that produces the required string.
|
|
|
|
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
|
|
|
|
PAGE_RES_IT pr_it(page_res);
|
|
|
|
WERD_RES* word_res;
|
|
|
|
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
|
|
|
|
WERD* word = word_res->word;
|
|
|
|
if (word->text() == NULL || word->text()[0] == '\0')
|
|
|
|
continue; // Ignore words that have no text.
|
|
|
|
// Convert the correct text to a vector of UNICHAR_ID
|
|
|
|
GenericVector<UNICHAR_ID> target_text;
|
|
|
|
if (!ConvertStringToUnichars(word->text(), &target_text)) {
|
|
|
|
tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
|
|
|
|
word->text());
|
|
|
|
pr_it.DeleteCurrentWord();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!FindSegmentation(target_text, word_res)) {
|
|
|
|
tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
|
|
|
|
word->text());
|
|
|
|
pr_it.DeleteCurrentWord();
|
|
|
|
continue;
|
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
|
|
|
// Returns false if an invalid UNICHAR_ID is encountered.
|
|
|
|
bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
|
|
|
GenericVector<UNICHAR_ID>* class_ids) {
|
|
|
|
for (int step = 0; *utf8 != '\0'; utf8 += step) {
|
|
|
|
const char* next_space = strchr(utf8, ' ');
|
|
|
|
if (next_space == NULL)
|
|
|
|
next_space = utf8 + strlen(utf8);
|
|
|
|
step = next_space - utf8;
|
|
|
|
UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
|
|
|
|
if (class_id == INVALID_UNICHAR_ID) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
while (utf8[step] == ' ')
|
|
|
|
++step;
|
|
|
|
class_ids->push_back(class_id);
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return true;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Resegments the word to achieve the target_text from the classifier.
|
|
|
|
// Returns false if the re-segmentation fails.
|
|
|
|
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
|
|
|
|
// applies a full search on the classifier results to find the best classified
|
|
|
|
// segmentation. As a compromise to obtain better recall, 1-1 ambigiguity
|
|
|
|
// substitutions ARE used.
|
|
|
|
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
|
|
|
WERD_RES* word_res) {
|
|
|
|
blob_match_table.init_match_table();
|
|
|
|
// Classify all required combinations of blobs and save results in choices.
|
|
|
|
int word_length = word_res->box_word->length();
|
|
|
|
GenericVector<BLOB_CHOICE_LIST*>* choices =
|
|
|
|
new GenericVector<BLOB_CHOICE_LIST*>[word_length];
|
|
|
|
for (int i = 0; i < word_length; ++i) {
|
|
|
|
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
|
|
|
|
BLOB_CHOICE_LIST* match_result = classify_piece(
|
|
|
|
word_res->chopped_word->blobs, word_res->seam_array,
|
|
|
|
i, i + j - 1);
|
|
|
|
if (applybox_debug > 2) {
|
|
|
|
tprintf("%d+%d:", i, j);
|
|
|
|
print_ratings_list("Segment:", match_result, unicharset);
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
choices[i].push_back(match_result);
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
// Search the segmentation graph for the target text. Must be an exact
|
|
|
|
// match. Using wildcards makes it difficult to find the correct
|
|
|
|
// segmentation even when it is there.
|
|
|
|
word_res->best_state.clear();
|
|
|
|
GenericVector<int> search_segmentation;
|
|
|
|
float best_rating = 0.0f;
|
|
|
|
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
|
|
|
|
&search_segmentation, &best_rating, &word_res->best_state);
|
|
|
|
blob_match_table.end_match_table();
|
|
|
|
for (int i = 0; i < word_length; ++i)
|
|
|
|
choices[i].delete_data_pointers();
|
|
|
|
delete [] choices;
|
|
|
|
if (word_res->best_state.empty())
|
|
|
|
return false;
|
|
|
|
word_res->correct_text.clear();
|
|
|
|
for (int i = 0; i < target_text.size(); ++i) {
|
|
|
|
word_res->correct_text.push_back(
|
|
|
|
STRING(unicharset.id_to_unichar(target_text[i])));
|
2009-07-11 10:03:51 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return true;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Recursive helper to find a match to the target_text (from text_index
|
|
|
|
// position) in the choices (from choices_pos position).
|
|
|
|
// Choices is an array of GenericVectors, of length choices_length, with each
|
|
|
|
// element representing a starting position in the word, and the
|
|
|
|
// GenericVector holding classification results for a sequence of consecutive
|
|
|
|
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
|
|
|
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
|
|
|
|
int choices_pos, int choices_length,
|
|
|
|
const GenericVector<UNICHAR_ID>& target_text,
|
|
|
|
int text_index,
|
|
|
|
float rating, GenericVector<int>* segmentation,
|
|
|
|
float* best_rating,
|
|
|
|
GenericVector<int>* best_segmentation) {
|
|
|
|
const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
|
|
|
|
for (int length = 1; length <= choices[choices_pos].size(); ++length) {
|
|
|
|
// Rating of matching choice or worst choice if no match.
|
|
|
|
float choice_rating = 0.0f;
|
|
|
|
// Find the corresponding best BLOB_CHOICE.
|
|
|
|
BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
|
|
|
|
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
|
|
|
|
choice_it.forward()) {
|
|
|
|
BLOB_CHOICE* choice = choice_it.data();
|
|
|
|
choice_rating = choice->rating();
|
|
|
|
UNICHAR_ID class_id = choice->unichar_id();
|
|
|
|
if (class_id == target_text[text_index]) {
|
|
|
|
break;
|
2007-07-18 09:11:18 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
// Search ambigs table.
|
|
|
|
if (class_id < table.size() && table[class_id] != NULL) {
|
|
|
|
AmbigSpec_IT spec_it(table[class_id]);
|
|
|
|
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
|
|
|
|
spec_it.forward()) {
|
|
|
|
const AmbigSpec *ambig_spec = spec_it.data();
|
|
|
|
// We'll only do 1-1.
|
|
|
|
if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
|
|
|
|
ambig_spec->correct_ngram_id == target_text[text_index])
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!spec_it.cycled_list())
|
|
|
|
break; // Found an ambig.
|
2007-07-18 09:11:18 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (choice_it.cycled_list())
|
|
|
|
continue; // No match.
|
|
|
|
segmentation->push_back(length);
|
|
|
|
if (choices_pos + length == choices_length &&
|
|
|
|
text_index + 1 == target_text.size()) {
|
|
|
|
// This is a complete match. If the rating is good record a new best.
|
|
|
|
if (applybox_debug > 2) {
|
|
|
|
tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
|
|
|
|
rating + choice_rating, *best_rating, segmentation->size(),
|
|
|
|
best_segmentation->size());
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
|
|
|
|
*best_segmentation = *segmentation;
|
|
|
|
*best_rating = rating + choice_rating;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
} else if (choices_pos + length < choices_length &&
|
|
|
|
text_index + 1 < target_text.size()) {
|
|
|
|
if (applybox_debug > 3) {
|
|
|
|
tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
|
|
|
|
target_text[text_index],
|
|
|
|
unicharset.id_to_unichar(target_text[text_index]),
|
|
|
|
choice_it.data()->unichar_id() == target_text[text_index]
|
|
|
|
? "Match" : "Ambig",
|
|
|
|
choices_pos, length);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
SearchForText(choices, choices_pos + length, choices_length, target_text,
|
|
|
|
text_index + 1, rating + choice_rating, segmentation,
|
|
|
|
best_rating, best_segmentation);
|
|
|
|
if (applybox_debug > 3) {
|
|
|
|
tprintf("End recursion for %d=%s\n", target_text[text_index],
|
|
|
|
unicharset.id_to_unichar(target_text[text_index]));
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
segmentation->truncate(segmentation->size() - 1);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Counts up the labelled words and the blobs within.
|
|
|
|
// Deletes all unused or emptied words, counting the unused ones.
|
|
|
|
// Resets W_BOL and W_EOL flags correctly.
|
|
|
|
// Builds the rebuild_word and rebuilds the box_word.
|
|
|
|
void Tesseract::TidyUp(PAGE_RES* page_res) {
|
|
|
|
int ok_blob_count = 0;
|
|
|
|
int bad_blob_count = 0;
|
|
|
|
int ok_word_count = 0;
|
|
|
|
int unlabelled_words = 0;
|
|
|
|
PAGE_RES_IT pr_it(page_res);
|
|
|
|
WERD_RES* word_res;
|
|
|
|
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
|
|
|
|
int ok_in_word = 0;
|
|
|
|
for (int i = 0; i < word_res->correct_text.size(); ++i) {
|
|
|
|
if (word_res->correct_text[i].length() > 0) {
|
|
|
|
++ok_in_word;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (ok_in_word > 0) {
|
|
|
|
ok_blob_count += ok_in_word;
|
|
|
|
bad_blob_count += word_res->correct_text.size() - ok_in_word;
|
|
|
|
} else {
|
|
|
|
++unlabelled_words;
|
|
|
|
if (applybox_debug > 0) {
|
|
|
|
tprintf("APPLY_BOXES: Unlabelled word at :");
|
|
|
|
word_res->word->bounding_box().print();
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
pr_it.DeleteCurrentWord();
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
pr_it.restart_page();
|
|
|
|
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
|
|
|
|
// Denormalize back to a BoxWord.
|
|
|
|
word_res->RebuildBestState();
|
|
|
|
word_res->SetupBoxWord();
|
|
|
|
word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
|
|
|
|
word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
|
|
|
|
}
|
|
|
|
if (applybox_debug > 0) {
|
|
|
|
tprintf(" Found %d good blobs and %d unlabelled blobs in %d words.\n",
|
|
|
|
ok_blob_count, bad_blob_count, ok_word_count);
|
|
|
|
tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Logs a bad box by line in the box file and box coords.
|
|
|
|
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
|
|
|
|
const char *box_ch, const char *err_msg) {
|
|
|
|
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
|
|
|
|
boxfile_lineno, box_ch,
|
|
|
|
box.left(), box.bottom(), box.right(), box.top(), err_msg);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Creates a fake best_choice entry in each WERD_RES with the correct text.
|
|
|
|
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
|
|
|
PAGE_RES_IT pr_it(page_res);
|
|
|
|
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
|
|
|
|
word_res = pr_it.forward()) {
|
|
|
|
WERD_CHOICE* choice = new WERD_CHOICE(word_res->correct_text.size());
|
|
|
|
for (int i = 0; i < word_res->correct_text.size(); ++i) {
|
|
|
|
UNICHAR_ID char_id = unicharset.unichar_to_id(
|
|
|
|
word_res->correct_text[i].string());
|
|
|
|
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
choice->populate_unichars(unicharset);
|
|
|
|
if (word_res->best_choice != NULL)
|
|
|
|
delete word_res->best_choice;
|
|
|
|
word_res->best_choice = choice;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Calls LearnWord to extract features for labelled blobs within each word.
|
|
|
|
// Features are written to the given filename.
|
|
|
|
void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
|
|
|
|
PAGE_RES_IT pr_it(page_res);
|
|
|
|
int word_count = 0;
|
|
|
|
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
|
|
|
|
word_res = pr_it.forward()) {
|
|
|
|
LearnWord(filename.string(), NULL, word_res);
|
|
|
|
++word_count;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
tprintf ("Generated training data for %d words\n", word_count);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
} // namespace tesseract
|