applybox.cpp, take one

This commit is contained in:
Jim O'Regan 2015-07-20 11:27:21 +01:00
parent 2982583daa
commit 192a24ab7b

View File

@ -36,18 +36,22 @@
#include "tesseractclass.h"
#include "genericvector.h"
// Max number of blobs to classify together in FindSegmentation.
/** Max number of blobs to classify together in FindSegmentation. */
const int kMaxGroupSize = 4;
// Max fraction of median allowed as deviation in xheight before switching
// to median.
/// Max fraction of median allowed as deviation in xheight before switching
/// to median.
const double kMaxXHeightDeviationFraction = 0.125;
/*************************************************************************
/**
* The box file is assumed to contain box definitions, one per line, of the
* following format for blob-level boxes:
* @verbatim
* <UTF8 str> <left> <bottom> <right> <top> <page id>
* @endverbatim
* and for word/line-level boxes:
* @verbatim
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
* @endverbatim
* NOTES:
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
*
@ -62,13 +66,16 @@ const double kMaxXHeightDeviationFraction = 0.125;
* units in the word/line are listed after the # at the end of the line and
* are space delimited, ignoring any original spaces on the line.
* E.g.
* @verbatim
* word -> #w o r d
* multi word line -> #m u l t i w o r d l i n e
* @endverbatim
* The recognizable units must be space-delimited in order to allow multiple
* unicodes to be used for a single recognizable unit, e.g. Hindi.
*
* In this mode, the classifier must have been pre-trained with the desired
* character set, or it will not be able to find the character segmentations.
*************************************************************************/
*/
namespace tesseract {
@ -181,8 +188,8 @@ static double MedianXHeight(BLOCK_LIST *block_list) {
return xheights.median();
}
// Any row xheight that is significantly different from the median is set
// to the median.
/// Any row xheight that is significantly different from the median is set
/// to the median.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
double median_xheight = MedianXHeight(block_list);
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
@ -205,8 +212,8 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
}
}
// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
// All fuzzy spaces are removed, and all the words are maximally chopped.
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
/// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
BLOCK_LIST *block_list) {
PreenXHeights(block_list);
@ -240,9 +247,9 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
return page_res;
}
// Tests the chopper by exhaustively running chop_one_blob.
// The word_res will contain filled chopped_word, seam_array, denorm,
// box_word and best_state for the maximally chopped word.
/// Tests the chopper by exhaustively running chop_one_blob.
/// The word_res will contain filled chopped_word, seam_array, denorm,
/// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
BLOCK* block, ROW* row,
WERD_RES* word_res) {
@ -300,17 +307,17 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
// Helper to compute the dispute resolution metric.
// Disputed blob resolution. The aim is to give the blob to the most
// appropriate boxfile box. Most of the time it is obvious, but if
// two boxfile boxes overlap significantly it is not. If a small boxfile
// box takes most of the blob, and a large boxfile box does too, then
// we want the small boxfile box to get it, but if the small box
// is much smaller than the blob, we don't want it to get it.
// Details of the disputed blob resolution:
// Given a box with area A, and a blob with area B, with overlap area C,
// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
// miss metric gets the blob.
/// Helper to compute the dispute resolution metric.
/// Disputed blob resolution. The aim is to give the blob to the most
/// appropriate boxfile box. Most of the time it is obvious, but if
/// two boxfile boxes overlap significantly it is not. If a small boxfile
/// box takes most of the blob, and a large boxfile box does too, then
/// we want the small boxfile box to get it, but if the small box
/// is much smaller than the blob, we don't want it to get it.
/// Details of the disputed blob resolution:
/// Given a box with area A, and a blob with area B, with overlap area C,
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
/// miss metric gets the blob.
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
int overlap_area = box1.intersection(box2).area();
double miss_metric = box1.area()- overlap_area;
@ -320,14 +327,16 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
return miss_metric;
}
// Gather consecutive blobs that match the given box into the best_state
// and corresponding correct_text.
// Fights over which box owns which blobs are settled by pre-chopping and
// applying the blobs to box or next_box with the least non-overlap.
// Returns false if the box was in error, which can only be caused by
// failing to find an appropriate blob for a box.
// This means that occasionally, blobs may be incorrectly segmented if the
// chopper fails to find a suitable chop point.
/// Gather consecutive blobs that match the given box into the best_state
/// and corresponding correct_text.
///
/// Fights over which box owns which blobs are settled by pre-chopping and
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an appropriate blob for a box.
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
@ -420,12 +429,12 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
return false; // Failure.
}
// Consume all source blobs that strongly overlap the given box,
// putting them into a new word, with the correct_text label.
// Fights over which box owns which blobs are settled by
// applying the blobs to box or next_box with the least non-overlap.
// Returns false if the box was in error, which can only be caused by
// failing to find an overlapping blob for a box.
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
@ -495,8 +504,8 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
return new_word != NULL;
}
// Resegments the words by running the classifier in an attempt to find the
// correct segmentation that produces the required string.
/// Resegments the words by running the classifier in an attempt to find the
/// correct segmentation that produces the required string.
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
@ -521,8 +530,8 @@ void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
}
}
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
// Returns false if an invalid UNICHAR_ID is encountered.
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
/// @return false if an invalid UNICHAR_ID is encountered.
bool Tesseract::ConvertStringToUnichars(const char* utf8,
GenericVector<UNICHAR_ID>* class_ids) {
for (int step = 0; *utf8 != '\0'; utf8 += step) {
@ -541,12 +550,12 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
return true;
}
// Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails.
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
// substitutions ARE used.
/// Resegments the word to achieve the target_text from the classifier.
/// @return false if the re-segmentation fails.
///
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
WERD_RES* word_res) {
// Classify all required combinations of blobs and save results in choices.
@ -603,12 +612,12 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
return true;
}
// Recursive helper to find a match to the target_text (from text_index
// position) in the choices (from choices_pos position).
// Choices is an array of GenericVectors, of length choices_length, with each
// element representing a starting position in the word, and the
// GenericVector holding classification results for a sequence of consecutive
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices An array of GenericVectors, of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
int choices_pos, int choices_length,
const GenericVector<UNICHAR_ID>& target_text,
@ -682,10 +691,10 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
}
}
// Counts up the labelled words and the blobs within.
// Deletes all unused or emptied words, counting the unused ones.
// Resets W_BOL and W_EOL flags correctly.
// Builds the rebuild_word and rebuilds the box_word and the best_choice.
/// - Counts up the labelled words and the blobs within.
/// - Deletes all unused or emptied words, counting the unused ones.
/// - Resets W_BOL and W_EOL flags correctly.
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
void Tesseract::TidyUp(PAGE_RES* page_res) {
int ok_blob_count = 0;
int bad_blob_count = 0;
@ -743,7 +752,7 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
}
}
// Logs a bad box by line in the box file and box coords.
/** Logs a bad box by line in the box file and box coords. */
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
const char *box_ch, const char *err_msg) {
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
@ -751,7 +760,7 @@ void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
box.left(), box.bottom(), box.right(), box.top(), err_msg);
}
// Creates a fake best_choice entry in each WERD_RES with the correct text.
/** Creates a fake best_choice entry in each WERD_RES with the correct text. */
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
@ -774,8 +783,8 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
}
}
// Calls LearnWord to extract features for labelled blobs within each word.
// Features are stored in an internal buffer.
/// Calls #LearnWord to extract features for labelled blobs within each word.
/// Features are stored in an internal buffer.
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
int word_count = 0;