mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 09:52:40 +08:00
applybox.cpp, take one
This commit is contained in:
parent
2982583daa
commit
192a24ab7b
@ -36,18 +36,22 @@
|
||||
#include "tesseractclass.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
// Max number of blobs to classify together in FindSegmentation.
|
||||
/** Max number of blobs to classify together in FindSegmentation. */
|
||||
const int kMaxGroupSize = 4;
|
||||
// Max fraction of median allowed as deviation in xheight before switching
|
||||
// to median.
|
||||
/// Max fraction of median allowed as deviation in xheight before switching
|
||||
/// to median.
|
||||
const double kMaxXHeightDeviationFraction = 0.125;
|
||||
|
||||
/*************************************************************************
|
||||
/**
|
||||
* The box file is assumed to contain box definitions, one per line, of the
|
||||
* following format for blob-level boxes:
|
||||
* @verbatim
|
||||
* <UTF8 str> <left> <bottom> <right> <top> <page id>
|
||||
* @endverbatim
|
||||
* and for word/line-level boxes:
|
||||
* @verbatim
|
||||
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
||||
* @endverbatim
|
||||
* NOTES:
|
||||
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
|
||||
*
|
||||
@ -62,13 +66,16 @@ const double kMaxXHeightDeviationFraction = 0.125;
|
||||
* units in the word/line are listed after the # at the end of the line and
|
||||
* are space delimited, ignoring any original spaces on the line.
|
||||
* E.g.
|
||||
* @verbatim
|
||||
* word -> #w o r d
|
||||
* multi word line -> #m u l t i w o r d l i n e
|
||||
* @endverbatim
|
||||
* The recognizable units must be space-delimited in order to allow multiple
|
||||
* Unicode code points to be used for a single recognizable unit, e.g. Hindi.
|
||||
*
|
||||
* In this mode, the classifier must have been pre-trained with the desired
|
||||
* character set, or it will not be able to find the character segmentations.
|
||||
*************************************************************************/
|
||||
*/
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -181,8 +188,8 @@ static double MedianXHeight(BLOCK_LIST *block_list) {
|
||||
return xheights.median();
|
||||
}
|
||||
|
||||
// Any row xheight that is significantly different from the median is set
|
||||
// to the median.
|
||||
/// Any row xheight that is significantly different from the median is set
|
||||
/// to the median.
|
||||
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
|
||||
double median_xheight = MedianXHeight(block_list);
|
||||
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
|
||||
@ -205,8 +212,8 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
|
||||
// All fuzzy spaces are removed, and all the words are maximally chopped.
|
||||
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
|
||||
/// All fuzzy spaces are removed, and all the words are maximally chopped.
|
||||
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
||||
BLOCK_LIST *block_list) {
|
||||
PreenXHeights(block_list);
|
||||
@ -240,9 +247,9 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
||||
return page_res;
|
||||
}
|
||||
|
||||
// Tests the chopper by exhaustively running chop_one_blob.
|
||||
// The word_res will contain filled chopped_word, seam_array, denorm,
|
||||
// box_word and best_state for the maximally chopped word.
|
||||
/// Tests the chopper by exhaustively running chop_one_blob.
|
||||
/// The word_res will contain filled chopped_word, seam_array, denorm,
|
||||
/// box_word and best_state for the maximally chopped word.
|
||||
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
BLOCK* block, ROW* row,
|
||||
WERD_RES* word_res) {
|
||||
@ -300,17 +307,17 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
|
||||
}
|
||||
|
||||
// Helper to compute the dispute resolution metric.
|
||||
// Disputed blob resolution. The aim is to give the blob to the most
|
||||
// appropriate boxfile box. Most of the time it is obvious, but if
|
||||
// two boxfile boxes overlap significantly it is not. If a small boxfile
|
||||
// box takes most of the blob, and a large boxfile box does too, then
|
||||
// we want the small boxfile box to get it, but if the small box
|
||||
// is much smaller than the blob, we don't want it to get it.
|
||||
// Details of the disputed blob resolution:
|
||||
// Given a box with area A, and a blob with area B, with overlap area C,
|
||||
// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
|
||||
// miss metric gets the blob.
|
||||
/// Helper to compute the dispute resolution metric.
|
||||
/// Disputed blob resolution. The aim is to give the blob to the most
|
||||
/// appropriate boxfile box. Most of the time it is obvious, but if
|
||||
/// two boxfile boxes overlap significantly it is not. If a small boxfile
|
||||
/// box takes most of the blob, and a large boxfile box does too, then
|
||||
/// we want the small boxfile box to get it, but if the small box
|
||||
/// is much smaller than the blob, we don't want it to get it.
|
||||
/// Details of the disputed blob resolution:
|
||||
/// Given a box with area A, and a blob with area B, with overlap area C,
|
||||
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
|
||||
/// miss metric gets the blob.
|
||||
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
|
||||
int overlap_area = box1.intersection(box2).area();
|
||||
double miss_metric = box1.area()- overlap_area;
|
||||
@ -320,14 +327,16 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
|
||||
return miss_metric;
|
||||
}
|
||||
|
||||
// Gather consecutive blobs that match the given box into the best_state
|
||||
// and corresponding correct_text.
|
||||
// Fights over which box owns which blobs are settled by pre-chopping and
|
||||
// applying the blobs to box or next_box with the least non-overlap.
|
||||
// Returns false if the box was in error, which can only be caused by
|
||||
// failing to find an appropriate blob for a box.
|
||||
// This means that occasionally, blobs may be incorrectly segmented if the
|
||||
// chopper fails to find a suitable chop point.
|
||||
/// Gather consecutive blobs that match the given box into the best_state
|
||||
/// and corresponding correct_text.
|
||||
///
|
||||
/// Fights over which box owns which blobs are settled by pre-chopping and
|
||||
/// applying the blobs to box or next_box with the least non-overlap.
|
||||
/// @return false if the box was in error, which can only be caused by
|
||||
/// failing to find an appropriate blob for a box.
|
||||
///
|
||||
/// This means that occasionally, blobs may be incorrectly segmented if the
|
||||
/// chopper fails to find a suitable chop point.
|
||||
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
|
||||
const TBOX& box, const TBOX& next_box,
|
||||
const char* correct_text) {
|
||||
@ -420,12 +429,12 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
|
||||
return false; // Failure.
|
||||
}
|
||||
|
||||
// Consume all source blobs that strongly overlap the given box,
|
||||
// putting them into a new word, with the correct_text label.
|
||||
// Fights over which box owns which blobs are settled by
|
||||
// applying the blobs to box or next_box with the least non-overlap.
|
||||
// Returns false if the box was in error, which can only be caused by
|
||||
// failing to find an overlapping blob for a box.
|
||||
/// Consume all source blobs that strongly overlap the given box,
|
||||
/// putting them into a new word, with the correct_text label.
|
||||
/// Fights over which box owns which blobs are settled by
|
||||
/// applying the blobs to box or next_box with the least non-overlap.
|
||||
/// @return false if the box was in error, which can only be caused by
|
||||
/// failing to find an overlapping blob for a box.
|
||||
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
|
||||
const TBOX& box, const TBOX& next_box,
|
||||
const char* correct_text) {
|
||||
@ -495,8 +504,8 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
|
||||
return new_word != NULL;
|
||||
}
|
||||
|
||||
// Resegments the words by running the classifier in an attempt to find the
|
||||
// correct segmentation that produces the required string.
|
||||
/// Resegments the words by running the classifier in an attempt to find the
|
||||
/// correct segmentation that produces the required string.
|
||||
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
WERD_RES* word_res;
|
||||
@ -521,8 +530,8 @@ void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
|
||||
}
|
||||
}
|
||||
|
||||
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
||||
// Returns false if an invalid UNICHAR_ID is encountered.
|
||||
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
||||
/// @return false if an invalid UNICHAR_ID is encountered.
|
||||
bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
||||
GenericVector<UNICHAR_ID>* class_ids) {
|
||||
for (int step = 0; *utf8 != '\0'; utf8 += step) {
|
||||
@ -541,12 +550,12 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Resegments the word to achieve the target_text from the classifier.
|
||||
// Returns false if the re-segmentation fails.
|
||||
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
|
||||
// applies a full search on the classifier results to find the best classified
|
||||
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
||||
// substitutions ARE used.
|
||||
/// Resegments the word to achieve the target_text from the classifier.
|
||||
/// Returns false if the re-segmentation fails.
|
||||
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
|
||||
/// applies a full search on the classifier results to find the best classified
|
||||
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
||||
/// substitutions ARE used.
|
||||
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
WERD_RES* word_res) {
|
||||
// Classify all required combinations of blobs and save results in choices.
|
||||
@ -603,12 +612,12 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Recursive helper to find a match to the target_text (from text_index
|
||||
// position) in the choices (from choices_pos position).
|
||||
// Choices is an array of GenericVectors, of length choices_length, with each
|
||||
// element representing a starting position in the word, and the
|
||||
// GenericVector holding classification results for a sequence of consecutive
|
||||
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
||||
/// Recursive helper to find a match to the target_text (from text_index
|
||||
/// position) in the choices (from choices_pos position).
|
||||
/// @param choices is an array of GenericVectors, of length choices_length,
|
||||
/// with each element representing a starting position in the word, and the
|
||||
/// #GenericVector holding classification results for a sequence of consecutive
|
||||
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
||||
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
|
||||
int choices_pos, int choices_length,
|
||||
const GenericVector<UNICHAR_ID>& target_text,
|
||||
@ -682,10 +691,10 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
|
||||
}
|
||||
}
|
||||
|
||||
// Counts up the labelled words and the blobs within.
|
||||
// Deletes all unused or emptied words, counting the unused ones.
|
||||
// Resets W_BOL and W_EOL flags correctly.
|
||||
// Builds the rebuild_word and rebuilds the box_word and the best_choice.
|
||||
/// - Counts up the labelled words and the blobs within.
|
||||
/// - Deletes all unused or emptied words, counting the unused ones.
|
||||
/// - Resets W_BOL and W_EOL flags correctly.
|
||||
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
|
||||
void Tesseract::TidyUp(PAGE_RES* page_res) {
|
||||
int ok_blob_count = 0;
|
||||
int bad_blob_count = 0;
|
||||
@ -743,7 +752,7 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
|
||||
}
|
||||
}
|
||||
|
||||
// Logs a bad box by line in the box file and box coords.
|
||||
/** Logs a bad box by line in the box file and box coords. */
|
||||
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
|
||||
const char *box_ch, const char *err_msg) {
|
||||
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
|
||||
@ -751,7 +760,7 @@ void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
|
||||
box.left(), box.bottom(), box.right(), box.top(), err_msg);
|
||||
}
|
||||
|
||||
// Creates a fake best_choice entry in each WERD_RES with the correct text.
|
||||
/** Creates a fake best_choice entry in each WERD_RES with the correct text. */
|
||||
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
|
||||
@ -774,8 +783,8 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
||||
}
|
||||
}
|
||||
|
||||
// Calls LearnWord to extract features for labelled blobs within each word.
|
||||
// Features are stored in an internal buffer.
|
||||
/// Calls #LearnWord to extract features for labelled blobs within each word.
|
||||
/// Features are stored in an internal buffer.
|
||||
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
int word_count = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user