mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-08 02:12:40 +08:00
applybox.cpp, take one
This commit is contained in:
parent
2982583daa
commit
192a24ab7b
@ -36,18 +36,22 @@
|
|||||||
#include "tesseractclass.h"
|
#include "tesseractclass.h"
|
||||||
#include "genericvector.h"
|
#include "genericvector.h"
|
||||||
|
|
||||||
// Max number of blobs to classify together in FindSegmentation.
|
/** Max number of blobs to classify together in FindSegmentation. */
|
||||||
const int kMaxGroupSize = 4;
|
const int kMaxGroupSize = 4;
|
||||||
// Max fraction of median allowed as deviation in xheight before switching
|
/// Max fraction of median allowed as deviation in xheight before switching
|
||||||
// to median.
|
/// to median.
|
||||||
const double kMaxXHeightDeviationFraction = 0.125;
|
const double kMaxXHeightDeviationFraction = 0.125;
|
||||||
|
|
||||||
/*************************************************************************
|
/**
|
||||||
* The box file is assumed to contain box definitions, one per line, of the
|
* The box file is assumed to contain box definitions, one per line, of the
|
||||||
* following format for blob-level boxes:
|
* following format for blob-level boxes:
|
||||||
|
* @verbatim
|
||||||
* <UTF8 str> <left> <bottom> <right> <top> <page id>
|
* <UTF8 str> <left> <bottom> <right> <top> <page id>
|
||||||
|
* @endverbatim
|
||||||
* and for word/line-level boxes:
|
* and for word/line-level boxes:
|
||||||
|
* @verbatim
|
||||||
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
||||||
|
* @endverbatim
|
||||||
* NOTES:
|
* NOTES:
|
||||||
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
|
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
|
||||||
*
|
*
|
||||||
@ -62,13 +66,16 @@ const double kMaxXHeightDeviationFraction = 0.125;
|
|||||||
* units in the word/line are listed after the # at the end of the line and
|
* units in the word/line are listed after the # at the end of the line and
|
||||||
* are space delimited, ignoring any original spaces on the line.
|
* are space delimited, ignoring any original spaces on the line.
|
||||||
* Eg.
|
* Eg.
|
||||||
|
* @verbatim
|
||||||
* word -> #w o r d
|
* word -> #w o r d
|
||||||
* multi word line -> #m u l t i w o r d l i n e
|
* multi word line -> #m u l t i w o r d l i n e
|
||||||
|
* @endverbatim
|
||||||
* The recognizable units must be space-delimited in order to allow multiple
|
* The recognizable units must be space-delimited in order to allow multiple
|
||||||
* unicodes to be used for a single recognizable unit, eg Hindi.
|
* unicodes to be used for a single recognizable unit, eg Hindi.
|
||||||
|
*
|
||||||
* In this mode, the classifier must have been pre-trained with the desired
|
* In this mode, the classifier must have been pre-trained with the desired
|
||||||
* character set, or it will not be able to find the character segmentations.
|
* character set, or it will not be able to find the character segmentations.
|
||||||
*************************************************************************/
|
*/
|
||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
@ -181,8 +188,8 @@ static double MedianXHeight(BLOCK_LIST *block_list) {
|
|||||||
return xheights.median();
|
return xheights.median();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Any row xheight that is significantly different from the median is set
|
/// Any row xheight that is significantly different from the median is set
|
||||||
// to the median.
|
/// to the median.
|
||||||
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
|
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
|
||||||
double median_xheight = MedianXHeight(block_list);
|
double median_xheight = MedianXHeight(block_list);
|
||||||
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
|
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
|
||||||
@ -205,8 +212,8 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
|
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
|
||||||
// All fuzzy spaces are removed, and all the words are maximally chopped.
|
/// All fuzzy spaces are removed, and all the words are maximally chopped.
|
||||||
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
||||||
BLOCK_LIST *block_list) {
|
BLOCK_LIST *block_list) {
|
||||||
PreenXHeights(block_list);
|
PreenXHeights(block_list);
|
||||||
@ -240,9 +247,9 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
|||||||
return page_res;
|
return page_res;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tests the chopper by exhaustively running chop_one_blob.
|
/// Tests the chopper by exhaustively running chop_one_blob.
|
||||||
// The word_res will contain filled chopped_word, seam_array, denorm,
|
/// The word_res will contain filled chopped_word, seam_array, denorm,
|
||||||
// box_word and best_state for the maximally chopped word.
|
/// box_word and best_state for the maximally chopped word.
|
||||||
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||||
BLOCK* block, ROW* row,
|
BLOCK* block, ROW* row,
|
||||||
WERD_RES* word_res) {
|
WERD_RES* word_res) {
|
||||||
@ -300,17 +307,17 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
|||||||
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
|
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper to compute the dispute resolution metric.
|
/// Helper to compute the dispute resolution metric.
|
||||||
// Disputed blob resolution. The aim is to give the blob to the most
|
/// Disputed blob resolution. The aim is to give the blob to the most
|
||||||
// appropriate boxfile box. Most of the time it is obvious, but if
|
/// appropriate boxfile box. Most of the time it is obvious, but if
|
||||||
// two boxfile boxes overlap significantly it is not. If a small boxfile
|
/// two boxfile boxes overlap significantly it is not. If a small boxfile
|
||||||
// box takes most of the blob, and a large boxfile box does too, then
|
/// box takes most of the blob, and a large boxfile box does too, then
|
||||||
// we want the small boxfile box to get it, but if the small box
|
/// we want the small boxfile box to get it, but if the small box
|
||||||
// is much smaller than the blob, we don't want it to get it.
|
/// is much smaller than the blob, we don't want it to get it.
|
||||||
// Details of the disputed blob resolution:
|
/// Details of the disputed blob resolution:
|
||||||
// Given a box with area A, and a blob with area B, with overlap area C,
|
/// Given a box with area A, and a blob with area B, with overlap area C,
|
||||||
// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
|
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
|
||||||
// miss metric gets the blob.
|
/// miss metric gets the blob.
|
||||||
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
|
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
|
||||||
int overlap_area = box1.intersection(box2).area();
|
int overlap_area = box1.intersection(box2).area();
|
||||||
double miss_metric = box1.area()- overlap_area;
|
double miss_metric = box1.area()- overlap_area;
|
||||||
@ -320,14 +327,16 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
|
|||||||
return miss_metric;
|
return miss_metric;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Gather consecutive blobs that match the given box into the best_state
|
/// Gather consecutive blobs that match the given box into the best_state
|
||||||
// and corresponding correct_text.
|
/// and corresponding correct_text.
|
||||||
// Fights over which box owns which blobs are settled by pre-chopping and
|
///
|
||||||
// applying the blobs to box or next_box with the least non-overlap.
|
/// Fights over which box owns which blobs are settled by pre-chopping and
|
||||||
// Returns false if the box was in error, which can only be caused by
|
/// applying the blobs to box or next_box with the least non-overlap.
|
||||||
// failing to find an appropriate blob for a box.
|
/// @return false if the box was in error, which can only be caused by
|
||||||
// This means that occasionally, blobs may be incorrectly segmented if the
|
/// failing to find an appropriate blob for a box.
|
||||||
// chopper fails to find a suitable chop point.
|
///
|
||||||
|
/// This means that occasionally, blobs may be incorrectly segmented if the
|
||||||
|
/// chopper fails to find a suitable chop point.
|
||||||
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
|
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
|
||||||
const TBOX& box, const TBOX& next_box,
|
const TBOX& box, const TBOX& next_box,
|
||||||
const char* correct_text) {
|
const char* correct_text) {
|
||||||
@ -420,12 +429,12 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
|
|||||||
return false; // Failure.
|
return false; // Failure.
|
||||||
}
|
}
|
||||||
|
|
||||||
// Consume all source blobs that strongly overlap the given box,
|
/// Consume all source blobs that strongly overlap the given box,
|
||||||
// putting them into a new word, with the correct_text label.
|
/// putting them into a new word, with the correct_text label.
|
||||||
// Fights over which box owns which blobs are settled by
|
/// Fights over which box owns which blobs are settled by
|
||||||
// applying the blobs to box or next_box with the least non-overlap.
|
/// applying the blobs to box or next_box with the least non-overlap.
|
||||||
// Returns false if the box was in error, which can only be caused by
|
/// @return false if the box was in error, which can only be caused by
|
||||||
// failing to find an overlapping blob for a box.
|
/// failing to find an overlapping blob for a box.
|
||||||
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
|
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
|
||||||
const TBOX& box, const TBOX& next_box,
|
const TBOX& box, const TBOX& next_box,
|
||||||
const char* correct_text) {
|
const char* correct_text) {
|
||||||
@ -495,8 +504,8 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
|
|||||||
return new_word != NULL;
|
return new_word != NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Resegments the words by running the classifier in an attempt to find the
|
/// Resegments the words by running the classifier in an attempt to find the
|
||||||
// correct segmentation that produces the required string.
|
/// correct segmentation that produces the required string.
|
||||||
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
|
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
|
||||||
PAGE_RES_IT pr_it(page_res);
|
PAGE_RES_IT pr_it(page_res);
|
||||||
WERD_RES* word_res;
|
WERD_RES* word_res;
|
||||||
@ -521,8 +530,8 @@ void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
||||||
// Returns false if an invalid UNICHAR_ID is encountered.
|
/// @return false if an invalid UNICHAR_ID is encountered.
|
||||||
bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
||||||
GenericVector<UNICHAR_ID>* class_ids) {
|
GenericVector<UNICHAR_ID>* class_ids) {
|
||||||
for (int step = 0; *utf8 != '\0'; utf8 += step) {
|
for (int step = 0; *utf8 != '\0'; utf8 += step) {
|
||||||
@ -541,12 +550,12 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Resegments the word to achieve the target_text from the classifier.
|
/// Resegments the word to achieve the target_text from the classifier.
|
||||||
// Returns false if the re-segmentation fails.
|
/// @return false if the re-segmentation fails.
///
|
||||||
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
|
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
|
||||||
// applies a full search on the classifier results to find the best classified
|
/// applies a full search on the classifier results to find the best classified
|
||||||
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
||||||
// substitutions ARE used.
|
/// substitutions ARE used.
|
||||||
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||||
WERD_RES* word_res) {
|
WERD_RES* word_res) {
|
||||||
// Classify all required combinations of blobs and save results in choices.
|
// Classify all required combinations of blobs and save results in choices.
|
||||||
@ -603,12 +612,12 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Recursive helper to find a match to the target_text (from text_index
|
/// Recursive helper to find a match to the target_text (from text_index
|
||||||
// position) in the choices (from choices_pos position).
|
/// position) in the choices (from choices_pos position).
|
||||||
// Choices is an array of GenericVectors, of length choices_length, with each
|
/// @param choices is an array of GenericVectors, of length choices_length,
|
||||||
// element representing a starting position in the word, and the
|
/// with each element representing a starting position in the word, and the
|
||||||
// GenericVector holding classification results for a sequence of consecutive
|
/// #GenericVector holding classification results for a sequence of consecutive
|
||||||
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
||||||
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
|
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
|
||||||
int choices_pos, int choices_length,
|
int choices_pos, int choices_length,
|
||||||
const GenericVector<UNICHAR_ID>& target_text,
|
const GenericVector<UNICHAR_ID>& target_text,
|
||||||
@ -682,10 +691,10 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Counts up the labelled words and the blobs within.
|
/// - Counts up the labelled words and the blobs within.
|
||||||
// Deletes all unused or emptied words, counting the unused ones.
|
/// - Deletes all unused or emptied words, counting the unused ones.
|
||||||
// Resets W_BOL and W_EOL flags correctly.
|
/// - Resets W_BOL and W_EOL flags correctly.
|
||||||
// Builds the rebuild_word and rebuilds the box_word and the best_choice.
|
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
|
||||||
void Tesseract::TidyUp(PAGE_RES* page_res) {
|
void Tesseract::TidyUp(PAGE_RES* page_res) {
|
||||||
int ok_blob_count = 0;
|
int ok_blob_count = 0;
|
||||||
int bad_blob_count = 0;
|
int bad_blob_count = 0;
|
||||||
@ -743,7 +752,7 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Logs a bad box by line in the box file and box coords.
|
/** Logs a bad box by line in the box file and box coords. */
|
||||||
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
|
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
|
||||||
const char *box_ch, const char *err_msg) {
|
const char *box_ch, const char *err_msg) {
|
||||||
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
|
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
|
||||||
@ -751,7 +760,7 @@ void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
|
|||||||
box.left(), box.bottom(), box.right(), box.top(), err_msg);
|
box.left(), box.bottom(), box.right(), box.top(), err_msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Creates a fake best_choice entry in each WERD_RES with the correct text.
|
/** Creates a fake best_choice entry in each WERD_RES with the correct text. */
|
||||||
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
||||||
PAGE_RES_IT pr_it(page_res);
|
PAGE_RES_IT pr_it(page_res);
|
||||||
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
|
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
|
||||||
@ -774,8 +783,8 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calls LearnWord to extract features for labelled blobs within each word.
|
/// Calls #LearnWord to extract features for labelled blobs within each word.
|
||||||
// Features are stored in an internal buffer.
|
/// Features are stored in an internal buffer.
|
||||||
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
|
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
|
||||||
PAGE_RES_IT pr_it(page_res);
|
PAGE_RES_IT pr_it(page_res);
|
||||||
int word_count = 0;
|
int word_count = 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user