applybox.cpp, take one

2025-06-07 18:02:40 +08:00 · 2015-07-20 11:27:21 +01:00 · 2015-07-20 11:27:21 +01:00 · 192a24ab7b
commit 192a24ab7b
parent 2982583daa
1 changed files with 70 additions and 61 deletions
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@@ -36,18 +36,22 @@
 #include "tesseractclass.h"
 #include "genericvector.h"

-// Max number of blobs to classify together in FindSegmentation.
+/** Max number of blobs to classify together in FindSegmentation. */
 const int kMaxGroupSize = 4;
-// Max fraction of median allowed as deviation in xheight before switching
-// to median.
+/// Max fraction of median allowed as deviation in xheight before switching
+/// to median.
 const double kMaxXHeightDeviationFraction = 0.125;

-/*************************************************************************
+/**
 * The box file is assumed to contain box definitions, one per line, of the
 * following format for blob-level boxes:
+ * @verbatim
 *   <UTF8 str> <left> <bottom> <right> <top> <page id>
+ * @endverbatim
 * and for word/line-level boxes:
+ * @verbatim
 *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
+ * @endverbatim
 * NOTES:
 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
 *
@@ -62,13 +66,16 @@ const double kMaxXHeightDeviationFraction = 0.125;
 * units in the word/line are listed after the # at the end of the line and
 * are space delimited, ignoring any original spaces on the line.
 * Eg.
+ * @verbatim
 * word -> #w o r d
 * multi word line -> #m u l t i w o r d l i n e
+ * @endverbatim
 * The recognizable units must be space-delimited in order to allow multiple
 * unicodes to be used for a single recognizable unit, eg Hindi.
+ *
 * In this mode, the classifier must have been pre-trained with the desired
 * character set, or it will not be able to find the character segmentations.
- *************************************************************************/
+ */

 namespace tesseract {

@@ -181,8 +188,8 @@ static double MedianXHeight(BLOCK_LIST *block_list) {
  return xheights.median();
 }

-// Any row xheight that is significantly different from the median is set
-// to the median.
+/// Any row xheight that is significantly different from the median is set
+/// to the median.
 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  double median_xheight = MedianXHeight(block_list);
  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
@@ -205,8 +212,8 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
  }
 }

-// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
-// All fuzzy spaces are removed, and all the words are maximally chopped.
+/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
+/// All fuzzy spaces are removed, and all the words are maximally chopped.
 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
                                     BLOCK_LIST *block_list) {
  PreenXHeights(block_list);
@@ -240,9 +247,9 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
  return page_res;
 }

-// Tests the chopper by exhaustively running chop_one_blob.
-// The word_res will contain filled chopped_word, seam_array, denorm,
-// box_word and best_state for the maximally chopped word.
+/// Tests the chopper by exhaustively running chop_one_blob.
+/// The word_res will contain filled chopped_word, seam_array, denorm,
+/// box_word and best_state for the maximally chopped word.
 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
                                  BLOCK* block, ROW* row,
                                  WERD_RES* word_res) {
@@ -300,17 +307,17 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
 }

-// Helper to compute the dispute resolution metric.
-// Disputed blob resolution. The aim is to give the blob to the most
-// appropriate boxfile box. Most of the time it is obvious, but if
-// two boxfile boxes overlap significantly it is not. If a small boxfile
-// box takes most of the blob, and a large boxfile box does too, then
-// we want the small boxfile box to get it, but if the small box
-// is much smaller than the blob, we don't want it to get it.
-// Details of the disputed blob resolution:
-// Given a box with area A, and a blob with area B, with overlap area C,
-// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
-// miss metric gets the blob.
+/// Helper to compute the dispute resolution metric.
+/// Disputed blob resolution. The aim is to give the blob to the most
+/// appropriate boxfile box. Most of the time it is obvious, but if
+/// two boxfile boxes overlap significantly it is not. If a small boxfile
+/// box takes most of the blob, and a large boxfile box does too, then
+/// we want the small boxfile box to get it, but if the small box
+/// is much smaller than the blob, we don't want it to get it.
+/// Details of the disputed blob resolution:
+/// Given a box with area A, and a blob with area B, with overlap area C,
+/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
+/// miss metric gets the blob.
 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
  int overlap_area = box1.intersection(box2).area();
  double miss_metric = box1.area()- overlap_area;
@@ -320,14 +327,16 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
  return miss_metric;
 }

-// Gather consecutive blobs that match the given box into the best_state
-// and corresponding correct_text.
-// Fights over which box owns which blobs are settled by pre-chopping and
-// applying the blobs to box or next_box with the least non-overlap.
-// Returns false if the box was in error, which can only be caused by
-// failing to find an appropriate blob for a box.
-// This means that occasionally, blobs may be incorrectly segmented if the
-// chopper fails to find a suitable chop point.
+/// Gather consecutive blobs that match the given box into the best_state
+/// and corresponding correct_text.
+///
+/// Fights over which box owns which blobs are settled by pre-chopping and
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an appropriate blob for a box.
+///
+/// This means that occasionally, blobs may be incorrectly segmented if the
+/// chopper fails to find a suitable chop point.
 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
@@ -420,12 +429,12 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
  return false;  // Failure.
 }

-// Consume all source blobs that strongly overlap the given box,
-// putting them into a new word, with the correct_text label.
-// Fights over which box owns which blobs are settled by
-// applying the blobs to box or next_box with the least non-overlap.
-// Returns false if the box was in error, which can only be caused by
-// failing to find an overlapping blob for a box.
+/// Consume all source blobs that strongly overlap the given box,
+/// putting them into a new word, with the correct_text label.
+/// Fights over which box owns which blobs are settled by
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an overlapping blob for a box.
 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
@@ -495,8 +504,8 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
  return new_word != NULL;
 }

-// Resegments the words by running the classifier in an attempt to find the
-// correct segmentation that produces the required string.
+/// Resegments the words by running the classifier in an attempt to find the
+/// correct segmentation that produces the required string.
 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
@@ -521,8 +530,8 @@ void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
  }
 }

-// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
-// Returns false if an invalid UNICHAR_ID is encountered.
+/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
+/// @return false if an invalid UNICHAR_ID is encountered.
 bool Tesseract::ConvertStringToUnichars(const char* utf8,
                                        GenericVector<UNICHAR_ID>* class_ids) {
  for (int step = 0; *utf8 != '\0'; utf8 += step) {
@@ -541,12 +550,12 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
  return true;
 }

-// Resegments the word to achieve the target_text from the classifier.
-// Returns false if the re-segmentation fails.
-// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
-// applies a full search on the classifier results to find the best classified
-// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
-// substitutions ARE used.
+/// Resegments the word to achieve the target_text from the classifier.
+/// Returns false if the re-segmentation fails.
+/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
+/// applies a full search on the classifier results to find the best classified
+/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
+/// substitutions ARE used.
 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
                                 WERD_RES* word_res) {
  // Classify all required combinations of blobs and save results in choices.
@@ -603,12 +612,12 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
  return true;
 }

-// Recursive helper to find a match to the target_text (from text_index
-// position) in the choices (from choices_pos position).
-// Choices is an array of GenericVectors, of length choices_length, with each
-// element representing a starting position in the word, and the
-// GenericVector holding classification results for a sequence of consecutive
-// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+/// Recursive helper to find a match to the target_text (from text_index
+/// position) in the choices (from choices_pos position).
+/// @param choices is an array of GenericVectors, of length choices_length,
+/// with each element representing a starting position in the word, and the
+/// #GenericVector holding classification results for a sequence of consecutive
+/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
                              int choices_pos, int choices_length,
                              const GenericVector<UNICHAR_ID>& target_text,
@@ -682,10 +691,10 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
  }
 }

-// Counts up the labelled words and the blobs within.
-// Deletes all unused or emptied words, counting the unused ones.
-// Resets W_BOL and W_EOL flags correctly.
-// Builds the rebuild_word and rebuilds the box_word and the best_choice.
+/// - Counts up the labelled words and the blobs within.
+/// - Deletes all unused or emptied words, counting the unused ones.
+/// - Resets W_BOL and W_EOL flags correctly.
+/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
 void Tesseract::TidyUp(PAGE_RES* page_res) {
  int ok_blob_count = 0;
  int bad_blob_count = 0;
@@ -743,7 +752,7 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
  }
 }

-// Logs a bad box by line in the box file and box coords.
+/** Logs a bad box by line in the box file and box coords.*/
 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
                                const char *box_ch, const char *err_msg) {
  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
@@ -751,7 +760,7 @@ void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
          box.left(), box.bottom(), box.right(), box.top(), err_msg);
 }

-// Creates a fake best_choice entry in each WERD_RES with the correct text.
+/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
@@ -774,8 +783,8 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
  }
 }

-// Calls LearnWord to extract features for labelled blobs within each word.
-// Features are stored in an internal buffer.
+/// Calls #LearnWord to extract features for labelled blobs within each word.
+/// Features are stored in an internal buffer.
 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  int word_count = 0;