tesseract/ccmain/applybox.cpp

/**********************************************************************
 * File:        applybox.cpp  (Formerly applybox.c)
 * Description: Re segment rows according to box file data
 * Author:      Phil Cheatle
 * Created:     Wed Nov 24 09:11:23 GMT 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
#include "mfcpch.h"

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

#ifdef _MSC_VER
#pragma warning(disable:4244)  // Conversion warnings
#endif

#ifdef HAVE_LIBLEPT
// Include leptonica library only if autoconf (or makefile etc) tell us to.
#include "allheaders.h"
#endif

#include "applybox.h"
#include <ctype.h>
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "boxread.h"
#include "control.h"
#include "genblob.h"
#include "globals.h"
#include "fixxht.h"
#include "mainblk.h"
#include "matchdefs.h"
#include "secname.h"
#include "tessbox.h"
#include "unichar.h"
#include "unicharset.h"
#include "matchdefs.h"
#include "tesseractclass.h"

#define SECURE_NAMES
#ifndef SECURE_NAMES
#include          "wordstats.h"
#endif

#define EXTERN
EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
EXTERN INT_VAR (applybox_debug, 5, "Debug level");
EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from");
EXTERN STRING_VAR (applybox_test_exclusions, "",
                   "Chars ignored for testing");
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

EXTERN STRING_VAR(exposure_pattern, ".exp",
                  "Exposure value follows this pattern in the image"
                  " filename. The name of the image files are expected"
                  " to be in the form [lang].[fontname].exp[num].tif");

EXTERN BOOL_VAR(learn_chars_and_char_frags_mode, FALSE,
                "Learn both character fragments (as is done in the"
                " special low exposure mode) as well as unfragmented"
                " characters.");

extern IMAGE page_image;

// The unicharset used during box training
static UNICHARSET unicharset_boxes;

/*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
 * for sensible position relative to baseline and xht. Failed boxes are
 * compensated for by duplicating other believable instances of the character.
 *
 * The box file is assumed to contain box definitions, one per line, of the
 * following format:
 *   <Char> <left> <bottom> <right> <top> <page> ... arbitrary trailing fields unused
 *
 * The approach taken is to search the WHOLE page for stuff overlapping each box.
 *  - This is not too inefficient and is SAFE.
 *    - We can detect overlapping blobs as we will be attempting to put a blob
 *      from a LABELLED word into the current word.
 *    - When all the boxes have been processed we can detect any stuff which is
 *      being ignored - it is the unlabelled words left on the page.
 *
 * A box should only overlap one row.
 *
 * A warning is given if the box is on the same row as the previous box, but NOT
 * on the same row as the previous blob.
 *
 * Any OUTLINE which overlaps the box is put into the new word.
 *
 * ascender chars must ascend above xht significantly
 * xht chars must not rise above row xht significantly
 * bl chars must not descend below baseline significantly
 * descender chars must descend below baseline significantly
 *
 * ?? Certain chars are DROPPED - to limit the training data.
 *
 *************************************************************************/
namespace tesseract {
void Tesseract::apply_boxes(const STRING& fname,
                            BLOCK_LIST *block_list    //real blocks
                           ) {
  inT16 boxfile_lineno = 0;
  inT16 boxfile_charno = 0;
  TBOX box;                       //boxfile box
  UNICHAR_ID uch_id;             //correct ch from boxfile
  ROW *row;
  ROW *prev_row = NULL;
  inT16 prev_box_right = MAX_INT16;
  inT16 block_id;
  inT16 row_id;
  inT16 box_count = 0;
  inT16 box_failures = 0;
  inT16 labels_ok;
  inT16 rows_ok;
  inT16 bad_blobs;
  inT16 *tgt_char_counts = NULL; // No. of box samples
  inT16 i;
  inT16 rebalance_count = 0;
  UNICHAR_ID min_uch_id = INVALID_UNICHAR_ID;
  inT16 min_samples;
  inT16 final_labelled_blob_count;
  bool low_exposure = false;

  // Clean the unichar set
  unicharset_boxes.clear();
  // Space character needed to represent NIL classification
  unicharset_boxes.unichar_insert(" ");

  // Figure out whether this image file's exposure is less than 1, in which
  // case when learning we will only pay attention to character fragments.
  const char *ptr = strstr(imagefile.string(), exposure_pattern.string());
  if (ptr != NULL &&
      strtol(ptr += strlen(exposure_pattern.string()), NULL, 10) < 0) {
    low_exposure = true;
  }

  FILE* box_file;
  STRING filename = fname;
  const char *lastdot;           //of name

  lastdot = strrchr (filename.string (), '.');
  if (lastdot != NULL)
    filename[lastdot - filename.string()] = '\0';

  filename += ".box";
  if (!(box_file = fopen (filename.string(), "r"))) {
    CANTOPENFILE.error ("read_next_box", EXIT,
      "Cant open box file %s %d",
      filename.string(), errno);
  }

  tgt_char_counts = new inT16[MAX_NUM_CLASSES];
  for (i = 0; i < MAX_NUM_CLASSES; i++)
    tgt_char_counts[i] = 0;

  clear_any_old_text(block_list);
  while (read_next_box(applybox_page, box_file, &box, &uch_id)) {
    box_count++;
    if (!low_exposure || learn_chars_and_char_frags_mode) {
    tgt_char_counts[uch_id]++;
    }
    row = find_row_of_box (block_list, box, block_id, row_id);
    if (box.left () < prev_box_right) {
      boxfile_lineno++;
      boxfile_charno = 1;
    }
    else
      boxfile_charno++;

    if (row == NULL) {
      box_failures++;
      report_failed_box (boxfile_lineno, boxfile_charno, box,
                         unicharset_boxes.id_to_unichar(uch_id),
        "FAILURE! box overlaps no blobs or blobs in multiple rows");
    }
    else {
      if ((box.left () >= prev_box_right) && (row != prev_row))
        report_failed_box (boxfile_lineno, boxfile_charno, box,
                           unicharset_boxes.id_to_unichar(uch_id),
          "WARNING! false row break");
      box_failures += resegment_box (row, box, uch_id, block_id, row_id,
        boxfile_lineno, boxfile_charno, tgt_char_counts, low_exposure, true);
      prev_row = row;
    }
    prev_box_right = box.right ();
  }
  tidy_up(block_list,
          labels_ok,
          rows_ok,
          bad_blobs,
          tgt_char_counts,
          rebalance_count,
          &min_uch_id,
          min_samples,
          final_labelled_blob_count,
          low_exposure,
          true);
  tprintf ("APPLY_BOXES:\n");
  tprintf ("   Boxes read from boxfile:  %6d\n", box_count);
  tprintf ("   Initially labelled blobs: %6d in %d rows\n",
    labels_ok, rows_ok);
  tprintf ("   Box failures detected:       %6d\n", box_failures);
  tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
  tprintf ("   \"%s\" has fewest samples:%6d\n",
           unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
  tprintf ("                Total unlabelled words:   %6d\n",
    bad_blobs);
  tprintf ("                Final labelled words:     %6d\n",
    final_labelled_blob_count);

  // Clean up.
  delete[] tgt_char_counts;
}

int Tesseract::Boxes2BlockList(int box_cnt, TBOX *boxes,
                               BLOCK_LIST *block_list,
                               bool right2left) {
  inT16 boxfile_lineno = 0;
  inT16 boxfile_charno = 0;
  TBOX box;
  ROW *row;
  ROW *prev_row = NULL;
  inT16 prev_box_right = MAX_INT16;
  inT16 prev_box_left = 0;
  inT16 block_id;
  inT16 row_id;
  inT16 box_failures = 0;
  inT16 labels_ok;
  inT16 rows_ok;
  inT16 bad_blobs;
  inT16 rebalance_count = 0;
  UNICHAR_ID min_uch_id;
  inT16 min_samples;
  inT16 final_labelled_blob_count;

  clear_any_old_text(block_list);
  for (int box_idx = 0; box_idx < box_cnt; box_idx++) {
    box = boxes[box_idx];

    row = find_row_of_box(block_list, box, block_id, row_id);
    // check for a new row
    if ((right2left && box.right () > prev_box_left) ||
        (!right2left && box.left () < prev_box_right)) {
      boxfile_lineno++;
      boxfile_charno = 1;
    }
    else {
      boxfile_charno++;
    }

    if (row == NULL) {
      box_failures++;
    }
    else {
      box_failures += resegment_box(row, box, 0, block_id, row_id,
                                    boxfile_lineno, boxfile_charno,
                                    NULL, false, false);
      prev_row = row;
    }
    prev_box_right = box.right ();
    prev_box_left = box.left ();
  }

  tidy_up(block_list, labels_ok, rows_ok, bad_blobs, NULL,
          rebalance_count, &min_uch_id, min_samples, final_labelled_blob_count,
          false, false);

  return box_failures;
}

}  // namespace tesseract


void clear_any_old_text(                        //remove correct text
                        BLOCK_LIST *block_list  //real blocks
                       ) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  WERD_IT word_it;

  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      word_it.set_to_list (row_it.data ()->word_list ());
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word_it.data ()->set_text ("");
      }
    }
  }
}

UNICHAR_ID register_char(const char *uch) {
  if (!unicharset_boxes.contains_unichar(uch)) {
    unicharset_boxes.unichar_insert(uch);
    if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
      tprintf("Error: Size of unicharset of boxes is "
              "greater than MAX_NUM_CLASSES (%d)\n", MAX_NUM_CLASSES);
      exit(1);
    }
  }
  return unicharset_boxes.unichar_to_id(uch);
}

BOOL8 read_next_box(int page,
                    FILE* box_file,
                    TBOX *box,
                    UNICHAR_ID *uch_id) {
  int x_min;
  int y_min;
  int x_max;
  int y_max;
  char uch[kBoxReadBufSize];

  if (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) {
    *uch_id = register_char(uch);
    *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
    return TRUE;  // read a box ok
  } else {
    return FALSE;  // EOF
  }
}


ROW *find_row_of_box(                         //
                     BLOCK_LIST *block_list,  //real blocks
                     const TBOX &box,                 //from boxfile
                     inT16 &block_id,
                     inT16 &row_id_to_process) {
  BLOCK_IT block_it(block_list);
  BLOCK *block;
  ROW_IT row_it;
  ROW *row;
  ROW *row_to_process = NULL;
  inT16 row_id;
  WERD_IT word_it;
  WERD *word;
  BOOL8 polyg;
  PBLOB_IT blob_it;
  PBLOB *blob;
  OUTLINE_IT outline_it;
  OUTLINE *outline;

  /*
    Find row to process - error if box REALLY overlaps more than one row. (I.e
    it overlaps blobs in the row - not just overlaps the bounding box of the
    whole row.)
  */

  block_id = 0;
  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    block_id++;
    row_id = 0;
    block = block_it.data ();
    if (block->bounding_box ().overlap (box)) {
      row_it.set_to_list (block->row_list ());
      for (row_it.mark_cycle_pt ();
      !row_it.cycled_list (); row_it.forward ()) {
        row_id++;
        row = row_it.data ();
        if (row->bounding_box ().overlap (box)) {
          word_it.set_to_list (row->word_list ());
          for (word_it.mark_cycle_pt ();
          !word_it.cycled_list (); word_it.forward ()) {
            word = word_it.data ();
            polyg = word->flag (W_POLYGON);
            if (word->bounding_box ().overlap (box)) {
              blob_it.set_to_list (word->gblob_list ());
              for (blob_it.mark_cycle_pt ();
              !blob_it.cycled_list (); blob_it.forward ()) {
                blob = blob_it.data ();
                if (gblob_bounding_box (blob, polyg).
                overlap (box)) {
                  outline_it.
                    set_to_list (gblob_out_list
                    (blob, polyg));
                  for (outline_it.mark_cycle_pt ();
                    !outline_it.cycled_list ();
                  outline_it.forward ()) {
                    outline = outline_it.data ();
                    if (goutline_bounding_box
                    (outline, polyg).major_overlap (box)) {
                      if ((row_to_process == NULL) ||
                      (row_to_process == row)) {
                        row_to_process = row;
                        row_id_to_process = row_id;
                      }
                      else
                        /* RETURN ERROR Box overlaps blobs in more than one row  */
                        return NULL;
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  return row_to_process;
}


inT16 resegment_box(  //
                    ROW *row,
                    TBOX &box,
                    UNICHAR_ID uch_id,
                    inT16 block_id,
                    inT16 row_id,
                    inT16 boxfile_lineno,
                    inT16 boxfile_charno,
                    inT16 *tgt_char_counts,
                    bool learn_char_fragments,
                    bool learning) {
  WERD_LIST new_word_list;
  WERD_IT word_it;
  WERD_IT new_word_it(&new_word_list);
  WERD *word = NULL;
  WERD *new_word = NULL;
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  PBLOB_IT new_blob_it;
  PBLOB *blob;
  PBLOB *new_blob;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  OUTLINE *outline;
  TBOX new_word_box;
  TBOX curr_outline_box;
  TBOX prev_outline_box;
  float word_x_centre;
  float baseline;
  inT16 error_count = 0;         //number of chars lost
  STRING label;
  UNICHAR_ID fragment_uch_id;
  int fragment_index;
  int new_word_it_len;

  if (learning && applybox_debug > 6) {
    tprintf("\nAPPLY_BOX: in resegment_box() for %s(%d)\n",
            unicharset_boxes.id_to_unichar(uch_id), uch_id);
  }
  word_it.set_to_list (row->word_list ());
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    polyg = word->flag (W_POLYGON);
    if (word->bounding_box ().overlap (box)) {
      blob_it.set_to_list (word->gblob_list ());
      prev_outline_box = TBOX();  // clear prev_outline_box
      curr_outline_box = TBOX();  // clear curr_outline_box
      for (blob_it.mark_cycle_pt ();
      !blob_it.cycled_list (); blob_it.forward ()) {
        blob = blob_it.data ();
        if (gblob_bounding_box (blob, polyg).overlap (box)) {
          outline_it.set_to_list (gblob_out_list (blob, polyg));
          for (outline_it.mark_cycle_pt ();
          !outline_it.cycled_list (); outline_it.forward ()) {
            outline = outline_it.data ();
            prev_outline_box += curr_outline_box;
            curr_outline_box = goutline_bounding_box(outline, polyg);
            if (curr_outline_box.major_overlap (box)) {
              if (strlen (word->text ()) > 0) {
                if (error_count == 0) {
                  error_count = 1;
                  if (learning && applybox_debug > 4)
                    report_failed_box (boxfile_lineno,
                      boxfile_charno,
                      box, unicharset_boxes.id_to_unichar(uch_id),
                      "FAILURE! box overlaps blob in labelled word");
                }
                if (learning && applybox_debug > 4)
                  tprintf ("APPLY_BOXES: ALSO ignoring corrupted char"
                           " blk:%d row:%d \"%s\"\n",
                           block_id, row_id, word_it.data()->text());
                word_it.data ()->set_text ("");  // UN label it
                error_count++;
              }
              // Do not learn from fragments of characters that are broken
              // into very small pieces to avoid picking up noise.
              if ((learn_char_fragments || learn_chars_and_char_frags_mode) &&
                  ((C_OUTLINE *)outline)->area() < kMinFragmentOutlineArea) {
                if (applybox_debug > 6) {
                  tprintf("APPLY_BOX: fragment outline area %d is too small"
                          " - not recording fragments of this character\n",
                          ((C_OUTLINE *)outline)->area());
                }
                error_count++;
              }

              if (error_count == 0) {
                if (applybox_debug > 6 ) {
                  tprintf("APPLY_BOX: Previous ");
                  prev_outline_box.print();
                  tprintf("APPLY_BOX: Current area: %d ",
                          ((C_OUTLINE *)outline)->area());
                  curr_outline_box.print();
                }
                // When learning character fragments is enabled, we put
                // outlines that do not overlap on x axis in separate WERDs.
                bool start_new_word =
                    (learn_char_fragments || learn_chars_and_char_frags_mode) &&
                    !curr_outline_box.major_x_overlap(prev_outline_box);
                if (new_word == NULL || start_new_word) {
                  if (new_word != NULL) {  // add prev new_word to new_word_list
                    new_word_it.add_to_end(new_word);
                  }
                  // Make a new word with a single blob.
                  new_word = word->shallow_copy();
                  new_word->set_flag(W_FUZZY_NON, false);
                  new_word->set_flag(W_FUZZY_SP, false);
                  if (polyg){
                    new_blob = new PBLOB;
                  } else {
                    new_blob = (PBLOB *) new C_BLOB;
                  }
                  new_blob_it.set_to_list(new_word->gblob_list());
                  new_blob_it.add_to_end(new_blob);
                  new_outline_it.set_to_list(
                      gblob_out_list(new_blob, polyg));
                }
                new_outline_it.add_to_end(outline_it.extract());  // move blob
              }
            }
          }
          if (outline_it.empty())      // no outlines in blob
            delete blob_it.extract();  // so delete blob
        }
      }
      if (blob_it.empty())         // no blobs in word
        delete word_it.extract();  // so delete word
    }
  }
  if (new_word != NULL) {  // add prev new_word to new_word_list
    new_word_it.add_to_end(new_word);
  }
  new_word_it_len = new_word_it.length();

  // Check for failures.
  if (error_count > 0)
    return error_count;
  if (learning && new_word_it_len <= 0) {
    report_failed_box(boxfile_lineno, boxfile_charno, box,
                      unicharset_boxes.id_to_unichar(uch_id),
                      "FAILURE! Couldn't find any blobs");
    return 1;  // failure
  }

  if (learning && new_word_it_len > CHAR_FRAGMENT::kMaxChunks) {
    tprintf("APPLY_BOXES: too many fragments (%d) for char %s\n",
            new_word_it_len, unicharset_boxes.id_to_unichar(uch_id));
    return 1;  // failure
  }

  // Add labelled character or character fragments to the word list.
  fragment_index = 0;
  new_word_it.move_to_first();
  for (new_word_it.mark_cycle_pt(); !new_word_it.cycled_list();
       new_word_it.forward()) {
    new_word = new_word_it.extract();
    if (new_word_it_len > 1) {  // deal with a fragment
      if (learning) {
        label = CHAR_FRAGMENT::to_string(unicharset_boxes.id_to_unichar(uch_id),
                                         fragment_index, new_word_it_len);
        fragment_uch_id = register_char(label.string());
        new_word->set_text(label.string());
        ++fragment_index;
        // For now we cheat by setting the expected number of char fragments
        // to the number of char fragments actually parsed and labelled.
        // TODO(daria): find out whether this can be improved.
        tgt_char_counts[fragment_uch_id]++;
      } else {
        // No learning involved, Just stick a place-holder string
        new_word->set_text("*");
      }
      if (applybox_debug > 5) {
        tprintf("APPLY_BOX: adding char fragment %s\n", label.string());
      }
    } else {  // deal with a regular character
      if (learning) {
        if (!learn_char_fragments || learn_chars_and_char_frags_mode) {
          new_word->set_text(unicharset_boxes.id_to_unichar(uch_id));
        } else {
          // not interested in non-fragmented chars if learning fragments, so
          // unlabel it.
          new_word->set_text("");
        }
      } else {
        // No learning involved here. Just stick a place holder string
        new_word->set_text("*");
      }
    }
    gblob_sort_list(new_word->gblob_list(), polyg);
    word_it.add_to_end(new_word);
    new_word_box = new_word->bounding_box();
    word_x_centre = (new_word_box.left() + new_word_box.right()) / 2.0f;
    baseline = row->base_line(word_x_centre);
  }

  // All done. Now check if the EOL, BOL flags are set correctly.
  word_it.move_to_first();
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    word->set_flag(W_BOL, false);
    word->set_flag(W_EOL, false);
  }
  word->set_flag(W_EOL, true);
  word_it.move_to_first();
  word_it.data()->set_flag(W_BOL, true);
  return 0;  //success

#if 0
    if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
      if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
          (new_word_box.top () <
           baseline + (1 + applybox_error_band) * row->x_height ())) {
        report_failed_box (boxfile_lineno, boxfile_charno, box,
                           unicharset_boxes.id_to_unichar(uch_id),
                           "FAILURE! caps-ht char didn't ascend");
        new_word->set_text ("");
        return 1;
      }
      if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
          (new_word_box.top () <
           baseline + (1 - applybox_error_band) * row->x_height ())) {
        report_failed_box (boxfile_lineno, boxfile_charno, box,
                           unicharset_boxes.id_to_unichar(uch_id),
                           "FAILURE! Odd top char below xht");
        new_word->set_text ("");
        return 1;
      }
      if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
          ((new_word_box.top () >
            baseline + (1 + applybox_error_band) * row->x_height ()) ||
           (new_word_box.top () <
            baseline + (1 - applybox_error_band) * row->x_height ()))) {
        report_failed_box (boxfile_lineno, boxfile_charno, box,
                           unicharset_boxes.id_to_unichar(uch_id),
                           "FAILURE! x-ht char didn't have top near xht");
        new_word->set_text ("");
        return 1;
      }
      if (STRING (chs_non_ambig_bl).contains
          (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
          ((new_word_box.bottom () <
            baseline - applybox_error_band * row->x_height ()) ||
           (new_word_box.bottom () >
            baseline + applybox_error_band * row->x_height ()))) {
        report_failed_box (boxfile_lineno, boxfile_charno, box,
                           unicharset_boxes.id_to_unichar(uch_id),
                           "FAILURE! non ambig BL char didnt have bottom near baseline");
        new_word->set_text ("");
        return 1;
      }
      if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
          (new_word_box.bottom () >
           baseline + applybox_error_band * row->x_height ())) {
        report_failed_box (boxfile_lineno, boxfile_charno, box,
                           unicharset_boxes.id_to_unichar(uch_id),
                           "FAILURE! Odd bottom char above baseline");
        new_word->set_text ("");
        return 1;
      }
      if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
          (new_word_box.bottom () >
           baseline - applybox_error_band * row->x_height ())) {
        report_failed_box (boxfile_lineno, boxfile_charno, box,
                           unicharset_boxes.id_to_unichar(uch_id),
        "FAILURE! Descender doesn't descend");
        new_word->set_text ("");
        return 1;
      }
    }
#endif
}


/*************************************************************************
 * tidy_up()
 *   - report >1 block
 *   - sort the words in each row.
 *   - report any rows with no labelled words.
 *   - report any remaining unlabelled words
 *   - report total labelled words
 *
 *************************************************************************/
void tidy_up(                         //
             BLOCK_LIST *block_list,  //real blocks
             inT16 &ok_char_count,
             inT16 &ok_row_count,
             inT16 &unlabelled_words,
             inT16 *tgt_char_counts,
             inT16 &rebalance_count,
             UNICHAR_ID *min_uch_id,
             inT16 &min_samples,
             inT16 &final_labelled_blob_count,
             bool learn_character_fragments,
             bool learning) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  WERD_IT word_it;
  WERD *word;
  WERD *duplicate_word;
  inT16 block_idx = 0;
  inT16 row_idx;
  inT16 all_row_idx = 0;
  BOOL8 row_ok;
  BOOL8 rebalance_needed = FALSE;
  inT16 *labelled_char_counts = NULL;  // num unique labelled samples
  inT16 i;
  UNICHAR_ID uch_id;
  UNICHAR_ID prev_uch_id = -1;
  BOOL8 at_dupe_of_prev_word;
  ROW *prev_row = NULL;
  inT16 left;
  inT16 prev_left = -1;

  labelled_char_counts = new inT16[MAX_NUM_CLASSES];
  for (i = 0; i < MAX_NUM_CLASSES; i++)
    labelled_char_counts[i] = 0;

  ok_char_count = 0;
  ok_row_count = 0;
  unlabelled_words = 0;
  if (learning && (applybox_debug > 4) && (block_it.length () != 1)) {
    if (block_it.length() > 1) {
      tprintf("APPLY_BOXES: More than one block??\n");
    } else {
      tprintf("APPLY_BOXES: No blocks identified.\n");
    }
  }

  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    block_idx++;
    row_idx = 0;
    row_ok = FALSE;
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row_idx++;
      all_row_idx++;
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      word_it.sort (word_comparator);
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if (strlen (word->text ()) == 0 ||
            unicharset_boxes.unichar_to_id(word->text()) < 0) {
          unlabelled_words++;
          if (learning && applybox_debug > 4 && !learn_character_fragments) {
            tprintf("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
                     block_idx, row_idx, all_row_idx);
          }
        } else {
          if (word->gblob_list ()->length () != 1)
            tprintf ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d"
                     " row:%d allrows:%d\n", block_idx, row_idx, all_row_idx);

          ok_char_count++;
          ++labelled_char_counts[unicharset_boxes.unichar_to_id(word->text())];
          row_ok = TRUE;
        }
      }
      if ((applybox_debug > 6) && (!row_ok)) {
        tprintf("APPLY_BOXES: Row with no labelled words blk:%d row:%d"
                " allrows:%d\n", block_idx, row_idx, all_row_idx);
      }
      else
        ok_row_count++;
    }
  }

  min_samples = 9999;
  for (i = 0; i < unicharset_boxes.size(); i++) {
    if (tgt_char_counts[i] > labelled_char_counts[i]) {
      if (labelled_char_counts[i] <= 1) {
        tprintf("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" -"
                " target is %d:\n",
                labelled_char_counts[i], unicharset_boxes.debug_str(i).string(),
                tgt_char_counts[i]);
      }
      else {
        rebalance_needed = TRUE;
        if (applybox_debug > 0)
          tprintf("APPLY_BOXES: REBALANCE REQD \"%s\" - target of"
                  " %d from %d labelled samples\n",
                  unicharset_boxes.debug_str(i).string(), tgt_char_counts[i],
                  labelled_char_counts[i]);
      }
    }
    if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
      min_samples = labelled_char_counts[i];
      *min_uch_id = i;
    }
  }

  while (applybox_rebalance && rebalance_needed) {
    block_it.set_to_list (block_list);
    for (block_it.mark_cycle_pt ();
    !block_it.cycled_list (); block_it.forward ()) {
      row_it.set_to_list (block_it.data ()->row_list ());
      for (row_it.mark_cycle_pt ();
      !row_it.cycled_list (); row_it.forward ()) {
        row = row_it.data ();
        word_it.set_to_list (row->word_list ());
        for (word_it.mark_cycle_pt ();
        !word_it.cycled_list (); word_it.forward ()) {
          word = word_it.data ();
          left = word->bounding_box ().left ();
          if (*word->text () != '\0')
            uch_id = unicharset_boxes.unichar_to_id(word->text ());
          else
            uch_id = -1;
          at_dupe_of_prev_word = ((row == prev_row) &&
            (left = prev_left) &&
            (uch_id == prev_uch_id));
          if ((uch_id != -1) &&
            (labelled_char_counts[uch_id] > 1) &&
            (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
          (!at_dupe_of_prev_word)) {
            /* Duplicate the word to rebalance the labelled samples */
            if (applybox_debug > 9) {
              tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
              word->bounding_box ().print ();
            }
            duplicate_word = new WERD;
            *duplicate_word = *word;
            word_it.add_after_then_move (duplicate_word);
            rebalance_count++;
            labelled_char_counts[uch_id]++;
          }
          prev_row = row;
          prev_left = left;
          prev_uch_id = uch_id;
        }
      }
    }
    rebalance_needed = FALSE;
    for (i = 0; i < unicharset_boxes.size(); i++) {
      if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
      (labelled_char_counts[i] > 1)) {
        rebalance_needed = TRUE;
        break;
      }
    }
  }

  /* Now final check - count labeled blobs */
  final_labelled_blob_count = 0;
  block_it.set_to_list (block_list);
  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      word_it.sort (word_comparator);
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if ((strlen(word->text ()) > 0) &&
            (word->gblob_list()->length() == 1)) {
          final_labelled_blob_count++;
        } else {
          delete word_it.extract();
        }
      }
      // delete the row if empty
      if (row->word_list()->empty()) {
        delete row_it.extract();
      }
    }
  }

  // Clean up.
  delete[] labelled_char_counts;
}


void report_failed_box(inT16 boxfile_lineno,
                       inT16 boxfile_charno,
                       TBOX box,
                       const char *box_ch,
                       const char *err_msg) {
  if (applybox_debug > 4)
    tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
      boxfile_lineno,
      boxfile_charno,
      box_ch,
      box.left (), box.bottom (), box.right (), box.top (), err_msg);
}


void apply_box_training(const STRING& filename, BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  WERD_IT word_it;
  WERD *word;
  WERD *bln_word;
  WERD copy_outword;             // copy to denorm
  PBLOB_IT blob_it;
  DENORM denorm;
  inT16 count = 0;
  char unichar[UNICHAR_LEN + 1];

  unichar[UNICHAR_LEN] = '\0';
  tprintf ("Generating training data\n");
  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if ((strlen (word->text ()) > 0) &&
        (word->gblob_list ()->length () == 1)) {
          // Here is a word with a single unichar label and a single blob so train on it.
          bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm);
          blob_it.set_to_list (bln_word->blob_list ());
          strncpy(unichar, word->text (), UNICHAR_LEN);
          tess_training_tester (filename,
            blob_it.data (),     //single blob
            &denorm, TRUE,       //correct
            unichar,             //correct character
            strlen(unichar),     //character length
            NULL);
          copy_outword = *(bln_word);
          copy_outword.baseline_denormalise (&denorm);
          blob_it.set_to_list (copy_outword.blob_list ());
          delete bln_word;
          count++;
        }
      }
    }
  }
  tprintf ("Generated training data for %d blobs\n", count);
}

namespace tesseract {
void Tesseract::apply_box_testing(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  inT16 row_count = 0;
  WERD_IT word_it;
  WERD *word;
  WERD *bln_word;
  inT16 word_count = 0;
  PBLOB_IT blob_it;
  DENORM denorm;
  inT16 count = 0;
  char ch[2];
  WERD *outword;                 //bln best choice
  //segmentation
  WERD_CHOICE *best_choice;      //tess output
  WERD_CHOICE *raw_choice;       //top choice permuter
                                 //detailed results
  BLOB_CHOICE_LIST_CLIST blob_choices;
  inT16 char_count = 0;
  inT16 correct_count = 0;
  inT16 err_count = 0;
  inT16 rej_count = 0;
  #ifndef SECURE_NAMES
  WERDSTATS wordstats;           //As from newdiff
  #endif
  char tess_rej_str[3];
  char tess_long_str[3];

  ch[1] = '\0';
  strcpy (tess_rej_str, "|A");
  strcpy (tess_long_str, "|B");

  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      row_count++;
      word_count = 0;
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        word_count++;
        if ((strlen (word->text ()) == 1) &&
          !STRING (applybox_test_exclusions).contains (*word->text ())
        && (word->gblob_list ()->length () == 1)) {
          // Here is a word with a single char label and a single blob so test it.
          bln_word = make_bln_copy(word, row, NULL, row->x_height (), &denorm);
          blob_it.set_to_list (bln_word->blob_list ());
          ch[0] = *word->text ();
          char_count++;
          best_choice = tess_segment_pass1 (bln_word,
            &denorm,
            &Tesseract::tess_default_matcher,
            raw_choice,
            &blob_choices, outword);

          /*
            Test for TESS screw up on word. Recog_word has already ensured that the
            choice list, outword blob lists and best_choice string are the same
            length. A TESS screw up is indicated by a blank filled or 0 length string.
          */
          if ((best_choice->length() == 0) ||
            (strspn(best_choice->unichar_string().string(), " ") ==
             best_choice->unichar_string().length())) {
            rej_count++;
            tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
              row_count, word_count, ch);
            #ifndef SECURE_NAMES
            wordstats.word (tess_rej_str, 2, ch, 1);
            #endif
          }
          else {
            if ((best_choice->length() != outword->blob_list()->length()) ||
                (best_choice->length() != blob_choices.length())) {
              tprintf
                ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
                best_choice->unichar_string().string(),
                best_choice->length(),
                outword->blob_list ()->length(),
                blob_choices.length());
            }
            ASSERT_HOST(best_choice->length() ==
                        outword->blob_list()->length());
            ASSERT_HOST(best_choice->length() == blob_choices.length());
            fix_quotes (best_choice,
                                 //turn to double
              outword, &blob_choices);
            if (strcmp (best_choice->unichar_string().string(), ch) != 0) {
              err_count++;
              tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
                row_count, word_count, ch,
                best_choice->unichar_string().string());
            }
            else
              correct_count++;
            #ifndef SECURE_NAMES
            if (best_choice->unichar_string().length() > 2)
              wordstats.word(tess_long_str, 2, ch, 1);
            else
              wordstats.word(best_choice->unichar_string().string(),
                             best_choice->unichar_string().length(),
                             ch, 1);
            #endif
          }
          delete bln_word;
          delete outword;
          delete best_choice;
          delete raw_choice;
          blob_choices.deep_clear ();
          count++;
        }
      }
    }
  }
  #ifndef SECURE_NAMES
  wordstats.print (1, 100.0);
  wordstats.conf_matrix ();
  tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
    char_count, correct_count, rej_count, err_count);
  #endif
}

}  // namespace tesseract