tesseract/textord/tordmain.cpp

/**********************************************************************
 * File:        tordmain.cpp  (Formerly textordp.c)
 * Description: C++ top level textord code.
 * Author:                  Ray Smith
 * Created:                 Tue Jul 28 17:12:33 BST 1992
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
#include "mfcpch.h"
#ifdef __UNIX__
#include <assert.h>
#endif
#include "stderr.h"
#include "globaloc.h"
#include "tessout.h"
#include "blread.h"
#include "blobbox.h"
#include "edgblob.h"
#include "drawtord.h"
#include "makerow.h"
#include "wordseg.h"
#include "ocrclass.h"
#include "genblob.h"
#include "imgs.h"
#include "tordmain.h"
#include "secname.h"
#include "tesseractclass.h"

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

// Some of the code in this file is dependent upon leptonica. If you don't
// have it, you don't get this functionality.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#ifdef HAVE_LIBLEPT
#include "allheaders.h"
#endif

// Error code carrying the "blobs assigned to no block" warning text.
const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";

// EXTERN is redefined as empty here, so it contributes nothing to the
// parameter declarations that follow.
#undef EXTERN
#define EXTERN

// ---- Tunable textord parameters (name, default value, help text). ----

// When set, clean_noise_from_words returns without rejecting anything.
EXTERN BOOL_VAR (textord_no_rejects, FALSE, "Don't remove noise blobs");
// Enables plot_graded_blobs in filter_blobs, and is part of the testing_on
// condition in clean_noise_from_row.
EXTERN BOOL_VAR (textord_show_blobs, FALSE, "Display unsorted blobs");
// NOTE(review): help text duplicates textord_show_blobs. In filter_blobs this
// flag enables the plot_box_list calls on each of the block's blob lists.
EXTERN BOOL_VAR (textord_show_boxes, FALSE, "Display unsorted blobs");
// When set, filter_noise_blobs delegates to filter_noise_blobs2.
EXTERN BOOL_VAR (textord_new_initial_xheight, TRUE,
"Use test xheight mechanism");
// When set (and not in interactive mode), textord_page calls exit(0).
EXTERN BOOL_VAR (textord_exit_after, FALSE, "Exit after completing textord");
// Blobs with a bounding-box height below this go to the noise list.
EXTERN INT_VAR (textord_max_noise_size, 7, "Pixel size of noise");
// Percentiles (0-100) used by filter_noise_blobs; divided by 100 before use.
EXTERN double_VAR (textord_blob_size_bigile, 95,
"Percentile for large blobs");
EXTERN double_VAR (textord_noise_area_ratio, 0.7,
"Fraction of bounding box for noise");
EXTERN double_VAR (textord_blob_size_smallile, 20,
"Percentile for small blobs");
EXTERN double_VAR (textord_initialx_ile, 0.75,
"Ile of sizes for xheight guess");
// NOTE(review): help text duplicates textord_initialx_ile. In
// filter_noise_blobs2 this is the percentile used for max_height.
EXTERN double_VAR (textord_initialasc_ile, 0.90,
"Ile of sizes for xheight guess");
// Divisor applied to a blob's size to get the count_transitions threshold.
EXTERN INT_VAR (textord_noise_sizefraction, 10,
"Fraction of size for maxima");
EXTERN double_VAR (textord_noise_sizelimit, 0.5,
"Fraction of x for big t count");
EXTERN INT_VAR (textord_noise_translimit, 16, "Transitions for normal blob");
// Dot/normal ratio used by clean_noise_from_words.
EXTERN double_VAR (textord_noise_normratio, 2.0,
"Dot to norm ratio for deletion");
EXTERN BOOL_VAR (textord_noise_rejwords, TRUE, "Reject noise-like words");
EXTERN BOOL_VAR (textord_noise_rejrows, TRUE, "Reject noise-like rows");
EXTERN double_VAR (textord_noise_syfract, 0.2,
"xh fract error for norm blobs");
EXTERN double_VAR (textord_noise_sxfract, 0.4,
"xh fract width error for norm blobs");
EXTERN double_VAR(textord_noise_hfract, 1.0/64,
"Height fraction to discard outlines as speckle noise");
EXTERN INT_VAR (textord_noise_sncount, 1, "super norm blobs to save row");
// NOTE(review): help text duplicates textord_noise_normratio. This is the
// dot/normal ratio used in clean_noise_from_row's return value.
EXTERN double_VAR (textord_noise_rowratio, 6.0,
"Dot to norm ratio for deletion");

EXTERN BOOL_VAR (textord_noise_debug, FALSE, "Debug row garbage detector");
// cleanup_blocks calls tweak_row_baseline only when this is >= 0.
EXTERN double_VAR (textord_blshift_maxshift, 0.00, "Max baseline shift");
EXTERN double_VAR (textord_blshift_xfraction, 9.99,
"Min size of baseline shift");
// NOTE(review): "Externsion" in the help text is a typo for "Extension".
EXTERN STRING_EVAR (tessedit_image_ext, ".tif", "Externsion for image file");

#ifndef EMBEDDED
// clock() value recorded by find_components; not present in EMBEDDED builds.
EXTERN clock_t previous_cpu;
#endif

// Declared here, defined elsewhere.
extern BOOL_VAR_H (polygon_tess_approximation, TRUE,
"Do tess poly instead of grey scale");

// Upper end of the STATS range used for blob-height histograms in
// filter_noise_blobs and filter_noise_blobs2.
#define MAX_NEAREST_DIST  600    //for block skew stats
// The original line lacked the space before 100, which defined an empty
// macro called MAX_BLOB_TRANSITIONS100 instead of this constant.
#define MAX_BLOB_TRANSITIONS 100  //for noise stats

// The page image read by find_components and SetBlobStrokeWidth.
// find_components inverts it in place when it is not white-high.
extern IMAGE page_image;         //must be defined somewhere
extern BOOL_VAR_H (interactive_mode, TRUE, "Run interactively?");
// Progress monitor; find_components and textord_page test it against NULL
// before writing ocr_alive/progress.
extern /*"C" */ ETEXT_DESC *global_monitor;     //progress monitor

/**********************************************************************
 * find_components
 *
 * Run edge extraction on every text block (or block with no poly_block),
 * convert the results into TO_BLOCKs via assign_blobs_to_blocks2, and
 * grade the blobs by size with filter_blobs.
 *
 * blocks       page blocks to process.
 * land_blocks  passed on to assign_blobs_to_blocks2 and filter_blobs.
 * port_blocks  passed on to assign_blobs_to_blocks2 and filter_blobs.
 * page_box     in/out: extended by the bounding box of each processed block;
 *              its top-right corner is handed to filter_blobs.
 *
 * If either page_image dimension exceeds MAX_INT16 a message is printed
 * and the function returns without touching any of its arguments.
 * page_image is inverted in place when it is not white-high.
 **********************************************************************/

void find_components(
                       BLOCK_LIST *blocks,
                       TO_BLOCK_LIST *land_blocks,
                       TO_BLOCK_LIST *port_blocks,
                       TBOX *page_box) {
  int width = page_image.get_xsize();
  int height = page_image.get_ysize();
  if (width > MAX_INT16 || height > MAX_INT16) {
    tprintf("Input image too large! (%d, %d)\n", width, height);
    return;  // Can't handle it.
  }

  ICOORD page_tr(width, height);
  if (global_monitor != NULL)
    global_monitor->ocr_alive = TRUE;

  set_global_loc_code(LOC_EDGE_PROG);
  // Make sure the image is white-high before extracting edges.
  if (!page_image.white_high ())
    invert_image(&page_image);

#ifndef EMBEDDED
  previous_cpu = clock ();
#endif

  // Extract edges for each block that is text or has no polygonal outline.
  BLOCK_IT block_it(blocks);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    BLOCK *block = block_it.data();
    if (block->poly_block() == NULL ||
        block->poly_block()->IsText()) {
#ifndef GRAPHICS_DISABLED
      extract_edges(NULL, &page_image, &page_image, page_tr, block);
#else
      extract_edges(&page_image, &page_image, page_tr, block);
#endif
      *page_box += block->bounding_box ();
    }
  }
  if (global_monitor != NULL) {
    global_monitor->ocr_alive = TRUE;
    global_monitor->progress = 10;
  }

  assign_blobs_to_blocks2(blocks, land_blocks, port_blocks);
  if (global_monitor != NULL)
    global_monitor->ocr_alive = TRUE;
  // The last argument only enables plotting inside filter_blobs.
  filter_blobs (page_box->topright (), land_blocks, textord_test_landscape);
#ifndef EMBEDDED
  previous_cpu = clock ();
#endif
  filter_blobs (page_box->topright (), port_blocks, !textord_test_landscape);
  if (global_monitor != NULL)
    global_monitor->ocr_alive = TRUE;
}

/**********************************************************************
 * SetBlobStrokeWidth
 *
 * Store horizontal and vertical stroke-width estimates in the blob.
 *
 * With leptonica (HAVE_LIBLEPT): the blob's bounding rectangle is cut out
 * of page_image, pixDistanceFunction is run on it, and local maxima of the
 * distance values along each row (column) are histogrammed as horizontal
 * (vertical) stroke widths. A histogram's median is stored only when it
 * holds enough samples; otherwise zero is stored for that direction.
 * Without leptonica: both widths are set to 2*area/perimeter of the cblob.
 *
 * debug  when true, writes cutpix.png/distpix.png and prints the stats.
 * blob   blob to measure; both of its stroke widths are overwritten.
 **********************************************************************/
void SetBlobStrokeWidth(bool debug, BLOBNBOX* blob) {
#ifdef HAVE_LIBLEPT
  // Copy the blob's rectangle out of page_image and convert it to a Pix.
  // TODO(rays) make the page_image a Pix so this is more direct.
  const TBOX& box = blob->bounding_box();
  IMAGE blob_im;
  const int width = box.width();
  const int height = box.height();
  // Sample count a histogram needs before its median is trusted.
  const int min_samples = (width + height) / 4;
  blob_im.create(width, height, 1);
  copy_sub_image(&page_image, box.left(), box.bottom(), width, height,
                 &blob_im, 0, 0, false);
  Pix* cut_pix = blob_im.ToPix();
  Pix* dist_pix = pixDistanceFunction(cut_pix, 4, 8, L_BOUNDARY_BG);
  if (debug) {
    pixWrite("cutpix.png", cut_pix, IFF_PNG);
    pixWrite("distpix.png", dist_pix, IFF_PNG);
  }
  pixDestroy(&cut_pix);

  uinT32* data = pixGetData(dist_pix);
  int wpl = pixGetWpl(dist_pix);

  // Horizontal stroke widths: walk each row left to right. The pixel under
  // test ("centre") is the one at column x - 1.
  STATS h_stats(0, width + 1);
  for (int y = 0; y < height; ++y) {
    uinT32* row = data + y*wpl;
    const bool top_row = (y == 0);
    const bool bottom_row = (y == height - 1);
    int left = 0;
    int centre = GET_DATA_BYTE(row, 0);
    for (int x = 1; x < width; ++x) {
      int right = GET_DATA_BYTE(row, x);
      // Candidate: greater than its left neighbour and equal to the pixels
      // directly above and below it (where those rows exist).
      if (left < centre &&
          (top_row || centre == GET_DATA_BYTE(row - wpl, x - 1)) &&
          (bottom_row || centre == GET_DATA_BYTE(row + wpl, x - 1))) {
        if (centre > right) {
          // Lone maximum: odd stroke width.
          h_stats.add(centre * 2 - 1, 1);
        } else if (centre == right && x + 1 < width &&
                   centre > GET_DATA_BYTE(row, x + 1)) {
          // Two equal maxima side by side: even stroke width.
          h_stats.add(centre * 2, 1);
        }
      }
      left = centre;
      centre = right;
    }
  }
  if (debug) {
    h_stats.print(stderr, true);
  }

  // Vertical stroke widths: walk each column top to bottom. The pixel under
  // test ("centre") is the one in row y - 1.
  STATS v_stats(0, height + 1);
  for (int x = 0; x < width; ++x) {
    const bool first_col = (x == 0);
    const bool last_col = (x == width - 1);
    int above = 0;
    int centre = GET_DATA_BYTE(data, x);
    for (int y = 1; y < height; ++y) {
      uinT32* row = data + y*wpl;
      int below = GET_DATA_BYTE(row, x);
      // Candidate: greater than the pixel above it and equal to its left
      // and right neighbours in row y - 1 (where those columns exist).
      if (above < centre &&
          (first_col || centre == GET_DATA_BYTE(row - wpl, x - 1)) &&
          (last_col || centre == GET_DATA_BYTE(row - wpl, x + 1))) {
        if (centre > below) {
          // Lone maximum: odd stroke width.
          v_stats.add(centre * 2 - 1, 1);
        } else if (centre == below && y + 1 < height &&
                   centre > GET_DATA_BYTE(row + wpl, x)) {
          // Two equal maxima stacked: even stroke width.
          v_stats.add(centre * 2, 1);
        }
      }
      above = centre;
      centre = below;
    }
  }
  if (debug) {
    v_stats.print(stderr, true);
  }
  pixDestroy(&dist_pix);

  // Keep both widths when both histograms have enough samples, otherwise
  // only the better-supported one. With too few samples zero is stored
  // rather than 2*area/perimeter, because those numbers do not match the
  // ones produced by the distance method.
  const inT32 h_total = h_stats.get_total();
  const inT32 v_total = v_stats.get_total();
  if (debug) {
    tprintf("box=%d,%d->%d,%d, hcount=%d, vcount=%d, target=%d\n",
            box.left(), box.bottom(), box.right(), box.top(),
            h_total, v_total, min_samples);
    tprintf("hstats median=%f, lq=%f, uq=%f, sd=%f\n",
            h_stats.median(), h_stats.ile(0.25f), h_stats.ile(0.75f),
            h_stats.sd());
    tprintf("vstats median=%f, lq=%f, uq=%f, sd=%f\n",
            v_stats.median(), v_stats.ile(0.25f), v_stats.ile(0.75f),
            v_stats.sd());
  }
  const bool enough_h = h_total >= min_samples;
  const bool enough_v = v_total >= min_samples;
  if (enough_h) {
    blob->set_horz_stroke_width(h_stats.ile(0.5f));
    if (enough_v)
      blob->set_vert_stroke_width(v_stats.ile(0.5f));
    else
      blob->set_vert_stroke_width(0.0f);
  } else if (enough_v || v_total > h_total) {
    blob->set_horz_stroke_width(0.0f);
    blob->set_vert_stroke_width(v_stats.ile(0.5f));
  } else {
    blob->set_horz_stroke_width(h_total > 2 ? h_stats.ile(0.5f)
                                            : 0.0f);
    blob->set_vert_stroke_width(0.0f);
  }
#else
  // No leptonica: approximate both widths by 2*area/perimeter.
  float approx_width = 2.0f * blob->cblob()->area();
  approx_width /= blob->cblob()->perimeter();
  blob->set_horz_stroke_width(approx_width);
  blob->set_vert_stroke_width(approx_width);
#endif
}


/**********************************************************************
 * assign_blobs_to_blocks2
 *
 * Make a list of TO_BLOCKs for portrait and landscape orientation.
 **********************************************************************/

void assign_blobs_to_blocks2(                             // split into groups
                             BLOCK_LIST *blocks,          // blocks to process
                             TO_BLOCK_LIST *land_blocks,  // ** unused **
                             TO_BLOCK_LIST *port_blocks   // output list
                            ) {
  BLOCK *block;                  // current block
  BLOBNBOX *newblob;             // created blob
  C_BLOB *blob;                  // current blob
  BLOCK_IT block_it = blocks;
  C_BLOB_IT blob_it;             // iterator
  BLOBNBOX_IT port_box_it;       // iterator
                                 // destination iterator
  TO_BLOCK_IT port_block_it = port_blocks;
  TO_BLOCK *port_block;          // created block

  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    block = block_it.data ();
    port_block = new TO_BLOCK(block);

    // Convert the good outlines to block->blob_list
    port_box_it.set_to_list(&port_block->blobs);
    blob_it.set_to_list(block->blob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      blob = blob_it.extract();
      newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
      SetBlobStrokeWidth(false, newblob);
      port_box_it.add_after_then_move(newblob);
    }

    // Put the rejected outlines in block->noise_blobs, which allows them to
    // be reconsidered and sorted back into rows and recover outlines mistakenly
    // rejected.
    port_box_it.set_to_list(&port_block->noise_blobs);
    blob_it.set_to_list(block->reject_blobs());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      blob = blob_it.extract();
      newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
      SetBlobStrokeWidth(false, newblob);
      port_box_it.add_after_then_move(newblob);
    }

    port_block_it.add_after_then_move(port_block);
  }
}


/**********************************************************************
 * filter_blobs
 *
 * For each TO_BLOCK, split its blobs into noise/small/large lists with
 * filter_noise_blobs and derive line_size, line_spacing and max_blob_size
 * from the returned size. Optionally plots the result.
 *
 * page_tr     top right of the page; only used to create the debug window.
 * blocks      TO_BLOCKs to filter in place.
 * testing_on  plotting happens only when this and the relevant
 *             textord_show_* flag are both set.
 **********************************************************************/

void filter_blobs(ICOORD page_tr,
                  TO_BLOCK_LIST *blocks,
                  BOOL8 testing_on) {
  if (to_win != NULL)
    to_win->Clear();

  TO_BLOCK_IT block_it = blocks;
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    TO_BLOCK *to_block = block_it.data();

    // The filter returns the size that seeds the line metrics.
    to_block->line_size = filter_noise_blobs(&to_block->blobs,
                                             &to_block->noise_blobs,
                                             &to_block->small_blobs,
                                             &to_block->large_blobs);
    // Spacing is computed from the raw size, before the min_linesize scaling.
    to_block->line_spacing =
      to_block->line_size * (textord_merge_desc + textord_merge_x +
      textord_merge_asc +
      textord_merge_asc) / textord_merge_x;
    to_block->line_size *= textord_min_linesize;
    to_block->max_blob_size = to_block->line_size * textord_excess_blobsize;

#ifndef GRAPHICS_DISABLED
    if (textord_show_blobs && testing_on) {
      if (to_win == NULL)
        create_to_win(page_tr);
      to_block->plot_graded_blobs(to_win);
    }
    if (textord_show_boxes && testing_on) {
      if (to_win == NULL)
        create_to_win(page_tr);
      // Same colour for every list, drawn in this order.
      BLOBNBOX_LIST *box_lists[] = {
        &to_block->noise_blobs, &to_block->small_blobs,
        &to_block->large_blobs, &to_block->blobs
      };
      for (int i = 0; i < 4; ++i)
        plot_box_list(to_win, box_lists[i], ScrollView::WHITE);
    }
#endif
  }
}


/**********************************************************************
 * filter_noise_blobs
 *
 * Move noise, small and large blobs from src_list to separate lists.
 *
 * When textord_new_initial_xheight is set, the work is delegated entirely
 * to filter_noise_blobs2. Otherwise:
 *  - blobs shorter than textord_max_noise_size go to noise_list;
 *  - a height histogram of the remainder gives min_y/max_y (the
 *    textord_blob_size_smallile/bigile percentiles) and max_x (median
 *    height times textord_width_limit);
 *  - blobs shorter than min_y go to small_list, blobs taller than max_y
 *    or wider than max_x go to large_list.
 *
 * Returns the textord_initialx_ile percentile of the height histogram
 * (or filter_noise_blobs2's result when delegating).
 **********************************************************************/

float filter_noise_blobs(                            //separate noise
                         BLOBNBOX_LIST *src_list,    //original list
                         BLOBNBOX_LIST *noise_list,  //noise list
                         BLOBNBOX_LIST *small_list,  //small blobs
                         BLOBNBOX_LIST *large_list   //large blobs
                        ) {
  // Delegate first, so that no iterators or histogram are built for nothing.
  if (textord_new_initial_xheight)
    return filter_noise_blobs2 (src_list, noise_list, small_list, large_list);

  BLOBNBOX_IT src_it = src_list; //iterators
  BLOBNBOX_IT noise_it = noise_list;
  BLOBNBOX_IT small_it = small_list;
  BLOBNBOX_IT large_it = large_list;
  STATS size_stats (0, MAX_NEAREST_DIST);
  //blob heights
  float min_y;                   //size limits
  float max_y;
  float max_x;

  // Strip out anything too short to be text.
  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    if (src_it.data ()->bounding_box ().height () < textord_max_noise_size)
      noise_it.add_after_then_move (src_it.extract ());
  }
  // Histogram the heights of what is left.
  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    size_stats.add (src_it.data ()->bounding_box ().height (), 1);
  }
  min_y = floor (size_stats.ile (textord_blob_size_smallile / 100.0));
  max_y = ceil (size_stats.ile (textord_blob_size_bigile / 100.0));
  max_x = ceil (size_stats.ile (0.5) * textord_width_limit);
  // Grade the remaining blobs against the limits.
  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    inT16 height = src_it.data ()->bounding_box ().height ();
    inT16 width = src_it.data ()->bounding_box ().width ();
    if (height < min_y)
      small_it.add_after_then_move (src_it.extract ());
    else if (height > max_y || width > max_x)
      large_it.add_after_then_move (src_it.extract ());
  }
  return size_stats.ile (textord_initialx_ile);
}


/**********************************************************************
 * filter_noise_blobs2
 *
 * Grade the blobs of src_list onto the noise, small and large lists and
 * return a first guess at the x-height.
 *
 * The guess starts as the textord_initialx_ile percentile of the heights
 * left after the first pass, and is raised to the scaled
 * textord_initialasc_ile percentile of the finally accepted heights when
 * that is larger.
 **********************************************************************/

float filter_noise_blobs2(                            //separate noise
                          BLOBNBOX_LIST *src_list,    //original list
                          BLOBNBOX_LIST *noise_list,  //noise list
                          BLOBNBOX_LIST *small_list,  //small blobs
                          BLOBNBOX_LIST *large_list   //large blobs
                         ) {
  // All iterators are bound before any list is edited.
  BLOBNBOX_IT src_it = src_list;
  BLOBNBOX_IT noise_it = noise_list;
  BLOBNBOX_IT small_it = small_list;
  BLOBNBOX_IT large_it = large_list;
  STATS height_stats (0, MAX_NEAREST_DIST);

  // Pass 1: very short blobs are noise; blobs whose enclosed area fills at
  // least textord_noise_area_ratio of their bounding box are parked on the
  // small list for now.
  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    BLOBNBOX *candidate = src_it.data ();
    if (candidate->bounding_box ().height () < textord_max_noise_size) {
      noise_it.add_after_then_move (src_it.extract ());
    } else if (candidate->enclosed_area () >=
               candidate->bounding_box ().height ()
               * candidate->bounding_box ().width ()
               * textord_noise_area_ratio) {
      small_it.add_after_then_move (src_it.extract ());
    }
  }

  // Pass 2: histogram the surviving heights and derive the size limits.
  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    height_stats.add (src_it.data ()->bounding_box ().height (), 1);
  }
  float initial_x = height_stats.ile (textord_initialx_ile);
  float max_y =
    ceil (initial_x *
    (textord_merge_desc + textord_merge_x +
    2 * textord_merge_asc) / textord_merge_x);
  float min_y = floor (initial_x / 2);
  float max_x = ceil (initial_x * textord_width_limit);

  // Pass 3: revisit the parked blobs. Tall ones become large, ones of at
  // least min_y go back to the source list, the rest stay small.
  small_it.move_to_first ();
  for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
       small_it.forward ()) {
    inT16 parked_height = small_it.data ()->bounding_box ().height ();
    if (parked_height > max_y) {
      large_it.add_after_then_move (small_it.extract ());
    } else if (parked_height >= min_y) {
      src_it.add_after_then_move (small_it.extract ());
    }
  }

  // Pass 4: grade the source list and re-histogram the accepted heights.
  height_stats.clear ();
  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    inT16 blob_height = src_it.data ()->bounding_box ().height ();
    inT16 blob_width = src_it.data ()->bounding_box ().width ();
    if (blob_height < min_y) {
      small_it.add_after_then_move (src_it.extract ());
    } else if (blob_height > max_y || blob_width > max_x) {
      large_it.add_after_then_move (src_it.extract ());
    } else {
      height_stats.add (blob_height, 1);
    }
  }

  // Scale the upper percentile of the accepted heights by
  // merge_x / (merge_x + merge_asc) and return whichever guess is larger.
  float scaled_height = height_stats.ile (textord_initialasc_ile);
  scaled_height *= textord_merge_x / (textord_merge_x + textord_merge_asc);
  return scaled_height > initial_x ? scaled_height : initial_x;
}


/**********************************************************************
 * textord_page
 *
 * Run the row-finding and word-making stages on the page, then remove
 * empty/noisy rows and blocks with cleanup_blocks. Progress is reported
 * through global_monitor when one is installed. If textord_exit_after is
 * set and the program is not interactive, the process exits afterwards.
 **********************************************************************/

void textord_page(ICOORD page_tr,
                  BLOCK_LIST *blocks,
                  TO_BLOCK_LIST *land_blocks,
                  TO_BLOCK_LIST *port_blocks,
                  tesseract::Tesseract* tess) {
  // Stage 1: rows. make_rows returns the gradient that make_words needs.
  set_global_loc_code(LOC_TEXT_ORD_ROWS);
  const float page_gradient =
      make_rows(page_tr, blocks, land_blocks, port_blocks, tess);
  if (global_monitor != NULL) {
    global_monitor->ocr_alive = TRUE;
    global_monitor->progress = 20;
  }

  // Stage 2: words.
  set_global_loc_code(LOC_TEXT_ORD_WORDS);
  make_words(page_tr, page_gradient, blocks, land_blocks, port_blocks, tess);
  if (global_monitor != NULL) {
    global_monitor->ocr_alive = TRUE;
    global_monitor->progress = 30;
  }

  // Stage 3: drop whatever turned out to be empty or noise.
  cleanup_blocks(blocks);
#ifndef GRAPHICS_DISABLED
  close_to_win();
#endif

  if (textord_exit_after && !interactive_mode) {
    exit(0);
  }
}

/**********************************************************************
 * cleanup_blocks
 *
 * Remove speckle outlines from every row, delete rows that are empty or
 * judged to be noise, tidy the surviving rows, and finally delete blocks
 * that are left with no rows.
 **********************************************************************/

void cleanup_blocks(BLOCK_LIST *blocks) {
  BLOCK_IT block_it = blocks;

  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    BLOCK *block = block_it.data();
    ROW_IT row_it;
    row_it.set_to_list(block->row_list());

    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW *row = row_it.data();
      clean_small_noise_from_words(row);

      // A row goes if it has no words, or (when row rejection is enabled)
      // if the noise detector condemns it. The detector is only consulted
      // for non-empty rows.
      bool discard_row = row->word_list()->empty();
      if (!discard_row && textord_noise_rejrows) {
        discard_row = clean_noise_from_row(row);
      }
      if (discard_row) {
        delete row_it.extract();
        continue;
      }

      if (textord_noise_rejwords) {
        clean_noise_from_words(row);
      }
      if (textord_blshift_maxshift >= 0) {
        tweak_row_baseline(row);
      }
    }

    if (block->row_list()->empty()) {
      delete block_it.extract();
    }
  }
}


/**********************************************************************
 * clean_noise_from_row
 *
 * Decide whether a row looks like garbage. No list is edited here;
 * cleanup_blocks deletes the row when this returns TRUE.
 *
 * Three counts are gathered over all blobs of all words in the row:
 *  - dot_count: outlines smaller than textord_noise_sizelimit * x-height,
 *    plus 2 for every blob taller than twice the x-height that is not the
 *    first blob of the first word;
 *  - norm_count: mid-sized blobs with fewer than textord_noise_translimit
 *    transitions;
 *  - super_norm_count: outlines with children whose height and width are
 *    within the syfract/sxfract tolerances of the x-height, plus every
 *    blob of a W_DONT_CHOP word.
 *
 * Returns TRUE when super_norm_count < textord_noise_sncount, dot_count > 2
 * and dot_count > norm_count * textord_noise_rowratio.
 **********************************************************************/

BOOL8 clean_noise_from_row(          //judge a row
                           ROW *row  //row to test
                          ) {
  BOOL8 testing_on;
  TBOX blob_box;                  //bounding box
  C_BLOB *blob;                  //current blob
  C_OUTLINE *outline;            //current outline
  WERD *word;                    //current word
  inT32 blob_size;               //biggest size
  inT32 trans_count = 0;         //no of transitions
  inT32 trans_threshold;         //noise tolerance
  inT32 dot_count;               //small objects
  inT32 norm_count;              //normal objects
  inT32 super_norm_count;        //real char-like
                                 //words of row
  WERD_IT word_it = row->word_list ();
  C_BLOB_IT blob_it;             //blob iterator
  C_OUTLINE_IT out_it;           //outline iterator

  // Per-blob tracing is on only when the test point lies between the
  // baseline and the x-height line of this row.
  if (textord_test_y > row->base_line (textord_test_x)
    && textord_show_blobs
    && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
    testing_on = TRUE;
  else
    testing_on = FALSE;
  dot_count = 0;
  norm_count = 0;
  super_norm_count = 0;
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //blobs in word
    blob_it.set_to_list (word->cblob_list ());
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
    blob_it.forward ()) {
      blob = blob_it.data ();
      if (!word->flag (W_DONT_CHOP)) {
                                 //get outlines
        out_it.set_to_list (blob->out_list ());
        for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
        out_it.forward ()) {
          outline = out_it.data ();
          blob_box = outline->bounding_box ();
          blob_size =
            blob_box.width () >
            blob_box.height ()? blob_box.width () : blob_box.
            height();
          if (blob_size < textord_noise_sizelimit * row->x_height ())
            dot_count++;         //count small outlines
          if (!outline->child ()->empty ()
            && blob_box.height () <
            (1 + textord_noise_syfract) * row->x_height ()
            && blob_box.height () >
            (1 - textord_noise_syfract) * row->x_height ()
            && blob_box.width () <
            (1 + textord_noise_sxfract) * row->x_height ()
            && blob_box.width () >
            (1 - textord_noise_sxfract) * row->x_height ())
            super_norm_count++;  //count char-like outlines
        }
      }
      else
        super_norm_count++;      //unchoppable words count as char-like
      blob_box = blob->bounding_box ();
      blob_size =
        blob_box.width () >
        blob_box.height ()? blob_box.width () : blob_box.height ();
      if (blob_size >= textord_noise_sizelimit * row->x_height ()
      && blob_size < row->x_height () * 2) {
        trans_threshold = blob_size / textord_noise_sizefraction;
        trans_count = blob->count_transitions (trans_threshold);
        if (trans_count < textord_noise_translimit)
          norm_count++;
      }
      else if (blob_box.height () > row->x_height () * 2
        && (!word_it.at_first () || !blob_it.at_first ()))
        dot_count += 2;
      #ifndef SECURE_NAMES
      if (testing_on) {
        // trans_count is whatever was last computed; it is not refreshed
        // for blobs outside the mid-size range.
        tprintf
          ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
          blob_box.left (), blob_box.bottom (), blob_box.right (),
          blob_box.top (), blob->out_list ()->length (), trans_count,
          blob_box.bottom () - row->base_line (blob_box.left ()));
      }
      #endif
    }
  }
  // Compute the verdict once so that the debug trace reports exactly what
  // is returned.
  const bool is_noise = super_norm_count < textord_noise_sncount
    && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
  #ifndef SECURE_NAMES
  if (textord_noise_debug) {
    tprintf ("Row ending at (%d,%g):",
      blob_box.right (), row->base_line (blob_box.right ()));
    tprintf (" R=%g, dc=%d, nc=%d, %s\n",
      norm_count > 0 ? (float) dot_count / norm_count : 9999,
      dot_count, norm_count,
      is_noise ? "REJECTED" : "ACCEPTED");
  }
  #endif
  return is_noise;
}


/**********************************************************************
 * clean_noise_from_words
 *
 * Grade every word of the row as 0 (keep), 1 (suspect) or 2 (noise) from
 * its dot-like and normal-looking counts, then move all blobs of rejected
 * words onto the word's rej_cblob_list. Grade-2 words are always rejected;
 * grade-1 words only when grade-2 words outnumber all the others.
 * Does nothing for an empty row or when textord_no_rejects is set.
 **********************************************************************/

void clean_noise_from_words(          //reject noisy words
                            ROW *row  //row to clean
                           ) {
  WERD_IT word_it = row->word_list ();
  const inT32 word_count = word_it.length ();
  if (word_count == 0 || textord_no_rejects)
    return;

  // One grade per word, in list order.
  inT8 *grades = (inT8 *) alloc_mem (word_count * sizeof (inT8));
  inT32 noise_words = 0;         //words graded 2
  inT32 other_words = 0;         //words graded 0 or 1
  inT32 slot = 0;                //index into grades
  C_BLOB_IT blob_it;
  C_OUTLINE_IT out_it;

  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    WERD *word = word_it.data ();
    inT32 dot_count = 0;         //small objects
    inT32 norm_count = 0;        //normal objects

    blob_it.set_to_list (word->cblob_list ());
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
         blob_it.forward ()) {
      C_BLOB *blob = blob_it.data ();

      if (word->flag (W_DONT_CHOP)) {
        norm_count++;
      } else {
        // Judge each outline of the blob on its own.
        out_it.set_to_list (blob->out_list ());
        for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
             out_it.forward ()) {
          C_OUTLINE *outline = out_it.data ();
          TBOX out_box = outline->bounding_box ();
          inT32 out_size = out_box.width () > out_box.height ()
                             ? out_box.width () : out_box.height ();
          if (out_size < textord_noise_sizelimit * row->x_height ())
            dot_count++;
          // Outlines with children and x-height-like proportions count
          // as normal.
          if (!outline->child ()->empty ()
              && out_box.height () <
                 (1 + textord_noise_syfract) * row->x_height ()
              && out_box.height () >
                 (1 - textord_noise_syfract) * row->x_height ()
              && out_box.width () <
                 (1 + textord_noise_sxfract) * row->x_height ()
              && out_box.width () >
                 (1 - textord_noise_sxfract) * row->x_height ())
            norm_count++;
        }
      }

      // Then judge the blob as a whole.
      TBOX blob_box = blob->bounding_box ();
      inT32 blob_size = blob_box.width () > blob_box.height ()
                          ? blob_box.width () : blob_box.height ();
      if (blob_size >= textord_noise_sizelimit * row->x_height ()
          && blob_size < row->x_height () * 2) {
        inT32 trans_threshold = blob_size / textord_noise_sizefraction;
        inT32 trans_count = blob->count_transitions (trans_threshold);
        if (trans_count < textord_noise_translimit)
          norm_count++;
      } else if (blob_box.height () > row->x_height () * 2
                 && (!word_it.at_first () || !blob_it.at_first ())) {
        // Oversized blob that is not the very first of the row.
        dot_count += 2;
      }
    }

    inT8 grade = 0;
    if (dot_count > 2) {
      if (dot_count > norm_count * textord_noise_normratio * 2)
        grade = 2;
      else if (dot_count > norm_count * textord_noise_normratio)
        grade = 1;
    }
    grades[slot] = grade;
    if (grade == 2)
      noise_words++;
    else
      other_words++;
    slot++;
  }

  // Second sweep: move the blobs of rejected words to their reject lists.
  slot = 0;
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    const bool reject = grades[slot] == 2
                        || (grades[slot] == 1 && noise_words > other_words);
    if (reject) {
      WERD *word = word_it.data ();
      blob_it.set_to_list (word->rej_cblob_list ());
      blob_it.add_list_after (word->cblob_list ());
    }
    slot++;
  }
  free_mem(grades);
}

// Strip speckle from every word of the row: outlines are offered to
// RemoveSmallRecursive with a minimum size of textord_noise_hfract times
// the word's bounding-box height (rounded). Blobs left with no outlines and
// words left with no blobs are deleted.
void clean_small_noise_from_words(ROW *row) {
  WERD_IT word_it(row->word_list());
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD* word = word_it.data();
    const double scaled_height =
        textord_noise_hfract * word->bounding_box().height() + 0.5;
    int min_size = static_cast<int>(scaled_height);

    C_BLOB_IT blob_it(word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      C_BLOB* blob = blob_it.data();
      C_OUTLINE_IT out_it(blob->out_list());
      for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
        out_it.data()->RemoveSmallRecursive(min_size, &out_it);
      }
      if (blob->out_list()->empty()) {
        delete blob_it.extract();
      }
    }

    // Words that still have blobs are left alone.
    if (!word->cblob_list()->empty()) {
      continue;
    }
    if (!word_it.at_last()) {
      // The word in front of the next one is about to vanish, so the next
      // word can no longer be a fuzzy non-space.
      WERD* following = word_it.data_relative(1);
      if (following->flag(W_FUZZY_NON)) {
        following->set_flag(W_FUZZY_NON, false);
      }
    }
    delete word_it.extract();
  }
}


/**********************************************************************
 * tweak_row_baseline
 *
 * Shift baseline to fit the blobs more accurately where they are
 * close enough.
 *
 * Replaces row->baseline with a new spline assembled segment by segment
 * while walking the blobs of the row in word/blob list order:
 *  - a blob whose bottom is within textord_blshift_maxshift x-heights of
 *    the current baseline, and whose height exceeds
 *    textord_blshift_xfraction x-heights, gets its own segment with
 *    coefficients (0, 0, blob bottom);
 *  - elsewhere the coefficients of the existing baseline segments are
 *    copied across unchanged.
 * Returns without touching the row if it contains no blobs.
 **********************************************************************/

void tweak_row_baseline(          //adjust baseline
                        ROW *row  //row to adjust
                       ) {
  TBOX blob_box;                  //bounding box
  C_BLOB *blob;                  //current blob
  WERD *word;                    //current word
  inT32 blob_count;              //no of blobs
  inT32 src_index;               //index into old baseline segments
  inT32 dest_index;              //index into new segments
  inT32 *xstarts;                //new segment boundary x coords
  double *coeffs;                //new coeffs, 3 per segment
  float ydiff;                   //baseline error in x-heights
  float x_centre;                //x centre of blob
                                 //words of row
  WERD_IT word_it = row->word_list ();
  C_BLOB_IT blob_it;             //blob iterator

  // Count the blobs in the row; the count sizes the work arrays below.
  blob_count = 0;
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //get total blobs
    blob_count += word->cblob_list ()->length ();
  }
  if (blob_count == 0)
    return;
  // Work arrays are sized for blob_count + baseline.segments output
  // segments; xstarts has one extra entry for the end of the last segment.
  xstarts =
    (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
    sizeof (inT32));
  coeffs =
    (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
    sizeof (double));

  src_index = 0;
  dest_index = 0;
  // The new spline starts where the old one does.
  xstarts[0] = row->baseline.xcoords[0];
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //blobs in word
    blob_it.set_to_list (word->cblob_list ());
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
    blob_it.forward ()) {
      blob = blob_it.data ();
      blob_box = blob->bounding_box ();
      x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
      // Distance of the blob bottom from the current baseline at the blob
      // centre, made absolute and expressed as a fraction of the x-height.
      ydiff = blob_box.bottom () - row->base_line (x_centre);
      if (ydiff < 0)
        ydiff = -ydiff / row->x_height ();
      else
        ydiff = ydiff / row->x_height ();
      if (ydiff < textord_blshift_maxshift
        && blob_box.height () / row->x_height () >
      textord_blshift_xfraction) {
        // Blob is close to the baseline and tall enough: give it a segment
        // of its own. If the pending segment would start at or beyond the
        // blob centre, pull its start back to the blob's left edge.
        if (xstarts[dest_index] >= x_centre)
          xstarts[dest_index] = blob_box.left ();
        // Coefficients (0, 0, bottom); presumably a constant baseline at
        // the blob bottom if the third coefficient is the constant term.
        coeffs[dest_index * 3] = 0;
        coeffs[dest_index * 3 + 1] = 0;
        coeffs[dest_index * 3 + 2] = blob_box.bottom ();
        //shift it
        dest_index++;
        // The next segment begins just past the right edge of this blob.
        xstarts[dest_index] = blob_box.right () + 1;
      }
      else {
        // Blob keeps the old baseline. Only emit segments if the pending
        // segment starts at or before the blob centre.
        if (xstarts[dest_index] <= x_centre) {
          // Advance through old segments whose right boundary is at or
          // before the blob centre, never moving past the last segment.
          // NOTE(review): xcoords[src_index + 1] is read before the bound
          // on src_index is tested; this relies on xcoords having
          // segments + 1 entries - confirm.
          while (row->baseline.xcoords[src_index + 1] <= x_centre
          && src_index < row->baseline.segments - 1) {
            // Copy an old segment only if it ends beyond the start of the
            // pending new segment; otherwise it is skipped.
            if (row->baseline.xcoords[src_index + 1] >
            xstarts[dest_index]) {
              coeffs[dest_index * 3] =
                row->baseline.quadratics[src_index].a;
              coeffs[dest_index * 3 + 1] =
                row->baseline.quadratics[src_index].b;
              coeffs[dest_index * 3 + 2] =
                row->baseline.quadratics[src_index].c;
              dest_index++;
              xstarts[dest_index] =
                row->baseline.xcoords[src_index + 1];
            }
            src_index++;
          }
          // Copy the old segment the loop stopped on, up to its right
          // boundary.
          coeffs[dest_index * 3] =
            row->baseline.quadratics[src_index].a;
          coeffs[dest_index * 3 + 1] =
            row->baseline.quadratics[src_index].b;
          coeffs[dest_index * 3 + 2] =
            row->baseline.quadratics[src_index].c;
          dest_index++;
          xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
        }
      }
    }
  }
  // Skip old segments that end at or before the start of the pending new
  // segment.
  while (src_index < row->baseline.segments
    && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
    src_index++;
  // Copy every remaining old segment to finish the new spline.
  while (src_index < row->baseline.segments) {
    coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
    coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
    coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
    dest_index++;
    src_index++;
    xstarts[dest_index] = row->baseline.xcoords[src_index];
  }
                                 //turn to spline
  // dest_index is now the number of segments written. The work arrays are
  // freed straight afterwards, so QSPLINE presumably copies them - confirm.
  row->baseline = QSPLINE (dest_index, xstarts, coeffs);
  free_mem(xstarts);
  free_mem(coeffs);
}


/**********************************************************************
 * blob_y_order
 *
 * Comparison callback for sorting blobs in y from the page top.
 * Each argument is the address of a BLOBNBOX pointer. The blob whose
 * bounding box has the larger bottom coordinate sorts first; blobs with
 * equal bottoms are ordered by increasing left coordinate.
 * Returns -1 if item1 sorts first, 1 if item2 sorts first, 0 on a tie.
 **********************************************************************/

inT32 blob_y_order(              //sort function
                   void *item1,  //items to compare
                   void *item2) {
  BLOBNBOX *first = *static_cast<BLOBNBOX **>(item1);
  BLOBNBOX *second = *static_cast<BLOBNBOX **>(item2);

  // Primary key: bottom edge, larger value first.
  const int first_bottom = first->bounding_box ().bottom ();
  const int second_bottom = second->bounding_box ().bottom ();
  if (first_bottom != second_bottom)
    return first_bottom > second_bottom ? -1 : 1;

  // Secondary key: left edge, smaller value first.
  const int first_left = first->bounding_box ().left ();
  const int second_left = second->bounding_box ().left ();
  if (first_left != second_left)
    return first_left < second_left ? -1 : 1;

  return 0;
}