tesseract/textord/tordmain.cpp
Ray Smith 0e868ef377 Major change to improve layout analysis for heavily diacritic languages:
Tha, Vie, Kan, Tel etc.
There is a new overlap detector that detects when diacritics
cause a big increase in textline overlap. In such cases, diacritics from
overlap regions are kept separate from layout analysis completely, allowing
textline formation to happen without them. The diacritics are then assigned
to 0, 1 or 2 close words at the end of layout analysis, using and modifying
an old noise detection data path.
The stored diacritics are used or not during recognition according to the
character classifier's liking for them.
2015-05-12 16:47:02 -07:00
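A minimal sketch of how the stored diacritics are handed back after layout analysis, assuming the caller already holds the BLOBNBOX_LIST that textline formation filled with the separated diacritics (the diacritic_blobs and textord names below are illustrative, not part of this file); the only entry point used is TransferDiacriticsToBlockGroups(), defined further down:

// Diacritics pulled out of the textline-overlap regions during layout.
BLOBNBOX_LIST diacritic_blobs;  // assumed filled during textline formation
// Finished layout analysis output.
BLOCK_LIST blocks;
// Reattach each diacritic to the 0, 1 or 2 closest words; the copies land in
// those words' reject blob lists for the classifier to accept or discard.
textord->TransferDiacriticsToBlockGroups(&diacritic_blobs, &blocks);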

/**********************************************************************
* File: tordmain.cpp (Formerly textordp.c)
* Description: C++ top level textord code.
* Author: Ray Smith
* Created: Tue Jul 28 17:12:33 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#ifdef __UNIX__
#include <assert.h>
#endif
#include "stderr.h"
#include "globaloc.h"
#include "blread.h"
#include "blobbox.h"
#include "ccstruct.h"
#include "edgblob.h"
#include "drawtord.h"
#include "makerow.h"
#include "wordseg.h"
#include "textord.h"
#include "tordmain.h"
#include "allheaders.h"
// Gridsize for word grid when reassigning diacritics to words. Not critical.
const int kWordGridSize = 50;
#undef EXTERN
#define EXTERN
#define MAX_NEAREST_DIST 600 //for block skew stats
namespace tesseract {
CLISTIZE(WordWithBox)
/**********************************************************************
* SetBlobStrokeWidth
*
* Set the horizontal and vertical stroke widths in the blob.
**********************************************************************/
void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
// Cut the blob rectangle into a Pix.
int pix_height = pixGetHeight(pix);
const TBOX& box = blob->bounding_box();
int width = box.width();
int height = box.height();
Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
width, height);
Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
boxDestroy(&blob_pix_box);
Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
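// Each foreground pixel of dist_pix now holds its 4-connected distance to
// the nearest background pixel (pixels outside the image count as
// background), so a stroke of width 2d-1 shows a single ridge value of d,
// and a stroke of width 2d shows two adjacent ridge pixels of value d.
// The scans below look for exactly those two patterns.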
pixDestroy(&pix_blob);
// Compute the stroke widths.
uinT32* data = pixGetData(dist_pix);
int wpl = pixGetWpl(dist_pix);
// Horizontal width of stroke.
STATS h_stats(0, width + 1);
for (int y = 0; y < height; ++y) {
uinT32* pixels = data + y*wpl;
int prev_pixel = 0;
int pixel = GET_DATA_BYTE(pixels, 0);
for (int x = 1; x < width; ++x) {
int next_pixel = GET_DATA_BYTE(pixels, x);
// We are looking for a pixel that is equal to its vertical neighbours,
// yet greater than its left neighbour.
if (prev_pixel < pixel &&
(y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
(y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
if (pixel > next_pixel) {
// Single local max, so an odd width.
h_stats.add(pixel * 2 - 1, 1);
} else if (pixel == next_pixel && x + 1 < width &&
pixel > GET_DATA_BYTE(pixels, x + 1)) {
// Double local max, so an even width.
h_stats.add(pixel * 2, 1);
}
}
prev_pixel = pixel;
pixel = next_pixel;
}
}
// Vertical width of stroke.
STATS v_stats(0, height + 1);
for (int x = 0; x < width; ++x) {
int prev_pixel = 0;
int pixel = GET_DATA_BYTE(data, x);
for (int y = 1; y < height; ++y) {
uinT32* pixels = data + y*wpl;
int next_pixel = GET_DATA_BYTE(pixels, x);
// We are looking for a pixel that is equal to its horizontal neighbours,
// yet greater than its upper neighbour.
if (prev_pixel < pixel &&
(x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
(x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
if (pixel > next_pixel) {
// Single local max, so an odd width.
v_stats.add(pixel * 2 - 1, 1);
} else if (pixel == next_pixel && y + 1 < height &&
pixel > GET_DATA_BYTE(pixels + wpl, x)) {
// Double local max, so an even width.
v_stats.add(pixel * 2, 1);
}
}
prev_pixel = pixel;
pixel = next_pixel;
}
}
pixDestroy(&dist_pix);
// Store the horizontal and vertical width in the blob, keeping both
// widths if there is enough information, otherwise only the one with
// the most samples.
// If there are insufficient samples, store zero, rather than using
// 2*area/perimeter, as the numbers it gives do not match the numbers
// from the distance method.
if (h_stats.get_total() >= (width + height) / 4) {
blob->set_horz_stroke_width(h_stats.ile(0.5f));
if (v_stats.get_total() >= (width + height) / 4)
blob->set_vert_stroke_width(v_stats.ile(0.5f));
else
blob->set_vert_stroke_width(0.0f);
} else {
if (v_stats.get_total() >= (width + height) / 4 ||
v_stats.get_total() > h_stats.get_total()) {
blob->set_horz_stroke_width(0.0f);
blob->set_vert_stroke_width(v_stats.ile(0.5f));
} else {
blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
: 0.0f);
blob->set_vert_stroke_width(0.0f);
}
}
}
/**********************************************************************
* assign_blobs_to_blocks2
*
* Make a TO_BLOCK for each BLOCK, converting its outlines to BLOBNBOXes
* with stroke widths, and add it to the output list.
**********************************************************************/
void assign_blobs_to_blocks2(Pix* pix,
BLOCK_LIST *blocks, // blocks to process
TO_BLOCK_LIST *port_blocks) { // output list
BLOCK *block; // current block
BLOBNBOX *newblob; // created blob
C_BLOB *blob; // current blob
BLOCK_IT block_it = blocks;
C_BLOB_IT blob_it; // iterator
BLOBNBOX_IT port_box_it; // iterator
// destination iterator
TO_BLOCK_IT port_block_it = port_blocks;
TO_BLOCK *port_block; // created block
for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
block = block_it.data();
port_block = new TO_BLOCK(block);
// Convert the good outlines to block->blob_list
port_box_it.set_to_list(&port_block->blobs);
blob_it.set_to_list(block->blob_list());
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
blob = blob_it.extract();
newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
SetBlobStrokeWidth(pix, newblob);
port_box_it.add_after_then_move(newblob);
}
// Put the rejected outlines in block->noise_blobs, which allows them to
// be reconsidered and sorted back into rows, recovering outlines that
// were mistakenly rejected.
port_box_it.set_to_list(&port_block->noise_blobs);
blob_it.set_to_list(block->reject_blobs());
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
blob = blob_it.extract();
newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
SetBlobStrokeWidth(pix, newblob);
port_box_it.add_after_then_move(newblob);
}
port_block_it.add_after_then_move(port_block);
}
}
/**********************************************************************
* find_components
*
* Find the C_OUTLINEs of the connected components in each block, put them
* in C_BLOBs, and filter them by size, putting the different size
* grades on different lists in the matching TO_BLOCK in to_blocks.
**********************************************************************/
void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
TO_BLOCK_LIST *to_blocks) {
int width = pixGetWidth(pix);
int height = pixGetHeight(pix);
if (width > MAX_INT16 || height > MAX_INT16) {
tprintf("Input image too large! (%d, %d)\n", width, height);
return; // Can't handle it.
}
set_global_loc_code(LOC_EDGE_PROG);
BLOCK_IT block_it(blocks); // iterator
for (block_it.mark_cycle_pt(); !block_it.cycled_list();
block_it.forward()) {
BLOCK* block = block_it.data();
if (block->poly_block() == NULL || block->poly_block()->IsText()) {
extract_edges(pix, block);
}
}
assign_blobs_to_blocks2(pix, blocks, to_blocks);
ICOORD page_tr(width, height);
filter_blobs(page_tr, to_blocks, !textord_test_landscape);
}
/**********************************************************************
* filter_blobs
*
* Sort the blobs in all the blocks into size categories for later work.
**********************************************************************/
void Textord::filter_blobs(ICOORD page_tr, // top right
TO_BLOCK_LIST *blocks, // output list
BOOL8 testing_on) { // for plotting
TO_BLOCK_IT block_it = blocks; // destination iterator
TO_BLOCK *block; // created block
#ifndef GRAPHICS_DISABLED
if (to_win != NULL)
to_win->Clear();
#endif // GRAPHICS_DISABLED
for (block_it.mark_cycle_pt(); !block_it.cycled_list();
block_it.forward()) {
block = block_it.data();
block->line_size = filter_noise_blobs(&block->blobs,
&block->noise_blobs,
&block->small_blobs,
&block->large_blobs);
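// line_size is an estimate of the x-height, so scale it by the standard
// character cell fractions to get an estimated full line spacing.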
block->line_spacing = block->line_size *
(tesseract::CCStruct::kDescenderFraction +
tesseract::CCStruct::kXHeightFraction +
2 * tesseract::CCStruct::kAscenderFraction) /
tesseract::CCStruct::kXHeightFraction;
block->line_size *= textord_min_linesize;
block->max_blob_size = block->line_size * textord_excess_blobsize;
#ifndef GRAPHICS_DISABLED
if (textord_show_blobs && testing_on) {
if (to_win == NULL)
create_to_win(page_tr);
block->plot_graded_blobs(to_win);
}
if (textord_show_boxes && testing_on) {
if (to_win == NULL)
create_to_win(page_tr);
plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE);
plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE);
plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE);
plot_box_list(to_win, &block->blobs, ScrollView::WHITE);
}
#endif // GRAPHICS_DISABLED
}
}
/**********************************************************************
* filter_noise_blobs
*
* Move small blobs to a separate list.
**********************************************************************/
float Textord::filter_noise_blobs(
BLOBNBOX_LIST *src_list, // original list
BLOBNBOX_LIST *noise_list, // noise list
BLOBNBOX_LIST *small_list, // small blobs
BLOBNBOX_LIST *large_list) { // large blobs
inT16 height; //height of blob
inT16 width; //of blob
BLOBNBOX *blob; //current blob
float initial_x; //first guess
BLOBNBOX_IT src_it = src_list; //iterators
BLOBNBOX_IT noise_it = noise_list;
BLOBNBOX_IT small_it = small_list;
BLOBNBOX_IT large_it = large_list;
STATS size_stats (0, MAX_NEAREST_DIST);
//blob heights
float min_y; //size limits
float max_y;
float max_x;
float max_height; //of good blobs
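// First pass: blobs shorter than textord_max_noise_size go straight to the
// noise list; blobs whose enclosed area is at least textord_noise_area_ratio
// of their bounding box go to the small list for re-grading below.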
for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
blob = src_it.data();
if (blob->bounding_box().height() < textord_max_noise_size)
noise_it.add_after_then_move(src_it.extract());
else if (blob->enclosed_area() >= blob->bounding_box().height()
* blob->bounding_box().width() * textord_noise_area_ratio)
small_it.add_after_then_move(src_it.extract());
}
for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
size_stats.add(src_it.data()->bounding_box().height(), 1);
}
initial_x = size_stats.ile(textord_initialx_ile);
max_y = ceil(initial_x *
(tesseract::CCStruct::kDescenderFraction +
tesseract::CCStruct::kXHeightFraction +
2 * tesseract::CCStruct::kAscenderFraction) /
tesseract::CCStruct::kXHeightFraction);
min_y = floor (initial_x / 2);
max_x = ceil (initial_x * textord_width_limit);
small_it.move_to_first ();
for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
small_it.forward ()) {
height = small_it.data()->bounding_box().height();
if (height > max_y)
large_it.add_after_then_move(small_it.extract ());
else if (height >= min_y)
src_it.add_after_then_move(small_it.extract ());
}
size_stats.clear ();
for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
height = src_it.data ()->bounding_box ().height ();
width = src_it.data ()->bounding_box ().width ();
if (height < min_y)
small_it.add_after_then_move (src_it.extract ());
else if (height > max_y || width > max_x)
large_it.add_after_then_move (src_it.extract ());
else
size_stats.add (height, 1);
}
max_height = size_stats.ile (textord_initialasc_ile);
// tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
// max_y,min_y,initial_x,max_height);
max_height *= tesseract::CCStruct::kXHeightCapRatio;
if (max_height > initial_x)
initial_x = max_height;
// tprintf(" ret=%g\n",initial_x);
return initial_x;
}
// Fixes the block so it obeys all the rules:
// Must have at least one ROW.
// Must have at least one WERD.
// WERDs contain a fake blob.
void Textord::cleanup_nontext_block(BLOCK* block) {
// Non-text blocks must contain at least one row.
ROW_IT row_it(block->row_list());
if (row_it.empty()) {
TBOX box = block->bounding_box();
float height = box.height();
inT32 xstarts[2] = {box.left(), box.right()};
double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
height / 4.0f, 0, 1);
row_it.add_after_then_move(row);
}
// Each row must contain at least one word.
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
ROW* row = row_it.data();
WERD_IT w_it(row->word_list());
if (w_it.empty()) {
// Make a fake blob to put in the word.
TBOX box = block->row_list()->singleton() ? block->bounding_box()
: row->bounding_box();
C_BLOB* blob = C_BLOB::FakeBlob(box);
C_BLOB_LIST blobs;
C_BLOB_IT blob_it(&blobs);
blob_it.add_after_then_move(blob);
WERD* word = new WERD(&blobs, 0, NULL);
w_it.add_after_then_move(word);
}
// Each word must contain a fake blob.
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
// Just assert that this is true, as it would be useful to find
// out why it isn't.
ASSERT_HOST(!word->cblob_list()->empty());
}
row->recalc_bounding_box();
}
}
/**********************************************************************
* cleanup_blocks
*
* Delete empty blocks and rows from the page.
**********************************************************************/
void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
BLOCK_IT block_it = blocks; //iterator
ROW_IT row_it; //row iterator
int num_rows = 0;
int num_rows_all = 0;
int num_blocks = 0;
int num_blocks_all = 0;
for (block_it.mark_cycle_pt(); !block_it.cycled_list();
block_it.forward()) {
BLOCK* block = block_it.data();
if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
cleanup_nontext_block(block);
continue;
}
num_rows = 0;
num_rows_all = 0;
if (clean_noise) {
row_it.set_to_list(block->row_list());
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
ROW* row = row_it.data();
++num_rows_all;
clean_small_noise_from_words(row);
if ((textord_noise_rejrows && !row->word_list()->empty() &&
clean_noise_from_row(row)) ||
row->word_list()->empty()) {
delete row_it.extract(); // lose empty row.
} else {
if (textord_noise_rejwords)
clean_noise_from_words(row_it.data());
if (textord_blshift_maxshift >= 0)
tweak_row_baseline(row, textord_blshift_maxshift,
textord_blshift_xfraction);
++num_rows;
}
}
}
if (block->row_list()->empty()) {
delete block_it.extract(); // Lose empty text blocks.
} else {
++num_blocks;
}
++num_blocks_all;
if (textord_noise_debug)
tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
}
if (textord_noise_debug)
tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
}
/**********************************************************************
* clean_noise_from_row
*
* Return TRUE if the row looks like garbage (dominated by tiny noise
* outlines), so that the caller can reject it.
**********************************************************************/
BOOL8 Textord::clean_noise_from_row( //remove empties
ROW *row //row to clean
) {
BOOL8 testing_on;
TBOX blob_box; //bounding box
C_BLOB *blob; //current blob
C_OUTLINE *outline; //current outline
WERD *word; //current word
inT32 blob_size; //biggest size
inT32 trans_count = 0; //no of transitions
inT32 trans_threshold; //noise tolerance
inT32 dot_count; //small objects
inT32 norm_count; //normal objects
inT32 super_norm_count; //real char-like
//words of row
WERD_IT word_it = row->word_list ();
C_BLOB_IT blob_it; //blob iterator
C_OUTLINE_IT out_it; //outline iterator
if (textord_test_y > row->base_line (textord_test_x)
&& textord_show_blobs
&& textord_test_y < row->base_line (textord_test_x) + row->x_height ())
testing_on = TRUE;
else
testing_on = FALSE;
dot_count = 0;
norm_count = 0;
super_norm_count = 0;
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
word = word_it.data (); //current word
//blobs in word
blob_it.set_to_list (word->cblob_list ());
for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
blob_it.forward ()) {
blob = blob_it.data ();
if (!word->flag (W_DONT_CHOP)) {
//get outlines
out_it.set_to_list (blob->out_list ());
for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
out_it.forward ()) {
outline = out_it.data ();
blob_box = outline->bounding_box ();
blob_size = blob_box.width () > blob_box.height ()
? blob_box.width () : blob_box.height ();
if (blob_size < textord_noise_sizelimit * row->x_height ())
dot_count++; //count small outlines
if (!outline->child ()->empty ()
&& blob_box.height () < (1 + textord_noise_syfract) * row->x_height ()
&& blob_box.height () > (1 - textord_noise_syfract) * row->x_height ()
&& blob_box.width () < (1 + textord_noise_sxfract) * row->x_height ()
&& blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ())
super_norm_count++; //count char-sized outlines
}
}
else
super_norm_count++;
blob_box = blob->bounding_box ();
blob_size = blob_box.width () > blob_box.height ()
? blob_box.width () : blob_box.height ();
if (blob_size >= textord_noise_sizelimit * row->x_height ()
&& blob_size < row->x_height () * 2) {
trans_threshold = blob_size / textord_noise_sizefraction;
trans_count = blob->count_transitions (trans_threshold);
if (trans_count < textord_noise_translimit)
norm_count++;
}
else if (blob_box.height () > row->x_height () * 2
&& (!word_it.at_first () || !blob_it.at_first ()))
dot_count += 2;
if (testing_on) {
tprintf
("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
blob_box.left (), blob_box.bottom (), blob_box.right (),
blob_box.top (), blob->out_list ()->length (), trans_count,
blob_box.bottom () - row->base_line (blob_box.left ()));
}
}
}
if (textord_noise_debug) {
tprintf ("Row ending at (%d,%g):",
blob_box.right (), row->base_line (blob_box.right ()));
tprintf (" R=%g, dc=%d, nc=%d, %s\n",
norm_count > 0 ? (float) dot_count / norm_count : 9999,
dot_count, norm_count,
dot_count > norm_count * textord_noise_normratio
&& dot_count > 2 ? "REJECTED" : "ACCEPTED");
}
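// A row is rejected as noise when it contains few char-sized outlines and
// its tiny 'dot' outlines heavily outnumber the normal-sized blobs.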
return super_norm_count < textord_noise_sncount
&& dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
}
/**********************************************************************
* clean_noise_from_words
*
* Move the small noise blobs of noisy-looking words into the words'
* reject blobs lists.
**********************************************************************/
void Textord::clean_noise_from_words( //remove empties
ROW *row //row to clean
) {
TBOX blob_box; //bounding box
inT8 *word_dud; //was it chucked
C_BLOB *blob; //current blob
C_OUTLINE *outline; //current outline
WERD *word; //current word
inT32 blob_size; //biggest size
inT32 trans_count; //no of transitions
inT32 trans_threshold; //noise tolerance
inT32 dot_count; //small objects
inT32 norm_count; //normal objects
inT32 dud_words; //number discarded
inT32 ok_words; //number remaining
inT32 word_index; //current word
//words of row
WERD_IT word_it = row->word_list ();
C_BLOB_IT blob_it; //blob iterator
C_OUTLINE_IT out_it; //outline iterator
ok_words = word_it.length ();
if (ok_words == 0 || textord_no_rejects)
return;
word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
dud_words = 0;
ok_words = 0;
word_index = 0;
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
word = word_it.data (); //current word
dot_count = 0;
norm_count = 0;
//blobs in word
blob_it.set_to_list (word->cblob_list ());
for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
blob_it.forward ()) {
blob = blob_it.data ();
if (!word->flag (W_DONT_CHOP)) {
//get outlines
out_it.set_to_list (blob->out_list ());
for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
out_it.forward ()) {
outline = out_it.data ();
blob_box = outline->bounding_box ();
blob_size = blob_box.width () > blob_box.height ()
? blob_box.width () : blob_box.height ();
if (blob_size < textord_noise_sizelimit * row->x_height ())
dot_count++; //count small outlines
if (!outline->child ()->empty ()
&& blob_box.height () < (1 + textord_noise_syfract) * row->x_height ()
&& blob_box.height () > (1 - textord_noise_syfract) * row->x_height ()
&& blob_box.width () < (1 + textord_noise_sxfract) * row->x_height ()
&& blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ())
norm_count++; //count char-sized outlines
}
}
else
norm_count++;
blob_box = blob->bounding_box ();
blob_size = blob_box.width () > blob_box.height ()
? blob_box.width () : blob_box.height ();
if (blob_size >= textord_noise_sizelimit * row->x_height ()
&& blob_size < row->x_height () * 2) {
trans_threshold = blob_size / textord_noise_sizefraction;
trans_count = blob->count_transitions (trans_threshold);
if (trans_count < textord_noise_translimit)
norm_count++;
}
else if (blob_box.height () > row->x_height () * 2
&& (!word_it.at_first () || !blob_it.at_first ()))
dot_count += 2;
}
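// Score the word: 2 = clearly noise-dominated, 1 = borderline (cleaned only
// if the definitely-noisy words outnumber the rest of the row), 0 = leave it.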
if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
if (dot_count > norm_count * textord_noise_normratio * 2)
word_dud[word_index] = 2;
else if (dot_count > norm_count * textord_noise_normratio)
word_dud[word_index] = 1;
else
word_dud[word_index] = 0;
} else {
word_dud[word_index] = 0;
}
if (word_dud[word_index] == 2)
dud_words++;
else
ok_words++;
word_index++;
}
word_index = 0;
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
if (word_dud[word_index] == 2
|| (word_dud[word_index] == 1 && dud_words > ok_words)) {
word = word_it.data(); // Current word.
// Previously we threw away the entire word.
// Now just aggressively throw all small blobs into the reject list, where
// the classifier can decide whether they are actually needed.
word->CleanNoise(textord_noise_sizelimit * row->x_height());
}
word_index++;
}
free_mem(word_dud);
}
// Remove outlines that are a tiny fraction in either width or height
// of the word height.
void Textord::clean_small_noise_from_words(ROW *row) {
WERD_IT word_it(row->word_list());
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
WERD* word = word_it.data();
int min_size = static_cast<int>(
textord_noise_hfract * word->bounding_box().height() + 0.5);
C_BLOB_IT blob_it(word->cblob_list());
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
C_BLOB* blob = blob_it.data();
C_OUTLINE_IT out_it(blob->out_list());
for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
C_OUTLINE* outline = out_it.data();
outline->RemoveSmallRecursive(min_size, &out_it);
}
if (blob->out_list()->empty()) {
delete blob_it.extract();
}
}
if (word->cblob_list()->empty()) {
if (!word_it.at_last()) {
// The next word can no longer be a fuzzy non-space, since the word
// preceding it is about to be deleted.
WERD* next_word = word_it.data_relative(1);
if (next_word->flag(W_FUZZY_NON)) {
next_word->set_flag(W_FUZZY_NON, false);
}
}
delete word_it.extract();
}
}
}
// Local struct to hold a group of blocks.
struct BlockGroup {
BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
explicit BlockGroup(BLOCK* block)
: bounding_box(block->bounding_box()),
rotation(block->re_rotation()),
angle(block->re_rotation().angle()),
min_xheight(block->x_height()) {
blocks.push_back(block);
}
// Union of block bounding boxes.
TBOX bounding_box;
// Common rotation of the blocks.
FCOORD rotation;
// Angle of rotation.
float angle;
// Min xheight of the blocks.
float min_xheight;
// Collection of borrowed pointers to the blocks in the group.
GenericVector<BLOCK*> blocks;
};
// Groups blocks by rotation, then, for each group, makes a WordGrid and calls
// TransferDiacriticsToWords to copy the diacritic blobs to the most
// appropriate words in the group of blocks. Source blobs are not touched.
void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
BLOCK_LIST* blocks) {
// Angle difference larger than this is too much to consider equal.
// They should only be in multiples of M_PI/2 anyway.
const double kMaxAngleDiff = 0.01; // About 0.6 degrees.
PointerVector<BlockGroup> groups;
BLOCK_IT bk_it(blocks);
for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
BLOCK* block = bk_it.data();
if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
continue;
}
// Linear search of the groups to find a matching rotation.
float block_angle = block->re_rotation().angle();
int best_g = 0;
float best_angle_diff = MAX_FLOAT32;
for (int g = 0; g < groups.size(); ++g) {
double angle_diff = fabs(block_angle - groups[g]->angle);
if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
if (angle_diff < best_angle_diff) {
best_angle_diff = angle_diff;
best_g = g;
}
}
if (best_angle_diff > kMaxAngleDiff) {
groups.push_back(new BlockGroup(block));
} else {
groups[best_g]->blocks.push_back(block);
groups[best_g]->bounding_box += block->bounding_box();
float x_height = block->x_height();
if (x_height < groups[best_g]->min_xheight)
groups[best_g]->min_xheight = x_height;
}
}
// Now process each group of blocks.
PointerVector<WordWithBox> word_ptrs;
for (int g = 0; g < groups.size(); ++g) {
const BlockGroup* group = groups[g];
tprintf("group %d, xh=%g, %d blocks\n", g, group->min_xheight,
group->blocks.size());
WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
group->bounding_box.topright());
for (int b = 0; b < group->blocks.size(); ++b) {
tprintf("block %d, %d rows\n", b, group->blocks[b]->row_list()->length());
ROW_IT row_it(group->blocks[b]->row_list());
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
ROW* row = row_it.data();
tprintf("%d words in row\n", row->word_list()->length());
// Put the words of the row into the grid.
WERD_IT w_it(row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
WordWithBox* box_word = new WordWithBox(word);
word_grid.InsertBBox(true, true, box_word);
// Save the pointer where it will be auto-deleted.
word_ptrs.push_back(box_word);
}
}
}
FCOORD rotation = group->rotation;
// Make it a forward rotation that will transform blob coords to block.
rotation.set_y(-rotation.y());
TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
}
}
// Places a copy of blobs that are near a word (after applying rotation to the
// blob) in the most appropriate word, unless there is doubt, in which case a
// blob can end up in two words. Source blobs are not touched.
void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
const FCOORD& rotation,
WordGrid* word_grid) {
WordSearch ws(word_grid);
BLOBNBOX_IT b_it(diacritic_blobs);
// Apply rotation to each blob before finding the nearest words. The rotation
// allows us to only consider above/below placement and not left/right on
// vertical text, because all text is horizontal here.
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOBNBOX* blobnbox = b_it.data();
TBOX blob_box = blobnbox->bounding_box();
blob_box.rotate(rotation);
ws.StartRectSearch(blob_box);
// Above/below refer to word position relative to diacritic. Since some
// scripts eg Kannada/Telugu habitually put diacritics below words, and
// others eg Thai/Vietnamese/Latin put most diacritics above words, try
// for both if there isn't much in it.
WordWithBox* best_above_word = NULL;
WordWithBox* best_below_word = NULL;
int best_above_distance = 0;
int best_below_distance = 0;
for (WordWithBox* word = ws.NextRectSearch(); word != NULL;
word = ws.NextRectSearch()) {
if (word->word()->flag(W_REP_CHAR)) continue;
TBOX word_box = word->true_bounding_box();
int x_distance = blob_box.x_gap(word_box);
int y_distance = blob_box.y_gap(word_box);
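// Any horizontal gap is folded into the vertical distance below, so words
// directly above or below the diacritic win over words off to one side.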
if (x_distance > 0) {
// Arbitrarily divide x-distance by 2 if there is a major y overlap,
// and the word is to the left of the diacritic. If the
// diacritic is a dropped broken character between two words, this will
// help send all the pieces to a single word, instead of splitting them
// over the 2 words.
if (word_box.major_y_overlap(blob_box) &&
blob_box.left() > word_box.right()) {
x_distance /= 2;
}
y_distance += x_distance;
}
if (word_box.y_middle() > blob_box.y_middle() &&
(best_above_word == NULL || y_distance < best_above_distance)) {
best_above_word = word;
best_above_distance = y_distance;
}
if (word_box.y_middle() <= blob_box.y_middle() &&
(best_below_word == NULL || y_distance < best_below_distance)) {
best_below_word = word;
best_below_distance = y_distance;
}
}
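// Keep the above/below candidate if it is clearly closer, or within a
// blob-height of the other; in the latter case the blob is copied to both
// words and recognition later decides which copy, if either, to keep.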
bool above_good =
best_above_word != NULL &&
(best_below_word == NULL ||
best_above_distance < best_below_distance + blob_box.height());
bool below_good =
best_below_word != NULL && best_below_word != best_above_word &&
(best_above_word == NULL ||
best_below_distance < best_above_distance + blob_box.height());
if (below_good) {
C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
copied_blob->rotate(rotation);
// Put the blob into the word's reject blobs list.
C_BLOB_IT blob_it(best_below_word->RejBlobs());
blob_it.add_to_end(copied_blob);
}
if (above_good) {
C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
copied_blob->rotate(rotation);
// Put the blob into the word's reject blobs list.
C_BLOB_IT blob_it(best_above_word->RejBlobs());
blob_it.add_to_end(copied_blob);
}
}
}
} // tesseract
/**********************************************************************
* tweak_row_baseline
*
* Shift baseline to fit the blobs more accurately where they are
* close enough.
**********************************************************************/
void tweak_row_baseline(ROW *row,
double blshift_maxshift,
double blshift_xfraction) {
TBOX blob_box; //bounding box
C_BLOB *blob; //current blob
WERD *word; //current word
inT32 blob_count; //no of blobs
inT32 src_index; //source segment
inT32 dest_index; //destination segment
inT32 *xstarts; //spline segments
double *coeffs; //spline coeffs
float ydiff; //baseline error
float x_centre; //centre of blob
//words of row
WERD_IT word_it = row->word_list ();
C_BLOB_IT blob_it; //blob iterator
blob_count = 0;
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
word = word_it.data (); //current word
//get total blobs
blob_count += word->cblob_list ()->length ();
}
if (blob_count == 0)
return;
xstarts =
(inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
sizeof (inT32));
coeffs =
(double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
sizeof (double));
src_index = 0;
dest_index = 0;
xstarts[0] = row->baseline.xcoords[0];
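// Walk the blobs in x order, emitting a flat spline segment at the blob's
// bottom wherever the blob sits close enough to the current baseline, and
// copying the original quadratic segments everywhere in between.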
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
word = word_it.data (); //current word
//blobs in word
blob_it.set_to_list (word->cblob_list ());
for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
blob_it.forward ()) {
blob = blob_it.data ();
blob_box = blob->bounding_box ();
x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
ydiff = blob_box.bottom () - row->base_line (x_centre);
if (ydiff < 0)
ydiff = -ydiff / row->x_height ();
else
ydiff = ydiff / row->x_height ();
if (ydiff < blshift_maxshift
&& blob_box.height () / row->x_height () > blshift_xfraction) {
if (xstarts[dest_index] >= x_centre)
xstarts[dest_index] = blob_box.left ();
coeffs[dest_index * 3] = 0;
coeffs[dest_index * 3 + 1] = 0;
coeffs[dest_index * 3 + 2] = blob_box.bottom ();
//shift it
dest_index++;
xstarts[dest_index] = blob_box.right () + 1;
}
else {
if (xstarts[dest_index] <= x_centre) {
while (row->baseline.xcoords[src_index + 1] <= x_centre
&& src_index < row->baseline.segments - 1) {
if (row->baseline.xcoords[src_index + 1] >
xstarts[dest_index]) {
coeffs[dest_index * 3] =
row->baseline.quadratics[src_index].a;
coeffs[dest_index * 3 + 1] =
row->baseline.quadratics[src_index].b;
coeffs[dest_index * 3 + 2] =
row->baseline.quadratics[src_index].c;
dest_index++;
xstarts[dest_index] =
row->baseline.xcoords[src_index + 1];
}
src_index++;
}
coeffs[dest_index * 3] =
row->baseline.quadratics[src_index].a;
coeffs[dest_index * 3 + 1] =
row->baseline.quadratics[src_index].b;
coeffs[dest_index * 3 + 2] =
row->baseline.quadratics[src_index].c;
dest_index++;
xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
}
}
}
}
while (src_index < row->baseline.segments
&& row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
src_index++;
while (src_index < row->baseline.segments) {
coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
dest_index++;
src_index++;
xstarts[dest_index] = row->baseline.xcoords[src_index];
}
//turn to spline
row->baseline = QSPLINE (dest_index, xstarts, coeffs);
free_mem(xstarts);
free_mem(coeffs);
}