/********************************************************************** * File: tordmain.cpp (Formerly textordp.c) * Description: C++ top level textord code. * Author: Ray Smith * * (C) Copyright 1992, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #define _USE_MATH_DEFINES // for M_PI #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif #include "tordmain.h" #include // for FLT_MAX #include // for ceil, floor, M_PI #include // for INT16_MAX, uint32_t, int32_t, int16_t #include "allheaders.h" // for pixDestroy, pixGetHeight, boxCreate #include "arrayaccess.h" // for GET_DATA_BYTE #include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B... #include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction #include "clst.h" // for CLISTIZE #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE #include "drawtord.h" // for plot_box_list, to_win, create_to_win #include "edgblob.h" // for extract_edges #include "errcode.h" // for set_global_loc_code, ASSERT_HOST, LOC... #include "genericvector.h" // for PointerVector, GenericVector #include "makerow.h" // for textord_test_x, textord_test_y, texto... #include "morph.h" // for L_BOUNDARY_BG #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only) #include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base... #include "params.h" // for DoubleParam, BoolParam, IntParam #include "pdblock.h" // for PDBLK #include "points.h" // for FCOORD, ICOORD #include "polyblk.h" // for POLY_BLOCK #include "quadratc.h" // for QUAD_COEFFS #include "quspline.h" // for QSPLINE, tweak_row_baseline #include "rect.h" // for TBOX #include "scrollview.h" // for ScrollView, ScrollView::WHITE #include "statistc.h" // for STATS #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST #include "textord.h" // for Textord, WordWithBox, WordGrid, WordS... #include "tprintf.h" // for tprintf #include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP struct Box; #define MAX_NEAREST_DIST 600 //for block skew stats namespace tesseract { CLISTIZE(WordWithBox) /********************************************************************** * SetBlobStrokeWidth * * Set the horizontal and vertical stroke widths in the blob. **********************************************************************/ void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) { // Cut the blob rectangle into a Pix. int pix_height = pixGetHeight(pix); const TBOX& box = blob->bounding_box(); int width = box.width(); int height = box.height(); Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(), width, height); Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr); boxDestroy(&blob_pix_box); Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG); pixDestroy(&pix_blob); // Compute the stroke widths. uint32_t* data = pixGetData(dist_pix); int wpl = pixGetWpl(dist_pix); // Horizontal width of stroke. STATS h_stats(0, width + 1); for (int y = 0; y < height; ++y) { uint32_t* pixels = data + y*wpl; int prev_pixel = 0; int pixel = GET_DATA_BYTE(pixels, 0); for (int x = 1; x < width; ++x) { int next_pixel = GET_DATA_BYTE(pixels, x); // We are looking for a pixel that is equal to its vertical neighbours, // yet greater than its left neighbour. if (prev_pixel < pixel && (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) { if (pixel > next_pixel) { // Single local max, so an odd width. h_stats.add(pixel * 2 - 1, 1); } else if (pixel == next_pixel && x + 1 < width && pixel > GET_DATA_BYTE(pixels, x + 1)) { // Double local max, so an even width. h_stats.add(pixel * 2, 1); } } prev_pixel = pixel; pixel = next_pixel; } } // Vertical width of stroke. STATS v_stats(0, height + 1); for (int x = 0; x < width; ++x) { int prev_pixel = 0; int pixel = GET_DATA_BYTE(data, x); for (int y = 1; y < height; ++y) { uint32_t* pixels = data + y*wpl; int next_pixel = GET_DATA_BYTE(pixels, x); // We are looking for a pixel that is equal to its horizontal neighbours, // yet greater than its upper neighbour. if (prev_pixel < pixel && (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) { if (pixel > next_pixel) { // Single local max, so an odd width. v_stats.add(pixel * 2 - 1, 1); } else if (pixel == next_pixel && y + 1 < height && pixel > GET_DATA_BYTE(pixels + wpl, x)) { // Double local max, so an even width. v_stats.add(pixel * 2, 1); } } prev_pixel = pixel; pixel = next_pixel; } } pixDestroy(&dist_pix); // Store the horizontal and vertical width in the blob, keeping both // widths if there is enough information, otherwise only the one with // the most samples. // If there are insufficient samples, store zero, rather than using // 2*area/perimeter, as the numbers that gives do not match the numbers // from the distance method. if (h_stats.get_total() >= (width + height) / 4) { blob->set_horz_stroke_width(h_stats.ile(0.5f)); if (v_stats.get_total() >= (width + height) / 4) blob->set_vert_stroke_width(v_stats.ile(0.5f)); else blob->set_vert_stroke_width(0.0f); } else { if (v_stats.get_total() >= (width + height) / 4 || v_stats.get_total() > h_stats.get_total()) { blob->set_horz_stroke_width(0.0f); blob->set_vert_stroke_width(v_stats.ile(0.5f)); } else { blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) : 0.0f); blob->set_vert_stroke_width(0.0f); } } } /********************************************************************** * assign_blobs_to_blocks2 * * Make a list of TO_BLOCKs for portrait and landscape orientation. **********************************************************************/ void assign_blobs_to_blocks2(Pix* pix, BLOCK_LIST *blocks, // blocks to process TO_BLOCK_LIST *port_blocks) { // output list BLOCK *block; // current block BLOBNBOX *newblob; // created blob C_BLOB *blob; // current blob BLOCK_IT block_it = blocks; C_BLOB_IT blob_it; // iterator BLOBNBOX_IT port_box_it; // iterator // destination iterator TO_BLOCK_IT port_block_it = port_blocks; TO_BLOCK *port_block; // created block for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { block = block_it.data(); port_block = new TO_BLOCK(block); // Convert the good outlines to block->blob_list port_box_it.set_to_list(&port_block->blobs); blob_it.set_to_list(block->blob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { blob = blob_it.extract(); newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. SetBlobStrokeWidth(pix, newblob); port_box_it.add_after_then_move(newblob); } // Put the rejected outlines in block->noise_blobs, which allows them to // be reconsidered and sorted back into rows and recover outlines mistakenly // rejected. port_box_it.set_to_list(&port_block->noise_blobs); blob_it.set_to_list(block->reject_blobs()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { blob = blob_it.extract(); newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. SetBlobStrokeWidth(pix, newblob); port_box_it.add_after_then_move(newblob); } port_block_it.add_after_then_move(port_block); } } /********************************************************************** * find_components * * Find the C_OUTLINEs of the connected components in each block, put them * in C_BLOBs, and filter them by size, putting the different size * grades on different lists in the matching TO_BLOCK in to_blocks. **********************************************************************/ void Textord::find_components(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) { int width = pixGetWidth(pix); int height = pixGetHeight(pix); if (width > INT16_MAX || height > INT16_MAX) { tprintf("Input image too large! (%d, %d)\n", width, height); return; // Can't handle it. } set_global_loc_code(LOC_EDGE_PROG); BLOCK_IT block_it(blocks); // iterator for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) { extract_edges(pix, block); } } assign_blobs_to_blocks2(pix, blocks, to_blocks); ICOORD page_tr(width, height); filter_blobs(page_tr, to_blocks, !textord_test_landscape); } /********************************************************************** * filter_blobs * * Sort the blobs into sizes in all the blocks for later work. **********************************************************************/ void Textord::filter_blobs(ICOORD page_tr, // top right TO_BLOCK_LIST* blocks, // output list bool testing_on) { // for plotting TO_BLOCK_IT block_it = blocks; // destination iterator TO_BLOCK *block; // created block #ifndef GRAPHICS_DISABLED if (to_win != nullptr) to_win->Clear(); #endif // GRAPHICS_DISABLED for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { block = block_it.data(); block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs, &block->large_blobs); if (block->line_size == 0) block->line_size = 1; block->line_spacing = block->line_size * (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + 2 * tesseract::CCStruct::kAscenderFraction) / tesseract::CCStruct::kXHeightFraction; block->line_size *= textord_min_linesize; block->max_blob_size = block->line_size * textord_excess_blobsize; #ifndef GRAPHICS_DISABLED if (textord_show_blobs && testing_on) { if (to_win == nullptr) create_to_win(page_tr); block->plot_graded_blobs(to_win); } if (textord_show_boxes && testing_on) { if (to_win == nullptr) create_to_win(page_tr); plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE); plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE); plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE); plot_box_list(to_win, &block->blobs, ScrollView::WHITE); } #endif // GRAPHICS_DISABLED } } /********************************************************************** * filter_noise_blobs * * Move small blobs to a separate list. **********************************************************************/ float Textord::filter_noise_blobs( BLOBNBOX_LIST *src_list, // original list BLOBNBOX_LIST *noise_list, // noise list BLOBNBOX_LIST *small_list, // small blobs BLOBNBOX_LIST *large_list) { // large blobs int16_t height; //height of blob int16_t width; //of blob BLOBNBOX *blob; //current blob float initial_x; //first guess BLOBNBOX_IT src_it = src_list; //iterators BLOBNBOX_IT noise_it = noise_list; BLOBNBOX_IT small_it = small_list; BLOBNBOX_IT large_it = large_list; STATS size_stats (0, MAX_NEAREST_DIST); //blob heights float min_y; //size limits float max_y; float max_x; float max_height; //of good blobs for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { blob = src_it.data(); if (blob->bounding_box().height() < textord_max_noise_size) noise_it.add_after_then_move(src_it.extract()); else if (blob->enclosed_area() >= blob->bounding_box().height() * blob->bounding_box().width() * textord_noise_area_ratio) small_it.add_after_then_move(src_it.extract()); } for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { size_stats.add(src_it.data()->bounding_box().height(), 1); } initial_x = size_stats.ile(textord_initialx_ile); max_y = ceil(initial_x * (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + 2 * tesseract::CCStruct::kAscenderFraction) / tesseract::CCStruct::kXHeightFraction); min_y = floor (initial_x / 2); max_x = ceil (initial_x * textord_width_limit); small_it.move_to_first (); for (small_it.mark_cycle_pt (); !small_it.cycled_list (); small_it.forward ()) { height = small_it.data()->bounding_box().height(); if (height > max_y) large_it.add_after_then_move(small_it.extract ()); else if (height >= min_y) src_it.add_after_then_move(small_it.extract ()); } size_stats.clear (); for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { height = src_it.data ()->bounding_box ().height (); width = src_it.data ()->bounding_box ().width (); if (height < min_y) small_it.add_after_then_move (src_it.extract ()); else if (height > max_y || width > max_x) large_it.add_after_then_move (src_it.extract ()); else size_stats.add (height, 1); } max_height = size_stats.ile (textord_initialasc_ile); // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", // max_y,min_y,initial_x,max_height); max_height *= tesseract::CCStruct::kXHeightCapRatio; if (max_height > initial_x) initial_x = max_height; // tprintf(" ret=%g\n",initial_x); return initial_x; } // Fixes the block so it obeys all the rules: // Must have at least one ROW. // Must have at least one WERD. // WERDs contain a fake blob. void Textord::cleanup_nontext_block(BLOCK* block) { // Non-text blocks must contain at least one row. ROW_IT row_it(block->row_list()); if (row_it.empty()) { const TBOX& box = block->pdblk.bounding_box(); float height = box.height(); int32_t xstarts[2] = {box.left(), box.right()}; double coeffs[3] = {0.0, 0.0, static_cast(box.bottom())}; ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); row_it.add_after_then_move(row); } // Each row must contain at least one word. for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); WERD_IT w_it(row->word_list()); if (w_it.empty()) { // Make a fake blob to put in the word. TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); C_BLOB* blob = C_BLOB::FakeBlob(box); C_BLOB_LIST blobs; C_BLOB_IT blob_it(&blobs); blob_it.add_after_then_move(blob); WERD* word = new WERD(&blobs, 0, nullptr); w_it.add_after_then_move(word); } // Each word must contain a fake blob. for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); // Just assert that this is true, as it would be useful to find // out why it isn't. ASSERT_HOST(!word->cblob_list()->empty()); } row->recalc_bounding_box(); } } /********************************************************************** * cleanup_blocks * * Delete empty blocks, rows from the page. **********************************************************************/ void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) { BLOCK_IT block_it = blocks; //iterator ROW_IT row_it; //row iterator int num_rows = 0; int num_rows_all = 0; int num_blocks = 0; int num_blocks_all = 0; for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { cleanup_nontext_block(block); continue; } num_rows = 0; num_rows_all = 0; if (clean_noise) { row_it.set_to_list(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); ++num_rows_all; clean_small_noise_from_words(row); if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) || row->word_list()->empty()) { delete row_it.extract(); // lose empty row. } else { if (textord_noise_rejwords) clean_noise_from_words(row_it.data()); if (textord_blshift_maxshift >= 0) tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); ++num_rows; } } } if (block->row_list()->empty()) { delete block_it.extract(); // Lose empty text blocks. } else { ++num_blocks; } ++num_blocks_all; if (textord_noise_debug) tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); } if (textord_noise_debug) tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); } /********************************************************************** * clean_noise_from_row * * Move blobs of words from rows of garbage into the reject blobs list. **********************************************************************/ bool Textord::clean_noise_from_row( //remove empties ROW* row //row to clean ) { bool testing_on; TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count = 0; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t super_norm_count; //real char-like //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator testing_on = textord_test_y > row->base_line (textord_test_x) && textord_show_blobs && textord_test_y < row->base_line (textord_test_x) + row->x_height (); dot_count = 0; norm_count = 0; super_norm_count = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) super_norm_count++; //count smal outlines } } else super_norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; if (testing_on) { tprintf ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left (), blob_box.bottom (), blob_box.right (), blob_box.top (), blob->out_list ()->length (), trans_count, blob_box.bottom () - row->base_line (blob_box.left ())); } } } if (textord_noise_debug) { tprintf ("Row ending at (%d,%g):", blob_box.right (), row->base_line (blob_box.right ())); tprintf (" R=%g, dc=%d, nc=%d, %s\n", norm_count > 0 ? static_cast(dot_count) / norm_count : 9999, dot_count, norm_count, dot_count > norm_count * textord_noise_normratio && dot_count > 2 ? "REJECTED" : "ACCEPTED"); } return super_norm_count < textord_noise_sncount && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; } /********************************************************************** * clean_noise_from_words * * Move blobs of words from rows of garbage into the reject blobs list. **********************************************************************/ void Textord::clean_noise_from_words( //remove empties ROW *row //row to clean ) { TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t dud_words; //number discarded int32_t ok_words; //number remaining int32_t word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0 || textord_no_rejects) return; // was it chucked std::vector word_dud(ok_words); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2 && !word->flag(W_REP_CHAR)) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else { word_dud[word_index] = 0; } if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { word = word_it.data(); // Current word. // Previously we threw away the entire word. // Now just aggressively throw all small blobs into the reject list, where // the classifier can decide whether they are actually needed. word->CleanNoise(textord_noise_sizelimit * row->x_height()); } word_index++; } } // Remove outlines that are a tiny fraction in either width or height // of the word height. void Textord::clean_small_noise_from_words(ROW *row) { WERD_IT word_it(row->word_list()); for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { WERD* word = word_it.data(); int min_size = static_cast( textord_noise_hfract * word->bounding_box().height() + 0.5); C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT out_it(blob->out_list()); for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { C_OUTLINE* outline = out_it.data(); outline->RemoveSmallRecursive(min_size, &out_it); } if (blob->out_list()->empty()) { delete blob_it.extract(); } } if (word->cblob_list()->empty()) { if (!word_it.at_last()) { // The next word is no longer a fuzzy non space if it was before, // since the word before is about to be deleted. WERD* next_word = word_it.data_relative(1); if (next_word->flag(W_FUZZY_NON)) { next_word->set_flag(W_FUZZY_NON, false); } } delete word_it.extract(); } } } // Local struct to hold a group of blocks. struct BlockGroup { BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} explicit BlockGroup(BLOCK* block) : bounding_box(block->pdblk.bounding_box()), rotation(block->re_rotation()), angle(block->re_rotation().angle()), min_xheight(block->x_height()) { blocks.push_back(block); } // Union of block bounding boxes. TBOX bounding_box; // Common rotation of the blocks. FCOORD rotation; // Angle of rotation. float angle; // Min xheight of the blocks. float min_xheight; // Collection of borrowed pointers to the blocks in the group. GenericVector blocks; }; // Groups blocks by rotation, then, for each group, makes a WordGrid and calls // TransferDiacriticsToWords to copy the diacritic blobs to the most // appropriate words in the group of blocks. Source blobs are not touched. void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, BLOCK_LIST* blocks) { // Angle difference larger than this is too much to consider equal. // They should only be in multiples of M_PI/2 anyway. const double kMaxAngleDiff = 0.01; // About 0.6 degrees. PointerVector groups; BLOCK_IT bk_it(blocks); for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { BLOCK* block = bk_it.data(); if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { continue; } // Linear search of the groups to find a matching rotation. float block_angle = block->re_rotation().angle(); int best_g = 0; float best_angle_diff = FLT_MAX; for (int g = 0; g < groups.size(); ++g) { double angle_diff = fabs(block_angle - groups[g]->angle); if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI); if (angle_diff < best_angle_diff) { best_angle_diff = angle_diff; best_g = g; } } if (best_angle_diff > kMaxAngleDiff) { groups.push_back(new BlockGroup(block)); } else { groups[best_g]->blocks.push_back(block); groups[best_g]->bounding_box += block->pdblk.bounding_box(); float x_height = block->x_height(); if (x_height < groups[best_g]->min_xheight) groups[best_g]->min_xheight = x_height; } } // Now process each group of blocks. PointerVector word_ptrs; for (int g = 0; g < groups.size(); ++g) { const BlockGroup* group = groups[g]; if (group->bounding_box.null_box()) continue; WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), group->bounding_box.topright()); for (int b = 0; b < group->blocks.size(); ++b) { ROW_IT row_it(group->blocks[b]->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Put the words of the row into the grid. WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); auto* box_word = new WordWithBox(word); word_grid.InsertBBox(true, true, box_word); // Save the pointer where it will be auto-deleted. word_ptrs.push_back(box_word); } } } FCOORD rotation = group->rotation; // Make it a forward rotation that will transform blob coords to block. rotation.set_y(-rotation.y()); TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); } } // Places a copy of blobs that are near a word (after applying rotation to the // blob) in the most appropriate word, unless there is doubt, in which case a // blob can end up in two words. Source blobs are not touched. void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs, const FCOORD& rotation, WordGrid* word_grid) { WordSearch ws(word_grid); BLOBNBOX_IT b_it(diacritic_blobs); // Apply rotation to each blob before finding the nearest words. The rotation // allows us to only consider above/below placement and not left/right on // vertical text, because all text is horizontal here. for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOBNBOX* blobnbox = b_it.data(); TBOX blob_box = blobnbox->bounding_box(); blob_box.rotate(rotation); ws.StartRectSearch(blob_box); // Above/below refer to word position relative to diacritic. Since some // scripts eg Kannada/Telugu habitually put diacritics below words, and // others eg Thai/Vietnamese/Latin put most diacritics above words, try // for both if there isn't much in it. WordWithBox* best_above_word = nullptr; WordWithBox* best_below_word = nullptr; int best_above_distance = 0; int best_below_distance = 0; for (WordWithBox* word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) { if (word->word()->flag(W_REP_CHAR)) continue; TBOX word_box = word->true_bounding_box(); int x_distance = blob_box.x_gap(word_box); int y_distance = blob_box.y_gap(word_box); if (x_distance > 0) { // Arbitrarily divide x-distance by 2 if there is a major y overlap, // and the word is to the left of the diacritic. If the // diacritic is a dropped broken character between two words, this will // help send all the pieces to a single word, instead of splitting them // over the 2 words. if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) { x_distance /= 2; } y_distance += x_distance; } if (word_box.y_middle() > blob_box.y_middle() && (best_above_word == nullptr || y_distance < best_above_distance)) { best_above_word = word; best_above_distance = y_distance; } if (word_box.y_middle() <= blob_box.y_middle() && (best_below_word == nullptr || y_distance < best_below_distance)) { best_below_word = word; best_below_distance = y_distance; } } bool above_good = best_above_word != nullptr && (best_below_word == nullptr || best_above_distance < best_below_distance + blob_box.height()); bool below_good = best_below_word != nullptr && best_below_word != best_above_word && (best_above_word == nullptr || best_below_distance < best_above_distance + blob_box.height()); if (below_good) { C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); copied_blob->rotate(rotation); // Put the blob into the word's reject blobs list. C_BLOB_IT blob_it(best_below_word->RejBlobs()); blob_it.add_to_end(copied_blob); } if (above_good) { C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); copied_blob->rotate(rotation); // Put the blob into the word's reject blobs list. C_BLOB_IT blob_it(best_above_word->RejBlobs()); blob_it.add_to_end(copied_blob); } } } } // tesseract /********************************************************************** * tweak_row_baseline * * Shift baseline to fit the blobs more accurately where they are * close enough. **********************************************************************/ void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction) { TBOX blob_box; //bounding box C_BLOB *blob; //current blob WERD *word; //current word int32_t blob_count; //no of blobs int32_t src_index; //source segment int32_t dest_index; //destination segment float ydiff; //baseline error float x_centre; //centre of blob //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator blob_count = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //get total blobs blob_count += word->cblob_list ()->length (); } if (blob_count == 0) return; // spline segments std::vector xstarts(blob_count + row->baseline.segments + 1); // spline coeffs std::vector coeffs((blob_count + row->baseline.segments) * 3); src_index = 0; dest_index = 0; xstarts[0] = row->baseline.xcoords[0]; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); blob_box = blob->bounding_box (); x_centre = (blob_box.left () + blob_box.right ()) / 2.0; ydiff = blob_box.bottom () - row->base_line (x_centre); if (ydiff < 0) ydiff = -ydiff / row->x_height (); else ydiff = ydiff / row->x_height (); if (ydiff < blshift_maxshift && blob_box.height () / row->x_height () > blshift_xfraction) { if (xstarts[dest_index] >= x_centre) xstarts[dest_index] = blob_box.left (); coeffs[dest_index * 3] = 0; coeffs[dest_index * 3 + 1] = 0; coeffs[dest_index * 3 + 2] = blob_box.bottom (); //shift it dest_index++; xstarts[dest_index] = blob_box.right () + 1; } else { if (xstarts[dest_index] <= x_centre) { while (row->baseline.xcoords[src_index + 1] <= x_centre && src_index < row->baseline.segments - 1) { if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) { coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; dest_index++; xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; } src_index++; } coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; dest_index++; xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; } } } } while (src_index < row->baseline.segments && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) src_index++; while (src_index < row->baseline.segments) { coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; dest_index++; src_index++; xstarts[dest_index] = row->baseline.xcoords[src_index]; } //turn to spline row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]); }