mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-24 15:47:48 +08:00
167 lines
5.4 KiB
C++
167 lines
5.4 KiB
C++
|
#include "mfcpch.h"
|
||
|
#include "statistc.h"
|
||
|
#include "gap_map.h"
|
||
|
|
||
|
#define EXTERN
|
||
|
EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables");
|
||
|
EXTERN BOOL_VAR (gapmap_use_ends, FALSE,
|
||
|
"Use large space at start and end of rows");
|
||
|
EXTERN BOOL_VAR (gapmap_no_isolated_quanta, FALSE,
|
||
|
"Ensure gaps not less than 2quanta wide");
|
||
|
EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier");
|
||
|
|
||
|
/*************************************************************************
|
||
|
* A block gap map is a quantised histogram of whitespace regions in the
|
||
|
* block. It is a vertical projection of wide gaps WITHIN lines
|
||
|
*
|
||
|
* The map is held as an array of counts of rows which have a wide gap
|
||
|
* covering that region of the row. Each bucket in the map represents a width
|
||
|
* of about half an xheight - (The median of the xhts in the rows is used.)
|
||
|
*
|
||
|
* The block is considered RECTANGULAR - delimited by the left and right
|
||
|
* extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
|
||
|
* counted.
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
GAPMAP::GAPMAP( //Constructor
|
||
|
TO_BLOCK *block //block
|
||
|
) {
|
||
|
TO_ROW_IT row_it; //row iterator
|
||
|
TO_ROW *row; //current row
|
||
|
BLOBNBOX_IT blob_it; //iterator
|
||
|
BOX blob_box;
|
||
|
BOX prev_blob_box;
|
||
|
INT16 gap_width;
|
||
|
INT16 start_of_row;
|
||
|
INT16 end_of_row;
|
||
|
STATS xht_stats (0, 128);
|
||
|
INT16 min_quantum;
|
||
|
INT16 max_quantum;
|
||
|
INT16 i;
|
||
|
|
||
|
row_it.set_to_list (block->get_rows ());
|
||
|
/*
|
||
|
Find left and right extremes and bucket size
|
||
|
*/
|
||
|
map = NULL;
|
||
|
min_left = MAX_INT16;
|
||
|
max_right = -MAX_INT16;
|
||
|
total_rows = 0;
|
||
|
any_tabs = FALSE;
|
||
|
for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
|
||
|
row = row_it.data ();
|
||
|
if (!row->blob_list ()->empty ()) {
|
||
|
total_rows++;
|
||
|
xht_stats.add ((INT16) floor (row->xheight + 0.5), 1);
|
||
|
blob_it.set_to_list (row->blob_list ());
|
||
|
start_of_row = blob_it.data ()->bounding_box ().left ();
|
||
|
end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
|
||
|
if (min_left > start_of_row)
|
||
|
min_left = start_of_row;
|
||
|
if (max_right < end_of_row)
|
||
|
max_right = end_of_row;
|
||
|
}
|
||
|
}
|
||
|
if ((total_rows < 3) || (min_left >= max_right)) {
|
||
|
total_rows = 0;
|
||
|
min_left = max_right = 0;
|
||
|
return;
|
||
|
}
|
||
|
bucket_size = (INT16) floor (xht_stats.median () + 0.5) / 2;
|
||
|
map_max = (max_right - min_left) / bucket_size;
|
||
|
map = (INT16 *) alloc_mem ((map_max + 1) * sizeof (INT16));
|
||
|
for (i = 0; i <= map_max; i++)
|
||
|
map[i] = 0;
|
||
|
|
||
|
for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
|
||
|
row = row_it.data ();
|
||
|
if (!row->blob_list ()->empty ()) {
|
||
|
blob_it.set_to_list (row->blob_list ());
|
||
|
blob_it.mark_cycle_pt ();
|
||
|
blob_box = box_next (&blob_it);
|
||
|
prev_blob_box = blob_box;
|
||
|
if (gapmap_use_ends) {
|
||
|
/* Leading space */
|
||
|
gap_width = blob_box.left () - min_left;
|
||
|
if ((gap_width > gapmap_big_gaps * row->xheight)
|
||
|
&& gap_width > 2) {
|
||
|
max_quantum = (blob_box.left () - min_left) / bucket_size;
|
||
|
for (i = 0; i <= max_quantum; i++)
|
||
|
map[i]++;
|
||
|
}
|
||
|
}
|
||
|
while (!blob_it.cycled_list ()) {
|
||
|
blob_box = box_next (&blob_it);
|
||
|
gap_width = blob_box.left () - prev_blob_box.right ();
|
||
|
if ((gap_width > gapmap_big_gaps * row->xheight)
|
||
|
&& gap_width > 2) {
|
||
|
min_quantum =
|
||
|
(prev_blob_box.right () - min_left) / bucket_size;
|
||
|
max_quantum = (blob_box.left () - min_left) / bucket_size;
|
||
|
for (i = min_quantum; i <= max_quantum; i++)
|
||
|
map[i]++;
|
||
|
}
|
||
|
prev_blob_box = blob_box;
|
||
|
}
|
||
|
if (gapmap_use_ends) {
|
||
|
/* Trailing space */
|
||
|
gap_width = max_right - prev_blob_box.right ();
|
||
|
if ((gap_width > gapmap_big_gaps * row->xheight)
|
||
|
&& gap_width > 2) {
|
||
|
min_quantum =
|
||
|
(prev_blob_box.right () - min_left) / bucket_size;
|
||
|
for (i = min_quantum; i <= map_max; i++)
|
||
|
map[i]++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
for (i = 0; i <= map_max; i++) {
|
||
|
if (map[i] > total_rows / 2) {
|
||
|
if (gapmap_no_isolated_quanta &&
|
||
|
(((i == 0) &&
|
||
|
(map[i + 1] <= total_rows / 2)) ||
|
||
|
((i == map_max) &&
|
||
|
(map[i - 1] <= total_rows / 2)) ||
|
||
|
((i > 0) &&
|
||
|
(i < map_max) &&
|
||
|
(map[i - 1] <= total_rows / 2) &&
|
||
|
(map[i + 1] <= total_rows / 2)))) {
|
||
|
map[i] = 0; //prevent isolated quantum
|
||
|
}
|
||
|
else
|
||
|
any_tabs = TRUE;
|
||
|
}
|
||
|
}
|
||
|
if (gapmap_debug && any_tabs)
|
||
|
tprintf ("Table found\n");
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* GAPMAP::table_gap()
|
||
|
* Is there a bucket in the specified range where more than half the rows in the
|
||
|
* block have a wide gap?
|
||
|
*************************************************************************/
|
||
|
|
||
|
BOOL8 GAPMAP::table_gap( //Is gap a table?
|
||
|
INT16 left, //From here
|
||
|
INT16 right //To here
|
||
|
) {
|
||
|
INT16 min_quantum;
|
||
|
INT16 max_quantum;
|
||
|
INT16 i;
|
||
|
BOOL8 tab_found = FALSE;
|
||
|
|
||
|
if (!any_tabs)
|
||
|
return FALSE;
|
||
|
|
||
|
min_quantum = (left - min_left) / bucket_size;
|
||
|
max_quantum = (right - min_left) / bucket_size;
|
||
|
for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
|
||
|
if (map[i] > total_rows / 2)
|
||
|
tab_found = TRUE;
|
||
|
return tab_found;
|
||
|
}
|