tesseract/textord/blkocc.cpp

811 lines
26 KiB
C++
Raw Normal View History

/*****************************************************************************
*
* File: blkocc.cpp (Formerly blockocc.c)
* Description: Block Occupancy routines
* Author: Chris Newton
* Created: Fri Nov 8
* Modified:
* Language: C++
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1991, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
******************************************************************************/
/*
----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------
*/
#include "mfcpch.h"
#include <ctype.h>
#include <math.h>
#include "errcode.h"
#include "grphics.h"
#include "drawtord.h"
#include "blkocc.h"
#include "notdll.h"
const ERRCODE BLOCKOCC = "BlockOcc";
ELISTIZE (REGION_OCC)
#define EXTERN
EXTERN BOOL_VAR (blockocc_show_result, FALSE,
"Show intermediate results");
/* The values given here should reflect the values of bln_x_height and
* bln_baseline_offset. These are defined as part of the word class
* definition */
EXTERN INT_VAR (blockocc_desc_height, 0,
"Descender height after normalisation");
EXTERN INT_VAR (blockocc_asc_height, 255,
"Ascender height after normalisation");
EXTERN INT_VAR (blockocc_band_count, 4, "Number of bands used");
EXTERN double_VAR (textord_underline_threshold, 0.5,
"Fraction of width occupied");
/* ********************************************************************
A note on transitions.
We want to record occupancy in various bands. In general we need to consider
7 situations:
(1) (2) (3) (4)
\ / \ / \ /
__\_____/_____\_________/_____\_________/______ Upper Limit
\ / \ / \ /
/ \ \-->--/ \--<--/ /-----\
v ^ / \(7)
\ \ \ /
\ \ /--<--\ /-->--\ \-----/
____\______\____/_______\____/_______\_________ Lower Limit
\ \ / \ / \
(5) (6)
We know that following "next" pointers around an outline keeps the black area
on the LEFT. We only need be concerned with situations 1,2,3,5 and 7.
4 and 6 can be ignored as they represent small incursions into a large black
region which will be recorded elsewhere. Situations 3 and 5 define encloseed
areas bounded by the upper and lower limits respectively. Situation 1 is open
to the right, awaiting a closure by a situation 2 which is open to the right.
Situation 7 is entirely enclosed within the band.
The situations are refered to as region types and are determined by
find_region_type.
An empty region type is used to denote entry to an adjacent band and return
to the original band at the same x location.
***********************************************************************/
#define REGION_TYPE_EMPTY 0
#define REGION_TYPE_OPEN_RIGHT 1
#define REGION_TYPE_OPEN_LEFT 2
#define REGION_TYPE_UPPER_BOUND 3
#define REGION_TYPE_UPPER_UNBOUND 4
#define REGION_TYPE_LOWER_BOUND 5
#define REGION_TYPE_LOWER_UNBOUND 6
#define REGION_TYPE_ENCLOSED 7
BAND bands[MAX_NUM_BANDS + 1]; // band defns
/**********************************************************************
* test_underline
*
* Check to see if the blob is an underline.
* Return TRUE if it is.
**********************************************************************/
BOOL8 test_underline( //look for underlines
BOOL8 testing_on, //drawing blob
PBLOB *blob, //blob to test
float baseline, //coords of baseline
float xheight //height of line
) {
INT16 occ;
INT16 blob_width; //width of blob
BOX blob_box; //bounding box
float occs[MAX_NUM_BANDS + 1]; //total occupancy
blob_box = blob->bounding_box ();
set_bands(baseline, xheight); //setup block occ
blob_width = blob->bounding_box ().width ();
if (testing_on) {
// blob->plot(to_win,GOLDENROD,GOLDENROD);
// line_color_index(to_win,GOLDENROD);
// move2d(to_win,blob_box.left(),baseline);
// draw2d(to_win,blob_box.right(),baseline);
// move2d(to_win,blob_box.left(),baseline+xheight);
// draw2d(to_win,blob_box.right(),baseline+xheight);
tprintf
("Testing underline on blob at (%d,%d)->(%d,%d), base=%g\nOccs:",
blob->bounding_box ().left (), blob->bounding_box ().bottom (),
blob->bounding_box ().right (), blob->bounding_box ().top (),
baseline);
}
block_occ(blob, occs);
if (testing_on) {
for (occ = 0; occ <= MAX_NUM_BANDS; occ++)
tprintf ("%g ", occs[occ]);
tprintf ("\n");
}
if (occs[1] > occs[2] + occs[2] && occs[1] > occs[3] + occs[3]
&& occs[1] > blob_width * textord_underline_threshold)
return TRUE; //real underline
if (occs[4] > occs[2] + occs[2]
&& occs[4] > blob_width * textord_underline_threshold)
return TRUE; //overline
return FALSE; //neither
}
/**********************************************************************
* test_underline
*
* Check to see if the blob is an underline.
* Return TRUE if it is.
**********************************************************************/
BOOL8 test_underline( //look for underlines
BOOL8 testing_on, //drawing blob
C_BLOB *blob, //blob to test
INT16 baseline, //coords of baseline
INT16 xheight //height of line
) {
INT16 occ;
INT16 blob_width; //width of blob
BOX blob_box; //bounding box
INT32 desc_occ;
INT32 x_occ;
INT32 asc_occ;
STATS projection;
blob_box = blob->bounding_box ();
blob_width = blob->bounding_box ().width ();
projection.set_range (blob_box.bottom (), blob_box.top () + 1);
if (testing_on) {
// blob->plot(to_win,GOLDENROD,GOLDENROD);
// line_color_index(to_win,GOLDENROD);
// move2d(to_win,blob_box.left(),baseline);
// draw2d(to_win,blob_box.right(),baseline);
// move2d(to_win,blob_box.left(),baseline+xheight);
// draw2d(to_win,blob_box.right(),baseline+xheight);
tprintf
("Testing underline on blob at (%d,%d)->(%d,%d), base=%d\nOccs:",
blob->bounding_box ().left (), blob->bounding_box ().bottom (),
blob->bounding_box ().right (), blob->bounding_box ().top (),
baseline);
}
horizontal_cblob_projection(blob, &projection);
desc_occ = 0;
for (occ = blob_box.bottom (); occ < baseline; occ++)
if (occ <= blob_box.top () && projection.pile_count (occ) > desc_occ)
//max in region
desc_occ = projection.pile_count (occ);
x_occ = 0;
for (occ = baseline; occ <= baseline + xheight; occ++)
if (occ >= blob_box.bottom () && occ <= blob_box.top ()
&& projection.pile_count (occ) > x_occ)
//max in region
x_occ = projection.pile_count (occ);
asc_occ = 0;
for (occ = baseline + xheight + 1; occ <= blob_box.top (); occ++)
if (occ >= blob_box.bottom () && projection.pile_count (occ) > asc_occ)
asc_occ = projection.pile_count (occ);
if (testing_on) {
tprintf ("%d %d %d\n", desc_occ, x_occ, asc_occ);
}
if (desc_occ == 0 && x_occ == 0 && asc_occ == 0) {
tprintf ("Bottom=%d, top=%d, base=%d, x=%d\n",
blob_box.bottom (), blob_box.top (), baseline, xheight);
projection.print (stdout, TRUE);
}
if (desc_occ > x_occ + x_occ
&& desc_occ > blob_width * textord_underline_threshold)
return TRUE; //real underline
if (asc_occ > x_occ + x_occ
&& asc_occ > blob_width * textord_underline_threshold)
return TRUE; //overline
return FALSE; //neither
}
/**********************************************************************
* horizontal_cblob_projection
*
* Compute the horizontal projection of a cblob from its outlines
* and add to the given STATS.
**********************************************************************/
void horizontal_cblob_projection( //project outlines
C_BLOB *blob, //blob to project
STATS *stats //output
) {
//outlines of blob
C_OUTLINE_IT out_it = blob->out_list ();
for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) {
horizontal_coutline_projection (out_it.data (), stats);
}
}
/**********************************************************************
* horizontal_coutline_projection
*
* Compute the horizontal projection of a outline from its outlines
* and add to the given STATS.
**********************************************************************/
void horizontal_coutline_projection( //project outlines
C_OUTLINE *outline, //outline to project
STATS *stats //output
) {
ICOORD pos; //current point
ICOORD step; //edge step
INT32 length; //of outline
INT16 stepindex; //current step
C_OUTLINE_IT out_it = outline->child ();
pos = outline->start_pos ();
length = outline->pathlength ();
for (stepindex = 0; stepindex < length; stepindex++) {
step = outline->step (stepindex);
if (step.y () > 0) {
stats->add (pos.y (), pos.x ());
}
else if (step.y () < 0) {
stats->add (pos.y () - 1, -pos.x ());
}
pos += step;
}
for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) {
horizontal_coutline_projection (out_it.data (), stats);
}
}
void set_bands( //init from varibles
float baseline, //top of bottom band
float xheight //height of split band
) {
INT16 int_bl, int_xh; //for band.set
bands[DOT_BAND].set (0, 0, 0, 0, 0, 0);
int_bl = (INT16) baseline;
int_xh = (INT16) xheight;
bands[1].set (int_bl, int_bl, int_bl,
NO_LOWER_LIMIT, NO_LOWER_LIMIT, NO_LOWER_LIMIT);
bands[2].set (int_bl + int_xh / 2, int_bl + int_xh / 2, int_bl + int_xh / 2,
int_bl, int_bl, int_bl);
bands[3].set (int_bl + int_xh, int_bl + int_xh, int_bl + int_xh,
int_bl + int_xh / 2, int_bl + int_xh / 2,
int_bl + int_xh / 2);
bands[4].set (NO_UPPER_LIMIT, NO_UPPER_LIMIT, NO_UPPER_LIMIT,
int_bl + int_xh, int_bl + int_xh, int_bl + int_xh);
}
void
block_occ (PBLOB * blob, //blob to do
float occs[] //output histogram
) {
int band_index; //current band
REGION_OCC *region; //current segment
REGION_OCC_LIST region_occ_list[MAX_NUM_BANDS + 1];
REGION_OCC_IT region_it; //region iterator
find_transitions(blob, region_occ_list);
compress_region_list(region_occ_list);
for (band_index = 0; band_index <= MAX_NUM_BANDS; band_index++) {
occs[band_index] = 0.0f;
region_it.set_to_list (&region_occ_list[band_index]);
for (region_it.mark_cycle_pt (); !region_it.cycled_list ();
region_it.forward ()) {
region = region_it.data ();
occs[band_index] += region->max_x - region->min_x;
}
}
}
void find_transitions(PBLOB *blob, //blob to do
REGION_OCC_LIST *region_occ_list) {
OUTLINE_IT outline_it;
BOX box;
POLYPT_IT pt_it;
FCOORD point1;
FCOORD point2;
FCOORD *entry_pt = &point1;
FCOORD *exit_pt = &point2;
FCOORD *temp_pt;
INT16 increment;
INT16 prev_band;
INT16 band;
INT16 next_band;
float min_x;
float max_x;
float min_y;
float max_y;
BOOL8 doubly_contained;
outline_it = blob->out_list ();
for (outline_it.mark_cycle_pt (); !outline_it.cycled_list ();
outline_it.forward ()) {
find_fbox(&outline_it, &min_x, &min_y, &max_x, &max_y);
if (bands[DOT_BAND].range_in_nominal (max_y, min_y)) {
record_region(DOT_BAND,
min_x,
max_x,
REGION_TYPE_ENCLOSED,
region_occ_list);
}
else {
band = find_containing_maximal_band (max_y, min_y,
&doubly_contained);
if (band != UNDEFINED_BAND) {
//No transitions
if (!doubly_contained)
record_region(band,
min_x,
max_x,
REGION_TYPE_ENCLOSED,
region_occ_list);
else {
// if (wordocc_debug_on && blockocc_show_result)
// {
// fprintf( db_win,
// "Ignoring doubly contained outline (%d, %d) (%d, %d)\n",
// box.left(), box.top(),
// box.right(), box.bottom());
// fprintf( db_win, "\tContained in bands %d and %d\n",
// band, band + 1 );
// }
}
}
else {
//There are transitns
/*
Determining a good start point for recognising transitions between bands
is complicated by error limits on bands. We need to find a line which
significantly occupies a band.
Having found such a point, we need to find a significant transition out of
its band and start the walk around the outline from there.
Note that we are relying on having recognised and dealt with elsewhere,
outlines which do not significantly occupy more than one region. A
particularly nasty case of this are outlines which do not significantly
occupy ANY band. I.e. they lie entirely within the error limits.
Given this condition, all remaining outlines must contain at least one
significant line. */
pt_it = outline_it.data ()->polypts ();
find_significant_line(pt_it, &band);
*entry_pt = pt_it.data ()->pos;
next_region(&pt_it,
band,
&next_band,
&min_x,
&max_x,
&increment,
exit_pt);
pt_it.mark_cycle_pt ();
// Found the first real transition, so start walking the outline from here.
do {
prev_band = band;
band = band + increment;
while (band != next_band) {
temp_pt = entry_pt;
entry_pt = exit_pt;
exit_pt = temp_pt;
min_x = max_x = entry_pt->x ();
find_trans_point (&pt_it, band, band + increment,
exit_pt);
maintain_limits (&min_x, &max_x, exit_pt->x ());
record_region (band,
min_x,
max_x,
find_region_type (prev_band,
band,
band + increment,
entry_pt->x (),
exit_pt->x ()),
region_occ_list);
prev_band = band;
band = band + increment;
}
temp_pt = entry_pt;
entry_pt = exit_pt;
exit_pt = temp_pt;
min_x = max_x = entry_pt->x ();
next_region(&pt_it,
band,
&next_band,
&min_x,
&max_x,
&increment,
exit_pt);
record_region (band,
min_x,
max_x,
find_region_type (prev_band,
band,
band + increment,
entry_pt->x (),
exit_pt->x ()),
region_occ_list);
}
while (!pt_it.cycled_list ());
}
}
}
}
void record_region( //add region on list
INT16 band,
float new_min,
float new_max,
INT16 region_type,
REGION_OCC_LIST *region_occ_list) {
REGION_OCC_IT it (&(region_occ_list[band]));
// if (wordocc_debug_on && blockocc_show_result)
// fprintf( db_win, "\nBand %d, region type %d, from %f to %f",
// band, region_type, new_min, new_max );
if ((region_type == REGION_TYPE_UPPER_UNBOUND) ||
(region_type == REGION_TYPE_LOWER_UNBOUND) ||
(region_type == REGION_TYPE_EMPTY))
return;
if (it.empty ()) {
it.add_after_stay_put (new REGION_OCC (new_min, new_max, region_type));
}
else {
/* Insert in sorted order of average limit */
while ((new_min + new_max > it.data ()->min_x + it.data ()->max_x) &&
(!it.at_last ()))
it.forward ();
if ((it.at_last ()) && //at the end
(new_min + new_max > it.data ()->min_x + it.data ()->max_x))
//new range > current
it.add_after_stay_put (new REGION_OCC (new_min,
new_max, region_type));
else {
it.add_before_stay_put (new REGION_OCC (new_min,
new_max, region_type));
}
}
}
INT16 find_containing_maximal_band( //find range's band
float y1,
float y2,
BOOL8 *doubly_contained) {
INT16 band;
*doubly_contained = FALSE;
for (band = 1; band <= blockocc_band_count; band++) {
if (bands[band].range_in_maximal (y1, y2)) {
if ((band < blockocc_band_count) &&
(bands[band + 1].range_in_maximal (y1, y2)))
*doubly_contained = TRUE;
return band;
}
}
return UNDEFINED_BAND;
}
void find_significant_line(POLYPT_IT it, INT16 *band) {
/* Look for a line which significantly occupies at least one band. I.e. part
of the line is in the non-margin part of the band. */
*band = find_overlapping_minimal_band (it.data ()->pos.y (),
it.data ()->pos.y () +
it.data ()->vec.y ());
while (*band == UNDEFINED_BAND) {
it.forward ();
*band = find_overlapping_minimal_band (it.data ()->pos.y (),
it.data ()->pos.y () +
it.data ()->vec.y ());
}
}
INT16 find_overlapping_minimal_band( //find range's band
float y1,
float y2) {
INT16 band;
for (band = 1; band <= blockocc_band_count; band++) {
if (bands[band].range_overlaps_minimal (y1, y2))
return band;
}
return UNDEFINED_BAND;
}
INT16 find_region_type(INT16 entry_band,
INT16 current_band,
INT16 exit_band,
float entry_x,
float exit_x) {
if (entry_band > exit_band)
return REGION_TYPE_OPEN_RIGHT;
if (entry_band < exit_band)
return REGION_TYPE_OPEN_LEFT;
if (entry_x == exit_x)
return REGION_TYPE_EMPTY;
if (entry_band > current_band) {
if (entry_x < exit_x)
return REGION_TYPE_UPPER_BOUND;
else
return REGION_TYPE_UPPER_UNBOUND;
}
else {
if (entry_x > exit_x)
return REGION_TYPE_LOWER_BOUND;
else
return REGION_TYPE_LOWER_UNBOUND;
}
}
void find_trans_point(POLYPT_IT *pt_it,
INT16 current_band,
INT16 next_band,
FCOORD *transition_pt) {
float x1, x2, y1, y2; // points of edge
float gradient; // m in y = mx + c
float offset; // c in y = mx + c
if (current_band < next_band)
transition_pt->set_y (bands[current_band].max);
//going up
else
transition_pt->set_y (bands[current_band].min);
//going down
x1 = pt_it->data ()->pos.x ();
x2 = x1 + pt_it->data ()->vec.x ();
y1 = pt_it->data ()->pos.y ();
y2 = y1 + pt_it->data ()->vec.y ();
if (x1 == x2)
transition_pt->set_x (x1); //avoid div by 0
else {
if (y1 == y2) //avoid div by 0
transition_pt->set_x ((x1 + x2) / 2.0);
else {
gradient = (y1 - y2) / (float) (x1 - x2);
offset = y1 - x1 * gradient;
transition_pt->set_x ((transition_pt->y () - offset) / gradient);
}
}
}
void next_region(POLYPT_IT *start_pt_it,
INT16 start_band,
INT16 *to_band,
float *min_x,
float *max_x,
INT16 *increment,
FCOORD *exit_pt) {
/*
Given an edge and a band which the edge significantly occupies, find the
significant end of the region containing the band. I.e. find an edge which
points to another band such that the outline subsequetly moves significantly
out of the starting band.
Note that we can assume that we are significantly inside the current band to
start with because the edges passed will be from previous calls to this
routine apart from the first - the result of which is only used to establish
the start of the first region.
*/
INT16 band; //band of current edge
INT16 prev_band = start_band; //band of prev edge
//edge crossing out
POLYPT_IT last_transition_out_it;
//band it pts to
INT16 last_trans_out_to_band = 0;
float ext_min_x = 0.0f;
float ext_max_x = 0.0f;
start_pt_it->forward ();
band = find_band (start_pt_it->data ()->pos.y ());
while ((band == start_band) ||
bands[start_band].in_maximal (start_pt_it->data ()->pos.y ())) {
if (band == start_band) {
//Return to start band
if (prev_band != start_band) {
*min_x = ext_min_x;
*max_x = ext_max_x;
}
maintain_limits (min_x, max_x, start_pt_it->data ()->pos.x ());
}
else {
if (prev_band == start_band) {
//Exit from start band
//so remember edge
last_transition_out_it = *start_pt_it;
//before we left
last_transition_out_it.backward ();
//and band it pts to
last_trans_out_to_band = band;
ext_min_x = *min_x;
ext_max_x = *max_x;
}
maintain_limits (&ext_min_x, &ext_max_x,
start_pt_it->data ()->pos.x ());
}
prev_band = band;
start_pt_it->forward ();
band = find_band (start_pt_it->data ()->pos.y ());
}
if (prev_band == start_band) { //exit from start band
*to_band = band;
//so remember edge
last_transition_out_it = *start_pt_it;
//before we left
last_transition_out_it.backward ();
}
else {
*to_band = last_trans_out_to_band;
}
if (*to_band > start_band)
*increment = 1;
else
*increment = -1;
find_trans_point (&last_transition_out_it, start_band,
start_band + *increment, exit_pt);
maintain_limits (min_x, max_x, exit_pt->x ());
*start_pt_it = last_transition_out_it;
}
INT16 find_band( // find POINT's band
float y) {
INT16 band;
for (band = 1; band <= blockocc_band_count; band++) {
if (bands[band].in_nominal (y))
return band;
}
BLOCKOCC.error ("find_band", ABORT, "Cant find band for %d", y);
return 0;
}
void compress_region_list( // join open regions
REGION_OCC_LIST *region_occ_list) {
REGION_OCC_IT it (&(region_occ_list[0]));
REGION_OCC *open_right = NULL;
INT16 i = 0;
for (i = 0; i <= blockocc_band_count; i++) {
it.set_to_list (&(region_occ_list[i]));
if (!it.empty ()) {
/* First check for left right pairs. Merge them into the open right and delete
the open left. */
open_right = NULL;
for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
switch (it.data ()->region_type) {
case REGION_TYPE_OPEN_RIGHT:
{
if (open_right != NULL)
BLOCKOCC.error ("compress_region_list", ABORT,
"unmatched right");
else
open_right = it.data ();
break;
}
case REGION_TYPE_OPEN_LEFT:
{
if (open_right == NULL)
BLOCKOCC.error ("compress_region_list", ABORT,
"unmatched left");
else {
open_right->max_x = it.data ()->max_x;
open_right = NULL;
delete it.extract ();
}
break;
}
default:
break;
}
}
if (open_right != NULL)
BLOCKOCC.error ("compress_region_list", ABORT,
"unmatched right remaining");
/* Now cycle the list again, merging and deleting any redundant regions */
it.move_to_first ();
open_right = it.data ();
while (!it.at_last ()) {
it.forward ();
if (it.data ()->min_x <= open_right->max_x) {
// Overlaps
if (it.data ()->max_x > open_right->max_x)
open_right->max_x = it.data ()->max_x;
// Extend
delete it.extract ();
}
else
open_right = it.data ();
}
}
}
}
void find_fbox(OUTLINE_IT *out_it,
float *min_x,
float *min_y,
float *max_x,
float *max_y) {
POLYPT_IT pt_it = out_it->data ()->polypts ();
FCOORD pt;
*min_x = 9999.0f;
*min_y = 9999.0f;
*max_x = 0.0f;
*max_y = 0.0f;
for (pt_it.mark_cycle_pt (); !pt_it.cycled_list (); pt_it.forward ()) {
pt = pt_it.data ()->pos;
maintain_limits (min_x, max_x, pt.x ());
maintain_limits (min_y, max_y, pt.y ());
}
}
void maintain_limits(float *min_x, float *max_x, float x) {
if (x > *max_x)
*max_x = x;
if (x < *min_x)
*min_x = x;
}