tesseract/textord/edgblob.cpp
2008-02-01 00:48:25 +00:00

413 lines
15 KiB
C++

/**********************************************************************
* File: edgblob.c (Formerly edgeloop.c)
* Description: Functions to clean up an outline before approximation.
* Author: Ray Smith
* Created: Tue Mar 26 16:56:25 GMT 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfcpch.h"
//#include "dirtab.h"
#include "scanedg.h"
#include "drawedg.h"
#include "edgloop.h"
#include "edgblob.h"
#define EXTERN
EXTERN INT_VAR (edges_children_per_grandchild, 10,
"Importance ratio for chucking outlines");
EXTERN INT_VAR (edges_children_count_limit, 45, "Max holes allowed in blob");
EXTERN BOOL_VAR (edges_children_fix, FALSE,
"Remove boxy parents of char-like children");
EXTERN INT_VAR (edges_min_nonhole, 12,
"Min pixels for potential char in box");
EXTERN INT_VAR (edges_patharea_ratio, 40,
"Max lensq/area for acceptable child outline");
EXTERN double_VAR (edges_childarea, 0.5,
"Max area fraction of child outline");
EXTERN double_VAR (edges_boxarea, 0.875,
"Min area fraction of grandchild for box");
/**********************************************************************
* OL_BUCKETS::OL_BUCKETS
*
* Construct an array of buckets for associating outlines into blobs.
**********************************************************************/
OL_BUCKETS::OL_BUCKETS (
////constructor
ICOORD bleft, //corners
ICOORD tright): bl (bleft), tr (tright) {
bxdim = (tright.x () - bleft.x ()) / BUCKETSIZE + 1;
bydim = (tright.y () - bleft.y ()) / BUCKETSIZE + 1;
//make array
buckets = new C_OUTLINE_LIST[bxdim * bydim];
index = 0;
}
/**********************************************************************
* OL_BUCKETS::operator(
*
* Return a pointer to a list of C_OUTLINEs corresponding to the
* given pixel coordinates.
**********************************************************************/
C_OUTLINE_LIST *
OL_BUCKETS::operator () ( //array access
INT16 x, //image coords
INT16 y) {
return &buckets[(y - bl.y ()) / BUCKETSIZE * bxdim +
(x - bl.x ()) / BUCKETSIZE];
}
/**********************************************************************
* OL_BUCKETS::count_children
*
* Find number of descendants of this outline.
**********************************************************************/
INT32 OL_BUCKETS::count_children( //recursive count
C_OUTLINE *outline, //parent outline
INT32 max_count //max output
) {
BOOL8 parent_box; //could it be boxy
INT16 xmin, xmax; //coord limits
INT16 ymin, ymax;
INT16 xindex, yindex; //current bucket
C_OUTLINE *child; //current child
INT32 child_count; //no of children
INT32 grandchild_count; //no of grandchildren
INT32 parent_area; //potential box
FLOAT32 max_parent_area; //potential box
INT32 child_area; //current child
INT32 child_length; //current child
BOX olbox;
C_OUTLINE_IT child_it; //search iterator
olbox = outline->bounding_box ();
xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
child_count = 0;
grandchild_count = 0;
parent_area = 0;
max_parent_area = 0;
parent_box = TRUE;
for (yindex = ymin; yindex <= ymax; yindex++) {
for (xindex = xmin; xindex <= xmax; xindex++) {
child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
if (child_it.empty ())
continue;
for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
child_it.forward ()) {
child = child_it.data ();
if (child != outline && *child < *outline) {
child_count++;
if (child_count <= max_count) {
int max_grand = (max_count - child_count) /
edges_children_per_grandchild;
if (max_grand > 0)
grandchild_count += count_children (child, max_grand) *
edges_children_per_grandchild;
else
grandchild_count += count_children(child, 1);
}
if (child_count + grandchild_count > max_count) {
/* err.log(RESULT_OKAY,E_LOC,ERR_OCR,
ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
"Discarding parent with child count=%d, gc=%d",
child_count,grandchild_count);*/
return child_count + grandchild_count;
}
if (parent_area == 0) {
parent_area = outline->outer_area ();
if (parent_area < 0)
parent_area = -parent_area;
max_parent_area = outline->bounding_box ().width ()
* outline->bounding_box ().height () * edges_boxarea;
if (parent_area < max_parent_area)
parent_box = FALSE;
}
if (parent_box
&& (!edges_children_fix
|| child->bounding_box ().height () >
edges_min_nonhole) /**/) {
child_area = child->outer_area ();
if (child_area < 0)
child_area = -child_area;
if (edges_children_fix) {
if (parent_area - child_area < max_parent_area) {
parent_box = FALSE;
continue;
}
if (grandchild_count > 0) {
/* err.log(RESULT_OKAY,E_LOC,ERR_OCR,
ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
"Discarding parent of area %d, child area=%d, max%g with gc=%d",
parent_area,child_area,max_parent_area,grandchild_count);*/
return max_count + 1;
}
child_length = child->pathlength ();
if (child_length * child_length >
child_area * edges_patharea_ratio) {
/*/ err.log(RESULT_OKAY,E_LOC,ERR_OCR,
ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
"Discarding parent of area %d, child area=%d, max%g with child length=%d",
parent_area,child_area,max_parent_area,child_length);*/
return max_count + 1;
}
}
if (child_area < child->bounding_box ().width ()
* child->bounding_box ().height () *
edges_childarea) {
/* err.log(RESULT_OKAY,E_LOC,ERR_OCR,
ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
"Discarding parent of area %d, child area=%d, max%g with child rect=%d",
parent_area,child_area,max_parent_area,child->bounding_box().width()
*child->bounding_box().height()); */
return max_count + 1;
}
}
}
}
}
}
return child_count + grandchild_count;
}
/**********************************************************************
* OL_BUCKETS::extract_children
*
* Find number of descendants of this outline.
**********************************************************************/
void OL_BUCKETS::extract_children( //recursive count
C_OUTLINE *outline, //parent outline
C_OUTLINE_IT *it //destination iterator
) {
INT16 xmin, xmax; //coord limits
INT16 ymin, ymax;
INT16 xindex, yindex; //current bucket
BOX olbox;
C_OUTLINE_IT child_it; //search iterator
olbox = outline->bounding_box ();
xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
for (yindex = ymin; yindex <= ymax; yindex++) {
for (xindex = xmin; xindex <= xmax; xindex++) {
child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
child_it.forward ()) {
if (*child_it.data () < *outline) {
it->add_after_then_move (child_it.extract ());
}
}
}
}
}
/**********************************************************************
* extract_edges
*
* Run the edge detector over the block and return a list of blobs.
**********************************************************************/
void extract_edges( //find blobs
#ifndef GRAPHICS_DISABLED
ScrollView* window, //window for output
#endif
IMAGE *image, //image to scan
IMAGE *t_image, //thresholded image
ICOORD page_tr, //corner of page
BLOCK *block //block to scan
) {
ICOORD bleft; //block box
ICOORD tright;
C_OUTLINE_LIST outlines; //outlines in block
//iterator
C_OUTLINE_IT out_it = &outlines;
#ifndef GRAPHICS_DISABLED
get_outlines (window, image, t_image, page_tr, (PDBLK *) block, &out_it);
#else
get_outlines (image, t_image, page_tr, (PDBLK *) block, &out_it);
#endif
//block box
block->bounding_box (bleft, tright);
//make blobs
outlines_to_blobs(block, bleft, tright, &outlines);
}
/**********************************************************************
* outlines_to_blobs
*
* Gather together outlines into blobs using the usual bucket sort.
**********************************************************************/
void outlines_to_blobs( //find blobs
BLOCK *block, //block to scan
ICOORD bleft, //block box //outlines in block
ICOORD tright,
C_OUTLINE_LIST *outlines) {
//make buckets
OL_BUCKETS buckets(bleft, tright);
fill_buckets(outlines, &buckets);
empty_buckets(block, &buckets);
}
/**********************************************************************
* fill_buckets
*
* Run the edge detector over the block and return a list of blobs.
**********************************************************************/
void fill_buckets( //find blobs
C_OUTLINE_LIST *outlines, //outlines in block
OL_BUCKETS *buckets //output buckets
) {
BOX ol_box; //outline box
C_OUTLINE_IT out_it = outlines;//iterator
C_OUTLINE_IT bucket_it; //iterator in bucket
C_OUTLINE *outline; //current outline
for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) {
outline = out_it.extract (); //take off list
//get box
ol_box = outline->bounding_box ();
bucket_it.set_to_list ((*buckets) (ol_box.left (), ol_box.bottom ()));
bucket_it.add_to_end (outline);
}
}
/**********************************************************************
* empty_buckets
*
* Run the edge detector over the block and return a list of blobs.
**********************************************************************/
void empty_buckets( //find blobs
BLOCK *block, //block to scan
OL_BUCKETS *buckets //output buckets
) {
BOOL8 good_blob; //healthy blob
C_OUTLINE_LIST outlines; //outlines in block
//iterator
C_OUTLINE_IT out_it = &outlines;
C_OUTLINE_IT bucket_it = buckets->start_scan ();
C_OUTLINE_IT parent_it; //parent outline
C_BLOB *blob; //new blob
C_BLOB_IT good_blobs = block->blob_list ();
C_BLOB_IT junk_blobs = block->reject_blobs ();
while (!bucket_it.empty ()) {
out_it.set_to_list (&outlines);
do {
parent_it = bucket_it; //find outermost
do
bucket_it.forward ();
while (!bucket_it.at_first ()
&& !(*parent_it.data () < *bucket_it.data ()));
}
while (!bucket_it.at_first ());
//move to new list
out_it.add_after_then_move (parent_it.extract ());
good_blob = capture_children (buckets, &junk_blobs, &out_it);
blob = new C_BLOB (&outlines);
if (good_blob)
good_blobs.add_after_then_move (blob);
else
junk_blobs.add_after_then_move (blob);
bucket_it.set_to_list (buckets->scan_next ());
}
}
/**********************************************************************
* capture_children
*
* Find all neighbouring outlines that are children of this outline
* and either move them to the output list or declare this outline
* illegal and return FALSE.
**********************************************************************/
BOOL8 capture_children( //find children
OL_BUCKETS *buckets, //bucket sort clanss
C_BLOB_IT *reject_it, //dead grandchildren
C_OUTLINE_IT *blob_it //output outlines
) {
C_OUTLINE *outline; //master outline
INT32 child_count; //no of children
outline = blob_it->data ();
child_count = buckets->count_children (outline, edges_children_count_limit);
if (child_count > edges_children_count_limit)
return FALSE;
if (child_count == 0)
return TRUE;
//get single level
buckets->extract_children (outline, blob_it);
#if 0
C_BLOB *blob; //reject
C_OUTLINE *child; //child under test
C_OUTLINE_LIST r_list; //rejects
C_OUTLINE_IT r_it; //iterator
C_OUTLINE_IT test_it; //for grandchildren
BOOL8 anydone; //anything canned
if (child_count == 1)
return TRUE;
do {
anydone = FALSE;
blob_it->move_to_first ();
for (blob_it->mark_cycle_pt (); !blob_it->cycled_list ();
blob_it->forward ()) {
child = blob_it->data ();
if (child != outline) {
for (test_it = *blob_it, test_it.mark_cycle_pt ();
!test_it.cycled_list (); test_it.forward ()) {
if (test_it.data () != child && *test_it.data () < *child) {
r_it.set_to_list (&r_list);
r_it.add_after_then_move (test_it.extract ());
//turn to blob
blob = new C_BLOB (&r_list);
reject_it->add_after_then_move (blob);
anydone = TRUE;
}
}
if (anydone)
break; //got to reatart
}
}
}
while (anydone); //got to restart
#endif
return TRUE;
}