mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 06:53:36 +08:00
0f3067db36
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@147 d0cd1f9f-072b-0410-8dd7-cf729c803f20
413 lines
15 KiB
C++
413 lines
15 KiB
C++
/**********************************************************************
|
|
* File: edgblob.c (Formerly edgeloop.c)
|
|
* Description: Functions to clean up an outline before approximation.
|
|
* Author: Ray Smith
|
|
* Created: Tue Mar 26 16:56:25 GMT 1991
|
|
*
|
|
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#include "mfcpch.h"
|
|
//#include "dirtab.h"
|
|
#include "scanedg.h"
|
|
#include "drawedg.h"
|
|
#include "edgloop.h"
|
|
#include "edgblob.h"
|
|
|
|
#define EXTERN
|
|
|
|
EXTERN INT_VAR (edges_children_per_grandchild, 10,
|
|
"Importance ratio for chucking outlines");
|
|
EXTERN INT_VAR (edges_children_count_limit, 45, "Max holes allowed in blob");
|
|
EXTERN BOOL_VAR (edges_children_fix, FALSE,
|
|
"Remove boxy parents of char-like children");
|
|
EXTERN INT_VAR (edges_min_nonhole, 12,
|
|
"Min pixels for potential char in box");
|
|
EXTERN INT_VAR (edges_patharea_ratio, 40,
|
|
"Max lensq/area for acceptable child outline");
|
|
EXTERN double_VAR (edges_childarea, 0.5,
|
|
"Max area fraction of child outline");
|
|
EXTERN double_VAR (edges_boxarea, 0.875,
|
|
"Min area fraction of grandchild for box");
|
|
|
|
/**********************************************************************
|
|
* OL_BUCKETS::OL_BUCKETS
|
|
*
|
|
* Construct an array of buckets for associating outlines into blobs.
|
|
**********************************************************************/
|
|
|
|
// Build the bucket grid covering the rectangle from bleft to tright.
// bleft and tright are stored in bl and tr; the grid has one bucket per
// BUCKETSIZE step along each axis, plus one, and the scan index starts at 0.
OL_BUCKETS::OL_BUCKETS(ICOORD bleft,    //bottom-left corner
                       ICOORD tright)   //top-right corner
    : bl(bleft), tr(tright) {
  index = 0;

  //grid dimensions in buckets
  bxdim = (tright.x() - bleft.x()) / BUCKETSIZE + 1;
  bydim = (tright.y() - bleft.y()) / BUCKETSIZE + 1;

  //one outline list per bucket, stored row by row
  buckets = new C_OUTLINE_LIST[bxdim * bydim];
}
|
|
|
|
|
|
/**********************************************************************
|
|
* OL_BUCKETS::operator()
|
|
*
|
|
* Return a pointer to a list of C_OUTLINEs corresponding to the
|
|
* given pixel coordinates.
|
|
**********************************************************************/
|
|
|
|
// Return the bucket (outline list) that holds the given image coordinates.
// The coordinates are taken relative to bl and divided by BUCKETSIZE to get
// the row and column; rows are bxdim buckets long. No range check is made.
C_OUTLINE_LIST *OL_BUCKETS::operator()(INT16 x,    //image x coord
                                       INT16 y) {  //image y coord
  return buckets + ((y - bl.y()) / BUCKETSIZE * bxdim +
                    (x - bl.x()) / BUCKETSIZE);
}
|
|
|
|
|
|
/**********************************************************************
|
|
* OL_BUCKETS::count_children
|
|
*
|
|
* Find number of descendants of this outline.
|
|
**********************************************************************/
|
|
|
|
// Count the descendants of an outline, giving up early once the count is
// known to be too large.
//
// Every bucket overlapped by the outline's bounding box is searched for
// outlines that satisfy *child < *outline (the child test used in this file).
// Each child adds 1; grandchildren are counted recursively and, when the
// remaining budget allows, weighted by edges_children_per_grandchild.
//
// Returns child count + weighted grandchild count. The search stops as soon
// as that sum exceeds max_count. max_count + 1 is returned when a "boxy"
// parent holds a child that fails one of the area / path-length tests.
INT32 OL_BUCKETS::count_children(C_OUTLINE *outline,  //parent outline
                                 INT32 max_count) {   //max output
  //range of buckets overlapped by the parent
  BOX parent_bounds = outline->bounding_box();
  INT16 first_col = (parent_bounds.left() - bl.x()) / BUCKETSIZE;
  INT16 last_col = (parent_bounds.right() - bl.x()) / BUCKETSIZE;
  INT16 first_row = (parent_bounds.bottom() - bl.y()) / BUCKETSIZE;
  INT16 last_row = (parent_bounds.top() - bl.y()) / BUCKETSIZE;

  INT32 children = 0;               //no of children
  INT32 weighted_grandchildren = 0; //no of grandchildren (weighted)
  INT32 parent_area = 0;            //filled in on first child found
  FLOAT32 max_parent_area = 0;      //box area * edges_boxarea
  BOOL8 parent_box = TRUE;          //could the parent be boxy
  C_OUTLINE_IT child_it;            //search iterator

  for (INT16 row = first_row; row <= last_row; row++) {
    for (INT16 col = first_col; col <= last_col; col++) {
      child_it.set_to_list(&buckets[row * bxdim + col]);
      if (child_it.empty())
        continue;

      for (child_it.mark_cycle_pt(); !child_it.cycled_list();
           child_it.forward()) {
        C_OUTLINE *child = child_it.data();
        //only outlines inside the parent are of interest
        if (child == outline || !(*child < *outline))
          continue;

        children++;
        if (children <= max_count) {
          //budget left for this child's own descendants
          int max_grand = (max_count - children) /
                          edges_children_per_grandchild;
          if (max_grand > 0)
            weighted_grandchildren += count_children(child, max_grand) *
                                      edges_children_per_grandchild;
          else
            weighted_grandchildren += count_children(child, 1);
        }

        //too many descendants: discard the parent
        if (children + weighted_grandchildren > max_count)
          return children + weighted_grandchildren;

        if (parent_area == 0) {
          //work out the parent's area once a child has been seen
          parent_area = outline->outer_area();
          if (parent_area < 0)
            parent_area = -parent_area;
          max_parent_area = outline->bounding_box().width()
                            * outline->bounding_box().height() * edges_boxarea;
          if (parent_area < max_parent_area)
            parent_box = FALSE;
        }

        //the remaining tests only apply to boxy parents
        if (!parent_box)
          continue;
        //with the fix enabled, short children are ignored
        if (edges_children_fix
            && !(child->bounding_box().height() > edges_min_nonhole))
          continue;

        INT32 child_area = child->outer_area();
        if (child_area < 0)
          child_area = -child_area;

        if (edges_children_fix) {
          if (parent_area - child_area < max_parent_area) {
            parent_box = FALSE;
            continue;
          }
          //boxy parent that also has grandchildren: discard
          if (weighted_grandchildren > 0)
            return max_count + 1;
          //child path too long for its area: discard
          INT32 child_length = child->pathlength();
          if (child_length * child_length >
              child_area * edges_patharea_ratio)
            return max_count + 1;
        }

        //child fills too little of its own box: discard
        if (child_area < child->bounding_box().width()
                         * child->bounding_box().height() * edges_childarea)
          return max_count + 1;
      }
    }
  }
  return children + weighted_grandchildren;
}
|
|
|
|
|
|
/**********************************************************************
|
|
* OL_BUCKETS::extract_children
|
|
*
|
|
* Move the children of this outline from the buckets to the output list.
|
|
**********************************************************************/
|
|
|
|
// Remove from the buckets every outline that satisfies *child < *outline
// and append it after the current position of the destination iterator.
// Only buckets overlapped by the outline's bounding box are visited.
void OL_BUCKETS::extract_children(C_OUTLINE *outline,  //parent outline
                                  C_OUTLINE_IT *it) {  //destination iterator
  //range of buckets overlapped by the parent
  BOX parent_bounds = outline->bounding_box();
  INT16 first_col = (parent_bounds.left() - bl.x()) / BUCKETSIZE;
  INT16 last_col = (parent_bounds.right() - bl.x()) / BUCKETSIZE;
  INT16 first_row = (parent_bounds.bottom() - bl.y()) / BUCKETSIZE;
  INT16 last_row = (parent_bounds.top() - bl.y()) / BUCKETSIZE;

  C_OUTLINE_IT search_it;       //search iterator
  for (INT16 row = first_row; row <= last_row; row++) {
    for (INT16 col = first_col; col <= last_col; col++) {
      search_it.set_to_list(&buckets[row * bxdim + col]);
      search_it.mark_cycle_pt();
      while (!search_it.cycled_list()) {
        if (*search_it.data() < *outline) {
          //take it out of the bucket and hand it to the caller
          it->add_after_then_move(search_it.extract());
        }
        search_it.forward();
      }
    }
  }
}
|
|
|
|
|
|
/**********************************************************************
|
|
* extract_edges
|
|
*
|
|
* Run the edge detector over the block and return a list of blobs.
|
|
**********************************************************************/
|
|
|
|
// Collect the outlines of a block with get_outlines, then group them into
// blobs on the block with outlines_to_blobs. The window parameter exists
// only when graphics are compiled in.
void extract_edges(
#ifndef GRAPHICS_DISABLED
                   ScrollView* window,  //window for output
#endif
                   IMAGE *image,        //image to scan
                   IMAGE *t_image,      //thresholded image
                   ICOORD page_tr,      //corner of page
                   BLOCK *block) {      //block to scan
  C_OUTLINE_LIST found_outlines;        //outlines in block
  C_OUTLINE_IT outline_it(&found_outlines);
  PDBLK *const region = (PDBLK *) block;

#ifndef GRAPHICS_DISABLED
  get_outlines(window, image, t_image, page_tr, region, &outline_it);
#else
  get_outlines(image, t_image, page_tr, region, &outline_it);
#endif

  //block box
  ICOORD bleft;
  ICOORD tright;
  block->bounding_box(bleft, tright);

  //make blobs
  outlines_to_blobs(block, bleft, tright, &found_outlines);
}
|
|
|
|
|
|
/**********************************************************************
|
|
* outlines_to_blobs
|
|
*
|
|
* Gather together outlines into blobs using the usual bucket sort.
|
|
**********************************************************************/
|
|
|
|
// Turn a list of outlines into blobs on the block: a bucket grid spanning
// bleft..tright is filled with the outlines and then emptied into blobs.
void outlines_to_blobs(BLOCK *block,                //block to scan
                       ICOORD bleft,                //block box
                       ICOORD tright,
                       C_OUTLINE_LIST *outlines) {  //outlines in block
  OL_BUCKETS bucket_grid(bleft, tright);            //make buckets

  fill_buckets(outlines, &bucket_grid);
  empty_buckets(block, &bucket_grid);
}
|
|
|
|
|
|
/**********************************************************************
|
|
* fill_buckets
|
|
*
|
|
* Sort the outlines into buckets by the bottom-left corner of their boxes.
|
|
**********************************************************************/
|
|
|
|
// Move every outline off the input list and onto the end of the bucket
// selected by the left/bottom corner of its bounding box.
void fill_buckets(C_OUTLINE_LIST *outlines,  //outlines in block
                  OL_BUCKETS *buckets) {     //output buckets
  C_OUTLINE_IT source_it(outlines);          //iterator over the input
  C_OUTLINE_IT target_it;                    //iterator in bucket

  source_it.mark_cycle_pt();
  while (!source_it.cycled_list()) {
    C_OUTLINE *outline = source_it.extract();  //take off list
    BOX ol_box = outline->bounding_box();      //get box
    target_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom()));
    target_it.add_to_end(outline);
    source_it.forward();
  }
}
|
|
|
|
|
|
/**********************************************************************
|
|
* empty_buckets
|
|
*
|
|
* Pull outlines out of the buckets and build good or rejected blobs from them.
|
|
**********************************************************************/
|
|
|
|
// Drain the buckets into blobs. For each non-empty bucket reached by the
// scan, an outermost outline is picked, its children are gathered by
// capture_children, and the resulting C_BLOB goes on the block's blob list
// when capture_children returns TRUE, otherwise on its reject list.
void empty_buckets(BLOCK *block,           //block to scan
                   OL_BUCKETS *buckets) {  //output buckets
  C_OUTLINE_LIST outlines;                 //outlines of the current blob
  C_OUTLINE_IT out_it(&outlines);
  C_OUTLINE_IT bucket_it(buckets->start_scan());
  C_OUTLINE_IT parent_it;                  //parent outline
  C_BLOB_IT good_blobs(block->blob_list());
  C_BLOB_IT junk_blobs(block->reject_blobs());

  while (!bucket_it.empty()) {
    out_it.set_to_list(&outlines);

    //find outermost: step on until the candidate is inside another outline
    //or the list wraps, repeating with a new candidate until it wraps
    do {
      parent_it = bucket_it;
      do {
        bucket_it.forward();
      } while (!(bucket_it.at_first()
                 || *parent_it.data() < *bucket_it.data()));
    } while (!bucket_it.at_first());

    //move to new list
    out_it.add_after_then_move(parent_it.extract());
    BOOL8 good_blob = capture_children(buckets, &junk_blobs, &out_it);
    C_BLOB *blob = new C_BLOB(&outlines);

    C_BLOB_IT *destination = good_blob ? &good_blobs : &junk_blobs;
    destination->add_after_then_move(blob);

    bucket_it.set_to_list(buckets->scan_next());
  }
}
|
|
|
|
|
|
/**********************************************************************
|
|
* capture_children
|
|
*
|
|
* Find all neighbouring outlines that are children of this outline
|
|
* and either move them to the output list or declare this outline
|
|
* illegal and return FALSE.
|
|
**********************************************************************/
|
|
|
|
// Gather the children of the outline at the current position of blob_it.
// Returns FALSE, moving nothing, when count_children reports more than
// edges_children_count_limit descendants. Otherwise returns TRUE, after
// moving any children from the buckets to blob_it.
// reject_it is referenced only by the disabled code below.
BOOL8 capture_children(OL_BUCKETS *buckets,      //bucket sort class
                       C_BLOB_IT *reject_it,     //dead grandchildren
                       C_OUTLINE_IT *blob_it) {  //output outlines
  C_OUTLINE *outline = blob_it->data();          //master outline
  INT32 child_count =                            //no of children
      buckets->count_children(outline, edges_children_count_limit);

  if (child_count > edges_children_count_limit)
    return FALSE;               //too complex to keep
  if (child_count == 0)
    return TRUE;                //nothing to collect

  //get single level
  buckets->extract_children(outline, blob_it);
#if 0
  C_BLOB *blob;                 //reject
  C_OUTLINE *child;             //child under test
  C_OUTLINE_LIST r_list;        //rejects
  C_OUTLINE_IT r_it;            //iterator
  C_OUTLINE_IT test_it;         //for grandchildren
  BOOL8 anydone;                //anything canned
  if (child_count == 1)
    return TRUE;
  do {
    anydone = FALSE;
    blob_it->move_to_first ();
    for (blob_it->mark_cycle_pt (); !blob_it->cycled_list ();
         blob_it->forward ()) {
      child = blob_it->data ();
      if (child != outline) {
        for (test_it = *blob_it, test_it.mark_cycle_pt ();
             !test_it.cycled_list (); test_it.forward ()) {
          if (test_it.data () != child && *test_it.data () < *child) {
            r_it.set_to_list (&r_list);
            r_it.add_after_then_move (test_it.extract ());
            //turn to blob
            blob = new C_BLOB (&r_list);
            reject_it->add_after_then_move (blob);
            anydone = TRUE;
          }
        }
        if (anydone)
          break;                //got to restart
      }
    }
  }
  while (anydone);              //got to restart
#endif
  return TRUE;
}
|