mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-11 23:19:04 +08:00
d11dc049e3
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1015 d0cd1f9f-072b-0410-8dd7-cf729c803f20
503 lines
19 KiB
C++
503 lines
19 KiB
C++
/**********************************************************************
|
|
* File: devanagari_processing.cpp
|
|
* Description: Methods to process images containing devanagari symbols,
|
|
* prior to classification.
|
|
* Author: Shobhit Saxena
|
|
* Created: Mon Nov 17 20:26:01 IST 2008
|
|
*
|
|
* (C) Copyright 2008, Google Inc.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
|
|
#include "devanagari_processing.h"
|
|
#include "allheaders.h"
|
|
#include "tordmain.h"
|
|
#include "statistc.h"
|
|
|
|
// Flags controlling the debugging information for shiro-rekha splitting
|
|
// strategies.
|
|
INT_VAR(devanagari_split_debuglevel, 0,
|
|
"Debug level for split shiro-rekha process.");
|
|
|
|
BOOL_VAR(devanagari_split_debugimage, 0,
|
|
"Whether to create a debug image for split shiro-rekha process.");
|
|
|
|
namespace tesseract {
|
|
|
|
ShiroRekhaSplitter::ShiroRekhaSplitter() {
|
|
orig_pix_ = NULL;
|
|
segmentation_block_list_ = NULL;
|
|
splitted_image_ = NULL;
|
|
global_xheight_ = kUnspecifiedXheight;
|
|
perform_close_ = false;
|
|
debug_image_ = NULL;
|
|
pageseg_split_strategy_ = NO_SPLIT;
|
|
ocr_split_strategy_ = NO_SPLIT;
|
|
}
|
|
|
|
ShiroRekhaSplitter::~ShiroRekhaSplitter() {
|
|
Clear();
|
|
}
|
|
|
|
void ShiroRekhaSplitter::Clear() {
|
|
pixDestroy(&orig_pix_);
|
|
pixDestroy(&splitted_image_);
|
|
pageseg_split_strategy_ = NO_SPLIT;
|
|
ocr_split_strategy_ = NO_SPLIT;
|
|
pixDestroy(&debug_image_);
|
|
segmentation_block_list_ = NULL;
|
|
global_xheight_ = kUnspecifiedXheight;
|
|
perform_close_ = false;
|
|
}
|
|
|
|
// This method dumps a debug image to the specified location.
|
|
void ShiroRekhaSplitter::DumpDebugImage(const char* filename) const {
|
|
pixWrite(filename, debug_image_, IFF_PNG);
|
|
}
|
|
|
|
// On setting the input image, a clone of it is owned by this class.
|
|
void ShiroRekhaSplitter::set_orig_pix(Pix* pix) {
|
|
if (orig_pix_) {
|
|
pixDestroy(&orig_pix_);
|
|
}
|
|
orig_pix_ = pixClone(pix);
|
|
}
|
|
|
|
// Top-level method to perform splitting based on current settings.
|
|
// Returns true if a split was actually performed.
|
|
// split_for_pageseg should be true if the splitting is being done prior to
|
|
// page segmentation. This mode uses the flag
|
|
// pageseg_devanagari_split_strategy to determine the splitting strategy.
|
|
bool ShiroRekhaSplitter::Split(bool split_for_pageseg) {
|
|
SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ :
|
|
ocr_split_strategy_;
|
|
if (split_strategy == NO_SPLIT) {
|
|
return false; // Nothing to do.
|
|
}
|
|
ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
|
|
split_strategy == MAXIMAL_SPLIT);
|
|
ASSERT_HOST(orig_pix_);
|
|
if (devanagari_split_debuglevel > 0) {
|
|
tprintf("Splitting shiro-rekha ...\n");
|
|
tprintf("Split strategy = %s\n",
|
|
split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
|
|
tprintf("Initial pageseg available = %s\n",
|
|
segmentation_block_list_ ? "yes" : "no");
|
|
}
|
|
// Create a copy of original image to store the splitting output.
|
|
pixDestroy(&splitted_image_);
|
|
splitted_image_ = pixCopy(NULL, orig_pix_);
|
|
|
|
// Initialize debug image if required.
|
|
if (devanagari_split_debugimage) {
|
|
pixDestroy(&debug_image_);
|
|
debug_image_ = pixConvertTo32(orig_pix_);
|
|
}
|
|
|
|
// Determine all connected components in the input image. A close operation
|
|
// may be required prior to this, depending on the current settings.
|
|
Pix* pix_for_ccs = pixClone(orig_pix_);
|
|
if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
|
|
!segmentation_block_list_) {
|
|
if (devanagari_split_debuglevel > 0) {
|
|
tprintf("Performing a global close operation..\n");
|
|
}
|
|
// A global measure is available for xheight, but no local information
|
|
// exists.
|
|
pixDestroy(&pix_for_ccs);
|
|
pix_for_ccs = pixCopy(NULL, orig_pix_);
|
|
PerformClose(pix_for_ccs, global_xheight_);
|
|
}
|
|
Pixa* ccs;
|
|
Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
|
|
boxaDestroy(&tmp_boxa);
|
|
pixDestroy(&pix_for_ccs);
|
|
|
|
// Iterate over all connected components. Get their bounding boxes and clip
|
|
// out the image regions corresponding to these boxes from the original image.
|
|
// Conditionally run splitting on each of them.
|
|
Boxa* regions_to_clear = boxaCreate(0);
|
|
for (int i = 0; i < pixaGetCount(ccs); ++i) {
|
|
Box* box = ccs->boxa->box[i];
|
|
Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL);
|
|
ASSERT_HOST(word_pix);
|
|
int xheight = GetXheightForCC(box);
|
|
if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
|
|
devanagari_split_debugimage) {
|
|
pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
|
|
}
|
|
// If some xheight measure is available, attempt to pre-eliminate small
|
|
// blobs from the shiro-rekha process. This is primarily to save the CCs
|
|
// corresponding to punctuation marks/small dots etc which are part of
|
|
// larger graphemes.
|
|
if (xheight == kUnspecifiedXheight ||
|
|
(box->w > xheight / 3 && box->h > xheight / 2)) {
|
|
SplitWordShiroRekha(split_strategy, word_pix, xheight,
|
|
box->x, box->y, regions_to_clear);
|
|
} else if (devanagari_split_debuglevel > 0) {
|
|
tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
|
|
box->x, box->y, box->w, box->h);
|
|
}
|
|
pixDestroy(&word_pix);
|
|
}
|
|
// Actually clear the boxes now.
|
|
for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
|
|
Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
|
|
pixClearInRect(splitted_image_, box);
|
|
boxDestroy(&box);
|
|
}
|
|
boxaDestroy(®ions_to_clear);
|
|
pixaDestroy(&ccs);
|
|
if (devanagari_split_debugimage) {
|
|
DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" :
|
|
"ocr_split_debug.png");
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Method to perform a close operation on the input image. The xheight
|
|
// estimate decides the size of sel used.
|
|
void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) {
|
|
pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
|
|
}
|
|
|
|
// This method resolves the cc bbox to a particular row and returns the row's
|
|
// xheight.
|
|
int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
|
|
if (!segmentation_block_list_) {
|
|
return global_xheight_;
|
|
}
|
|
// Compute the box coordinates in Tesseract's coordinate system.
|
|
TBOX bbox(cc_bbox->x,
|
|
pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
|
|
cc_bbox->x + cc_bbox->w,
|
|
pixGetHeight(orig_pix_) - cc_bbox->y - 1);
|
|
// Iterate over all blocks.
|
|
BLOCK_IT block_it(segmentation_block_list_);
|
|
for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
|
|
BLOCK* block = block_it.data();
|
|
// Iterate over all rows in the block.
|
|
ROW_IT row_it(block->row_list());
|
|
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
|
ROW* row = row_it.data();
|
|
if (!row->bounding_box().major_overlap(bbox)) {
|
|
continue;
|
|
}
|
|
// Row could be skewed, warped, etc. Use the position of the box to
|
|
// determine the baseline position of the row for that x-coordinate.
|
|
// Create a square TBOX whose baseline's mid-point lies at this point
|
|
// and side is row's xheight. Take the overlap of this box with the input
|
|
// box and check if it is a 'major overlap'. If so, this box lies in this
|
|
// row. In that case, return the xheight for this row.
|
|
float box_middle = 0.5 * (bbox.left() + bbox.right());
|
|
int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
|
|
TBOX test_box(box_middle - row->x_height() / 2,
|
|
baseline,
|
|
box_middle + row->x_height() / 2,
|
|
static_cast<int>(baseline + row->x_height()));
|
|
// Compute overlap. If it is is a major overlap, this is the right row.
|
|
if (bbox.major_overlap(test_box)) {
|
|
return row->x_height();
|
|
}
|
|
}
|
|
}
|
|
// No row found for this bbox.
|
|
return kUnspecifiedXheight;
|
|
}
|
|
|
|
// Returns a list of regions (boxes) which should be cleared in the original
|
|
// image so as to perform shiro-rekha splitting. Pix is assumed to carry one
|
|
// (or less) word only. Xheight measure could be the global estimate, the row
|
|
// estimate, or unspecified. If unspecified, over splitting may occur, since a
|
|
// conservative estimate of stroke width along with an associated multiplier
|
|
// is used in its place. It is advisable to have a specified xheight when
|
|
// splitting for classification/training.
|
|
// A vertical projection histogram of all the on-pixels in the input pix is
|
|
// computed. The maxima of this histogram is regarded as an approximate location
|
|
// of the shiro-rekha. By descending on the maxima's peak on both sides,
|
|
// stroke width of shiro-rekha is estimated.
|
|
// A horizontal projection histogram is computed for a sub-image of the input
|
|
// image, which extends from just below the shiro-rekha down to a certain
|
|
// leeway. The leeway depends on the input xheight, if provided, else a
|
|
// conservative multiplier on approximate stroke width is used (which may lead
|
|
// to over-splitting).
|
|
void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy,
|
|
Pix* pix,
|
|
int xheight,
|
|
int word_left,
|
|
int word_top,
|
|
Boxa* regions_to_clear) {
|
|
if (split_strategy == NO_SPLIT) {
|
|
return;
|
|
}
|
|
int width = pixGetWidth(pix);
|
|
int height = pixGetHeight(pix);
|
|
// Statistically determine the yextents of the shiro-rekha.
|
|
int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
|
|
GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom,
|
|
&shirorekha_ylevel);
|
|
// Since the shiro rekha is also a stroke, its width is equal to the stroke
|
|
// width.
|
|
int stroke_width = shirorekha_bottom - shirorekha_top + 1;
|
|
|
|
// Some safeguards to protect CCs we do not want to be split.
|
|
// These are particularly useful when the word wasn't eliminated earlier
|
|
// because xheight information was unavailable.
|
|
if (shirorekha_ylevel > height / 2) {
|
|
// Shirorekha shouldn't be in the bottom half of the word.
|
|
if (devanagari_split_debuglevel > 0) {
|
|
tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n",
|
|
word_left, word_top);
|
|
}
|
|
return;
|
|
}
|
|
if (stroke_width > height / 3) {
|
|
// Even the boldest of fonts shouldn't do this.
|
|
if (devanagari_split_debuglevel > 0) {
|
|
tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n",
|
|
word_left, word_top);
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Clear the ascender and descender regions of the word.
|
|
// Obtain a vertical projection histogram for the resulting image.
|
|
Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3,
|
|
width, 5 * stroke_width / 3);
|
|
Pix* word_in_xheight = pixCopy(NULL, pix);
|
|
pixClearInRect(word_in_xheight, box_to_clear);
|
|
// Also clear any pixels which are below shirorekha_bottom + some leeway.
|
|
// The leeway is set to xheight if the information is available, else it is a
|
|
// multiplier applied to the stroke width.
|
|
int leeway_to_keep = stroke_width * 3;
|
|
if (xheight != kUnspecifiedXheight) {
|
|
// This is because the xheight-region typically includes the shiro-rekha
|
|
// inside it, i.e., the top of the xheight range corresponds to the top of
|
|
// shiro-rekha.
|
|
leeway_to_keep = xheight - stroke_width;
|
|
}
|
|
box_to_clear->y = shirorekha_bottom + leeway_to_keep;
|
|
box_to_clear->h = height - box_to_clear->y;
|
|
pixClearInRect(word_in_xheight, box_to_clear);
|
|
boxDestroy(&box_to_clear);
|
|
|
|
PixelHistogram vert_hist;
|
|
vert_hist.ConstructVerticalCountHist(word_in_xheight);
|
|
pixDestroy(&word_in_xheight);
|
|
|
|
// If the number of black pixel in any column of the image is less than a
|
|
// fraction of the stroke width, treat it as noise / a stray mark. Perform
|
|
// these changes inside the vert_hist data itself, as that is used later on as
|
|
// a bit vector for the final split decision at every column.
|
|
for (int i = 0; i < width; ++i) {
|
|
if (vert_hist.hist()[i] <= stroke_width / 4)
|
|
vert_hist.hist()[i] = 0;
|
|
else
|
|
vert_hist.hist()[i] = 1;
|
|
}
|
|
// In order to split the line at any point, we make sure that the width of the
|
|
// gap is atleast half the stroke width.
|
|
int i = 0;
|
|
int cur_component_width = 0;
|
|
while (i < width) {
|
|
if (!vert_hist.hist()[i]) {
|
|
int j = 0;
|
|
while (i + j < width && !vert_hist.hist()[i+j])
|
|
++j;
|
|
if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
|
|
// Perform a shiro-rekha split. The intervening region lies from i to
|
|
// i+j-1.
|
|
// A minimal single-pixel split makes the estimation of intra- and
|
|
// inter-word spacing easier during page layout analysis,
|
|
// whereas a maximal split may be needed for OCR, depending on
|
|
// how the engine was trained.
|
|
bool minimal_split = (split_strategy == MINIMAL_SPLIT);
|
|
int split_width = minimal_split ? 1 : j;
|
|
int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
|
|
if (!minimal_split || (i != 0 && i + j != width)) {
|
|
Box* box_to_clear =
|
|
boxCreate(word_left + split_left,
|
|
word_top + shirorekha_top - stroke_width / 3,
|
|
split_width,
|
|
5 * stroke_width / 3);
|
|
if (box_to_clear) {
|
|
boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
|
|
// Mark this in the debug image if needed.
|
|
if (devanagari_split_debugimage) {
|
|
pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
|
|
}
|
|
boxDestroy(&box_to_clear);
|
|
cur_component_width = 0;
|
|
}
|
|
}
|
|
}
|
|
i += j;
|
|
} else {
|
|
++i;
|
|
++cur_component_width;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Refreshes the words in the segmentation block list by using blobs in the
|
|
// input block list.
|
|
// The segmentation block list must be set.
|
|
void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(
|
|
C_BLOB_LIST* new_blobs) {
|
|
// The segmentation block list must have been specified.
|
|
ASSERT_HOST(segmentation_block_list_);
|
|
if (devanagari_split_debuglevel > 0) {
|
|
tprintf("Before refreshing blobs:\n");
|
|
PrintSegmentationStats(segmentation_block_list_);
|
|
tprintf("New Blobs found: %d\n", new_blobs->length());
|
|
}
|
|
|
|
C_BLOB_LIST not_found_blobs;
|
|
RefreshWordBlobsFromNewBlobs(segmentation_block_list_,
|
|
new_blobs,
|
|
((devanagari_split_debugimage && debug_image_) ?
|
|
¬_found_blobs : NULL));
|
|
|
|
if (devanagari_split_debuglevel > 0) {
|
|
tprintf("After refreshing blobs:\n");
|
|
PrintSegmentationStats(segmentation_block_list_);
|
|
}
|
|
if (devanagari_split_debugimage && debug_image_) {
|
|
// Plot out the original blobs for which no match was found in the new
|
|
// all_blobs list.
|
|
C_BLOB_IT not_found_it(¬_found_blobs);
|
|
for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
|
|
not_found_it.forward()) {
|
|
C_BLOB* not_found = not_found_it.data();
|
|
TBOX not_found_box = not_found->bounding_box();
|
|
Box* box_to_plot = GetBoxForTBOX(not_found_box);
|
|
pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
|
|
boxDestroy(&box_to_plot);
|
|
}
|
|
|
|
// Plot out the blobs unused from all blobs.
|
|
C_BLOB_IT all_blobs_it(new_blobs);
|
|
for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
|
|
all_blobs_it.forward()) {
|
|
C_BLOB* a_blob = all_blobs_it.data();
|
|
Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
|
|
pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
|
|
boxDestroy(&box_to_plot);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Returns a new box object for the corresponding TBOX, based on the original
|
|
// image's coordinate system.
|
|
Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const {
|
|
return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1,
|
|
tbox.width(), tbox.height());
|
|
}
|
|
|
|
// This method returns the computed mode-height of blobs in the pix.
|
|
// It also prunes very small blobs from calculation.
|
|
int ShiroRekhaSplitter::GetModeHeight(Pix* pix) {
|
|
Boxa* boxa = pixConnComp(pix, NULL, 8);
|
|
STATS heights(0, pixGetHeight(pix));
|
|
heights.clear();
|
|
for (int i = 0; i < boxaGetCount(boxa); ++i) {
|
|
Box* box = boxaGetBox(boxa, i, L_CLONE);
|
|
if (box->h >= 3 || box->w >= 3) {
|
|
heights.add(box->h, 1);
|
|
}
|
|
boxDestroy(&box);
|
|
}
|
|
boxaDestroy(&boxa);
|
|
return heights.mode();
|
|
}
|
|
|
|
// This method returns y-extents of the shiro-rekha computed from the input
|
|
// word image.
|
|
void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix,
|
|
int* shirorekha_top,
|
|
int* shirorekha_bottom,
|
|
int* shirorekha_ylevel) {
|
|
// Compute a histogram from projecting the word on a vertical line.
|
|
PixelHistogram hist_horiz;
|
|
hist_horiz.ConstructHorizontalCountHist(word_pix);
|
|
// Get the ylevel where the top-line exists. This is basically the global
|
|
// maxima in the horizontal histogram.
|
|
int topline_onpixel_count = 0;
|
|
int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
|
|
|
|
// Get the upper and lower extents of the shiro rekha.
|
|
int thresh = (topline_onpixel_count * 70) / 100;
|
|
int ulimit = topline_ylevel;
|
|
int llimit = topline_ylevel;
|
|
while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh)
|
|
--ulimit;
|
|
while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh)
|
|
++llimit;
|
|
|
|
if (shirorekha_top) *shirorekha_top = ulimit;
|
|
if (shirorekha_bottom) *shirorekha_bottom = llimit;
|
|
if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel;
|
|
}
|
|
|
|
// This method returns the global-maxima for the histogram. The frequency of
|
|
// the global maxima is returned in count, if specified.
|
|
int PixelHistogram::GetHistogramMaximum(int* count) const {
|
|
int best_value = 0;
|
|
for (int i = 0; i < length_; ++i) {
|
|
if (hist_[i] > hist_[best_value]) {
|
|
best_value = i;
|
|
}
|
|
}
|
|
if (count) {
|
|
*count = hist_[best_value];
|
|
}
|
|
return best_value;
|
|
}
|
|
|
|
// Methods to construct histograms from images.
|
|
void PixelHistogram::ConstructVerticalCountHist(Pix* pix) {
|
|
Clear();
|
|
int width = pixGetWidth(pix);
|
|
int height = pixGetHeight(pix);
|
|
hist_ = new int[width];
|
|
length_ = width;
|
|
int wpl = pixGetWpl(pix);
|
|
l_uint32 *data = pixGetData(pix);
|
|
for (int i = 0; i < width; ++i)
|
|
hist_[i] = 0;
|
|
for (int i = 0; i < height; ++i) {
|
|
l_uint32 *line = data + i * wpl;
|
|
for (int j = 0; j < width; ++j)
|
|
if (GET_DATA_BIT(line, j))
|
|
++(hist_[j]);
|
|
}
|
|
}
|
|
|
|
void PixelHistogram::ConstructHorizontalCountHist(Pix* pix) {
|
|
Clear();
|
|
Numa* counts = pixCountPixelsByRow(pix, NULL);
|
|
length_ = numaGetCount(counts);
|
|
hist_ = new int[length_];
|
|
for (int i = 0; i < length_; ++i) {
|
|
l_int32 val = 0;
|
|
numaGetIValue(counts, i, &val);
|
|
hist_[i] = val;
|
|
}
|
|
numaDestroy(&counts);
|
|
}
|
|
|
|
} // namespace tesseract.
|