Moved page layout params from globals into the Tesseract class, improved single-column layout analysis

This commit is contained in:
Ray Smith 2014-10-07 09:31:00 -07:00
parent a441993100
commit 55d11ad3c2
12 changed files with 150 additions and 62 deletions

View File

@ -340,6 +340,7 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
finder = new ColumnFinder(static_cast<int>(to_block->line_size),
blkbox.botleft(), blkbox.topright(),
source_resolution_, textord_use_cjk_fp_model,
textord_tabfind_aligned_gap_fraction,
&v_lines, &h_lines, vertical_x, vertical_y);
finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
@ -354,7 +355,12 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
// We want the text lines horizontal, (vertical text indicates vertical
// textlines) which may conflict (eg vertically written CJK).
int osd_orientation = 0;
bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
bool vertical_text = textord_tabfind_force_vertical_text;
if (!vertical_text && textord_tabfind_vertical_text) {
vertical_text =
finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
to_block, &osd_blobs);
}
if (osd && osd_tess != NULL && osr != NULL) {
GenericVector<int> osd_scripts;
if (osd_tess != this) {

View File

@ -419,6 +419,16 @@ Tesseract::Tesseract()
"for layout analysis.", this->params()),
BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
this->params()),
BOOL_MEMBER(textord_tabfind_vertical_text, true,
"Enable vertical detection", this->params()),
BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
"Force using vertical text page mode", this->params()),
double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page "
"mode", this->params()),
double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.",
this->params()),
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
this->params()),
@ -430,6 +440,9 @@ Tesseract::Tesseract()
// reasonably sure that Tesseract users have updated their data files.
//
// BEGIN DEPRECATED PARAMETERS
BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
"find horizontal lines such as headers in vertical page mode",
this->params()),
INT_MEMBER(tessedit_ok_mode, 5,
"Acceptance decision algorithm", this->params()),
BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"

View File

@ -1000,6 +1000,14 @@ class Tesseract : public Wordrec {
"Only initialize with the config file. Useful if the instance is "
"not going to be used for OCR but say only for layout analysis.");
BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
"Force using vertical text page mode");
double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page "
"mode");
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.");
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
// The following parameters were deprecated and removed from their original
@ -1010,6 +1018,8 @@ class Tesseract : public Wordrec {
// reasonably sure that Tesseract users have updated their data files.
//
// BEGIN DEPRECATED PARAMETERS
BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true,
"find horizontal lines such as headers in vertical page mode");
INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
" dawgs (e.g. for non-space delimited languages)");

View File

@ -84,6 +84,7 @@ ScrollView* ColumnFinder::blocks_win_ = NULL;
ColumnFinder::ColumnFinder(int gridsize,
const ICOORD& bleft, const ICOORD& tright,
int resolution, bool cjk_script,
double aligned_gap_fraction,
TabVector_LIST* vlines, TabVector_LIST* hlines,
int vertical_x, int vertical_y)
: TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y,
@ -91,6 +92,7 @@ ColumnFinder::ColumnFinder(int gridsize,
cjk_script_(cjk_script),
min_gutter_width_(static_cast<int>(kMinGutterWidthGrid * gridsize)),
mean_column_gap_(tright.x() - bleft.x()),
tabfind_aligned_gap_fraction_(aligned_gap_fraction),
reskew_(1.0f, 0.0f), rotation_(1.0f, 0.0f), rerotate_(1.0f, 0.0f),
best_columns_(NULL), stroke_width_(NULL),
part_grid_(gridsize, bleft, tright), nontext_map_(NULL),
@ -184,9 +186,11 @@ void ColumnFinder::SetupAndFilterNoise(Pix* photo_mask_pix,
// is vertical, like say Japanese, or due to text whose writing direction is
// horizontal but whose text appears vertically aligned because the image is
// not the right way up.
bool ColumnFinder::IsVerticallyAlignedText(TO_BLOCK* block,
bool ColumnFinder::IsVerticallyAlignedText(double find_vertical_text_ratio,
TO_BLOCK* block,
BLOBNBOX_CLIST* osd_blobs) {
return stroke_width_->TestVerticalTextDirection(block, osd_blobs);
return stroke_width_->TestVerticalTextDirection(find_vertical_text_ratio,
block, osd_blobs);
}
// Rotates the blobs and the TabVectors so that the gross writing direction
@ -292,7 +296,8 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
pixOr(photo_mask_pix, photo_mask_pix, nontext_map_);
stroke_width_->FindLeaderPartitions(input_block, &part_grid_);
stroke_width_->RemoveLineResidue(&big_parts_);
FindInitialTabVectors(NULL, min_gutter_width_, input_block);
FindInitialTabVectors(NULL, min_gutter_width_, tabfind_aligned_gap_fraction_,
input_block);
SetBlockRuleEdges(input_block);
stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_,
denorm_, cjk_script_, &projection_,
@ -353,7 +358,8 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
// Find the tab stops, estimate skew, and deskew the tabs, blobs and
// part_grid_.
FindTabVectors(&horizontal_lines_, &image_bblobs_, input_block,
min_gutter_width_, &part_grid_, &deskew_, &reskew_);
min_gutter_width_, tabfind_aligned_gap_fraction_,
&part_grid_, &deskew_, &reskew_);
// Add the deskew to the denorm_.
DENORM* new_denorm = new DENORM;
new_denorm->SetupNormalization(NULL, &deskew_, denorm_,
@ -596,11 +602,11 @@ bool ColumnFinder::MakeColumns(bool single_column) {
bool has_columns = !column_sets_.empty();
if (has_columns) {
// Divide the page into sections of uniform column layout.
AssignColumns(part_sets);
bool any_multi_column = AssignColumns(part_sets);
if (textord_tabfind_show_columns) {
DisplayColumnBounds(&part_sets);
}
ComputeMeanColumnGap();
ComputeMeanColumnGap(any_multi_column);
}
for (int i = 0; i < part_sets.size(); ++i) {
ColPartitionSet* line_set = part_sets.get(i);
@ -663,7 +669,8 @@ void ColumnFinder::PrintColumnCandidates(const char* title) {
// tweak of extending the modal region over small breaks in compatibility.
// Where modal regions overlap, the boundary is chosen so as to minimize
// the cost in terms of ColPartitions not fitting an approved column.
void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
// Returns true if any part of the page is multi-column.
bool ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
int set_count = part_sets.size();
ASSERT_HOST(set_count == gridheight());
// Allocate and init the best_columns_.
@ -708,6 +715,7 @@ void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
}
}
}
bool any_multi_column = false;
// Assign a column set to each vertical grid position.
// While there is an unassigned range, find its mode.
int start, end;
@ -745,6 +753,8 @@ void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
// Assign the column to the range, which now may overlap with other ranges.
AssignColumnToRange(column_set_id, start, end, column_set_costs,
assigned_costs);
if (column_sets_.get(column_set_id)->GoodColumnCount() > 1)
any_multi_column = true;
}
// If anything remains unassigned, the whole lot is unassigned, so
// arbitrarily assign id 0.
@ -758,6 +768,7 @@ void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
delete [] assigned_costs;
delete [] any_columns_possible;
delete [] column_set_costs;
return any_multi_column;
}
// Finds the biggest range in part_sets_ that has no assigned column, but
@ -915,7 +926,7 @@ void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
}
// Computes the mean_column_gap_.
void ColumnFinder::ComputeMeanColumnGap() {
void ColumnFinder::ComputeMeanColumnGap(bool any_multi_column) {
int total_gap = 0;
int total_width = 0;
int gap_samples = 0;
@ -927,8 +938,8 @@ void ColumnFinder::ComputeMeanColumnGap() {
&total_gap,
&gap_samples);
}
mean_column_gap_ = gap_samples > 0 ? total_gap / gap_samples
: total_width / width_samples;
mean_column_gap_ = any_multi_column && gap_samples > 0
? total_gap / gap_samples : total_width / width_samples;
}
//////// Functions that manipulate ColPartitions in the part_grid_ /////

View File

@ -61,8 +61,9 @@ class ColumnFinder : public TabFind {
// layout analysis to assist in detecting horizontal vs vertically written
// textlines.
ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright,
int resolution, bool cjk_script, TabVector_LIST* vlines,
TabVector_LIST* hlines, int vertical_x, int vertical_y);
int resolution, bool cjk_script, double aligned_gap_fraction,
TabVector_LIST* vlines, TabVector_LIST* hlines,
int vertical_x, int vertical_y);
virtual ~ColumnFinder();
// Accessors for testing
@ -118,7 +119,9 @@ class ColumnFinder : public TabFind {
// is vertical, like say Japanese, or due to text whose writing direction is
// horizontal but whose text appears vertically aligned because the image is
// not the right way up.
bool IsVerticallyAlignedText(TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
// find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
bool IsVerticallyAlignedText(double find_vertical_text_ratio,
TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
// Rotates the blobs and the TabVectors so that the gross writing direction
// (text lines) are horizontal and lines are read down the page.
@ -188,7 +191,8 @@ class ColumnFinder : public TabFind {
void PrintColumnCandidates(const char* title);
// Finds the optimal set of columns that cover the entire image with as
// few changes in column partition as possible.
void AssignColumns(const PartSetVector& part_sets);
// Returns true if any part of the page is multi-column.
bool AssignColumns(const PartSetVector& part_sets);
// Finds the biggest range in part_sets_ that has no assigned column, but
// column assignment is possible.
bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible,
@ -218,7 +222,7 @@ class ColumnFinder : public TabFind {
int** column_set_costs, int* assigned_costs);
// Computes the mean_column_gap_.
void ComputeMeanColumnGap();
void ComputeMeanColumnGap(bool any_multi_column);
//////// Functions that manipulate ColPartitions in the part_grid_ /////
//////// to split, merge, find margins, and find types. //////////////
@ -299,6 +303,9 @@ class ColumnFinder : public TabFind {
int min_gutter_width_;
// The mean gap between columns over the page.
int mean_column_gap_;
// Config param saved at construction time. Modifies min_gutter_width_ with
// vertical text to prevent detection of vertical text as columns.
double tabfind_aligned_gap_fraction_;
// The rotation vector needed to convert original coords to deskewed.
FCOORD deskew_;
// The rotation vector needed to convert deskewed back to original coords.

View File

@ -1080,7 +1080,7 @@ void ColPartitionGrid::FindFigureCaptions() {
for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();
partner_it.forward()) {
ColPartition* partner = partner_it.data();
if (!partner->IsTextType()) continue;
if (!partner->IsTextType() || partner->type() == PT_TABLE) continue;
const TBOX& partner_box = partner->bounding_box();
if (debug) {
tprintf("Finding figure captions for image part:");

View File

@ -49,6 +49,17 @@ ColPartitionSet::ColPartitionSet(ColPartition* part) {
ColPartitionSet::~ColPartitionSet() {
}
// Returns the number of columns of good width.
int ColPartitionSet::GoodColumnCount() const {
int num_good_cols = 0;
// This is a read-only iteration of the list.
ColPartition_IT it(const_cast<ColPartition_LIST*>(&parts_));
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
if (it.data()->good_width()) ++num_good_cols;
}
return num_good_cols;
}
// Return an element of the parts_ list from its index.
ColPartition* ColPartitionSet::GetColumnByIndex(int index) {
ColPartition_IT it(&parts_);

View File

@ -50,13 +50,16 @@ class ColPartitionSet : public ELIST_LINK {
// Read-only accessor: returns a const reference to the stored bounding_box_.
const TBOX& bounding_box() const {
return bounding_box_;
}
bool Empty() {
bool Empty() const {
return parts_.empty();
}
int ColumnCount() {
int ColumnCount() const {
return parts_.length();
}
// Returns the number of columns of good width.
int GoodColumnCount() const;
// Return an element of the parts_ list from its index.
ColPartition* GetColumnByIndex(int index);

View File

@ -43,18 +43,11 @@ namespace tesseract {
INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
BOOL_VAR(textord_tabfind_vertical_text, true, "Enable vertical detection");
BOOL_VAR(textord_tabfind_force_vertical_text, false,
"Force using vertical text page mode");
BOOL_VAR(textord_tabfind_vertical_horizontal_mix, true,
"find horizontal lines such as headers in vertical page mode");
double_VAR(textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page mode");
/** Allowed proportional change in stroke width to be the same font. */
const double kStrokeWidthFractionTolerance = 0.125;
/**
* Allowed constant change in stroke width to be the same font.
* Allowed constant change in stroke width to be the same font.
* Really 1.5 pixels.
*/
const double kStrokeWidthTolerance = 1.5;
@ -215,11 +208,9 @@ static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
// after rotating everything, otherwise the work done here will be enough.
// If osd_blobs is not null, a list of blobs from the dominant textline
// direction are returned for use in orientation and script detection.
bool StrokeWidth::TestVerticalTextDirection(TO_BLOCK* block,
bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio,
TO_BLOCK* block,
BLOBNBOX_CLIST* osd_blobs) {
if (textord_tabfind_force_vertical_text) return true;
if (!textord_tabfind_vertical_text) return false;
int vertical_boxes = 0;
int horizontal_boxes = 0;
// Count vertical normal and large blobs.
@ -242,7 +233,7 @@ bool StrokeWidth::TestVerticalTextDirection(TO_BLOCK* block,
return false;
}
int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
textord_tabfind_vertical_text_ratio);
find_vertical_text_ratio);
if (vertical_boxes >= min_vert_boxes) {
if (osd_blobs != NULL) {
BLOBNBOX_C_IT osd_it(osd_blobs);

View File

@ -78,7 +78,9 @@ class StrokeWidth : public BlobGrid {
// after rotating everything, otherwise the work done here will be enough.
// If osd_blobs is not null, a list of blobs from the dominant textline
// direction are returned for use in orientation and script detection.
bool TestVerticalTextDirection(TO_BLOCK* block,
// find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
bool TestVerticalTextDirection(double find_vertical_text_ratio,
TO_BLOCK* block,
BLOBNBOX_CLIST* osd_blobs);
// Corrects the data structures for the given rotation.

View File

@ -82,8 +82,6 @@ const double kCosMaxSkewAngle = 0.866025;
BOOL_VAR(textord_tabfind_show_initialtabs, false, "Show tab candidates");
BOOL_VAR(textord_tabfind_show_finaltabs, false, "Show tab vectors");
double_VAR(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.");
TabFind::TabFind(int gridsize, const ICOORD& bleft, const ICOORD& tright,
TabVector_LIST* vlines, int vertical_x, int vertical_y,
@ -420,7 +418,7 @@ bool TabFind::CommonWidth(int width) {
ICOORDELT_IT it(&column_widths_);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
ICOORDELT* w = it.data();
if (NearlyEqual<int>(width, w->x(), 1))
if (w->x() - 1 <= width && width <= w->y() + 1)
return true;
}
return false;
@ -446,10 +444,12 @@ bool TabFind::VeryDifferentSizes(int size1, int size2) {
bool TabFind::FindTabVectors(TabVector_LIST* hlines,
BLOBNBOX_LIST* image_blobs, TO_BLOCK* block,
int min_gutter_width,
double tabfind_aligned_gap_fraction,
ColPartitionGrid* part_grid,
FCOORD* deskew, FCOORD* reskew) {
ScrollView* tab_win = FindInitialTabVectors(image_blobs, min_gutter_width,
block);
tabfind_aligned_gap_fraction,
block);
ComputeColumnWidths(tab_win, part_grid);
TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);
SortVectors();
@ -540,6 +540,7 @@ ScrollView* TabFind::DisplayTabVectors(ScrollView* tab_win) {
// is mostly of vertical alignment.
ScrollView* TabFind::FindInitialTabVectors(BLOBNBOX_LIST* image_blobs,
int min_gutter_width,
double tabfind_aligned_gap_fraction,
TO_BLOCK* block) {
if (textord_tabfind_show_initialtabs) {
ScrollView* line_win = MakeWindow(0, 0, "VerticalLines");
@ -549,7 +550,8 @@ ScrollView* TabFind::FindInitialTabVectors(BLOBNBOX_LIST* image_blobs,
if (image_blobs != NULL)
InsertBlobsToGrid(true, false, image_blobs, this);
InsertBlobsToGrid(true, false, &block->blobs, this);
ScrollView* initial_win = FindTabBoxes(min_gutter_width);
ScrollView* initial_win = FindTabBoxes(min_gutter_width,
tabfind_aligned_gap_fraction);
FindAllTabVectors(min_gutter_width);
TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);
@ -581,7 +583,8 @@ static void DisplayBoxVector(const GenericVector<BLOBNBOX*>& boxes,
// For each box in the grid, decide whether it is a candidate tab-stop,
// and if so add it to the left/right tab boxes.
ScrollView* TabFind::FindTabBoxes(int min_gutter_width) {
ScrollView* TabFind::FindTabBoxes(int min_gutter_width,
double tabfind_aligned_gap_fraction) {
left_tab_boxes_.clear();
right_tab_boxes_.clear();
// For every bbox in the grid, determine whether it uses a tab on an edge.
@ -589,7 +592,7 @@ ScrollView* TabFind::FindTabBoxes(int min_gutter_width) {
gsearch.StartFullSearch();
BLOBNBOX* bbox;
while ((bbox = gsearch.NextFullSearch()) != NULL) {
if (TestBoxForTabs(bbox, min_gutter_width)) {
if (TestBoxForTabs(bbox, min_gutter_width, tabfind_aligned_gap_fraction)) {
// If it is any kind of tab, insert it into the vectors.
if (bbox->left_tab_type() != TT_NONE)
left_tab_boxes_.push_back(bbox);
@ -616,7 +619,8 @@ ScrollView* TabFind::FindTabBoxes(int min_gutter_width) {
return tab_win;
}
bool TabFind::TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width) {
bool TabFind::TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width,
double tabfind_aligned_gap_fraction) {
GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> radsearch(this);
TBOX box = bbox->bounding_box();
// If there are separator lines, get the column edges.
@ -642,7 +646,7 @@ bool TabFind::TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width) {
// increased under the assumption that column partition is always larger
// than line spacing.
int min_spacing =
static_cast<int>(height * textord_tabfind_aligned_gap_fraction);
static_cast<int>(height * tabfind_aligned_gap_fraction);
if (min_gutter_width > min_spacing)
min_spacing = min_gutter_width;
int min_ragged_gutter = kRaggedGutterMultiple * gridsize();
@ -989,9 +993,16 @@ void TabFind::ComputeColumnWidths(ScrollView* tab_win,
col_widths.print();
// Now make a list of column widths.
MakeColumnWidths(col_widths_size, &col_widths);
// Turn the column width into a range.
ApplyPartitionsToColumnWidths(part_grid, NULL);
}
// Find column width and pair-up tab vectors with existing ColPartitions.
// Finds column width and:
// if col_widths is not null (pass1):
// pair-up tab vectors with existing ColPartitions and accumulate widths.
// else (pass2):
// find the largest real partition width for each recorded column width,
// to be used as the minimum acceptable width.
void TabFind::ApplyPartitionsToColumnWidths(ColPartitionGrid* part_grid,
STATS* col_widths) {
// For every ColPartition in the part_grid, add partners to the tabvectors
@ -1015,13 +1026,27 @@ void TabFind::ApplyPartitionsToColumnWidths(ColPartitionGrid* part_grid,
if (right_vector == NULL || right_vector->IsLeftTab())
continue;
AddPartnerVector(left_blob, right_blob, left_vector, right_vector);
int line_left = left_vector->XAtY(left_blob->bounding_box().bottom());
int line_right = right_vector->XAtY(right_blob->bounding_box().bottom());
// Add to STATS of measurements if the width is significant.
int width = line_right - line_left;
if (width >= kMinColumnWidth)
col_widths->add(width / kColumnWidthFactor, 1);
if (col_widths != NULL) {
AddPartnerVector(left_blob, right_blob, left_vector, right_vector);
if (width >= kMinColumnWidth)
col_widths->add(width / kColumnWidthFactor, 1);
} else {
width /= kColumnWidthFactor;
ICOORDELT_IT it(&column_widths_);
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
ICOORDELT* w = it.data();
if (NearlyEqual<int>(width, w->y(), 1)) {
int true_width = part->bounding_box().width() / kColumnWidthFactor;
if (true_width <= w->y() && true_width > w->x())
w->set_x(true_width);
break;
}
}
}
}
}
@ -1052,7 +1077,7 @@ void TabFind::MakeColumnWidths(int col_widths_size, STATS* col_widths) {
}
if (col_count > kMinLinesInColumn &&
col_count > kMinFractionalLinesInColumn * total_col_count) {
ICOORDELT* w = new ICOORDELT(width, col_count);
ICOORDELT* w = new ICOORDELT(0, width);
w_it.add_after_then_move(w);
if (textord_debug_tabfind)
tprintf("Column of width %d has %d = %.2f%% lines\n",

View File

@ -25,15 +25,6 @@
#include "tabvector.h"
#include "linefind.h"
extern BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
"Force using vertical text page mode");
extern BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true,
"find horizontal lines such as headers in vertical page mode");
extern double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page mode");
extern double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.");
class BLOBNBOX;
class BLOBNBOX_LIST;
class TO_BLOCK;
@ -190,10 +181,12 @@ class TabFind : public AlignedBlob {
* Top-level function to find TabVectors in an input page block.
* Returns false if the detected skew angle is impossible.
* Applies the detected skew angle to deskew the tabs, blobs and part_grid.
* tabfind_aligned_gap_fraction should be the value of parameter
* textord_tabfind_aligned_gap_fraction
*/
bool FindTabVectors(TabVector_LIST* hlines,
BLOBNBOX_LIST* image_blobs, TO_BLOCK* block,
int min_gutter_width,
int min_gutter_width, double tabfind_aligned_gap_fraction,
ColPartitionGrid* part_grid,
FCOORD* deskew, FCOORD* reskew);
@ -220,8 +213,12 @@ class TabFind : public AlignedBlob {
// true, this finds vertical textlines in possibly rotated blob space.
// In other words, when the page has mostly vertical lines and is rotated,
// setting this to true will find horizontal lines on the page.
// tabfind_aligned_gap_fraction should be the value of parameter
// textord_tabfind_aligned_gap_fraction
ScrollView* FindInitialTabVectors(BLOBNBOX_LIST* image_blobs,
int min_gutter_width, TO_BLOCK* block);
int min_gutter_width,
double tabfind_aligned_gap_fraction,
TO_BLOCK* block);
// Apply the given rotation to the given list of blobs.
static void RotateBlobList(const FCOORD& rotation, BLOBNBOX_LIST* blobs);
@ -245,11 +242,17 @@ class TabFind : public AlignedBlob {
private:
// For each box in the grid, decide whether it is a candidate tab-stop,
// and if so add it to the left and right tab boxes.
ScrollView* FindTabBoxes(int min_gutter_width);
// tabfind_aligned_gap_fraction should be the value of parameter
// textord_tabfind_aligned_gap_fraction
ScrollView* FindTabBoxes(int min_gutter_width,
double tabfind_aligned_gap_fraction);
// Return true if this box looks like a candidate tab stop, and set
// the appropriate tab type(s) to TT_UNCONFIRMED.
bool TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width);
// tabfind_aligned_gap_fraction should be the value of parameter
// textord_tabfind_aligned_gap_fraction
bool TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width,
double tabfind_aligned_gap_fraction);
// Returns true if there is nothing in the rectangle of width min_gutter to
// the left of bbox.
@ -298,7 +301,12 @@ class TabFind : public AlignedBlob {
void ComputeColumnWidths(ScrollView* tab_win,
ColPartitionGrid* part_grid);
// Find column width and pair-up tab vectors with existing ColPartitions.
// Finds column width and:
// if col_widths is not null (pass1):
// pair-up tab vectors with existing ColPartitions and accumulate widths.
// else (pass2):
// find the largest real partition width for each recorded column width,
// to be used as the minimum acceptable width.
void ApplyPartitionsToColumnWidths(ColPartitionGrid* part_grid,
STATS* col_widths);
@ -363,7 +371,8 @@ class TabFind : public AlignedBlob {
TabVector_LIST vectors_; //< List of rule line and tabstops.
TabVector_IT v_it_; //< Iterator for searching vectors_.
TabVector_LIST dead_vectors_; //< Separators and unpartnered tab vectors.
ICOORDELT_LIST column_widths_; //< List of commonly occurring widths.
// List of commonly occurring width ranges with x=min and y=max.
ICOORDELT_LIST column_widths_; //< List of commonly occurring width ranges.
/** Callback to test an int for being a common width. */
WidthCallback* width_cb_;
// Sets of bounding boxes that are candidate tab stops.