mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
Moved params from global in page layout to tesseractclass, improved single column layout analysis
This commit is contained in:
parent
a441993100
commit
55d11ad3c2
@ -340,6 +340,7 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
|
||||
finder = new ColumnFinder(static_cast<int>(to_block->line_size),
|
||||
blkbox.botleft(), blkbox.topright(),
|
||||
source_resolution_, textord_use_cjk_fp_model,
|
||||
textord_tabfind_aligned_gap_fraction,
|
||||
&v_lines, &h_lines, vertical_x, vertical_y);
|
||||
|
||||
finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
|
||||
@ -354,7 +355,12 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
|
||||
// We want the text lines horizontal, (vertical text indicates vertical
|
||||
// textlines) which may conflict (eg vertically written CJK).
|
||||
int osd_orientation = 0;
|
||||
bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
|
||||
bool vertical_text = textord_tabfind_force_vertical_text;
|
||||
if (!vertical_text && textord_tabfind_vertical_text) {
|
||||
vertical_text =
|
||||
finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
|
||||
to_block, &osd_blobs);
|
||||
}
|
||||
if (osd && osd_tess != NULL && osr != NULL) {
|
||||
GenericVector<int> osd_scripts;
|
||||
if (osd_tess != this) {
|
||||
|
@ -419,6 +419,16 @@ Tesseract::Tesseract()
|
||||
"for layout analysis.", this->params()),
|
||||
BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
|
||||
this->params()),
|
||||
BOOL_MEMBER(textord_tabfind_vertical_text, true,
|
||||
"Enable vertical detection", this->params()),
|
||||
BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
|
||||
"Force using vertical text page mode", this->params()),
|
||||
double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
|
||||
"Fraction of textlines deemed vertical to use vertical page "
|
||||
"mode", this->params()),
|
||||
double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
|
||||
"Fraction of height used as a minimum gap for aligned blobs.",
|
||||
this->params()),
|
||||
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
|
||||
this->params()),
|
||||
|
||||
@ -430,6 +440,9 @@ Tesseract::Tesseract()
|
||||
// reasonably sure that Tesseract users have updated their data files.
|
||||
//
|
||||
// BEGIN DEPRECATED PARAMETERS
|
||||
BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
|
||||
"find horizontal lines such as headers in vertical page mode",
|
||||
this->params()),
|
||||
INT_MEMBER(tessedit_ok_mode, 5,
|
||||
"Acceptance decision algorithm", this->params()),
|
||||
BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
|
||||
|
@ -1000,6 +1000,14 @@ class Tesseract : public Wordrec {
|
||||
"Only initialize with the config file. Useful if the instance is "
|
||||
"not going to be used for OCR but say only for layout analysis.");
|
||||
BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
|
||||
BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
|
||||
BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
|
||||
"Force using vertical text page mode");
|
||||
double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
|
||||
"Fraction of textlines deemed vertical to use vertical page "
|
||||
"mode");
|
||||
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
|
||||
"Fraction of height used as a minimum gap for aligned blobs.");
|
||||
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
|
||||
|
||||
// The following parameters were deprecated and removed from their original
|
||||
@ -1010,6 +1018,8 @@ class Tesseract : public Wordrec {
|
||||
// reasonably sure that Tesseract users have updated their data files.
|
||||
//
|
||||
// BEGIN DEPRECATED PARAMETERS
|
||||
BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true,
|
||||
"find horizontal lines such as headers in vertical page mode");
|
||||
INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
|
||||
BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
|
||||
" dawgs (e.g. for non-space delimited languages)");
|
||||
|
@ -84,6 +84,7 @@ ScrollView* ColumnFinder::blocks_win_ = NULL;
|
||||
ColumnFinder::ColumnFinder(int gridsize,
|
||||
const ICOORD& bleft, const ICOORD& tright,
|
||||
int resolution, bool cjk_script,
|
||||
double aligned_gap_fraction,
|
||||
TabVector_LIST* vlines, TabVector_LIST* hlines,
|
||||
int vertical_x, int vertical_y)
|
||||
: TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y,
|
||||
@ -91,6 +92,7 @@ ColumnFinder::ColumnFinder(int gridsize,
|
||||
cjk_script_(cjk_script),
|
||||
min_gutter_width_(static_cast<int>(kMinGutterWidthGrid * gridsize)),
|
||||
mean_column_gap_(tright.x() - bleft.x()),
|
||||
tabfind_aligned_gap_fraction_(aligned_gap_fraction),
|
||||
reskew_(1.0f, 0.0f), rotation_(1.0f, 0.0f), rerotate_(1.0f, 0.0f),
|
||||
best_columns_(NULL), stroke_width_(NULL),
|
||||
part_grid_(gridsize, bleft, tright), nontext_map_(NULL),
|
||||
@ -184,9 +186,11 @@ void ColumnFinder::SetupAndFilterNoise(Pix* photo_mask_pix,
|
||||
// is vertical, like say Japanese, or due to text whose writing direction is
|
||||
// horizontal but whose text appears vertically aligned because the image is
|
||||
// not the right way up.
|
||||
bool ColumnFinder::IsVerticallyAlignedText(TO_BLOCK* block,
|
||||
bool ColumnFinder::IsVerticallyAlignedText(double find_vertical_text_ratio,
|
||||
TO_BLOCK* block,
|
||||
BLOBNBOX_CLIST* osd_blobs) {
|
||||
return stroke_width_->TestVerticalTextDirection(block, osd_blobs);
|
||||
return stroke_width_->TestVerticalTextDirection(find_vertical_text_ratio,
|
||||
block, osd_blobs);
|
||||
}
|
||||
|
||||
// Rotates the blobs and the TabVectors so that the gross writing direction
|
||||
@ -292,7 +296,8 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
|
||||
pixOr(photo_mask_pix, photo_mask_pix, nontext_map_);
|
||||
stroke_width_->FindLeaderPartitions(input_block, &part_grid_);
|
||||
stroke_width_->RemoveLineResidue(&big_parts_);
|
||||
FindInitialTabVectors(NULL, min_gutter_width_, input_block);
|
||||
FindInitialTabVectors(NULL, min_gutter_width_, tabfind_aligned_gap_fraction_,
|
||||
input_block);
|
||||
SetBlockRuleEdges(input_block);
|
||||
stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_,
|
||||
denorm_, cjk_script_, &projection_,
|
||||
@ -353,7 +358,8 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
|
||||
// Find the tab stops, estimate skew, and deskew the tabs, blobs and
|
||||
// part_grid_.
|
||||
FindTabVectors(&horizontal_lines_, &image_bblobs_, input_block,
|
||||
min_gutter_width_, &part_grid_, &deskew_, &reskew_);
|
||||
min_gutter_width_, tabfind_aligned_gap_fraction_,
|
||||
&part_grid_, &deskew_, &reskew_);
|
||||
// Add the deskew to the denorm_.
|
||||
DENORM* new_denorm = new DENORM;
|
||||
new_denorm->SetupNormalization(NULL, &deskew_, denorm_,
|
||||
@ -596,11 +602,11 @@ bool ColumnFinder::MakeColumns(bool single_column) {
|
||||
bool has_columns = !column_sets_.empty();
|
||||
if (has_columns) {
|
||||
// Divide the page into sections of uniform column layout.
|
||||
AssignColumns(part_sets);
|
||||
bool any_multi_column = AssignColumns(part_sets);
|
||||
if (textord_tabfind_show_columns) {
|
||||
DisplayColumnBounds(&part_sets);
|
||||
}
|
||||
ComputeMeanColumnGap();
|
||||
ComputeMeanColumnGap(any_multi_column);
|
||||
}
|
||||
for (int i = 0; i < part_sets.size(); ++i) {
|
||||
ColPartitionSet* line_set = part_sets.get(i);
|
||||
@ -663,7 +669,8 @@ void ColumnFinder::PrintColumnCandidates(const char* title) {
|
||||
// tweak of extending the modal region over small breaks in compatibility.
|
||||
// Where modal regions overlap, the boundary is chosen so as to minimize
|
||||
// the cost in terms of ColPartitions not fitting an approved column.
|
||||
void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
|
||||
// Returns true if any part of the page is multi-column.
|
||||
bool ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
|
||||
int set_count = part_sets.size();
|
||||
ASSERT_HOST(set_count == gridheight());
|
||||
// Allocate and init the best_columns_.
|
||||
@ -708,6 +715,7 @@ void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
|
||||
}
|
||||
}
|
||||
}
|
||||
bool any_multi_column = false;
|
||||
// Assign a column set to each vertical grid position.
|
||||
// While there is an unassigned range, find its mode.
|
||||
int start, end;
|
||||
@ -745,6 +753,8 @@ void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
|
||||
// Assign the column to the range, which now may overlap with other ranges.
|
||||
AssignColumnToRange(column_set_id, start, end, column_set_costs,
|
||||
assigned_costs);
|
||||
if (column_sets_.get(column_set_id)->GoodColumnCount() > 1)
|
||||
any_multi_column = true;
|
||||
}
|
||||
// If anything remains unassigned, the whole lot is unassigned, so
|
||||
// arbitrarily assign id 0.
|
||||
@ -758,6 +768,7 @@ void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
|
||||
delete [] assigned_costs;
|
||||
delete [] any_columns_possible;
|
||||
delete [] column_set_costs;
|
||||
return any_multi_column;
|
||||
}
|
||||
|
||||
// Finds the biggest range in part_sets_ that has no assigned column, but
|
||||
@ -915,7 +926,7 @@ void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
|
||||
}
|
||||
|
||||
// Computes the mean_column_gap_.
|
||||
void ColumnFinder::ComputeMeanColumnGap() {
|
||||
void ColumnFinder::ComputeMeanColumnGap(bool any_multi_column) {
|
||||
int total_gap = 0;
|
||||
int total_width = 0;
|
||||
int gap_samples = 0;
|
||||
@ -927,8 +938,8 @@ void ColumnFinder::ComputeMeanColumnGap() {
|
||||
&total_gap,
|
||||
&gap_samples);
|
||||
}
|
||||
mean_column_gap_ = gap_samples > 0 ? total_gap / gap_samples
|
||||
: total_width / width_samples;
|
||||
mean_column_gap_ = any_multi_column && gap_samples > 0
|
||||
? total_gap / gap_samples : total_width / width_samples;
|
||||
}
|
||||
|
||||
//////// Functions that manipulate ColPartitions in the part_grid_ /////
|
||||
|
@ -61,8 +61,9 @@ class ColumnFinder : public TabFind {
|
||||
// layout analysis to assist in detecting horizontal vs vertically written
|
||||
// textlines.
|
||||
ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright,
|
||||
int resolution, bool cjk_script, TabVector_LIST* vlines,
|
||||
TabVector_LIST* hlines, int vertical_x, int vertical_y);
|
||||
int resolution, bool cjk_script, double aligned_gap_fraction,
|
||||
TabVector_LIST* vlines, TabVector_LIST* hlines,
|
||||
int vertical_x, int vertical_y);
|
||||
virtual ~ColumnFinder();
|
||||
|
||||
// Accessors for testing
|
||||
@ -118,7 +119,9 @@ class ColumnFinder : public TabFind {
|
||||
// is vertical, like say Japanese, or due to text whose writing direction is
|
||||
// horizontal but whose text appears vertically aligned because the image is
|
||||
// not the right way up.
|
||||
bool IsVerticallyAlignedText(TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
|
||||
// find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
|
||||
bool IsVerticallyAlignedText(double find_vertical_text_ratio,
|
||||
TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
|
||||
|
||||
// Rotates the blobs and the TabVectors so that the gross writing direction
|
||||
// (text lines) are horizontal and lines are read down the page.
|
||||
@ -188,7 +191,8 @@ class ColumnFinder : public TabFind {
|
||||
void PrintColumnCandidates(const char* title);
|
||||
// Finds the optimal set of columns that cover the entire image with as
|
||||
// few changes in column partition as possible.
|
||||
void AssignColumns(const PartSetVector& part_sets);
|
||||
// Returns true if any part of the page is multi-column.
|
||||
bool AssignColumns(const PartSetVector& part_sets);
|
||||
// Finds the biggest range in part_sets_ that has no assigned column, but
|
||||
// column assignment is possible.
|
||||
bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible,
|
||||
@ -218,7 +222,7 @@ class ColumnFinder : public TabFind {
|
||||
int** column_set_costs, int* assigned_costs);
|
||||
|
||||
// Computes the mean_column_gap_.
|
||||
void ComputeMeanColumnGap();
|
||||
void ComputeMeanColumnGap(bool any_multi_column);
|
||||
|
||||
//////// Functions that manipulate ColPartitions in the part_grid_ /////
|
||||
//////// to split, merge, find margins, and find types. //////////////
|
||||
@ -299,6 +303,9 @@ class ColumnFinder : public TabFind {
|
||||
int min_gutter_width_;
|
||||
// The mean gap between columns over the page.
|
||||
int mean_column_gap_;
|
||||
// Config param saved at construction time. Modifies min_gutter_width_ with
|
||||
// vertical text to prevent detection of vertical text as columns.
|
||||
double tabfind_aligned_gap_fraction_;
|
||||
// The rotation vector needed to convert original coords to deskewed.
|
||||
FCOORD deskew_;
|
||||
// The rotation vector needed to convert deskewed back to original coords.
|
||||
|
@ -1080,7 +1080,7 @@ void ColPartitionGrid::FindFigureCaptions() {
|
||||
for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();
|
||||
partner_it.forward()) {
|
||||
ColPartition* partner = partner_it.data();
|
||||
if (!partner->IsTextType()) continue;
|
||||
if (!partner->IsTextType() || partner->type() == PT_TABLE) continue;
|
||||
const TBOX& partner_box = partner->bounding_box();
|
||||
if (debug) {
|
||||
tprintf("Finding figure captions for image part:");
|
||||
|
@ -49,6 +49,17 @@ ColPartitionSet::ColPartitionSet(ColPartition* part) {
|
||||
ColPartitionSet::~ColPartitionSet() {
|
||||
}
|
||||
|
||||
// Returns the number of columns of good width.
|
||||
int ColPartitionSet::GoodColumnCount() const {
|
||||
int num_good_cols = 0;
|
||||
// This is a read-only iteration of the list.
|
||||
ColPartition_IT it(const_cast<ColPartition_LIST*>(&parts_));
|
||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
||||
if (it.data()->good_width()) ++num_good_cols;
|
||||
}
|
||||
return num_good_cols;
|
||||
}
|
||||
|
||||
// Return an element of the parts_ list from its index.
|
||||
ColPartition* ColPartitionSet::GetColumnByIndex(int index) {
|
||||
ColPartition_IT it(&parts_);
|
||||
|
@ -50,13 +50,16 @@ class ColPartitionSet : public ELIST_LINK {
|
||||
const TBOX& bounding_box() const {
|
||||
return bounding_box_;
|
||||
}
|
||||
bool Empty() {
|
||||
bool Empty() const {
|
||||
return parts_.empty();
|
||||
}
|
||||
int ColumnCount() {
|
||||
int ColumnCount() const {
|
||||
return parts_.length();
|
||||
}
|
||||
|
||||
// Returns the number of columns of good width.
|
||||
int GoodColumnCount() const;
|
||||
|
||||
// Return an element of the parts_ list from its index.
|
||||
ColPartition* GetColumnByIndex(int index);
|
||||
|
||||
|
@ -43,18 +43,11 @@ namespace tesseract {
|
||||
|
||||
INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
|
||||
BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
|
||||
BOOL_VAR(textord_tabfind_vertical_text, true, "Enable vertical detection");
|
||||
BOOL_VAR(textord_tabfind_force_vertical_text, false,
|
||||
"Force using vertical text page mode");
|
||||
BOOL_VAR(textord_tabfind_vertical_horizontal_mix, true,
|
||||
"find horizontal lines such as headers in vertical page mode");
|
||||
double_VAR(textord_tabfind_vertical_text_ratio, 0.5,
|
||||
"Fraction of textlines deemed vertical to use vertical page mode");
|
||||
|
||||
/** Allowed proportional change in stroke width to be the same font. */
|
||||
const double kStrokeWidthFractionTolerance = 0.125;
|
||||
/**
|
||||
* Allowed constant change in stroke width to be the same font.
|
||||
* Allowed constant change in stroke width to be the same font.
|
||||
* Really 1.5 pixels.
|
||||
*/
|
||||
const double kStrokeWidthTolerance = 1.5;
|
||||
@ -215,11 +208,9 @@ static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
|
||||
// after rotating everything, otherwise the work done here will be enough.
|
||||
// If osd_blobs is not null, a list of blobs from the dominant textline
|
||||
// direction are returned for use in orientation and script detection.
|
||||
bool StrokeWidth::TestVerticalTextDirection(TO_BLOCK* block,
|
||||
bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio,
|
||||
TO_BLOCK* block,
|
||||
BLOBNBOX_CLIST* osd_blobs) {
|
||||
if (textord_tabfind_force_vertical_text) return true;
|
||||
if (!textord_tabfind_vertical_text) return false;
|
||||
|
||||
int vertical_boxes = 0;
|
||||
int horizontal_boxes = 0;
|
||||
// Count vertical normal and large blobs.
|
||||
@ -242,7 +233,7 @@ bool StrokeWidth::TestVerticalTextDirection(TO_BLOCK* block,
|
||||
return false;
|
||||
}
|
||||
int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
|
||||
textord_tabfind_vertical_text_ratio);
|
||||
find_vertical_text_ratio);
|
||||
if (vertical_boxes >= min_vert_boxes) {
|
||||
if (osd_blobs != NULL) {
|
||||
BLOBNBOX_C_IT osd_it(osd_blobs);
|
||||
|
@ -78,7 +78,9 @@ class StrokeWidth : public BlobGrid {
|
||||
// after rotating everything, otherwise the work done here will be enough.
|
||||
// If osd_blobs is not null, a list of blobs from the dominant textline
|
||||
// direction are returned for use in orientation and script detection.
|
||||
bool TestVerticalTextDirection(TO_BLOCK* block,
|
||||
// find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
|
||||
bool TestVerticalTextDirection(double find_vertical_text_ratio,
|
||||
TO_BLOCK* block,
|
||||
BLOBNBOX_CLIST* osd_blobs);
|
||||
|
||||
// Corrects the data structures for the given rotation.
|
||||
|
@ -82,8 +82,6 @@ const double kCosMaxSkewAngle = 0.866025;
|
||||
|
||||
BOOL_VAR(textord_tabfind_show_initialtabs, false, "Show tab candidates");
|
||||
BOOL_VAR(textord_tabfind_show_finaltabs, false, "Show tab vectors");
|
||||
double_VAR(textord_tabfind_aligned_gap_fraction, 0.75,
|
||||
"Fraction of height used as a minimum gap for aligned blobs.");
|
||||
|
||||
TabFind::TabFind(int gridsize, const ICOORD& bleft, const ICOORD& tright,
|
||||
TabVector_LIST* vlines, int vertical_x, int vertical_y,
|
||||
@ -420,7 +418,7 @@ bool TabFind::CommonWidth(int width) {
|
||||
ICOORDELT_IT it(&column_widths_);
|
||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
||||
ICOORDELT* w = it.data();
|
||||
if (NearlyEqual<int>(width, w->x(), 1))
|
||||
if (w->x() - 1 <= width && width <= w->y() + 1)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@ -446,10 +444,12 @@ bool TabFind::VeryDifferentSizes(int size1, int size2) {
|
||||
bool TabFind::FindTabVectors(TabVector_LIST* hlines,
|
||||
BLOBNBOX_LIST* image_blobs, TO_BLOCK* block,
|
||||
int min_gutter_width,
|
||||
double tabfind_aligned_gap_fraction,
|
||||
ColPartitionGrid* part_grid,
|
||||
FCOORD* deskew, FCOORD* reskew) {
|
||||
ScrollView* tab_win = FindInitialTabVectors(image_blobs, min_gutter_width,
|
||||
block);
|
||||
tabfind_aligned_gap_fraction,
|
||||
block);
|
||||
ComputeColumnWidths(tab_win, part_grid);
|
||||
TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);
|
||||
SortVectors();
|
||||
@ -540,6 +540,7 @@ ScrollView* TabFind::DisplayTabVectors(ScrollView* tab_win) {
|
||||
// is mostly of vertical alignment.
|
||||
ScrollView* TabFind::FindInitialTabVectors(BLOBNBOX_LIST* image_blobs,
|
||||
int min_gutter_width,
|
||||
double tabfind_aligned_gap_fraction,
|
||||
TO_BLOCK* block) {
|
||||
if (textord_tabfind_show_initialtabs) {
|
||||
ScrollView* line_win = MakeWindow(0, 0, "VerticalLines");
|
||||
@ -549,7 +550,8 @@ ScrollView* TabFind::FindInitialTabVectors(BLOBNBOX_LIST* image_blobs,
|
||||
if (image_blobs != NULL)
|
||||
InsertBlobsToGrid(true, false, image_blobs, this);
|
||||
InsertBlobsToGrid(true, false, &block->blobs, this);
|
||||
ScrollView* initial_win = FindTabBoxes(min_gutter_width);
|
||||
ScrollView* initial_win = FindTabBoxes(min_gutter_width,
|
||||
tabfind_aligned_gap_fraction);
|
||||
FindAllTabVectors(min_gutter_width);
|
||||
|
||||
TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);
|
||||
@ -581,7 +583,8 @@ static void DisplayBoxVector(const GenericVector<BLOBNBOX*>& boxes,
|
||||
|
||||
// For each box in the grid, decide whether it is a candidate tab-stop,
|
||||
// and if so add it to the left/right tab boxes.
|
||||
ScrollView* TabFind::FindTabBoxes(int min_gutter_width) {
|
||||
ScrollView* TabFind::FindTabBoxes(int min_gutter_width,
|
||||
double tabfind_aligned_gap_fraction) {
|
||||
left_tab_boxes_.clear();
|
||||
right_tab_boxes_.clear();
|
||||
// For every bbox in the grid, determine whether it uses a tab on an edge.
|
||||
@ -589,7 +592,7 @@ ScrollView* TabFind::FindTabBoxes(int min_gutter_width) {
|
||||
gsearch.StartFullSearch();
|
||||
BLOBNBOX* bbox;
|
||||
while ((bbox = gsearch.NextFullSearch()) != NULL) {
|
||||
if (TestBoxForTabs(bbox, min_gutter_width)) {
|
||||
if (TestBoxForTabs(bbox, min_gutter_width, tabfind_aligned_gap_fraction)) {
|
||||
// If it is any kind of tab, insert it into the vectors.
|
||||
if (bbox->left_tab_type() != TT_NONE)
|
||||
left_tab_boxes_.push_back(bbox);
|
||||
@ -616,7 +619,8 @@ ScrollView* TabFind::FindTabBoxes(int min_gutter_width) {
|
||||
return tab_win;
|
||||
}
|
||||
|
||||
bool TabFind::TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width) {
|
||||
bool TabFind::TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width,
|
||||
double tabfind_aligned_gap_fraction) {
|
||||
GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> radsearch(this);
|
||||
TBOX box = bbox->bounding_box();
|
||||
// If there are separator lines, get the column edges.
|
||||
@ -642,7 +646,7 @@ bool TabFind::TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width) {
|
||||
// increased under the assumption that column partition is always larger
|
||||
// than line spacing.
|
||||
int min_spacing =
|
||||
static_cast<int>(height * textord_tabfind_aligned_gap_fraction);
|
||||
static_cast<int>(height * tabfind_aligned_gap_fraction);
|
||||
if (min_gutter_width > min_spacing)
|
||||
min_spacing = min_gutter_width;
|
||||
int min_ragged_gutter = kRaggedGutterMultiple * gridsize();
|
||||
@ -989,9 +993,16 @@ void TabFind::ComputeColumnWidths(ScrollView* tab_win,
|
||||
col_widths.print();
|
||||
// Now make a list of column widths.
|
||||
MakeColumnWidths(col_widths_size, &col_widths);
|
||||
// Turn the column width into a range.
|
||||
ApplyPartitionsToColumnWidths(part_grid, NULL);
|
||||
}
|
||||
|
||||
// Find column width and pair-up tab vectors with existing ColPartitions.
|
||||
// Finds column width and:
|
||||
// if col_widths is not null (pass1):
|
||||
// pair-up tab vectors with existing ColPartitions and accumulate widths.
|
||||
// else (pass2):
|
||||
// find the largest real partition width for each recorded column width,
|
||||
// to be used as the minimum acceptable width.
|
||||
void TabFind::ApplyPartitionsToColumnWidths(ColPartitionGrid* part_grid,
|
||||
STATS* col_widths) {
|
||||
// For every ColPartition in the part_grid, add partners to the tabvectors
|
||||
@ -1015,13 +1026,27 @@ void TabFind::ApplyPartitionsToColumnWidths(ColPartitionGrid* part_grid,
|
||||
if (right_vector == NULL || right_vector->IsLeftTab())
|
||||
continue;
|
||||
|
||||
AddPartnerVector(left_blob, right_blob, left_vector, right_vector);
|
||||
int line_left = left_vector->XAtY(left_blob->bounding_box().bottom());
|
||||
int line_right = right_vector->XAtY(right_blob->bounding_box().bottom());
|
||||
// Add to STATS of measurements if the width is significant.
|
||||
int width = line_right - line_left;
|
||||
if (width >= kMinColumnWidth)
|
||||
col_widths->add(width / kColumnWidthFactor, 1);
|
||||
if (col_widths != NULL) {
|
||||
AddPartnerVector(left_blob, right_blob, left_vector, right_vector);
|
||||
if (width >= kMinColumnWidth)
|
||||
col_widths->add(width / kColumnWidthFactor, 1);
|
||||
} else {
|
||||
width /= kColumnWidthFactor;
|
||||
ICOORDELT_IT it(&column_widths_);
|
||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
||||
ICOORDELT* w = it.data();
|
||||
if (NearlyEqual<int>(width, w->y(), 1)) {
|
||||
int true_width = part->bounding_box().width() / kColumnWidthFactor;
|
||||
if (true_width <= w->y() && true_width > w->x())
|
||||
w->set_x(true_width);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1052,7 +1077,7 @@ void TabFind::MakeColumnWidths(int col_widths_size, STATS* col_widths) {
|
||||
}
|
||||
if (col_count > kMinLinesInColumn &&
|
||||
col_count > kMinFractionalLinesInColumn * total_col_count) {
|
||||
ICOORDELT* w = new ICOORDELT(width, col_count);
|
||||
ICOORDELT* w = new ICOORDELT(0, width);
|
||||
w_it.add_after_then_move(w);
|
||||
if (textord_debug_tabfind)
|
||||
tprintf("Column of width %d has %d = %.2f%% lines\n",
|
||||
|
@ -25,15 +25,6 @@
|
||||
#include "tabvector.h"
|
||||
#include "linefind.h"
|
||||
|
||||
extern BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
|
||||
"Force using vertical text page mode");
|
||||
extern BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true,
|
||||
"find horizontal lines such as headers in vertical page mode");
|
||||
extern double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
|
||||
"Fraction of textlines deemed vertical to use vertical page mode");
|
||||
extern double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
|
||||
"Fraction of height used as a minimum gap for aligned blobs.");
|
||||
|
||||
class BLOBNBOX;
|
||||
class BLOBNBOX_LIST;
|
||||
class TO_BLOCK;
|
||||
@ -190,10 +181,12 @@ class TabFind : public AlignedBlob {
|
||||
* Top-level function to find TabVectors in an input page block.
|
||||
* Returns false if the detected skew angle is impossible.
|
||||
* Applies the detected skew angle to deskew the tabs, blobs and part_grid.
|
||||
* tabfind_aligned_gap_fraction should be the value of parameter
|
||||
* textord_tabfind_aligned_gap_fraction
|
||||
*/
|
||||
bool FindTabVectors(TabVector_LIST* hlines,
|
||||
BLOBNBOX_LIST* image_blobs, TO_BLOCK* block,
|
||||
int min_gutter_width,
|
||||
int min_gutter_width, double tabfind_aligned_gap_fraction,
|
||||
ColPartitionGrid* part_grid,
|
||||
FCOORD* deskew, FCOORD* reskew);
|
||||
|
||||
@ -220,8 +213,12 @@ class TabFind : public AlignedBlob {
|
||||
// true, this finds vertical textlines in possibly rotated blob space.
|
||||
// In other words, when the page has mostly vertical lines and is rotated,
|
||||
// setting this to true will find horizontal lines on the page.
|
||||
// tabfind_aligned_gap_fraction should be the value of parameter
|
||||
// textord_tabfind_aligned_gap_fraction
|
||||
ScrollView* FindInitialTabVectors(BLOBNBOX_LIST* image_blobs,
|
||||
int min_gutter_width, TO_BLOCK* block);
|
||||
int min_gutter_width,
|
||||
double tabfind_aligned_gap_fraction,
|
||||
TO_BLOCK* block);
|
||||
|
||||
// Apply the given rotation to the given list of blobs.
|
||||
static void RotateBlobList(const FCOORD& rotation, BLOBNBOX_LIST* blobs);
|
||||
@ -245,11 +242,17 @@ class TabFind : public AlignedBlob {
|
||||
private:
|
||||
// For each box in the grid, decide whether it is a candidate tab-stop,
|
||||
// and if so add it to the left and right tab boxes.
|
||||
ScrollView* FindTabBoxes(int min_gutter_width);
|
||||
// tabfind_aligned_gap_fraction should be the value of parameter
|
||||
// textord_tabfind_aligned_gap_fraction
|
||||
ScrollView* FindTabBoxes(int min_gutter_width,
|
||||
double tabfind_aligned_gap_fraction);
|
||||
|
||||
// Return true if this box looks like a candidate tab stop, and set
|
||||
// the appropriate tab type(s) to TT_UNCONFIRMED.
|
||||
bool TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width);
|
||||
// tabfind_aligned_gap_fraction should be the value of parameter
|
||||
// textord_tabfind_aligned_gap_fraction
|
||||
bool TestBoxForTabs(BLOBNBOX* bbox, int min_gutter_width,
|
||||
double tabfind_aligned_gap_fraction);
|
||||
|
||||
// Returns true if there is nothing in the rectangle of width min_gutter to
|
||||
// the left of bbox.
|
||||
@ -298,7 +301,12 @@ class TabFind : public AlignedBlob {
|
||||
void ComputeColumnWidths(ScrollView* tab_win,
|
||||
ColPartitionGrid* part_grid);
|
||||
|
||||
// Find column width and pair-up tab vectors with existing ColPartitions.
|
||||
// Finds column width and:
|
||||
// if col_widths is not null (pass1):
|
||||
// pair-up tab vectors with existing ColPartitions and accumulate widths.
|
||||
// else (pass2):
|
||||
// find the largest real partition width for each recorded column width,
|
||||
// to be used as the minimum acceptable width.
|
||||
void ApplyPartitionsToColumnWidths(ColPartitionGrid* part_grid,
|
||||
STATS* col_widths);
|
||||
|
||||
@ -363,7 +371,8 @@ class TabFind : public AlignedBlob {
|
||||
TabVector_LIST vectors_; //< List of rule line and tabstops.
|
||||
TabVector_IT v_it_; //< Iterator for searching vectors_.
|
||||
TabVector_LIST dead_vectors_; //< Separators and unpartnered tab vectors.
|
||||
ICOORDELT_LIST column_widths_; //< List of commonly occurring widths.
|
||||
// List of commonly occurring width ranges with x=min and y=max.
|
||||
ICOORDELT_LIST column_widths_; //< List of commonly occurring width ranges.
|
||||
/** Callback to test an int for being a common width. */
|
||||
WidthCallback* width_cb_;
|
||||
// Sets of bounding boxes that are candidate tab stops.
|
||||
|
Loading…
Reference in New Issue
Block a user