2009-07-11 10:39:56 +08:00
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
// File: colpartitionset.h
|
|
|
|
// Description: Class to hold a list of ColPartitions of the page that
|
|
|
|
// correspond roughly to columns.
|
|
|
|
// Author: Ray Smith
|
|
|
|
//
|
|
|
|
// (C) Copyright 2008, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
//
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
2016-12-04 21:45:26 +08:00
|
|
|
#ifndef TESSERACT_TEXTORD_COLPARTITIONSET_H_
|
|
|
|
#define TESSERACT_TEXTORD_COLPARTITIONSET_H_
|
2009-07-11 10:39:56 +08:00
|
|
|
|
|
|
|
#include "colpartition.h" // For ColPartition_LIST.
|
2019-10-29 01:10:30 +08:00
|
|
|
#include "tesseract/genericvector.h" // For GenericVector.
|
2009-07-11 10:39:56 +08:00
|
|
|
#include "rect.h" // For TBOX.
|
|
|
|
#include "tabvector.h" // For BLOBNBOX_CLIST.
|
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
|
|
|
class WorkingPartSet_LIST;
|
|
|
|
class ColSegment_LIST;
|
|
|
|
class ColPartitionSet;
|
2018-05-20 06:31:03 +08:00
|
|
|
using PartSetVector = GenericVector<ColPartitionSet*>;
|
2009-07-11 10:39:56 +08:00
|
|
|
|
|
|
|
// ColPartitionSet is a class that holds a list of ColPartitions.
|
|
|
|
// Its main use is in holding a candidate partitioning of the width of the
|
|
|
|
// image into columns, where each member ColPartition is a single column.
|
|
|
|
// ColPartitionSets are used in building the column layout of a page.
|
|
|
|
class ColPartitionSet : public ELIST_LINK {
|
|
|
|
public:
|
2018-05-21 07:36:56 +08:00
|
|
|
ColPartitionSet() = default;
|
2009-07-11 10:39:56 +08:00
|
|
|
explicit ColPartitionSet(ColPartition_LIST* partitions);
|
|
|
|
explicit ColPartitionSet(ColPartition* partition);
|
|
|
|
|
2018-05-21 07:36:56 +08:00
|
|
|
~ColPartitionSet() = default;
|
2009-07-11 10:39:56 +08:00
|
|
|
|
|
|
|
// Simple accessors.
|
|
|
|
const TBOX& bounding_box() const {
|
|
|
|
return bounding_box_;
|
|
|
|
}
|
2014-10-08 00:31:00 +08:00
|
|
|
bool Empty() const {
|
2009-07-11 10:39:56 +08:00
|
|
|
return parts_.empty();
|
|
|
|
}
|
2014-10-08 00:31:00 +08:00
|
|
|
int ColumnCount() const {
|
2009-07-11 10:39:56 +08:00
|
|
|
return parts_.length();
|
|
|
|
}
|
|
|
|
|
2014-10-08 00:31:00 +08:00
|
|
|
// Returns the number of columns of good width.
|
|
|
|
int GoodColumnCount() const;
|
|
|
|
|
2009-07-11 10:39:56 +08:00
|
|
|
// Return an element of the parts_ list from its index.
|
|
|
|
ColPartition* GetColumnByIndex(int index);
|
|
|
|
|
2016-12-13 00:23:03 +08:00
|
|
|
// Return the ColPartition that contains the given coords, if any, else nullptr.
|
2009-07-11 10:39:56 +08:00
|
|
|
ColPartition* ColumnContaining(int x, int y);
|
|
|
|
|
|
|
|
// Return the bounding boxes of columns at the given y-range
|
|
|
|
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments);
|
|
|
|
|
2012-02-02 10:53:04 +08:00
|
|
|
// Extract all the parts from the list, relinquishing ownership.
|
|
|
|
void RelinquishParts();
|
2009-07-11 10:39:56 +08:00
|
|
|
|
|
|
|
// Attempt to improve this by adding partitions or expanding partitions.
|
2019-07-04 04:51:10 +08:00
|
|
|
void ImproveColumnCandidate(WidthCallback cb, PartSetVector* src_sets);
|
2009-07-11 10:39:56 +08:00
|
|
|
|
|
|
|
// If this set is good enough to represent a new partitioning into columns,
|
|
|
|
// add it to the vector of sets, otherwise delete it.
|
2019-07-04 04:51:10 +08:00
|
|
|
void AddToColumnSetsIfUnique(PartSetVector* column_sets, WidthCallback cb);
|
2009-07-11 10:39:56 +08:00
|
|
|
|
|
|
|
// Return true if the partitions in other are all compatible with the columns
|
|
|
|
// in this.
|
2019-07-04 04:51:10 +08:00
|
|
|
bool CompatibleColumns(bool debug, ColPartitionSet* other, WidthCallback cb);
|
2009-07-11 10:39:56 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Returns the total width of all blobs in the part_set that do not lie
|
|
|
|
// within an approved column. Used as a cost measure for using this
|
|
|
|
// column set over another that might be compatible.
|
|
|
|
int UnmatchedWidth(ColPartitionSet* part_set);
|
|
|
|
|
2009-07-11 10:39:56 +08:00
|
|
|
// Return true if this ColPartitionSet makes a legal column candidate by
|
|
|
|
// having legal individual partitions and non-overlapping adjacent pairs.
|
|
|
|
bool LegalColumnCandidate();
|
|
|
|
|
|
|
|
// Return a copy of this. If good_only will only copy the Good ColPartitions.
|
|
|
|
ColPartitionSet* Copy(bool good_only);
|
|
|
|
|
|
|
|
// Display the edges of the columns at the given y coords.
|
|
|
|
void DisplayColumnEdges(int y_bottom, int y_top, ScrollView* win);
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Return the ColumnSpanningType that best explains the columns overlapped
|
2009-07-11 10:39:56 +08:00
|
|
|
// by the given coords(left,right,y), with the given margins.
|
|
|
|
// Also return the first and last column index touched by the coords and
|
2010-11-24 02:34:14 +08:00
|
|
|
// the leftmost spanned column.
|
2018-05-28 00:40:13 +08:00
|
|
|
// Column indices are 2n + 1 for real columns (0 based) and even values
|
2009-07-11 10:39:56 +08:00
|
|
|
// represent the gaps in between columns, with 0 being left of the leftmost.
|
2010-11-24 02:34:14 +08:00
|
|
|
// resolution refers to the ppi resolution of the image. It may be 0 if only
|
|
|
|
// the first_col and last_col are required.
|
|
|
|
ColumnSpanningType SpanningType(int resolution,
|
2013-09-23 23:26:50 +08:00
|
|
|
int left, int right, int height, int y,
|
2010-11-24 02:34:14 +08:00
|
|
|
int left_margin, int right_margin,
|
|
|
|
int* first_col, int* last_col,
|
|
|
|
int* first_spanned_col);
|
2009-07-11 10:39:56 +08:00
|
|
|
|
|
|
|
// The column_set has changed. Close down all in-progress WorkingPartSets in
|
|
|
|
// columns that do not match and start new ones for the new columns in this.
|
|
|
|
// As ColPartitions are turned into BLOCKs, the used ones are put in
|
|
|
|
// used_parts, as they still need to be referenced in the grid.
|
|
|
|
void ChangeWorkColumns(const ICOORD& bleft, const ICOORD& tright,
|
|
|
|
int resolution, ColPartition_LIST* used_parts,
|
|
|
|
WorkingPartSet_LIST* working_set);
|
|
|
|
|
|
|
|
// Accumulate the widths and gaps into the given variables.
|
|
|
|
void AccumulateColumnWidthsAndGaps(int* total_width, int* width_samples,
|
|
|
|
int* total_gap, int* gap_samples);
|
|
|
|
|
|
|
|
// Provide debug output for this ColPartitionSet and all the ColPartitions.
|
|
|
|
void Print();
|
|
|
|
|
|
|
|
private:
|
|
|
|
// Add the given partition to the list in the appropriate place.
|
|
|
|
void AddPartition(ColPartition* new_part, ColPartition_IT* it);
|
|
|
|
|
2012-02-02 10:53:04 +08:00
|
|
|
// Compute the coverage and good column count. Coverage is the amount of the
|
|
|
|
// width of the page (in pixels) that is covered by ColPartitions, which are
|
|
|
|
// used to provide candidate column layouts.
|
|
|
|
// Coverage is split into good and bad. Good coverage is provided by
|
|
|
|
// ColPartitions of a frequent width (according to the callback function
|
|
|
|
// provided by TabFinder::WidthCB, which accesses stored statistics on the
|
2019-05-02 02:30:34 +08:00
|
|
|
// widths of ColPartitions) and bad coverage is provided by all other
|
2012-02-02 10:53:04 +08:00
|
|
|
// ColPartitions, even if they have tab vectors at both sides. Thus:
|
|
|
|
// |-----------------------------------------------------------------|
|
|
|
|
// | Double width heading |
|
|
|
|
// |-----------------------------------------------------------------|
|
|
|
|
// |-------------------------------| |-------------------------------|
|
|
|
|
// | Common width ColParition | | Common width ColPartition |
|
|
|
|
// |-------------------------------| |-------------------------------|
|
|
|
|
// the layout with two common-width columns has better coverage than the
|
|
|
|
// double width heading, because the coverage is "good," even though less in
|
|
|
|
// total coverage than the heading, because the heading coverage is "bad."
|
2009-07-11 10:39:56 +08:00
|
|
|
void ComputeCoverage();
|
|
|
|
|
2012-02-02 10:53:04 +08:00
|
|
|
// Adds the coverage, column count and box for a single partition,
|
|
|
|
// without adding it to the list. (Helper factored from ComputeCoverage.)
|
|
|
|
void AddPartitionCoverageAndBox(const ColPartition& part);
|
|
|
|
|
2009-07-11 10:39:56 +08:00
|
|
|
// The partitions in this column candidate.
|
|
|
|
ColPartition_LIST parts_;
|
|
|
|
// The number of partitions that have a frequent column width.
|
|
|
|
int good_column_count_;
|
2012-02-02 10:53:04 +08:00
|
|
|
// Total width of all the good ColPartitions.
|
|
|
|
int good_coverage_;
|
|
|
|
// Total width of all the bad ColPartitions.
|
|
|
|
int bad_coverage_;
|
2009-07-11 10:39:56 +08:00
|
|
|
// Bounding box of all partitions in the set.
|
|
|
|
TBOX bounding_box_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Expands the project's ELISTIZEH macro for ColPartitionSet.
// NOTE(review): presumably this declares the matching embedded-list and
// iterator types for the class - confirm against the macro's definition.
ELISTIZEH(ColPartitionSet)
|
|
|
|
|
|
|
|
} // namespace tesseract.
|
|
|
|
|
2016-12-04 21:45:26 +08:00
|
|
|
#endif // TESSERACT_TEXTORD_COLPARTITIONSET_H_
|