// Mirror of https://github.com/tesseract-ocr/tesseract.git
// (synced 2025-01-18 14:41:36 +08:00; 369 lines, 18 KiB, C++).
///////////////////////////////////////////////////////////////////////
// File: colfind.h
// Description: Class to find columns in the grid of BLOBNBOXes.
// Author: Ray Smith
// Created: Thu Feb 21 14:04:01 PST 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef TESSERACT_TEXTORD_COLFIND_H__
|
|
#define TESSERACT_TEXTORD_COLFIND_H__
|
|
|
|
#include "tabfind.h"
|
|
#include "imagefind.h"
|
|
#include "colpartitiongrid.h"
|
|
#include "colpartitionset.h"
|
|
#include "ocrblock.h"
|
|
#include "textlineprojection.h"
|
|
|
|
// Forward declarations of global-namespace types.
class BLOCK_LIST;
struct Boxa;
struct Pixa;
class DENORM;
class ScrollView;
class STATS;
class TO_BLOCK;
|
|
|
|
namespace tesseract {
|
|
|
|
// Declaration of the "run table detection" flag (declared default: false).
extern BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection");

// Forward declarations of tesseract-namespace types.
class ColPartitionSet;
class ColPartitionSet_LIST;
class ColSegment_LIST;
class ColumnGroup_LIST;
class LineSpacing;
class StrokeWidth;
class TempColumn_LIST;
class EquationDetectBase;
|
|
|
|
// The ColumnFinder class finds columns in the grid.
|
|
class ColumnFinder : public TabFind {
|
|
public:
|
|
// Gridsize is an estimate of the text size in the image. A suitable value
|
|
// is in TO_BLOCK::line_size after find_components has been used to make
|
|
// the blobs.
|
|
// bleft and tright are the bounds of the image (rectangle) being processed.
|
|
// vlines is a (possibly empty) list of TabVector and vertical_x and y are
|
|
// the sum logical vertical vector produced by LineFinder::FindVerticalLines.
|
|
// If cjk_script is true, then broken CJK characters are fixed during
|
|
// layout analysis to assist in detecting horizontal vs vertically written
|
|
// textlines.
|
|
ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright,
|
|
int resolution, bool cjk_script, double aligned_gap_fraction,
|
|
TabVector_LIST* vlines, TabVector_LIST* hlines,
|
|
int vertical_x, int vertical_y);
|
|
virtual ~ColumnFinder();
|
|
|
|
// Accessors for testing
|
|
const DENORM* denorm() const {
|
|
return denorm_;
|
|
}
|
|
const TextlineProjection* projection() const {
|
|
return &projection_;
|
|
}
|
|
void set_cjk_script(bool is_cjk) {
|
|
cjk_script_ = is_cjk;
|
|
}
|
|
|
|
// ======================================================================
|
|
// The main function of ColumnFinder is broken into pieces to facilitate
|
|
// optional insertion of orientation and script detection in an efficient
|
|
// way. The calling sequence IS MANDATORY however, whether or not
|
|
// OSD is being used:
|
|
// 1. Construction.
|
|
// 2. SetupAndFilterNoise.
|
|
// 3. IsVerticallyAlignedText.
|
|
// 4. CorrectOrientation.
|
|
// 5. FindBlocks.
|
|
// 6. Destruction. Use of a single column finder for multiple images does not
|
|
// make sense.
|
|
// Throughout these steps, the ColPartitions are owned by part_grid_, which
|
|
// means that that it must be kept correct. Exception: big_parts_ owns its
|
|
// own ColPartitions.
|
|
// The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except
|
|
// for a phase in FindBlocks before TransformToBlocks, when they become
|
|
// owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX
|
|
// indicates more of a betrothal for the majority of layout analysis, ie
|
|
// which ColPartition will take ownership when the blobs are release from
|
|
// the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that
|
|
// are part of the image regions, as they are not on any TO_BLOCK list.
|
|
// TODO(rays) break up column finder further into smaller classes, as
|
|
// there is a lot more to it than column finding now.
|
|
// ======================================================================
|
|
|
|
// Performs initial processing on the blobs in the input_block:
|
|
// Setup the part_grid, stroke_width_, nontext_map_.
|
|
// Obvious noise blobs are filtered out and used to mark the nontext_map_.
|
|
// Initial stroke-width analysis is used to get local text alignment
|
|
// direction, so the textline projection_ map can be setup.
|
|
// On return, IsVerticallyAlignedText may be called (now optionally) to
|
|
// determine the gross textline alignment of the page.
|
|
void SetupAndFilterNoise(PageSegMode pageseg_mode, Pix* photo_mask_pix,
|
|
TO_BLOCK* input_block);
|
|
|
|
// Tests for vertical alignment of text (returning true if so), and generates
|
|
// a list of blobs (in osd_blobs) for orientation and script detection.
|
|
// block is the single block for the whole page or rectangle to be OCRed.
|
|
// Note that the vertical alignment may be due to text whose writing direction
|
|
// is vertical, like say Japanese, or due to text whose writing direction is
|
|
// horizontal but whose text appears vertically aligned because the image is
|
|
// not the right way up.
|
|
// find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
|
|
bool IsVerticallyAlignedText(double find_vertical_text_ratio,
|
|
TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs);
|
|
|
|
// Rotates the blobs and the TabVectors so that the gross writing direction
|
|
// (text lines) are horizontal and lines are read down the page.
|
|
// Applied rotation stored in rotation_.
|
|
// A second rotation is calculated for application during recognition to
|
|
// make the rotated blobs upright for recognition.
|
|
// Subsequent rotation stored in text_rotation_.
|
|
//
|
|
// Arguments:
|
|
// vertical_text_lines is true if the text lines are vertical.
|
|
// recognition_rotation [0..3] is the number of anti-clockwise 90 degree
|
|
// rotations from osd required for the text to be upright and readable.
|
|
void CorrectOrientation(TO_BLOCK* block, bool vertical_text_lines,
|
|
int recognition_rotation);
|
|
|
|
// Finds blocks of text, image, rule line, table etc, returning them in the
|
|
// blocks and to_blocks
|
|
// (Each TO_BLOCK points to the basic BLOCK and adds more information.)
|
|
// Image blocks are generated by a combination of photo_mask_pix (which may
|
|
// NOT be NULL) and the rejected text found during preliminary textline
|
|
// finding.
|
|
// The input_block is the result of a call to find_components, and contains
|
|
// the blobs found in the image or rectangle to be OCRed. These blobs will be
|
|
// removed and placed in the output blocks, while unused ones will be deleted.
|
|
// If single_column is true, the input is treated as single column, but
|
|
// it is still divided into blocks of equal line spacing/text size.
|
|
// scaled_color is scaled down by scaled_factor from the input color image,
|
|
// and may be NULL if the input was not color.
|
|
// grey_pix is optional, but if present must match the photo_mask_pix in size,
|
|
// and must be a *real* grey image instead of binary_pix * 255.
|
|
// thresholds_pix is expected to be present iff grey_pix is present and
|
|
// can be an integer factor reduction of the grey_pix. It represents the
|
|
// thresholds that were used to create the binary_pix from the grey_pix.
|
|
// Small blobs that confuse the segmentation into lines are placed into
|
|
// diacritic_blobs, with the intention that they be put into the most
|
|
// appropriate word after the rest of layout analysis.
|
|
// Returns -1 if the user hits the 'd' key in the blocks window while running
|
|
// in debug mode, which requests a retry with more debug info.
|
|
int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor,
|
|
TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix,
|
|
Pix* grey_pix, BLOCK_LIST* blocks,
|
|
BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks);
|
|
|
|
// Get the rotation required to deskew, and its inverse rotation.
|
|
void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew);
|
|
|
|
// Set the equation detection pointer.
|
|
void SetEquationDetect(EquationDetectBase* detect);
|
|
|
|
private:
|
|
// Displays the blob and block bounding boxes in a window called Blocks.
|
|
void DisplayBlocks(BLOCK_LIST* blocks);
|
|
// Displays the column edges at each grid y coordinate defined by
|
|
// best_columns_.
|
|
void DisplayColumnBounds(PartSetVector* sets);
|
|
|
|
////// Functions involved in determining the columns used on the page. /////
|
|
|
|
// Sets up column_sets_ (the determined column layout at each horizontal
|
|
// slice). Returns false if the page is empty.
|
|
bool MakeColumns(bool single_column);
|
|
// Attempt to improve the column_candidates by expanding the columns
|
|
// and adding new partitions from the partition sets in src_sets.
|
|
// Src_sets may be equal to column_candidates, in which case it will
|
|
// use them as a source to improve themselves.
|
|
void ImproveColumnCandidates(PartSetVector* src_sets,
|
|
PartSetVector* column_sets);
|
|
// Prints debug information on the column candidates.
|
|
void PrintColumnCandidates(const char* title);
|
|
// Finds the optimal set of columns that cover the entire image with as
|
|
// few changes in column partition as possible.
|
|
// Returns true if any part of the page is multi-column.
|
|
bool AssignColumns(const PartSetVector& part_sets);
|
|
// Finds the biggest range in part_sets_ that has no assigned column, but
|
|
// column assignment is possible.
|
|
bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible,
|
|
int* start, int* end);
|
|
// Finds the modal compatible column_set_ index within the given range.
|
|
int RangeModalColumnSet(int** column_set_costs, const int* assigned_costs,
|
|
int start, int end);
|
|
// Given that there are many column_set_id compatible columns in the range,
|
|
// shrinks the range to the longest contiguous run of compatibility, allowing
|
|
// gaps where no columns are possible, but not where competing columns are
|
|
// possible.
|
|
void ShrinkRangeToLongestRun(int** column_set_costs,
|
|
const int* assigned_costs,
|
|
const bool* any_columns_possible,
|
|
int column_set_id,
|
|
int* best_start, int* best_end);
|
|
// Moves start in the direction of step, upto, but not including end while
|
|
// the only incompatible regions are no more than kMaxIncompatibleColumnCount
|
|
// in size, and the compatible regions beyond are bigger.
|
|
void ExtendRangePastSmallGaps(int** column_set_costs,
|
|
const int* assigned_costs,
|
|
const bool* any_columns_possible,
|
|
int column_set_id,
|
|
int step, int end, int* start);
|
|
// Assigns the given column_set_id to the part_sets_ in the given range.
|
|
void AssignColumnToRange(int column_set_id, int start, int end,
|
|
int** column_set_costs, int* assigned_costs);
|
|
|
|
// Computes the mean_column_gap_.
|
|
void ComputeMeanColumnGap(bool any_multi_column);
|
|
|
|
//////// Functions that manipulate ColPartitions in the part_grid_ /////
|
|
//////// to split, merge, find margins, and find types. //////////////
|
|
|
|
// Hoovers up all un-owned blobs and deletes them.
|
|
// The rest get released from the block so the ColPartitions can pass
|
|
// ownership to the output blocks.
|
|
void ReleaseBlobsAndCleanupUnused(TO_BLOCK* block);
|
|
// Splits partitions that cross columns where they have nothing in the gap.
|
|
void GridSplitPartitions();
|
|
// Merges partitions where there is vertical overlap, within a single column,
|
|
// and the horizontal gap is small enough.
|
|
void GridMergePartitions();
|
|
// Inserts remaining noise blobs into the most applicable partition if any.
|
|
// If there is no applicable partition, then the blobs are deleted.
|
|
void InsertRemainingNoise(TO_BLOCK* block);
|
|
// Remove partitions that come from horizontal lines that look like
|
|
// underlines, but are not part of a table.
|
|
void GridRemoveUnderlinePartitions();
|
|
// Add horizontal line separators as partitions.
|
|
void GridInsertHLinePartitions();
|
|
// Add vertical line separators as partitions.
|
|
void GridInsertVLinePartitions();
|
|
// For every ColPartition in the grid, sets its type based on position
|
|
// in the columns.
|
|
void SetPartitionTypes();
|
|
// Only images remain with multiple types in a run of partners.
|
|
// Sets the type of all in the group to the maximum of the group.
|
|
void SmoothPartnerRuns();
|
|
|
|
//////// Functions that make the final output blocks ///////
|
|
|
|
// Helper functions for TransformToBlocks.
|
|
// Add the part to the temp list in the correct order.
|
|
void AddToTempPartList(ColPartition* part, ColPartition_CLIST* temp_list);
|
|
// Add everything from the temp list to the work_set assuming correct order.
|
|
void EmptyTempPartList(ColPartition_CLIST* temp_list,
|
|
WorkingPartSet_LIST* work_set);
|
|
|
|
// Transform the grid of partitions to the output blocks.
|
|
void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
|
|
|
|
// Reflect the blob boxes (but not the outlines) in the y-axis so that
|
|
// the blocks get created in the correct RTL order. Rotates the blobs
|
|
// in the input_block and the bblobs list.
|
|
// The reflection is undone in RotateAndReskewBlocks by
|
|
// reflecting the blocks themselves, and then recomputing the blob bounding
|
|
// boxes.
|
|
void ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs);
|
|
|
|
// Undo the deskew that was done in FindTabVectors, as recognition is done
|
|
// without correcting blobs or blob outlines for skew.
|
|
// Reskew the completed blocks to put them back to the original rotated coords
|
|
// that were created by CorrectOrientation.
|
|
// If the input_is_rtl, then reflect the blocks in the y-axis to undo the
|
|
// reflection that was done before FindTabVectors.
|
|
// Blocks that were identified as vertical text (relative to the rotated
|
|
// coordinates) are further rotated so the text lines are horizontal.
|
|
// blob polygonal outlines are rotated to match the position of the blocks
|
|
// that they are in, and their bounding boxes are recalculated to be accurate.
|
|
// Record appropriate inverse transformations and required
|
|
// classifier transformation in the blocks.
|
|
void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST* to_blocks);
|
|
|
|
// Computes the rotations for the block (to make textlines horizontal) and
|
|
// for the blobs (for classification) and sets the appropriate members
|
|
// of the given block.
|
|
// Returns the rotation that needs to be applied to the blobs to make
|
|
// them sit in the rotated block.
|
|
FCOORD ComputeBlockAndClassifyRotation(BLOCK* block);
|
|
|
|
// If true then the page language is cjk, so it is safe to perform
|
|
// FixBrokenCJK.
|
|
bool cjk_script_;
|
|
// The minimum gutter width to apply for finding columns.
|
|
// Modified when vertical text is detected to prevent detection of
|
|
// vertical text lines as columns.
|
|
int min_gutter_width_;
|
|
// The mean gap between columns over the page.
|
|
int mean_column_gap_;
|
|
// Config param saved at construction time. Modifies min_gutter_width_ with
|
|
// vertical text to prevent detection of vertical text as columns.
|
|
double tabfind_aligned_gap_fraction_;
|
|
// The rotation vector needed to convert original coords to deskewed.
|
|
FCOORD deskew_;
|
|
// The rotation vector needed to convert deskewed back to original coords.
|
|
FCOORD reskew_;
|
|
// The rotation vector used to rotate vertically oriented pages.
|
|
FCOORD rotation_;
|
|
// The rotation vector needed to convert the rotated back to original coords.
|
|
FCOORD rerotate_;
|
|
// The additional rotation vector needed to rotate text for recognition.
|
|
FCOORD text_rotation_;
|
|
// The column_sets_ contain the ordered candidate ColPartitionSets that
|
|
// define the possible divisions of the page into columns.
|
|
PartSetVector column_sets_;
|
|
// A simple array of pointers to the best assigned column division at
|
|
// each grid y coordinate.
|
|
ColPartitionSet** best_columns_;
|
|
// The grid used for creating initial partitions with strokewidth.
|
|
StrokeWidth* stroke_width_;
|
|
// The grid used to hold ColPartitions after the columns have been determined.
|
|
ColPartitionGrid part_grid_;
|
|
// List of ColPartitions that are no longer needed after they have been
|
|
// turned into regions, but are kept around because they are referenced
|
|
// by the part_grid_.
|
|
ColPartition_LIST good_parts_;
|
|
// List of ColPartitions that are big and might be dropcap or vertically
|
|
// joined.
|
|
ColPartition_LIST big_parts_;
|
|
// List of ColPartitions that have been declared noise.
|
|
ColPartition_LIST noise_parts_;
|
|
// The fake blobs that are made from the images.
|
|
BLOBNBOX_LIST image_bblobs_;
|
|
// Horizontal line separators.
|
|
TabVector_LIST horizontal_lines_;
|
|
// Image map of photo/noise areas on the page.
|
|
Pix* nontext_map_;
|
|
// Textline projection map.
|
|
TextlineProjection projection_;
|
|
// Sequence of DENORMS that indicate how to get back to the original image
|
|
// coordinate space. The destructor must delete all the DENORMs in the chain.
|
|
DENORM* denorm_;
|
|
|
|
// Various debug windows that automatically go away on completion.
|
|
ScrollView* input_blobs_win_;
|
|
|
|
// The equation region detector pointer. Note: This pointer is passed in by
|
|
// member function SetEquationDetect, and releasing it is NOT owned by this
|
|
// class.
|
|
EquationDetectBase* equation_detect_;
|
|
|
|
// Allow a subsequent instance to reuse the blocks window.
|
|
// Not thread-safe, but multiple threads shouldn't be using windows anyway.
|
|
static ScrollView* blocks_win_;
|
|
};
|
|
|
|
} // namespace tesseract.
|
|
|
|
#endif // TESSERACT_TEXTORD_COLFIND_H__
|