mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
ac014eb27a
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@646 d0cd1f9f-072b-0410-8dd7-cf729c803f20
278 lines
11 KiB
C++
278 lines
11 KiB
C++
///////////////////////////////////////////////////////////////////////
|
|
// File: equationdetect.h
|
|
// Description: The equation detection class that inherits equationdetectbase.
|
|
// Author: Zongyi (Joe) Liu (joeliu@google.com)
|
|
// Created: Fri Aug 31 11:13:01 PST 2011
|
|
//
|
|
// (C) Copyright 2011, Google Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H__
|
|
#define TESSERACT_CCMAIN_EQUATIONDETECT_H__
|
|
|
|
#include "blobbox.h"
|
|
#include "equationdetectbase.h"
|
|
#include "genericvector.h"
|
|
#include "unichar.h"
|
|
|
|
// Forward declarations of types that live in the global namespace. Where the
// declarations in this header name one of them, it is only through a pointer
// or a reference, so an incomplete type is sufficient at that point.
class BLOBNBOX;
|
|
class BLOB_CHOICE;
|
|
class BLOB_CHOICE_LIST;
|
|
class TO_BLOCK_LIST;
|
|
class TBOX;
|
|
class UNICHARSET;
|
|
|
|
namespace tesseract {
|
|
|
|
// Forward declarations of tesseract-namespace types. EquationDetect below
// refers to them only through pointers (including pointer-to-pointer and
// vectors of pointers), so their full definitions are not needed here.
class Tesseract;
|
|
class ColPartition;
|
|
class ColPartitionGrid;
|
|
class ColPartitionSet;
|
|
|
|
class EquationDetect : public EquationDetectBase {
|
|
public:
|
|
EquationDetect(const char* equ_datapath,
|
|
const char* equ_language);
|
|
~EquationDetect();
|
|
|
|
enum IndentType {
|
|
NO_INDENT,
|
|
LEFT_INDENT,
|
|
RIGHT_INDENT,
|
|
BOTH_INDENT,
|
|
INDENT_TYPE_COUNT
|
|
};
|
|
|
|
// Reset the lang_tesseract_ pointer. This function should be called before we
|
|
// do any detector work.
|
|
void SetLangTesseract(Tesseract* lang_tesseract);
|
|
|
|
// Iterate over the blobs inside to_block, and set the blobs that we want to
|
|
// process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function
|
|
// returns 0 upon success.
|
|
int LabelSpecialText(TO_BLOCK* to_block);
|
|
|
|
// Find possible equation partitions from part_grid. Should be called
|
|
// after the special_text_type of blobs are set.
|
|
// It returns 0 upon success.
|
|
int FindEquationParts(ColPartitionGrid* part_grid,
|
|
ColPartitionSet** best_columns);
|
|
|
|
// Reset the resolution of the processing image. TEST only function.
|
|
void SetResolution(const int resolution);
|
|
|
|
protected:
|
|
// Identify the special text type for one blob, and update its field. When
|
|
// height_th is set (> 0), we will label the blob as BSTT_NONE if its height
|
|
// is less than height_th.
|
|
void IdentifySpecialText(BLOBNBOX *blob, const int height_th);
|
|
|
|
// Estimate the type for one unichar.
|
|
BlobSpecialTextType EstimateTypeForUnichar(
|
|
const UNICHARSET& unicharset, const UNICHAR_ID id) const;
|
|
|
|
// Compute special text type for each blobs in part_grid_.
|
|
void IdentifySpecialText();
|
|
|
|
// Identify blobs that we want to skip during special blob type
|
|
// classification.
|
|
void IdentifyBlobsToSkip(ColPartition* part);
|
|
|
|
// The ColPartitions in part_grid_ maybe over-segmented, particularly in the
|
|
// block equation regions. So we like to identify these partitions and merge
|
|
// them before we do the searching.
|
|
void MergePartsByLocation();
|
|
|
|
// Staring from the seed center, we do radius search. And for partitions that
|
|
// have large overlaps with seed, we remove them from part_grid_ and add into
|
|
// parts_overlap. Note: this function may update the part_grid_, so if the
|
|
// caller is also running ColPartitionGridSearch, use the RepositionIterator
|
|
// to continue.
|
|
void SearchByOverlap(ColPartition* seed,
|
|
GenericVector<ColPartition*>* parts_overlap);
|
|
|
|
// Insert part back into part_grid_, after it absorbs some other parts.
|
|
void InsertPartAfterAbsorb(ColPartition* part);
|
|
|
|
// Identify the colparitions in part_grid_, label them as PT_EQUATION, and
|
|
// save them into cp_seeds_.
|
|
void IdentifySeedParts();
|
|
|
|
// Check the blobs count for a seed region candidate.
|
|
bool CheckSeedBlobsCount(ColPartition* part);
|
|
|
|
// Compute the foreground pixel density for a tbox area.
|
|
float ComputeForegroundDensity(const TBOX& tbox);
|
|
|
|
// Check if part from seed2 label: with low math density and left indented. We
|
|
// are using two checks:
|
|
// 1. If its left is aligned with any coordinates in indented_texts_left,
|
|
// which we assume have been sorted.
|
|
// 2. If its foreground density is over foreground_density_th.
|
|
bool CheckForSeed2(
|
|
const GenericVector<int>& indented_texts_left,
|
|
const float foreground_density_th,
|
|
ColPartition* part);
|
|
|
|
// Count the number of values in sorted_vec that is close to val, used to
|
|
// check if a partition is aligned with text partitions.
|
|
int CountAlignment(
|
|
const GenericVector<int>& sorted_vec, const int val) const;
|
|
|
|
// Check for a seed candidate using the foreground pixel density. And we
|
|
// return true if the density is below a certain threshold, because characters
|
|
// in equation regions usually are apart with more white spaces.
|
|
bool CheckSeedFgDensity(const float density_th, ColPartition* part);
|
|
|
|
// A light version of SplitCPHor: instead of really doing the part split, we
|
|
// simply compute the union bounding box of each splitted part.
|
|
void SplitCPHorLite(ColPartition* part, GenericVector<TBOX>* splitted_boxes);
|
|
|
|
// Split the part (horizontally), and save the splitted result into
|
|
// parts_splitted. Note that it is caller's responsibility to release the
|
|
// memory owns by parts_splitted. On the other hand, the part is unchanged
|
|
// during this process and still owns the blobs, so do NOT call DeleteBoxes
|
|
// when freeing the colpartitions in parts_splitted.
|
|
void SplitCPHor(ColPartition* part,
|
|
GenericVector<ColPartition*>* parts_splitted);
|
|
|
|
// Check the density for a seed candidate (part) using its math density and
|
|
// italic density, returns true if the check passed.
|
|
bool CheckSeedDensity(const float math_density_high,
|
|
const float math_density_low,
|
|
const ColPartition* part) const;
|
|
|
|
// Check if part is indented.
|
|
IndentType IsIndented(ColPartition* part);
|
|
|
|
// Identify inline partitions from cp_seeds_, and re-label them.
|
|
void IdentifyInlineParts();
|
|
|
|
// Comute the super bounding box for all colpartitions inside part_grid_.
|
|
void ComputeCPsSuperBBox();
|
|
|
|
// Identify inline partitions from cp_seeds_ using the horizontal search.
|
|
void IdentifyInlinePartsHorizontal();
|
|
|
|
// Estimate the line spacing between two text partitions. Returns -1 if not
|
|
// enough data.
|
|
int EstimateTextPartLineSpacing();
|
|
|
|
// Identify inline partitions from cp_seeds_ using vertical search.
|
|
void IdentifyInlinePartsVertical(const bool top_to_bottom,
|
|
const int textPartsLineSpacing);
|
|
|
|
// Check if part is an inline equation zone. This should be called after we
|
|
// identified the seed regions.
|
|
bool IsInline(const bool search_bottom,
|
|
const int textPartsLineSpacing,
|
|
ColPartition* part);
|
|
|
|
// For a given seed partition, we search the part_grid_ and see if there is
|
|
// any partition can be merged with it. It returns true if the seed has been
|
|
// expanded.
|
|
bool ExpandSeed(ColPartition* seed);
|
|
|
|
// Starting from the seed position, we search the part_grid_
|
|
// horizontally/vertically, find all parititions that can be
|
|
// merged with seed, remove them from part_grid_, and put them into
|
|
// parts_to_merge.
|
|
void ExpandSeedHorizontal(const bool search_left,
|
|
ColPartition* seed,
|
|
GenericVector<ColPartition*>* parts_to_merge);
|
|
void ExpandSeedVertical(const bool search_bottom,
|
|
ColPartition* seed,
|
|
GenericVector<ColPartition*>* parts_to_merge);
|
|
|
|
// Check if a part_box is the small neighbor of seed_box.
|
|
bool IsNearSmallNeighbor(const TBOX& seed_box,
|
|
const TBOX& part_box) const;
|
|
|
|
// Perform the density check for part, which we assume is nearing a seed
|
|
// partition. It returns true if the check passed.
|
|
bool CheckSeedNeighborDensity(const ColPartition* part) const;
|
|
|
|
// After identify the math blocks, we do one more scanning on all text
|
|
// partitions, and check if any of them is the satellite of:
|
|
// math blocks: here a p is the satellite of q if:
|
|
// 1. q is the nearest vertical neighbor of p, and
|
|
// 2. y_gap(p, q) is less than a threshold, and
|
|
// 3. x_overlap(p, q) is over a threshold.
|
|
// Note that p can be the satellites of two blocks: its top neighbor and
|
|
// bottom neighbor.
|
|
void ProcessMathBlockSatelliteParts();
|
|
|
|
// Check if part is the satellite of one/two math blocks. If it is, we return
|
|
// true, and save the blocks into math_blocks.
|
|
bool IsMathBlockSatellite(
|
|
ColPartition* part, GenericVector<ColPartition*>* math_blocks);
|
|
|
|
// Search the nearest neighbor of part in one vertical direction as defined in
|
|
// search_bottom. It returns the neighbor found that major x overlap with it,
|
|
// or NULL when not found.
|
|
ColPartition* SearchNNVertical(const bool search_bottom,
|
|
const ColPartition* part);
|
|
|
|
// Check if the neighbor with vertical distance of y_gap is a near and math
|
|
// block partition.
|
|
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
|
|
|
|
// Generate the tiff file name for output/debug file.
|
|
void GetOutputTiffName(const char* name, STRING* image_name) const;
|
|
|
|
// Debugger function that renders ColPartitions on the input image, where:
|
|
// parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
|
|
// will be painted in green, and other parts will be painted in blue.
|
|
void PaintColParts(const STRING& outfile) const;
|
|
|
|
// Debugger function that renders the blobs in part_grid_ over the input
|
|
// image.
|
|
void PaintSpecialTexts(const STRING& outfile) const;
|
|
|
|
// Debugger function that print the math blobs density values for a
|
|
// ColPartition object.
|
|
void PrintSpecialBlobsDensity(const ColPartition* part) const;
|
|
|
|
// The tesseract engine intialized from equation training data.
|
|
Tesseract* equ_tesseract_;
|
|
|
|
// The tesseract engine used for OCR. This pointer is passed in by the caller,
|
|
// so do NOT destroy it in this class.
|
|
Tesseract* lang_tesseract_;
|
|
|
|
// The ColPartitionGrid that we are processing. This pointer is passed in from
|
|
// the caller, so do NOT destroy it in the class.
|
|
ColPartitionGrid* part_grid_;
|
|
|
|
// A simple array of pointers to the best assigned column division at
|
|
// each grid y coordinate. This pointer is passed in from the caller, so do
|
|
// NOT destroy it in the class.
|
|
ColPartitionSet** best_columns_;
|
|
|
|
// The super bounding box of all cps in the part_grid_.
|
|
TBOX* cps_super_bbox_;
|
|
|
|
// The seed ColPartition for equation region.
|
|
GenericVector<ColPartition*> cp_seeds_;
|
|
|
|
// The resolution (dpi) of the processing image.
|
|
int resolution_;
|
|
|
|
// The number of pages we have processed.
|
|
int page_count_;
|
|
};
|
|
|
|
} // namespace tesseract
|
|
|
|
#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H__
|