From ac014eb27a4aa78195f05f9e82029591459ea89e Mon Sep 17 00:00:00 2001 From: "theraysmith@gmail.com" Date: Thu, 2 Feb 2012 02:50:01 +0000 Subject: [PATCH] Added experimental equation detector git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@646 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- ccmain/equationdetect.cpp | 1535 ++++++++++++++++++++++++++++++++ ccmain/equationdetect.h | 277 ++++++ textord/equationdetectbase.cpp | 65 ++ textord/equationdetectbase.h | 59 ++ 4 files changed, 1936 insertions(+) create mode 100644 ccmain/equationdetect.cpp create mode 100644 ccmain/equationdetect.h create mode 100644 textord/equationdetectbase.cpp create mode 100644 textord/equationdetectbase.h diff --git a/ccmain/equationdetect.cpp b/ccmain/equationdetect.cpp new file mode 100644 index 000000000..84006033e --- /dev/null +++ b/ccmain/equationdetect.cpp @@ -0,0 +1,1535 @@ +/////////////////////////////////////////////////////////////////////// +// File: equationdetect.cpp +// Description: Helper classes to detect equations. +// Author: Zongyi (Joe) Liu (joeliu@google.com) +// Created: Fri Aug 31 11:13:01 PST 2011 +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifdef _MSC_VER +#pragma warning(disable:4244) // Conversion warnings +#endif + +#include + +// Include automatically generated configuration file if running autoconf. 
+#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "equationdetect.h" + +#include "bbgrid.h" +#include "classify.h" +#include "colpartition.h" +#include "colpartitiongrid.h" +#include "colpartitionset.h" +#include "helpers.h" +#include "ratngs.h" +#include "tesseractclass.h" + +// Config variables. +BOOL_VAR(equationdetect_save_bi_image, false, "Save input bi image"); +BOOL_VAR(equationdetect_save_spt_image, false, "Save special character image"); +BOOL_VAR(equationdetect_save_seed_image, false, "Save the seed image"); +BOOL_VAR(equationdetect_save_merged_image, false, "Save the merged image"); + +namespace tesseract { + +/////////////////////////////////////////////////////////////////////////// +// Utility ColParition sort functions. +/////////////////////////////////////////////////////////////////////////// +static int SortCPByTopReverse(const void* p1, const void* p2) { + const ColPartition* cp1 = *reinterpret_cast(p1); + const ColPartition* cp2 = *reinterpret_cast(p2); + ASSERT_HOST(cp1 != NULL && cp2 != NULL); + const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box()); + return box2.top() - box1.top(); +} + +static int SortCPByBottom(const void* p1, const void* p2) { + const ColPartition* cp1 = *reinterpret_cast(p1); + const ColPartition* cp2 = *reinterpret_cast(p2); + ASSERT_HOST(cp1 != NULL && cp2 != NULL); + const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box()); + return box1.bottom() - box2.bottom(); +} + +static int SortCPByHeight(const void* p1, const void* p2) { + const ColPartition* cp1 = *reinterpret_cast(p1); + const ColPartition* cp2 = *reinterpret_cast(p2); + ASSERT_HOST(cp1 != NULL && cp2 != NULL); + const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box()); + return box1.height() - box2.height(); +} + +// TODO(joeliu): we may want to parameterize these constants. 
+// Math+digit blob density thresholds handed to CheckSeedDensity(): Th1 is
+// the "high" threshold, Th2 the "low" one.
+const float kMathDigitDensityTh1 = 0.25;
+const float kMathDigitDensityTh2 = 0.1;
+// Threshold on (math + digit + italic) density in CheckSeedDensity().
+const float kMathItalicDensityTh = 0.5;
+const float kUnclearDensityTh = 0.25;
+// Minimum number of blobs a partition needs in CheckSeedBlobsCount().
+const int kSeedBlobsCountTh = 10;
+// Number of aligned indented text lines (see CountAlignment()) at which a
+// left-indented candidate stops being treated as a block equation.
+const int kLeftIndentAlignmentCountTh = 1;
+
+// Returns true if PolyBlockType is of text type or equation type.
+inline bool IsTextOrEquationType(PolyBlockType type) {
+  return PTIsTextType(type) || type == PT_EQUATION;
+}
+
+// Returns true if type is LEFT_INDENT or BOTH_INDENT.
+inline bool IsLeftIndented(const EquationDetect::IndentType type) {
+  return type == EquationDetect::LEFT_INDENT ||
+      type == EquationDetect::BOTH_INDENT;
+}
+
+// Returns true if type is RIGHT_INDENT or BOTH_INDENT.
+inline bool IsRightIndented(const EquationDetect::IndentType type) {
+  return type == EquationDetect::RIGHT_INDENT ||
+      type == EquationDetect::BOTH_INDENT;
+}
+
+// Creates the detector and tries to load the equation classifier named
+// equ_name (defaults to "equ" when NULL) from equ_datapath.
+EquationDetect::EquationDetect(const char* equ_datapath,
+                               const char* equ_name) {
+  const char* default_name = "equ";
+  if (equ_name == NULL) {
+    equ_name = default_name;
+  }
+  equ_tesseract_ = lang_tesseract_ = NULL;
+  resolution_ = 0;
+  page_count_ = 0;
+
+  // Construct equ_tesseract_.
+ equ_tesseract_ = new Tesseract(); + if (equ_tesseract_->init_tesseract(equ_datapath, equ_name, + OEM_TESSERACT_ONLY)) { + tprintf("Warning: equation region detection requested," + " but %s failed to load from %s\n", equ_name, equ_datapath); + delete equ_tesseract_; + equ_tesseract_ = NULL; + } + + cps_super_bbox_ = NULL; +} + +EquationDetect::~EquationDetect() { + if (equ_tesseract_) { + delete (equ_tesseract_); + } + if (cps_super_bbox_) { + delete(cps_super_bbox_); + } +} + +void EquationDetect::SetLangTesseract(Tesseract* lang_tesseract) { + lang_tesseract_ = lang_tesseract; +} + +void EquationDetect::SetResolution(const int resolution) { + resolution_ = resolution; +} + +int EquationDetect::LabelSpecialText(TO_BLOCK* to_block) { + if (to_block == NULL) { + tprintf("Warning: input to_block is NULL!\n"); + return -1; + } + + GenericVector blob_lists; + blob_lists.push_back(&(to_block->blobs)); + blob_lists.push_back(&(to_block->large_blobs)); + for (int i = 0; i < blob_lists.size(); ++i) { + BLOBNBOX_IT bbox_it(blob_lists[i]); + for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list(); + bbox_it.forward()) { + bbox_it.data()->set_special_text_type(BSTT_NONE); + } + } + + return 0; +} + +void EquationDetect::IdentifySpecialText( + BLOBNBOX *blobnbox, const int height_th) { + ASSERT_HOST(blobnbox != NULL); + if (blobnbox->bounding_box().height() < height_th && height_th > 0) { + // For small blob, we simply set to BSTT_NONE. + blobnbox->set_special_text_type(BSTT_NONE); + return; + } + + BLOB_CHOICE_LIST ratings_equ, ratings_lang; + C_BLOB* blob = blobnbox->cblob(); + TBLOB* tblob = TBLOB::PolygonalCopy(blob); + const TBOX& box = tblob->bounding_box(); + + // Normalize the blob. Set the origin to the place we want to be the + // bottom-middle, and scaling is to make the height the x-height. 
+ float scaling = static_cast(kBlnXHeight) / box.height(); + DENORM denorm; + float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom(); + denorm.SetupNormalization(NULL, NULL, NULL, NULL, NULL, 0, + x_orig, y_orig, scaling, scaling, + 0.0f, static_cast(kBlnBaselineOffset)); + TBLOB* normed_blob = new TBLOB(*tblob); + normed_blob->Normalize(denorm); + equ_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_equ, NULL); + lang_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_lang, NULL); + delete normed_blob; + delete tblob; + + // Get the best choice from ratings_lang and rating_equ. As the choice in the + // list has already been sorted by the certainty, we simply use the first + // choice. + BLOB_CHOICE *lang_choice = NULL, *equ_choice = NULL; + if (ratings_lang.length() > 0) { + BLOB_CHOICE_IT choice_it(&ratings_lang); + lang_choice = choice_it.data(); + } + if (ratings_equ.length() > 0) { + BLOB_CHOICE_IT choice_it(&ratings_equ); + equ_choice = choice_it.data(); + } + + float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX; + float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX; + + const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8; + // The scores here are negative, so the max/min == fabs(min/max). + // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score); + float diff = fabs(lang_score - equ_score); + BlobSpecialTextType type = BSTT_NONE; + + // Classification. + if (fmax(lang_score, equ_score) < kConfScoreTh) { + // If both score are very small, then mark it as unclear. + type = BSTT_UNCLEAR; + } else if (diff > kConfDiffTh && equ_score > lang_score) { + // If equ_score is significantly higher, then we classify this character as + // math symbol. + type = BSTT_MATH; + } else if (lang_choice) { + // For other cases: lang_score is similar or significantly higher. 
+ type = EstimateTypeForUnichar( + lang_tesseract_->unicharset, lang_choice->unichar_id()); + } + + if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get( + lang_choice->fontinfo_id()).is_italic()) { + // For text symbol, we still check if it is italic. + blobnbox->set_special_text_type(BSTT_ITALIC); + } else { + blobnbox->set_special_text_type(type); + } +} + +BlobSpecialTextType EquationDetect::EstimateTypeForUnichar( + const UNICHARSET& unicharset, const UNICHAR_ID id) const { + STRING s = unicharset.id_to_unichar(id); + if (unicharset.get_isalpha(id)) { + return BSTT_NONE; + } + + if (unicharset.get_ispunctuation(id)) { + // Exclude some special texts that are likely to be confused as math symbol. + static GenericVector ids_to_exclude; + if (ids_to_exclude.empty()) { + static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".", + "〈", "〉", "《", "》", "」", "「", ""}; + int i = 0; + while (kCharsToEx[i] != "") { + ids_to_exclude.push_back( + unicharset.unichar_to_id(kCharsToEx[i++].string())); + } + ids_to_exclude.sort(); + } + return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH; + } + + // Check if it is digit. In addition to the isdigit attribute, we also check + // if this character belongs to those likely to be confused with a digit. + static const STRING kDigitsChars = "|"; + if (unicharset.get_isdigit(id) || + (s.length() == 1 && kDigitsChars.contains(s[0]))) { + return BSTT_DIGIT; + } else { + return BSTT_MATH; + } +} + +void EquationDetect::IdentifySpecialText() { + // Set configuration for Tesseract::AdaptiveClassifier. + equ_tesseract_->tess_cn_matching.set_value(true); // turn it on + equ_tesseract_->tess_bn_matching.set_value(false); + + // Set the multiplier to zero for lang_tesseract_ to improve the accuracy. 
+ int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier; + int classify_integer_matcher = + lang_tesseract_->classify_integer_matcher_multiplier; + lang_tesseract_->classify_class_pruner_multiplier.set_value(0); + lang_tesseract_->classify_integer_matcher_multiplier.set_value(0); + + ColPartitionGridSearch gsearch(part_grid_); + ColPartition *part = NULL; + gsearch.StartFullSearch(); + while ((part = gsearch.NextFullSearch()) != NULL) { + if (!IsTextOrEquationType(part->type())) { + continue; + } + IdentifyBlobsToSkip(part); + BLOBNBOX_C_IT bbox_it(part->boxes()); + // Compute the height threshold. + GenericVector blob_heights; + for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list(); + bbox_it.forward()) { + if (bbox_it.data()->special_text_type() != BSTT_SKIP) { + blob_heights.push_back(bbox_it.data()->bounding_box().height()); + } + } + blob_heights.sort(); + int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2; + for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list(); + bbox_it.forward()) { + if (bbox_it.data()->special_text_type() != BSTT_SKIP) { + IdentifySpecialText(bbox_it.data(), height_th); + } + } + } + + // Set the multiplier values back. + lang_tesseract_->classify_class_pruner_multiplier.set_value( + classify_class_pruner); + lang_tesseract_->classify_integer_matcher_multiplier.set_value( + classify_integer_matcher); + + if (equationdetect_save_spt_image) { // For debug. + STRING outfile; + GetOutputTiffName("_spt", &outfile); + PaintSpecialTexts(outfile); + } +} + +void EquationDetect::IdentifyBlobsToSkip(ColPartition* part) { + ASSERT_HOST(part); + BLOBNBOX_C_IT blob_it(part->boxes()); + + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + // At this moment, no blob should have been joined. 
+    ASSERT_HOST(!blob_it.data()->joined_to_prev());
+  }
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    BLOBNBOX* blob = blob_it.data();
+    if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
+      continue;
+    }
+    // Copied by value: the box grows as overlapping neighbours are found.
+    TBOX blob_box = blob->bounding_box();
+
+    // Search if any blob can be merged into blob. If found, then we mark all
+    // these blobs as BSTT_SKIP.
+    BLOBNBOX_C_IT blob_it2 = blob_it;
+    bool found = false;
+    while (!blob_it2.at_last()) {
+      BLOBNBOX* nextblob = blob_it2.forward();
+      const TBOX& nextblob_box = nextblob->bounding_box();
+      if (nextblob_box.left() >= blob_box.right()) {
+        // The next blob starts at or beyond the right edge of the current
+        // box, so the scan for this blob stops here.
+        break;
+      }
+      const float kWidthR = 0.4, kHeightR = 0.3;
+      bool xoverlap = blob_box.major_x_overlap(nextblob_box),
+           yoverlap = blob_box.y_overlap(nextblob_box);
+      // Ratio of the smaller to the larger dimension of the two boxes.
+      float widthR = static_cast<float>(
+          MIN(nextblob_box.width(), blob_box.width())) /
+          MAX(nextblob_box.width(), blob_box.width());
+      float heightR = static_cast<float>(
+          MIN(nextblob_box.height(), blob_box.height())) /
+          MAX(nextblob_box.height(), blob_box.height());
+
+      if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
+        // Found one, set nextblob type and recompute blob_box.
+ found = true; + nextblob->set_special_text_type(BSTT_SKIP); + blob_box += nextblob_box; + } + } + if (found) { + blob->set_special_text_type(BSTT_SKIP); + } + } +} + +int EquationDetect::FindEquationParts( + ColPartitionGrid* part_grid, ColPartitionSet** best_columns) { + if (!equ_tesseract_ || !lang_tesseract_) { + tprintf("Warning: equ_tesseract_/lang_tesseract_ is NULL!\n"); + return -1; + } + if (!part_grid || !best_columns) { + tprintf("part_grid/best_columns is NULL!!\n"); + return -1; + } + cp_seeds_.clear(); + part_grid_ = part_grid; + best_columns_ = best_columns; + resolution_ = lang_tesseract_->source_resolution(); + STRING outfile; + page_count_++; + + if (equationdetect_save_bi_image) { + GetOutputTiffName("_bi", &outfile); + pixWrite(outfile.string(), lang_tesseract_->pix_binary(), IFF_TIFF_G4); + } + + // Pass 0: Compute special text type for blobs. + IdentifySpecialText(); + + // Pass 1: Merge parts by overlap. + MergePartsByLocation(); + + // Pass 2: compute the math blob density and find the seed partition. + IdentifySeedParts(); + // We still need separate seed into block seed and inline seed partition. + IdentifyInlineParts(); + + if (equationdetect_save_seed_image) { + GetOutputTiffName("_seed", &outfile); + PaintColParts(outfile); + } + + // Pass 3: expand block equation seeds. + while (!cp_seeds_.empty()) { + GenericVector seeds_expanded; + for (int i = 0; i < cp_seeds_.size(); ++i) { + if (ExpandSeed(cp_seeds_[i])) { + // If this seed is expanded, then we add it into seeds_expanded. Note + // this seed has been removed from part_grid_ if it is expanded. + seeds_expanded.push_back(cp_seeds_[i]); + } + } + // Add seeds_expanded back into part_grid_ and reset cp_seeds_. + for (int i = 0; i < seeds_expanded.size(); ++i) { + InsertPartAfterAbsorb(seeds_expanded[i]); + } + cp_seeds_ = seeds_expanded; + } + + // Pass 4: find math block satellite text partitions and merge them. 
+ ProcessMathBlockSatelliteParts(); + + if (equationdetect_save_merged_image) { // For debug. + GetOutputTiffName("_merged", &outfile); + PaintColParts(outfile); + } + + return 0; +} + +void EquationDetect::MergePartsByLocation() { + while (true) { + ColPartition* part = NULL; + // partitions that have been updated. + GenericVector parts_updated; + ColPartitionGridSearch gsearch(part_grid_); + gsearch.StartFullSearch(); + while ((part = gsearch.NextFullSearch()) != NULL) { + if (!IsTextOrEquationType(part->type())) { + continue; + } + GenericVector parts_to_merge; + SearchByOverlap(part, &parts_to_merge); + if (parts_to_merge.empty()) { + continue; + } + + // Merge parts_to_merge with part, and remove them from part_grid_. + part_grid_->RemoveBBox(part); + for (int i = 0; i < parts_to_merge.size(); ++i) { + ASSERT_HOST(parts_to_merge[i] != NULL && parts_to_merge[i] != part); + part->Absorb(parts_to_merge[i], NULL); + } + gsearch.RepositionIterator(); + + parts_updated.push_back(part); + } + + if (parts_updated.empty()) { // Exit the loop + break; + } + + // Re-insert parts_updated into part_grid_. + for (int i = 0; i < parts_updated.size(); ++i) { + InsertPartAfterAbsorb(parts_updated[i]); + } + } +} + +void EquationDetect::SearchByOverlap( + ColPartition* seed, + GenericVector* parts_overlap) { + ASSERT_HOST(seed != NULL && parts_overlap != NULL); + if (!IsTextOrEquationType(seed->type())) { + return; + } + ColPartitionGridSearch search(part_grid_); + const TBOX& seed_box(seed->bounding_box()); + const int kRadNeighborCells = 30; + search.StartRadSearch((seed_box.left() + seed_box.right()) / 2, + (seed_box.top() + seed_box.bottom()) / 2, + kRadNeighborCells); + search.SetUniqueMode(true); + + // Search iteratively. 
+ ColPartition *part; + GenericVector parts; + const float kLargeOverlapTh = 0.95; + const float kEquXOverlap = 0.4, kEquYOverlap = 0.5; + while ((part = search.NextRadSearch()) != NULL) { + if (part == seed || !IsTextOrEquationType(part->type())) { + continue; + } + const TBOX& part_box(part->bounding_box()); + bool merge = false; + + float x_overlap_fraction = part_box.x_overlap_fraction(seed_box), + y_overlap_fraction = part_box.y_overlap_fraction(seed_box); + + // If part is large overlapped with seed, then set merge to true. + if (x_overlap_fraction >= kLargeOverlapTh && + y_overlap_fraction >= kLargeOverlapTh) { + merge = true; + } else if (seed->type() == PT_EQUATION && + IsTextOrEquationType(part->type())) { + if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) || + (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) { + merge = true; + } + } + + if (merge) { // Remove the part from search and put it into parts. + search.RemoveBBox(); + parts_overlap->push_back(part); + } + } +} + +void EquationDetect::InsertPartAfterAbsorb(ColPartition* part) { + ASSERT_HOST(part); + + // Before insert part back into part_grid_, we will need re-compute some + // of its attributes such as first_column_, last_column_. However, we still + // want to preserve its type. + BlobTextFlowType flow_type = part->flow(); + PolyBlockType part_type = part->type(); + BlobRegionType blob_type = part->blob_type(); + + // Call SetPartitionType to re-compute the attributes of part. + const TBOX& part_box(part->bounding_box()); + int grid_x, grid_y; + part_grid_->GridCoords( + part_box.left(), part_box.bottom(), &grid_x, &grid_y); + part->SetPartitionType(resolution_, best_columns_[grid_y]); + + // Reset the types back. + part->set_type(part_type); + part->set_blob_type(blob_type); + part->set_flow(flow_type); + part->SetBlobTypes(); + + // Insert into part_grid_. 
+  part_grid_->InsertBBox(true, true, part);
+}
+
+// Scans part_grid_ for equation seed candidates. Partitions passing the high
+// density test go into seeds1; left-indented ones passing only the low
+// density test go into seeds2. Partitions that look like plain text supply
+// the reference features (left edges, foreground densities) used afterwards
+// to decide which candidates become PT_EQUATION seeds.
+void EquationDetect::IdentifySeedParts() {
+  ColPartitionGridSearch gsearch(part_grid_);
+  ColPartition *part = NULL;
+  gsearch.StartFullSearch();
+
+  GenericVector<ColPartition*> seeds1, seeds2;
+  // The left coordinates of indented text partitions.
+  GenericVector<int> indented_texts_left;
+  // The foreground density of text partitions.
+  GenericVector<float> texts_foreground_density;
+  while ((part = gsearch.NextFullSearch()) != NULL) {
+    if (!IsTextOrEquationType(part->type())) {
+      continue;
+    }
+    part->ComputeSpecialBlobsDensity();
+    bool blobs_check = CheckSeedBlobsCount(part);
+    const int kTextBlobsTh = 20;
+
+    if (CheckSeedDensity(kMathDigitDensityTh1, kMathDigitDensityTh2, part) &&
+        blobs_check) {
+      // Passed high density threshold test, save into seeds1.
+      seeds1.push_back(part);
+    } else {
+      IndentType indent = IsIndented(part);
+      if (IsLeftIndented(indent) && blobs_check &&
+          CheckSeedDensity(kMathDigitDensityTh2, kMathDigitDensityTh2, part)) {
+        // Passed low density threshold test and is indented, save into seeds2.
+        seeds2.push_back(part);
+      } else if (!IsRightIndented(indent) &&
+                 part->boxes_count() > kTextBlobsTh) {
+        // This is likely to be a text part, save the features.
+        const TBOX& box = part->bounding_box();
+        if (IsLeftIndented(indent)) {
+          indented_texts_left.push_back(box.left());
+        }
+        texts_foreground_density.push_back(ComputeForegroundDensity(box));
+      }
+    }
+  }
+
+  // Sort the features collected from text regions.
+  indented_texts_left.sort();
+  texts_foreground_density.sort();
+  float foreground_density_th = 0.15;  // Default value.
+  if (!texts_foreground_density.empty()) {
+    // Use the median of the texts_foreground_density.
+ foreground_density_th = 0.8 * texts_foreground_density[ + texts_foreground_density.size() / 2]; + } + + for (int i = 0; i < seeds1.size(); ++i) { + const TBOX& box = seeds1[i]->bounding_box(); + if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) && + !(IsLeftIndented(IsIndented(seeds1[i])) && + CountAlignment(indented_texts_left, box.left()) >= + kLeftIndentAlignmentCountTh)) { + // Mark as PT_EQUATION type. + seeds1[i]->set_type(PT_EQUATION); + cp_seeds_.push_back(seeds1[i]); + } else { // Mark as PT_INLINE_EQUATION type. + seeds1[i]->set_type(PT_INLINE_EQUATION); + } + } + + for (int i = 0; i < seeds2.size(); ++i) { + if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) { + seeds2[i]->set_type(PT_EQUATION); + cp_seeds_.push_back(seeds2[i]); + } + } +} + +float EquationDetect::ComputeForegroundDensity(const TBOX& tbox) { +#if LIBLEPT_MINOR_VERSION < 69 && LIBLEPT_MAJOR_VERSION <= 1 + // This will disable the detector because no seed will be identified. + return 1.0f; +#else + Pix *pix_bi = lang_tesseract_->pix_binary(); + int pix_height = pixGetHeight(pix_bi); + Box* box = boxCreate(tbox.left(), pix_height - tbox.top(), + tbox.width(), tbox.height()); + Pix *pix_sub = pixClipRectangle(pix_bi, box, NULL); + l_float32 fract; + pixForegroundFraction(pix_sub, &fract); + pixDestroy(&pix_sub); + boxDestroy(&box); + + return fract; +#endif +} + +bool EquationDetect::CheckSeedFgDensity(const float density_th, + ColPartition* part) { + ASSERT_HOST(part); + + // Split part horizontall, and check for each sub part. + GenericVector sub_boxes; + SplitCPHorLite(part, &sub_boxes); + float parts_passed = 0.0; + for (int i = 0; i < sub_boxes.size(); ++i) { + float density = ComputeForegroundDensity(sub_boxes[i]); + if (density < density_th) { + parts_passed++; + } + } + + // If most sub parts passed, then we return true. 
+  if (sub_boxes.empty()) {
+    // Nothing was split out, so nothing passed. Handled explicitly to avoid
+    // evaluating 0 / 0 in the ratio below.
+    return false;
+  }
+  const float kSeedPartRatioTh = 0.3;
+  bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
+
+  return retval;
+}
+
+// Splits part into pieces wherever the horizontal gap between blobs exceeds
+// three times part->median_width(). The pieces are newly created partitions
+// appended to parts_splitted, whose previous contents are deleted first.
+// Returns without touching parts_splitted if part has a zero median width or
+// no boxes.
+void EquationDetect::SplitCPHor(ColPartition* part,
+                                GenericVector<ColPartition*>* parts_splitted) {
+  ASSERT_HOST(part && parts_splitted);
+  if (part->median_width() == 0 || part->boxes_count() == 0) {
+    return;
+  }
+
+  // Make a copy of part, and reset parts_splitted.
+  ColPartition* right_part = part->CopyButDontOwnBlobs();
+  parts_splitted->delete_data_pointers();
+  parts_splitted->clear();
+
+  const double kThreshold = part->median_width() * 3.0;
+  bool found_split = true;
+  // Each round cuts one piece off the left end of right_part and restarts
+  // the scan on what remains.
+  while (found_split) {
+    found_split = false;
+    BLOBNBOX_C_IT box_it(right_part->boxes());
+    // Blobs are sorted left side first. If blobs overlap,
+    // the previous blob may have a "more right" right side.
+    // Account for this by always keeping the largest "right"
+    // so far.
+    int previous_right = MIN_INT32;
+
+    // Look for the next split in the partition.
+    for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
+      const TBOX& box = box_it.data()->bounding_box();
+      if (previous_right != MIN_INT32 &&
+          box.left() - previous_right > kThreshold) {
+        // We have a split position. Split the partition in two pieces.
+        // Insert the left piece in the grid and keep processing the right.
+        int mid_x = (box.left() + previous_right) / 2;
+        ColPartition* left_part = right_part;
+        right_part = left_part->SplitAt(mid_x);
+
+        parts_splitted->push_back(left_part);
+        left_part->ComputeSpecialBlobsDensity();
+        found_split = true;
+        break;
+      }
+
+      // The right side of the previous blobs.
+      previous_right = MAX(previous_right, box.right());
+    }
+  }
+
+  // Add the last piece.
+  right_part->ComputeSpecialBlobsDensity();
+  parts_splitted->push_back(right_part);
+}
+
+// Lightweight variant of SplitCPHor(): uses the same gap rule (gap wider
+// than three times the median width) but only outputs the bounding box of
+// each piece, without creating partitions. splitted_boxes is cleared first
+// and stays empty if part has a zero median width.
+void EquationDetect::SplitCPHorLite(ColPartition* part,
+                                    GenericVector<TBOX>* splitted_boxes) {
+  ASSERT_HOST(part && splitted_boxes);
+  splitted_boxes->clear();
+  if (part->median_width() == 0) {
+    return;
+  }
+
+  const double kThreshold = part->median_width() * 3.0;
+
+  // Blobs are sorted left side first. If blobs overlap,
+  // the previous blob may have a "more right" right side.
+  // Account for this by always keeping the largest "right"
+  // so far.
+  TBOX union_box;
+  // MIN_INT32 marks "no blob seen yet in the current piece".
+  int previous_right = MIN_INT32;
+  BLOBNBOX_C_IT box_it(part->boxes());
+  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
+    const TBOX& box = box_it.data()->bounding_box();
+    if (previous_right != MIN_INT32 &&
+        box.left() - previous_right > kThreshold) {
+      // We have a split position.
+      splitted_boxes->push_back(union_box);
+      previous_right = MIN_INT32;
+    }
+    if (previous_right == MIN_INT32) {
+      union_box = box;
+    } else {
+      union_box += box;
+    }
+    // The right side of the previous blobs.
+    previous_right = MAX(previous_right, box.right());
+  }
+
+  // Add the last piece.
+  if (previous_right != MIN_INT32) {
+    splitted_boxes->push_back(union_box);
+  }
+}
+
+// Second-stage test for low-density, left-indented candidates. Returns false
+// if part's left edge lines up with indented text lines or if its foreground
+// density is above foreground_density_th; true otherwise.
+bool EquationDetect::CheckForSeed2(
+    const GenericVector<int>& indented_texts_left,
+    const float foreground_density_th,
+    ColPartition* part) {
+  ASSERT_HOST(part);
+  const TBOX& box = part->bounding_box();
+
+  // Check if it is aligned with any indented_texts_left.
+  if (!indented_texts_left.empty() &&
+      CountAlignment(indented_texts_left, box.left()) >=
+      kLeftIndentAlignmentCountTh) {
+    return false;
+  }
+
+  // Check the foreground density.
+  if (ComputeForegroundDensity(box) > foreground_density_th) {
+    return false;
+  }
+
+  return true;
+}
+
+// Counts the entries of sorted_vec that lie within 0.03 * resolution_ of
+// val. sorted_vec must be sorted in ascending order. Returns 0 for an empty
+// vector.
+int EquationDetect::CountAlignment(
+    const GenericVector<int>& sorted_vec, const int val) const {
+  if (sorted_vec.empty()) {
+    return 0;
+  }
+  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
+  int pos = sorted_vec.binary_search(val), count = 0;
+
+  // Search left side.
+  int index = pos;
+  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
+    count++;
+  }
+
+  // Search right side.
+  index = pos + 1;
+  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
+    count++;
+  }
+
+  return count;
+}
+
+// Re-labels seeds in cp_seeds_ that look like inline equations, first with
+// the horizontal test and then with the vertical test in both directions.
+void EquationDetect::IdentifyInlineParts() {
+  ComputeCPsSuperBBox();
+  IdentifyInlinePartsHorizontal();
+  int textparts_linespacing = EstimateTextPartLineSpacing();
+  IdentifyInlinePartsVertical(true, textparts_linespacing);
+  IdentifyInlinePartsVertical(false, textparts_linespacing);
+}
+
+// Recomputes cps_super_bbox_ as the union of the bounding boxes of all
+// partitions in part_grid_.
+void EquationDetect::ComputeCPsSuperBBox() {
+  ColPartitionGridSearch gsearch(part_grid_);
+  ColPartition *part = NULL;
+  gsearch.StartFullSearch();
+  // delete on a NULL pointer is a no-op, so no guard is needed.
+  delete cps_super_bbox_;
+  cps_super_bbox_ = new TBOX();
+  while ((part = gsearch.NextFullSearch()) != NULL) {
+    (*cps_super_bbox_) += part->bounding_box();
+  }
+}
+
+// Marks as PT_INLINE_EQUATION the seeds that, judged by their side
+// neighbours, are not block equations. The remaining seeds stay in cp_seeds_.
+void EquationDetect::IdentifyInlinePartsHorizontal() {
+  ASSERT_HOST(cps_super_bbox_);
+  GenericVector<ColPartition*> new_seeds;
+  const int kMarginDiffTh = IntCastRounded(
+      0.5 * lang_tesseract_->source_resolution());
+  const int kGapTh = static_cast<int>(roundf(
+      1.0 * lang_tesseract_->source_resolution()));
+  ColPartitionGridSearch search(part_grid_);
+  search.SetUniqueMode(true);
+  // The center x coordinate of the cp_super_bbox_.
+  int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
+  for (int i = 0; i < cp_seeds_.size(); ++i) {
+    ColPartition* part = cp_seeds_[i];
+    const TBOX& part_box(part->bounding_box());
+    int left_margin = part_box.left() - cps_super_bbox_->left(),
+        right_margin = cps_super_bbox_->right() - part_box.right();
+    bool right_to_left;
+    if (left_margin + kMarginDiffTh < right_margin &&
+        left_margin < kMarginDiffTh) {
+      // part is left aligned, so we search if it has any right neighbor.
+      search.StartSideSearch(
+          part_box.right(), part_box.top(), part_box.bottom());
+      right_to_left = false;
+    // NOTE(review): left_margin is measured from the super bbox's left edge,
+    // while cps_cx is an absolute x coordinate; the two only agree when the
+    // super bbox starts at x == 0 -- confirm this comparison is intended.
+    } else if (left_margin > cps_cx) {
+      // part locates on the right half on image, so search if it has any left
+      // neighbor.
+      search.StartSideSearch(
+          part_box.left(), part_box.top(), part_box.bottom());
+      right_to_left = true;
+    } else {  // part is not an inline equation.
+      new_seeds.push_back(part);
+      continue;
+    }
+    ColPartition* neighbor = NULL;
+    bool side_neighbor_found = false;
+    while ((neighbor = search.NextSideSearch(right_to_left)) != NULL) {
+      const TBOX& neighbor_box(neighbor->bounding_box());
+      // Skip candidates that are not text/equation, are too far away, are
+      // not on the same line, or sit on top of part rather than beside it.
+      if (!IsTextOrEquationType(neighbor->type()) ||
+          part_box.x_gap(neighbor_box) > kGapTh ||
+          !part_box.major_y_overlap(neighbor_box) ||
+          part_box.major_x_overlap(neighbor_box)) {
+        continue;
+      }
+      // We have found one. Set the side_neighbor_found flag.
+      side_neighbor_found = true;
+      break;
+    }
+    if (!side_neighbor_found) {  // Mark part as PT_INLINE_EQUATION.
+      part->set_type(PT_INLINE_EQUATION);
+    } else {
+      // Check the geometric feature of neighbor.
+      const TBOX& neighbor_box(neighbor->bounding_box());
+      if (neighbor_box.width() > part_box.width() &&
+          neighbor->type() != PT_EQUATION) {  // Mark as PT_INLINE_EQUATION.
+        part->set_type(PT_INLINE_EQUATION);
+      } else {  // part is not an inline equation type.
+        new_seeds.push_back(part);
+      }
+    }
+  }
+
+  // Reset the cp_seeds_ using the new_seeds.
+  cp_seeds_ = new_seeds;
+}
+
+// Estimates the vertical gap between consecutive text lines from the text
+// partitions in part_grid_. Returns -1 when fewer than 8 gaps were collected.
+int EquationDetect::EstimateTextPartLineSpacing() {
+  ColPartitionGridSearch gsearch(part_grid_);
+
+  // Get the y gap between text partitions;
+  ColPartition *current = NULL, *prev = NULL;
+  gsearch.StartFullSearch();
+  GenericVector<int> ygaps;
+  while ((current = gsearch.NextFullSearch()) != NULL) {
+    if (!PTIsTextType(current->type())) {
+      continue;
+    }
+    if (prev != NULL) {
+      const TBOX &current_box = current->bounding_box();
+      const TBOX &prev_box = prev->bounding_box();
+      // prev and current should be x major overlap and non y overlap.
+      if (current_box.major_x_overlap(prev_box) &&
+          !current_box.y_overlap(prev_box)) {
+        int gap = current_box.y_gap(prev_box);
+        if (gap < MIN(current_box.height(), prev_box.height())) {
+          // The gap should be smaller than the height of the bounding boxes.
+          ygaps.push_back(gap);
+        }
+      }
+    }
+    prev = current;
+  }
+
+  if (ygaps.size() < 8) {  // We do not have enough data.
+    return -1;
+  }
+
+  // Compute the line spacing from ygaps: use the mean of the first half.
+  // ygaps has at least 8 entries here, so count ends at 4 or more.
+  ygaps.sort();
+  int spacing = 0, count;
+  for (count = 0; count < ygaps.size() / 2; count++) {
+    spacing += ygaps[count];
+  }
+  return spacing / count;
+}
+
+// Marks as PT_INLINE_EQUATION the seeds for which IsInline() finds a
+// matching vertical text neighbour; the other seeds stay in cp_seeds_.
+// top_to_bottom selects both the processing order and the search direction.
+void EquationDetect::IdentifyInlinePartsVertical(
+    const bool top_to_bottom, const int textparts_linespacing) {
+  if (cp_seeds_.empty()) {
+    return;
+  }
+
+  // Sort cp_seeds_.
+  if (top_to_bottom) {  // From top to bottom.
+    cp_seeds_.sort(&SortCPByTopReverse);
+  } else {  // From bottom to top.
+    cp_seeds_.sort(&SortCPByBottom);
+  }
+
+  GenericVector<ColPartition*> new_seeds;
+  for (int i = 0; i < cp_seeds_.size(); ++i) {
+    ColPartition* part = cp_seeds_[i];
+    // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
+    // for its top neighbors, so that if two/more inline regions are connected
+    // to each other, then we will identify the top one, and then use it to
+    // identify the bottom one.
+    if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
+      part->set_type(PT_INLINE_EQUATION);
+    } else {
+      new_seeds.push_back(part);
+    }
+  }
+  cp_seeds_ = new_seeds;
+}
+
+// Returns true if part has a nearby text partition above it (or below it,
+// when search_bottom is true) that overlaps it in x, is within line-spacing
+// distance and has a comparable height.
+// A textparts_linespacing <= 0 selects a default gap of 0.05 * resolution_.
+bool EquationDetect::IsInline(const bool search_bottom,
+                              const int textparts_linespacing,
+                              ColPartition* part) {
+  ASSERT_HOST(part != NULL);
+  // Look for its nearest vertical neighbor that hardly overlaps in y but
+  // largely overlaps in x.
+  ColPartitionGridSearch search(part_grid_);
+  ColPartition *neighbor = NULL;
+  const TBOX& part_box(part->bounding_box());
+  const float kYGapRatioTh = 1.0;
+
+  if (search_bottom) {
+    search.StartVerticalSearch(part_box.left(), part_box.right(),
+                               part_box.bottom());
+  } else {
+    search.StartVerticalSearch(part_box.left(), part_box.right(),
+                               part_box.top());
+  }
+  search.SetUniqueMode(true);
+  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
+    const TBOX& neighbor_box(neighbor->bounding_box());
+    if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
+        MIN(part_box.height(), neighbor_box.height())) {
+      // Finished searching.
+      break;
+    }
+    if (!PTIsTextType(neighbor->type())) {
+      continue;
+    }
+
+    // Check if neighbor and part is inline similar.
+    const float kHeightRatioTh = 0.5;
+    const int kYGapTh = textparts_linespacing > 0 ?
+        textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
+        static_cast<int>(roundf(0.05 * resolution_));  // Default value.
+    if (part_box.x_overlap(neighbor_box) &&  // Location feature.
+        part_box.y_gap(neighbor_box) <= kYGapTh &&  // Line spacing.
+        // Geo feature.
+ static_cast<float>(MIN(part_box.height(), neighbor_box.height())) / + MAX(part_box.height(), neighbor_box.height()) > kHeightRatioTh) { + return true; + } + } + + return false; +} + +bool EquationDetect::CheckSeedBlobsCount(ColPartition* part) { + if (!part) { + return false; + } + const int kSeedMathBlobsCount = 2; + const int kSeedMathDigitBlobsCount = 5; + + int blobs = part->boxes_count(), + math_blobs = part->SpecialBlobsCount(BSTT_MATH), + digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT); + if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount || + math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) { + return false; + } + + return true; +} + +bool EquationDetect::CheckSeedDensity( + const float math_density_high, + const float math_density_low, + const ColPartition* part) const { + ASSERT_HOST(part); + float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH) + + part->SpecialBlobsDensity(BSTT_DIGIT); + float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC); + if (math_digit_density > math_density_high) { + return true; + } + if (math_digit_density + italic_density > kMathItalicDensityTh && + math_digit_density > math_density_low) { + return true; + } + + return false; +} + +EquationDetect::IndentType EquationDetect::IsIndented(ColPartition* part) { + ASSERT_HOST(part); + + ColPartitionGridSearch search(part_grid_); + ColPartition *neighbor = NULL; + const TBOX& part_box(part->bounding_box()); + const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_)); + const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_)); + const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_)); + + // Here we use a simple approximation algorithm: from the center of part, we + // perform the radius search, and check if we can find a neighboring partition + // that locates on the top/bottom left of part. 
+ search.StartRadSearch((part_box.left() + part_box.right()) / 2, + (part_box.top() + part_box.bottom()) / 2, kRadiusTh); + search.SetUniqueMode(true); + bool left_indented = false, right_indented = false; + while ((neighbor = search.NextRadSearch()) != NULL && + (!left_indented || !right_indented)) { + if (neighbor == part) { + continue; + } + const TBOX& neighbor_box(neighbor->bounding_box()); + + if (part_box.major_y_overlap(neighbor_box) && + part_box.x_gap(neighbor_box) < kXGapTh) { + // When this happens, it is likely part is a fragment of an + // over-segmented colpartition. So we return NO_INDENT. + return NO_INDENT; + } + + if (!IsTextOrEquationType(neighbor->type())) { + continue; + } + + // The neighbor should be above/below part, and overlap in x direction. + if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) { + continue; + } + + if (part_box.y_gap(neighbor_box) < kYGapTh) { + int left_gap = part_box.left() - neighbor_box.left(); + int right_gap = neighbor_box.right() - part_box.right(); + if (left_gap > kXGapTh) { + left_indented = true; + } + if (right_gap > kXGapTh) { + right_indented = true; + } + } + } + + if (left_indented && right_indented) { + return BOTH_INDENT; + } + if (left_indented) { + return LEFT_INDENT; + } + if (right_indented) { + return RIGHT_INDENT; + } + return NO_INDENT; +} + +bool EquationDetect::ExpandSeed(ColPartition* seed) { + if (seed == NULL || // This seed has been absorbed by other seeds. + seed->IsVerticalType()) { // We skip vertical type right now. + return false; + } + + // Expand in four directions. + GenericVector<ColPartition*> parts_to_merge; + ExpandSeedHorizontal(true, seed, &parts_to_merge); + ExpandSeedHorizontal(false, seed, &parts_to_merge); + ExpandSeedVertical(true, seed, &parts_to_merge); + ExpandSeedVertical(false, seed, &parts_to_merge); + SearchByOverlap(seed, &parts_to_merge); + + if (parts_to_merge.empty()) { // We don't find any partition to merge. 
+ return false; + } + + // Merge all partitions in parts_to_merge with seed. We first remove seed + // from part_grid_ as its bounding box is going to expand. Then we add it + // back after it absorbs all parts_to_merge partitions. + part_grid_->RemoveBBox(seed); + for (int i = 0; i < parts_to_merge.size(); ++i) { + ColPartition* part = parts_to_merge[i]; + if (part->type() == PT_EQUATION) { + // If part is in cp_seeds_, then we mark it as NULL so that we won't + // process it again. + for (int j = 0; j < cp_seeds_.size(); ++j) { + if (part == cp_seeds_[j]) { + cp_seeds_[j] = NULL; + break; + } + } + } + + // part has already been removed from part_grid_ in function + // ExpandSeedHorizontal/ExpandSeedVertical. + seed->Absorb(part, NULL); + } + + return true; +} + +void EquationDetect::ExpandSeedHorizontal( + const bool search_left, + ColPartition* seed, + GenericVector<ColPartition*>* parts_to_merge) { + ASSERT_HOST(seed != NULL && parts_to_merge != NULL); + const float kYOverlapTh = 0.6; + const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_)); + + ColPartitionGridSearch search(part_grid_); + const TBOX& seed_box(seed->bounding_box()); + int x = search_left ? seed_box.left() : seed_box.right(); + search.StartSideSearch(x, seed_box.bottom(), seed_box.top()); + search.SetUniqueMode(true); + + // Search iteratively. + ColPartition *part = NULL; + while ((part = search.NextSideSearch(search_left)) != NULL) { + if (part == seed) { + continue; + } + const TBOX& part_box(part->bounding_box()); + if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope. + break; + } + + // Check part location. + if ((part_box.left() >= seed_box.left() && search_left) || + (part_box.right() <= seed_box.right() && !search_left)) { + continue; + } + + if (part->type() != PT_EQUATION) { // Non-equation type. + // Skip PT_INLINE_EQUATION and non text type. 
+ if (part->type() == PT_INLINE_EQUATION || + (!IsTextOrEquationType(part->type()) && + part->blob_type() != BRT_HLINE)) { + continue; + } + // For other types, it should be the near small neighbor of seed. + if (!IsNearSmallNeighbor(seed_box, part_box) || + !CheckSeedNeighborDensity(part)) { + continue; + } + } else { // Equation type, check the y overlap. + if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh && + seed_box.y_overlap_fraction(part_box) < kYOverlapTh) { + continue; + } + } + + // Passed the check, delete it from search and add into parts_to_merge. + search.RemoveBBox(); + parts_to_merge->push_back(part); + } +} + +void EquationDetect::ExpandSeedVertical( + const bool search_bottom, + ColPartition* seed, + GenericVector<ColPartition*>* parts_to_merge) { + ASSERT_HOST(seed != NULL && parts_to_merge != NULL && + cps_super_bbox_ != NULL); + const float kXOverlapTh = 0.4; + const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_)); + + ColPartitionGridSearch search(part_grid_); + const TBOX& seed_box(seed->bounding_box()); + int y = search_bottom ? seed_box.bottom() : seed_box.top(); + search.StartVerticalSearch( + cps_super_bbox_->left(), cps_super_bbox_->right(), y); + search.SetUniqueMode(true); + + // Search iteratively. + ColPartition *part = NULL; + GenericVector<ColPartition*> parts; + int skipped_min_top = INT_MAX, skipped_max_bottom = -1; + while ((part = search.NextVerticalSearch(search_bottom)) != NULL) { + if (part == seed) { + continue; + } + const TBOX& part_box(part->bounding_box()); + + if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope. + break; + } + + // Check part location. + if ((part_box.bottom() >= seed_box.bottom() && search_bottom) || + (part_box.top() <= seed_box.top() && !search_bottom)) { + continue; + } + + bool skip_part = false; + if (part->type() != PT_EQUATION) { // Non-equation type. + // Skip PT_INLINE_EQUATION and non text type. 
+ if (part->type() == PT_INLINE_EQUATION || + (!IsTextOrEquationType(part->type()) && + part->blob_type() != BRT_HLINE)) { + skip_part = true; + } else if (!IsNearSmallNeighbor(seed_box, part_box) || + !CheckSeedNeighborDensity(part)) { + // For other types, it should be the near small neighbor of seed. + skip_part = true; + } + } else { // Equation type, check the x overlap. + if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh && + seed_box.x_overlap_fraction(part_box) < kXOverlapTh) { + skip_part = true; + } + } + if (skip_part) { + if (part->type() != PT_EQUATION) { + if (skipped_min_top > part_box.top()) { + skipped_min_top = part_box.top(); + } + if (skipped_max_bottom < part_box.bottom()) { + skipped_max_bottom = part_box.bottom(); + } + } + } else { + parts.push_back(part); + } + } + + // For every part in parts, we need to verify it is not above skipped_min_top + // when search top, or not below skipped_max_bottom when search bottom. I.e., + // we will skip a part if it looks like: + // search bottom | search top + // seed: ****************** | part: ********** + // skipped: xxx | skipped: xxx + // part: ********** | seed: *********** + for (int i = 0; i < parts.size(); i++) { + const TBOX& part_box(parts[i]->bounding_box()); + if ((search_bottom && part_box.top() <= skipped_max_bottom) || + (!search_bottom && part_box.bottom() >= skipped_min_top)) { + continue; + } + // Add parts[i] into parts_to_merge, and delete it from part_grid_. + parts_to_merge->push_back(parts[i]); + part_grid_->RemoveBBox(parts[i]); + } +} + +bool EquationDetect::IsNearSmallNeighbor(const TBOX& seed_box, + const TBOX& part_box) const { + const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_)); + const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_)); + + // Check geometric feature. + if (part_box.height() > seed_box.height() || + part_box.width() > seed_box.width()) { + return false; + } + + // Check overlap and distance. 
+ if ((!part_box.major_x_overlap(seed_box) || + part_box.y_gap(seed_box) > kYGapTh) && + (!part_box.major_y_overlap(seed_box) || + part_box.x_gap(seed_box) > kXGapTh)) { + return false; + } + + return true; +} + +bool EquationDetect::CheckSeedNeighborDensity(const ColPartition* part) const { + ASSERT_HOST(part); + if (part->boxes_count() < kSeedBlobsCountTh) { + // Too few blobs, skip the check. + return true; + } + + // We check the math blobs density and the unclear blobs density. + if (part->SpecialBlobsDensity(BSTT_MATH) + + part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 || + part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) { + return true; + } + + return false; +} + +void EquationDetect::ProcessMathBlockSatelliteParts() { + // Iterate over part_grid_, and find all parts that are text type but not + // equation type. + ColPartition *part = NULL; + GenericVector<ColPartition*> text_parts; + ColPartitionGridSearch gsearch(part_grid_); + gsearch.StartFullSearch(); + while ((part = gsearch.NextFullSearch()) != NULL) { + if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) { + text_parts.push_back(part); + } + } + if (text_parts.empty()) { + return; + } + + // Compute the median height of the text_parts. + text_parts.sort(&SortCPByHeight); + const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box(); + int med_height = text_box.height(); + if (text_parts.size() % 2 == 0 && text_parts.size() > 1) { + const TBOX& text_box = + text_parts[text_parts.size() / 2 - 1]->bounding_box(); + med_height = static_cast<int>(roundf( + 0.5 * (text_box.height() + med_height))); + } + + // Iterate every text_parts and check if it is a math block satellite. + for (int i = 0; i < text_parts.size(); ++i) { + const TBOX& text_box(text_parts[i]->bounding_box()); + if (text_box.height() > med_height) { + continue; + } + GenericVector<ColPartition*> math_blocks; + if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) { + continue; + } + + // Found. 
merge text_parts[i] with math_blocks. + part_grid_->RemoveBBox(text_parts[i]); + text_parts[i]->set_type(PT_EQUATION); + for (int j = 0; j < math_blocks.size(); ++j) { + part_grid_->RemoveBBox(math_blocks[j]); + text_parts[i]->Absorb(math_blocks[j], NULL); + } + InsertPartAfterAbsorb(text_parts[i]); + } +} + +bool EquationDetect::IsMathBlockSatellite( + ColPartition* part, GenericVector<ColPartition*>* math_blocks) { + ASSERT_HOST(part != NULL && math_blocks != NULL); + math_blocks->clear(); + const TBOX& part_box(part->bounding_box()); + // Find the top/bottom nearest neighbor of part. + ColPartition *neighbors[2]; + int y_gaps[2] = {INT_MAX, INT_MAX}; + // The horizontal boundary of the neighbors. + int neighbors_left = INT_MAX, neighbors_right = 0; + for (int i = 0; i < 2; ++i) { + neighbors[i] = SearchNNVertical(i != 0, part); + if (neighbors[i]) { + const TBOX& neighbor_box = neighbors[i]->bounding_box(); + y_gaps[i] = neighbor_box.y_gap(part_box); + if (neighbor_box.left() < neighbors_left) { + neighbors_left = neighbor_box.left(); + } + if (neighbor_box.right() > neighbors_right) { + neighbors_right = neighbor_box.right(); + } + } + } + if (neighbors[0] == neighbors[1]) { + // This happens when part is inside neighbor. + neighbors[1] = NULL; + y_gaps[1] = INT_MAX; + } + + // Check if part is within [neighbors_left, neighbors_right]. + if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) { + return false; + } + + // Get the index of the near one in neighbors. + int index = y_gaps[0] < y_gaps[1] ? 0 : 1; + + // Check the near one. + if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) { + math_blocks->push_back(neighbors[index]); + } else { + // If the near one failed the check, then we skip checking the far one. + return false; + } + + // Check the far one. 
+ index = 1 - index; + if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) { + math_blocks->push_back(neighbors[index]); + } + + return true; +} + +ColPartition* EquationDetect::SearchNNVertical( + const bool search_bottom, const ColPartition* part) { + ASSERT_HOST(part); + ColPartition *nearest_neighbor = NULL, *neighbor = NULL; + const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5)); + + ColPartitionGridSearch search(part_grid_); + search.SetUniqueMode(true); + const TBOX& part_box(part->bounding_box()); + int y = search_bottom ? part_box.bottom() : part_box.top(); + search.StartVerticalSearch(part_box.left(), part_box.right(), y); + int min_y_gap = INT_MAX; + while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) { + if (neighbor == part || !IsTextOrEquationType(neighbor->type())) { + continue; + } + const TBOX& neighbor_box(neighbor->bounding_box()); + int y_gap = neighbor_box.y_gap(part_box); + if (y_gap > kYGapTh) { // Out of scope. + break; + } + if (!neighbor_box.major_x_overlap(part_box) || + (search_bottom && neighbor_box.bottom() > part_box.bottom()) || + (!search_bottom && neighbor_box.top() < part_box.top())) { + continue; + } + if (y_gap < min_y_gap) { + min_y_gap = y_gap; + nearest_neighbor = neighbor; + } + } + + return nearest_neighbor; +} + +bool EquationDetect::IsNearMathNeighbor( + const int y_gap, const ColPartition *neighbor) const { + if (!neighbor) { + return false; + } + const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1)); + return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh; +} + +void EquationDetect::GetOutputTiffName(const char* name, + STRING* image_name) const { + ASSERT_HOST(image_name && name); + char page[50]; + snprintf(page, sizeof(page), "%04d", page_count_); + *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif"; +} + +void EquationDetect::PaintSpecialTexts(const STRING& outfile) const { + Pix *pix = NULL, *pixBi = lang_tesseract_->pix_binary(); + pix = 
pixConvertTo32(pixBi); + ColPartitionGridSearch gsearch(part_grid_); + ColPartition* part = NULL; + gsearch.StartFullSearch(); + while ((part = gsearch.NextFullSearch()) != NULL) { + BLOBNBOX_C_IT blob_it(part->boxes()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + RenderSpecialText(pix, blob_it.data()); + } + } + + pixWrite(outfile.string(), pix, IFF_TIFF_LZW); + pixDestroy(&pix); +} + +void EquationDetect::PaintColParts(const STRING& outfile) const { + Pix *pix = pixConvertTo32(lang_tesseract_->BestPix()); + ColPartitionGridSearch gsearch(part_grid_); + gsearch.StartFullSearch(); + ColPartition* part = NULL; + while ((part = gsearch.NextFullSearch()) != NULL) { + const TBOX& tbox = part->bounding_box(); + Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(), + tbox.width(), tbox.height()); + if (part->type() == PT_EQUATION) { + pixRenderBoxArb(pix, box, 5, 255, 0, 0); + } else if (part->type() == PT_INLINE_EQUATION) { + pixRenderBoxArb(pix, box, 5, 0, 255, 0); + } else { + pixRenderBoxArb(pix, box, 5, 0, 0, 255); + } + boxDestroy(&box); + } + + pixWrite(outfile.string(), pix, IFF_TIFF_LZW); + pixDestroy(&pix); +} + +void EquationDetect::PrintSpecialBlobsDensity(const ColPartition* part) const { + ASSERT_HOST(part); + TBOX box(part->bounding_box()); + int h = pixGetHeight(lang_tesseract_->BestPix()); + tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ", + h - box.top(), h - box.bottom()); + box.print(); + tprintf("blobs count = %d, density = ", part->boxes_count()); + for (int i = 0; i < BSTT_COUNT; ++i) { + BlobSpecialTextType type = static_cast<BlobSpecialTextType>(i); + tprintf("%d:%f ", i, part->SpecialBlobsDensity(type)); + } + tprintf("\n"); +} + +}; // namespace tesseract diff --git a/ccmain/equationdetect.h b/ccmain/equationdetect.h new file mode 100644 index 000000000..09b553963 --- /dev/null +++ b/ccmain/equationdetect.h @@ -0,0 +1,277 @@ 
+/////////////////////////////////////////////////////////////////////// +// File: equationdetect.h +// Description: The equation detection class that inherits equationdetectbase. +// Author: Zongyi (Joe) Liu (joeliu@google.com) +// Created: Fri Aug 31 11:13:01 PST 2011 +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H__ +#define TESSERACT_CCMAIN_EQUATIONDETECT_H__ + +#include "blobbox.h" +#include "equationdetectbase.h" +#include "genericvector.h" +#include "unichar.h" + +class BLOBNBOX; +class BLOB_CHOICE; +class BLOB_CHOICE_LIST; +class TO_BLOCK_LIST; +class TBOX; +class UNICHARSET; + +namespace tesseract { + +class Tesseract; +class ColPartition; +class ColPartitionGrid; +class ColPartitionSet; + +class EquationDetect : public EquationDetectBase { + public: + EquationDetect(const char* equ_datapath, + const char* equ_language); + ~EquationDetect(); + + enum IndentType { + NO_INDENT, + LEFT_INDENT, + RIGHT_INDENT, + BOTH_INDENT, + INDENT_TYPE_COUNT + }; + + // Reset the lang_tesseract_ pointer. This function should be called before we + // do any detector work. + void SetLangTesseract(Tesseract* lang_tesseract); + + // Iterate over the blobs inside to_block, and set the blobs that we want to + // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function + // returns 0 upon success. 
+ int LabelSpecialText(TO_BLOCK* to_block); + + // Find possible equation partitions from part_grid. Should be called + // after the special_text_type of blobs are set. + // It returns 0 upon success. + int FindEquationParts(ColPartitionGrid* part_grid, + ColPartitionSet** best_columns); + + // Reset the resolution of the processing image. TEST only function. + void SetResolution(const int resolution); + + protected: + // Identify the special text type for one blob, and update its field. When + // height_th is set (> 0), we will label the blob as BSTT_NONE if its height + // is less than height_th. + void IdentifySpecialText(BLOBNBOX *blob, const int height_th); + + // Estimate the type for one unichar. + BlobSpecialTextType EstimateTypeForUnichar( + const UNICHARSET& unicharset, const UNICHAR_ID id) const; + + // Compute special text type for each blobs in part_grid_. + void IdentifySpecialText(); + + // Identify blobs that we want to skip during special blob type + // classification. + void IdentifyBlobsToSkip(ColPartition* part); + + // The ColPartitions in part_grid_ maybe over-segmented, particularly in the + // block equation regions. So we like to identify these partitions and merge + // them before we do the searching. + void MergePartsByLocation(); + + // Starting from the seed center, we do radius search. And for partitions that + // have large overlaps with seed, we remove them from part_grid_ and add into + // parts_overlap. Note: this function may update the part_grid_, so if the + // caller is also running ColPartitionGridSearch, use the RepositionIterator + // to continue. + void SearchByOverlap(ColPartition* seed, + GenericVector<ColPartition*>* parts_overlap); + + // Insert part back into part_grid_, after it absorbs some other parts. + void InsertPartAfterAbsorb(ColPartition* part); + + // Identify the colpartitions in part_grid_, label them as PT_EQUATION, and + // save them into cp_seeds_. 
+ void IdentifySeedParts(); + + // Check the blobs count for a seed region candidate. + bool CheckSeedBlobsCount(ColPartition* part); + + // Compute the foreground pixel density for a tbox area. + float ComputeForegroundDensity(const TBOX& tbox); + + // Check if part from seed2 label: with low math density and left indented. We + // are using two checks: + // 1. If its left is aligned with any coordinates in indented_texts_left, + // which we assume have been sorted. + // 2. If its foreground density is over foreground_density_th. + bool CheckForSeed2( + const GenericVector<int>& indented_texts_left, + const float foreground_density_th, + ColPartition* part); + + // Count the number of values in sorted_vec that is close to val, used to + // check if a partition is aligned with text partitions. + int CountAlignment( + const GenericVector<int>& sorted_vec, const int val) const; + + // Check for a seed candidate using the foreground pixel density. And we + // return true if the density is below a certain threshold, because characters + // in equation regions usually are apart with more white spaces. + bool CheckSeedFgDensity(const float density_th, ColPartition* part); + + // A light version of SplitCPHor: instead of really doing the part split, we + // simply compute the union bounding box of each splitted part. + void SplitCPHorLite(ColPartition* part, GenericVector<TBOX>* splitted_boxes); + + // Split the part (horizontally), and save the splitted result into + // parts_splitted. Note that it is caller's responsibility to release the + // memory owned by parts_splitted. On the other hand, the part is unchanged + // during this process and still owns the blobs, so do NOT call DeleteBoxes + // when freeing the colpartitions in parts_splitted. + void SplitCPHor(ColPartition* part, + GenericVector<ColPartition*>* parts_splitted); + + // Check the density for a seed candidate (part) using its math density and + // italic density, returns true if the check passed. 
+ bool CheckSeedDensity(const float math_density_high, + const float math_density_low, + const ColPartition* part) const; + + // Check if part is indented. + IndentType IsIndented(ColPartition* part); + + // Identify inline partitions from cp_seeds_, and re-label them. + void IdentifyInlineParts(); + + // Compute the super bounding box for all colpartitions inside part_grid_. + void ComputeCPsSuperBBox(); + + // Identify inline partitions from cp_seeds_ using the horizontal search. + void IdentifyInlinePartsHorizontal(); + + // Estimate the line spacing between two text partitions. Returns -1 if not + // enough data. + int EstimateTextPartLineSpacing(); + + // Identify inline partitions from cp_seeds_ using vertical search. + void IdentifyInlinePartsVertical(const bool top_to_bottom, + const int textPartsLineSpacing); + + // Check if part is an inline equation zone. This should be called after we + // identified the seed regions. + bool IsInline(const bool search_bottom, + const int textPartsLineSpacing, + ColPartition* part); + + // For a given seed partition, we search the part_grid_ and see if there is + // any partition can be merged with it. It returns true if the seed has been + // expanded. + bool ExpandSeed(ColPartition* seed); + + // Starting from the seed position, we search the part_grid_ + // horizontally/vertically, find all partitions that can be + // merged with seed, remove them from part_grid_, and put them into + // parts_to_merge. + void ExpandSeedHorizontal(const bool search_left, + ColPartition* seed, + GenericVector<ColPartition*>* parts_to_merge); + void ExpandSeedVertical(const bool search_bottom, + ColPartition* seed, + GenericVector<ColPartition*>* parts_to_merge); + + // Check if a part_box is the small neighbor of seed_box. + bool IsNearSmallNeighbor(const TBOX& seed_box, + const TBOX& part_box) const; + + // Perform the density check for part, which we assume is nearing a seed + // partition. It returns true if the check passed. 
+ bool CheckSeedNeighborDensity(const ColPartition* part) const; + + // After identify the math blocks, we do one more scanning on all text + // partitions, and check if any of them is the satellite of: + // math blocks: here a p is the satellite of q if: + // 1. q is the nearest vertical neighbor of p, and + // 2. y_gap(p, q) is less than a threshold, and + // 3. x_overlap(p, q) is over a threshold. + // Note that p can be the satellites of two blocks: its top neighbor and + // bottom neighbor. + void ProcessMathBlockSatelliteParts(); + + // Check if part is the satellite of one/two math blocks. If it is, we return + // true, and save the blocks into math_blocks. + bool IsMathBlockSatellite( + ColPartition* part, GenericVector<ColPartition*>* math_blocks); + + // Search the nearest neighbor of part in one vertical direction as defined in + // search_bottom. It returns the neighbor found that major x overlap with it, + // or NULL when not found. + ColPartition* SearchNNVertical(const bool search_bottom, + const ColPartition* part); + + // Check if the neighbor with vertical distance of y_gap is a near and math + // block partition. + bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const; + + // Generate the tiff file name for output/debug file. + void GetOutputTiffName(const char* name, STRING* image_name) const; + + // Debugger function that renders ColPartitions on the input image, where: + // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION + // will be painted in green, and other parts will be painted in blue. + void PaintColParts(const STRING& outfile) const; + + // Debugger function that renders the blobs in part_grid_ over the input + // image. + void PaintSpecialTexts(const STRING& outfile) const; + + // Debugger function that print the math blobs density values for a + // ColPartition object. + void PrintSpecialBlobsDensity(const ColPartition* part) const; + + // The tesseract engine initialized from equation training data. 
+ Tesseract* equ_tesseract_; + + // The tesseract engine used for OCR. This pointer is passed in by the caller, + // so do NOT destroy it in this class. + Tesseract* lang_tesseract_; + + // The ColPartitionGrid that we are processing. This pointer is passed in from + // the caller, so do NOT destroy it in the class. + ColPartitionGrid* part_grid_; + + // A simple array of pointers to the best assigned column division at + // each grid y coordinate. This pointer is passed in from the caller, so do + // NOT destroy it in the class. + ColPartitionSet** best_columns_; + + // The super bounding box of all cps in the part_grid_. + TBOX* cps_super_bbox_; + + // The seed ColPartition for equation region. + GenericVector<ColPartition*> cp_seeds_; + + // The resolution (dpi) of the processing image. + int resolution_; + + // The number of pages we have processed. + int page_count_; +}; + +} // namespace tesseract + +#endif // TESSERACT_CCMAIN_EQUATIONDETECT_H__ diff --git a/textord/equationdetectbase.cpp b/textord/equationdetectbase.cpp new file mode 100644 index 000000000..29a9c8481 --- /dev/null +++ b/textord/equationdetectbase.cpp @@ -0,0 +1,65 @@ +/////////////////////////////////////////////////////////////////////// +// File: equationdetectbase.cpp +// Description: The base class equation detection class. +// Author: Zongyi (Joe) Liu (joeliu@google.com) +// Created: Fri Aug 31 11:13:01 PST 2011 +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +/////////////////////////////////////////////////////////////////////// + +#include "allheaders.h" +#include "blobbox.h" +#include "equationdetectbase.h" + +namespace tesseract { + +EquationDetectBase::EquationDetectBase() { +} + +EquationDetectBase::~EquationDetectBase() { +} + +void EquationDetectBase::RenderSpecialText(Pix* pix, + BLOBNBOX* blob) { + ASSERT_HOST(pix != NULL && pixGetDepth(pix) == 32 && blob != NULL); + const TBOX& tbox = blob->bounding_box(); + int height = pixGetHeight(pix); + const int box_width = 5; + + // Coordinate translation: tesseract uses left bottom as the origin, while + // leptonica uses left top as the origin. + Box *box = boxCreate(tbox.left(), height - tbox.top(), + tbox.width(), tbox.height()); + switch (blob->special_text_type()) { + case BSTT_MATH: // Red box. + pixRenderBoxArb(pix, box, box_width, 255, 0, 0); + break; + case BSTT_DIGIT: // cyan box. + pixRenderBoxArb(pix, box, box_width, 0, 255, 255); + break; + case BSTT_ITALIC: // Green box. + pixRenderBoxArb(pix, box, box_width, 0, 255, 0); + break; + case BSTT_UNCLEAR: // blue box. + pixRenderBoxArb(pix, box, box_width, 0, 0, 255); + break; + case BSTT_NONE: + default: + // yellow box. + pixRenderBoxArb(pix, box, box_width, 255, 255, 0); + break; + } + boxDestroy(&box); +} + +}; // namespace tesseract diff --git a/textord/equationdetectbase.h b/textord/equationdetectbase.h new file mode 100644 index 000000000..d47c74a53 --- /dev/null +++ b/textord/equationdetectbase.h @@ -0,0 +1,59 @@ +/////////////////////////////////////////////////////////////////////// +// File: equationdetectbase.h +// Description: The base class equation detection class. +// Author: Zongyi (Joe) Liu (joeliu@google.com) +// Created: Fri Aug 31 11:13:01 PST 2011 +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_TEXTORD_EQUATIONDETECTBASE_H__ +#define TESSERACT_TEXTORD_EQUATIONDETECTBASE_H__ + +class BLOBNBOX_LIST; +class TO_BLOCK; +struct Pix; + +namespace tesseract { + +class ColPartitionGrid; +class ColPartitionSet; + +class EquationDetectBase { + public: + EquationDetectBase(); + virtual ~EquationDetectBase(); + + // Iterate over the blobs inside to_block, and set the blobs that we want to + // process to BSTT_NONE. (By default, they should be BSTT_SKIP). The function + // returns 0 upon success. + virtual int LabelSpecialText(TO_BLOCK* to_block) = 0; + + // Interface to find possible equation partition grid from part_grid. This + // should be called after IdentifySpecialText function. + virtual int FindEquationParts(ColPartitionGrid* part_grid, + ColPartitionSet** best_columns) = 0; + + // Debug function: Render a bounding box on pix based on the value of its + // special_text_type, specifically: + // BSTT_MATH: red box + // BSTT_DIGIT: cyan box + // BSTT_ITALIC: green box + // BSTT_UNCLEAR: blue box + // All others: yellow box + static void RenderSpecialText(Pix* pix, BLOBNBOX* blob); +}; + +}; // namespace tesseract + +#endif // TESSERACT_TEXTORD_EQUATIONDETECTBASE_H__