/********************************************************************** * File: paragraphs.cpp * Description: Paragraph detection for tesseract. * Author: David Eger * Created: 25 February 2011 * * (C) Copyright 2011, Google Inc. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #include "paragraphs.h" #include // for isspace #include // for abs #include // for snprintf #include // for abs #include // for strchr, strlen #include // for max #include // for unique_ptr #include "genericvector.h" // for GenericVector, GenericVectorEqEq #include "helpers.h" // for UpdateRange, ClipToRange #include "host.h" // for NearlyEqual #include "mutableiterator.h" // for MutableIterator #include "ocrblock.h" // for BLOCK #include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA... #include "ocrrow.h" // for ROW #include "pageiterator.h" // for PageIterator #include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO... #include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels #include "pdblock.h" // for PDBLK #include "polyblk.h" // for POLY_BLOCK #include "publictypes.h" // for JUSTIFICATION_LEFT, JUSTIFICATION_R... 
#include "ratngs.h"       // for WERD_CHOICE
#include "rect.h"         // for TBOX
#include "statistc.h"     // for STATS
#include "strngs.h"       // for STRING
#include "tprintf.h"      // for tprintf
#include "unichar.h"      // for UNICHAR, UNICHAR_ID
#include "unicharset.h"   // for UNICHARSET
#include "unicodes.h"     // for kPDF, kRLE
#include "werd.h"         // for WERD, W_REP_CHAR

namespace tesseract {

// Special "weak" ParagraphModels.
// These are sentinel addresses that the code in this file compares by
// pointer identity; presumably they are never dereferenced.
const ParagraphModel *kCrownLeft
    = reinterpret_cast<ParagraphModel *>(0xDEAD111F);
const ParagraphModel *kCrownRight
    = reinterpret_cast<ParagraphModel *>(0xDEAD888F);

// Do the text and geometry of two rows support a paragraph break between them?
static bool LikelyParagraphStart(const RowScratchRegisters &before,
                                 const RowScratchRegisters &after,
                                 tesseract::ParagraphJustification j);

// Given the width of a typical space between words, return the threshold
// by which we think left and right alignments for paragraphs can vary and
// still be considered aligned (four fifths of the space, rounded toward 0).
static int Epsilon(int space_pix) {
  return (space_pix * 4) / 5;
}

// Validate the half-open row range rows[row_start, row_end).
// Returns false (always logging) when the range does not lie inside *rows,
// and false (logging only when debug_level > 1) when the range holds fewer
// than min_num_rows rows.  function_name is used only for the log message.
static bool AcceptableRowArgs(
    int debug_level, int min_num_rows, const char *function_name,
    const GenericVector<RowScratchRegisters> *rows,
    int row_start, int row_end) {
  const bool range_ok =
      row_start >= 0 && row_end <= rows->size() && row_start <= row_end;
  if (!range_ok) {
    tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
            row_start, row_end, rows->size());
    return false;
  }
  const int rows_available = row_end - row_start;
  if (rows_available >= min_num_rows) return true;
  if (debug_level > 1) {
    tprintf("# Too few rows[%d, %d) for %s.\n",
            row_start, row_end, function_name);
  }
  return false;
}

// =============================== Debug Code ================================

// Convert an integer to a decimal string.
static STRING StrOf(int num) {
  char digits[30];
  snprintf(digits, sizeof(digits), "%d", num);
  return STRING(digits);
}

// Given a row-major matrix of unicode text and a column separator, print
// a formatted table.  For ASCII, we get good column alignment.
static void PrintTable(const GenericVector > &rows, const STRING &colsep) { GenericVector max_col_widths; for (int r = 0; r < rows.size(); r++) { int num_columns = rows[r].size(); for (int c = 0; c < num_columns; c++) { int num_unicodes = 0; for (int i = 0; i < rows[r][c].size(); i++) { if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++; } if (c >= max_col_widths.size()) { max_col_widths.push_back(num_unicodes); } else { if (num_unicodes > max_col_widths[c]) max_col_widths[c] = num_unicodes; } } } GenericVector col_width_patterns; for (int c = 0; c < max_col_widths.size(); c++) { col_width_patterns.push_back( STRING("%-") + StrOf(max_col_widths[c]) + "s"); } for (int r = 0; r < rows.size(); r++) { for (int c = 0; c < rows[r].size(); c++) { if (c > 0) tprintf("%s", colsep.string()); tprintf(col_width_patterns[c].string(), rows[r][c].string()); } tprintf("\n"); } } static STRING RtlEmbed(const STRING &word, bool rtlify) { if (rtlify) return STRING(kRLE) + word + STRING(kPDF); return word; } // Print the current thoughts of the paragraph detector. static void PrintDetectorState(const ParagraphTheory &theory, const GenericVector &rows) { GenericVector > output; output.push_back(GenericVector()); output.back().push_back("#row"); output.back().push_back("space"); output.back().push_back(".."); output.back().push_back("lword[widthSEL]"); output.back().push_back("rword[widthSEL]"); RowScratchRegisters::AppendDebugHeaderFields(&output.back()); output.back().push_back("text"); for (int i = 0; i < rows.size(); i++) { output.push_back(GenericVector()); GenericVector &row = output.back(); const RowInfo& ri = *rows[i].ri_; row.push_back(StrOf(i)); row.push_back(StrOf(ri.average_interword_space)); row.push_back(ri.has_leaders ? ".." : " "); row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + StrOf(ri.lword_box.width()) + (ri.lword_likely_starts_idea ? "S" : "s") + (ri.lword_likely_ends_idea ? "E" : "e") + (ri.lword_indicates_list_item ? 
"L" : "l") + "]"); row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + StrOf(ri.rword_box.width()) + (ri.rword_likely_starts_idea ? "S" : "s") + (ri.rword_likely_ends_idea ? "E" : "e") + (ri.rword_indicates_list_item ? "L" : "l") + "]"); rows[i].AppendDebugInfo(theory, &row); row.push_back(RtlEmbed(ri.text, !ri.ltr)); } PrintTable(output, " "); tprintf("Active Paragraph Models:\n"); for (int m = 0; m < theory.models().size(); m++) { tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string()); } } static void DebugDump( bool should_print, const STRING &phase, const ParagraphTheory &theory, const GenericVector &rows) { if (!should_print) return; tprintf("# %s\n", phase.string()); PrintDetectorState(theory, rows); } // Print out the text for rows[row_start, row_end) static void PrintRowRange(const GenericVector &rows, int row_start, int row_end) { tprintf("======================================\n"); for (int row = row_start; row < row_end; row++) { tprintf("%s\n", rows[row].ri_->text.string()); } tprintf("======================================\n"); } // ============= Brain Dead Language Model (ASCII Version) =================== static bool IsLatinLetter(int ch) { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); } static bool IsDigitLike(int ch) { return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I'; } static bool IsOpeningPunct(int ch) { return strchr("'\"({[", ch) != nullptr; } static bool IsTerminalPunct(int ch) { return strchr(":'\".?!]})", ch) != nullptr; } // Return a pointer after consuming as much text as qualifies as roman numeral. 
static const char *SkipChars(const char *str, const char *toskip) { while (*str != '\0' && strchr(toskip, *str)) { str++; } return str; } static const char *SkipChars(const char *str, bool (*skip)(int)) { while (*str != '\0' && skip(*str)) { str++; } return str; } static const char *SkipOne(const char *str, const char *toskip) { if (*str != '\0' && strchr(toskip, *str)) return str + 1; return str; } // Return whether it is very likely that this is a numeral marker that could // start a list item. Some examples include: // A I iii. VI (2) 3.5. [C-4] static bool LikelyListNumeral(const STRING &word) { const char *kRomans = "ivxlmdIVXLMD"; const char *kDigits = "012345789"; const char *kOpen = "[{("; const char *kSep = ":;-.,"; const char *kClose = "]})"; int num_segments = 0; const char *pos = word.string(); while (*pos != '\0' && num_segments < 3) { // skip up to two open parens. const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen); const char *numeral_end = SkipChars(numeral_start, kRomans); if (numeral_end != numeral_start) { // Got Roman Numeral. Great. } else { numeral_end = SkipChars(numeral_start, kDigits); if (numeral_end == numeral_start) { // If there's a single latin letter, we can use that. numeral_end = SkipChars(numeral_start, IsLatinLetter); if (numeral_end - numeral_start != 1) break; } } // We got some sort of numeral. num_segments++; // Skip any trailing parens or punctuation. pos = SkipChars(SkipChars(numeral_end, kClose), kSep); if (pos == numeral_end) break; } return *pos == '\0'; } static bool LikelyListMark(const STRING &word) { const char *kListMarks = "0Oo*.,+."; return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr; } bool AsciiLikelyListItem(const STRING &word) { return LikelyListMark(word) || LikelyListNumeral(word); } // ========== Brain Dead Language Model (Tesseract Version) ================ // Return the first Unicode Codepoint from werd[pos]. 
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) { if (!u || !werd || pos > werd->length()) return 0; return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni(); } // A useful helper class for finding the first j >= i so that word[j] // does not have given character type. class UnicodeSpanSkipper { public: UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word) : u_(unicharset), word_(word) { wordlen_ = word->length(); } // Given an input position, return the first position >= pos not punc. int SkipPunc(int pos); // Given an input position, return the first position >= pos not digit. int SkipDigits(int pos); // Given an input position, return the first position >= pos not roman. int SkipRomans(int pos); // Given an input position, return the first position >= pos not alpha. int SkipAlpha(int pos); private: const UNICHARSET *u_; const WERD_CHOICE *word_; int wordlen_; }; int UnicodeSpanSkipper::SkipPunc(int pos) { while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++; return pos; } int UnicodeSpanSkipper::SkipDigits(int pos) { while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++; return pos; } int UnicodeSpanSkipper::SkipRomans(int pos) { const char *kRomans = "ivxlmdIVXLMD"; while (pos < wordlen_) { int ch = UnicodeFor(u_, word_, pos); if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) break; pos++; } return pos; } int UnicodeSpanSkipper::SkipAlpha(int pos) { while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++; return pos; } static bool LikelyListMarkUnicode(int ch) { if (ch < 0x80) { STRING single_ch; single_ch += ch; return LikelyListMark(single_ch); } switch (ch) { // TODO(eger) expand this list of unicodes as needed. 
case 0x00B0: // degree sign case 0x2022: // bullet case 0x25E6: // white bullet case 0x00B7: // middle dot case 0x25A1: // white square case 0x25A0: // black square case 0x25AA: // black small square case 0x2B1D: // black very small square case 0x25BA: // black right-pointing pointer case 0x25CF: // black circle case 0x25CB: // white circle return true; default: break; // fall through } return false; } // Return whether it is very likely that this is a numeral marker that could // start a list item. Some examples include: // A I iii. VI (2) 3.5. [C-4] static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) { if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) return true; UnicodeSpanSkipper m(u, werd); int num_segments = 0; int pos = 0; while (pos < werd->length() && num_segments < 3) { int numeral_start = m.SkipPunc(pos); if (numeral_start > pos + 1) break; int numeral_end = m.SkipRomans(numeral_start); if (numeral_end == numeral_start) { numeral_end = m.SkipDigits(numeral_start); if (numeral_end == numeral_start) { // If there's a single latin letter, we can use that. numeral_end = m.SkipAlpha(numeral_start); if (numeral_end - numeral_start != 1) break; } } // We got some sort of numeral. num_segments++; // Skip any trailing punctuation. pos = m.SkipPunc(numeral_end); if (pos == numeral_end) break; } return pos == werd->length(); } // ========= Brain Dead Language Model (combined entry points) ================ // Given the leftmost word of a line either as a Tesseract unicharset + werd // or a utf8 string, set the following attributes for it: // is_list - this word might be a list number or bullet. // starts_idea - this word is likely to start a sentence. // ends_idea - this word is likely to end a sentence. 
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea) { *is_list = false; *starts_idea = false; *ends_idea = false; if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty *ends_idea = true; return; } if (unicharset && werd) { // We have a proper werd and unicharset so use it. if (UniLikelyListItem(unicharset, werd)) { *is_list = true; *starts_idea = true; *ends_idea = true; } if (unicharset->get_isupper(werd->unichar_id(0))) { *starts_idea = true; } if (unicharset->get_ispunctuation(werd->unichar_id(0))) { *starts_idea = true; *ends_idea = true; } } else { // Assume utf8 is mostly ASCII if (AsciiLikelyListItem(utf8)) { *is_list = true; *starts_idea = true; } int start_letter = utf8[0]; if (IsOpeningPunct(start_letter)) { *starts_idea = true; } if (IsTerminalPunct(start_letter)) { *ends_idea = true; } if (start_letter >= 'A' && start_letter <= 'Z') { *starts_idea = true; } } } // Given the rightmost word of a line either as a Tesseract unicharset + werd // or a utf8 string, set the following attributes for it: // is_list - this word might be a list number or bullet. // starts_idea - this word is likely to start a sentence. // ends_idea - this word is likely to end a sentence. void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea) { *is_list = false; *starts_idea = false; *ends_idea = false; if (utf8.size() == 0 || (werd != nullptr && werd->length() == 0)) { // Empty *ends_idea = true; return; } if (unicharset && werd) { // We have a proper werd and unicharset so use it. 
if (UniLikelyListItem(unicharset, werd)) { *is_list = true; *starts_idea = true; } UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1); if (unicharset->get_ispunctuation(last_letter)) { *ends_idea = true; } } else { // Assume utf8 is mostly ASCII if (AsciiLikelyListItem(utf8)) { *is_list = true; *starts_idea = true; } int last_letter = utf8[utf8.size() - 1]; if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) { *ends_idea = true; } } } // =============== Implementation of RowScratchRegisters ===================== /* static */ void RowScratchRegisters::AppendDebugHeaderFields( GenericVector *header) { header->push_back("[lmarg,lind;rind,rmarg]"); header->push_back("model"); } void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory, GenericVector *dbg) const { char s[30]; snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", lmargin_, lindent_, rindent_, rmargin_); dbg->push_back(s); STRING model_string; model_string += static_cast(GetLineType()); model_string += ":"; int model_numbers = 0; for (int h = 0; h < hypotheses_.size(); h++) { if (hypotheses_[h].model == nullptr) continue; if (model_numbers > 0) model_string += ","; if (StrongModel(hypotheses_[h].model)) { model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model)); } else if (hypotheses_[h].model == kCrownLeft) { model_string += "CrL"; } else if (hypotheses_[h].model == kCrownRight) { model_string += "CrR"; } model_numbers++; } if (model_numbers == 0) model_string += "0"; dbg->push_back(model_string); } void RowScratchRegisters::Init(const RowInfo &row) { ri_ = &row; lmargin_ = 0; lindent_ = row.pix_ldistance; rmargin_ = 0; rindent_ = row.pix_rdistance; } LineType RowScratchRegisters::GetLineType() const { if (hypotheses_.empty()) return LT_UNKNOWN; bool has_start = false; bool has_body = false; for (int i = 0; i < hypotheses_.size(); i++) { switch (hypotheses_[i].ty) { case LT_START: has_start = true; break; case LT_BODY: has_body = true; break; default: 
tprintf("Encountered bad value in hypothesis list: %c\n", hypotheses_[i].ty); break; } } if (has_start && has_body) return LT_MULTIPLE; return has_start ? LT_START : LT_BODY; } LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const { if (hypotheses_.empty()) return LT_UNKNOWN; bool has_start = false; bool has_body = false; for (int i = 0; i < hypotheses_.size(); i++) { if (hypotheses_[i].model != model) continue; switch (hypotheses_[i].ty) { case LT_START: has_start = true; break; case LT_BODY: has_body = true; break; default: tprintf("Encountered bad value in hypothesis list: %c\n", hypotheses_[i].ty); break; } } if (has_start && has_body) return LT_MULTIPLE; return has_start ? LT_START : LT_BODY; } void RowScratchRegisters::SetStartLine() { LineType current_lt = GetLineType(); if (current_lt != LT_UNKNOWN && current_lt != LT_START) { tprintf("Trying to set a line to be START when it's already BODY.\n"); } if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr)); } } void RowScratchRegisters::SetBodyLine() { LineType current_lt = GetLineType(); if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { tprintf("Trying to set a line to be BODY when it's already START.\n"); } if (current_lt == LT_UNKNOWN || current_lt == LT_START) { hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr)); } } void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { hypotheses_.push_back_new(LineHypothesis(LT_START, model)); int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr)); if (old_idx >= 0) hypotheses_.remove(old_idx); } void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr)); if (old_idx >= 0) hypotheses_.remove(old_idx); } void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { for (int h = 0; h < 
hypotheses_.size(); h++) { if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) models->push_back_new(hypotheses_[h].model); } } void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { for (int h = 0; h < hypotheses_.size(); h++) { if (StrongModel(hypotheses_[h].model)) models->push_back_new(hypotheses_[h].model); } } void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { for (int h = 0; h < hypotheses_.size(); h++) { if (hypotheses_[h].model != nullptr) models->push_back_new(hypotheses_[h].model); } } const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) return nullptr; return hypotheses_[0].model; } const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) return nullptr; return hypotheses_[0].model; } // Discard any hypotheses whose model is not in the given list. void RowScratchRegisters::DiscardNonMatchingHypotheses( const SetOfModels &models) { if (models.empty()) return; for (int h = hypotheses_.size() - 1; h >= 0; h--) { if (!models.contains(hypotheses_[h].model)) { hypotheses_.remove(h); } } } // ============ Geometry based Paragraph Detection Algorithm ================= struct Cluster { Cluster() : center(0), count(0) {} Cluster(int cen, int num) : center(cen), count(num) {} int center; // The center of the cluster. int count; // The number of entries within the cluster. }; class SimpleClusterer { public: explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {} void Add(int value) { values_.push_back(value); } int size() const { return values_.size(); } void GetClusters(GenericVector *clusters); private: int max_cluster_width_; GenericVectorEqEq values_; }; // Return the index of the cluster closest to value. 
static int ClosestCluster(const GenericVector &clusters, int value) { int best_index = 0; for (int i = 0; i < clusters.size(); i++) { if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) best_index = i; } return best_index; } void SimpleClusterer::GetClusters(GenericVector *clusters) { clusters->clear(); values_.sort(); for (int i = 0; i < values_.size();) { int orig_i = i; int lo = values_[i]; int hi = lo; while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) { hi = values_[i]; } clusters->push_back(Cluster((hi + lo) / 2, i - orig_i)); } } // Calculate left- and right-indent tab stop values seen in // rows[row_start, row_end) given a tolerance of tolerance. static void CalculateTabStops(GenericVector *rows, int row_start, int row_end, int tolerance, GenericVector *left_tabs, GenericVector *right_tabs) { if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) return; // First pass: toss all left and right indents into clusterers. SimpleClusterer initial_lefts(tolerance); SimpleClusterer initial_rights(tolerance); GenericVector initial_left_tabs; GenericVector initial_right_tabs; for (int i = row_start; i < row_end; i++) { initial_lefts.Add((*rows)[i].lindent_); initial_rights.Add((*rows)[i].rindent_); } initial_lefts.GetClusters(&initial_left_tabs); initial_rights.GetClusters(&initial_right_tabs); // Second pass: cluster only lines that are not "stray" // An example of a stray line is a page number -- a line whose start // and end tab-stops are far outside the typical start and end tab-stops // for the block. // Put another way, we only cluster data from lines whose start or end // tab stop is frequent. SimpleClusterer lefts(tolerance); SimpleClusterer rights(tolerance); // Outlier elimination. We might want to switch this to test outlier-ness // based on how strange a position an outlier is in instead of or in addition // to how rare it is. 
These outliers get re-added if we end up having too // few tab stops, to work with, however. int infrequent_enough_to_ignore = 0; if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1; if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2; for (int i = row_start; i < row_end; i++) { int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_); int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_); if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore || initial_right_tabs[ridx].count > infrequent_enough_to_ignore) { lefts.Add((*rows)[i].lindent_); rights.Add((*rows)[i].rindent_); } } lefts.GetClusters(left_tabs); rights.GetClusters(right_tabs); if ((left_tabs->size() == 1 && right_tabs->size() >= 4) || (right_tabs->size() == 1 && left_tabs->size() >= 4)) { // One side is really ragged, and the other only has one tab stop, // so those "insignificant outliers" are probably important, actually. // This often happens on a page of an index. Add back in the ones // we omitted in the first pass. for (int i = row_start; i < row_end; i++) { int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_); int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_); if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore || initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) { lefts.Add((*rows)[i].lindent_); rights.Add((*rows)[i].rindent_); } } } lefts.GetClusters(left_tabs); rights.GetClusters(right_tabs); // If one side is almost a two-indent aligned side, and the other clearly // isn't, try to prune out the least frequent tab stop from that side. 
if (left_tabs->size() == 3 && right_tabs->size() >= 4) { int to_prune = -1; for (int i = left_tabs->size() - 1; i >= 0; i--) { if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) { to_prune = i; } } if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) { left_tabs->remove(to_prune); } } if (right_tabs->size() == 3 && left_tabs->size() >= 4) { int to_prune = -1; for (int i = right_tabs->size() - 1; i >= 0; i--) { if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) { to_prune = i; } } if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) { right_tabs->remove(to_prune); } } } // Given a paragraph model mark rows[row_start, row_end) as said model // start or body lines. // // Case 1: model->first_indent_ != model->body_indent_ // Differentiating the paragraph start lines from the paragraph body lines in // this case is easy, we just see how far each line is indented. // // Case 2: model->first_indent_ == model->body_indent_ // Here, we find end-of-paragraph lines by looking for "short lines." // What constitutes a "short line" changes depending on whether the text // ragged-right[left] or fully justified (aligned left and right). // // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0) // We have a new paragraph it the first word would have at the end // of the previous line. // // Case 2b: Fully Justified. (eop_threshold > 0) // We mark a line as short (end of paragraph) if the offside indent // is greater than eop_threshold. 
static void MarkRowsWithModel(GenericVector *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold) { if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) return; for (int row = row_start; row < row_end; row++) { bool valid_first = ValidFirstLine(rows, row, model); bool valid_body = ValidBodyLine(rows, row, model); if (valid_first && !valid_body) { (*rows)[row].AddStartLine(model); } else if (valid_body && !valid_first) { (*rows)[row].AddBodyLine(model); } else if (valid_body && valid_first) { bool after_eop = (row == row_start); if (row > row_start) { if (eop_threshold > 0) { if (model->justification() == JUSTIFICATION_LEFT) { after_eop = (*rows)[row - 1].rindent_ > eop_threshold; } else { after_eop = (*rows)[row - 1].lindent_ > eop_threshold; } } else { after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification()); } } if (after_eop) { (*rows)[row].AddStartLine(model); } else { (*rows)[row].AddBodyLine(model); } } else { // Do nothing. Stray row. } } } // GeometricClassifierState holds all of the information we'll use while // trying to determine a paragraph model for the text lines in a block of // text: // + the rows under consideration [row_start, row_end) // + the common left- and right-indent tab stops // + does the block start out left-to-right or right-to-left // Further, this struct holds the data we amass for the (single) ParagraphModel // we'll assign to the text lines (assuming we get that far). 
struct GeometricClassifierState { GeometricClassifierState(int dbg_level, GenericVector *r, int r_start, int r_end) : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end), margin(0) { tolerance = InterwordSpace(*r, r_start, r_end); CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs); if (debug_level >= 3) { tprintf("Geometry: TabStop cluster tolerance = %d; " "%d left tabs; %d right tabs\n", tolerance, left_tabs.size(), right_tabs.size()); } ltr = (*r)[r_start].ri_->ltr; } void AssumeLeftJustification() { just = tesseract::JUSTIFICATION_LEFT; margin = (*rows)[row_start].lmargin_; } void AssumeRightJustification() { just = tesseract::JUSTIFICATION_RIGHT; margin = (*rows)[row_start].rmargin_; } // Align tabs are the tab stops the text is aligned to. const GenericVector &AlignTabs() const { if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs; return left_tabs; } // Offside tabs are the tab stops opposite the tabs used to align the text. // // Note that for a left-to-right text which is aligned to the right such as // this function comment, the offside tabs are the horizontal tab stops // marking the beginning of ("Note", "this" and "marking"). const GenericVector &OffsideTabs() const { if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs; return right_tabs; } // Return whether the i'th row extends from the leftmost left tab stop // to the right most right tab stop. bool IsFullRow(int i) const { return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 && ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0; } int AlignsideTabIndex(int row_idx) const { return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just)); } // Given what we know about the paragraph justification (just), would the // first word of row_b have fit at the end of row_a? 
bool FirstWordWouldHaveFit(int row_a, int row_b) { return ::tesseract::FirstWordWouldHaveFit( (*rows)[row_a], (*rows)[row_b], just); } void PrintRows() const { PrintRowRange(*rows, row_start, row_end); } void Fail(int min_debug_level, const char *why) const { if (debug_level < min_debug_level) return; tprintf("# %s\n", why); PrintRows(); } ParagraphModel Model() const { return ParagraphModel(just, margin, first_indent, body_indent, tolerance); } // We print out messages with a debug level at least as great as debug_level. int debug_level; // The Geometric Classifier was asked to find a single paragraph model // to fit the text rows (*rows)[row_start, row_end) GenericVector *rows; int row_start; int row_end; // The amount by which we expect the text edge can vary and still be aligned. int tolerance; // Is the script in this text block left-to-right? // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve bool ltr; // These left and right tab stops were determined to be the common tab // stops for the given text. GenericVector left_tabs; GenericVector right_tabs; // These are parameters we must determine to create a ParagraphModel. tesseract::ParagraphJustification just; int margin; int first_indent; int body_indent; // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel() int eop_threshold; }; // Given a section of text where strong textual clues did not help identifying // paragraph breaks, and for which the left and right indents have exactly // three tab stops between them, attempt to find the paragraph breaks based // solely on the outline of the text and whether the script is left-to-right. // // Algorithm Detail: // The selected rows are in the form of a rectangle except // for some number of "short lines" of the same length: // // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx // xxxxxxxxxxx xxxxxxxxxx # A "short" line. 
// xxxxxxxxxxxxx xxxxxxxxxxxx // xxxxxxxxxxxxx xxxxxxxxxxxx // // We have a slightly different situation if the only short // line is at the end of the excerpt. // // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx // xxxxxxxxxxxxx xxxxxxxxxxxx // xxxxxxxxxxxxx xxxxxxxxxxxx // xxxxxxxxxxx xxxxxxxxxx # A "short" line. // // We'll interpret these as follows based on the reasoning in the comment for // GeometricClassify(): // [script direction: first indent, body indent] // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0 static void GeometricClassifyThreeTabStopTextBlock( int debug_level, GeometricClassifierState &s, ParagraphTheory *theory) { int num_rows = s.row_end - s.row_start; int num_full_rows = 0; int last_row_full = 0; for (int i = s.row_start; i < s.row_end; i++) { if (s.IsFullRow(i)) { num_full_rows++; if (i == s.row_end - 1) last_row_full++; } } if (num_full_rows < 0.7 * num_rows) { s.Fail(1, "Not enough full lines to know which lines start paras."); return; } // eop_threshold gets set if we're fully justified; see MarkRowsWithModel() s.eop_threshold = 0; if (s.ltr) { s.AssumeLeftJustification(); } else { s.AssumeRightJustification(); } if (debug_level > 0) { tprintf("# Not enough variety for clear outline classification. " "Guessing these are %s aligned based on script.\n", s.ltr ? "left" : "right"); s.PrintRows(); } if (s.AlignTabs().size() == 2) { // case A1 or A2 s.first_indent = s.AlignTabs()[1].center; s.body_indent = s.AlignTabs()[0].center; } else { // case B1 or B2 if (num_rows - 1 == num_full_rows - last_row_full) { // case B2 const ParagraphModel *model = s.ltr ? 
kCrownLeft : kCrownRight; (*s.rows)[s.row_start].AddStartLine(model); for (int i = s.row_start + 1; i < s.row_end; i++) { (*s.rows)[i].AddBodyLine(model); } return; } else { // case B1 s.first_indent = s.body_indent = s.AlignTabs()[0].center; s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2; } } const ParagraphModel *model = theory->AddModel(s.Model()); MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold); return; } // This function is called if strong textual clues were not available, but // the caller hopes that the paragraph breaks will be super obvious just // by the outline of the text. // // The particularly difficult case is figuring out what's going on if you // don't have enough short paragraph end lines to tell us what's going on. // // For instance, let's say you have the following outline: // // (A1) xxxxxxxxxxxxxxxxxxxxxx // xxxxxxxxxxxxxxxxxxxx // xxxxxxxxxxxxxxxxxxxxxx // xxxxxxxxxxxxxxxxxxxxxx // // Even if we know that the text is left-to-right and so will probably be // left-aligned, both of the following are possible texts: // // (A1a) 1. Here our list item // with two full lines. // 2. Here a second item. // 3. Here our third one. // // (A1b) so ends paragraph one. // Here starts another // paragraph we want to // read. This continues // // These examples are obvious from the text and should have been caught // by the StrongEvidenceClassify pass. However, for languages where we don't // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), // it's worth guessing that (A1b) is the correct interpretation if there are // far more "full" lines than "short" lines. 
static void GeometricClassify(int debug_level, GenericVector *rows, int row_start, int row_end, ParagraphTheory *theory) { if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) return; if (debug_level > 1) { tprintf("###############################################\n"); tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end); tprintf("###############################################\n"); } RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); GeometricClassifierState s(debug_level, rows, row_start, row_end); if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) { s.Fail(2, "Too much variety for simple outline classification."); return; } if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) { s.Fail(1, "Not enough variety for simple outline classification."); return; } if (s.left_tabs.size() + s.right_tabs.size() == 3) { GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory); return; } // At this point, we know that one side has at least two tab stops, and the // other side has one or two tab stops. // Left to determine: // (1) Which is the body indent and which is the first line indent? // (2) Is the text fully justified? // If one side happens to have three or more tab stops, assume that side // is opposite of the aligned side. if (s.right_tabs.size() > 2) { s.AssumeLeftJustification(); } else if (s.left_tabs.size() > 2) { s.AssumeRightJustification(); } else if (s.ltr) { // guess based on script direction s.AssumeLeftJustification(); } else { s.AssumeRightJustification(); } if (s.AlignTabs().size() == 2) { // For each tab stop on the aligned side, how many of them appear // to be paragraph start lines? [first lines] int firsts[2] = {0, 0}; // Count the first line as a likely paragraph start line. firsts[s.AlignsideTabIndex(s.row_start)]++; // For each line, if the first word would have fit on the previous // line count it as a likely paragraph start line. 
bool jam_packed = true; for (int i = s.row_start + 1; i < s.row_end; i++) { if (s.FirstWordWouldHaveFit(i - 1, i)) { firsts[s.AlignsideTabIndex(i)]++; jam_packed = false; } } // Make an extra accounting for the last line of the paragraph just // in case it's the only short line in the block. That is, take its // first word as typical and see if this looks like the *last* line // of a paragraph. If so, mark the *other* indent as probably a first. if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) { firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++; } int percent0firsts, percent1firsts; percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count; percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count; // TODO(eger): Tune these constants if necessary. if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) { s.first_indent = s.AlignTabs()[1].center; s.body_indent = s.AlignTabs()[0].center; } else if ((percent1firsts < 20 && 30 < percent0firsts) || percent1firsts + 30 < percent0firsts) { s.first_indent = s.AlignTabs()[0].center; s.body_indent = s.AlignTabs()[1].center; } else { // Ambiguous! Probably lineated (poetry) if (debug_level > 1) { tprintf("# Cannot determine %s indent likely to start paragraphs.\n", s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right"); tprintf("# Indent of %d looks like a first line %d%% of the time.\n", s.AlignTabs()[0].center, percent0firsts); tprintf("# Indent of %d looks like a first line %d%% of the time.\n", s.AlignTabs()[1].center, percent1firsts); s.PrintRows(); } return; } } else { // There's only one tab stop for the "aligned to" side. s.first_indent = s.body_indent = s.AlignTabs()[0].center; } // At this point, we have our model. const ParagraphModel *model = theory->AddModel(s.Model()); // Now all we have to do is figure out if the text is fully justified or not. // eop_threshold: default to fully justified unless we see evidence below. 
// See description on MarkRowsWithModel() s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2; // If the text is not fully justified, re-set the eop_threshold to 0. if (s.AlignTabs().size() == 2) { // Paragraphs with a paragraph-start indent. for (int i = s.row_start; i < s.row_end - 1; i++) { if (ValidFirstLine(s.rows, i + 1, model) && !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { // We found a non-end-of-paragraph short line: not fully justified. s.eop_threshold = 0; break; } } } else { // Paragraphs with no paragraph-start indent. for (int i = s.row_start; i < s.row_end - 1; i++) { if (!s.FirstWordWouldHaveFit(i, i + 1) && !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { // We found a non-end-of-paragraph short line: not fully justified. s.eop_threshold = 0; break; } } } MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold); } // =============== Implementation of ParagraphTheory ===================== const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { for (int i = 0; i < models_->size(); i++) { if ((*models_)[i]->Comparable(model)) return (*models_)[i]; } auto *m = new ParagraphModel(model); models_->push_back(m); models_we_added_.push_back_new(m); return m; } void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { for (int i = models_->size() - 1; i >= 0; i--) { ParagraphModel *m = (*models_)[i]; if (!used_models.contains(m) && models_we_added_.contains(m)) { models_->remove(i); models_we_added_.remove(models_we_added_.get_index(m)); delete m; } } } // Examine rows[start, end) and try to determine if an existing non-centered // paragraph model would fit them perfectly. If so, return a pointer to it. // If not, return nullptr. 
const ParagraphModel *ParagraphTheory::Fits( const GenericVector *rows, int start, int end) const { for (int m = 0; m < models_->size(); m++) { const ParagraphModel *model = (*models_)[m]; if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) return model; } return nullptr; } void ParagraphTheory::NonCenteredModels(SetOfModels *models) { for (int m = 0; m < models_->size(); m++) { const ParagraphModel *model = (*models_)[m]; if (model->justification() != JUSTIFICATION_CENTER) models->push_back_new(model); } } int ParagraphTheory::IndexOf(const ParagraphModel *model) const { for (int i = 0; i < models_->size(); i++) { if ((*models_)[i] == model) return i; } return -1; } bool ValidFirstLine(const GenericVector *rows, int row, const ParagraphModel *model) { if (!StrongModel(model)) { tprintf("ValidFirstLine() should only be called with strong models!\n"); } return StrongModel(model) && model->ValidFirstLine( (*rows)[row].lmargin_, (*rows)[row].lindent_, (*rows)[row].rindent_, (*rows)[row].rmargin_); } bool ValidBodyLine(const GenericVector *rows, int row, const ParagraphModel *model) { if (!StrongModel(model)) { tprintf("ValidBodyLine() should only be called with strong models!\n"); } return StrongModel(model) && model->ValidBodyLine( (*rows)[row].lmargin_, (*rows)[row].lindent_, (*rows)[row].rindent_, (*rows)[row].rmargin_); } bool CrownCompatible(const GenericVector *rows, int a, int b, const ParagraphModel *model) { if (model != kCrownRight && model != kCrownLeft) { tprintf("CrownCompatible() should only be called with crown models!\n"); return false; } RowScratchRegisters &row_a = (*rows)[a]; RowScratchRegisters &row_b = (*rows)[b]; if (model == kCrownRight) { return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_, Epsilon(row_a.ri_->average_interword_space)); } return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_, Epsilon(row_a.ri_->average_interword_space)); } // 
=============== Implementation of ParagraphModelSmearer ==================== ParagraphModelSmearer::ParagraphModelSmearer( GenericVector *rows, int row_start, int row_end, ParagraphTheory *theory) : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) { if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { row_start_ = 0; row_end_ = 0; return; } SetOfModels no_models; for (int row = row_start - 1; row <= row_end; row++) { open_models_.push_back(no_models); } } // see paragraphs_internal.h void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) { SetOfModels no_models; if (row_start < row_start_) row_start = row_start_; if (row_end > row_end_) row_end = row_end_; for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) { if ((*rows_)[row].ri_->num_words == 0) { OpenModels(row + 1) = no_models; } else { SetOfModels &opened = OpenModels(row); (*rows_)[row].StartHypotheses(&opened); // Which models survive the transition from row to row + 1? SetOfModels still_open; for (int m = 0; m < opened.size(); m++) { if (ValidFirstLine(rows_, row, opened[m]) || ValidBodyLine(rows_, row, opened[m])) { // This is basic filtering; we check likely paragraph starty-ness down // below in Smear() -- you know, whether the first word would have fit // and such. still_open.push_back_new(opened[m]); } } OpenModels(row + 1) = still_open; } } } // see paragraphs_internal.h void ParagraphModelSmearer::Smear() { CalculateOpenModels(row_start_, row_end_); // For each row which we're unsure about (that is, it is LT_UNKNOWN or // we have multiple LT_START hypotheses), see if there's a model that // was recently used (an "open" model) which might model it well. for (int i = row_start_; i < row_end_; i++) { RowScratchRegisters &row = (*rows_)[i]; if (row.ri_->num_words == 0) continue; // Step One: // Figure out if there are "open" models which are left-alined or // right-aligned. 
This is important for determining whether the // "first" word in a row would fit at the "end" of the previous row. bool left_align_open = false; bool right_align_open = false; for (int m = 0; m < OpenModels(i).size(); m++) { switch (OpenModels(i)[m]->justification()) { case JUSTIFICATION_LEFT: left_align_open = true; break; case JUSTIFICATION_RIGHT: right_align_open = true; break; default: left_align_open = right_align_open = true; } } // Step Two: // Use that knowledge to figure out if this row is likely to // start a paragraph. bool likely_start; if (i == 0) { likely_start = true; } else { if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) { likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) || LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT); } else if (left_align_open) { likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT); } else { likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT); } } // Step Three: // If this text line seems like an obvious first line of an // open model, or an obvious continuation of an existing // modelled paragraph, mark it up. if (likely_start) { // Add Start Hypotheses for all Open models that fit. for (int m = 0; m < OpenModels(i).size(); m++) { if (ValidFirstLine(rows_, i, OpenModels(i)[m])) { row.AddStartLine(OpenModels(i)[m]); } } } else { // Add relevant body line hypotheses. SetOfModels last_line_models; if (i > 0) { (*rows_)[i - 1].StrongHypotheses(&last_line_models); } else { theory_->NonCenteredModels(&last_line_models); } for (int m = 0; m < last_line_models.size(); m++) { const ParagraphModel *model = last_line_models[m]; if (ValidBodyLine(rows_, i, model)) row.AddBodyLine(model); } } // Step Four: // If we're still quite unsure about this line, go through all // models in our theory and see if this row could be the start // of any of our models. 
if (row.GetLineType() == LT_UNKNOWN || (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) { SetOfModels all_models; theory_->NonCenteredModels(&all_models); for (int m = 0; m < all_models.size(); m++) { if (ValidFirstLine(rows_, i, all_models[m])) { row.AddStartLine(all_models[m]); } } } // Step Five: // Since we may have updated the hypotheses about this row, we need // to recalculate the Open models for the rest of rows[i + 1, row_end) if (row.GetLineType() != LT_UNKNOWN) { CalculateOpenModels(i + 1, row_end_); } } } // ================ Main Paragraph Detection Algorithm ======================= // Find out what ParagraphModels are actually used, and discard any // that are not. static void DiscardUnusedModels(const GenericVector &rows, ParagraphTheory *theory) { SetOfModels used_models; for (int i = 0; i < rows.size(); i++) { rows[i].StrongHypotheses(&used_models); } theory->DiscardUnusedModels(used_models); } // DowngradeWeakestToCrowns: // Forget any flush-{left, right} models unless we see two or more // of them in sequence. // // In pass 3, we start to classify even flush-left paragraphs (paragraphs // where the first line and body indent are the same) as having proper Models. // This is generally dangerous, since if you start imagining that flush-left // is a typical paragraph model when it is not, it will lead you to chop normal // indented paragraphs in the middle whenever a sentence happens to start on a // new line (see "This" above). What to do? // What we do is to take any paragraph which is flush left and is not // preceded by another paragraph of the same model and convert it to a "Crown" // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder // for later. It means that the paragraph is flush, but it would be desirable // to mark it as the same model as following text if it fits. This downgrade // FlushLeft -> CrownLeft -> Model of following paragraph. 
Means that we // avoid making flush left Paragraph Models whenever we see a top-of-the-page // half-of-a-paragraph. and instead we mark it the same as normal body text. // // Implementation: // // Comb backwards through the row scratch registers, and turn any // sequences of body lines of equivalent type abutted against the beginning // or a body or start line of a different type into a crown paragraph. static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, GenericVector *rows) { int start; for (int end = rows->size(); end > 0; end = start) { // Search back for a body line of a unique type. const ParagraphModel *model = nullptr; while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) { end--; } if (end == 0) break; start = end - 1; while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) { start--; // walk back to the first line that is not the same body type. } if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) && NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) { start--; } start++; // Now rows[start, end) is a sequence of unique body hypotheses of model. 
if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) continue; if (!StrongModel(model)) { while (start > 0 && CrownCompatible(rows, start - 1, start, model)) start--; } if (start == 0 || (!StrongModel(model)) || (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) { // crownify rows[start, end) const ParagraphModel *crown_model = model; if (StrongModel(model)) { if (model->justification() == JUSTIFICATION_LEFT) crown_model = kCrownLeft; else crown_model = kCrownRight; } (*rows)[start].SetUnknown(); (*rows)[start].AddStartLine(crown_model); for (int row = start + 1; row < end; row++) { (*rows)[row].SetUnknown(); (*rows)[row].AddBodyLine(crown_model); } } } DiscardUnusedModels(*rows, theory); } // Clear all hypotheses about lines [start, end) and reset margins. // // The empty space between the left of a row and the block boundary (and // similarly for the right) is split into two pieces: margin and indent. // In initial processing, we assume the block is tight and the margin for // all lines is set to zero. However, if our first pass does not yield // models for everything, it may be due to an inset paragraph like a // block-quote. In that case, we make a second pass over that unmarked // section of the page and reset the "margin" portion of the empty space // to the common amount of space at the ends of the lines under consid- // eration. This would be equivalent to percentile set to 0. However, // sometimes we have a single character sticking out in the right margin // of a text block (like the 'r' in 'for' on line 3 above), and we can // really just ignore it as an outlier. To express this, we allow the // user to specify the percentile (0..100) of indent values to use as // the common margin for each row in the run of rows[start, end). 
void RecomputeMarginsAndClearHypotheses( GenericVector *rows, int start, int end, int percentile) { if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) return; int lmin, lmax, rmin, rmax; lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_; rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_; for (int i = start; i < end; i++) { RowScratchRegisters &sr = (*rows)[i]; sr.SetUnknown(); if (sr.ri_->num_words == 0) continue; UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax); UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax); } STATS lefts(lmin, lmax + 1); STATS rights(rmin, rmax + 1); for (int i = start; i < end; i++) { RowScratchRegisters &sr = (*rows)[i]; if (sr.ri_->num_words == 0) continue; lefts.add(sr.lmargin_ + sr.lindent_, 1); rights.add(sr.rmargin_ + sr.rindent_, 1); } int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0); int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0); for (int i = start; i < end; i++) { RowScratchRegisters &sr = (*rows)[i]; int ldelta = ignorable_left - sr.lmargin_; sr.lmargin_ += ldelta; sr.lindent_ -= ldelta; int rdelta = ignorable_right - sr.rmargin_; sr.rmargin_ += rdelta; sr.rindent_ -= rdelta; } } // Return the median inter-word space in rows[row_start, row_end). int InterwordSpace(const GenericVector &rows, int row_start, int row_end) { if (row_end < row_start + 1) return 1; int word_height = (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2; int word_width = (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2; STATS spacing_widths(0, 5 + word_width); for (int i = row_start; i < row_end; i++) { if (rows[i].ri_->num_words > 1) { spacing_widths.add(rows[i].ri_->average_interword_space, 1); } } int minimum_reasonable_space = word_height / 3; if (minimum_reasonable_space < 2) minimum_reasonable_space = 2; int median = spacing_widths.median(); return (median > minimum_reasonable_space) ? 
median : minimum_reasonable_space; } // Return whether the first word on the after line can fit in the space at // the end of the before line (knowing which way the text is aligned and read). bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification) { if (before.ri_->num_words == 0 || after.ri_->num_words == 0) return true; if (justification == JUSTIFICATION_UNKNOWN) { tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n"); } int available_space; if (justification == JUSTIFICATION_CENTER) { available_space = before.lindent_ + before.rindent_; } else { available_space = before.OffsideIndent(justification); } available_space -= before.ri_->average_interword_space; if (before.ri_->ltr) return after.ri_->lword_box.width() < available_space; return after.ri_->rword_box.width() < available_space; } // Return whether the first word on the after line can fit in the space at // the end of the before line (not knowing which way the text goes) in a left // or right alignment. 
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after) { if (before.ri_->num_words == 0 || after.ri_->num_words == 0) return true; int available_space = before.lindent_; if (before.rindent_ > available_space) available_space = before.rindent_; available_space -= before.ri_->average_interword_space; if (before.ri_->ltr) return after.ri_->lword_box.width() < available_space; return after.ri_->rword_box.width() < available_space; } static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) { if (before.ri_->ltr) { return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea; } else { return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea; } } static bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification j) { return before.ri_->num_words == 0 || (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after)); } // Examine rows[start, end) and try to determine what sort of ParagraphModel // would fit them as a single paragraph. // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN. // If the rows given could be a consistent start to a paragraph, set *consistent // true. static ParagraphModel InternalParagraphModelByOutline( const GenericVector *rows, int start, int end, int tolerance, bool *consistent) { int ltr_line_count = 0; for (int i = start; i < end; i++) { ltr_line_count += static_cast((*rows)[i].ri_->ltr); } bool ltr = (ltr_line_count >= (end - start) / 2); *consistent = true; if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) return ParagraphModel(); // Ensure the caller only passed us a region with a common rmargin and // lmargin. 
int lmargin = (*rows)[start].lmargin_; int rmargin = (*rows)[start].rmargin_; int lmin, lmax, rmin, rmax, cmin, cmax; lmin = lmax = (*rows)[start + 1].lindent_; rmin = rmax = (*rows)[start + 1].rindent_; cmin = cmax = 0; for (int i = start + 1; i < end; i++) { if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { tprintf("Margins don't match! Software error.\n"); *consistent = false; return ParagraphModel(); } UpdateRange((*rows)[i].lindent_, &lmin, &lmax); UpdateRange((*rows)[i].rindent_, &rmin, &rmax); UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax); } int ldiff = lmax - lmin; int rdiff = rmax - rmin; int cdiff = cmax - cmin; if (rdiff > tolerance && ldiff > tolerance) { if (cdiff < tolerance * 2) { if (end - start < 3) return ParagraphModel(); return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance); } *consistent = false; return ParagraphModel(); } if (end - start < 3) // Don't return a model for two line paras. return ParagraphModel(); // These booleans keep us from saying something is aligned left when the body // left variance is too large. bool body_admits_left_alignment = ldiff < tolerance; bool body_admits_right_alignment = rdiff < tolerance; ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_, (lmin + lmax) / 2, tolerance); ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_, (rmin + rmax) / 2, tolerance); // These booleans keep us from having an indent on the "wrong side" for the // first line. bool text_admits_left_alignment = ltr || left_model.is_flush(); bool text_admits_right_alignment = !ltr || right_model.is_flush(); // At least one of the edges is less than tolerance in variance. // If the other is obviously ragged, it can't be the one aligned to. // [Note the last line is included in this raggedness.] 
if (tolerance < rdiff) { if (body_admits_left_alignment && text_admits_left_alignment) return left_model; *consistent = false; return ParagraphModel(); } if (tolerance < ldiff) { if (body_admits_right_alignment && text_admits_right_alignment) return right_model; *consistent = false; return ParagraphModel(); } // At this point, we know the body text doesn't vary much on either side. // If the first line juts out oddly in one direction or the other, // that likely indicates the side aligned to. int first_left = (*rows)[start].lindent_; int first_right = (*rows)[start].rindent_; if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) return left_model; if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) return right_model; *consistent = false; return ParagraphModel(); } // Examine rows[start, end) and try to determine what sort of ParagraphModel // would fit them as a single paragraph. If nothing fits, // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug // output if we're debugging. static ParagraphModel ParagraphModelByOutline( int debug_level, const GenericVector *rows, int start, int end, int tolerance) { bool unused_consistent; ParagraphModel retval = InternalParagraphModelByOutline( rows, start, end, tolerance, &unused_consistent); if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { tprintf("Could not determine a model for this paragraph:\n"); PrintRowRange(*rows, start, end); } return retval; } // Do rows[start, end) form a single instance of the given paragraph model? 
bool RowsFitModel(const GenericVector *rows, int start, int end, const ParagraphModel *model) { if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) return false; if (!ValidFirstLine(rows, start, model)) return false; for (int i = start + 1 ; i < end; i++) { if (!ValidBodyLine(rows, i, model)) return false; } return true; } // Examine rows[row_start, row_end) as an independent section of text, // and mark rows that are exceptionally clear as start-of-paragraph // and paragraph-body lines. // // We presume that any lines surrounding rows[row_start, row_end) may // have wildly different paragraph models, so we don't key any data off // of those lines. // // We only take the very strongest signals, as we don't want to get // confused and marking up centered text, poetry, or source code as // clearly part of a typical paragraph. static void MarkStrongEvidence(GenericVector *rows, int row_start, int row_end) { // Record patently obvious body text. for (int i = row_start + 1; i < row_end; i++) { const RowScratchRegisters &prev = (*rows)[i - 1]; RowScratchRegisters &curr = (*rows)[i]; tesseract::ParagraphJustification typical_justification = prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea && !FirstWordWouldHaveFit(prev, curr, typical_justification)) { curr.SetBodyLine(); } } // Record patently obvious start paragraph lines. // // It's an extremely good signal of the start of a paragraph that // the first word would have fit on the end of the previous line. // However, applying just that signal would have us mark random // start lines of lineated text (poetry and source code) and some // centered headings as paragraph start lines. 
Therefore, we use // a second qualification for a paragraph start: Not only should // the first word of this line have fit on the previous line, // but also, this line should go full to the right of the block, // disallowing a subsequent word from having fit on this line. // First row: { RowScratchRegisters &curr = (*rows)[row_start]; RowScratchRegisters &next = (*rows)[row_start + 1]; tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) && (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) { curr.SetStartLine(); } } // Middle rows for (int i = row_start + 1; i < row_end - 1; i++) { RowScratchRegisters &prev = (*rows)[i - 1]; RowScratchRegisters &curr = (*rows)[i]; RowScratchRegisters &next = (*rows)[i + 1]; tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) && LikelyParagraphStart(prev, curr, j)) { curr.SetStartLine(); } } // Last row { // the short circuit at the top means we have at least two lines. RowScratchRegisters &prev = (*rows)[row_end - 2]; RowScratchRegisters &curr = (*rows)[row_end - 1]; tesseract::ParagraphJustification j = curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) && LikelyParagraphStart(prev, curr, j)) { curr.SetStartLine(); } } } // Look for sequences of a start line followed by some body lines in // rows[row_start, row_end) and create ParagraphModels for them if // they seem coherent. 
static void ModelStrongEvidence(int debug_level, GenericVector *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory) { if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) return; int start = row_start; while (start < row_end) { while (start < row_end && (*rows)[start].GetLineType() != LT_START) start++; if (start >= row_end - 1) break; int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space); int end = start; ParagraphModel last_model; bool next_consistent; do { ++end; // rows[row, end) was consistent. // If rows[row, end + 1) is not consistent, // just model rows[row, end) if (end < row_end - 1) { RowScratchRegisters &next = (*rows)[end]; LineType lt = next.GetLineType(); next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN && !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end])); } else { next_consistent = false; } if (next_consistent) { ParagraphModel next_model = InternalParagraphModelByOutline( rows, start, end + 1, tolerance, &next_consistent); if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT && next_model.justification() != JUSTIFICATION_LEFT) || (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT && next_model.justification() != JUSTIFICATION_RIGHT)) { next_consistent = false; } last_model = next_model; } else { next_consistent = false; } } while (next_consistent && end < row_end); // At this point, rows[start, end) looked like it could have been a // single paragraph. If we can make a good ParagraphModel for it, // do so and mark this sequence with that model. if (end > start + 1) { // emit a new paragraph if we have more than one line. const ParagraphModel *model = nullptr; ParagraphModel new_model = ParagraphModelByOutline( debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end))); if (new_model.justification() == JUSTIFICATION_UNKNOWN) { // couldn't create a good model, oh well. 
} else if (new_model.is_flush()) { if (end == start + 2) { // It's very likely we just got two paragraph starts in a row. end = start + 1; } else if (start == row_start) { // Mark this as a Crown. if (new_model.justification() == JUSTIFICATION_LEFT) { model = kCrownLeft; } else { model = kCrownRight; } } else if (allow_flush_models) { model = theory->AddModel(new_model); } } else { model = theory->AddModel(new_model); } if (model) { (*rows)[start].AddStartLine(model); for (int i = start + 1; i < end; i++) { (*rows)[i].AddBodyLine(model); } } } start = end; } } // We examine rows[row_start, row_end) and do the following: // (1) Clear all existing hypotheses for the rows being considered. // (2) Mark up any rows as exceptionally likely to be paragraph starts // or paragraph body lines as such using both geometric and textual // clues. // (3) Form models for any sequence of start + continuation lines. // (4) Smear the paragraph models to cover surrounding text. static void StrongEvidenceClassify(int debug_level, GenericVector *rows, int row_start, int row_end, ParagraphTheory *theory) { if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) return; if (debug_level > 1) { tprintf("#############################################\n"); tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end); tprintf("#############################################\n"); } RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); MarkStrongEvidence(rows, row_start, row_end); DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows); // Create paragraph models. ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory); DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows); // At this point, some rows are marked up as paragraphs with model numbers, // and some rows are marked up as either LT_START or LT_BODY. Now let's // smear any good paragraph hypotheses forward and backward. 
ParagraphModelSmearer smearer(rows, row_start, row_end, theory); smearer.Smear(); } static void SeparateSimpleLeaderLines(GenericVector *rows, int row_start, int row_end, ParagraphTheory *theory) { for (int i = row_start + 1; i < row_end - 1; i++) { if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders && (*rows)[i + 1].ri_->has_leaders) { const ParagraphModel *model = theory->AddModel( ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0)); (*rows)[i].AddStartLine(model); } } } // Collect sequences of unique hypotheses in row registers and create proper // paragraphs for them, referencing the paragraphs in row_owners. static void ConvertHypothesizedModelRunsToParagraphs( int debug_level, const GenericVector &rows, GenericVector *row_owners, ParagraphTheory *theory) { int end = rows.size(); int start; for (; end > 0; end = start) { start = end - 1; const ParagraphModel *model = nullptr; // TODO(eger): Be smarter about dealing with multiple hypotheses. bool single_line_paragraph = false; SetOfModels models; rows[start].NonNullHypotheses(&models); if (!models.empty()) { model = models[0]; if (rows[start].GetLineType(model) != LT_BODY) single_line_paragraph = true; } if (model && !single_line_paragraph) { // walk back looking for more body lines and then a start line. while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) { // do nothing } if (start < 0 || rows[start].GetLineType(model) != LT_START) { model = nullptr; } } if (model == nullptr) { continue; } // rows[start, end) should be a paragraph. PARA *p = new PARA(); if (model == kCrownLeft || model == kCrownRight) { p->is_very_first_or_continuation = true; // Crown paragraph. // If we can find an existing ParagraphModel that fits, use it, // else create a new one. 
for (int row = end; row < rows.size(); row++) { if ((*row_owners)[row] && (ValidBodyLine(&rows, start, (*row_owners)[row]->model) && (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) { model = (*row_owners)[row]->model; break; } } if (model == kCrownLeft) { // No subsequent model fits, so cons one up. model = theory->AddModel(ParagraphModel( JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_, 0, 0, Epsilon(rows[start].ri_->average_interword_space))); } else if (model == kCrownRight) { // No subsequent model fits, so cons one up. model = theory->AddModel(ParagraphModel( JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_, 0, 0, Epsilon(rows[start].ri_->average_interword_space))); } } rows[start].SetUnknown(); rows[start].AddStartLine(model); for (int i = start + 1; i < end; i++) { rows[i].SetUnknown(); rows[i].AddBodyLine(model); } p->model = model; p->has_drop_cap = rows[start].ri_->has_drop_cap; p->is_list_item = model->justification() == JUSTIFICATION_RIGHT ? rows[start].ri_->rword_indicates_list_item : rows[start].ri_->lword_indicates_list_item; for (int row = start; row < end; row++) { if ((*row_owners)[row] != nullptr) { tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " "more than once!\n"); delete (*row_owners)[row]; } (*row_owners)[row] = p; } } } struct Interval { Interval() : begin(0), end(0) {} Interval(int b, int e) : begin(b), end(e) {} int begin; int end; }; // Return whether rows[row] appears to be stranded, meaning that the evidence // for this row is very weak due to context. For instance, two lines of source // code may happen to be indented at the same tab vector as body text starts, // leading us to think they are two start-of-paragraph lines. This is not // optimal. However, we also don't want to mark a sequence of short dialog // as "weak," so our heuristic is: // (1) If a line is surrounded by lines of unknown type, it's weak. 
// (2) If two lines in a row are start lines for a given paragraph type, but // after that the same paragraph type does not continue, they're weak. static bool RowIsStranded(const GenericVector &rows, int row) { SetOfModels row_models; rows[row].StrongHypotheses(&row_models); for (int m = 0; m < row_models.size(); m++) { bool all_starts = rows[row].GetLineType(); int run_length = 1; bool continues = true; for (int i = row - 1; i >= 0 && continues; i--) { SetOfModels models; rows[i].NonNullHypotheses(&models); switch (rows[i].GetLineType(row_models[m])) { case LT_START: run_length++; break; case LT_MULTIPLE: // explicit fall-through case LT_BODY: run_length++; all_starts = false; break; case LT_UNKNOWN: // explicit fall-through default: continues = false; } } continues = true; for (int i = row + 1; i < rows.size() && continues; i++) { SetOfModels models; rows[i].NonNullHypotheses(&models); switch (rows[i].GetLineType(row_models[m])) { case LT_START: run_length++; break; case LT_MULTIPLE: // explicit fall-through case LT_BODY: run_length++; all_starts = false; break; case LT_UNKNOWN: // explicit fall-through default: continues = false; } } if (run_length > 2 || (!all_starts && run_length > 1)) return false; } return true; } // Go through rows[row_start, row_end) and gather up sequences that need better // classification. // + Sequences of non-empty rows without hypotheses. // + Crown paragraphs not immediately followed by a strongly modeled line. // + Single line paragraphs surrounded by text that doesn't match the // model. static void LeftoverSegments(const GenericVector &rows, GenericVector *to_fix, int row_start, int row_end) { to_fix->clear(); for (int i = row_start; i < row_end; i++) { bool needs_fixing = false; SetOfModels models; SetOfModels models_w_crowns; rows[i].StrongHypotheses(&models); rows[i].NonNullHypotheses(&models_w_crowns); if (models.empty() && !models_w_crowns.empty()) { // Crown paragraph. Is it followed by a modeled line? 
for (int end = i + 1; end < rows.size(); end++) { SetOfModels end_models; SetOfModels strong_end_models; rows[end].NonNullHypotheses(&end_models); rows[end].StrongHypotheses(&strong_end_models); if (end_models.empty()) { needs_fixing = true; break; } else if (!strong_end_models.empty()) { needs_fixing = false; break; } } } else if (models.empty() && rows[i].ri_->num_words > 0) { // No models at all. needs_fixing = true; } if (!needs_fixing && !models.empty()) { needs_fixing = RowIsStranded(rows, i); } if (needs_fixing) { if (!to_fix->empty() && to_fix->back().end == i - 1) to_fix->back().end = i; else to_fix->push_back(Interval(i, i)); } } // Convert inclusive intervals to half-open intervals. for (int i = 0; i < to_fix->size(); i++) { (*to_fix)[i].end = (*to_fix)[i].end + 1; } } // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), // normalize each row_owner to point to an actual PARA, and output the // paragraphs in order onto paragraphs. void CanonicalizeDetectionResults( GenericVector *row_owners, PARA_LIST *paragraphs) { GenericVector &rows = *row_owners; paragraphs->clear(); PARA_IT out(paragraphs); PARA *formerly_null = nullptr; for (int i = 0; i < rows.size(); i++) { if (rows[i] == nullptr) { if (i == 0 || rows[i - 1] != formerly_null) { rows[i] = formerly_null = new PARA(); } else { rows[i] = formerly_null; continue; } } else if (i > 0 && rows[i - 1] == rows[i]) { continue; } out.add_after_then_move(rows[i]); } } // Main entry point for Paragraph Detection Algorithm. // // Given a set of equally spaced textlines (described by row_infos), // Split them into paragraphs. // // Output: // row_owners - one pointer for each row, to the paragraph it belongs to. // paragraphs - this is the actual list of PARA objects. // models - the list of paragraph models referenced by the PARA objects. // caller is responsible for deleting the models. 
void DetectParagraphs(int debug_level, GenericVector *row_infos, GenericVector *row_owners, PARA_LIST *paragraphs, GenericVector *models) { GenericVector rows; ParagraphTheory theory(models); // Initialize row_owners to be a bunch of nullptr pointers. row_owners->init_to_size(row_infos->size(), nullptr); // Set up row scratch registers for the main algorithm. rows.init_to_size(row_infos->size(), RowScratchRegisters()); for (int i = 0; i < row_infos->size(); i++) { rows[i].Init((*row_infos)[i]); } // Pass 1: // Detect sequences of lines that all contain leader dots (.....) // These are likely Tables of Contents. If there are three text lines in // a row with leader dots, it's pretty safe to say the middle one should // be a paragraph of its own. SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory); DebugDump(debug_level > 1, "End of Pass 1", theory, rows); GenericVector leftovers; LeftoverSegments(rows, &leftovers, 0, rows.size()); for (int i = 0; i < leftovers.size(); i++) { // Pass 2a: // Find any strongly evidenced start-of-paragraph lines. If they're // followed by two lines that look like body lines, make a paragraph // model for that and see if that model applies throughout the text // (that is, "smear" it). StrongEvidenceClassify(debug_level, &rows, leftovers[i].begin, leftovers[i].end, &theory); // Pass 2b: // If we had any luck in pass 2a, we got part of the page and didn't // know how to classify a few runs of rows. Take the segments that // didn't find a model and reprocess them individually. 
GenericVector leftovers2; LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end); bool pass2a_was_useful = leftovers2.size() > 1 || (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size())); if (pass2a_was_useful) { for (int j = 0; j < leftovers2.size(); j++) { StrongEvidenceClassify(debug_level, &rows, leftovers2[j].begin, leftovers2[j].end, &theory); } } } DebugDump(debug_level > 1, "End of Pass 2", theory, rows); // Pass 3: // These are the dregs for which we didn't have enough strong textual // and geometric clues to form matching models for. Let's see if // the geometric clues are simple enough that we could just use those. LeftoverSegments(rows, &leftovers, 0, rows.size()); for (int i = 0; i < leftovers.size(); i++) { GeometricClassify(debug_level, &rows, leftovers[i].begin, leftovers[i].end, &theory); } // Undo any flush models for which there's little evidence. DowngradeWeakestToCrowns(debug_level, &theory, &rows); DebugDump(debug_level > 1, "End of Pass 3", theory, rows); // Pass 4: // Take everything that's still not marked up well and clear all markings. LeftoverSegments(rows, &leftovers, 0, rows.size()); for (int i = 0; i < leftovers.size(); i++) { for (int j = leftovers[i].begin; j < leftovers[i].end; j++) { rows[j].SetUnknown(); } } DebugDump(debug_level > 1, "End of Pass 4", theory, rows); // Convert all of the unique hypothesis runs to PARAs. ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory); DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows); // Finally, clean up any dangling nullptr row paragraph parents. CanonicalizeDetectionResults(row_owners, paragraphs); } // ============ Code interfacing with the rest of Tesseract ================== static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) { // Set up text, lword_text, and rword_text (mostly for debug printing). 
STRING fake_text; PageIterator pit(static_cast(it)); bool first_word = true; if (!pit.Empty(RIL_WORD)) { do { fake_text += "x"; if (first_word) info->lword_text += "x"; info->rword_text += "x"; if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) && !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) { fake_text += " "; info->rword_text = ""; first_word = false; } } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL)); } if (fake_text.size() == 0) return; int lspaces = info->pix_ldistance / info->average_interword_space; for (int i = 0; i < lspaces; i++) { info->text += ' '; } info->text += fake_text; // Set up lword_box, rword_box, and num_words. PAGE_RES_IT page_res_it = *it.PageResIt(); WERD_RES *word_res = page_res_it.restart_row(); ROW_RES *this_row = page_res_it.row(); WERD_RES *lword = nullptr; WERD_RES *rword = nullptr; info->num_words = 0; do { if (word_res) { if (!lword) lword = word_res; if (rword != word_res) info->num_words++; rword = word_res; } word_res = page_res_it.forward(); } while (page_res_it.row() == this_row); if (lword) info->lword_box = lword->word->bounding_box(); if (rword) info->rword_box = rword->word->bounding_box(); } // Given a Tesseract Iterator pointing to a text line, fill in the paragraph // detector RowInfo with all relevant information from the row. static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) { if (it.PageResIt()->row() != nullptr) { ROW *row = it.PageResIt()->row()->row; info->pix_ldistance = row->lmargin(); info->pix_rdistance = row->rmargin(); info->average_interword_space = row->space() > 0 ? 
row->space() : std::max(static_cast(row->x_height()), 1); info->pix_xheight = row->x_height(); info->has_leaders = false; info->has_drop_cap = row->has_drop_cap(); info->ltr = true; // set below depending on word scripts } else { info->pix_ldistance = info->pix_rdistance = 0; info->average_interword_space = 1; info->pix_xheight = 1.0; info->has_leaders = false; info->has_drop_cap = false; info->ltr = true; } info->num_words = 0; info->lword_indicates_list_item = false; info->lword_likely_starts_idea = false; info->lword_likely_ends_idea = false; info->rword_indicates_list_item = false; info->rword_likely_starts_idea = false; info->rword_likely_ends_idea = false; info->has_leaders = false; info->ltr = true; if (!after_recognition) { InitializeTextAndBoxesPreRecognition(it, info); return; } info->text = ""; const std::unique_ptr text(it.GetUTF8Text(RIL_TEXTLINE)); int trailing_ws_idx = strlen(text.get()); // strip trailing space while (trailing_ws_idx > 0 && // isspace() only takes ASCII isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) trailing_ws_idx--; if (trailing_ws_idx > 0) { int lspaces = info->pix_ldistance / info->average_interword_space; for (int i = 0; i < lspaces; i++) info->text += ' '; for (int i = 0; i < trailing_ws_idx; i++) info->text += text[i]; } if (info->text.size() == 0) { return; } PAGE_RES_IT page_res_it = *it.PageResIt(); GenericVector werds; WERD_RES *word_res = page_res_it.restart_row(); ROW_RES *this_row = page_res_it.row(); int num_leaders = 0; int ltr = 0; int rtl = 0; do { if (word_res && word_res->best_choice->unichar_string().length() > 0) { werds.push_back(word_res); ltr += word_res->AnyLtrCharsInWord() ? 1 : 0; rtl += word_res->AnyRtlCharsInWord() ? 
1 : 0; if (word_res->word->flag(W_REP_CHAR)) num_leaders++; } word_res = page_res_it.forward(); } while (page_res_it.row() == this_row); info->ltr = ltr >= rtl; info->has_leaders = num_leaders > 3; info->num_words = werds.size(); if (!werds.empty()) { WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1]; info->lword_text = lword->best_choice->unichar_string().string(); info->rword_text = rword->best_choice->unichar_string().string(); info->lword_box = lword->word->bounding_box(); info->rword_box = rword->word->bounding_box(); LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text, &info->lword_indicates_list_item, &info->lword_likely_starts_idea, &info->lword_likely_ends_idea); RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text, &info->rword_indicates_list_item, &info->rword_likely_starts_idea, &info->rword_likely_ends_idea); } } // This is called after rows have been identified and words are recognized. // Much of this could be implemented before word recognition, but text helps // to identify bulleted lists and gives good signals for sentence boundaries. void DetectParagraphs(int debug_level, bool after_text_recognition, const MutableIterator *block_start, GenericVector *models) { // Clear out any preconceived notions. if (block_start->Empty(RIL_TEXTLINE)) { return; } BLOCK *block = block_start->PageResIt()->block()->block; block->para_list()->clear(); bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText(); // Convert the Tesseract structures to RowInfos // for the paragraph detection algorithm. MutableIterator row(*block_start); if (row.Empty(RIL_TEXTLINE)) return; // end of input already. GenericVector row_infos; do { if (!row.PageResIt()->row()) continue; // empty row. 
row.PageResIt()->row()->row->set_para(nullptr); row_infos.push_back(RowInfo()); RowInfo &ri = row_infos.back(); InitializeRowInfo(after_text_recognition, row, &ri); } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE)); // If we're called before text recognition, we might not have // tight block bounding boxes, so trim by the minimum on each side. if (!row_infos.empty()) { int min_lmargin = row_infos[0].pix_ldistance; int min_rmargin = row_infos[0].pix_rdistance; for (int i = 1; i < row_infos.size(); i++) { if (row_infos[i].pix_ldistance < min_lmargin) min_lmargin = row_infos[i].pix_ldistance; if (row_infos[i].pix_rdistance < min_rmargin) min_rmargin = row_infos[i].pix_rdistance; } if (min_lmargin > 0 || min_rmargin > 0) { for (int i = 0; i < row_infos.size(); i++) { row_infos[i].pix_ldistance -= min_lmargin; row_infos[i].pix_rdistance -= min_rmargin; } } } // Run the paragraph detection algorithm. GenericVector row_owners; GenericVector the_paragraphs; if (!is_image_block) { DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models); } else { row_owners.init_to_size(row_infos.size(), nullptr); CanonicalizeDetectionResults(&row_owners, block->para_list()); } // Now stitch in the row_owners into the rows. row = *block_start; for (int i = 0; i < row_owners.size(); i++) { while (!row.PageResIt()->row()) row.Next(RIL_TEXTLINE); row.PageResIt()->row()->row->set_para(row_owners[i]); row.Next(RIL_TEXTLINE); } } } // namespace