mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 23:19:07 +08:00
Replace remaining GenericVector by std::vector for src/ccmain
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
bf42f8313d
commit
1f94d79c81
@ -24,7 +24,6 @@
|
||||
# include "boxread.h"
|
||||
#endif // ndef DISABLED_LEGACY_ENGINE
|
||||
#include <tesseract/unichar.h>
|
||||
#include "genericvector.h"
|
||||
#include "pageres.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "unicharset.h"
|
||||
@ -489,7 +488,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
|
||||
if (word->text() == nullptr || word->text()[0] == '\0')
|
||||
continue; // Ignore words that have no text.
|
||||
// Convert the correct text to a vector of UNICHAR_ID
|
||||
GenericVector<UNICHAR_ID> target_text;
|
||||
std::vector<UNICHAR_ID> target_text;
|
||||
if (!ConvertStringToUnichars(word->text(), &target_text)) {
|
||||
tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
|
||||
pr_it.DeleteCurrentWord();
|
||||
@ -505,7 +504,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
|
||||
|
||||
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
||||
/// @return false if an invalid UNICHAR_ID is encountered.
|
||||
bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids) {
|
||||
bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
|
||||
for (int step = 0; *utf8 != '\0'; utf8 += step) {
|
||||
const char *next_space = strchr(utf8, ' ');
|
||||
if (next_space == nullptr)
|
||||
@ -528,10 +527,10 @@ bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_
|
||||
/// applies a full search on the classifier results to find the best classified
|
||||
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
||||
/// substitutions ARE used.
|
||||
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
|
||||
bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
|
||||
// Classify all required combinations of blobs and save results in choices.
|
||||
const int word_length = word_res->box_word->length();
|
||||
auto *choices = new GenericVector<BLOB_CHOICE_LIST *>[word_length];
|
||||
auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
|
||||
for (int i = 0; i < word_length; ++i) {
|
||||
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
|
||||
BLOB_CHOICE_LIST *match_result =
|
||||
@ -552,8 +551,11 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
|
||||
float best_rating = 0.0f;
|
||||
SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
|
||||
&word_res->best_state);
|
||||
for (int i = 0; i < word_length; ++i)
|
||||
choices[i].delete_data_pointers();
|
||||
for (int i = 0; i < word_length; ++i) {
|
||||
for (auto choice : choices[i]) {
|
||||
delete choice;
|
||||
}
|
||||
}
|
||||
delete[] choices;
|
||||
if (word_res->best_state.empty()) {
|
||||
// Build the original segmentation and if it is the same length as the
|
||||
@ -583,9 +585,9 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
|
||||
|
||||
/// Recursive helper to find a match to the target_text (from text_index
|
||||
/// position) in the choices (from choices_pos position).
|
||||
/// @param choices is an array of GenericVectors, of length choices_length,
|
||||
/// @param choices is an array of vectors of length choices_length,
|
||||
/// with each element representing a starting position in the word, and the
|
||||
/// #GenericVector holding classification results for a sequence of consecutive
|
||||
/// #vector holding classification results for a sequence of consecutive
|
||||
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
||||
/// @param choices_pos
|
||||
/// @param choices_length
|
||||
@ -595,8 +597,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
|
||||
/// @param segmentation
|
||||
/// @param best_rating
|
||||
/// @param best_segmentation
|
||||
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
||||
int choices_length, const GenericVector<UNICHAR_ID> &target_text,
|
||||
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
||||
int choices_length, const std::vector<UNICHAR_ID> &target_text,
|
||||
int text_index, float rating, std::vector<int> *segmentation,
|
||||
float *best_rating, std::vector<int> *best_segmentation) {
|
||||
const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
|
||||
|
@ -461,8 +461,8 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
continue;
|
||||
}
|
||||
// Two words sharing the same language model, excellent!
|
||||
GenericVector<WERD_CHOICE *> overrides_word1;
|
||||
GenericVector<WERD_CHOICE *> overrides_word2;
|
||||
std::vector<WERD_CHOICE *> overrides_word1;
|
||||
std::vector<WERD_CHOICE *> overrides_word2;
|
||||
|
||||
const auto orig_w1_str = w_prev->best_choice->unichar_string();
|
||||
const auto orig_w2_str = w->best_choice->unichar_string();
|
||||
@ -768,7 +768,7 @@ static int SelectBestWords(double rating_ratio, double certainty_margin, bool de
|
||||
PointerVector<WERD_RES> *best_words) {
|
||||
// Process the smallest groups of words that have an overlapping word
|
||||
// boundary at the end.
|
||||
GenericVector<WERD_RES *> out_words;
|
||||
std::vector<WERD_RES *> out_words;
|
||||
// Index into each word vector (best, new).
|
||||
int b = 0, n = 0;
|
||||
int num_best = 0, num_new = 0;
|
||||
@ -893,19 +893,19 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
|
||||
return false;
|
||||
real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
|
||||
// Get the noise outlines into a vector with matching bool map.
|
||||
GenericVector<C_OUTLINE *> outlines;
|
||||
std::vector<C_OUTLINE *> outlines;
|
||||
real_word->GetNoiseOutlines(&outlines);
|
||||
GenericVector<bool> word_wanted;
|
||||
GenericVector<bool> overlapped_any_blob;
|
||||
GenericVector<C_BLOB *> target_blobs;
|
||||
std::vector<bool> word_wanted;
|
||||
std::vector<bool> overlapped_any_blob;
|
||||
std::vector<C_BLOB *> target_blobs;
|
||||
AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
|
||||
&overlapped_any_blob, &target_blobs);
|
||||
// Filter the outlines that overlapped any blob and put them into the word
|
||||
// now. This simplifies the remaining task and also makes it more accurate
|
||||
// as it has more completed blobs to work on.
|
||||
GenericVector<bool> wanted;
|
||||
GenericVector<C_BLOB *> wanted_blobs;
|
||||
GenericVector<C_OUTLINE *> wanted_outlines;
|
||||
std::vector<bool> wanted;
|
||||
std::vector<C_BLOB *> wanted_blobs;
|
||||
std::vector<C_OUTLINE *> wanted_outlines;
|
||||
int num_overlapped = 0;
|
||||
int num_overlapped_used = 0;
|
||||
for (int i = 0; i < overlapped_any_blob.size(); ++i) {
|
||||
@ -948,11 +948,11 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
|
||||
// Output: word_wanted indicates which outlines are to be assigned to a blob,
|
||||
// target_blobs indicates which to assign to, and overlapped_any_blob is
|
||||
// true for all outlines that overlapped a blob.
|
||||
void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines,
|
||||
void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
|
||||
int pass, WERD *real_word, PAGE_RES_IT *pr_it,
|
||||
GenericVector<bool> *word_wanted,
|
||||
GenericVector<bool> *overlapped_any_blob,
|
||||
GenericVector<C_BLOB *> *target_blobs) {
|
||||
std::vector<bool> *word_wanted,
|
||||
std::vector<bool> *overlapped_any_blob,
|
||||
std::vector<C_BLOB *> *target_blobs) {
|
||||
std::vector<bool> blob_wanted;
|
||||
word_wanted->resize(outlines.size(), false);
|
||||
overlapped_any_blob->resize(outlines.size(), false);
|
||||
@ -999,10 +999,10 @@ void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE
|
||||
|
||||
// Attempts to assign non-overlapping outlines to their nearest blobs or
|
||||
// make new blobs out of them.
|
||||
void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
|
||||
void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
|
||||
WERD *real_word, PAGE_RES_IT *pr_it,
|
||||
GenericVector<bool> *word_wanted,
|
||||
GenericVector<C_BLOB *> *target_blobs) {
|
||||
std::vector<bool> *word_wanted,
|
||||
std::vector<C_BLOB *> *target_blobs) {
|
||||
std::vector<bool> blob_wanted;
|
||||
word_wanted->resize(outlines.size(), false);
|
||||
target_blobs->resize(outlines.size(), nullptr);
|
||||
@ -1077,7 +1077,7 @@ void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &out
|
||||
// are desired, in which case ok_outlines indicates which ones.
|
||||
bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
|
||||
C_BLOB *blob,
|
||||
const GenericVector<C_OUTLINE *> &outlines,
|
||||
const std::vector<C_OUTLINE *> &outlines,
|
||||
int num_outlines, std::vector<bool> *ok_outlines) {
|
||||
std::string best_str;
|
||||
float target_cert = certainty_threshold;
|
||||
@ -1161,7 +1161,7 @@ bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
|
||||
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
|
||||
// the inclusion of the outlines, and returns the certainty of the raw choice.
|
||||
float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
|
||||
const GenericVector<C_OUTLINE *> &outlines, int pass_n,
|
||||
const std::vector<C_OUTLINE *> &outlines, int pass_n,
|
||||
PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
|
||||
C_OUTLINE_IT ol_it;
|
||||
C_OUTLINE *first_to_keep = nullptr;
|
||||
@ -1865,8 +1865,7 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
|
||||
const int fontinfo_size = get_fontinfo_table().size();
|
||||
if (fontinfo_size == 0)
|
||||
return;
|
||||
GenericVector<int> font_total_score;
|
||||
font_total_score.init_to_size(fontinfo_size, 0);
|
||||
std::vector<int> font_total_score(fontinfo_size);
|
||||
|
||||
// Compute the font scores for the word
|
||||
if (tessedit_debug_fonts) {
|
||||
|
@ -131,7 +131,7 @@ int EquationDetect::LabelSpecialText(TO_BLOCK *to_block) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
GenericVector<BLOBNBOX_LIST *> blob_lists;
|
||||
std::vector<BLOBNBOX_LIST *> blob_lists;
|
||||
blob_lists.push_back(&(to_block->blobs));
|
||||
blob_lists.push_back(&(to_block->large_blobs));
|
||||
for (int i = 0; i < blob_lists.size(); ++i) {
|
||||
@ -223,16 +223,17 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &uni
|
||||
|
||||
if (unicharset.get_ispunctuation(id)) {
|
||||
// Exclude some special texts that are likely to be confused with math symbols.
|
||||
static GenericVector<UNICHAR_ID> ids_to_exclude;
|
||||
static std::vector<UNICHAR_ID> ids_to_exclude;
|
||||
if (ids_to_exclude.empty()) {
|
||||
static const char *kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
|
||||
"〈", "〉", "《", "》", "」", "「"};
|
||||
for (auto i = 0; i < countof(kCharsToEx); i++) {
|
||||
ids_to_exclude.push_back(unicharset.unichar_to_id(kCharsToEx[i]));
|
||||
}
|
||||
ids_to_exclude.sort();
|
||||
std::sort(ids_to_exclude.begin(), ids_to_exclude.end());
|
||||
}
|
||||
return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
|
||||
auto found = std::binary_search(ids_to_exclude.begin(), ids_to_exclude.end(), id);
|
||||
return found ? BSTT_NONE : BSTT_MATH;
|
||||
}
|
||||
|
||||
// Check if it is digit. In addition to the isdigit attribute, we also check
|
||||
@ -266,13 +267,13 @@ void EquationDetect::IdentifySpecialText() {
|
||||
IdentifyBlobsToSkip(part);
|
||||
BLOBNBOX_C_IT bbox_it(part->boxes());
|
||||
// Compute the height threshold.
|
||||
GenericVector<int> blob_heights;
|
||||
std::vector<int> blob_heights;
|
||||
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
|
||||
if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
|
||||
blob_heights.push_back(bbox_it.data()->bounding_box().height());
|
||||
}
|
||||
}
|
||||
blob_heights.sort();
|
||||
std::sort(blob_heights.begin(), blob_heights.end());
|
||||
const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
|
||||
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
|
||||
if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
|
||||
@ -377,7 +378,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
|
||||
|
||||
// Pass 3: expand block equation seeds.
|
||||
while (!cp_seeds_.empty()) {
|
||||
GenericVector<ColPartition *> seeds_expanded;
|
||||
std::vector<ColPartition *> seeds_expanded;
|
||||
for (int i = 0; i < cp_seeds_.size(); ++i) {
|
||||
if (ExpandSeed(cp_seeds_[i])) {
|
||||
// If this seed is expanded, then we add it into seeds_expanded. Note
|
||||
@ -407,14 +408,14 @@ void EquationDetect::MergePartsByLocation() {
|
||||
while (true) {
|
||||
ColPartition *part = nullptr;
|
||||
// partitions that have been updated.
|
||||
GenericVector<ColPartition *> parts_updated;
|
||||
std::vector<ColPartition *> parts_updated;
|
||||
ColPartitionGridSearch gsearch(part_grid_);
|
||||
gsearch.StartFullSearch();
|
||||
while ((part = gsearch.NextFullSearch()) != nullptr) {
|
||||
if (!IsTextOrEquationType(part->type())) {
|
||||
continue;
|
||||
}
|
||||
GenericVector<ColPartition *> parts_to_merge;
|
||||
std::vector<ColPartition *> parts_to_merge;
|
||||
SearchByOverlap(part, &parts_to_merge);
|
||||
if (parts_to_merge.empty()) {
|
||||
continue;
|
||||
@ -443,7 +444,7 @@ void EquationDetect::MergePartsByLocation() {
|
||||
}
|
||||
|
||||
void EquationDetect::SearchByOverlap(ColPartition *seed,
|
||||
GenericVector<ColPartition *> *parts_overlap) {
|
||||
std::vector<ColPartition *> *parts_overlap) {
|
||||
ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);
|
||||
if (!IsTextOrEquationType(seed->type())) {
|
||||
return;
|
||||
@ -457,7 +458,7 @@ void EquationDetect::SearchByOverlap(ColPartition *seed,
|
||||
|
||||
// Search iteratively.
|
||||
ColPartition *part;
|
||||
GenericVector<ColPartition *> parts;
|
||||
std::vector<ColPartition *> parts;
|
||||
const float kLargeOverlapTh = 0.95;
|
||||
const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
|
||||
while ((part = search.NextRadSearch()) != nullptr) {
|
||||
@ -518,11 +519,11 @@ void EquationDetect::IdentifySeedParts() {
|
||||
ColPartition *part = nullptr;
|
||||
gsearch.StartFullSearch();
|
||||
|
||||
GenericVector<ColPartition *> seeds1, seeds2;
|
||||
std::vector<ColPartition *> seeds1, seeds2;
|
||||
// The left coordinates of indented text partitions.
|
||||
GenericVector<int> indented_texts_left;
|
||||
std::vector<int> indented_texts_left;
|
||||
// The foreground density of text partitions.
|
||||
GenericVector<float> texts_foreground_density;
|
||||
std::vector<float> texts_foreground_density;
|
||||
while ((part = gsearch.NextFullSearch()) != nullptr) {
|
||||
if (!IsTextOrEquationType(part->type())) {
|
||||
continue;
|
||||
@ -552,8 +553,8 @@ void EquationDetect::IdentifySeedParts() {
|
||||
}
|
||||
|
||||
// Sort the features collected from text regions.
|
||||
indented_texts_left.sort();
|
||||
texts_foreground_density.sort();
|
||||
std::sort(indented_texts_left.begin(), indented_texts_left.end());
|
||||
std::sort(texts_foreground_density.begin(), texts_foreground_density.end());
|
||||
float foreground_density_th = 0.15; // Default value.
|
||||
if (!texts_foreground_density.empty()) {
|
||||
// Use the median of the texts_foreground_density.
|
||||
@ -598,7 +599,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
|
||||
ASSERT_HOST(part);
|
||||
|
||||
// Split part horizontally, and check for each sub part.
|
||||
GenericVector<TBOX> sub_boxes;
|
||||
std::vector<TBOX> sub_boxes;
|
||||
SplitCPHorLite(part, &sub_boxes);
|
||||
float parts_passed = 0.0;
|
||||
for (int i = 0; i < sub_boxes.size(); ++i) {
|
||||
@ -615,7 +616,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
|
||||
return retval;
|
||||
}
|
||||
|
||||
void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) {
|
||||
void EquationDetect::SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
|
||||
ASSERT_HOST(part && parts_splitted);
|
||||
if (part->median_width() == 0 || part->boxes_count() == 0) {
|
||||
return;
|
||||
@ -623,7 +624,9 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *
|
||||
|
||||
// Make a copy of part, and reset parts_splitted.
|
||||
ColPartition *right_part = part->CopyButDontOwnBlobs();
|
||||
parts_splitted->delete_data_pointers();
|
||||
for (auto part : *parts_splitted) {
|
||||
delete part;
|
||||
}
|
||||
parts_splitted->clear();
|
||||
|
||||
const double kThreshold = part->median_width() * 3.0;
|
||||
@ -663,7 +666,7 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *
|
||||
parts_splitted->push_back(right_part);
|
||||
}
|
||||
|
||||
void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) {
|
||||
void EquationDetect::SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
|
||||
ASSERT_HOST(part && splitted_boxes);
|
||||
splitted_boxes->clear();
|
||||
if (part->median_width() == 0) {
|
||||
@ -701,7 +704,7 @@ void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *spl
|
||||
}
|
||||
}
|
||||
|
||||
bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left,
|
||||
bool EquationDetect::CheckForSeed2(const std::vector<int> &indented_texts_left,
|
||||
const float foreground_density_th, ColPartition *part) {
|
||||
ASSERT_HOST(part);
|
||||
const TBOX &box = part->bounding_box();
|
||||
@ -720,22 +723,25 @@ bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left
|
||||
return true;
|
||||
}
|
||||
|
||||
int EquationDetect::CountAlignment(const GenericVector<int> &sorted_vec, const int val) const {
|
||||
int EquationDetect::CountAlignment(const std::vector<int> &sorted_vec, const int val) const {
|
||||
if (sorted_vec.empty()) {
|
||||
return 0;
|
||||
}
|
||||
const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
|
||||
const int pos = sorted_vec.binary_search(val);
|
||||
const int kDistTh = static_cast<int>(round(0.03f * resolution_));
|
||||
auto pos = std::upper_bound(sorted_vec.begin(), sorted_vec.end(), val);
|
||||
if (pos > sorted_vec.begin()) {
|
||||
--pos;
|
||||
}
|
||||
int count = 0;
|
||||
|
||||
// Search left side.
|
||||
int index = pos;
|
||||
auto index = pos - sorted_vec.begin();
|
||||
while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
|
||||
count++;
|
||||
}
|
||||
|
||||
// Search right side.
|
||||
index = pos + 1;
|
||||
index = pos + 1 - sorted_vec.begin();
|
||||
while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
|
||||
count++;
|
||||
}
|
||||
@ -764,9 +770,9 @@ void EquationDetect::ComputeCPsSuperBBox() {
|
||||
|
||||
void EquationDetect::IdentifyInlinePartsHorizontal() {
|
||||
ASSERT_HOST(cps_super_bbox_);
|
||||
GenericVector<ColPartition *> new_seeds;
|
||||
std::vector<ColPartition *> new_seeds;
|
||||
const int kMarginDiffTh = IntCastRounded(0.5 * lang_tesseract_->source_resolution());
|
||||
const int kGapTh = static_cast<int>(roundf(1.0 * lang_tesseract_->source_resolution()));
|
||||
const int kGapTh = static_cast<int>(round(1.0f * lang_tesseract_->source_resolution()));
|
||||
ColPartitionGridSearch search(part_grid_);
|
||||
search.SetUniqueMode(true);
|
||||
// The center x coordinate of the cp_super_bbox_.
|
||||
@ -826,7 +832,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
|
||||
// Get the y gaps between text partitions.
|
||||
ColPartition *current = nullptr, *prev = nullptr;
|
||||
gsearch.StartFullSearch();
|
||||
GenericVector<int> ygaps;
|
||||
std::vector<int> ygaps;
|
||||
while ((current = gsearch.NextFullSearch()) != nullptr) {
|
||||
if (!PTIsTextType(current->type())) {
|
||||
continue;
|
||||
@ -851,7 +857,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
|
||||
}
|
||||
|
||||
// Compute the line spacing from ygaps: use the mean of the first half.
|
||||
ygaps.sort();
|
||||
std::sort(ygaps.begin(), ygaps.end());
|
||||
int spacing = 0, count;
|
||||
for (count = 0; count < ygaps.size() / 2; count++) {
|
||||
spacing += ygaps[count];
|
||||
@ -867,12 +873,12 @@ void EquationDetect::IdentifyInlinePartsVertical(const bool top_to_bottom,
|
||||
|
||||
// Sort cp_seeds_.
|
||||
if (top_to_bottom) { // From top to bottom.
|
||||
cp_seeds_.sort(&SortCPByTopReverse);
|
||||
std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByTopReverse);
|
||||
} else { // From bottom to top.
|
||||
cp_seeds_.sort(&SortCPByBottom);
|
||||
std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByBottom);
|
||||
}
|
||||
|
||||
GenericVector<ColPartition *> new_seeds;
|
||||
std::vector<ColPartition *> new_seeds;
|
||||
for (int i = 0; i < cp_seeds_.size(); ++i) {
|
||||
ColPartition *part = cp_seeds_[i];
|
||||
// If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
|
||||
@ -918,8 +924,8 @@ bool EquationDetect::IsInline(const bool search_bottom, const int textparts_line
|
||||
// Check if neighbor and part is inline similar.
|
||||
const float kHeightRatioTh = 0.5;
|
||||
const int kYGapTh = textparts_linespacing > 0
|
||||
? textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_))
|
||||
: static_cast<int>(roundf(0.05 * resolution_)); // Default value.
|
||||
? textparts_linespacing + static_cast<int>(round(0.02f * resolution_))
|
||||
: static_cast<int>(round(0.05f * resolution_)); // Default value.
|
||||
if (part_box.x_overlap(neighbor_box) && // Location feature.
|
||||
part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
|
||||
// Geo feature.
|
||||
@ -973,9 +979,9 @@ EquationDetect::IndentType EquationDetect::IsIndented(ColPartition *part) {
|
||||
ColPartitionGridSearch search(part_grid_);
|
||||
ColPartition *neighbor = nullptr;
|
||||
const TBOX &part_box(part->bounding_box());
|
||||
const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
|
||||
const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
|
||||
const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
|
||||
const int kXGapTh = static_cast<int>(round(0.5f * resolution_));
|
||||
const int kRadiusTh = static_cast<int>(round(3.0f * resolution_));
|
||||
const int kYGapTh = static_cast<int>(round(0.5f * resolution_));
|
||||
|
||||
// Here we use a simple approximation algorithm: from the center of part, we
|
||||
// perform the radius search, and check if we can find a neighboring partition
|
||||
@ -1036,7 +1042,7 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
|
||||
}
|
||||
|
||||
// Expand in four directions.
|
||||
GenericVector<ColPartition *> parts_to_merge;
|
||||
std::vector<ColPartition *> parts_to_merge;
|
||||
ExpandSeedHorizontal(true, seed, &parts_to_merge);
|
||||
ExpandSeedHorizontal(false, seed, &parts_to_merge);
|
||||
ExpandSeedVertical(true, seed, &parts_to_merge);
|
||||
@ -1073,10 +1079,10 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
|
||||
}
|
||||
|
||||
void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
|
||||
GenericVector<ColPartition *> *parts_to_merge) {
|
||||
std::vector<ColPartition *> *parts_to_merge) {
|
||||
ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);
|
||||
const float kYOverlapTh = 0.6;
|
||||
const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
|
||||
const int kXGapTh = static_cast<int>(round(0.2f * resolution_));
|
||||
|
||||
ColPartitionGridSearch search(part_grid_);
|
||||
const TBOX &seed_box(seed->bounding_box());
|
||||
@ -1125,10 +1131,10 @@ void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *
|
||||
}
|
||||
|
||||
void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
|
||||
GenericVector<ColPartition *> *parts_to_merge) {
|
||||
std::vector<ColPartition *> *parts_to_merge) {
|
||||
ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr && cps_super_bbox_ != nullptr);
|
||||
const float kXOverlapTh = 0.4;
|
||||
const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
|
||||
const int kYGapTh = static_cast<int>(round(0.2f * resolution_));
|
||||
|
||||
ColPartitionGridSearch search(part_grid_);
|
||||
const TBOX &seed_box(seed->bounding_box());
|
||||
@ -1138,7 +1144,7 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *
|
||||
|
||||
// Search iteratively.
|
||||
ColPartition *part = nullptr;
|
||||
GenericVector<ColPartition *> parts;
|
||||
std::vector<ColPartition *> parts;
|
||||
int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;
|
||||
while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {
|
||||
if (part == seed) {
|
||||
@ -1206,8 +1212,8 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *
|
||||
}
|
||||
|
||||
bool EquationDetect::IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const {
|
||||
const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
|
||||
const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
|
||||
const int kXGapTh = static_cast<int>(round(0.25f * resolution_));
|
||||
const int kYGapTh = static_cast<int>(round(0.05f * resolution_));
|
||||
|
||||
// Check geometric feature.
|
||||
if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) {
|
||||
@ -1244,7 +1250,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
|
||||
// Iterate over part_grid_, and find all parts that are text type but not
|
||||
// equation type.
|
||||
ColPartition *part = nullptr;
|
||||
GenericVector<ColPartition *> text_parts;
|
||||
std::vector<ColPartition *> text_parts;
|
||||
ColPartitionGridSearch gsearch(part_grid_);
|
||||
gsearch.StartFullSearch();
|
||||
while ((part = gsearch.NextFullSearch()) != nullptr) {
|
||||
@ -1257,12 +1263,12 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
|
||||
}
|
||||
|
||||
// Compute the median height of the text_parts.
|
||||
text_parts.sort(&SortCPByHeight);
|
||||
std::sort(text_parts.begin(), text_parts.end(), &SortCPByHeight);
|
||||
const TBOX &text_box = text_parts[text_parts.size() / 2]->bounding_box();
|
||||
int med_height = text_box.height();
|
||||
if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
|
||||
const TBOX &text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box();
|
||||
med_height = static_cast<int>(roundf(0.5 * (text_box.height() + med_height)));
|
||||
med_height = static_cast<int>(round(0.5f * (text_box.height() + med_height)));
|
||||
}
|
||||
|
||||
// Iterate every text_parts and check if it is a math block satellite.
|
||||
@ -1271,7 +1277,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
|
||||
if (text_box.height() > med_height) {
|
||||
continue;
|
||||
}
|
||||
GenericVector<ColPartition *> math_blocks;
|
||||
std::vector<ColPartition *> math_blocks;
|
||||
if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
|
||||
continue;
|
||||
}
|
||||
@ -1288,7 +1294,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
|
||||
}
|
||||
|
||||
bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
|
||||
GenericVector<ColPartition *> *math_blocks) {
|
||||
std::vector<ColPartition *> *math_blocks) {
|
||||
ASSERT_HOST(part != nullptr && math_blocks != nullptr);
|
||||
math_blocks->clear();
|
||||
const TBOX &part_box(part->bounding_box());
|
||||
@ -1344,7 +1350,7 @@ bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
|
||||
ColPartition *EquationDetect::SearchNNVertical(const bool search_bottom, const ColPartition *part) {
|
||||
ASSERT_HOST(part);
|
||||
ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;
|
||||
const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
|
||||
const int kYGapTh = static_cast<int>(round(resolution_ * 0.5f));
|
||||
|
||||
ColPartitionGridSearch search(part_grid_);
|
||||
search.SetUniqueMode(true);
|
||||
@ -1379,7 +1385,7 @@ bool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *nei
|
||||
if (!neighbor) {
|
||||
return false;
|
||||
}
|
||||
const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
|
||||
const int kYGapTh = static_cast<int>(round(resolution_ * 0.1f));
|
||||
return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
|
||||
}
|
||||
|
||||
|
@ -22,7 +22,6 @@
|
||||
#include <tesseract/unichar.h> // for UNICHAR_ID
|
||||
#include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
|
||||
#include "equationdetectbase.h" // for EquationDetectBase
|
||||
#include "genericvector.h" // for GenericVector
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
class TBOX;
|
||||
@ -86,7 +85,7 @@ protected:
|
||||
// parts_overlap. Note: this function may update the part_grid_, so if the
|
||||
// caller is also running ColPartitionGridSearch, use the RepositionIterator
|
||||
// to continue.
|
||||
void SearchByOverlap(ColPartition *seed, GenericVector<ColPartition *> *parts_overlap);
|
||||
void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);
|
||||
|
||||
// Insert part back into part_grid_, after it absorbs some other parts.
|
||||
void InsertPartAfterAbsorb(ColPartition *part);
|
||||
@ -106,12 +105,12 @@ protected:
|
||||
// 1. If its left is aligned with any coordinates in indented_texts_left,
|
||||
// which we assume have been sorted.
|
||||
// 2. If its foreground density is over foreground_density_th.
|
||||
bool CheckForSeed2(const GenericVector<int> &indented_texts_left,
|
||||
bool CheckForSeed2(const std::vector<int> &indented_texts_left,
|
||||
const float foreground_density_th, ColPartition *part);
|
||||
|
||||
// Count the number of values in sorted_vec that are close to val, used to
|
||||
// check if a partition is aligned with text partitions.
|
||||
int CountAlignment(const GenericVector<int> &sorted_vec, const int val) const;
|
||||
int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;
|
||||
|
||||
// Check for a seed candidate using the foreground pixel density. And we
|
||||
// return true if the density is below a certain threshold, because characters
|
||||
@ -120,14 +119,14 @@ protected:
|
||||
|
||||
// A light version of SplitCPHor: instead of really doing the part split, we
|
||||
// simply compute the union bounding box of each split part.
|
||||
void SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes);
|
||||
void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);
|
||||
|
||||
// Split the part (horizontally), and save the split result into
|
||||
// parts_splitted. Note that it is caller's responsibility to release the
|
||||
// memory owned by parts_splitted. On the other hand, the part is unchanged
|
||||
// during this process and still owns the blobs, so do NOT call DeleteBoxes
|
||||
// when freeing the colpartitions in parts_splitted.
|
||||
void SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted);
|
||||
void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);
|
||||
|
||||
// Check the density for a seed candidate (part) using its math density and
|
||||
// italic density, returns true if the check passed.
|
||||
@ -167,9 +166,9 @@ protected:
|
||||
// merged with seed, remove them from part_grid_, and put them into
|
||||
// parts_to_merge.
|
||||
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
|
||||
GenericVector<ColPartition *> *parts_to_merge);
|
||||
std::vector<ColPartition *> *parts_to_merge);
|
||||
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
|
||||
GenericVector<ColPartition *> *parts_to_merge);
|
||||
std::vector<ColPartition *> *parts_to_merge);
|
||||
|
||||
// Check if a part_box is the small neighbor of seed_box.
|
||||
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;
|
||||
@ -190,7 +189,7 @@ protected:
|
||||
|
||||
// Check if part is the satellite of one/two math blocks. If it is, we return
|
||||
// true, and save the blocks into math_blocks.
|
||||
bool IsMathBlockSatellite(ColPartition *part, GenericVector<ColPartition *> *math_blocks);
|
||||
bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);
|
||||
|
||||
// Search the nearest neighbor of part in one vertical direction as defined in
|
||||
// search_bottom. It returns the neighbor found that has major x overlap with it,
|
||||
@ -237,7 +236,7 @@ protected:
|
||||
TBOX *cps_super_bbox_;
|
||||
|
||||
// The seed ColPartition for equation region.
|
||||
GenericVector<ColPartition *> cp_seeds_;
|
||||
std::vector<ColPartition *> cp_seeds_;
|
||||
|
||||
// The resolution (dpi) of the processing image.
|
||||
int resolution_;
|
||||
|
@ -18,7 +18,6 @@
|
||||
|
||||
#include "paragraphs.h"
|
||||
|
||||
#include "genericvector.h" // for GenericVector, GenericVectorEqEq
|
||||
#include "helpers.h" // for UpdateRange, ClipToRange
|
||||
#include "host.h" // for NearlyEqual
|
||||
#include "mutableiterator.h" // for MutableIterator
|
||||
@ -72,7 +71,7 @@ static int Epsilon(int space_pix) {
|
||||
}
|
||||
|
||||
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
|
||||
const GenericVector<RowScratchRegisters> *rows, int row_start,
|
||||
const std::vector<RowScratchRegisters> *rows, int row_start,
|
||||
int row_end) {
|
||||
if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
|
||||
tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", row_start, row_end,
|
||||
@ -134,7 +133,7 @@ static std::string RtlEmbed(const std::string &word, bool rtlify) {
|
||||
|
||||
// Print the current thoughts of the paragraph detector.
|
||||
static void PrintDetectorState(const ParagraphTheory &theory,
|
||||
const GenericVector<RowScratchRegisters> &rows) {
|
||||
const std::vector<RowScratchRegisters> &rows) {
|
||||
std::vector<std::vector<std::string>> output;
|
||||
output.push_back(std::vector<std::string>());
|
||||
output.back().push_back("#row");
|
||||
@ -173,7 +172,7 @@ static void PrintDetectorState(const ParagraphTheory &theory,
|
||||
}
|
||||
|
||||
static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
|
||||
const GenericVector<RowScratchRegisters> &rows) {
|
||||
const std::vector<RowScratchRegisters> &rows) {
|
||||
if (!should_print)
|
||||
return;
|
||||
tprintf("# %s\n", phase);
|
||||
@ -181,7 +180,7 @@ static void DebugDump(bool should_print, const char *phase, const ParagraphTheor
|
||||
}
|
||||
|
||||
// Print out the text for rows[row_start, row_end)
|
||||
static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows, int row_start,
|
||||
static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
|
||||
int row_end) {
|
||||
tprintf("======================================\n");
|
||||
for (int row = row_start; row < row_end; row++) {
|
||||
@ -398,6 +397,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
|
||||
return pos == werd->length();
|
||||
}
|
||||
|
||||
// Appends data to vector unless an element comparing equal to it (operator==)
// is already present. Free-function replacement for the push_back_new member
// that the GenericVector-based code called. The duplicate check is a linear
// scan over the existing elements.
template <class T>
void push_back_new(std::vector<T> &vector, const T &data) {
  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
    vector.push_back(data);
  }
}
|
||||
|
||||
// ========= Brain Dead Language Model (combined entry points) ================
|
||||
|
||||
// Given the leftmost word of a line either as a Tesseract unicharset + werd
|
||||
@ -581,7 +587,7 @@ void RowScratchRegisters::SetStartLine() {
|
||||
tprintf("Trying to set a line to be START when it's already BODY.\n");
|
||||
}
|
||||
if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
|
||||
hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
|
||||
push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
|
||||
}
|
||||
}
|
||||
|
||||
@ -591,42 +597,44 @@ void RowScratchRegisters::SetBodyLine() {
|
||||
tprintf("Trying to set a line to be BODY when it's already START.\n");
|
||||
}
|
||||
if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
|
||||
hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
|
||||
push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
|
||||
}
|
||||
}
|
||||
|
||||
// Records the hypothesis that this row is a start line (LT_START) of `model`,
// unless an equal hypothesis is already stored. Afterwards, the first stored
// LT_START hypothesis with a null model, if there is one, is removed.
void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
|
||||
hypotheses_.push_back_new(LineHypothesis(LT_START, model));
|
||||
int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
|
||||
if (old_idx >= 0)
|
||||
hypotheses_.remove(old_idx);
|
||||
push_back_new(hypotheses_, LineHypothesis(LT_START, model));
|
||||
// Look up the model-less LT_START entry so it can be dropped.
auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
|
||||
if (found != hypotheses_.end()) {
|
||||
hypotheses_.erase(found);
|
||||
}
|
||||
}
|
||||
|
||||
// Records the hypothesis that this row is a body line (LT_BODY) of `model`,
// unless an equal hypothesis is already stored. Afterwards, the first stored
// LT_BODY hypothesis with a null model, if there is one, is removed.
void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
|
||||
hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
|
||||
int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
|
||||
if (old_idx >= 0)
|
||||
hypotheses_.remove(old_idx);
|
||||
push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
|
||||
// Look up the model-less LT_BODY entry so it can be dropped.
auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
|
||||
if (found != hypotheses_.end()) {
|
||||
hypotheses_.erase(found);
|
||||
}
|
||||
}
|
||||
|
||||
// Appends to *models the model of every LT_START hypothesis on this row whose
// model passes StrongModel(). Models already present in *models are not added
// a second time.
void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
|
||||
for (int h = 0; h < hypotheses_.size(); h++) {
|
||||
if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
|
||||
models->push_back_new(hypotheses_[h].model);
|
||||
push_back_new(*models, hypotheses_[h].model);
|
||||
}
|
||||
}
|
||||
|
||||
// Appends to *models the model of every hypothesis on this row (of any line
// type) whose model passes StrongModel(). Models already present in *models
// are not added a second time.
void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
|
||||
for (int h = 0; h < hypotheses_.size(); h++) {
|
||||
if (StrongModel(hypotheses_[h].model))
|
||||
models->push_back_new(hypotheses_[h].model);
|
||||
push_back_new(*models, hypotheses_[h].model);
|
||||
}
|
||||
}
|
||||
|
||||
// Appends to *models every non-null model referenced by this row's
// hypotheses. Models already present in *models are not added a second time.
void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
|
||||
for (int h = 0; h < hypotheses_.size(); h++) {
|
||||
if (hypotheses_[h].model != nullptr)
|
||||
models->push_back_new(hypotheses_[h].model);
|
||||
push_back_new(*models, hypotheses_[h].model);
|
||||
}
|
||||
}
|
||||
|
||||
@ -647,8 +655,8 @@ void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models
|
||||
if (models.empty())
|
||||
return;
|
||||
for (int h = hypotheses_.size() - 1; h >= 0; h--) {
|
||||
if (!models.contains(hypotheses_[h].model)) {
|
||||
hypotheses_.remove(h);
|
||||
if (!contains(models, hypotheses_[h].model)) {
|
||||
hypotheses_.erase(hypotheses_.begin() + h);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -672,15 +680,15 @@ public:
|
||||
int size() const {
|
||||
return values_.size();
|
||||
}
|
||||
void GetClusters(GenericVector<Cluster> *clusters);
|
||||
void GetClusters(std::vector<Cluster> *clusters);
|
||||
|
||||
private:
|
||||
int max_cluster_width_;
|
||||
GenericVector<int> values_;
|
||||
std::vector<int> values_;
|
||||
};
|
||||
|
||||
// Return the index of the cluster closest to value.
|
||||
static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
|
||||
static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
|
||||
int best_index = 0;
|
||||
for (int i = 0; i < clusters.size(); i++) {
|
||||
if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center))
|
||||
@ -689,9 +697,9 @@ static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
|
||||
return best_index;
|
||||
}
|
||||
|
||||
void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
|
||||
void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
|
||||
clusters->clear();
|
||||
values_.sort();
|
||||
std::sort(values_.begin(), values_.end());
|
||||
for (int i = 0; i < values_.size();) {
|
||||
int orig_i = i;
|
||||
int lo = values_[i];
|
||||
@ -705,16 +713,16 @@ void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
|
||||
|
||||
// Calculate left- and right-indent tab stop values seen in
|
||||
// rows[row_start, row_end) given a tolerance of tolerance.
|
||||
static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
|
||||
int tolerance, GenericVector<Cluster> *left_tabs,
|
||||
GenericVector<Cluster> *right_tabs) {
|
||||
static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
|
||||
int tolerance, std::vector<Cluster> *left_tabs,
|
||||
std::vector<Cluster> *right_tabs) {
|
||||
if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
|
||||
return;
|
||||
// First pass: toss all left and right indents into clusterers.
|
||||
SimpleClusterer initial_lefts(tolerance);
|
||||
SimpleClusterer initial_rights(tolerance);
|
||||
GenericVector<Cluster> initial_left_tabs;
|
||||
GenericVector<Cluster> initial_right_tabs;
|
||||
std::vector<Cluster> initial_left_tabs;
|
||||
std::vector<Cluster> initial_right_tabs;
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
initial_lefts.Add((*rows)[i].lindent_);
|
||||
initial_rights.Add((*rows)[i].rindent_);
|
||||
@ -782,7 +790,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
|
||||
}
|
||||
}
|
||||
if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
|
||||
left_tabs->remove(to_prune);
|
||||
left_tabs->erase(left_tabs->begin() + to_prune);
|
||||
}
|
||||
}
|
||||
if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
|
||||
@ -793,7 +801,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
|
||||
}
|
||||
}
|
||||
if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
|
||||
right_tabs->remove(to_prune);
|
||||
right_tabs->erase(right_tabs->begin() + to_prune);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -817,7 +825,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
|
||||
// Case 2b: Fully Justified. (eop_threshold > 0)
|
||||
// We mark a line as short (end of paragraph) if the offside indent
|
||||
// is greater than eop_threshold.
|
||||
static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
|
||||
static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
|
||||
const ParagraphModel *model, bool ltr, int eop_threshold) {
|
||||
if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
|
||||
return;
|
||||
@ -861,7 +869,7 @@ static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_
|
||||
// Further, this struct holds the data we amass for the (single) ParagraphModel
|
||||
// we'll assign to the text lines (assuming we get that far).
|
||||
struct GeometricClassifierState {
|
||||
GeometricClassifierState(int dbg_level, GenericVector<RowScratchRegisters> *r, int r_start,
|
||||
GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
|
||||
int r_end)
|
||||
: debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
|
||||
tolerance = InterwordSpace(*r, r_start, r_end);
|
||||
@ -886,7 +894,7 @@ struct GeometricClassifierState {
|
||||
}
|
||||
|
||||
// Align tabs are the tab stops the text is aligned to.
|
||||
const GenericVector<Cluster> &AlignTabs() const {
|
||||
const std::vector<Cluster> &AlignTabs() const {
|
||||
if (just == tesseract::JUSTIFICATION_RIGHT)
|
||||
return right_tabs;
|
||||
return left_tabs;
|
||||
@ -897,7 +905,7 @@ struct GeometricClassifierState {
|
||||
// Note that for a left-to-right text which is aligned to the right such as
|
||||
// this function comment, the offside tabs are the horizontal tab stops
|
||||
// marking the beginning of ("Note", "this" and "marking").
|
||||
const GenericVector<Cluster> &OffsideTabs() const {
|
||||
const std::vector<Cluster> &OffsideTabs() const {
|
||||
if (just == tesseract::JUSTIFICATION_RIGHT)
|
||||
return left_tabs;
|
||||
return right_tabs;
|
||||
@ -940,7 +948,7 @@ struct GeometricClassifierState {
|
||||
|
||||
// The Geometric Classifier was asked to find a single paragraph model
|
||||
// to fit the text rows (*rows)[row_start, row_end)
|
||||
GenericVector<RowScratchRegisters> *rows;
|
||||
std::vector<RowScratchRegisters> *rows;
|
||||
int row_start = 0;
|
||||
int row_end = 0;
|
||||
|
||||
@ -953,8 +961,8 @@ struct GeometricClassifierState {
|
||||
|
||||
// These left and right tab stops were determined to be the common tab
|
||||
// stops for the given text.
|
||||
GenericVector<Cluster> left_tabs;
|
||||
GenericVector<Cluster> right_tabs;
|
||||
std::vector<Cluster> left_tabs;
|
||||
std::vector<Cluster> right_tabs;
|
||||
|
||||
// These are parameters we must determine to create a ParagraphModel.
|
||||
tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
|
||||
@ -1083,7 +1091,7 @@ static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricCla
|
||||
// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
|
||||
// it's worth guessing that (A1b) is the correct interpretation if there are
|
||||
// far more "full" lines than "short" lines.
|
||||
static void GeometricClassify(int debug_level, GenericVector<RowScratchRegisters> *rows,
|
||||
static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
|
||||
int row_start, int row_end, ParagraphTheory *theory) {
|
||||
if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
|
||||
return;
|
||||
@ -1223,7 +1231,7 @@ const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
|
||||
}
|
||||
auto *m = new ParagraphModel(model);
|
||||
models_->push_back(m);
|
||||
models_we_added_.push_back_new(m);
|
||||
push_back_new(models_we_added_, m);
|
||||
return m;
|
||||
}
|
||||
|
||||
@ -1231,7 +1239,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
|
||||
size_t w = 0;
|
||||
for (size_t r = 0; r < models_->size(); r++) {
|
||||
ParagraphModel *m = (*models_)[r];
|
||||
if (!used_models.contains(m) && models_we_added_.contains(m)) {
|
||||
if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
|
||||
delete m;
|
||||
} else {
|
||||
if (r > w) {
|
||||
@ -1246,7 +1254,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
|
||||
// Examine rows[start, end) and try to determine if an existing non-centered
|
||||
// paragraph model would fit them perfectly. If so, return a pointer to it.
|
||||
// If not, return nullptr.
|
||||
const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegisters> *rows,
|
||||
const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
|
||||
int start, int end) const {
|
||||
for (const auto *model : *models_) {
|
||||
if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model))
|
||||
@ -1258,7 +1266,7 @@ const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegist
|
||||
void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
|
||||
for (const auto *model : *models_) {
|
||||
if (model->justification() != JUSTIFICATION_CENTER)
|
||||
models->push_back_new(model);
|
||||
push_back_new(*models, model);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1272,7 +1280,7 @@ int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
|
||||
return -1;
|
||||
}
|
||||
|
||||
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
|
||||
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
|
||||
const ParagraphModel *model) {
|
||||
if (!StrongModel(model)) {
|
||||
tprintf("ValidFirstLine() should only be called with strong models!\n");
|
||||
@ -1281,7 +1289,7 @@ bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
|
||||
(*rows)[row].rindent_, (*rows)[row].rmargin_);
|
||||
}
|
||||
|
||||
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
|
||||
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
|
||||
const ParagraphModel *model) {
|
||||
if (!StrongModel(model)) {
|
||||
tprintf("ValidBodyLine() should only be called with strong models!\n");
|
||||
@ -1290,7 +1298,7 @@ bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
|
||||
(*rows)[row].rindent_, (*rows)[row].rmargin_);
|
||||
}
|
||||
|
||||
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b,
|
||||
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
|
||||
const ParagraphModel *model) {
|
||||
if (model != kCrownRight && model != kCrownLeft) {
|
||||
tprintf("CrownCompatible() should only be called with crown models!\n");
|
||||
@ -1308,7 +1316,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int
|
||||
|
||||
// =============== Implementation of ParagraphModelSmearer ====================
|
||||
|
||||
ParagraphModelSmearer::ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
|
||||
ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
|
||||
int row_start, int row_end, ParagraphTheory *theory)
|
||||
: theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
|
||||
if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
|
||||
@ -1341,7 +1349,7 @@ void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
|
||||
// This is basic filtering; we check likely paragraph starty-ness down
|
||||
// below in Smear() -- you know, whether the first word would have fit
|
||||
// and such.
|
||||
still_open.push_back_new(opened[m]);
|
||||
push_back_new(still_open, opened[m]);
|
||||
}
|
||||
}
|
||||
OpenModels(row + 1) = still_open;
|
||||
@ -1449,7 +1457,7 @@ void ParagraphModelSmearer::Smear() {
|
||||
|
||||
// Find out what ParagraphModels are actually used, and discard any
|
||||
// that are not.
|
||||
static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
|
||||
static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
|
||||
ParagraphTheory *theory) {
|
||||
SetOfModels used_models;
|
||||
for (int i = 0; i < rows.size(); i++) {
|
||||
@ -1483,7 +1491,7 @@ static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
|
||||
// sequences of body lines of equivalent type abutted against the beginning
|
||||
// or a body or start line of a different type into a crown paragraph.
|
||||
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
|
||||
GenericVector<RowScratchRegisters> *rows) {
|
||||
std::vector<RowScratchRegisters> *rows) {
|
||||
int start;
|
||||
for (int end = rows->size(); end > 0; end = start) {
|
||||
// Search back for a body line of a unique type.
|
||||
@ -1546,7 +1554,7 @@ static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
|
||||
// really just ignore it as an outlier. To express this, we allow the
|
||||
// user to specify the percentile (0..100) of indent values to use as
|
||||
// the common margin for each row in the run of rows[start, end).
|
||||
void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start,
|
||||
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
|
||||
int end, int percentile) {
|
||||
if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
|
||||
return;
|
||||
@ -1585,7 +1593,7 @@ void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows
|
||||
}
|
||||
|
||||
// Return the median inter-word space in rows[row_start, row_end).
|
||||
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end) {
|
||||
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
|
||||
if (row_end < row_start + 1)
|
||||
return 1;
|
||||
int word_height =
|
||||
@ -1666,7 +1674,7 @@ static bool LikelyParagraphStart(const RowScratchRegisters &before,
|
||||
// If the rows given could be a consistent start to a paragraph, set *consistent
|
||||
// true.
|
||||
static ParagraphModel InternalParagraphModelByOutline(
|
||||
const GenericVector<RowScratchRegisters> *rows, int start, int end, int tolerance,
|
||||
const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
|
||||
bool *consistent) {
|
||||
int ltr_line_count = 0;
|
||||
for (int i = start; i < end; i++) {
|
||||
@ -1763,7 +1771,7 @@ static ParagraphModel InternalParagraphModelByOutline(
|
||||
// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
|
||||
// output if we're debugging.
|
||||
static ParagraphModel ParagraphModelByOutline(int debug_level,
|
||||
const GenericVector<RowScratchRegisters> *rows,
|
||||
const std::vector<RowScratchRegisters> *rows,
|
||||
int start, int end, int tolerance) {
|
||||
bool unused_consistent;
|
||||
ParagraphModel retval =
|
||||
@ -1776,7 +1784,7 @@ static ParagraphModel ParagraphModelByOutline(int debug_level,
|
||||
}
|
||||
|
||||
// Do rows[start, end) form a single instance of the given paragraph model?
|
||||
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end,
|
||||
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
|
||||
const ParagraphModel *model) {
|
||||
if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
|
||||
return false;
|
||||
@ -1800,7 +1808,7 @@ bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int
|
||||
// We only take the very strongest signals, as we don't want to get
|
||||
// confused and mark up centered text, poetry, or source code as
|
||||
// clearly part of a typical paragraph.
|
||||
static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row_start,
|
||||
static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
|
||||
int row_end) {
|
||||
// Record patently obvious body text.
|
||||
for (int i = row_start + 1; i < row_end; i++) {
|
||||
@ -1862,7 +1870,7 @@ static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row
|
||||
// Look for sequences of a start line followed by some body lines in
|
||||
// rows[row_start, row_end) and create ParagraphModels for them if
|
||||
// they seem coherent.
|
||||
static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegisters> *rows,
|
||||
static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
|
||||
int row_start, int row_end, bool allow_flush_models,
|
||||
ParagraphTheory *theory) {
|
||||
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
|
||||
@ -1951,7 +1959,7 @@ static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegiste
|
||||
// clues.
|
||||
// (3) Form models for any sequence of start + continuation lines.
|
||||
// (4) Smear the paragraph models to cover surrounding text.
|
||||
static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegisters> *rows,
|
||||
static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
|
||||
int row_start, int row_end, ParagraphTheory *theory) {
|
||||
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
|
||||
return;
|
||||
@ -1979,7 +1987,7 @@ static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegi
|
||||
smearer.Smear();
|
||||
}
|
||||
|
||||
static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows, int row_start,
|
||||
static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
|
||||
int row_end, ParagraphTheory *theory) {
|
||||
for (int i = row_start + 1; i < row_end - 1; i++) {
|
||||
if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
|
||||
@ -1994,8 +2002,8 @@ static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
|
||||
// Collect sequences of unique hypotheses in row registers and create proper
|
||||
// paragraphs for them, referencing the paragraphs in row_owners.
|
||||
static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
|
||||
GenericVector<RowScratchRegisters> &rows,
|
||||
GenericVector<PARA *> *row_owners,
|
||||
std::vector<RowScratchRegisters> &rows,
|
||||
std::vector<PARA *> *row_owners,
|
||||
ParagraphTheory *theory) {
|
||||
int end = rows.size();
|
||||
int start;
|
||||
@ -2090,7 +2098,7 @@ struct Interval {
|
||||
// (1) If a line is surrounded by lines of unknown type, it's weak.
|
||||
// (2) If two lines in a row are start lines for a given paragraph type, but
|
||||
// after that the same paragraph type does not continue, they're weak.
|
||||
static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
|
||||
static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
|
||||
SetOfModels row_models;
|
||||
rows[row].StrongHypotheses(&row_models);
|
||||
|
||||
@ -2145,8 +2153,8 @@ static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int ro
|
||||
// + Crown paragraphs not immediately followed by a strongly modeled line.
|
||||
// + Single line paragraphs surrounded by text that doesn't match the
|
||||
// model.
|
||||
static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
|
||||
GenericVector<Interval> *to_fix, int row_start, int row_end) {
|
||||
static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
|
||||
std::vector<Interval> *to_fix, int row_start, int row_end) {
|
||||
to_fix->clear();
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
bool needs_fixing = false;
|
||||
@ -2195,8 +2203,8 @@ static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
|
||||
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
|
||||
// normalize each row_owner to point to an actual PARA, and output the
|
||||
// paragraphs in order onto paragraphs.
|
||||
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs) {
|
||||
GenericVector<PARA *> &rows = *row_owners;
|
||||
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
|
||||
std::vector<PARA *> &rows = *row_owners;
|
||||
paragraphs->clear();
|
||||
PARA_IT out(paragraphs);
|
||||
PARA *formerly_null = nullptr;
|
||||
@ -2226,16 +2234,16 @@ void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *
|
||||
// models - the list of paragraph models referenced by the PARA objects.
|
||||
// caller is responsible for deleting the models.
|
||||
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs,
|
||||
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
|
||||
std::vector<ParagraphModel *> *models) {
|
||||
GenericVector<RowScratchRegisters> rows;
|
||||
std::vector<RowScratchRegisters> rows;
|
||||
ParagraphTheory theory(models);
|
||||
|
||||
// Initialize row_owners to be a bunch of nullptr pointers.
|
||||
row_owners->init_to_size(row_infos->size(), nullptr);
|
||||
row_owners->resize(row_infos->size());
|
||||
|
||||
// Set up row scratch registers for the main algorithm.
|
||||
rows.init_to_size(row_infos->size(), RowScratchRegisters());
|
||||
rows.resize(row_infos->size(), RowScratchRegisters());
|
||||
for (int i = 0; i < row_infos->size(); i++) {
|
||||
rows[i].Init((*row_infos)[i]);
|
||||
}
|
||||
@ -2249,7 +2257,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
|
||||
DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
|
||||
|
||||
GenericVector<Interval> leftovers;
|
||||
std::vector<Interval> leftovers;
|
||||
LeftoverSegments(rows, &leftovers, 0, rows.size());
|
||||
for (int i = 0; i < leftovers.size(); i++) {
|
||||
// Pass 2a:
|
||||
@ -2263,7 +2271,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
// If we had any luck in pass 2a, we got part of the page and didn't
|
||||
// know how to classify a few runs of rows. Take the segments that
|
||||
// didn't find a model and reprocess them individually.
|
||||
GenericVector<Interval> leftovers2;
|
||||
std::vector<Interval> leftovers2;
|
||||
LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
|
||||
bool pass2a_was_useful =
|
||||
leftovers2.size() > 1 ||
|
||||
@ -2422,7 +2430,7 @@ static void InitializeRowInfo(bool after_recognition, const MutableIterator &it,
|
||||
}
|
||||
|
||||
PAGE_RES_IT page_res_it = *it.PageResIt();
|
||||
GenericVector<WERD_RES *> werds;
|
||||
std::vector<WERD_RES *> werds;
|
||||
WERD_RES *word_res = page_res_it.restart_row();
|
||||
ROW_RES *this_row = page_res_it.row();
|
||||
int num_leaders = 0;
|
||||
@ -2505,12 +2513,12 @@ void DetectParagraphs(int debug_level, bool after_text_recognition,
|
||||
}
|
||||
|
||||
// Run the paragraph detection algorithm.
|
||||
GenericVector<PARA *> row_owners;
|
||||
GenericVector<PARA *> the_paragraphs;
|
||||
std::vector<PARA *> row_owners;
|
||||
std::vector<PARA *> the_paragraphs;
|
||||
if (!is_image_block) {
|
||||
DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
|
||||
} else {
|
||||
row_owners.init_to_size(row_infos.size(), nullptr);
|
||||
row_owners.resize(row_infos.size());
|
||||
CanonicalizeDetectionResults(&row_owners, block->para_list());
|
||||
}
|
||||
|
||||
|
@ -31,9 +31,6 @@ class ParagraphModel;
|
||||
class PARA_LIST;
|
||||
struct PARA;
|
||||
|
||||
template <typename T>
|
||||
class GenericVector;
|
||||
|
||||
// This structure captures all information needed about a text line for the
|
||||
// purposes of paragraph detection. It is meant to be exceedingly light-weight
|
||||
// so that we can easily test paragraph detection independent of the rest of
|
||||
@ -90,7 +87,7 @@ public:
|
||||
// caller is responsible for deleting the models.
|
||||
TESS_API
|
||||
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs,
|
||||
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
|
||||
std::vector<ParagraphModel *> *models);
|
||||
|
||||
// Given a MutableIterator to the start of a block, run DetectParagraphs on
|
||||
|
@ -95,7 +95,7 @@ struct LineHypothesis {
|
||||
|
||||
class ParagraphTheory; // Forward Declaration
|
||||
|
||||
using SetOfModels = GenericVector<const ParagraphModel *>;
|
||||
using SetOfModels = std::vector<const ParagraphModel *>;
|
||||
|
||||
// Row Scratch Registers are data generated by the paragraph detection
|
||||
// algorithm based on a RowInfo input.
|
||||
@ -123,7 +123,7 @@ public:
|
||||
|
||||
// Clear all hypotheses about this line.
|
||||
void SetUnknown() {
|
||||
hypotheses_.truncate(0);
|
||||
hypotheses_.clear();
|
||||
}
|
||||
|
||||
// Append all hypotheses of strong models that match this row as a start.
|
||||
@ -190,7 +190,7 @@ public:
|
||||
|
||||
private:
|
||||
// Hypotheses of either LT_START or LT_BODY
|
||||
GenericVector<LineHypothesis> hypotheses_;
|
||||
std::vector<LineHypothesis> hypotheses_;
|
||||
};
|
||||
|
||||
// A collection of convenience functions for wrapping the set of
|
||||
@ -219,21 +219,21 @@ public:
|
||||
|
||||
// If any of the non-centered paragraph models we know about fit
|
||||
// rows[start, end), return it. Else nullptr.
|
||||
const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows, int start,
|
||||
const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
|
||||
int end) const;
|
||||
|
||||
int IndexOf(const ParagraphModel *model) const;
|
||||
|
||||
private:
|
||||
std::vector<ParagraphModel *> *models_;
|
||||
GenericVector<ParagraphModel *> models_we_added_;
|
||||
std::vector<ParagraphModel *> models_we_added_;
|
||||
};
|
||||
|
||||
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
|
||||
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
|
||||
const ParagraphModel *model);
|
||||
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
|
||||
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
|
||||
const ParagraphModel *model);
|
||||
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b,
|
||||
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
|
||||
const ParagraphModel *model);
|
||||
|
||||
// A class for smearing Paragraph Model hypotheses to surrounding rows.
|
||||
@ -245,7 +245,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int
|
||||
// "smear" our models over the text.
|
||||
class ParagraphModelSmearer {
|
||||
public:
|
||||
ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
|
||||
ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
|
||||
ParagraphTheory *theory);
|
||||
|
||||
// Smear forward paragraph models from existing row markings to subsequent
|
||||
@ -266,7 +266,7 @@ private:
|
||||
}
|
||||
|
||||
ParagraphTheory *theory_;
|
||||
GenericVector<RowScratchRegisters> *rows_;
|
||||
std::vector<RowScratchRegisters> *rows_;
|
||||
int row_start_;
|
||||
int row_end_;
|
||||
|
||||
@ -284,11 +284,11 @@ private:
|
||||
// Clear all hypotheses about lines [start, end) and reset the margins to the
|
||||
// percentile (0..100) value of the left and right row edges for this run of
|
||||
// rows.
|
||||
void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start,
|
||||
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
|
||||
int end, int percentile);
|
||||
|
||||
// Return the median inter-word space in rows[row_start, row_end).
|
||||
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end);
|
||||
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
|
||||
|
||||
// Return whether the first word on the after line can fit in the space at
|
||||
// the end of the before line (knowing which way the text is aligned and read).
|
||||
@ -300,13 +300,13 @@ bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRe
|
||||
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
|
||||
|
||||
// Do rows[start, end) form a single instance of the given paragraph model?
|
||||
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end,
|
||||
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
|
||||
const ParagraphModel *model);
|
||||
|
||||
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
|
||||
// normalize each row_owner to point to an actual PARA, and output the
|
||||
// paragraphs in order onto paragraphs.
|
||||
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs);
|
||||
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
|
@ -45,7 +45,7 @@
|
||||
|
||||
#include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
|
||||
#include <tesseract/unichar.h> // for UNICHAR_ID
|
||||
#include "genericvector.h" // for GenericVector, PointerVector
|
||||
#include "genericvector.h" // for PointerVector
|
||||
|
||||
#include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe...
|
||||
|
||||
@ -398,27 +398,27 @@ public:
|
||||
// Input: a set of noisy outlines that probably belong to the real_word.
|
||||
// Output: outlines that overlapped blobs are set to nullptr and put back into
|
||||
// the word, either in the blobs or in the reject list.
|
||||
void AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
|
||||
void AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
|
||||
WERD *real_word, PAGE_RES_IT *pr_it,
|
||||
GenericVector<bool> *word_wanted,
|
||||
GenericVector<bool> *overlapped_any_blob,
|
||||
GenericVector<C_BLOB *> *target_blobs);
|
||||
std::vector<bool> *word_wanted,
|
||||
std::vector<bool> *overlapped_any_blob,
|
||||
std::vector<C_BLOB *> *target_blobs);
|
||||
// Attempts to assign non-overlapping outlines to their nearest blobs or
|
||||
// make new blobs out of them.
|
||||
void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
|
||||
void AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
|
||||
WERD *real_word, PAGE_RES_IT *pr_it,
|
||||
GenericVector<bool> *word_wanted,
|
||||
GenericVector<C_BLOB *> *target_blobs);
|
||||
std::vector<bool> *word_wanted,
|
||||
std::vector<C_BLOB *> *target_blobs);
|
||||
// Starting with ok_outlines set to indicate which outlines overlap the blob,
|
||||
// chooses the optimal set (approximately) and returns true if any outlines
|
||||
// are desired, in which case ok_outlines indicates which ones.
|
||||
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
|
||||
C_BLOB *blob, const GenericVector<C_OUTLINE *> &outlines,
|
||||
C_BLOB *blob, const std::vector<C_OUTLINE *> &outlines,
|
||||
int num_outlines, std::vector<bool> *ok_outlines);
|
||||
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
|
||||
// the inclusion of the outlines, and returns the certainty of the raw choice.
|
||||
float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
|
||||
const GenericVector<C_OUTLINE *> &outlines, int pass_n,
|
||||
const std::vector<C_OUTLINE *> &outlines, int pass_n,
|
||||
PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str);
|
||||
// Classifies the given blob (part of word_data->word->word) as an individual
|
||||
// word, using languages, chopper etc, returning only the certainty of the
|
||||
@ -703,22 +703,22 @@ public:
|
||||
void ReSegmentByClassification(PAGE_RES *page_res);
|
||||
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
|
||||
// Returns false if an invalid UNICHAR_ID is encountered.
|
||||
bool ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids);
|
||||
bool ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids);
|
||||
// Resegments the word to achieve the target_text from the classifier.
|
||||
// Returns false if the re-segmentation fails.
|
||||
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
|
||||
// applies a full search on the classifier results to find the best classified
|
||||
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
||||
// substitutions ARE used.
|
||||
bool FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res);
|
||||
bool FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res);
|
||||
// Recursive helper to find a match to the target_text (from text_index
|
||||
// position) in the choices (from choices_pos position).
|
||||
// Choices is an array of GenericVectors, of length choices_length, with each
|
||||
// Choices is an array of vectors of length choices_length, with each
|
||||
// element representing a starting position in the word, and the
|
||||
// GenericVector holding classification results for a sequence of consecutive
|
||||
// vector holding classification results for a sequence of consecutive
|
||||
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
||||
void SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
||||
int choices_length, const GenericVector<UNICHAR_ID> &target_text,
|
||||
void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
||||
int choices_length, const std::vector<UNICHAR_ID> &target_text,
|
||||
int text_index, float rating, std::vector<int> *segmentation,
|
||||
float *best_rating, std::vector<int> *best_segmentation);
|
||||
// Counts up the labelled words and the blobs within.
|
||||
|
@ -502,7 +502,7 @@ void WERD::CleanNoise(float size_threshold) {
|
||||
|
||||
// Extracts all the noise outlines and stuffs the pointers into the given
|
||||
// vector of outlines. Afterwards, the outlines vector owns the pointers.
|
||||
void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) {
|
||||
void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) {
|
||||
C_BLOB_IT rej_it(&rej_cblobs);
|
||||
for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
|
||||
C_BLOB *blob = rej_it.extract();
|
||||
@ -516,13 +516,13 @@ void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) {
|
||||
// back in rej_cblobs where they came from. Where the target_blobs entry is
|
||||
// nullptr, a run of wanted outlines is put into a single new blob.
|
||||
// Ownership of the outlines is transferred back to the word. (Hence
|
||||
// GenericVector and not PointerVector.)
|
||||
// vector and not PointerVector.)
|
||||
// Returns true if any new blob was added to the start of the word, which
|
||||
// suggests that it might need joining to the word before it, and likewise
|
||||
// sets make_next_word_fuzzy true if any new blob was added to the end.
|
||||
bool WERD::AddSelectedOutlines(const GenericVector<bool> &wanted,
|
||||
const GenericVector<C_BLOB *> &target_blobs,
|
||||
const GenericVector<C_OUTLINE *> &outlines,
|
||||
bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,
|
||||
const std::vector<C_BLOB *> &target_blobs,
|
||||
const std::vector<C_OUTLINE *> &outlines,
|
||||
bool *make_next_word_fuzzy) {
|
||||
bool outline_added_to_start = false;
|
||||
if (make_next_word_fuzzy != nullptr)
|
||||
|
@ -21,7 +21,6 @@
|
||||
|
||||
#include "bits16.h"
|
||||
#include "elst2.h"
|
||||
#include "genericvector.h" // GenericVector
|
||||
#include "params.h"
|
||||
#include "stepblob.h"
|
||||
|
||||
@ -173,18 +172,18 @@ public:
|
||||
|
||||
// Extracts all the noise outlines and stuffs the pointers into the given
|
||||
// vector of outlines. Afterwards, the outlines vector owns the pointers.
|
||||
void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
|
||||
void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
|
||||
// Adds the selected outlines to the indicated real blobs, and puts the rest
|
||||
// back in rej_cblobs where they came from. Where the target_blobs entry is
|
||||
// nullptr, a run of wanted outlines is put into a single new blob.
|
||||
// Ownership of the outlines is transferred back to the word. (Hence
|
||||
// GenericVector and not PointerVector.)
|
||||
// vector and not PointerVector.)
|
||||
// Returns true if any new blob was added to the start of the word, which
|
||||
// suggests that it might need joining to the word before it, and likewise
|
||||
// sets make_next_word_fuzzy true if any new blob was added to the end.
|
||||
bool AddSelectedOutlines(const GenericVector<bool> &wanted,
|
||||
const GenericVector<C_BLOB *> &target_blobs,
|
||||
const GenericVector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
|
||||
bool AddSelectedOutlines(const std::vector<bool> &wanted,
|
||||
const std::vector<C_BLOB *> &target_blobs,
|
||||
const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
|
||||
|
||||
private:
|
||||
uint8_t blanks = 0; // no of blanks
|
||||
|
@ -225,16 +225,6 @@ public:
|
||||
qsort(data_, size_used_, sizeof(*data_), comparator);
|
||||
}
|
||||
|
||||
// Searches the array (assuming sorted in ascending order, using sort()) for
|
||||
// an element equal to target and returns true if it is present.
|
||||
// Use binary_search to get the index of target, or its nearest candidate.
|
||||
bool bool_binary_search(const T &target) const {
|
||||
int index = binary_search(target);
|
||||
if (index >= size_used_) {
|
||||
return false;
|
||||
}
|
||||
return data_[index] == target;
|
||||
}
|
||||
// Searches the array (assuming sorted in ascending order, using sort()) for
|
||||
// an element equal to target and returns the index of the best candidate.
|
||||
// The return value is conceptually the largest index i such that
|
||||
|
@ -92,15 +92,15 @@ public:
|
||||
return ComputeForegroundDensity(tbox);
|
||||
}
|
||||
|
||||
int RunCountAlignment(const GenericVector<int> &sorted_vec, const int val) {
|
||||
int RunCountAlignment(const std::vector<int> &sorted_vec, const int val) {
|
||||
return CountAlignment(sorted_vec, val);
|
||||
}
|
||||
|
||||
void RunSplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) {
|
||||
void RunSplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
|
||||
SplitCPHorLite(part, splitted_boxes);
|
||||
}
|
||||
|
||||
void RunSplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) {
|
||||
void RunSplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
|
||||
SplitCPHor(part, parts_splitted);
|
||||
}
|
||||
|
||||
@ -377,7 +377,7 @@ TEST_F(EquationFinderTest, ComputeForegroundDensity) {
|
||||
}
|
||||
|
||||
TEST_F(EquationFinderTest, CountAlignment) {
|
||||
GenericVector<int> vec;
|
||||
std::vector<int> vec;
|
||||
vec.push_back(1);
|
||||
vec.push_back(1);
|
||||
vec.push_back(1);
|
||||
@ -452,7 +452,7 @@ TEST_F(EquationFinderTest, SplitCPHorLite) {
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part->DeleteBoxes();
|
||||
part->set_median_width(10);
|
||||
GenericVector<TBOX> splitted_boxes;
|
||||
std::vector<TBOX> splitted_boxes;
|
||||
|
||||
// Test an empty part.
|
||||
equation_det_->RunSplitCPHorLite(part, &splitted_boxes);
|
||||
@ -486,7 +486,7 @@ TEST_F(EquationFinderTest, SplitCPHor) {
|
||||
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
|
||||
part->DeleteBoxes();
|
||||
part->set_median_width(10);
|
||||
GenericVector<ColPartition *> parts_splitted;
|
||||
std::vector<ColPartition *> parts_splitted;
|
||||
|
||||
// Test an empty part.
|
||||
equation_det_->RunSplitCPHor(part, &parts_splitted);
|
||||
@ -512,7 +512,9 @@ TEST_F(EquationFinderTest, SplitCPHor) {
|
||||
EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box());
|
||||
EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box());
|
||||
|
||||
parts_splitted.delete_data_pointers();
|
||||
for (auto part_splitted : parts_splitted) {
|
||||
delete part_splitted;
|
||||
}
|
||||
part->DeleteBoxes();
|
||||
delete (part);
|
||||
}
|
||||
|
@ -107,7 +107,7 @@ void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo
|
||||
// Given n rows of reference ground truth, evaluate whether the n rows
|
||||
// of PARA * pointers yield the same paragraph breakpoints.
|
||||
void EvaluateParagraphDetection(const TextAndModel *correct, int n,
|
||||
const GenericVector<PARA *> &detector_output) {
|
||||
const std::vector<PARA *> &detector_output) {
|
||||
int incorrect_breaks = 0;
|
||||
int missed_breaks = 0;
|
||||
int poorly_matched_models = 0;
|
||||
@ -186,7 +186,7 @@ void EvaluateParagraphDetection(const TextAndModel *correct, int n,
|
||||
|
||||
void TestParagraphDetection(const TextAndModel *correct, int num_rows) {
|
||||
std::vector<RowInfo> row_infos;
|
||||
GenericVector<PARA *> row_owners;
|
||||
std::vector<PARA *> row_owners;
|
||||
PARA_LIST paragraphs;
|
||||
std::vector<ParagraphModel *> models;
|
||||
|
||||
@ -312,7 +312,7 @@ TEST(ParagraphsTest, TestSingleFullPageContinuation) {
|
||||
const TextAndModel *correct = kSingleFullPageContinuation;
|
||||
int num_rows = countof(kSingleFullPageContinuation);
|
||||
std::vector<RowInfo> row_infos;
|
||||
GenericVector<PARA *> row_owners;
|
||||
std::vector<PARA *> row_owners;
|
||||
PARA_LIST paragraphs;
|
||||
std::vector<ParagraphModel *> models;
|
||||
models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));
|
||||
|
Loading…
Reference in New Issue
Block a user