Replace remaining GenericVector by std::vector for src/ccmain

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2021-03-15 17:53:11 +01:00
parent bf42f8313d
commit 1f94d79c81
13 changed files with 239 additions and 237 deletions

View File

@ -24,7 +24,6 @@
# include "boxread.h" # include "boxread.h"
#endif // ndef DISABLED_LEGACY_ENGINE #endif // ndef DISABLED_LEGACY_ENGINE
#include <tesseract/unichar.h> #include <tesseract/unichar.h>
#include "genericvector.h"
#include "pageres.h" #include "pageres.h"
#include "tesseractclass.h" #include "tesseractclass.h"
#include "unicharset.h" #include "unicharset.h"
@ -489,7 +488,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
if (word->text() == nullptr || word->text()[0] == '\0') if (word->text() == nullptr || word->text()[0] == '\0')
continue; // Ignore words that have no text. continue; // Ignore words that have no text.
// Convert the correct text to a vector of UNICHAR_ID // Convert the correct text to a vector of UNICHAR_ID
GenericVector<UNICHAR_ID> target_text; std::vector<UNICHAR_ID> target_text;
if (!ConvertStringToUnichars(word->text(), &target_text)) { if (!ConvertStringToUnichars(word->text(), &target_text)) {
tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text()); tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
pr_it.DeleteCurrentWord(); pr_it.DeleteCurrentWord();
@ -505,7 +504,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
/// @return false if an invalid UNICHAR_ID is encountered. /// @return false if an invalid UNICHAR_ID is encountered.
bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids) { bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
for (int step = 0; *utf8 != '\0'; utf8 += step) { for (int step = 0; *utf8 != '\0'; utf8 += step) {
const char *next_space = strchr(utf8, ' '); const char *next_space = strchr(utf8, ' ');
if (next_space == nullptr) if (next_space == nullptr)
@ -528,10 +527,10 @@ bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_
/// applies a full search on the classifier results to find the best classified /// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used. /// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res) { bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
// Classify all required combinations of blobs and save results in choices. // Classify all required combinations of blobs and save results in choices.
const int word_length = word_res->box_word->length(); const int word_length = word_res->box_word->length();
auto *choices = new GenericVector<BLOB_CHOICE_LIST *>[word_length]; auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
for (int i = 0; i < word_length; ++i) { for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST *match_result = BLOB_CHOICE_LIST *match_result =
@ -552,8 +551,11 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
float best_rating = 0.0f; float best_rating = 0.0f;
SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating, SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
&word_res->best_state); &word_res->best_state);
for (int i = 0; i < word_length; ++i) for (int i = 0; i < word_length; ++i) {
choices[i].delete_data_pointers(); for (auto choice : choices[i]) {
delete choice;
}
}
delete[] choices; delete[] choices;
if (word_res->best_state.empty()) { if (word_res->best_state.empty()) {
// Build the original segmentation and if it is the same length as the // Build the original segmentation and if it is the same length as the
@ -583,9 +585,9 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
/// Recursive helper to find a match to the target_text (from text_index /// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position). /// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length, /// @param choices is an array of vectors of length choices_length,
/// with each element representing a starting position in the word, and the /// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive /// #vector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc. /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos /// @param choices_pos
/// @param choices_length /// @param choices_length
@ -595,8 +597,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
/// @param segmentation /// @param segmentation
/// @param best_rating /// @param best_rating
/// @param best_segmentation /// @param best_segmentation
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos, void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const GenericVector<UNICHAR_ID> &target_text, int choices_length, const std::vector<UNICHAR_ID> &target_text,
int text_index, float rating, std::vector<int> *segmentation, int text_index, float rating, std::vector<int> *segmentation,
float *best_rating, std::vector<int> *best_segmentation) { float *best_rating, std::vector<int> *best_segmentation) {
const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs(); const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();

View File

@ -461,8 +461,8 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
continue; continue;
} }
// Two words sharing the same language model, excellent! // Two words sharing the same language model, excellent!
GenericVector<WERD_CHOICE *> overrides_word1; std::vector<WERD_CHOICE *> overrides_word1;
GenericVector<WERD_CHOICE *> overrides_word2; std::vector<WERD_CHOICE *> overrides_word2;
const auto orig_w1_str = w_prev->best_choice->unichar_string(); const auto orig_w1_str = w_prev->best_choice->unichar_string();
const auto orig_w2_str = w->best_choice->unichar_string(); const auto orig_w2_str = w->best_choice->unichar_string();
@ -768,7 +768,7 @@ static int SelectBestWords(double rating_ratio, double certainty_margin, bool de
PointerVector<WERD_RES> *best_words) { PointerVector<WERD_RES> *best_words) {
// Process the smallest groups of words that have an overlapping word // Process the smallest groups of words that have an overlapping word
// boundary at the end. // boundary at the end.
GenericVector<WERD_RES *> out_words; std::vector<WERD_RES *> out_words;
// Index into each word vector (best, new). // Index into each word vector (best, new).
int b = 0, n = 0; int b = 0, n = 0;
int num_best = 0, num_new = 0; int num_best = 0, num_new = 0;
@ -893,19 +893,19 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
return false; return false;
real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
// Get the noise outlines into a vector with matching bool map. // Get the noise outlines into a vector with matching bool map.
GenericVector<C_OUTLINE *> outlines; std::vector<C_OUTLINE *> outlines;
real_word->GetNoiseOutlines(&outlines); real_word->GetNoiseOutlines(&outlines);
GenericVector<bool> word_wanted; std::vector<bool> word_wanted;
GenericVector<bool> overlapped_any_blob; std::vector<bool> overlapped_any_blob;
GenericVector<C_BLOB *> target_blobs; std::vector<C_BLOB *> target_blobs;
AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted, AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
&overlapped_any_blob, &target_blobs); &overlapped_any_blob, &target_blobs);
// Filter the outlines that overlapped any blob and put them into the word // Filter the outlines that overlapped any blob and put them into the word
// now. This simplifies the remaining task and also makes it more accurate // now. This simplifies the remaining task and also makes it more accurate
// as it has more completed blobs to work on. // as it has more completed blobs to work on.
GenericVector<bool> wanted; std::vector<bool> wanted;
GenericVector<C_BLOB *> wanted_blobs; std::vector<C_BLOB *> wanted_blobs;
GenericVector<C_OUTLINE *> wanted_outlines; std::vector<C_OUTLINE *> wanted_outlines;
int num_overlapped = 0; int num_overlapped = 0;
int num_overlapped_used = 0; int num_overlapped_used = 0;
for (int i = 0; i < overlapped_any_blob.size(); ++i) { for (int i = 0; i < overlapped_any_blob.size(); ++i) {
@ -948,11 +948,11 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
// Output: word_wanted indicates which outlines are to be assigned to a blob, // Output: word_wanted indicates which outlines are to be assigned to a blob,
// target_blobs indicates which to assign to, and overlapped_any_blob is // target_blobs indicates which to assign to, and overlapped_any_blob is
// true for all outlines that overlapped a blob. // true for all outlines that overlapped a blob.
void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines, void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
int pass, WERD *real_word, PAGE_RES_IT *pr_it, int pass, WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted, std::vector<bool> *word_wanted,
GenericVector<bool> *overlapped_any_blob, std::vector<bool> *overlapped_any_blob,
GenericVector<C_BLOB *> *target_blobs) { std::vector<C_BLOB *> *target_blobs) {
std::vector<bool> blob_wanted; std::vector<bool> blob_wanted;
word_wanted->resize(outlines.size(), false); word_wanted->resize(outlines.size(), false);
overlapped_any_blob->resize(outlines.size(), false); overlapped_any_blob->resize(outlines.size(), false);
@ -999,10 +999,10 @@ void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE
// Attempts to assign non-overlapping outlines to their nearest blobs or // Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them. // make new blobs out of them.
void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass, void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it, WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted, std::vector<bool> *word_wanted,
GenericVector<C_BLOB *> *target_blobs) { std::vector<C_BLOB *> *target_blobs) {
std::vector<bool> blob_wanted; std::vector<bool> blob_wanted;
word_wanted->resize(outlines.size(), false); word_wanted->resize(outlines.size(), false);
target_blobs->resize(outlines.size(), nullptr); target_blobs->resize(outlines.size(), nullptr);
@ -1077,7 +1077,7 @@ void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &out
// are desired, in which case ok_outlines indicates which ones. // are desired, in which case ok_outlines indicates which ones.
bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
C_BLOB *blob, C_BLOB *blob,
const GenericVector<C_OUTLINE *> &outlines, const std::vector<C_OUTLINE *> &outlines,
int num_outlines, std::vector<bool> *ok_outlines) { int num_outlines, std::vector<bool> *ok_outlines) {
std::string best_str; std::string best_str;
float target_cert = certainty_threshold; float target_cert = certainty_threshold;
@ -1161,7 +1161,7 @@ bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice. // the inclusion of the outlines, and returns the certainty of the raw choice.
float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines, float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
const GenericVector<C_OUTLINE *> &outlines, int pass_n, const std::vector<C_OUTLINE *> &outlines, int pass_n,
PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) { PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
C_OUTLINE_IT ol_it; C_OUTLINE_IT ol_it;
C_OUTLINE *first_to_keep = nullptr; C_OUTLINE *first_to_keep = nullptr;
@ -1865,8 +1865,7 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
const int fontinfo_size = get_fontinfo_table().size(); const int fontinfo_size = get_fontinfo_table().size();
if (fontinfo_size == 0) if (fontinfo_size == 0)
return; return;
GenericVector<int> font_total_score; std::vector<int> font_total_score(fontinfo_size);
font_total_score.init_to_size(fontinfo_size, 0);
// Compute the font scores for the word // Compute the font scores for the word
if (tessedit_debug_fonts) { if (tessedit_debug_fonts) {

View File

@ -131,7 +131,7 @@ int EquationDetect::LabelSpecialText(TO_BLOCK *to_block) {
return -1; return -1;
} }
GenericVector<BLOBNBOX_LIST *> blob_lists; std::vector<BLOBNBOX_LIST *> blob_lists;
blob_lists.push_back(&(to_block->blobs)); blob_lists.push_back(&(to_block->blobs));
blob_lists.push_back(&(to_block->large_blobs)); blob_lists.push_back(&(to_block->large_blobs));
for (int i = 0; i < blob_lists.size(); ++i) { for (int i = 0; i < blob_lists.size(); ++i) {
@ -223,16 +223,17 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &uni
if (unicharset.get_ispunctuation(id)) { if (unicharset.get_ispunctuation(id)) {
// Exclude some special texts that are likely to be confused as math symbol. // Exclude some special texts that are likely to be confused as math symbol.
static GenericVector<UNICHAR_ID> ids_to_exclude; static std::vector<UNICHAR_ID> ids_to_exclude;
if (ids_to_exclude.empty()) { if (ids_to_exclude.empty()) {
static const char *kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".", static const char *kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
"", "", "", "", "", ""}; "", "", "", "", "", ""};
for (auto i = 0; i < countof(kCharsToEx); i++) { for (auto i = 0; i < countof(kCharsToEx); i++) {
ids_to_exclude.push_back(unicharset.unichar_to_id(kCharsToEx[i])); ids_to_exclude.push_back(unicharset.unichar_to_id(kCharsToEx[i]));
} }
ids_to_exclude.sort(); std::sort(ids_to_exclude.begin(), ids_to_exclude.end());
} }
return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH; auto found = std::binary_search(ids_to_exclude.begin(), ids_to_exclude.end(), id);
return found ? BSTT_NONE : BSTT_MATH;
} }
// Check if it is digit. In addition to the isdigit attribute, we also check // Check if it is digit. In addition to the isdigit attribute, we also check
@ -266,13 +267,13 @@ void EquationDetect::IdentifySpecialText() {
IdentifyBlobsToSkip(part); IdentifyBlobsToSkip(part);
BLOBNBOX_C_IT bbox_it(part->boxes()); BLOBNBOX_C_IT bbox_it(part->boxes());
// Compute the height threshold. // Compute the height threshold.
GenericVector<int> blob_heights; std::vector<int> blob_heights;
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) { for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
if (bbox_it.data()->special_text_type() != BSTT_SKIP) { if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
blob_heights.push_back(bbox_it.data()->bounding_box().height()); blob_heights.push_back(bbox_it.data()->bounding_box().height());
} }
} }
blob_heights.sort(); std::sort(blob_heights.begin(), blob_heights.end());
const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2; const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) { for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
if (bbox_it.data()->special_text_type() != BSTT_SKIP) { if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
@ -377,7 +378,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
// Pass 3: expand block equation seeds. // Pass 3: expand block equation seeds.
while (!cp_seeds_.empty()) { while (!cp_seeds_.empty()) {
GenericVector<ColPartition *> seeds_expanded; std::vector<ColPartition *> seeds_expanded;
for (int i = 0; i < cp_seeds_.size(); ++i) { for (int i = 0; i < cp_seeds_.size(); ++i) {
if (ExpandSeed(cp_seeds_[i])) { if (ExpandSeed(cp_seeds_[i])) {
// If this seed is expanded, then we add it into seeds_expanded. Note // If this seed is expanded, then we add it into seeds_expanded. Note
@ -407,14 +408,14 @@ void EquationDetect::MergePartsByLocation() {
while (true) { while (true) {
ColPartition *part = nullptr; ColPartition *part = nullptr;
// partitions that have been updated. // partitions that have been updated.
GenericVector<ColPartition *> parts_updated; std::vector<ColPartition *> parts_updated;
ColPartitionGridSearch gsearch(part_grid_); ColPartitionGridSearch gsearch(part_grid_);
gsearch.StartFullSearch(); gsearch.StartFullSearch();
while ((part = gsearch.NextFullSearch()) != nullptr) { while ((part = gsearch.NextFullSearch()) != nullptr) {
if (!IsTextOrEquationType(part->type())) { if (!IsTextOrEquationType(part->type())) {
continue; continue;
} }
GenericVector<ColPartition *> parts_to_merge; std::vector<ColPartition *> parts_to_merge;
SearchByOverlap(part, &parts_to_merge); SearchByOverlap(part, &parts_to_merge);
if (parts_to_merge.empty()) { if (parts_to_merge.empty()) {
continue; continue;
@ -443,7 +444,7 @@ void EquationDetect::MergePartsByLocation() {
} }
void EquationDetect::SearchByOverlap(ColPartition *seed, void EquationDetect::SearchByOverlap(ColPartition *seed,
GenericVector<ColPartition *> *parts_overlap) { std::vector<ColPartition *> *parts_overlap) {
ASSERT_HOST(seed != nullptr && parts_overlap != nullptr); ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);
if (!IsTextOrEquationType(seed->type())) { if (!IsTextOrEquationType(seed->type())) {
return; return;
@ -457,7 +458,7 @@ void EquationDetect::SearchByOverlap(ColPartition *seed,
// Search iteratively. // Search iteratively.
ColPartition *part; ColPartition *part;
GenericVector<ColPartition *> parts; std::vector<ColPartition *> parts;
const float kLargeOverlapTh = 0.95; const float kLargeOverlapTh = 0.95;
const float kEquXOverlap = 0.4, kEquYOverlap = 0.5; const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
while ((part = search.NextRadSearch()) != nullptr) { while ((part = search.NextRadSearch()) != nullptr) {
@ -518,11 +519,11 @@ void EquationDetect::IdentifySeedParts() {
ColPartition *part = nullptr; ColPartition *part = nullptr;
gsearch.StartFullSearch(); gsearch.StartFullSearch();
GenericVector<ColPartition *> seeds1, seeds2; std::vector<ColPartition *> seeds1, seeds2;
// The left coordinates of indented text partitions. // The left coordinates of indented text partitions.
GenericVector<int> indented_texts_left; std::vector<int> indented_texts_left;
// The foreground density of text partitions. // The foreground density of text partitions.
GenericVector<float> texts_foreground_density; std::vector<float> texts_foreground_density;
while ((part = gsearch.NextFullSearch()) != nullptr) { while ((part = gsearch.NextFullSearch()) != nullptr) {
if (!IsTextOrEquationType(part->type())) { if (!IsTextOrEquationType(part->type())) {
continue; continue;
@ -552,8 +553,8 @@ void EquationDetect::IdentifySeedParts() {
} }
// Sort the features collected from text regions. // Sort the features collected from text regions.
indented_texts_left.sort(); std::sort(indented_texts_left.begin(), indented_texts_left.end());
texts_foreground_density.sort(); std::sort(texts_foreground_density.begin(), texts_foreground_density.end());
float foreground_density_th = 0.15; // Default value. float foreground_density_th = 0.15; // Default value.
if (!texts_foreground_density.empty()) { if (!texts_foreground_density.empty()) {
// Use the median of the texts_foreground_density. // Use the median of the texts_foreground_density.
@ -598,7 +599,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
ASSERT_HOST(part); ASSERT_HOST(part);
// Split part horizontally, and check for each sub part. // Split part horizontally, and check for each sub part.
GenericVector<TBOX> sub_boxes; std::vector<TBOX> sub_boxes;
SplitCPHorLite(part, &sub_boxes); SplitCPHorLite(part, &sub_boxes);
float parts_passed = 0.0; float parts_passed = 0.0;
for (int i = 0; i < sub_boxes.size(); ++i) { for (int i = 0; i < sub_boxes.size(); ++i) {
@ -615,7 +616,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
return retval; return retval;
} }
void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) { void EquationDetect::SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
ASSERT_HOST(part && parts_splitted); ASSERT_HOST(part && parts_splitted);
if (part->median_width() == 0 || part->boxes_count() == 0) { if (part->median_width() == 0 || part->boxes_count() == 0) {
return; return;
@ -623,7 +624,9 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *
// Make a copy of part, and reset parts_splitted. // Make a copy of part, and reset parts_splitted.
ColPartition *right_part = part->CopyButDontOwnBlobs(); ColPartition *right_part = part->CopyButDontOwnBlobs();
parts_splitted->delete_data_pointers(); for (auto part : *parts_splitted) {
delete part;
}
parts_splitted->clear(); parts_splitted->clear();
const double kThreshold = part->median_width() * 3.0; const double kThreshold = part->median_width() * 3.0;
@ -663,7 +666,7 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *
parts_splitted->push_back(right_part); parts_splitted->push_back(right_part);
} }
void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) { void EquationDetect::SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
ASSERT_HOST(part && splitted_boxes); ASSERT_HOST(part && splitted_boxes);
splitted_boxes->clear(); splitted_boxes->clear();
if (part->median_width() == 0) { if (part->median_width() == 0) {
@ -701,7 +704,7 @@ void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *spl
} }
} }
bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left, bool EquationDetect::CheckForSeed2(const std::vector<int> &indented_texts_left,
const float foreground_density_th, ColPartition *part) { const float foreground_density_th, ColPartition *part) {
ASSERT_HOST(part); ASSERT_HOST(part);
const TBOX &box = part->bounding_box(); const TBOX &box = part->bounding_box();
@ -720,22 +723,25 @@ bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left
return true; return true;
} }
int EquationDetect::CountAlignment(const GenericVector<int> &sorted_vec, const int val) const { int EquationDetect::CountAlignment(const std::vector<int> &sorted_vec, const int val) const {
if (sorted_vec.empty()) { if (sorted_vec.empty()) {
return 0; return 0;
} }
const int kDistTh = static_cast<int>(roundf(0.03 * resolution_)); const int kDistTh = static_cast<int>(round(0.03f * resolution_));
const int pos = sorted_vec.binary_search(val); auto pos = std::upper_bound(sorted_vec.begin(), sorted_vec.end(), val);
if (pos > sorted_vec.begin()) {
--pos;
}
int count = 0; int count = 0;
// Search left side. // Search left side.
int index = pos; auto index = pos - sorted_vec.begin();
while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) { while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
count++; count++;
} }
// Search right side. // Search right side.
index = pos + 1; index = pos + 1 - sorted_vec.begin();
while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) { while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
count++; count++;
} }
@ -764,9 +770,9 @@ void EquationDetect::ComputeCPsSuperBBox() {
void EquationDetect::IdentifyInlinePartsHorizontal() { void EquationDetect::IdentifyInlinePartsHorizontal() {
ASSERT_HOST(cps_super_bbox_); ASSERT_HOST(cps_super_bbox_);
GenericVector<ColPartition *> new_seeds; std::vector<ColPartition *> new_seeds;
const int kMarginDiffTh = IntCastRounded(0.5 * lang_tesseract_->source_resolution()); const int kMarginDiffTh = IntCastRounded(0.5 * lang_tesseract_->source_resolution());
const int kGapTh = static_cast<int>(roundf(1.0 * lang_tesseract_->source_resolution())); const int kGapTh = static_cast<int>(round(1.0f * lang_tesseract_->source_resolution()));
ColPartitionGridSearch search(part_grid_); ColPartitionGridSearch search(part_grid_);
search.SetUniqueMode(true); search.SetUniqueMode(true);
// The center x coordinate of the cp_super_bbox_. // The center x coordinate of the cp_super_bbox_.
@ -826,7 +832,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
// Get the y gap between text partitions; // Get the y gap between text partitions;
ColPartition *current = nullptr, *prev = nullptr; ColPartition *current = nullptr, *prev = nullptr;
gsearch.StartFullSearch(); gsearch.StartFullSearch();
GenericVector<int> ygaps; std::vector<int> ygaps;
while ((current = gsearch.NextFullSearch()) != nullptr) { while ((current = gsearch.NextFullSearch()) != nullptr) {
if (!PTIsTextType(current->type())) { if (!PTIsTextType(current->type())) {
continue; continue;
@ -851,7 +857,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
} }
// Compute the line spacing from ygaps: use the mean of the first half. // Compute the line spacing from ygaps: use the mean of the first half.
ygaps.sort(); std::sort(ygaps.begin(), ygaps.end());
int spacing = 0, count; int spacing = 0, count;
for (count = 0; count < ygaps.size() / 2; count++) { for (count = 0; count < ygaps.size() / 2; count++) {
spacing += ygaps[count]; spacing += ygaps[count];
@ -867,12 +873,12 @@ void EquationDetect::IdentifyInlinePartsVertical(const bool top_to_bottom,
// Sort cp_seeds_. // Sort cp_seeds_.
if (top_to_bottom) { // From top to bottom. if (top_to_bottom) { // From top to bottom.
cp_seeds_.sort(&SortCPByTopReverse); std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByTopReverse);
} else { // From bottom to top. } else { // From bottom to top.
cp_seeds_.sort(&SortCPByBottom); std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByBottom);
} }
GenericVector<ColPartition *> new_seeds; std::vector<ColPartition *> new_seeds;
for (int i = 0; i < cp_seeds_.size(); ++i) { for (int i = 0; i < cp_seeds_.size(); ++i) {
ColPartition *part = cp_seeds_[i]; ColPartition *part = cp_seeds_[i];
// If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
@ -918,8 +924,8 @@ bool EquationDetect::IsInline(const bool search_bottom, const int textparts_line
// Check if neighbor and part is inline similar. // Check if neighbor and part is inline similar.
const float kHeightRatioTh = 0.5; const float kHeightRatioTh = 0.5;
const int kYGapTh = textparts_linespacing > 0 const int kYGapTh = textparts_linespacing > 0
? textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)) ? textparts_linespacing + static_cast<int>(round(0.02f * resolution_))
: static_cast<int>(roundf(0.05 * resolution_)); // Default value. : static_cast<int>(round(0.05f * resolution_)); // Default value.
if (part_box.x_overlap(neighbor_box) && // Location feature. if (part_box.x_overlap(neighbor_box) && // Location feature.
part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing. part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
// Geo feature. // Geo feature.
@ -973,9 +979,9 @@ EquationDetect::IndentType EquationDetect::IsIndented(ColPartition *part) {
ColPartitionGridSearch search(part_grid_); ColPartitionGridSearch search(part_grid_);
ColPartition *neighbor = nullptr; ColPartition *neighbor = nullptr;
const TBOX &part_box(part->bounding_box()); const TBOX &part_box(part->bounding_box());
const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_)); const int kXGapTh = static_cast<int>(round(0.5f * resolution_));
const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_)); const int kRadiusTh = static_cast<int>(round(3.0f * resolution_));
const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_)); const int kYGapTh = static_cast<int>(round(0.5f * resolution_));
// Here we use a simple approximation algorithm: from the center of part, We // Here we use a simple approximation algorithm: from the center of part, We
// perform the radius search, and check if we can find a neighboring partition // perform the radius search, and check if we can find a neighboring partition
@ -1036,7 +1042,7 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
} }
// Expand in four directions. // Expand in four directions.
GenericVector<ColPartition *> parts_to_merge; std::vector<ColPartition *> parts_to_merge;
ExpandSeedHorizontal(true, seed, &parts_to_merge); ExpandSeedHorizontal(true, seed, &parts_to_merge);
ExpandSeedHorizontal(false, seed, &parts_to_merge); ExpandSeedHorizontal(false, seed, &parts_to_merge);
ExpandSeedVertical(true, seed, &parts_to_merge); ExpandSeedVertical(true, seed, &parts_to_merge);
@ -1073,10 +1079,10 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
} }
void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *seed, void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge) { std::vector<ColPartition *> *parts_to_merge) {
ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr); ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);
const float kYOverlapTh = 0.6; const float kYOverlapTh = 0.6;
const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_)); const int kXGapTh = static_cast<int>(round(0.2f * resolution_));
ColPartitionGridSearch search(part_grid_); ColPartitionGridSearch search(part_grid_);
const TBOX &seed_box(seed->bounding_box()); const TBOX &seed_box(seed->bounding_box());
@ -1125,10 +1131,10 @@ void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *
} }
void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *seed, void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge) { std::vector<ColPartition *> *parts_to_merge) {
ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr && cps_super_bbox_ != nullptr); ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr && cps_super_bbox_ != nullptr);
const float kXOverlapTh = 0.4; const float kXOverlapTh = 0.4;
const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_)); const int kYGapTh = static_cast<int>(round(0.2f * resolution_));
ColPartitionGridSearch search(part_grid_); ColPartitionGridSearch search(part_grid_);
const TBOX &seed_box(seed->bounding_box()); const TBOX &seed_box(seed->bounding_box());
@ -1138,7 +1144,7 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *
// Search iteratively. // Search iteratively.
ColPartition *part = nullptr; ColPartition *part = nullptr;
GenericVector<ColPartition *> parts; std::vector<ColPartition *> parts;
int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1; int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;
while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) { while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {
if (part == seed) { if (part == seed) {
@ -1206,8 +1212,8 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *
} }
bool EquationDetect::IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const { bool EquationDetect::IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const {
const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_)); const int kXGapTh = static_cast<int>(round(0.25f * resolution_));
const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_)); const int kYGapTh = static_cast<int>(round(0.05f * resolution_));
// Check geometric feature. // Check geometric feature.
if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) { if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) {
@ -1244,7 +1250,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
// Iterate over part_grid_, and find all parts that are text type but not // Iterate over part_grid_, and find all parts that are text type but not
// equation type. // equation type.
ColPartition *part = nullptr; ColPartition *part = nullptr;
GenericVector<ColPartition *> text_parts; std::vector<ColPartition *> text_parts;
ColPartitionGridSearch gsearch(part_grid_); ColPartitionGridSearch gsearch(part_grid_);
gsearch.StartFullSearch(); gsearch.StartFullSearch();
while ((part = gsearch.NextFullSearch()) != nullptr) { while ((part = gsearch.NextFullSearch()) != nullptr) {
@ -1257,12 +1263,12 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
} }
// Compute the median height of the text_parts. // Compute the median height of the text_parts.
text_parts.sort(&SortCPByHeight); std::sort(text_parts.begin(), text_parts.end(), &SortCPByHeight);
const TBOX &text_box = text_parts[text_parts.size() / 2]->bounding_box(); const TBOX &text_box = text_parts[text_parts.size() / 2]->bounding_box();
int med_height = text_box.height(); int med_height = text_box.height();
if (text_parts.size() % 2 == 0 && text_parts.size() > 1) { if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
const TBOX &text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box(); const TBOX &text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box();
med_height = static_cast<int>(roundf(0.5 * (text_box.height() + med_height))); med_height = static_cast<int>(round(0.5f * (text_box.height() + med_height)));
} }
// Iterate every text_parts and check if it is a math block satellite. // Iterate every text_parts and check if it is a math block satellite.
@ -1271,7 +1277,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
if (text_box.height() > med_height) { if (text_box.height() > med_height) {
continue; continue;
} }
GenericVector<ColPartition *> math_blocks; std::vector<ColPartition *> math_blocks;
if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) { if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
continue; continue;
} }
@ -1288,7 +1294,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
} }
bool EquationDetect::IsMathBlockSatellite(ColPartition *part, bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
GenericVector<ColPartition *> *math_blocks) { std::vector<ColPartition *> *math_blocks) {
ASSERT_HOST(part != nullptr && math_blocks != nullptr); ASSERT_HOST(part != nullptr && math_blocks != nullptr);
math_blocks->clear(); math_blocks->clear();
const TBOX &part_box(part->bounding_box()); const TBOX &part_box(part->bounding_box());
@ -1344,7 +1350,7 @@ bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
ColPartition *EquationDetect::SearchNNVertical(const bool search_bottom, const ColPartition *part) { ColPartition *EquationDetect::SearchNNVertical(const bool search_bottom, const ColPartition *part) {
ASSERT_HOST(part); ASSERT_HOST(part);
ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr; ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;
const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5)); const int kYGapTh = static_cast<int>(round(resolution_ * 0.5f));
ColPartitionGridSearch search(part_grid_); ColPartitionGridSearch search(part_grid_);
search.SetUniqueMode(true); search.SetUniqueMode(true);
@ -1379,7 +1385,7 @@ bool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *nei
if (!neighbor) { if (!neighbor) {
return false; return false;
} }
const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1)); const int kYGapTh = static_cast<int>(round(resolution_ * 0.1f));
return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh; return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
} }

View File

@ -22,7 +22,6 @@
#include <tesseract/unichar.h> // for UNICHAR_ID #include <tesseract/unichar.h> // for UNICHAR_ID
#include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText... #include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
#include "equationdetectbase.h" // for EquationDetectBase #include "equationdetectbase.h" // for EquationDetectBase
#include "genericvector.h" // for GenericVector
#include "tesseractclass.h" // for Tesseract #include "tesseractclass.h" // for Tesseract
class TBOX; class TBOX;
@ -86,7 +85,7 @@ protected:
// parts_overlap. Note: this function may update the part_grid_, so if the // parts_overlap. Note: this function may update the part_grid_, so if the
// caller is also running ColPartitionGridSearch, use the RepositionIterator // caller is also running ColPartitionGridSearch, use the RepositionIterator
// to continue. // to continue.
void SearchByOverlap(ColPartition *seed, GenericVector<ColPartition *> *parts_overlap); void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);
// Insert part back into part_grid_, after it absorbs some other parts. // Insert part back into part_grid_, after it absorbs some other parts.
void InsertPartAfterAbsorb(ColPartition *part); void InsertPartAfterAbsorb(ColPartition *part);
@ -106,12 +105,12 @@ protected:
// 1. If its left is aligned with any coordinates in indented_texts_left, // 1. If its left is aligned with any coordinates in indented_texts_left,
// which we assume have been sorted. // which we assume have been sorted.
// 2. If its foreground density is over foreground_density_th. // 2. If its foreground density is over foreground_density_th.
bool CheckForSeed2(const GenericVector<int> &indented_texts_left, bool CheckForSeed2(const std::vector<int> &indented_texts_left,
const float foreground_density_th, ColPartition *part); const float foreground_density_th, ColPartition *part);
// Count the number of values in sorted_vec that are close to val, used to // Count the number of values in sorted_vec that are close to val, used to
// check if a partition is aligned with text partitions. // check if a partition is aligned with text partitions.
int CountAlignment(const GenericVector<int> &sorted_vec, const int val) const; int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;
// Check for a seed candidate using the foreground pixel density. And we // Check for a seed candidate using the foreground pixel density. And we
// return true if the density is below a certain threshold, because characters // return true if the density is below a certain threshold, because characters
@ -120,14 +119,14 @@ protected:
// A light version of SplitCPHor: instead of really doing the part split, we // A light version of SplitCPHor: instead of really doing the part split, we
// simply compute the union bounding box of each split part. // simply compute the union bounding box of each split part.
void SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes); void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);
// Split the part (horizontally), and save the split result into // Split the part (horizontally), and save the split result into
// parts_splitted. Note that it is caller's responsibility to release the // parts_splitted. Note that it is caller's responsibility to release the
// memory owned by parts_splitted. On the other hand, the part is unchanged // memory owned by parts_splitted. On the other hand, the part is unchanged
// during this process and still owns the blobs, so do NOT call DeleteBoxes // during this process and still owns the blobs, so do NOT call DeleteBoxes
// when freeing the colpartitions in parts_splitted. // when freeing the colpartitions in parts_splitted.
void SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted); void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);
// Check the density for a seed candidate (part) using its math density and // Check the density for a seed candidate (part) using its math density and
// italic density, returns true if the check passed. // italic density, returns true if the check passed.
@ -167,9 +166,9 @@ protected:
// merged with seed, remove them from part_grid_, and put them into // merged with seed, remove them from part_grid_, and put them into
// parts_to_merge. // parts_to_merge.
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge); std::vector<ColPartition *> *parts_to_merge);
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge); std::vector<ColPartition *> *parts_to_merge);
// Check if a part_box is the small neighbor of seed_box. // Check if a part_box is the small neighbor of seed_box.
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const; bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;
@ -190,7 +189,7 @@ protected:
// Check if part is the satellite of one/two math blocks. If it is, we return // Check if part is the satellite of one/two math blocks. If it is, we return
// true, and save the blocks into math_blocks. // true, and save the blocks into math_blocks.
bool IsMathBlockSatellite(ColPartition *part, GenericVector<ColPartition *> *math_blocks); bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);
// Search the nearest neighbor of part in one vertical direction as defined in // Search the nearest neighbor of part in one vertical direction as defined in
// search_bottom. It returns the neighbor found that major x overlap with it, // search_bottom. It returns the neighbor found that major x overlap with it,
@ -237,7 +236,7 @@ protected:
TBOX *cps_super_bbox_; TBOX *cps_super_bbox_;
// The seed ColPartition for equation region. // The seed ColPartition for equation region.
GenericVector<ColPartition *> cp_seeds_; std::vector<ColPartition *> cp_seeds_;
// The resolution (dpi) of the processing image. // The resolution (dpi) of the processing image.
int resolution_; int resolution_;

View File

@ -18,7 +18,6 @@
#include "paragraphs.h" #include "paragraphs.h"
#include "genericvector.h" // for GenericVector, GenericVectorEqEq
#include "helpers.h" // for UpdateRange, ClipToRange #include "helpers.h" // for UpdateRange, ClipToRange
#include "host.h" // for NearlyEqual #include "host.h" // for NearlyEqual
#include "mutableiterator.h" // for MutableIterator #include "mutableiterator.h" // for MutableIterator
@ -72,7 +71,7 @@ static int Epsilon(int space_pix) {
} }
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name, static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
const GenericVector<RowScratchRegisters> *rows, int row_start, const std::vector<RowScratchRegisters> *rows, int row_start,
int row_end) { int row_end) {
if (row_start < 0 || row_end > rows->size() || row_start > row_end) { if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", row_start, row_end, tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", row_start, row_end,
@ -134,7 +133,7 @@ static std::string RtlEmbed(const std::string &word, bool rtlify) {
// Print the current thoughts of the paragraph detector. // Print the current thoughts of the paragraph detector.
static void PrintDetectorState(const ParagraphTheory &theory, static void PrintDetectorState(const ParagraphTheory &theory,
const GenericVector<RowScratchRegisters> &rows) { const std::vector<RowScratchRegisters> &rows) {
std::vector<std::vector<std::string>> output; std::vector<std::vector<std::string>> output;
output.push_back(std::vector<std::string>()); output.push_back(std::vector<std::string>());
output.back().push_back("#row"); output.back().push_back("#row");
@ -173,7 +172,7 @@ static void PrintDetectorState(const ParagraphTheory &theory,
} }
static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory, static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
const GenericVector<RowScratchRegisters> &rows) { const std::vector<RowScratchRegisters> &rows) {
if (!should_print) if (!should_print)
return; return;
tprintf("# %s\n", phase); tprintf("# %s\n", phase);
@ -181,7 +180,7 @@ static void DebugDump(bool should_print, const char *phase, const ParagraphTheor
} }
// Print out the text for rows[row_start, row_end) // Print out the text for rows[row_start, row_end)
static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows, int row_start, static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
int row_end) { int row_end) {
tprintf("======================================\n"); tprintf("======================================\n");
for (int row = row_start; row < row_end; row++) { for (int row = row_start; row < row_end; row++) {
@ -398,6 +397,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
return pos == werd->length(); return pos == werd->length();
} }
// Appends data to vector unless an element comparing equal to it is already
// stored there, so repeated calls with the same value never add duplicates.
// The lookup is a linear scan using operator== on T.
template <class T>
void push_back_new(std::vector<T> &vector, const T &data) {
  const auto last = vector.end();
  if (std::find(vector.begin(), last, data) != last) {
    return; // Already present: leave the vector untouched.
  }
  vector.push_back(data);
}
// ========= Brain Dead Language Model (combined entry points) ================ // ========= Brain Dead Language Model (combined entry points) ================
// Given the leftmost word of a line either as a Tesseract unicharset + werd // Given the leftmost word of a line either as a Tesseract unicharset + werd
@ -581,7 +587,7 @@ void RowScratchRegisters::SetStartLine() {
tprintf("Trying to set a line to be START when it's already BODY.\n"); tprintf("Trying to set a line to be START when it's already BODY.\n");
} }
if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr)); push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
} }
} }
@ -591,42 +597,44 @@ void RowScratchRegisters::SetBodyLine() {
tprintf("Trying to set a line to be BODY when it's already START.\n"); tprintf("Trying to set a line to be BODY when it's already START.\n");
} }
if (current_lt == LT_UNKNOWN || current_lt == LT_START) { if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr)); push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
} }
} }
// Records an LT_START hypothesis carrying `model` (added only if an equal
// hypothesis is not already stored), then removes the model-less LT_START
// hypothesis (the one whose model is nullptr) if the row holds it.
void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
hypotheses_.push_back_new(LineHypothesis(LT_START, model)); push_back_new(hypotheses_, LineHypothesis(LT_START, model));
int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr)); auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
if (old_idx >= 0) if (found != hypotheses_.end()) {
hypotheses_.remove(old_idx); hypotheses_.erase(found);
}
} }
// Records an LT_BODY hypothesis carrying `model` (added only if an equal
// hypothesis is not already stored), then removes the model-less LT_BODY
// hypothesis (the one whose model is nullptr) if the row holds it.
void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr)); auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
if (old_idx >= 0) if (found != hypotheses_.end()) {
hypotheses_.remove(old_idx); hypotheses_.erase(found);
}
} }
// Adds to *models, without duplicates, the model of every LT_START hypothesis
// on this row whose model passes StrongModel(). Entries already in *models
// are kept.
void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
for (int h = 0; h < hypotheses_.size(); h++) { for (int h = 0; h < hypotheses_.size(); h++) {
if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
models->push_back_new(hypotheses_[h].model); push_back_new(*models, hypotheses_[h].model);
} }
} }
// Adds to *models, without duplicates, every hypothesis model on this row
// that passes StrongModel(), whatever the hypothesis' line type. Entries
// already in *models are kept.
void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
for (int h = 0; h < hypotheses_.size(); h++) { for (int h = 0; h < hypotheses_.size(); h++) {
if (StrongModel(hypotheses_[h].model)) if (StrongModel(hypotheses_[h].model))
models->push_back_new(hypotheses_[h].model); push_back_new(*models, hypotheses_[h].model);
} }
} }
// Adds to *models, without duplicates, every non-null hypothesis model on
// this row. Entries already in *models are kept.
void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
for (int h = 0; h < hypotheses_.size(); h++) { for (int h = 0; h < hypotheses_.size(); h++) {
if (hypotheses_[h].model != nullptr) if (hypotheses_[h].model != nullptr)
models->push_back_new(hypotheses_[h].model); push_back_new(*models, hypotheses_[h].model);
} }
} }
@ -647,8 +655,8 @@ void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models
if (models.empty()) if (models.empty())
return; return;
for (int h = hypotheses_.size() - 1; h >= 0; h--) { for (int h = hypotheses_.size() - 1; h >= 0; h--) {
if (!models.contains(hypotheses_[h].model)) { if (!contains(models, hypotheses_[h].model)) {
hypotheses_.remove(h); hypotheses_.erase(hypotheses_.begin() + h);
} }
} }
} }
@ -672,15 +680,15 @@ public:
int size() const { int size() const {
return values_.size(); return values_.size();
} }
void GetClusters(GenericVector<Cluster> *clusters); void GetClusters(std::vector<Cluster> *clusters);
private: private:
int max_cluster_width_; int max_cluster_width_;
GenericVector<int> values_; std::vector<int> values_;
}; };
// Return the index of the cluster closest to value. // Return the index of the cluster closest to value.
static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) { static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
int best_index = 0; int best_index = 0;
for (int i = 0; i < clusters.size(); i++) { for (int i = 0; i < clusters.size(); i++) {
if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center))
@ -689,9 +697,9 @@ static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
return best_index; return best_index;
} }
void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) { void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
clusters->clear(); clusters->clear();
values_.sort(); std::sort(values_.begin(), values_.end());
for (int i = 0; i < values_.size();) { for (int i = 0; i < values_.size();) {
int orig_i = i; int orig_i = i;
int lo = values_[i]; int lo = values_[i];
@ -705,16 +713,16 @@ void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
// Calculate left- and right-indent tab stop values seen in // Calculate left- and right-indent tab stop values seen in
// rows[row_start, row_end) given a tolerance of tolerance. // rows[row_start, row_end) given a tolerance of tolerance.
static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end, static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
int tolerance, GenericVector<Cluster> *left_tabs, int tolerance, std::vector<Cluster> *left_tabs,
GenericVector<Cluster> *right_tabs) { std::vector<Cluster> *right_tabs) {
if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
return; return;
// First pass: toss all left and right indents into clusterers. // First pass: toss all left and right indents into clusterers.
SimpleClusterer initial_lefts(tolerance); SimpleClusterer initial_lefts(tolerance);
SimpleClusterer initial_rights(tolerance); SimpleClusterer initial_rights(tolerance);
GenericVector<Cluster> initial_left_tabs; std::vector<Cluster> initial_left_tabs;
GenericVector<Cluster> initial_right_tabs; std::vector<Cluster> initial_right_tabs;
for (int i = row_start; i < row_end; i++) { for (int i = row_start; i < row_end; i++) {
initial_lefts.Add((*rows)[i].lindent_); initial_lefts.Add((*rows)[i].lindent_);
initial_rights.Add((*rows)[i].rindent_); initial_rights.Add((*rows)[i].rindent_);
@ -782,7 +790,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
} }
} }
if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) { if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
left_tabs->remove(to_prune); left_tabs->erase(left_tabs->begin() + to_prune);
} }
} }
if (right_tabs->size() == 3 && left_tabs->size() >= 4) { if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
@ -793,7 +801,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
} }
} }
if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) { if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
right_tabs->remove(to_prune); right_tabs->erase(right_tabs->begin() + to_prune);
} }
} }
} }
@ -817,7 +825,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
// Case 2b: Fully Justified. (eop_threshold > 0) // Case 2b: Fully Justified. (eop_threshold > 0)
// We mark a line as short (end of paragraph) if the offside indent // We mark a line as short (end of paragraph) if the offside indent
// is greater than eop_threshold. // is greater than eop_threshold.
static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end, static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
const ParagraphModel *model, bool ltr, int eop_threshold) { const ParagraphModel *model, bool ltr, int eop_threshold) {
if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
return; return;
@ -861,7 +869,7 @@ static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_
// Further, this struct holds the data we amass for the (single) ParagraphModel // Further, this struct holds the data we amass for the (single) ParagraphModel
// we'll assign to the text lines (assuming we get that far). // we'll assign to the text lines (assuming we get that far).
struct GeometricClassifierState { struct GeometricClassifierState {
GeometricClassifierState(int dbg_level, GenericVector<RowScratchRegisters> *r, int r_start, GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
int r_end) int r_end)
: debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) { : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
tolerance = InterwordSpace(*r, r_start, r_end); tolerance = InterwordSpace(*r, r_start, r_end);
@ -886,7 +894,7 @@ struct GeometricClassifierState {
} }
// Align tabs are the tab stops the text is aligned to. // Align tabs are the tab stops the text is aligned to.
const GenericVector<Cluster> &AlignTabs() const { const std::vector<Cluster> &AlignTabs() const {
if (just == tesseract::JUSTIFICATION_RIGHT) if (just == tesseract::JUSTIFICATION_RIGHT)
return right_tabs; return right_tabs;
return left_tabs; return left_tabs;
@ -897,7 +905,7 @@ struct GeometricClassifierState {
// Note that for a left-to-right text which is aligned to the right such as // Note that for a left-to-right text which is aligned to the right such as
// this function comment, the offside tabs are the horizontal tab stops // this function comment, the offside tabs are the horizontal tab stops
// marking the beginning of ("Note", "this" and "marking"). // marking the beginning of ("Note", "this" and "marking").
const GenericVector<Cluster> &OffsideTabs() const { const std::vector<Cluster> &OffsideTabs() const {
if (just == tesseract::JUSTIFICATION_RIGHT) if (just == tesseract::JUSTIFICATION_RIGHT)
return left_tabs; return left_tabs;
return right_tabs; return right_tabs;
@ -940,7 +948,7 @@ struct GeometricClassifierState {
// The Geometric Classifier was asked to find a single paragraph model // The Geometric Classifier was asked to find a single paragraph model
// to fit the text rows (*rows)[row_start, row_end) // to fit the text rows (*rows)[row_start, row_end)
GenericVector<RowScratchRegisters> *rows; std::vector<RowScratchRegisters> *rows;
int row_start = 0; int row_start = 0;
int row_end = 0; int row_end = 0;
@ -953,8 +961,8 @@ struct GeometricClassifierState {
// These left and right tab stops were determined to be the common tab // These left and right tab stops were determined to be the common tab
// stops for the given text. // stops for the given text.
GenericVector<Cluster> left_tabs; std::vector<Cluster> left_tabs;
GenericVector<Cluster> right_tabs; std::vector<Cluster> right_tabs;
// These are parameters we must determine to create a ParagraphModel. // These are parameters we must determine to create a ParagraphModel.
tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN; tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
@ -1083,7 +1091,7 @@ static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricCla
// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
// it's worth guessing that (A1b) is the correct interpretation if there are // it's worth guessing that (A1b) is the correct interpretation if there are
// far more "full" lines than "short" lines. // far more "full" lines than "short" lines.
static void GeometricClassify(int debug_level, GenericVector<RowScratchRegisters> *rows, static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, ParagraphTheory *theory) { int row_start, int row_end, ParagraphTheory *theory) {
if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
return; return;
@ -1223,7 +1231,7 @@ const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
} }
auto *m = new ParagraphModel(model); auto *m = new ParagraphModel(model);
models_->push_back(m); models_->push_back(m);
models_we_added_.push_back_new(m); push_back_new(models_we_added_, m);
return m; return m;
} }
@ -1231,7 +1239,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
size_t w = 0; size_t w = 0;
for (size_t r = 0; r < models_->size(); r++) { for (size_t r = 0; r < models_->size(); r++) {
ParagraphModel *m = (*models_)[r]; ParagraphModel *m = (*models_)[r];
if (!used_models.contains(m) && models_we_added_.contains(m)) { if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
delete m; delete m;
} else { } else {
if (r > w) { if (r > w) {
@ -1246,7 +1254,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
// Examine rows[start, end) and try to determine if an existing non-centered // Examine rows[start, end) and try to determine if an existing non-centered
// paragraph model would fit them perfectly. If so, return a pointer to it. // paragraph model would fit them perfectly. If so, return a pointer to it.
// If not, return nullptr. // If not, return nullptr.
const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegisters> *rows, const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
int start, int end) const { int start, int end) const {
for (const auto *model : *models_) { for (const auto *model : *models_) {
if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model))
@ -1258,7 +1266,7 @@ const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegist
// Adds to *models, without duplicates, every model in models_ whose
// justification is not JUSTIFICATION_CENTER. Entries already in *models are
// kept.
void ParagraphTheory::NonCenteredModels(SetOfModels *models) { void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
for (const auto *model : *models_) { for (const auto *model : *models_) {
if (model->justification() != JUSTIFICATION_CENTER) if (model->justification() != JUSTIFICATION_CENTER)
models->push_back_new(model); push_back_new(*models, model);
} }
} }
@ -1272,7 +1280,7 @@ int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
return -1; return -1;
} }
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row, bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model) { const ParagraphModel *model) {
if (!StrongModel(model)) { if (!StrongModel(model)) {
tprintf("ValidFirstLine() should only be called with strong models!\n"); tprintf("ValidFirstLine() should only be called with strong models!\n");
@ -1281,7 +1289,7 @@ bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
(*rows)[row].rindent_, (*rows)[row].rmargin_); (*rows)[row].rindent_, (*rows)[row].rmargin_);
} }
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row, bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model) { const ParagraphModel *model) {
if (!StrongModel(model)) { if (!StrongModel(model)) {
tprintf("ValidBodyLine() should only be called with strong models!\n"); tprintf("ValidBodyLine() should only be called with strong models!\n");
@ -1290,7 +1298,7 @@ bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
(*rows)[row].rindent_, (*rows)[row].rmargin_); (*rows)[row].rindent_, (*rows)[row].rmargin_);
} }
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b, bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
const ParagraphModel *model) { const ParagraphModel *model) {
if (model != kCrownRight && model != kCrownLeft) { if (model != kCrownRight && model != kCrownLeft) {
tprintf("CrownCompatible() should only be called with crown models!\n"); tprintf("CrownCompatible() should only be called with crown models!\n");
@ -1308,7 +1316,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int
// =============== Implementation of ParagraphModelSmearer ==================== // =============== Implementation of ParagraphModelSmearer ====================
ParagraphModelSmearer::ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows, ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, ParagraphTheory *theory) int row_start, int row_end, ParagraphTheory *theory)
: theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) { : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
@ -1341,7 +1349,7 @@ void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
// This is basic filtering; we check likely paragraph starty-ness down // This is basic filtering; we check likely paragraph starty-ness down
// below in Smear() -- you know, whether the first word would have fit // below in Smear() -- you know, whether the first word would have fit
// and such. // and such.
still_open.push_back_new(opened[m]); push_back_new(still_open, opened[m]);
} }
} }
OpenModels(row + 1) = still_open; OpenModels(row + 1) = still_open;
@ -1449,7 +1457,7 @@ void ParagraphModelSmearer::Smear() {
// Find out what ParagraphModels are actually used, and discard any // Find out what ParagraphModels are actually used, and discard any
// that are not. // that are not.
static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows, static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
ParagraphTheory *theory) { ParagraphTheory *theory) {
SetOfModels used_models; SetOfModels used_models;
for (int i = 0; i < rows.size(); i++) { for (int i = 0; i < rows.size(); i++) {
@ -1483,7 +1491,7 @@ static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
// sequences of body lines of equivalent type abutted against the beginning // sequences of body lines of equivalent type abutted against the beginning
// or a body or start line of a different type into a crown paragraph. // or a body or start line of a different type into a crown paragraph.
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
GenericVector<RowScratchRegisters> *rows) { std::vector<RowScratchRegisters> *rows) {
int start; int start;
for (int end = rows->size(); end > 0; end = start) { for (int end = rows->size(); end > 0; end = start) {
// Search back for a body line of a unique type. // Search back for a body line of a unique type.
@ -1546,7 +1554,7 @@ static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
// really just ignore it as an outlier. To express this, we allow the // really just ignore it as an outlier. To express this, we allow the
// user to specify the percentile (0..100) of indent values to use as // user to specify the percentile (0..100) of indent values to use as
// the common margin for each row in the run of rows[start, end). // the common margin for each row in the run of rows[start, end).
void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start, void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
int end, int percentile) { int end, int percentile) {
if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
return; return;
@ -1585,7 +1593,7 @@ void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows
} }
// Return the median inter-word space in rows[row_start, row_end). // Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end) { int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
if (row_end < row_start + 1) if (row_end < row_start + 1)
return 1; return 1;
int word_height = int word_height =
@ -1666,7 +1674,7 @@ static bool LikelyParagraphStart(const RowScratchRegisters &before,
// If the rows given could be a consistent start to a paragraph, set *consistent // If the rows given could be a consistent start to a paragraph, set *consistent
// true. // true.
static ParagraphModel InternalParagraphModelByOutline( static ParagraphModel InternalParagraphModelByOutline(
const GenericVector<RowScratchRegisters> *rows, int start, int end, int tolerance, const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
bool *consistent) { bool *consistent) {
int ltr_line_count = 0; int ltr_line_count = 0;
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
@ -1763,7 +1771,7 @@ static ParagraphModel InternalParagraphModelByOutline(
// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
// output if we're debugging. // output if we're debugging.
static ParagraphModel ParagraphModelByOutline(int debug_level, static ParagraphModel ParagraphModelByOutline(int debug_level,
const GenericVector<RowScratchRegisters> *rows, const std::vector<RowScratchRegisters> *rows,
int start, int end, int tolerance) { int start, int end, int tolerance) {
bool unused_consistent; bool unused_consistent;
ParagraphModel retval = ParagraphModel retval =
@ -1776,7 +1784,7 @@ static ParagraphModel ParagraphModelByOutline(int debug_level,
} }
// Do rows[start, end) form a single instance of the given paragraph model? // Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end, bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
const ParagraphModel *model) { const ParagraphModel *model) {
if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
return false; return false;
@ -1800,7 +1808,7 @@ bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int
// We only take the very strongest signals, as we don't want to get // We only take the very strongest signals, as we don't want to get
// confused and mark up centered text, poetry, or source code as // confused and mark up centered text, poetry, or source code as
// clearly part of a typical paragraph. // clearly part of a typical paragraph.
static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row_start, static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
int row_end) { int row_end) {
// Record patently obvious body text. // Record patently obvious body text.
for (int i = row_start + 1; i < row_end; i++) { for (int i = row_start + 1; i < row_end; i++) {
@ -1862,7 +1870,7 @@ static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row
// Look for sequences of a start line followed by some body lines in // Look for sequences of a start line followed by some body lines in
// rows[row_start, row_end) and create ParagraphModels for them if // rows[row_start, row_end) and create ParagraphModels for them if
// they seem coherent. // they seem coherent.
static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegisters> *rows, static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, bool allow_flush_models, int row_start, int row_end, bool allow_flush_models,
ParagraphTheory *theory) { ParagraphTheory *theory) {
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
@ -1951,7 +1959,7 @@ static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegiste
// clues. // clues.
// (3) Form models for any sequence of start + continuation lines. // (3) Form models for any sequence of start + continuation lines.
// (4) Smear the paragraph models to cover surrounding text. // (4) Smear the paragraph models to cover surrounding text.
static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegisters> *rows, static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, ParagraphTheory *theory) { int row_start, int row_end, ParagraphTheory *theory) {
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
return; return;
@ -1979,7 +1987,7 @@ static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegi
smearer.Smear(); smearer.Smear();
} }
static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows, int row_start, static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
int row_end, ParagraphTheory *theory) { int row_end, ParagraphTheory *theory) {
for (int i = row_start + 1; i < row_end - 1; i++) { for (int i = row_start + 1; i < row_end - 1; i++) {
if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders && if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
@ -1994,8 +2002,8 @@ static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
// Collect sequences of unique hypotheses in row registers and create proper // Collect sequences of unique hypotheses in row registers and create proper
// paragraphs for them, referencing the paragraphs in row_owners. // paragraphs for them, referencing the paragraphs in row_owners.
static void ConvertHypothesizedModelRunsToParagraphs(int debug_level, static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
GenericVector<RowScratchRegisters> &rows, std::vector<RowScratchRegisters> &rows,
GenericVector<PARA *> *row_owners, std::vector<PARA *> *row_owners,
ParagraphTheory *theory) { ParagraphTheory *theory) {
int end = rows.size(); int end = rows.size();
int start; int start;
@ -2090,7 +2098,7 @@ struct Interval {
// (1) If a line is surrounded by lines of unknown type, it's weak. // (1) If a line is surrounded by lines of unknown type, it's weak.
// (2) If two lines in a row are start lines for a given paragraph type, but // (2) If two lines in a row are start lines for a given paragraph type, but
// after that the same paragraph type does not continue, they're weak. // after that the same paragraph type does not continue, they're weak.
static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) { static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
SetOfModels row_models; SetOfModels row_models;
rows[row].StrongHypotheses(&row_models); rows[row].StrongHypotheses(&row_models);
@ -2145,8 +2153,8 @@ static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int ro
// + Crown paragraphs not immediately followed by a strongly modeled line. // + Crown paragraphs not immediately followed by a strongly modeled line.
// + Single line paragraphs surrounded by text that doesn't match the // + Single line paragraphs surrounded by text that doesn't match the
// model. // model.
static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows, static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
GenericVector<Interval> *to_fix, int row_start, int row_end) { std::vector<Interval> *to_fix, int row_start, int row_end) {
to_fix->clear(); to_fix->clear();
for (int i = row_start; i < row_end; i++) { for (int i = row_start; i < row_end; i++) {
bool needs_fixing = false; bool needs_fixing = false;
@ -2195,8 +2203,8 @@ static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the // normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs. // paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs) { void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
GenericVector<PARA *> &rows = *row_owners; std::vector<PARA *> &rows = *row_owners;
paragraphs->clear(); paragraphs->clear();
PARA_IT out(paragraphs); PARA_IT out(paragraphs);
PARA *formerly_null = nullptr; PARA *formerly_null = nullptr;
@ -2226,16 +2234,16 @@ void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *
// models - the list of paragraph models referenced by the PARA objects. // models - the list of paragraph models referenced by the PARA objects.
// caller is responsible for deleting the models. // caller is responsible for deleting the models.
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos, void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs, std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<ParagraphModel *> *models) { std::vector<ParagraphModel *> *models) {
GenericVector<RowScratchRegisters> rows; std::vector<RowScratchRegisters> rows;
ParagraphTheory theory(models); ParagraphTheory theory(models);
// Initialize row_owners to be a bunch of nullptr pointers. // Initialize row_owners to be a bunch of nullptr pointers.
row_owners->init_to_size(row_infos->size(), nullptr); row_owners->resize(row_infos->size());
// Set up row scratch registers for the main algorithm. // Set up row scratch registers for the main algorithm.
rows.init_to_size(row_infos->size(), RowScratchRegisters()); rows.resize(row_infos->size(), RowScratchRegisters());
for (int i = 0; i < row_infos->size(); i++) { for (int i = 0; i < row_infos->size(); i++) {
rows[i].Init((*row_infos)[i]); rows[i].Init((*row_infos)[i]);
} }
@ -2249,7 +2257,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
DebugDump(debug_level > 1, "End of Pass 1", theory, rows); DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
GenericVector<Interval> leftovers; std::vector<Interval> leftovers;
LeftoverSegments(rows, &leftovers, 0, rows.size()); LeftoverSegments(rows, &leftovers, 0, rows.size());
for (int i = 0; i < leftovers.size(); i++) { for (int i = 0; i < leftovers.size(); i++) {
// Pass 2a: // Pass 2a:
@ -2263,7 +2271,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
// If we had any luck in pass 2a, we got part of the page and didn't // If we had any luck in pass 2a, we got part of the page and didn't
// know how to classify a few runs of rows. Take the segments that // know how to classify a few runs of rows. Take the segments that
// didn't find a model and reprocess them individually. // didn't find a model and reprocess them individually.
GenericVector<Interval> leftovers2; std::vector<Interval> leftovers2;
LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end); LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
bool pass2a_was_useful = bool pass2a_was_useful =
leftovers2.size() > 1 || leftovers2.size() > 1 ||
@ -2422,7 +2430,7 @@ static void InitializeRowInfo(bool after_recognition, const MutableIterator &it,
} }
PAGE_RES_IT page_res_it = *it.PageResIt(); PAGE_RES_IT page_res_it = *it.PageResIt();
GenericVector<WERD_RES *> werds; std::vector<WERD_RES *> werds;
WERD_RES *word_res = page_res_it.restart_row(); WERD_RES *word_res = page_res_it.restart_row();
ROW_RES *this_row = page_res_it.row(); ROW_RES *this_row = page_res_it.row();
int num_leaders = 0; int num_leaders = 0;
@ -2505,12 +2513,12 @@ void DetectParagraphs(int debug_level, bool after_text_recognition,
} }
// Run the paragraph detection algorithm. // Run the paragraph detection algorithm.
GenericVector<PARA *> row_owners; std::vector<PARA *> row_owners;
GenericVector<PARA *> the_paragraphs; std::vector<PARA *> the_paragraphs;
if (!is_image_block) { if (!is_image_block) {
DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models); DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
} else { } else {
row_owners.init_to_size(row_infos.size(), nullptr); row_owners.resize(row_infos.size());
CanonicalizeDetectionResults(&row_owners, block->para_list()); CanonicalizeDetectionResults(&row_owners, block->para_list());
} }

View File

@ -31,9 +31,6 @@ class ParagraphModel;
class PARA_LIST; class PARA_LIST;
struct PARA; struct PARA;
template <typename T>
class GenericVector;
// This structure captures all information needed about a text line for the // This structure captures all information needed about a text line for the
// purposes of paragraph detection. It is meant to be exceedingly light-weight // purposes of paragraph detection. It is meant to be exceedingly light-weight
// so that we can easily test paragraph detection independent of the rest of // so that we can easily test paragraph detection independent of the rest of
@ -90,7 +87,7 @@ public:
// caller is responsible for deleting the models. // caller is responsible for deleting the models.
TESS_API TESS_API
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos, void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs, std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<ParagraphModel *> *models); std::vector<ParagraphModel *> *models);
// Given a MutableIterator to the start of a block, run DetectParagraphs on // Given a MutableIterator to the start of a block, run DetectParagraphs on

View File

@ -95,7 +95,7 @@ struct LineHypothesis {
class ParagraphTheory; // Forward Declaration class ParagraphTheory; // Forward Declaration
using SetOfModels = GenericVector<const ParagraphModel *>; using SetOfModels = std::vector<const ParagraphModel *>;
// Row Scratch Registers are data generated by the paragraph detection // Row Scratch Registers are data generated by the paragraph detection
// algorithm based on a RowInfo input. // algorithm based on a RowInfo input.
@ -123,7 +123,7 @@ public:
// Clear all hypotheses about this line. // Clear all hypotheses about this line.
void SetUnknown() { void SetUnknown() {
hypotheses_.truncate(0); hypotheses_.clear();
} }
// Append all hypotheses of strong models that match this row as a start. // Append all hypotheses of strong models that match this row as a start.
@ -190,7 +190,7 @@ public:
private: private:
// Hypotheses of either LT_START or LT_BODY // Hypotheses of either LT_START or LT_BODY
GenericVector<LineHypothesis> hypotheses_; std::vector<LineHypothesis> hypotheses_;
}; };
// A collection of convenience functions for wrapping the set of // A collection of convenience functions for wrapping the set of
@ -219,21 +219,21 @@ public:
// If any of the non-centered paragraph models we know about fit // If any of the non-centered paragraph models we know about fit
// rows[start, end), return it. Else nullptr. // rows[start, end), return it. Else nullptr.
const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows, int start, const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
int end) const; int end) const;
int IndexOf(const ParagraphModel *model) const; int IndexOf(const ParagraphModel *model) const;
private: private:
std::vector<ParagraphModel *> *models_; std::vector<ParagraphModel *> *models_;
GenericVector<ParagraphModel *> models_we_added_; std::vector<ParagraphModel *> models_we_added_;
}; };
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row, bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model); const ParagraphModel *model);
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row, bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model); const ParagraphModel *model);
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b, bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
const ParagraphModel *model); const ParagraphModel *model);
// A class for smearing Paragraph Model hypotheses to surrounding rows. // A class for smearing Paragraph Model hypotheses to surrounding rows.
@ -245,7 +245,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int
// "smear" our models over the text. // "smear" our models over the text.
class ParagraphModelSmearer { class ParagraphModelSmearer {
public: public:
ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end, ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
ParagraphTheory *theory); ParagraphTheory *theory);
// Smear forward paragraph models from existing row markings to subsequent // Smear forward paragraph models from existing row markings to subsequent
@ -266,7 +266,7 @@ private:
} }
ParagraphTheory *theory_; ParagraphTheory *theory_;
GenericVector<RowScratchRegisters> *rows_; std::vector<RowScratchRegisters> *rows_;
int row_start_; int row_start_;
int row_end_; int row_end_;
@ -284,11 +284,11 @@ private:
// Clear all hypotheses about lines [start, end) and reset the margins to the // Clear all hypotheses about lines [start, end) and reset the margins to the
// percentile (0..100) value of the left and right row edges for this run of // percentile (0..100) value of the left and right row edges for this run of
// rows. // rows.
void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start, void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
int end, int percentile); int end, int percentile);
// Return the median inter-word space in rows[row_start, row_end). // Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end); int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
// Return whether the first word on the after line can fit in the space at // Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read). // the end of the before line (knowing which way the text is aligned and read).
@ -300,13 +300,13 @@ bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRe
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after); bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
// Do rows[start, end) form a single instance of the given paragraph model? // Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end, bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
const ParagraphModel *model); const ParagraphModel *model);
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the // normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs. // paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs); void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
} // namespace tesseract } // namespace tesseract

View File

@ -45,7 +45,7 @@
#include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L... #include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
#include <tesseract/unichar.h> // for UNICHAR_ID #include <tesseract/unichar.h> // for UNICHAR_ID
#include "genericvector.h" // for GenericVector, PointerVector #include "genericvector.h" // for PointerVector
#include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe... #include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe...
@ -398,27 +398,27 @@ public:
// Input: a set of noisy outlines that probably belong to the real_word. // Input: a set of noisy outlines that probably belong to the real_word.
// Output: outlines that overlapped blobs are set to nullptr and put back into // Output: outlines that overlapped blobs are set to nullptr and put back into
// the word, either in the blobs or in the reject list. // the word, either in the blobs or in the reject list.
void AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass, void AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it, WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted, std::vector<bool> *word_wanted,
GenericVector<bool> *overlapped_any_blob, std::vector<bool> *overlapped_any_blob,
GenericVector<C_BLOB *> *target_blobs); std::vector<C_BLOB *> *target_blobs);
// Attempts to assign non-overlapping outlines to their nearest blobs or // Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them. // make new blobs out of them.
void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass, void AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it, WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted, std::vector<bool> *word_wanted,
GenericVector<C_BLOB *> *target_blobs); std::vector<C_BLOB *> *target_blobs);
// Starting with ok_outlines set to indicate which outlines overlap the blob, // Starting with ok_outlines set to indicate which outlines overlap the blob,
// chooses the optimal set (approximately) and returns true if any outlines // chooses the optimal set (approximately) and returns true if any outlines
// are desired, in which case ok_outlines indicates which ones. // are desired, in which case ok_outlines indicates which ones.
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
C_BLOB *blob, const GenericVector<C_OUTLINE *> &outlines, C_BLOB *blob, const std::vector<C_OUTLINE *> &outlines,
int num_outlines, std::vector<bool> *ok_outlines); int num_outlines, std::vector<bool> *ok_outlines);
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice. // the inclusion of the outlines, and returns the certainty of the raw choice.
float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines, float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
const GenericVector<C_OUTLINE *> &outlines, int pass_n, const std::vector<C_OUTLINE *> &outlines, int pass_n,
PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str); PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str);
// Classifies the given blob (part of word_data->word->word) as an individual // Classifies the given blob (part of word_data->word->word) as an individual
// word, using languages, chopper etc, returning only the certainty of the // word, using languages, chopper etc, returning only the certainty of the
@ -703,22 +703,22 @@ public:
void ReSegmentByClassification(PAGE_RES *page_res); void ReSegmentByClassification(PAGE_RES *page_res);
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
// Returns false if an invalid UNICHAR_ID is encountered. // Returns false if an invalid UNICHAR_ID is encountered.
bool ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids); bool ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids);
// Resegments the word to achieve the target_text from the classifier. // Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails. // Returns false if the re-segmentation fails.
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified // applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
// substitutions ARE used. // substitutions ARE used.
bool FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res); bool FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res);
// Recursive helper to find a match to the target_text (from text_index // Recursive helper to find a match to the target_text (from text_index
// position) in the choices (from choices_pos position). // position) in the choices (from choices_pos position).
// Choices is an array of GenericVectors, of length choices_length, with each // Choices is an array of vectors of length choices_length, with each
// element representing a starting position in the word, and the // element representing a starting position in the word, and the
// GenericVector holding classification results for a sequence of consecutive // vector holding classification results for a sequence of consecutive
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc. // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
void SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos, void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const GenericVector<UNICHAR_ID> &target_text, int choices_length, const std::vector<UNICHAR_ID> &target_text,
int text_index, float rating, std::vector<int> *segmentation, int text_index, float rating, std::vector<int> *segmentation,
float *best_rating, std::vector<int> *best_segmentation); float *best_rating, std::vector<int> *best_segmentation);
// Counts up the labelled words and the blobs within. // Counts up the labelled words and the blobs within.

View File

@ -502,7 +502,7 @@ void WERD::CleanNoise(float size_threshold) {
// Extracts all the noise outlines and stuffs the pointers into the given // Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers. // vector of outlines. Afterwards, the outlines vector owns the pointers.
void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) { void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) {
C_BLOB_IT rej_it(&rej_cblobs); C_BLOB_IT rej_it(&rej_cblobs);
for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
C_BLOB *blob = rej_it.extract(); C_BLOB *blob = rej_it.extract();
@ -516,13 +516,13 @@ void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) {
// back in rej_cblobs where they came from. Where the target_blobs entry is // back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob. // nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence // Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.) // vector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which // Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise // suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end. // sets make_next_word_fuzzy true if any new blob was added to the end.
bool WERD::AddSelectedOutlines(const GenericVector<bool> &wanted, bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,
const GenericVector<C_BLOB *> &target_blobs, const std::vector<C_BLOB *> &target_blobs,
const GenericVector<C_OUTLINE *> &outlines, const std::vector<C_OUTLINE *> &outlines,
bool *make_next_word_fuzzy) { bool *make_next_word_fuzzy) {
bool outline_added_to_start = false; bool outline_added_to_start = false;
if (make_next_word_fuzzy != nullptr) if (make_next_word_fuzzy != nullptr)

View File

@ -21,7 +21,6 @@
#include "bits16.h" #include "bits16.h"
#include "elst2.h" #include "elst2.h"
#include "genericvector.h" // GenericVector
#include "params.h" #include "params.h"
#include "stepblob.h" #include "stepblob.h"
@ -173,18 +172,18 @@ public:
// Extracts all the noise outlines and stuffs the pointers into the given // Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers. // vector of outlines. Afterwards, the outlines vector owns the pointers.
void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines); void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
// Adds the selected outlines to the indicated real blobs, and puts the rest // Adds the selected outlines to the indicated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is // back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob. // nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence // Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.) // vector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which // Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise // suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end. // sets make_next_word_fuzzy true if any new blob was added to the end.
bool AddSelectedOutlines(const GenericVector<bool> &wanted, bool AddSelectedOutlines(const std::vector<bool> &wanted,
const GenericVector<C_BLOB *> &target_blobs, const std::vector<C_BLOB *> &target_blobs,
const GenericVector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy); const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
private: private:
uint8_t blanks = 0; // no of blanks uint8_t blanks = 0; // no of blanks

View File

@ -225,16 +225,6 @@ public:
qsort(data_, size_used_, sizeof(*data_), comparator); qsort(data_, size_used_, sizeof(*data_), comparator);
} }
// Searches the array (assuming sorted in ascending order, using sort()) for
// an element equal to target and returns true if it is present.
// Use binary_search to get the index of target, or its nearest candidate.
bool bool_binary_search(const T &target) const {
int index = binary_search(target);
if (index >= size_used_) {
return false;
}
return data_[index] == target;
}
// Searches the array (assuming sorted in ascending order, using sort()) for // Searches the array (assuming sorted in ascending order, using sort()) for
// an element equal to target and returns the index of the best candidate. // an element equal to target and returns the index of the best candidate.
// The return value is conceptually the largest index i such that // The return value is conceptually the largest index i such that

View File

@ -92,15 +92,15 @@ public:
return ComputeForegroundDensity(tbox); return ComputeForegroundDensity(tbox);
} }
int RunCountAlignment(const GenericVector<int> &sorted_vec, const int val) { int RunCountAlignment(const std::vector<int> &sorted_vec, const int val) {
return CountAlignment(sorted_vec, val); return CountAlignment(sorted_vec, val);
} }
void RunSplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) { void RunSplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
SplitCPHorLite(part, splitted_boxes); SplitCPHorLite(part, splitted_boxes);
} }
void RunSplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) { void RunSplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
SplitCPHor(part, parts_splitted); SplitCPHor(part, parts_splitted);
} }
@ -377,7 +377,7 @@ TEST_F(EquationFinderTest, ComputeForegroundDensity) {
} }
TEST_F(EquationFinderTest, CountAlignment) { TEST_F(EquationFinderTest, CountAlignment) {
GenericVector<int> vec; std::vector<int> vec;
vec.push_back(1); vec.push_back(1);
vec.push_back(1); vec.push_back(1);
vec.push_back(1); vec.push_back(1);
@ -452,7 +452,7 @@ TEST_F(EquationFinderTest, SplitCPHorLite) {
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
part->DeleteBoxes(); part->DeleteBoxes();
part->set_median_width(10); part->set_median_width(10);
GenericVector<TBOX> splitted_boxes; std::vector<TBOX> splitted_boxes;
// Test an empty part. // Test an empty part.
equation_det_->RunSplitCPHorLite(part, &splitted_boxes); equation_det_->RunSplitCPHorLite(part, &splitted_boxes);
@ -486,7 +486,7 @@ TEST_F(EquationFinderTest, SplitCPHor) {
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
part->DeleteBoxes(); part->DeleteBoxes();
part->set_median_width(10); part->set_median_width(10);
GenericVector<ColPartition *> parts_splitted; std::vector<ColPartition *> parts_splitted;
// Test an empty part. // Test an empty part.
equation_det_->RunSplitCPHor(part, &parts_splitted); equation_det_->RunSplitCPHor(part, &parts_splitted);
@ -512,7 +512,9 @@ TEST_F(EquationFinderTest, SplitCPHor) {
EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box()); EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box());
EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box()); EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box());
parts_splitted.delete_data_pointers(); for (auto part_splitted : parts_splitted) {
delete part_splitted;
}
part->DeleteBoxes(); part->DeleteBoxes();
delete (part); delete (part);
} }

View File

@ -107,7 +107,7 @@ void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo
// Given n rows of reference ground truth, evaluate whether the n rows // Given n rows of reference ground truth, evaluate whether the n rows
// of PARA * pointers yield the same paragraph breakpoints. // of PARA * pointers yield the same paragraph breakpoints.
void EvaluateParagraphDetection(const TextAndModel *correct, int n, void EvaluateParagraphDetection(const TextAndModel *correct, int n,
const GenericVector<PARA *> &detector_output) { const std::vector<PARA *> &detector_output) {
int incorrect_breaks = 0; int incorrect_breaks = 0;
int missed_breaks = 0; int missed_breaks = 0;
int poorly_matched_models = 0; int poorly_matched_models = 0;
@ -186,7 +186,7 @@ void EvaluateParagraphDetection(const TextAndModel *correct, int n,
void TestParagraphDetection(const TextAndModel *correct, int num_rows) { void TestParagraphDetection(const TextAndModel *correct, int num_rows) {
std::vector<RowInfo> row_infos; std::vector<RowInfo> row_infos;
GenericVector<PARA *> row_owners; std::vector<PARA *> row_owners;
PARA_LIST paragraphs; PARA_LIST paragraphs;
std::vector<ParagraphModel *> models; std::vector<ParagraphModel *> models;
@ -312,7 +312,7 @@ TEST(ParagraphsTest, TestSingleFullPageContinuation) {
const TextAndModel *correct = kSingleFullPageContinuation; const TextAndModel *correct = kSingleFullPageContinuation;
int num_rows = countof(kSingleFullPageContinuation); int num_rows = countof(kSingleFullPageContinuation);
std::vector<RowInfo> row_infos; std::vector<RowInfo> row_infos;
GenericVector<PARA *> row_owners; std::vector<PARA *> row_owners;
PARA_LIST paragraphs; PARA_LIST paragraphs;
std::vector<ParagraphModel *> models; std::vector<ParagraphModel *> models;
models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10)); models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));