Merge pull request #3332 from stweil/vector

Replace some GenericVector with std::vector
This commit is contained in:
Egor Pugin 2021-03-16 19:15:21 +03:00 committed by GitHub
commit d604bf3c68
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
71 changed files with 686 additions and 669 deletions

View File

@ -24,7 +24,6 @@
# include "boxread.h"
#endif // ndef DISABLED_LEGACY_ENGINE
#include <tesseract/unichar.h>
#include "genericvector.h"
#include "pageres.h"
#include "tesseractclass.h"
#include "unicharset.h"
@ -240,7 +239,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
GenericVector<BLOB_CHOICE *> blob_choices;
std::vector<BLOB_CHOICE *> blob_choices;
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
auto rating = static_cast<float>(INT8_MAX);
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
@ -271,7 +270,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
// combine confidence w/ serial #
auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
0.0f, 0.0f, BCC_FAKE);
blob_choices.insert(right_choice, blob_number + 1);
blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
}
}
word_res->CloneChoppedToRebuild();
@ -374,8 +373,8 @@ bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const
// Eliminated best_state and correct_text entries for the consumed
// blobs.
for (int j = 1; j < blob_count; ++j) {
word_res->best_state.remove(i + 1);
word_res->correct_text.remove(i + 1);
word_res->best_state.erase(word_res->best_state.begin() + i + 1);
word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
}
// Assume that no box spans multiple source words, so we are done with
// this box.
@ -489,7 +488,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
if (word->text() == nullptr || word->text()[0] == '\0')
continue; // Ignore words that have no text.
// Convert the correct text to a vector of UNICHAR_ID
GenericVector<UNICHAR_ID> target_text;
std::vector<UNICHAR_ID> target_text;
if (!ConvertStringToUnichars(word->text(), &target_text)) {
tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
pr_it.DeleteCurrentWord();
@ -505,7 +504,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
/// @return false if an invalid UNICHAR_ID is encountered.
bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids) {
bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
for (int step = 0; *utf8 != '\0'; utf8 += step) {
const char *next_space = strchr(utf8, ' ');
if (next_space == nullptr)
@ -528,10 +527,10 @@ bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
// Classify all required combinations of blobs and save results in choices.
const int word_length = word_res->box_word->length();
auto *choices = new GenericVector<BLOB_CHOICE_LIST *>[word_length];
auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST *match_result =
@ -548,12 +547,15 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
// match. Using wildcards makes it difficult to find the correct
// segmentation even when it is there.
word_res->best_state.clear();
GenericVector<int> search_segmentation;
std::vector<int> search_segmentation;
float best_rating = 0.0f;
SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
&word_res->best_state);
for (int i = 0; i < word_length; ++i)
choices[i].delete_data_pointers();
for (int i = 0; i < word_length; ++i) {
for (auto choice : choices[i]) {
delete choice;
}
}
delete[] choices;
if (word_res->best_state.empty()) {
// Build the original segmentation and if it is the same length as the
@ -583,9 +585,9 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length,
/// @param choices is an array of vectors of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive
/// #vector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos
/// @param choices_length
@ -595,10 +597,10 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
/// @param segmentation
/// @param best_rating
/// @param best_segmentation
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const GenericVector<UNICHAR_ID> &target_text,
int text_index, float rating, GenericVector<int> *segmentation,
float *best_rating, GenericVector<int> *best_segmentation) {
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const std::vector<UNICHAR_ID> &target_text,
int text_index, float rating, std::vector<int> *segmentation,
float *best_rating, std::vector<int> *best_segmentation) {
const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
for (int length = 1; length <= choices[choices_pos].size(); ++length) {
// Rating of matching choice or worst choice if no match.
@ -654,7 +656,7 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices,
unicharset.id_to_unichar(target_text[text_index]));
}
}
segmentation->truncate(segmentation->size() - 1);
segmentation->resize(segmentation->size() - 1);
}
}

View File

@ -461,8 +461,8 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
continue;
}
// Two words sharing the same language model, excellent!
GenericVector<WERD_CHOICE *> overrides_word1;
GenericVector<WERD_CHOICE *> overrides_word2;
std::vector<WERD_CHOICE *> overrides_word1;
std::vector<WERD_CHOICE *> overrides_word2;
const auto orig_w1_str = w_prev->best_choice->unichar_string();
const auto orig_w2_str = w->best_choice->unichar_string();
@ -768,7 +768,7 @@ static int SelectBestWords(double rating_ratio, double certainty_margin, bool de
PointerVector<WERD_RES> *best_words) {
// Process the smallest groups of words that have an overlapping word
// boundary at the end.
GenericVector<WERD_RES *> out_words;
std::vector<WERD_RES *> out_words;
// Index into each word vector (best, new).
int b = 0, n = 0;
int num_best = 0, num_new = 0;
@ -893,19 +893,19 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
return false;
real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
// Get the noise outlines into a vector with matching bool map.
GenericVector<C_OUTLINE *> outlines;
std::vector<C_OUTLINE *> outlines;
real_word->GetNoiseOutlines(&outlines);
GenericVector<bool> word_wanted;
GenericVector<bool> overlapped_any_blob;
GenericVector<C_BLOB *> target_blobs;
std::vector<bool> word_wanted;
std::vector<bool> overlapped_any_blob;
std::vector<C_BLOB *> target_blobs;
AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
&overlapped_any_blob, &target_blobs);
// Filter the outlines that overlapped any blob and put them into the word
// now. This simplifies the remaining task and also makes it more accurate
// as it has more completed blobs to work on.
GenericVector<bool> wanted;
GenericVector<C_BLOB *> wanted_blobs;
GenericVector<C_OUTLINE *> wanted_outlines;
std::vector<bool> wanted;
std::vector<C_BLOB *> wanted_blobs;
std::vector<C_OUTLINE *> wanted_outlines;
int num_overlapped = 0;
int num_overlapped_used = 0;
for (int i = 0; i < overlapped_any_blob.size(); ++i) {
@ -948,11 +948,11 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
// Output: word_wanted indicates which outlines are to be assigned to a blob,
// target_blobs indicates which to assign to, and overlapped_any_blob is
// true for all outlines that overlapped a blob.
void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines,
void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
int pass, WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted,
GenericVector<bool> *overlapped_any_blob,
GenericVector<C_BLOB *> *target_blobs) {
std::vector<bool> *word_wanted,
std::vector<bool> *overlapped_any_blob,
std::vector<C_BLOB *> *target_blobs) {
std::vector<bool> blob_wanted;
word_wanted->resize(outlines.size(), false);
overlapped_any_blob->resize(outlines.size(), false);
@ -999,10 +999,10 @@ void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE
// Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them.
void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted,
GenericVector<C_BLOB *> *target_blobs) {
std::vector<bool> *word_wanted,
std::vector<C_BLOB *> *target_blobs) {
std::vector<bool> blob_wanted;
word_wanted->resize(outlines.size(), false);
target_blobs->resize(outlines.size(), nullptr);
@ -1077,7 +1077,7 @@ void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &out
// are desired, in which case ok_outlines indicates which ones.
bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
C_BLOB *blob,
const GenericVector<C_OUTLINE *> &outlines,
const std::vector<C_OUTLINE *> &outlines,
int num_outlines, std::vector<bool> *ok_outlines) {
std::string best_str;
float target_cert = certainty_threshold;
@ -1161,7 +1161,7 @@ bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
const GenericVector<C_OUTLINE *> &outlines, int pass_n,
const std::vector<C_OUTLINE *> &outlines, int pass_n,
PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
C_OUTLINE_IT ol_it;
C_OUTLINE *first_to_keep = nullptr;
@ -1865,8 +1865,7 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
const int fontinfo_size = get_fontinfo_table().size();
if (fontinfo_size == 0)
return;
GenericVector<int> font_total_score;
font_total_score.init_to_size(fontinfo_size, 0);
std::vector<int> font_total_score(fontinfo_size);
// Compute the font scores for the word
if (tessedit_debug_fonts) {

View File

@ -131,7 +131,7 @@ int EquationDetect::LabelSpecialText(TO_BLOCK *to_block) {
return -1;
}
GenericVector<BLOBNBOX_LIST *> blob_lists;
std::vector<BLOBNBOX_LIST *> blob_lists;
blob_lists.push_back(&(to_block->blobs));
blob_lists.push_back(&(to_block->large_blobs));
for (int i = 0; i < blob_lists.size(); ++i) {
@ -223,16 +223,17 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &uni
if (unicharset.get_ispunctuation(id)) {
// Exclude some special texts that are likely to be confused as math symbol.
static GenericVector<UNICHAR_ID> ids_to_exclude;
static std::vector<UNICHAR_ID> ids_to_exclude;
if (ids_to_exclude.empty()) {
static const char *kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
"〈", "〉", "《", "》", "」", "「"};
for (auto i = 0; i < countof(kCharsToEx); i++) {
ids_to_exclude.push_back(unicharset.unichar_to_id(kCharsToEx[i]));
}
ids_to_exclude.sort();
std::sort(ids_to_exclude.begin(), ids_to_exclude.end());
}
return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
auto found = std::binary_search(ids_to_exclude.begin(), ids_to_exclude.end(), id);
return found ? BSTT_NONE : BSTT_MATH;
}
// Check if it is digit. In addition to the isdigit attribute, we also check
@ -266,13 +267,13 @@ void EquationDetect::IdentifySpecialText() {
IdentifyBlobsToSkip(part);
BLOBNBOX_C_IT bbox_it(part->boxes());
// Compute the height threshold.
GenericVector<int> blob_heights;
std::vector<int> blob_heights;
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
blob_heights.push_back(bbox_it.data()->bounding_box().height());
}
}
blob_heights.sort();
std::sort(blob_heights.begin(), blob_heights.end());
const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
@ -377,7 +378,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
// Pass 3: expand block equation seeds.
while (!cp_seeds_.empty()) {
GenericVector<ColPartition *> seeds_expanded;
std::vector<ColPartition *> seeds_expanded;
for (int i = 0; i < cp_seeds_.size(); ++i) {
if (ExpandSeed(cp_seeds_[i])) {
// If this seed is expanded, then we add it into seeds_expanded. Note
@ -407,14 +408,14 @@ void EquationDetect::MergePartsByLocation() {
while (true) {
ColPartition *part = nullptr;
// partitions that have been updated.
GenericVector<ColPartition *> parts_updated;
std::vector<ColPartition *> parts_updated;
ColPartitionGridSearch gsearch(part_grid_);
gsearch.StartFullSearch();
while ((part = gsearch.NextFullSearch()) != nullptr) {
if (!IsTextOrEquationType(part->type())) {
continue;
}
GenericVector<ColPartition *> parts_to_merge;
std::vector<ColPartition *> parts_to_merge;
SearchByOverlap(part, &parts_to_merge);
if (parts_to_merge.empty()) {
continue;
@ -443,7 +444,7 @@ void EquationDetect::MergePartsByLocation() {
}
void EquationDetect::SearchByOverlap(ColPartition *seed,
GenericVector<ColPartition *> *parts_overlap) {
std::vector<ColPartition *> *parts_overlap) {
ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);
if (!IsTextOrEquationType(seed->type())) {
return;
@ -457,7 +458,7 @@ void EquationDetect::SearchByOverlap(ColPartition *seed,
// Search iteratively.
ColPartition *part;
GenericVector<ColPartition *> parts;
std::vector<ColPartition *> parts;
const float kLargeOverlapTh = 0.95;
const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
while ((part = search.NextRadSearch()) != nullptr) {
@ -518,11 +519,11 @@ void EquationDetect::IdentifySeedParts() {
ColPartition *part = nullptr;
gsearch.StartFullSearch();
GenericVector<ColPartition *> seeds1, seeds2;
std::vector<ColPartition *> seeds1, seeds2;
// The left coordinates of indented text partitions.
GenericVector<int> indented_texts_left;
std::vector<int> indented_texts_left;
// The foreground density of text partitions.
GenericVector<float> texts_foreground_density;
std::vector<float> texts_foreground_density;
while ((part = gsearch.NextFullSearch()) != nullptr) {
if (!IsTextOrEquationType(part->type())) {
continue;
@ -552,8 +553,8 @@ void EquationDetect::IdentifySeedParts() {
}
// Sort the features collected from text regions.
indented_texts_left.sort();
texts_foreground_density.sort();
std::sort(indented_texts_left.begin(), indented_texts_left.end());
std::sort(texts_foreground_density.begin(), texts_foreground_density.end());
float foreground_density_th = 0.15; // Default value.
if (!texts_foreground_density.empty()) {
// Use the median of the texts_foreground_density.
@ -598,7 +599,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
ASSERT_HOST(part);
// Split part horizontally, and check each sub part.
GenericVector<TBOX> sub_boxes;
std::vector<TBOX> sub_boxes;
SplitCPHorLite(part, &sub_boxes);
float parts_passed = 0.0;
for (int i = 0; i < sub_boxes.size(); ++i) {
@ -615,7 +616,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
return retval;
}
void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) {
void EquationDetect::SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
ASSERT_HOST(part && parts_splitted);
if (part->median_width() == 0 || part->boxes_count() == 0) {
return;
@ -623,7 +624,9 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *
// Make a copy of part, and reset parts_splitted.
ColPartition *right_part = part->CopyButDontOwnBlobs();
parts_splitted->delete_data_pointers();
for (auto part : *parts_splitted) {
delete part;
}
parts_splitted->clear();
const double kThreshold = part->median_width() * 3.0;
@ -663,7 +666,7 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *
parts_splitted->push_back(right_part);
}
void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) {
void EquationDetect::SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
ASSERT_HOST(part && splitted_boxes);
splitted_boxes->clear();
if (part->median_width() == 0) {
@ -701,7 +704,7 @@ void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *spl
}
}
bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left,
bool EquationDetect::CheckForSeed2(const std::vector<int> &indented_texts_left,
const float foreground_density_th, ColPartition *part) {
ASSERT_HOST(part);
const TBOX &box = part->bounding_box();
@ -720,22 +723,25 @@ bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left
return true;
}
int EquationDetect::CountAlignment(const GenericVector<int> &sorted_vec, const int val) const {
int EquationDetect::CountAlignment(const std::vector<int> &sorted_vec, const int val) const {
if (sorted_vec.empty()) {
return 0;
}
const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
const int pos = sorted_vec.binary_search(val);
const int kDistTh = static_cast<int>(round(0.03f * resolution_));
auto pos = std::upper_bound(sorted_vec.begin(), sorted_vec.end(), val);
if (pos > sorted_vec.begin()) {
--pos;
}
int count = 0;
// Search left side.
int index = pos;
auto index = pos - sorted_vec.begin();
while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
count++;
}
// Search right side.
index = pos + 1;
index = pos + 1 - sorted_vec.begin();
while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
count++;
}
@ -764,9 +770,9 @@ void EquationDetect::ComputeCPsSuperBBox() {
void EquationDetect::IdentifyInlinePartsHorizontal() {
ASSERT_HOST(cps_super_bbox_);
GenericVector<ColPartition *> new_seeds;
std::vector<ColPartition *> new_seeds;
const int kMarginDiffTh = IntCastRounded(0.5 * lang_tesseract_->source_resolution());
const int kGapTh = static_cast<int>(roundf(1.0 * lang_tesseract_->source_resolution()));
const int kGapTh = static_cast<int>(round(1.0f * lang_tesseract_->source_resolution()));
ColPartitionGridSearch search(part_grid_);
search.SetUniqueMode(true);
// The center x coordinate of the cp_super_bbox_.
@ -826,7 +832,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
// Get the y gap between text partitions;
ColPartition *current = nullptr, *prev = nullptr;
gsearch.StartFullSearch();
GenericVector<int> ygaps;
std::vector<int> ygaps;
while ((current = gsearch.NextFullSearch()) != nullptr) {
if (!PTIsTextType(current->type())) {
continue;
@ -851,7 +857,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
}
// Compute the line spacing from ygaps: use the mean of the first half.
ygaps.sort();
std::sort(ygaps.begin(), ygaps.end());
int spacing = 0, count;
for (count = 0; count < ygaps.size() / 2; count++) {
spacing += ygaps[count];
@ -867,12 +873,12 @@ void EquationDetect::IdentifyInlinePartsVertical(const bool top_to_bottom,
// Sort cp_seeds_.
if (top_to_bottom) { // From top to bottom.
cp_seeds_.sort(&SortCPByTopReverse);
std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByTopReverse);
} else { // From bottom to top.
cp_seeds_.sort(&SortCPByBottom);
std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByBottom);
}
GenericVector<ColPartition *> new_seeds;
std::vector<ColPartition *> new_seeds;
for (int i = 0; i < cp_seeds_.size(); ++i) {
ColPartition *part = cp_seeds_[i];
// If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
@ -918,8 +924,8 @@ bool EquationDetect::IsInline(const bool search_bottom, const int textparts_line
// Check if neighbor and part is inline similar.
const float kHeightRatioTh = 0.5;
const int kYGapTh = textparts_linespacing > 0
? textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_))
: static_cast<int>(roundf(0.05 * resolution_)); // Default value.
? textparts_linespacing + static_cast<int>(round(0.02f * resolution_))
: static_cast<int>(round(0.05f * resolution_)); // Default value.
if (part_box.x_overlap(neighbor_box) && // Location feature.
part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
// Geo feature.
@ -973,9 +979,9 @@ EquationDetect::IndentType EquationDetect::IsIndented(ColPartition *part) {
ColPartitionGridSearch search(part_grid_);
ColPartition *neighbor = nullptr;
const TBOX &part_box(part->bounding_box());
const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
const int kXGapTh = static_cast<int>(round(0.5f * resolution_));
const int kRadiusTh = static_cast<int>(round(3.0f * resolution_));
const int kYGapTh = static_cast<int>(round(0.5f * resolution_));
// Here we use a simple approximation algorithm: from the center of part, We
// perform the radius search, and check if we can find a neighboring partition
@ -1036,7 +1042,7 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
}
// Expand in four directions.
GenericVector<ColPartition *> parts_to_merge;
std::vector<ColPartition *> parts_to_merge;
ExpandSeedHorizontal(true, seed, &parts_to_merge);
ExpandSeedHorizontal(false, seed, &parts_to_merge);
ExpandSeedVertical(true, seed, &parts_to_merge);
@ -1073,10 +1079,10 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
}
void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge) {
std::vector<ColPartition *> *parts_to_merge) {
ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);
const float kYOverlapTh = 0.6;
const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
const int kXGapTh = static_cast<int>(round(0.2f * resolution_));
ColPartitionGridSearch search(part_grid_);
const TBOX &seed_box(seed->bounding_box());
@ -1125,10 +1131,10 @@ void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *
}
void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge) {
std::vector<ColPartition *> *parts_to_merge) {
ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr && cps_super_bbox_ != nullptr);
const float kXOverlapTh = 0.4;
const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
const int kYGapTh = static_cast<int>(round(0.2f * resolution_));
ColPartitionGridSearch search(part_grid_);
const TBOX &seed_box(seed->bounding_box());
@ -1138,7 +1144,7 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *
// Search iteratively.
ColPartition *part = nullptr;
GenericVector<ColPartition *> parts;
std::vector<ColPartition *> parts;
int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;
while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {
if (part == seed) {
@ -1206,8 +1212,8 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *
}
bool EquationDetect::IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const {
const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
const int kXGapTh = static_cast<int>(round(0.25f * resolution_));
const int kYGapTh = static_cast<int>(round(0.05f * resolution_));
// Check geometric feature.
if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) {
@ -1244,7 +1250,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
// Iterate over part_grid_, and find all parts that are text type but not
// equation type.
ColPartition *part = nullptr;
GenericVector<ColPartition *> text_parts;
std::vector<ColPartition *> text_parts;
ColPartitionGridSearch gsearch(part_grid_);
gsearch.StartFullSearch();
while ((part = gsearch.NextFullSearch()) != nullptr) {
@ -1257,12 +1263,12 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
}
// Compute the medium height of the text_parts.
text_parts.sort(&SortCPByHeight);
std::sort(text_parts.begin(), text_parts.end(), &SortCPByHeight);
const TBOX &text_box = text_parts[text_parts.size() / 2]->bounding_box();
int med_height = text_box.height();
if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
const TBOX &text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box();
med_height = static_cast<int>(roundf(0.5 * (text_box.height() + med_height)));
med_height = static_cast<int>(round(0.5f * (text_box.height() + med_height)));
}
// Iterate every text_parts and check if it is a math block satellite.
@ -1271,7 +1277,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
if (text_box.height() > med_height) {
continue;
}
GenericVector<ColPartition *> math_blocks;
std::vector<ColPartition *> math_blocks;
if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
continue;
}
@ -1288,7 +1294,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
}
bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
GenericVector<ColPartition *> *math_blocks) {
std::vector<ColPartition *> *math_blocks) {
ASSERT_HOST(part != nullptr && math_blocks != nullptr);
math_blocks->clear();
const TBOX &part_box(part->bounding_box());
@ -1344,7 +1350,7 @@ bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
ColPartition *EquationDetect::SearchNNVertical(const bool search_bottom, const ColPartition *part) {
ASSERT_HOST(part);
ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;
const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
const int kYGapTh = static_cast<int>(round(resolution_ * 0.5f));
ColPartitionGridSearch search(part_grid_);
search.SetUniqueMode(true);
@ -1379,7 +1385,7 @@ bool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *nei
if (!neighbor) {
return false;
}
const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
const int kYGapTh = static_cast<int>(round(resolution_ * 0.1f));
return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
}

View File

@ -22,7 +22,6 @@
#include <tesseract/unichar.h> // for UNICHAR_ID
#include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
#include "equationdetectbase.h" // for EquationDetectBase
#include "genericvector.h" // for GenericVector
#include "tesseractclass.h" // for Tesseract
class TBOX;
@ -86,7 +85,7 @@ protected:
// parts_overlap. Note: this function may update the part_grid_, so if the
// caller is also running ColPartitionGridSearch, use the RepositionIterator
// to continue.
void SearchByOverlap(ColPartition *seed, GenericVector<ColPartition *> *parts_overlap);
void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);
// Insert part back into part_grid_, after it absorbs some other parts.
void InsertPartAfterAbsorb(ColPartition *part);
@ -106,12 +105,12 @@ protected:
// 1. If its left is aligned with any coordinates in indented_texts_left,
// which we assume have been sorted.
// 2. If its foreground density is over foreground_density_th.
bool CheckForSeed2(const GenericVector<int> &indented_texts_left,
bool CheckForSeed2(const std::vector<int> &indented_texts_left,
const float foreground_density_th, ColPartition *part);
// Count the number of values in sorted_vec that is close to val, used to
// check if a partition is aligned with text partitions.
int CountAlignment(const GenericVector<int> &sorted_vec, const int val) const;
int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;
// Check for a seed candidate using the foreground pixel density. And we
// return true if the density is below a certain threshold, because characters
@ -120,14 +119,14 @@ protected:
// A light version of SplitCPHor: instead of really doing the part split, we
// simply compute the union bounding box of each split part.
void SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes);
void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);
// Split the part (horizontally), and save the split result into
// parts_splitted. Note that it is caller's responsibility to release the
// memory owns by parts_splitted. On the other hand, the part is unchanged
// during this process and still owns the blobs, so do NOT call DeleteBoxes
// when freeing the colpartitions in parts_splitted.
void SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted);
void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);
// Check the density for a seed candidate (part) using its math density and
// italic density, returns true if the check passed.
@ -167,9 +166,9 @@ protected:
// merged with seed, remove them from part_grid_, and put them into
// parts_to_merge.
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge);
std::vector<ColPartition *> *parts_to_merge);
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
GenericVector<ColPartition *> *parts_to_merge);
std::vector<ColPartition *> *parts_to_merge);
// Check if a part_box is the small neighbor of seed_box.
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;
@ -190,7 +189,7 @@ protected:
// Check if part is the satellite of one/two math blocks. If it is, we return
// true, and save the blocks into math_blocks.
bool IsMathBlockSatellite(ColPartition *part, GenericVector<ColPartition *> *math_blocks);
bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);
// Search the nearest neighbor of part in one vertical direction as defined in
// search_bottom. It returns the neighbor found that major x overlap with it,
@ -237,7 +236,7 @@ protected:
TBOX *cps_super_bbox_;
// The seed ColPartition for equation region.
GenericVector<ColPartition *> cp_seeds_;
std::vector<ColPartition *> cp_seeds_;
// The resolution (dpi) of the processing image.
int resolution_;

View File

@ -18,7 +18,6 @@
#include "paragraphs.h"
#include "genericvector.h" // for GenericVector, GenericVectorEqEq
#include "helpers.h" // for UpdateRange, ClipToRange
#include "host.h" // for NearlyEqual
#include "mutableiterator.h" // for MutableIterator
@ -72,7 +71,7 @@ static int Epsilon(int space_pix) {
}
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
const GenericVector<RowScratchRegisters> *rows, int row_start,
const std::vector<RowScratchRegisters> *rows, int row_start,
int row_end) {
if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", row_start, row_end,
@ -134,7 +133,7 @@ static std::string RtlEmbed(const std::string &word, bool rtlify) {
// Print the current thoughts of the paragraph detector.
static void PrintDetectorState(const ParagraphTheory &theory,
const GenericVector<RowScratchRegisters> &rows) {
const std::vector<RowScratchRegisters> &rows) {
std::vector<std::vector<std::string>> output;
output.push_back(std::vector<std::string>());
output.back().push_back("#row");
@ -173,7 +172,7 @@ static void PrintDetectorState(const ParagraphTheory &theory,
}
static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
const GenericVector<RowScratchRegisters> &rows) {
const std::vector<RowScratchRegisters> &rows) {
if (!should_print)
return;
tprintf("# %s\n", phase);
@ -181,7 +180,7 @@ static void DebugDump(bool should_print, const char *phase, const ParagraphTheor
}
// Print out the text for rows[row_start, row_end)
static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows, int row_start,
static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
int row_end) {
tprintf("======================================\n");
for (int row = row_start; row < row_end; row++) {
@ -398,6 +397,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
return pos == werd->length();
}
// Appends |data| to |vector| only when an equal element is not already
// present, replicating the behavior of the removed
// GenericVector::push_back_new() for plain std::vector.
template <class T>
void push_back_new(std::vector<T> &vector, const T &data) {
  const bool absent =
      std::none_of(vector.begin(), vector.end(),
                   [&data](const T &element) { return element == data; });
  if (absent) {
    vector.push_back(data);
  }
}
// ========= Brain Dead Language Model (combined entry points) ================
// Given the leftmost word of a line either as a Tesseract unicharset + werd
@ -581,7 +587,7 @@ void RowScratchRegisters::SetStartLine() {
tprintf("Trying to set a line to be START when it's already BODY.\n");
}
if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
}
}
@ -591,42 +597,44 @@ void RowScratchRegisters::SetBodyLine() {
tprintf("Trying to set a line to be BODY when it's already START.\n");
}
if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
}
}
void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
hypotheses_.push_back_new(LineHypothesis(LT_START, model));
int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
if (old_idx >= 0)
hypotheses_.remove(old_idx);
push_back_new(hypotheses_, LineHypothesis(LT_START, model));
auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
if (found != hypotheses_.end()) {
hypotheses_.erase(found);
}
}
void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
if (old_idx >= 0)
hypotheses_.remove(old_idx);
push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
if (found != hypotheses_.end()) {
hypotheses_.erase(found);
}
}
void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
for (int h = 0; h < hypotheses_.size(); h++) {
if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
models->push_back_new(hypotheses_[h].model);
push_back_new(*models, hypotheses_[h].model);
}
}
void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
for (int h = 0; h < hypotheses_.size(); h++) {
if (StrongModel(hypotheses_[h].model))
models->push_back_new(hypotheses_[h].model);
push_back_new(*models, hypotheses_[h].model);
}
}
void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
for (int h = 0; h < hypotheses_.size(); h++) {
if (hypotheses_[h].model != nullptr)
models->push_back_new(hypotheses_[h].model);
push_back_new(*models, hypotheses_[h].model);
}
}
@ -647,8 +655,8 @@ void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models
if (models.empty())
return;
for (int h = hypotheses_.size() - 1; h >= 0; h--) {
if (!models.contains(hypotheses_[h].model)) {
hypotheses_.remove(h);
if (!contains(models, hypotheses_[h].model)) {
hypotheses_.erase(hypotheses_.begin() + h);
}
}
}
@ -672,15 +680,15 @@ public:
int size() const {
return values_.size();
}
void GetClusters(GenericVector<Cluster> *clusters);
void GetClusters(std::vector<Cluster> *clusters);
private:
int max_cluster_width_;
GenericVector<int> values_;
std::vector<int> values_;
};
// Return the index of the cluster closest to value.
static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
int best_index = 0;
for (int i = 0; i < clusters.size(); i++) {
if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center))
@ -689,9 +697,9 @@ static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
return best_index;
}
void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
clusters->clear();
values_.sort();
std::sort(values_.begin(), values_.end());
for (int i = 0; i < values_.size();) {
int orig_i = i;
int lo = values_[i];
@ -705,16 +713,16 @@ void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
// Calculate left- and right-indent tab stop values seen in
// rows[row_start, row_end) given a tolerance of tolerance.
static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
int tolerance, GenericVector<Cluster> *left_tabs,
GenericVector<Cluster> *right_tabs) {
static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
int tolerance, std::vector<Cluster> *left_tabs,
std::vector<Cluster> *right_tabs) {
if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
return;
// First pass: toss all left and right indents into clusterers.
SimpleClusterer initial_lefts(tolerance);
SimpleClusterer initial_rights(tolerance);
GenericVector<Cluster> initial_left_tabs;
GenericVector<Cluster> initial_right_tabs;
std::vector<Cluster> initial_left_tabs;
std::vector<Cluster> initial_right_tabs;
for (int i = row_start; i < row_end; i++) {
initial_lefts.Add((*rows)[i].lindent_);
initial_rights.Add((*rows)[i].rindent_);
@ -782,7 +790,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
}
}
if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
left_tabs->remove(to_prune);
left_tabs->erase(left_tabs->begin() + to_prune);
}
}
if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
@ -793,7 +801,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
}
}
if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
right_tabs->remove(to_prune);
right_tabs->erase(right_tabs->begin() + to_prune);
}
}
}
@ -817,7 +825,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
// Case 2b: Fully Justified. (eop_threshold > 0)
// We mark a line as short (end of paragraph) if the offside indent
// is greater than eop_threshold.
static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
const ParagraphModel *model, bool ltr, int eop_threshold) {
if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
return;
@ -861,7 +869,7 @@ static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_
// Further, this struct holds the data we amass for the (single) ParagraphModel
// we'll assign to the text lines (assuming we get that far).
struct GeometricClassifierState {
GeometricClassifierState(int dbg_level, GenericVector<RowScratchRegisters> *r, int r_start,
GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
int r_end)
: debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
tolerance = InterwordSpace(*r, r_start, r_end);
@ -886,7 +894,7 @@ struct GeometricClassifierState {
}
// Align tabs are the tab stops the text is aligned to.
const GenericVector<Cluster> &AlignTabs() const {
const std::vector<Cluster> &AlignTabs() const {
if (just == tesseract::JUSTIFICATION_RIGHT)
return right_tabs;
return left_tabs;
@ -897,7 +905,7 @@ struct GeometricClassifierState {
// Note that for a left-to-right text which is aligned to the right such as
// this function comment, the offside tabs are the horizontal tab stops
// marking the beginning of ("Note", "this" and "marking").
const GenericVector<Cluster> &OffsideTabs() const {
const std::vector<Cluster> &OffsideTabs() const {
if (just == tesseract::JUSTIFICATION_RIGHT)
return left_tabs;
return right_tabs;
@ -940,7 +948,7 @@ struct GeometricClassifierState {
// The Geometric Classifier was asked to find a single paragraph model
// to fit the text rows (*rows)[row_start, row_end)
GenericVector<RowScratchRegisters> *rows;
std::vector<RowScratchRegisters> *rows;
int row_start = 0;
int row_end = 0;
@ -953,8 +961,8 @@ struct GeometricClassifierState {
// These left and right tab stops were determined to be the common tab
// stops for the given text.
GenericVector<Cluster> left_tabs;
GenericVector<Cluster> right_tabs;
std::vector<Cluster> left_tabs;
std::vector<Cluster> right_tabs;
// These are parameters we must determine to create a ParagraphModel.
tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
@ -1083,7 +1091,7 @@ static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricCla
// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
// it's worth guessing that (A1b) is the correct interpretation if there are
// far more "full" lines than "short" lines.
static void GeometricClassify(int debug_level, GenericVector<RowScratchRegisters> *rows,
static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, ParagraphTheory *theory) {
if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
return;
@ -1223,7 +1231,7 @@ const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
}
auto *m = new ParagraphModel(model);
models_->push_back(m);
models_we_added_.push_back_new(m);
push_back_new(models_we_added_, m);
return m;
}
@ -1231,7 +1239,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
size_t w = 0;
for (size_t r = 0; r < models_->size(); r++) {
ParagraphModel *m = (*models_)[r];
if (!used_models.contains(m) && models_we_added_.contains(m)) {
if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
delete m;
} else {
if (r > w) {
@ -1246,7 +1254,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
// Examine rows[start, end) and try to determine if an existing non-centered
// paragraph model would fit them perfectly. If so, return a pointer to it.
// If not, return nullptr.
const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegisters> *rows,
const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
int start, int end) const {
for (const auto *model : *models_) {
if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model))
@ -1258,7 +1266,7 @@ const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegist
void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
for (const auto *model : *models_) {
if (model->justification() != JUSTIFICATION_CENTER)
models->push_back_new(model);
push_back_new(*models, model);
}
}
@ -1272,7 +1280,7 @@ int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
return -1;
}
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model) {
if (!StrongModel(model)) {
tprintf("ValidFirstLine() should only be called with strong models!\n");
@ -1281,7 +1289,7 @@ bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
(*rows)[row].rindent_, (*rows)[row].rmargin_);
}
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model) {
if (!StrongModel(model)) {
tprintf("ValidBodyLine() should only be called with strong models!\n");
@ -1290,7 +1298,7 @@ bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
(*rows)[row].rindent_, (*rows)[row].rmargin_);
}
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b,
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
const ParagraphModel *model) {
if (model != kCrownRight && model != kCrownLeft) {
tprintf("CrownCompatible() should only be called with crown models!\n");
@ -1308,7 +1316,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int
// =============== Implementation of ParagraphModelSmearer ====================
ParagraphModelSmearer::ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, ParagraphTheory *theory)
: theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
@ -1341,7 +1349,7 @@ void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
// This is basic filtering; we check likely paragraph starty-ness down
// below in Smear() -- you know, whether the first word would have fit
// and such.
still_open.push_back_new(opened[m]);
push_back_new(still_open, opened[m]);
}
}
OpenModels(row + 1) = still_open;
@ -1449,7 +1457,7 @@ void ParagraphModelSmearer::Smear() {
// Find out what ParagraphModels are actually used, and discard any
// that are not.
static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
ParagraphTheory *theory) {
SetOfModels used_models;
for (int i = 0; i < rows.size(); i++) {
@ -1483,7 +1491,7 @@ static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
// sequences of body lines of equivalent type abutted against the beginning
// or a body or start line of a different type into a crown paragraph.
static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
GenericVector<RowScratchRegisters> *rows) {
std::vector<RowScratchRegisters> *rows) {
int start;
for (int end = rows->size(); end > 0; end = start) {
// Search back for a body line of a unique type.
@ -1546,7 +1554,7 @@ static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
// really just ignore it as an outlier. To express this, we allow the
// user to specify the percentile (0..100) of indent values to use as
// the common margin for each row in the run of rows[start, end).
void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start,
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
int end, int percentile) {
if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
return;
@ -1585,7 +1593,7 @@ void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows
}
// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end) {
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
if (row_end < row_start + 1)
return 1;
int word_height =
@ -1666,7 +1674,7 @@ static bool LikelyParagraphStart(const RowScratchRegisters &before,
// If the rows given could be a consistent start to a paragraph, set *consistent
// true.
static ParagraphModel InternalParagraphModelByOutline(
const GenericVector<RowScratchRegisters> *rows, int start, int end, int tolerance,
const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
bool *consistent) {
int ltr_line_count = 0;
for (int i = start; i < end; i++) {
@ -1763,7 +1771,7 @@ static ParagraphModel InternalParagraphModelByOutline(
// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
// output if we're debugging.
static ParagraphModel ParagraphModelByOutline(int debug_level,
const GenericVector<RowScratchRegisters> *rows,
const std::vector<RowScratchRegisters> *rows,
int start, int end, int tolerance) {
bool unused_consistent;
ParagraphModel retval =
@ -1776,7 +1784,7 @@ static ParagraphModel ParagraphModelByOutline(int debug_level,
}
// Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end,
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
const ParagraphModel *model) {
if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
return false;
@ -1800,7 +1808,7 @@ bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int
// We only take the very strongest signals, as we don't want to get
// confused and marking up centered text, poetry, or source code as
// clearly part of a typical paragraph.
static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row_start,
static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
int row_end) {
// Record patently obvious body text.
for (int i = row_start + 1; i < row_end; i++) {
@ -1862,7 +1870,7 @@ static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row
// Look for sequences of a start line followed by some body lines in
// rows[row_start, row_end) and create ParagraphModels for them if
// they seem coherent.
static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegisters> *rows,
static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, bool allow_flush_models,
ParagraphTheory *theory) {
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
@ -1951,7 +1959,7 @@ static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegiste
// clues.
// (3) Form models for any sequence of start + continuation lines.
// (4) Smear the paragraph models to cover surrounding text.
static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegisters> *rows,
static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
int row_start, int row_end, ParagraphTheory *theory) {
if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
return;
@ -1979,7 +1987,7 @@ static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegi
smearer.Smear();
}
static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows, int row_start,
static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
int row_end, ParagraphTheory *theory) {
for (int i = row_start + 1; i < row_end - 1; i++) {
if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
@ -1994,8 +2002,8 @@ static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
// Collect sequences of unique hypotheses in row registers and create proper
// paragraphs for them, referencing the paragraphs in row_owners.
static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
GenericVector<RowScratchRegisters> &rows,
GenericVector<PARA *> *row_owners,
std::vector<RowScratchRegisters> &rows,
std::vector<PARA *> *row_owners,
ParagraphTheory *theory) {
int end = rows.size();
int start;
@ -2090,7 +2098,7 @@ struct Interval {
// (1) If a line is surrounded by lines of unknown type, it's weak.
// (2) If two lines in a row are start lines for a given paragraph type, but
// after that the same paragraph type does not continue, they're weak.
static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
SetOfModels row_models;
rows[row].StrongHypotheses(&row_models);
@ -2145,8 +2153,8 @@ static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int ro
// + Crown paragraphs not immediately followed by a strongly modeled line.
// + Single line paragraphs surrounded by text that doesn't match the
// model.
static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
GenericVector<Interval> *to_fix, int row_start, int row_end) {
static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
std::vector<Interval> *to_fix, int row_start, int row_end) {
to_fix->clear();
for (int i = row_start; i < row_end; i++) {
bool needs_fixing = false;
@ -2195,8 +2203,8 @@ static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs) {
GenericVector<PARA *> &rows = *row_owners;
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
std::vector<PARA *> &rows = *row_owners;
paragraphs->clear();
PARA_IT out(paragraphs);
PARA *formerly_null = nullptr;
@ -2226,16 +2234,16 @@ void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *
// models - the list of paragraph models referenced by the PARA objects.
// caller is responsible for deleting the models.
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<ParagraphModel *> *models) {
GenericVector<RowScratchRegisters> rows;
std::vector<RowScratchRegisters> rows;
ParagraphTheory theory(models);
// Initialize row_owners to be a bunch of nullptr pointers.
row_owners->init_to_size(row_infos->size(), nullptr);
row_owners->resize(row_infos->size());
// Set up row scratch registers for the main algorithm.
rows.init_to_size(row_infos->size(), RowScratchRegisters());
rows.resize(row_infos->size(), RowScratchRegisters());
for (int i = 0; i < row_infos->size(); i++) {
rows[i].Init((*row_infos)[i]);
}
@ -2249,7 +2257,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
GenericVector<Interval> leftovers;
std::vector<Interval> leftovers;
LeftoverSegments(rows, &leftovers, 0, rows.size());
for (int i = 0; i < leftovers.size(); i++) {
// Pass 2a:
@ -2263,7 +2271,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
// If we had any luck in pass 2a, we got part of the page and didn't
// know how to classify a few runs of rows. Take the segments that
// didn't find a model and reprocess them individually.
GenericVector<Interval> leftovers2;
std::vector<Interval> leftovers2;
LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
bool pass2a_was_useful =
leftovers2.size() > 1 ||
@ -2422,7 +2430,7 @@ static void InitializeRowInfo(bool after_recognition, const MutableIterator &it,
}
PAGE_RES_IT page_res_it = *it.PageResIt();
GenericVector<WERD_RES *> werds;
std::vector<WERD_RES *> werds;
WERD_RES *word_res = page_res_it.restart_row();
ROW_RES *this_row = page_res_it.row();
int num_leaders = 0;
@ -2505,12 +2513,12 @@ void DetectParagraphs(int debug_level, bool after_text_recognition,
}
// Run the paragraph detection algorithm.
GenericVector<PARA *> row_owners;
GenericVector<PARA *> the_paragraphs;
std::vector<PARA *> row_owners;
std::vector<PARA *> the_paragraphs;
if (!is_image_block) {
DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
} else {
row_owners.init_to_size(row_infos.size(), nullptr);
row_owners.resize(row_infos.size());
CanonicalizeDetectionResults(&row_owners, block->para_list());
}

View File

@ -31,9 +31,6 @@ class ParagraphModel;
class PARA_LIST;
struct PARA;
template <typename T>
class GenericVector;
// This structure captures all information needed about a text line for the
// purposes of paragraph detection. It is meant to be exceedingly light-weight
// so that we can easily test paragraph detection independent of the rest of
@ -90,7 +87,7 @@ public:
// caller is responsible for deleting the models.
TESS_API
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<ParagraphModel *> *models);
// Given a MutableIterator to the start of a block, run DetectParagraphs on

View File

@ -95,7 +95,7 @@ struct LineHypothesis {
class ParagraphTheory; // Forward Declaration
using SetOfModels = GenericVector<const ParagraphModel *>;
using SetOfModels = std::vector<const ParagraphModel *>;
// Row Scratch Registers are data generated by the paragraph detection
// algorithm based on a RowInfo input.
@ -123,7 +123,7 @@ public:
// Clear all hypotheses about this line.
void SetUnknown() {
hypotheses_.truncate(0);
hypotheses_.clear();
}
// Append all hypotheses of strong models that match this row as a start.
@ -190,7 +190,7 @@ public:
private:
// Hypotheses of either LT_START or LT_BODY
GenericVector<LineHypothesis> hypotheses_;
std::vector<LineHypothesis> hypotheses_;
};
// A collection of convenience functions for wrapping the set of
@ -219,21 +219,21 @@ public:
// If any of the non-centered paragraph models we know about fit
// rows[start, end), return it. Else nullptr.
const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows, int start,
const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
int end) const;
int IndexOf(const ParagraphModel *model) const;
private:
std::vector<ParagraphModel *> *models_;
GenericVector<ParagraphModel *> models_we_added_;
std::vector<ParagraphModel *> models_we_added_;
};
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model);
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model);
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b,
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
const ParagraphModel *model);
// A class for smearing Paragraph Model hypotheses to surrounding rows.
@ -245,7 +245,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int
// "smear" our models over the text.
class ParagraphModelSmearer {
public:
ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
ParagraphTheory *theory);
// Smear forward paragraph models from existing row markings to subsequent
@ -266,7 +266,7 @@ private:
}
ParagraphTheory *theory_;
GenericVector<RowScratchRegisters> *rows_;
std::vector<RowScratchRegisters> *rows_;
int row_start_;
int row_end_;
@ -284,11 +284,11 @@ private:
// Clear all hypotheses about lines [start, end) and reset the margins to the
// percentile (0..100) value of the left and right row edges for this run of
// rows.
void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start,
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
int end, int percentile);
// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end);
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
// Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read).
@ -300,13 +300,13 @@ bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRe
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
// Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end,
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
const ParagraphModel *model);
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs);
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
} // namespace tesseract

View File

@ -45,7 +45,7 @@
#include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
#include <tesseract/unichar.h> // for UNICHAR_ID
#include "genericvector.h" // for GenericVector, PointerVector
#include "genericvector.h" // for PointerVector
#include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe...
@ -398,27 +398,27 @@ public:
// Input: a set of noisy outlines that probably belong to the real_word.
// Output: outlines that overlapped blobs are set to nullptr and put back into
// the word, either in the blobs or in the reject list.
void AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
void AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted,
GenericVector<bool> *overlapped_any_blob,
GenericVector<C_BLOB *> *target_blobs);
std::vector<bool> *word_wanted,
std::vector<bool> *overlapped_any_blob,
std::vector<C_BLOB *> *target_blobs);
// Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them.
void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
void AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted,
GenericVector<C_BLOB *> *target_blobs);
std::vector<bool> *word_wanted,
std::vector<C_BLOB *> *target_blobs);
// Starting with ok_outlines set to indicate which outlines overlap the blob,
// chooses the optimal set (approximately) and returns true if any outlines
// are desired, in which case ok_outlines indicates which ones.
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
C_BLOB *blob, const GenericVector<C_OUTLINE *> &outlines,
C_BLOB *blob, const std::vector<C_OUTLINE *> &outlines,
int num_outlines, std::vector<bool> *ok_outlines);
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
const GenericVector<C_OUTLINE *> &outlines, int pass_n,
const std::vector<C_OUTLINE *> &outlines, int pass_n,
PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str);
// Classifies the given blob (part of word_data->word->word) as an individual
// word, using languages, chopper etc, returning only the certainty of the
@ -703,24 +703,24 @@ public:
void ReSegmentByClassification(PAGE_RES *page_res);
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
// Returns false if an invalid UNICHAR_ID is encountered.
bool ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids);
bool ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids);
// Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails.
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambigiguity
// substitutions ARE used.
bool FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res);
bool FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res);
// Recursive helper to find a match to the target_text (from text_index
// position) in the choices (from choices_pos position).
// Choices is an array of GenericVectors, of length choices_length, with each
// Choices is an array of vectors of length choices_length, with each
// element representing a starting position in the word, and the
// GenericVector holding classification results for a sequence of consecutive
// vector holding classification results for a sequence of consecutive
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
void SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const GenericVector<UNICHAR_ID> &target_text,
int text_index, float rating, GenericVector<int> *segmentation,
float *best_rating, GenericVector<int> *best_segmentation);
void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const std::vector<UNICHAR_ID> &target_text,
int text_index, float rating, std::vector<int> *segmentation,
float *best_rating, std::vector<int> *best_segmentation);
// Counts up the labelled words and the blobs within.
// Deletes all unused or emptied words, counting the unused ones.
// Resets W_BOL and W_EOL flags correctly.

View File

@ -183,7 +183,7 @@ void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
chopped2->blobs.push_back(chopped->blobs[i]);
}
chopped->blobs.truncate(split_pt);
chopped->blobs.resize(split_pt);
word->chopped_word = nullptr;
delete word2->chopped_word;
word2->chopped_word = nullptr;
@ -223,8 +223,8 @@ void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_b
TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
// Tack the word2 outputs onto the end of the word outputs.
word->chopped_word->blobs += word2->chopped_word->blobs;
word->rebuild_word->blobs += word2->rebuild_word->blobs;
word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
word2->chopped_word->blobs.clear();
word2->rebuild_word->blobs.clear();
TPOINT split_pt;
@ -234,17 +234,17 @@ void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_b
// Since the seam list is one element short, an empty seam marking the
// end of the last blob in the first word is needed first.
word->seam_array.push_back(new SEAM(0.0f, split_pt));
word->seam_array += word2->seam_array;
word2->seam_array.truncate(0);
word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
word2->seam_array.clear();
// Fix widths and gaps.
word->blob_widths += word2->blob_widths;
word->blob_gaps += word2->blob_gaps;
word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
// Fix the ratings matrix.
int rat1 = word->ratings->dimension();
int rat2 = word2->ratings->dimension();
word->ratings->AttachOnCorner(word2->ratings);
ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
word->best_state += word2->best_state;
word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
// Append the word choices.
*word->raw_choice += *word2->raw_choice;

View File

@ -826,7 +826,9 @@ void TWERD::CopyFrom(const TWERD &src) {
// Deletes owned data.
void TWERD::Clear() {
blobs.delete_data_pointers();
for (auto blob : blobs) {
delete blob;
}
blobs.clear();
}
@ -869,8 +871,9 @@ void TWERD::MergeBlobs(int start, int end) {
blobs[i] = nullptr;
}
// Remove dead blobs from the vector.
// TODO: optimize.
for (int i = start + 1; i < end && start + 1 < blobs.size(); ++i) {
blobs.remove(start + 1);
blobs.erase(blobs.begin() + start + 1);
}
}

View File

@ -450,8 +450,8 @@ struct TWERD {
void plot(ScrollView *window);
GenericVector<TBLOB *> blobs; // Blobs in word.
bool latin_script; // This word is in a latin-based script.
std::vector<TBLOB *> blobs; // Blobs in word.
bool latin_script; // This word is in a latin-based script.
};
/*----------------------------------------------------------------------

View File

@ -2,7 +2,6 @@
* File: linlsq.h (Formerly llsq.h)
* Description: Linear Least squares fitting code.
* Author: Ray Smith
* Created: Thu Sep 12 08:44:51 BST 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
@ -22,13 +21,11 @@
#include "points.h" // for FCOORD
#include <algorithm> // for std::nth_element
#include <cstdint> // for int32_t
namespace tesseract {
template <typename T>
class GenericVector;
class TESS_API LLSQ {
public:
LLSQ() { // constructor
@ -111,29 +108,30 @@ private:
// An assumption is made that most of the values are spread over no more than
// half the range, but wrap-around is accounted for if the median is near
// the wrap-around point.
// Cannot be a member of GenericVector, as it makes heavy used of LLSQ.
// Cannot be a member of vector, as it makes heavy use of LLSQ.
// T must be an integer or float/double type.
template <typename T>
T MedianOfCircularValues(T modulus, GenericVector<T> *v) {
T MedianOfCircularValues(T modulus, std::vector<T> &v) {
LLSQ stats;
T halfrange = static_cast<T>(modulus / 2);
int num_elements = v->size();
for (int i = 0; i < num_elements; ++i) {
stats.add((*v)[i], (*v)[i] + halfrange);
auto num_elements = v.size();
for (auto i : v) {
stats.add(i, i + halfrange);
}
bool offset_needed = stats.y_variance() < stats.x_variance();
if (offset_needed) {
for (int i = 0; i < num_elements; ++i) {
(*v)[i] += halfrange;
for (auto i : v) {
i += halfrange;
}
}
int median_index = v->choose_nth_item(num_elements / 2);
auto median_index = num_elements / 2;
std::nth_element(v.begin(), v.begin() + median_index, v.end());
if (offset_needed) {
for (int i = 0; i < num_elements; ++i) {
(*v)[i] -= halfrange;
for (auto i : v) {
i -= halfrange;
}
}
return (*v)[median_index];
return v[median_index];
}
} // namespace tesseract

View File

@ -391,8 +391,8 @@ void WERD_RES::SetupBlamerBundle() {
// Computes the blob_widths and blob_gaps from the chopped_word.
void WERD_RES::SetupBlobWidthsAndGaps() {
blob_widths.truncate(0);
blob_gaps.truncate(0);
blob_widths.clear();
blob_gaps.clear();
int num_blobs = chopped_word->NumBlobs();
for (int b = 0; b < num_blobs; ++b) {
TBLOB *blob = chopped_word->blobs[b];
@ -410,7 +410,7 @@ void WERD_RES::SetupBlobWidthsAndGaps() {
void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {
// Insert the seam into the SEAMS array.
seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
seam_array.insert(seam, blob_number);
seam_array.insert(seam_array.begin() + blob_number, seam);
if (ratings != nullptr) {
// Expand the ratings matrix.
ratings = ratings->ConsumeAndMakeBigger(blob_number);
@ -753,13 +753,20 @@ void WERD_RES::ConsumeWordResults(WERD_RES *word) {
MovePointerData(&chopped_word, &word->chopped_word);
MovePointerData(&rebuild_word, &word->rebuild_word);
MovePointerData(&box_word, &word->box_word);
seam_array.delete_data_pointers();
for (auto data : seam_array) {
delete data;
}
seam_array = word->seam_array;
word->seam_array.clear();
best_state.move(&word->best_state);
correct_text.move(&word->correct_text);
blob_widths.move(&word->blob_widths);
blob_gaps.move(&word->blob_gaps);
// TODO: optimize moves.
best_state = word->best_state;
word->best_state.clear();
correct_text = word->correct_text;
word->correct_text.clear();
blob_widths = word->blob_widths;
word->blob_widths.clear();
blob_gaps = word->blob_gaps;
word->blob_gaps.clear();
if (ratings != nullptr)
ratings->delete_matrix_pointers();
MovePointerData(&ratings, &word->ratings);
@ -797,7 +804,7 @@ void WERD_RES::RebuildBestState() {
rebuild_word = new TWERD;
if (seam_array.empty())
start_seam_list(chopped_word, &seam_array);
best_state.truncate(0);
best_state.clear();
int start = 0;
for (int i = 0; i < best_choice->length(); ++i) {
int length = best_choice->state(i);
@ -873,7 +880,7 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE **choices) {
}
FakeWordFromRatings(TOP_CHOICE_PERM);
reject_map.initialise(blob_count);
best_state.init_to_size(blob_count, 1);
best_state.resize(blob_count, 1);
done = true;
}
@ -958,7 +965,7 @@ void WERD_RES::MergeAdjacentBlobs(int index) {
box_word->MergeBoxes(index, index + 2);
if (index + 1 < best_state.size()) {
best_state[index] += best_state[index + 1];
best_state.remove(index + 1);
best_state.erase(best_state.begin() + index + 1);
}
}
@ -1088,7 +1095,9 @@ void WERD_RES::ClearResults() {
box_word = nullptr;
best_state.clear();
correct_text.clear();
seam_array.delete_data_pointers();
for (auto data : seam_array) {
delete data;
}
seam_array.clear();
blob_widths.clear();
blob_gaps.clear();
@ -1204,7 +1213,7 @@ WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *ne
// are likely very poor, if they come from LSTM, where it only outputs the
// character at one pixel within it, so we find the midpoints between them.
static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
C_BLOB_LIST *next_word_blobs, GenericVector<int> *blob_ends) {
C_BLOB_LIST *next_word_blobs, std::vector<int> *blob_ends) {
C_BLOB_IT blob_it(word.word->cblob_list());
for (int i = 0; i < word.best_state.size(); ++i) {
int length = word.best_state[i];
@ -1341,7 +1350,7 @@ void PAGE_RES_IT::ReplaceCurrentWord(tesseract::PointerVector<WERD_RES> *words)
WERD_RES *word_w = (*words)[w];
clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
// Compute blob boundaries.
GenericVector<int> blob_ends;
std::vector<int> blob_ends;
C_BLOB_LIST *next_word_blobs =
w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);

View File

@ -31,7 +31,7 @@
#include "werd.h" // for WERD, W_BOL, W_EOL
#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
#include "genericvector.h" // for GenericVector, PointerVector (ptr only)
#include "genericvector.h" // for PointerVector (ptr only)
#include <sys/types.h> // for int8_t
#include <cstdint> // for int32_t, int16_t
@ -83,19 +83,19 @@ public:
// the next word. This pointer is not owned by PAGE_RES class.
WERD_CHOICE **prev_word_best_choice;
// Sums of blame reasons computed by the blamer.
GenericVector<int> blame_reasons;
std::vector<int> blame_reasons;
// Debug information about all the misadaptions on this page.
// Each BlamerBundle contains an index into this vector, so that words that
// caused misadaption could be marked. However, since words could be
// deleted/split/merged, the log is stored on the PAGE_RES level.
GenericVector<std::string> misadaption_log;
std::vector<std::string> misadaption_log;
inline void Init() {
char_count = 0;
rej_count = 0;
rejected = false;
prev_word_best_choice = nullptr;
blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
blame_reasons.resize(IRR_NUM_REASONS);
}
PAGE_RES() {
@ -207,12 +207,12 @@ public:
// The length of chopped_word matches length of seam_array + 1 (if set).
TWERD *chopped_word = nullptr; // BLN chopped fragments output.
// Vector of SEAM* holding chopping points matching chopped_word.
GenericVector<SEAM *> seam_array;
std::vector<SEAM *> seam_array;
// Widths of blobs in chopped_word.
GenericVector<int> blob_widths;
std::vector<int> blob_widths;
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
// blob i and blob i+1.
GenericVector<int> blob_gaps;
std::vector<int> blob_gaps;
// Stores the lstm choices of every timestep
std::vector<std::vector<std::pair<const char *, float>>> timesteps;
// Stores the lstm choices of every timestep segmented by character
@ -277,11 +277,11 @@ public:
// rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
// adjacent blobs in chopped_word. The seams in seam_array are hidden
// within a rebuild_word blob and revealed between them.
GenericVector<int> best_state; // Number of blobs in each best blob.
std::vector<int> best_state; // Number of blobs in each best blob.
// The correct_text is used during training and adaption to carry the
// text to the training system without the need for a unicharset. There
// is one entry in the vector for each blob in rebuild_word and box_word.
GenericVector<std::string> correct_text;
std::vector<std::string> correct_text;
// Less-well documented members.
// TODO(rays) Add more documentation here.

View File

@ -19,6 +19,7 @@
#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
#include <cstring> // for memset
#include <string>
#include <vector>

View File

@ -51,8 +51,8 @@ bool SEAM::IsHealthy(const TBLOB &blob, int min_points, int min_area) const {
// seam, which is about to be inserted at insert_index. Returns false if
// any of the computations fails, as this indicates an invalid chop.
// widthn_/widthp_ are only changed if modify is true.
bool SEAM::PrepareToInsertSeam(const GenericVector<SEAM *> &seams,
const GenericVector<TBLOB *> &blobs, int insert_index, bool modify) {
bool SEAM::PrepareToInsertSeam(const std::vector<SEAM *> &seams,
const std::vector<TBLOB *> &blobs, int insert_index, bool modify) {
for (int s = 0; s < insert_index; ++s) {
if (!seams[s]->FindBlobWidth(blobs, s, modify))
return false;
@ -68,7 +68,7 @@ bool SEAM::PrepareToInsertSeam(const GenericVector<SEAM *> &seams,
// Computes the widthp_/widthn_ range. Returns false if not all the splits
// are accounted for. widthn_/widthp_ are only changed if modify is true.
bool SEAM::FindBlobWidth(const GenericVector<TBLOB *> &blobs, int index, bool modify) {
bool SEAM::FindBlobWidth(const std::vector<TBLOB *> &blobs, int index, bool modify) {
int num_found = 0;
if (modify) {
widthp_ = 0;
@ -147,7 +147,7 @@ void SEAM::Print(const char *label) const {
// Prints a collection of SEAMs.
/* static */
void SEAM::PrintSeams(const char *label, const GenericVector<SEAM *> &seams) {
void SEAM::PrintSeams(const char *label, const std::vector<SEAM *> &seams) {
if (!seams.empty()) {
tprintf("%s\n", label);
for (int x = 0; x < seams.size(); ++x) {
@ -169,7 +169,7 @@ void SEAM::Mark(ScrollView *window) const {
// Break up the blobs in this chain so that they are all independent.
// This operation should undo the affect of join_pieces.
/* static */
void SEAM::BreakPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
void SEAM::BreakPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last) {
for (int x = first; x < last; ++x)
seams[x]->Reveal();
@ -191,7 +191,7 @@ void SEAM::BreakPieces(const GenericVector<SEAM *> &seams, const GenericVector<T
// Join a group of base level pieces into a single blob that can then
// be classified.
/* static */
void SEAM::JoinPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
void SEAM::JoinPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last) {
TESSLINE *outline = blobs[first]->outlines;
if (!outline)
@ -245,8 +245,8 @@ float SEAM::FullPriority(int xmin, int xmax, double overlap_knob, int centered_m
* present in the starting segmentation. Each of the seams created
* by this routine have location information only.
*/
void start_seam_list(TWERD *word, GenericVector<SEAM *> *seam_array) {
seam_array->truncate(0);
void start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array) {
seam_array->clear();
TPOINT location;
for (int b = 1; b < word->NumBlobs(); ++b) {

View File

@ -133,11 +133,11 @@ public:
// seam, which is about to be inserted at insert_index. Returns false if
// any of the computations fails, as this indicates an invalid chop.
// widthn_/widthp_ are only changed if modify is true.
bool PrepareToInsertSeam(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
bool PrepareToInsertSeam(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int insert_index, bool modify);
// Computes the widthp_/widthn_ range. Returns false if not all the splits
// are accounted for. widthn_/widthp_ are only changed if modify is true.
bool FindBlobWidth(const GenericVector<TBLOB *> &blobs, int index, bool modify);
bool FindBlobWidth(const std::vector<TBLOB *> &blobs, int index, bool modify);
// Splits this blob into two blobs by applying the splits included in
// *this SEAM
@ -149,7 +149,7 @@ public:
// Prints everything in *this SEAM.
void Print(const char *label) const;
// Prints a collection of SEAMs.
static void PrintSeams(const char *label, const GenericVector<SEAM *> &seams);
static void PrintSeams(const char *label, const std::vector<SEAM *> &seams);
#ifndef GRAPHICS_DISABLED
// Draws the seam in the given window.
void Mark(ScrollView *window) const;
@ -157,11 +157,11 @@ public:
// Break up the blobs in this chain so that they are all independent.
// This operation should undo the affect of join_pieces.
static void BreakPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
static void BreakPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last);
// Join a group of base level pieces into a single blob that can then
// be classified.
static void JoinPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
static void JoinPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last);
// Hides the seam so the outlines appear not to be cut by it.
@ -193,7 +193,7 @@ private:
SPLIT splits_[kMaxNumSplits];
};
void start_seam_list(TWERD *word, GenericVector<SEAM *> *seam_array);
void start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array);
} // namespace tesseract

View File

@ -462,13 +462,13 @@ static bool GatherPeak(int index, const int *src_buckets, int *used_buckets, int
// to sort on the output will re-sort by increasing mean of peak if that is
// more useful than decreasing total count.
// Returns the actual number of modes found.
int STATS::top_n_modes(int max_modes, GenericVector<KDPairInc<float, int>> *modes) const {
int STATS::top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const {
if (max_modes <= 0)
return 0;
int src_count = rangemax_ - rangemin_;
// Used copies the counts in buckets_ as they get used.
STATS used(rangemin_, rangemax_);
modes->truncate(0);
modes.clear();
// Total count of the smallest peak found so far.
int least_count = 1;
// Mode that is used as a seed for each peak
@ -502,21 +502,21 @@ int STATS::top_n_modes(int max_modes, GenericVector<KDPairInc<float, int>> *mode
&total_value))
break;
}
if (total_count > least_count || modes->size() < max_modes) {
if (total_count > least_count || modes.size() < max_modes) {
// We definitely want this mode, so if we have enough discard the least.
if (modes->size() == max_modes)
modes->truncate(max_modes - 1);
if (modes.size() == max_modes)
modes.resize(max_modes - 1);
int target_index = 0;
// Linear search for the target insertion point.
while (target_index < modes->size() && (*modes)[target_index].data() >= total_count)
while (target_index < modes.size() && modes[target_index].data() >= total_count)
++target_index;
auto peak_mean = static_cast<float>(total_value / total_count + rangemin_);
modes->insert(KDPairInc<float, int>(peak_mean, total_count), target_index);
least_count = modes->back().data();
modes.insert(modes.begin() + target_index, KDPairInc<float, int>(peak_mean, total_count));
least_count = modes.back().data();
}
}
} while (max_count > 0);
return modes->size();
return modes.size();
}
/**********************************************************************

View File

@ -113,7 +113,7 @@ public:
// sort on the output will re-sort by increasing mean of peak if that is more
// useful than decreasing total count. Returns the actual number of modes
// found.
int top_n_modes(int max_modes, GenericVector<KDPairInc<float, int>> *modes) const;
int top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const;
// Prints a summary and table of the histogram.
void print() const;

View File

@ -502,7 +502,7 @@ void WERD::CleanNoise(float size_threshold) {
// Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers.
void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) {
void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) {
C_BLOB_IT rej_it(&rej_cblobs);
for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
C_BLOB *blob = rej_it.extract();
@ -516,13 +516,13 @@ void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) {
// back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// vector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool WERD::AddSelectedOutlines(const GenericVector<bool> &wanted,
const GenericVector<C_BLOB *> &target_blobs,
const GenericVector<C_OUTLINE *> &outlines,
bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,
const std::vector<C_BLOB *> &target_blobs,
const std::vector<C_OUTLINE *> &outlines,
bool *make_next_word_fuzzy) {
bool outline_added_to_start = false;
if (make_next_word_fuzzy != nullptr)

View File

@ -21,7 +21,6 @@
#include "bits16.h"
#include "elst2.h"
#include "genericvector.h" // GenericVector
#include "params.h"
#include "stepblob.h"
@ -173,18 +172,18 @@ public:
// Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers.
void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
// Adds the selected outlines to the indcated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// vector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool AddSelectedOutlines(const GenericVector<bool> &wanted,
const GenericVector<C_BLOB *> &target_blobs,
const GenericVector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
bool AddSelectedOutlines(const std::vector<bool> &wanted,
const std::vector<C_BLOB *> &target_blobs,
const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
private:
uint8_t blanks = 0; // no of blanks

View File

@ -4,7 +4,6 @@
// File: genericheap.h
// Description: Template heap class.
// Author: Ray Smith, based on Dan Johnson's original code.
// Created: Wed Mar 14 08:13:00 PDT 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
@ -38,7 +37,7 @@ namespace tesseract {
// GenericHeap doesn't look inside it except for operator<.
//
// The heap is stored as a packed binary tree in an array hosted by a
// GenericVector<Pair>, with the invariant that the children of each node are
// vector<Pair>, with the invariant that the children of each node are
// both NOT Pair::operator< the parent node. KDPairInc defines Pair::operator<
// to use Key::operator< to generate a MIN heap and KDPairDec defines
// Pair::operator< to use Key::operator> to generate a MAX heap by reversing
@ -59,7 +58,7 @@ template <typename Pair>
class GenericHeap {
public:
GenericHeap() = default;
// The initial size is only a GenericVector::reserve. It is not enforced as
// The initial size is only a vector::reserve. It is not enforced as
// the size limit of the heap. Caller must implement their own enforcement.
explicit GenericHeap(int initial_size) {
heap_.reserve(initial_size);
@ -77,12 +76,12 @@ public:
}
void clear() {
// Clear truncates to 0 to keep the number reserved in tact.
heap_.truncate(0);
heap_.clear();
}
// Provides access to the underlying vector.
// Caution! any changes that modify the keys will invalidate the heap!
GenericVector<Pair> *heap() {
return &heap_;
std::vector<Pair> &heap() {
return heap_;
}
// Provides read-only access to an element of the underlying vector.
const Pair &get(int index) const {
@ -128,11 +127,11 @@ public:
// Sift the hole at the start of the heap_ downwards to match the last
// element.
Pair hole_pair = heap_[new_size];
heap_.truncate(new_size);
heap_.resize(new_size);
int hole_index = SiftDown(0, hole_pair);
heap_[hole_index] = hole_pair;
} else {
heap_.truncate(new_size);
heap_.resize(new_size);
}
return true;
}
@ -154,7 +153,7 @@ public:
int hole_index = SiftUp(worst_index, hole_pair);
heap_[hole_index] = hole_pair;
}
heap_.truncate(heap_size);
heap_.resize(heap_size);
return true;
}
@ -179,7 +178,7 @@ public:
// The pointed-to Pair has changed its key value, so the location of pair
// is reshuffled to maintain the heap invariant.
// Must be a valid pointer to an element of the heap_!
// Caution! Since GenericHeap is based on GenericVector, reallocs may occur
// Caution! Since GenericHeap is based on vector, reallocs may occur
// whenever the vector is extended and elements may get shuffled by any
// Push or Pop operation. Therefore use this function only if Data in Pair is
// of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as
@ -235,7 +234,7 @@ private:
}
private:
GenericVector<Pair> heap_;
std::vector<Pair> heap_;
};
} // namespace tesseract

View File

@ -225,16 +225,6 @@ public:
qsort(data_, size_used_, sizeof(*data_), comparator);
}
// Searches the array (assuming sorted in ascending order, using sort()) for
// an element equal to target and returns true if it is present.
// Use binary_search to get the index of target, or its nearest candidate.
bool bool_binary_search(const T &target) const {
int index = binary_search(target);
if (index >= size_used_) {
return false;
}
return data_[index] == target;
}
// Searches the array (assuming sorted in ascending order, using sort()) for
// an element equal to target and returns the index of the best candidate.
// The return value is conceptually the largest index i such that
@ -255,25 +245,6 @@ public:
return bottom;
}
// Compact the vector by deleting elements using operator!= on basic types.
// The vector must be sorted.
void compact_sorted() {
if (size_used_ == 0) {
return;
}
// First element is in no matter what, hence the i = 1.
int last_write = 0;
for (int i = 1; i < size_used_; ++i) {
// Finds next unique item and writes it.
if (data_[last_write] != data_[i]) {
data_[++last_write] = data_[i];
}
}
// last_write is the index of a valid data cell, so add 1.
size_used_ = last_write + 1;
}
// Returns the index of what would be the target_index_th item in the array
// if the members were sorted, without actually sorting. Members are
// shuffled around, but it takes O(n) time.

View File

@ -24,6 +24,7 @@
#include <cmath> // std::isfinite
#include <cstdio>
#include <cstring>
#include <algorithm> // for std::find
#include <functional>
#include <random>
#include <string>
@ -31,6 +32,11 @@
namespace tesseract {
template <class T>
inline bool contains(const std::vector<T> &data, const T &value) {
return std::find(data.begin(), data.end(), value) != data.end();
}
inline const std::vector<std::string> split(const std::string &s, char c) {
std::string buff;
std::vector<std::string> v;

View File

@ -245,8 +245,7 @@ void UnicharCompress::DefragmentCodeValues(int encoded_null) {
// all codes are used. Likewise with the Han encoding, it is possible that not
// all numbers of strokes are used.
ComputeCodeRange();
GenericVector<int> offsets;
offsets.init_to_size(code_range_, 0);
std::vector<int> offsets(code_range_);
// Find which codes are used
for (int c = 0; c < encoder_.size(); ++c) {
const RecodedCharID &code = encoder_[c];
@ -390,26 +389,26 @@ void UnicharCompress::SetupDecoder() {
prefix.Truncate(len);
auto final_it = final_codes_.find(prefix);
if (final_it == final_codes_.end()) {
auto *code_list = new GenericVector<int>;
auto *code_list = new std::vector<int>;
code_list->push_back(code(len));
final_codes_[prefix] = code_list;
while (--len >= 0) {
prefix.Truncate(len);
auto next_it = next_codes_.find(prefix);
if (next_it == next_codes_.end()) {
auto *code_list = new GenericVector<int>;
auto *code_list = new std::vector<int>;
code_list->push_back(code(len));
next_codes_[prefix] = code_list;
} else {
// We still have to search the list as we may get here via multiple
// lengths of code.
if (!next_it->second->contains(code(len)))
if (!contains(*next_it->second, code(len)))
next_it->second->push_back(code(len));
break; // This prefix has been processed.
}
}
} else {
if (!final_it->second->contains(code(len)))
if (!contains(*final_it->second, code(len)))
final_it->second->push_back(code(len));
}
}

View File

@ -22,7 +22,7 @@
#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
#include <unordered_map>
#include "genericvector.h" // GenericVector
#include <vector>
#include "serialis.h"
#include "unicharset.h"
@ -178,13 +178,13 @@ public:
}
// Returns a list of valid non-final next codes for a given prefix code,
// which may be empty.
const GenericVector<int> *GetNextCodes(const RecodedCharID &code) const {
const std::vector<int> *GetNextCodes(const RecodedCharID &code) const {
auto it = next_codes_.find(code);
return it == next_codes_.end() ? nullptr : it->second;
}
// Returns a list of valid final codes for a given prefix code, which may
// be empty.
const GenericVector<int> *GetFinalCodes(const RecodedCharID &code) const {
const std::vector<int> *GetFinalCodes(const RecodedCharID &code) const {
auto it = final_codes_.find(code);
return it == final_codes_.end() ? nullptr : it->second;
}
@ -225,14 +225,14 @@ private:
// Decoder converts the output of encoder back to a unichar-id.
std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash> decoder_;
// True if the index is a valid single or start code.
GenericVector<bool> is_valid_start_;
std::vector<bool> is_valid_start_;
// Maps a prefix code to a list of valid next codes.
// The map owns the vectors.
std::unordered_map<RecodedCharID, GenericVector<int> *, RecodedCharID::RecodedCharIDHash>
std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
next_codes_;
// Maps a prefix code to a list of valid final codes.
// The map owns the vectors.
std::unordered_map<RecodedCharID, GenericVector<int> *, RecodedCharID::RecodedCharIDHash>
std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
final_codes_;
// Max of any value in encoder_ + 1.
int code_range_;

View File

@ -57,9 +57,9 @@ struct NodeChild {
NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
};
using NodeChildVector = GenericVector<NodeChild>;
using SuccessorList = GenericVector<int>;
using SuccessorListsVector = GenericVector<SuccessorList *>;
using NodeChildVector = std::vector<NodeChild>;
using SuccessorList = std::vector<int>;
using SuccessorListsVector = std::vector<SuccessorList *>;
enum DawgType {
DAWG_TYPE_PUNCTUATION,
@ -176,7 +176,7 @@ public:
/// Fills vec with unichar ids that represent the character classes
/// of the given unichar_id.
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
(void)unichar_id;
(void)unicharset;
(void)vec;
@ -355,15 +355,16 @@ struct DawgPosition {
bool back_to_punc = false;
};
class DawgPositionVector : public GenericVector<DawgPosition> {
class DawgPositionVector : public std::vector<DawgPosition> {
public:
/// Adds an entry for the given dawg_index with the given node to the vec.
/// Returns false if the same entry already exists in the vector,
/// true otherwise.
inline bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg) {
for (int i = 0; i < size(); ++i) {
if (data_[i] == new_pos)
for (auto position : *this) {
if (position == new_pos) {
return false;
}
}
push_back(new_pos);
if (debug) {

View File

@ -201,19 +201,19 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}
if (load_bigram_dawg) {
bigram_dawg_ =
@ -225,13 +225,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
freq_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
if (freq_dawg_)
dawgs_ += freq_dawg_;
dawgs_.push_back(freq_dawg_);
}
if (load_unambig_dawg) {
unambig_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
if (unambig_dawg_)
dawgs_ += unambig_dawg_;
dawgs_.push_back(unambig_dawg_);
}
std::string name;
@ -249,7 +249,7 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
@ -267,13 +267,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
document_words_ =
new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
dawgs_ += document_words_;
dawgs_.push_back(document_words_);
// This dawg is temporary and should not be searched by letter_is_ok.
pending_words_ =
@ -287,19 +287,19 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}
// stolen from Dict::Load (but needs params_ from Tesseract
@ -319,7 +319,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
@ -337,7 +337,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
}
@ -358,9 +358,9 @@ bool Dict::FinishLoad() {
const Dawg *other = dawgs_[j];
if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
kDawgSuccessors[dawg->type()][other->type()])
*lst += j;
lst->push_back(j);
}
successors_ += lst;
successors_.push_back(lst);
}
return true;
}
@ -378,7 +378,9 @@ void Dict::End() {
delete dawg_cache_;
dawg_cache_ = nullptr;
}
successors_.delete_data_pointers();
for (auto successor : successors_) {
delete successor;
}
dawgs_.clear();
successors_.clear();
document_words_ = nullptr;
@ -550,7 +552,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
// Try to find the edge corresponding to the exact unichar_id and to all the
// edges corresponding to the character class of unichar_id.
GenericVector<UNICHAR_ID> unichar_id_patterns;
std::vector<UNICHAR_ID> unichar_id_patterns;
unichar_id_patterns.push_back(unichar_id);
dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
for (int i = 0; i < unichar_id_patterns.size(); ++i) {
@ -605,12 +607,12 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern
int dawg_ty = dawgs_[i]->type();
bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
*dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
if (dawg_debug_level >= 3) {
tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
}
} else if (!punc_dawg_available || !subsumed_by_punc) {
*dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
if (dawg_debug_level >= 3) {
tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
}

View File

@ -54,7 +54,7 @@ struct CHAR_FRAGMENT_INFO {
float certainty;
};
using DawgVector = GenericVector<Dawg *>;
using DawgVector = std::vector<Dawg *>;
//
// Constants
@ -495,7 +495,7 @@ private:
// matching. The first member of each list is taken as canonical. For
// example, the first list contains hyphens and dashes with the first symbol
// being the ASCII hyphen minus.
std::vector<GenericVector<UNICHAR_ID>> equivalent_symbols_;
std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
// Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
DawgCache *dawg_cache_;
bool dawg_cache_is_ours_; // we should delete our own dawg_cache_

View File

@ -2,7 +2,6 @@
** Filename: stopper.h
** Purpose: Stopping criteria for word classifier.
** Author: Dan Johnson
** History: Wed May 1 09:42:57 1991, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +21,6 @@
#include "ratngs.h"
#include <tesseract/unichar.h>
#include "genericvector.h"
namespace tesseract {
@ -46,7 +44,7 @@ struct DANGERR_INFO {
UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
};
using DANGERR = GenericVector<DANGERR_INFO>;
using DANGERR = std::vector<DANGERR_INFO>;
} // namespace tesseract

View File

@ -24,7 +24,6 @@
#include "dawg.h"
#include "dict.h"
#include "genericvector.h"
#include "helpers.h"
#include "kdpair.h"
@ -49,7 +48,9 @@ const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
// Reset the Trie to empty.
void Trie::clear() {
nodes_.delete_data_pointers();
for (auto node : nodes_) {
delete node;
}
nodes_.clear();
root_back_freelist_.clear();
num_edges_ = 0;
@ -122,10 +123,11 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in
EDGE_RECORD edge_rec;
link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
EDGE_INDEX edge_index = root_back_freelist_.pop_back();
EDGE_INDEX edge_index = root_back_freelist_.back();
root_back_freelist_.pop_back();
(*vec)[edge_index] = edge_rec;
} else if (search_index < vec->size()) {
vec->insert(edge_rec, search_index);
vec->insert(vec->begin() + search_index, edge_rec);
} else {
vec->push_back(edge_rec);
}
@ -153,7 +155,7 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool m
*edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions) {
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
if (word.length() <= 0)
return false; // can't add empty words
if (repetitions != nullptr)
@ -330,7 +332,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
}
void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
bool is_alpha = unicharset.get_isalpha(unichar_id);
if (is_alpha) {
vec->push_back(alpha_pattern_);
@ -388,7 +390,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset)
// Parse the pattern and construct a unichar id vector.
// Record the number of repetitions of each unichar in the parallel vector.
WERD_CHOICE word(&unicharset);
GenericVector<bool> repetitions_vec;
std::vector<bool> repetitions_vec;
const char *str_ptr = string;
int step = unicharset.step(str_ptr);
bool failed = false;
@ -462,12 +464,12 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
tprintf("\n");
}
if (direction == FORWARD_EDGE) {
nodes_[node1]->forward_edges.remove(edge_index);
nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
} else if (node1 == 0) {
KillEdge(&nodes_[node1]->backward_edges[edge_index]);
root_back_freelist_.push_back(edge_index);
} else {
nodes_[node1]->backward_edges.remove(edge_index);
nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
}
--num_edges_;
}
@ -476,7 +478,7 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
// 1 Avoid insertion sorting or bubble sorting the tail root node
// (back links on node 0, a list of all the leaves.). The node is
// huge, and sorting it with n^2 time is terrible.
// 2 Avoid using GenericVector::remove on the tail root node.
// 2 Avoid using vector::erase on the tail root node.
// (a) During add of words to the trie, zero-out the unichars and
// keep a freelist of spaces to re-use.
// (b) During reduction, just zero-out the unichars of deleted back
@ -624,13 +626,13 @@ void Trie::sort_edges(EDGE_VECTOR *edges) {
int num_edges = edges->size();
if (num_edges <= 1)
return;
GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
sort_vec.reserve(num_edges);
for (int i = 0; i < num_edges; ++i) {
sort_vec.push_back(
KDPairInc<UNICHAR_ID, EDGE_RECORD>(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
}
sort_vec.sort();
std::sort(sort_vec.begin(), sort_vec.end());
for (int i = 0; i < num_edges; ++i)
(*edges)[i] = sort_vec[i].data();
}

View File

@ -21,14 +21,12 @@
#include "dawg.h"
#include "genericvector.h"
namespace tesseract {
class UNICHARSET;
// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
// max int32, we will need to change GenericVector to use int64 for size
// max int32, we will need to change vector to use int64 for size
// and address indices. This does not seem to be needed immediately,
// since currently the largest number of edges limit used by tesseract
// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
@ -39,13 +37,13 @@ class UNICHARSET;
// the 64 bit EDGE_RECORD.
using EDGE_INDEX = int64_t; // index of an edge in a given node
using NODE_MARKER = bool *;
using EDGE_VECTOR = GenericVector<EDGE_RECORD>;
using EDGE_VECTOR = std::vector<EDGE_RECORD>;
struct TRIE_NODE_RECORD {
EDGE_VECTOR forward_edges;
EDGE_VECTOR backward_edges;
};
using TRIE_NODES = GenericVector<TRIE_NODE_RECORD *>;
using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;
/**
* Concrete class for Trie data structure that allows to store a list of
@ -88,7 +86,9 @@ public:
initialized_patterns_ = false;
}
~Trie() override {
nodes_.delete_data_pointers();
for (auto node : nodes_) {
delete node;
}
}
// Reset the Trie to empty.
@ -230,7 +230,7 @@ public:
// Fills in the given unichar id vector with the unichar ids that represent
// the patterns of the character classes of the given unichar_id.
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const override;
std::vector<UNICHAR_ID> *vec) const override;
// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
// a self loop and the given unichar_id matches the unichar_id stored in the
@ -256,7 +256,7 @@ public:
//
// Return true if add succeeded, false otherwise (e.g. when a word contained
// an invalid unichar id or the trie was getting too large and was cleared).
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions);
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
bool add_word_to_dawg(const WERD_CHOICE &word) {
return add_word_to_dawg(word, nullptr);
}
@ -395,7 +395,7 @@ protected:
// Member variables
TRIE_NODES nodes_; // vector of nodes in the Trie
// Freelist of edges in the root backwards node that were previously zeroed.
GenericVector<EDGE_INDEX> root_back_freelist_;
std::vector<EDGE_INDEX> root_back_freelist_;
uint64_t num_edges_; // sum of all edges (forward and backward)
uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction
uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index

View File

@ -129,10 +129,8 @@ void FullyConnected::Forward(bool debug, const NetworkIO &input,
else
output->Resize(input, no_);
SetupForward(input, input_transpose);
GenericVector<NetworkScratch::FloatVec> temp_lines;
temp_lines.init_to_size(kNumThreads, NetworkScratch::FloatVec());
GenericVector<NetworkScratch::FloatVec> curr_input;
curr_input.init_to_size(kNumThreads, NetworkScratch::FloatVec());
std::vector<NetworkScratch::FloatVec> temp_lines(kNumThreads);
std::vector<NetworkScratch::FloatVec> curr_input(kNumThreads);
int ro = no_;
if (IntSimdMatrix::intSimdMatrix)
ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
@ -233,13 +231,12 @@ bool FullyConnected::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkSc
DisplayBackward(fwd_deltas);
#endif
back_deltas->Resize(fwd_deltas, ni_);
GenericVector<NetworkScratch::FloatVec> errors;
errors.init_to_size(kNumThreads, NetworkScratch::FloatVec());
std::vector<NetworkScratch::FloatVec> errors(kNumThreads);
for (int i = 0; i < kNumThreads; ++i)
errors[i].Init(no_, scratch);
GenericVector<NetworkScratch::FloatVec> temp_backprops;
std::vector<NetworkScratch::FloatVec> temp_backprops;
if (needs_to_backprop_) {
temp_backprops.init_to_size(kNumThreads, NetworkScratch::FloatVec());
temp_backprops.resize(kNumThreads);
for (int i = 0; i < kNumThreads; ++i)
temp_backprops[i].Init(ni_, scratch);
}

View File

@ -297,10 +297,10 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
// for the other dimension, used only when working in true 2D mode. The width
// is enough to hold an entire strip of the major direction.
int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1;
GenericVector<NetworkScratch::FloatVec> states, outputs;
std::vector<NetworkScratch::FloatVec> states, outputs;
if (Is2D()) {
states.init_to_size(buf_width, NetworkScratch::FloatVec());
outputs.init_to_size(buf_width, NetworkScratch::FloatVec());
states.resize(buf_width);
outputs.resize(buf_width);
for (int i = 0; i < buf_width; ++i) {
states[i].Init(ns_, scratch);
ZeroVector<double>(ns_, states[i]);
@ -494,10 +494,10 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
// Rotating buffers of width buf_width allow storage of the recurrent time-
// steps used only for true 2-D. Stores one full strip of the major direction.
int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1;
GenericVector<NetworkScratch::FloatVec> stateerr, sourceerr;
std::vector<NetworkScratch::FloatVec> stateerr, sourceerr;
if (Is2D()) {
stateerr.init_to_size(buf_width, NetworkScratch::FloatVec());
sourceerr.init_to_size(buf_width, NetworkScratch::FloatVec());
stateerr.resize(buf_width);
sourceerr.resize(buf_width);
for (int t = 0; t < buf_width; ++t) {
stateerr[t].Init(ns_, scratch);
sourceerr[t].Init(na_, scratch);

View File

@ -2,7 +2,6 @@
// File: networkio.cpp
// Description: Network input/output data, allowing float/int implementations.
// Author: Ray Smith
// Created: Thu Jun 19 13:01:31 PST 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
@ -507,7 +506,7 @@ int NetworkIO::BestLabel(int t, int not_this, int not_that, float *score) const
// Returns the best start position out of [start, end) (into which all labels
// must fit) to obtain the highest cumulative score for the given labels.
int NetworkIO::PositionOfBestMatch(const GenericVector<int> &labels, int start, int end) const {
int NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const {
int length = labels.size();
int last_start = end - length;
int best_start = -1;
@ -524,7 +523,7 @@ int NetworkIO::PositionOfBestMatch(const GenericVector<int> &labels, int start,
// Returns the cumulative score of the given labels starting at start, and
// using one label per time-step.
double NetworkIO::ScoreOfLabels(const GenericVector<int> &labels, int start) const {
double NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const {
int length = labels.size();
double score = 0.0;
for (int i = 0; i < length; ++i) {

View File

@ -23,7 +23,6 @@
#include <cstdio>
#include <vector>
#include "genericvector.h"
#include "helpers.h"
#include "static_shape.h"
#include "stridemap.h"
@ -169,10 +168,10 @@ public:
int BestLabel(int t, int not_this, int not_that, float *score) const;
// Returns the best start position out of range (into which both start and end
// must fit) to obtain the highest cumulative score for the given labels.
int PositionOfBestMatch(const GenericVector<int> &labels, int start, int end) const;
int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const;
// Returns the cumulative score of the given labels starting at start, and
// using one label per time-step.
double ScoreOfLabels(const GenericVector<int> &labels, int start) const;
double ScoreOfLabels(const std::vector<int> &labels, int start) const;
// Helper function sets all the outputs for a single timestep, such that
// label has value ok_score, and the other labels share 1 - ok_score.
// Assumes float mode.

View File

@ -20,14 +20,13 @@
#define TESSERACT_LSTM_NETWORKSCRATCH_H_
#include <mutex>
#include "genericvector.h"
#include "matrix.h"
#include "networkio.h"
namespace tesseract {
// Generic scratch space for network layers. Provides NetworkIO that can store
// a complete set (over time) of intermediates, and GenericVector<float>
// a complete set (over time) of intermediates, and vector<float>
// scratch space that auto-frees after use. The aim here is to provide a set
// of temporary buffers to network layers that can be reused between layers
// and don't have to be reallocated on each call.
@ -125,7 +124,7 @@ public:
}; // class IO.
// Class that acts like a fixed array of float, yet actually uses space
// from a GenericVector<float> in the source NetworkScratch, and knows how
// from a vector<float> in the source NetworkScratch, and knows how
// to unstack the borrowed vector on destruction.
class FloatVec {
public:
@ -145,12 +144,8 @@ public:
scratch_space_->vec_stack_.Return(vec_);
scratch_space_ = scratch;
vec_ = scratch_space_->vec_stack_.Borrow();
// Abuse vec_ here; first resize to 'reserve', which is larger
// than 'size' (i.e. it's size rounded up) then resize down again
// to the desired size. This assumes that the implementation does
// not shrink the storage on a resize.
vec_->resize_no_init(reserve);
vec_->resize_no_init(size);
vec_->reserve(reserve);
vec_->resize(size);
data_ = &(*vec_)[0];
}
@ -169,7 +164,7 @@ public:
private:
// Vector borrowed from the scratch space. Use Return to free it.
GenericVector<double> *vec_;
std::vector<double> *vec_;
// Short-cut pointer to the underlying array.
double *data_;
// The source scratch_space_. Borrowed pointer, used to free the
@ -251,7 +246,7 @@ public:
private:
PointerVector<T> stack_;
GenericVector<bool> flags_;
std::vector<bool> flags_;
int stack_top_;
std::mutex mutex_;
}; // class Stack.
@ -259,11 +254,11 @@ public:
private:
// If true, the network weights are int8_t, if false, float.
bool int_mode_;
// Stacks of NetworkIO and GenericVector<float>. Once allocated, they are not
// Stacks of NetworkIO and vector<float>. Once allocated, they are not
// deleted until the NetworkScratch is deleted.
Stack<NetworkIO> int_stack_;
Stack<NetworkIO> float_stack_;
Stack<GenericVector<double>> vec_stack_;
Stack<std::vector<double>> vec_stack_;
Stack<TransposedArray> array_stack_;
};

View File

@ -61,8 +61,7 @@ void Parallel::Forward(bool debug, const NetworkIO &input, const TransposedArray
int stack_size = stack_.size();
if (type_ == NT_PAR_2D_LSTM) {
// Special case, run parallel in parallel.
GenericVector<NetworkScratch::IO> results;
results.init_to_size(stack_size, NetworkScratch::IO());
std::vector<NetworkScratch::IO> results(stack_size);
for (int i = 0; i < stack_size; ++i) {
results[i].Resize(input, stack_[i]->NumOutputs(), scratch);
}
@ -124,9 +123,8 @@ bool Parallel::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch
int stack_size = stack_.size();
if (type_ == NT_PAR_2D_LSTM) {
// Special case, run parallel in parallel.
GenericVector<NetworkScratch::IO> in_deltas, out_deltas;
in_deltas.init_to_size(stack_size, NetworkScratch::IO());
out_deltas.init_to_size(stack_size, NetworkScratch::IO());
std::vector<NetworkScratch::IO> in_deltas(stack_size);
std::vector<NetworkScratch::IO> out_deltas(stack_size);
// Split the forward deltas for each stack element.
int feature_offset = 0;
for (int i = 0; i < stack_.size(); ++i) {

View File

@ -190,7 +190,7 @@ bool Plumbing::Serialize(TFile *fp) const {
for (uint32_t i = 0; i < size; ++i)
if (!stack_[i]->Serialize(fp))
return false;
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !learning_rates_.Serialize(fp)) {
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->Serialize(learning_rates_)) {
return false;
}
return true;
@ -209,7 +209,7 @@ bool Plumbing::DeSerialize(TFile *fp) {
return false;
AddToStack(network);
}
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !learning_rates_.DeSerialize(fp)) {
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->DeSerialize(learning_rates_)) {
return false;
}
return true;

View File

@ -19,7 +19,6 @@
#ifndef TESSERACT_LSTM_PLUMBING_H_
#define TESSERACT_LSTM_PLUMBING_H_
#include "genericvector.h"
#include "matrix.h"
#include "network.h"
@ -139,7 +138,7 @@ protected:
PointerVector<Network> stack_;
// Layer-specific learning rate iff network_flags_ & NF_LAYER_SPECIFIC_LR.
// One element for each element of stack_.
GenericVector<float> learning_rates_;
std::vector<float> learning_rates_;
};
} // namespace tesseract.

View File

@ -23,7 +23,7 @@
#include "pageres.h"
#include "unicharcompress.h"
#include <algorithm>
#include <algorithm> // for std::reverse
#include <deque>
#include <map>
#include <set>
@ -181,7 +181,7 @@ void RecodeBeamSearch::ExtractBestPathAsLabels(std::vector<int> *labels,
std::vector<int> *xcoords) const {
labels->clear();
xcoords->clear();
GenericVector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> best_nodes;
ExtractBestPaths(&best_nodes, nullptr);
// Now just run CTC on the best nodes.
int t = 0;
@ -205,7 +205,7 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET
std::vector<float> *certs,
std::vector<float> *ratings,
std::vector<int> *xcoords) const {
GenericVector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> best_nodes;
ExtractBestPaths(&best_nodes, nullptr);
ExtractPathAsUnicharIds(best_nodes, unichar_ids, certs, ratings, xcoords);
if (debug) {
@ -224,8 +224,8 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX &line_box, float scale_
std::vector<float> certs;
std::vector<float> ratings;
std::vector<int> xcoords;
GenericVector<const RecodeNode *> best_nodes;
GenericVector<const RecodeNode *> second_nodes;
std::vector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> second_nodes;
character_boundaries_.clear();
ExtractBestPaths(&best_nodes, &second_nodes);
if (debug) {
@ -306,10 +306,10 @@ void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET *
}
// fill the topology with depths first
for (int step = beam->size() - 1; step >= 0; --step) {
GenericVector<tesseract::RecodePair> *heaps = beam->get(step)->beams_->heap();
for (int node = 0; node < heaps->size(); ++node) {
std::vector<tesseract::RecodePair> &heaps = beam->get(step)->beams_->heap();
for (auto node : heaps) {
int backtracker = 0;
const RecodeNode *curr = &heaps->get(node).data();
const RecodeNode *curr = &node.data();
while (curr != nullptr && !visited.count(curr)) {
visited.insert(curr);
topology[step - backtracker].push_back(curr);
@ -371,7 +371,6 @@ void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET *
}
void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
GenericVector<tesseract::RecodePair> *heaps = nullptr;
PointerVector<RecodeBeam> *currentBeam = nullptr;
if (character_boundaries_.size() < 2)
return;
@ -389,14 +388,15 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
std::vector<float> ratings;
std::vector<int> xcoords;
int backpath = character_boundaries_[j] - character_boundaries_[j - 1];
heaps = currentBeam->get(character_boundaries_[j] - 1)->beams_->heap();
GenericVector<const RecodeNode *> best_nodes;
std::vector<tesseract::RecodePair> &heaps =
currentBeam->get(character_boundaries_[j] - 1)->beams_->heap();
std::vector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> best;
// Scan the segmented node chain for valid unichar ids.
for (int i = 0; i < heaps->size(); ++i) {
for (auto entry : heaps) {
bool validChar = false;
int backcounter = 0;
const RecodeNode *node = &heaps->get(i).data();
const RecodeNode *node = &entry.data();
while (node != nullptr && backcounter < backpath) {
if (node->code != null_char_ && node->unichar_id != INVALID_UNICHAR_ID) {
validChar = true;
@ -406,7 +406,7 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
++backcounter;
}
if (validChar)
best.push_back(&heaps->get(i).data());
best.push_back(&entry.data());
}
// find the best rated segmented node chain and extract the unichar id.
if (!best.empty()) {
@ -488,8 +488,7 @@ void RecodeBeamSearch::DebugBeams(const UNICHARSET &unicharset) const {
// Generates debug output of the content of a single beam position.
void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHeap &heap) const {
GenericVector<const RecodeNode *> unichar_bests;
unichar_bests.init_to_size(unicharset.size(), nullptr);
std::vector<const RecodeNode *> unichar_bests(unicharset.size());
const RecodeNode *null_best = nullptr;
int heap_size = heap.size();
for (int i = 0; i < heap_size; ++i) {
@ -518,7 +517,7 @@ void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHe
// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
// duplicates, nulls and intermediate parts.
/* static */
void RecodeBeamSearch::ExtractPathAsUnicharIds(const GenericVector<const RecodeNode *> &best_nodes,
void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vector<const RecodeNode *> &best_nodes,
std::vector<int> *unichar_ids,
std::vector<float> *certs,
std::vector<float> *ratings,
@ -699,14 +698,14 @@ void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio
if (debug) {
int beam_index = BeamIndex(true, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Dawg beam %d:\n", t, i);
DebugPath(charset, path);
}
beam_index = BeamIndex(false, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Non-Dawg beam %d:\n", t, i);
DebugPath(charset, path);
@ -765,14 +764,14 @@ void RecodeBeamSearch::DecodeSecondaryStep(const float *outputs, int t, double d
if (debug) {
int beam_index = BeamIndex(true, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Dawg beam %d:\n", t, i);
DebugPath(charset, path);
}
beam_index = BeamIndex(false, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Non-Dawg beam %d:\n", t, i);
DebugPath(charset, path);
@ -858,7 +857,7 @@ void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const
dict_ratio, use_dawgs, NC_ANYTHING, prev, step);
}
}
const GenericVector<int> *final_codes = recoder_.GetFinalCodes(prefix);
const std::vector<int> *final_codes = recoder_.GetFinalCodes(prefix);
if (final_codes != nullptr) {
for (int i = 0; i < final_codes->size(); ++i) {
int code = (*final_codes)[i];
@ -892,7 +891,7 @@ void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const
}
}
}
const GenericVector<int> *next_codes = recoder_.GetNextCodes(prefix);
const std::vector<int> *next_codes = recoder_.GetNextCodes(prefix);
if (next_codes != nullptr) {
for (int i = 0; i < next_codes->size(); ++i) {
int code = (*next_codes)[i];
@ -1121,17 +1120,17 @@ bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *hea
// TODO(rays) consider hash map instead of linear search.
// It might not be faster because the hash map would have to be updated
// every time a heap reshuffle happens, and that would be a lot of overhead.
GenericVector<RecodePair> *nodes = heap->heap();
for (int i = 0; i < nodes->size(); ++i) {
RecodeNode &node = (*nodes)[i].data();
std::vector<RecodePair> &nodes = heap->heap();
for (int i = 0; i < nodes.size(); ++i) {
RecodeNode &node = nodes[i].data();
if (node.code == new_node->code && node.code_hash == new_node->code_hash &&
node.permuter == new_node->permuter && node.start_of_dawg == new_node->start_of_dawg) {
if (new_node->score > node.score) {
// The new one is better. Update the entire node in the heap and
// reshuffle.
node = *new_node;
(*nodes)[i].key() = node.score;
heap->Reshuffle(&(*nodes)[i]);
nodes[i].key() = node.score;
heap->Reshuffle(&nodes[i]);
}
return true;
}
@ -1156,8 +1155,8 @@ uint64_t RecodeBeamSearch::ComputeCodeHash(int code, bool dup, const RecodeNode
// during Decode. On return the best_nodes vector essentially contains the set
// of code, score pairs that make the optimal path with the constraint that
// the recoder can decode the code sequence back to a sequence of unichar-ids.
void RecodeBeamSearch::ExtractBestPaths(GenericVector<const RecodeNode *> *best_nodes,
GenericVector<const RecodeNode *> *second_nodes) const {
void RecodeBeamSearch::ExtractBestPaths(std::vector<const RecodeNode *> *best_nodes,
std::vector<const RecodeNode *> *second_nodes) const {
// Scan both beams to extract the best and second best paths.
const RecodeNode *best_node = nullptr;
const RecodeNode *second_best_node = nullptr;
@ -1201,30 +1200,30 @@ void RecodeBeamSearch::ExtractBestPaths(GenericVector<const RecodeNode *> *best_
// Helper backtracks through the lattice from the given node, storing the
// path and reversing it.
void RecodeBeamSearch::ExtractPath(const RecodeNode *node,
GenericVector<const RecodeNode *> *path) const {
path->truncate(0);
std::vector<const RecodeNode *> *path) const {
path->clear();
while (node != nullptr) {
path->push_back(node);
node = node->prev;
}
path->reverse();
std::reverse(path->begin(), path->end());
}
void RecodeBeamSearch::ExtractPath(const RecodeNode *node, GenericVector<const RecodeNode *> *path,
void RecodeBeamSearch::ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path,
int limiter) const {
int pathcounter = 0;
path->truncate(0);
path->clear();
while (node != nullptr && pathcounter < limiter) {
path->push_back(node);
node = node->prev;
++pathcounter;
}
path->reverse();
std::reverse(path->begin(), path->end());
}
// Helper prints debug information on the given lattice path.
void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset,
const GenericVector<const RecodeNode *> &path) const {
const std::vector<const RecodeNode *> &path) const {
for (int c = 0; c < path.size(); ++c) {
const RecodeNode &node = *path[c];
tprintf("%d ", c);
@ -1234,7 +1233,7 @@ void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset,
// Helper prints debug information on the given unichar path.
void RecodeBeamSearch::DebugUnicharPath(const UNICHARSET *unicharset,
const GenericVector<const RecodeNode *> &path,
const std::vector<const RecodeNode *> &path,
const std::vector<int> &unichar_ids,
const std::vector<float> &certs,
const std::vector<float> &ratings,

View File

@ -301,7 +301,7 @@ private:
// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
// duplicates, nulls and intermediate parts.
static void ExtractPathAsUnicharIds(const GenericVector<const RecodeNode *> &best_nodes,
static void ExtractPathAsUnicharIds(const std::vector<const RecodeNode *> &best_nodes,
std::vector<int> *unichar_ids, std::vector<float> *certs,
std::vector<float> *ratings, std::vector<int> *xcoords,
std::vector<int> *character_boundaries = nullptr);
@ -380,17 +380,17 @@ private:
// during Decode. On return the best_nodes vector essentially contains the set
// of code, score pairs that make the optimal path with the constraint that
// the recoder can decode the code sequence back to a sequence of unichar-ids.
void ExtractBestPaths(GenericVector<const RecodeNode *> *best_nodes,
GenericVector<const RecodeNode *> *second_nodes) const;
void ExtractBestPaths(std::vector<const RecodeNode *> *best_nodes,
std::vector<const RecodeNode *> *second_nodes) const;
// Helper backtracks through the lattice from the given node, storing the
// path and reversing it.
void ExtractPath(const RecodeNode *node, GenericVector<const RecodeNode *> *path) const;
void ExtractPath(const RecodeNode *node, GenericVector<const RecodeNode *> *path,
void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path) const;
void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path,
int limiter) const;
// Helper prints debug information on the given lattice path.
void DebugPath(const UNICHARSET *unicharset, const GenericVector<const RecodeNode *> &path) const;
void DebugPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path) const;
// Helper prints debug information on the given unichar path.
void DebugUnicharPath(const UNICHARSET *unicharset, const GenericVector<const RecodeNode *> &path,
void DebugUnicharPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path,
const std::vector<int> &unichar_ids, const std::vector<float> &certs,
const std::vector<float> &ratings, const std::vector<int> &xcoords) const;

View File

@ -44,7 +44,7 @@ bool TFNetwork::Serialize(TFile *fp) const {
return false;
std::string proto_str;
model_proto_.SerializeToString(&proto_str);
GenericVector<char> data;
std::vector<char> data;
data.resize_no_init(proto_str.size());
memcpy(&data[0], proto_str.data(), proto_str.size());
if (!data.Serialize(fp))
@ -55,7 +55,7 @@ bool TFNetwork::Serialize(TFile *fp) const {
// Reads from the given file. Returns false in case of error.
// Should be overridden by subclasses, but NOT called by their DeSerialize.
bool TFNetwork::DeSerialize(TFile *fp) {
GenericVector<char> data;
std::vector<char> data;
if (!data.DeSerialize(fp))
return false;
if (!model_proto_.ParseFromArray(&data[0], data.size())) {

View File

@ -256,8 +256,8 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
if (int_mode_) {
if (!wi_.DeSerialize(fp))
return false;
GenericVector<float> old_scales;
if (!old_scales.DeSerialize(fp))
std::vector<float> old_scales;
if (!fp->DeSerialize(old_scales))
return false;
scales_.reserve(old_scales.size());
for (int i = 0; i < old_scales.size(); ++i) {

View File

@ -277,8 +277,8 @@ double BaselineRow::AdjustBaselineToGrid(int debug, const FCOORD &direction, dou
void BaselineRow::SetupBlobDisplacements(const FCOORD &direction) {
// Set of perpendicular displacements of the blob bottoms from the required
// baseline direction.
GenericVector<double> perp_blob_dists;
displacement_modes_.truncate(0);
std::vector<double> perp_blob_dists;
displacement_modes_.clear();
// Gather the skew-corrected position of every blob.
double min_dist = FLT_MAX;
double max_dist = -FLT_MAX;
@ -310,8 +310,8 @@ void BaselineRow::SetupBlobDisplacements(const FCOORD &direction) {
for (int i = 0; i < perp_blob_dists.size(); ++i) {
dist_stats.add(IntCastRounded(perp_blob_dists[i] / disp_quant_factor_), 1);
}
GenericVector<KDPairInc<float, int>> scaled_modes;
dist_stats.top_n_modes(kMaxDisplacementsModes, &scaled_modes);
std::vector<KDPairInc<float, int>> scaled_modes;
dist_stats.top_n_modes(kMaxDisplacementsModes, scaled_modes);
#ifdef kDebugYCoord
if (debug) {
for (int i = 0; i < scaled_modes.size(); ++i) {
@ -428,7 +428,7 @@ double BaselineBlock::SpacingModelError(double perp_disp, double line_spacing, d
bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) {
if (non_text_block_)
return false;
GenericVector<double> angles;
std::vector<double> angles;
for (int r = 0; r < rows_.size(); ++r) {
BaselineRow *row = rows_[r];
if (row->FitBaseline(use_box_bottoms)) {
@ -440,7 +440,7 @@ bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) {
}
if (!angles.empty()) {
skew_angle_ = MedianOfCircularValues(M_PI, &angles);
skew_angle_ = MedianOfCircularValues(M_PI, angles);
good_skew_angle_ = true;
} else {
skew_angle_ = 0.0f;
@ -610,7 +610,7 @@ void BaselineBlock::DrawPixSpline(Pix *pix_in) {
// observations.
bool BaselineBlock::ComputeLineSpacing() {
FCOORD direction(cos(skew_angle_), sin(skew_angle_));
GenericVector<double> row_positions;
std::vector<double> row_positions;
ComputeBaselinePositions(direction, &row_positions);
if (row_positions.size() < 2)
return false;
@ -644,7 +644,7 @@ bool BaselineBlock::ComputeLineSpacing() {
// of the block baseline a line sits, hence the function and argument name
// positions not distances.
void BaselineBlock::ComputeBaselinePositions(const FCOORD &direction,
GenericVector<double> *positions) {
std::vector<double> *positions) {
positions->clear();
for (int r = 0; r < rows_.size(); ++r) {
BaselineRow *row = rows_[r];
@ -659,7 +659,7 @@ void BaselineBlock::ComputeBaselinePositions(const FCOORD &direction,
// Computes an estimate of the line spacing of the block from the median
// of the spacings between adjacent overlapping textlines.
void BaselineBlock::EstimateLineSpacing() {
GenericVector<float> spacings;
std::vector<float> spacings;
for (int r = 0; r < rows_.size(); ++r) {
BaselineRow *row = rows_[r];
// Exclude silly lines.
@ -682,7 +682,8 @@ void BaselineBlock::EstimateLineSpacing() {
// If we have at least one value, use it, otherwise leave the previous
// value unchanged.
if (!spacings.empty()) {
line_spacing_ = spacings[spacings.choose_nth_item(spacings.size() / 2)];
std::nth_element(spacings.begin(), spacings.begin() + spacings.size() / 2, spacings.end());
line_spacing_ = spacings[spacings.size() / 2];
if (debug_level_ > 1)
tprintf("Estimate of linespacing = %g\n", line_spacing_);
}
@ -692,7 +693,7 @@ void BaselineBlock::EstimateLineSpacing() {
// line to the deskewed y-position of each baseline as a function of its
// estimated line index, allowing for a small error in the initial linespacing
// and choosing the best available model.
void BaselineBlock::RefineLineSpacing(const GenericVector<double> &positions) {
void BaselineBlock::RefineLineSpacing(const std::vector<double> &positions) {
double spacings[3], offsets[3], errors[3];
int index_range;
errors[0] =
@ -727,7 +728,7 @@ void BaselineBlock::RefineLineSpacing(const GenericVector<double> &positions) {
// and the corresponding intercept in c_out, and the number of spacings seen
// in index_delta. Returns the error of fit to the line spacing model.
// Uses a simple linear regression, but optimized the offset using the median.
double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions, double m_in,
double BaselineBlock::FitLineSpacingModel(const std::vector<double> &positions, double m_in,
double *m_out, double *c_out, int *index_delta) {
if (m_in == 0.0f || positions.size() < 2) {
*m_out = m_in;
@ -736,12 +737,12 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions
*index_delta = 0;
return 0.0;
}
GenericVector<double> offsets;
std::vector<double> offsets;
// Get the offset (remainder) linespacing for each line and choose the median.
for (int i = 0; i < positions.size(); ++i)
offsets.push_back(fmod(positions[i], m_in));
// Get the median offset.
double median_offset = MedianOfCircularValues(m_in, &offsets);
double median_offset = MedianOfCircularValues(m_in, offsets);
// Now fit a line to quantized line number and offset.
LLSQ llsq;
int min_index = INT32_MAX;
@ -755,7 +756,7 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions
// Get the refined line spacing.
*m_out = llsq.m();
// Use the median offset rather than the mean.
offsets.truncate(0);
offsets.clear();
if (*m_out != 0.0) {
for (int i = 0; i < positions.size(); ++i) {
offsets.push_back(fmod(positions[i], *m_out));
@ -766,7 +767,7 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions
tprintf("%d: %g\n", i, offsets[i]);
}
}
*c_out = MedianOfCircularValues(*m_out, &offsets);
*c_out = MedianOfCircularValues(*m_out, offsets);
} else {
*c_out = 0.0;
}
@ -808,7 +809,7 @@ BaselineDetect::BaselineDetect(int debug_level, const FCOORD &page_skew, TO_BLOC
// block-wise and page-wise data to smooth small blocks/rows, and applies
// smoothing based on block/page-level skew and block-level linespacing.
void BaselineDetect::ComputeStraightBaselines(bool use_box_bottoms) {
GenericVector<double> block_skew_angles;
std::vector<double> block_skew_angles;
for (int i = 0; i < blocks_.size(); ++i) {
BaselineBlock *bl_block = blocks_[i];
if (debug_level_ > 0)
@ -820,7 +821,7 @@ void BaselineDetect::ComputeStraightBaselines(bool use_box_bottoms) {
// Compute a page-wide default skew for blocks with too little information.
double default_block_skew = page_skew_.angle();
if (!block_skew_angles.empty()) {
default_block_skew = MedianOfCircularValues(M_PI, &block_skew_angles);
default_block_skew = MedianOfCircularValues(M_PI, block_skew_angles);
}
if (debug_level_ > 0) {
tprintf("Page skew angle = %g\n", default_block_skew);

View File

@ -23,8 +23,6 @@
#include "points.h"
#include "rect.h"
#include "genericvector.h"
struct Pix;
namespace tesseract {
@ -109,7 +107,7 @@ private:
FCOORD baseline_pt1_;
FCOORD baseline_pt2_;
// Set of modes of displacements. They indicate preferable baseline positions.
GenericVector<double> displacement_modes_;
std::vector<double> displacement_modes_;
// Quantization factor used for displacement_modes_.
double disp_quant_factor_;
// Half the acceptance range of blob displacements for computing the
@ -187,7 +185,7 @@ private:
// Computes the deskewed vertical position of each baseline in the block and
// stores them in the given vector.
void ComputeBaselinePositions(const FCOORD &direction, GenericVector<double> *positions);
void ComputeBaselinePositions(const FCOORD &direction, std::vector<double> *positions);
// Computes an estimate of the line spacing of the block from the median
// of the spacings between adjacent overlapping textlines.
@ -197,13 +195,13 @@ private:
// line to the deskewed y-position of each baseline as a function of its
// estimated line index, allowing for a small error in the initial linespacing
// and choosing the best available model.
void RefineLineSpacing(const GenericVector<double> &positions);
void RefineLineSpacing(const std::vector<double> &positions);
// Given an initial estimate of line spacing (m_in) and the positions of each
// baseline, computes the line spacing of the block more accurately in m_out,
// and the corresponding intercept in c_out, and the number of spacings seen
// in index_delta. Returns the error of fit to the line spacing model.
double FitLineSpacingModel(const GenericVector<double> &positions, double m_in, double *m_out,
double FitLineSpacingModel(const std::vector<double> &positions, double m_in, double *m_out,
double *c_out, int *index_delta);
// The block to which this class adds extra information used during baseline

View File

@ -384,6 +384,23 @@ int SortByBoxLeft(const void *void1, const void *void2) {
return p1->bounding_box().top() - p2->bounding_box().top();
}
// Comparator for std::sort over a container of BBC*: orders by
// bounding_box().left(), breaking ties on right(), bottom(), then top().
// Returns true if *void1 sorts strictly before *void2.
template <class BBC>
bool StdSortByBoxLeft(const void *void1, const void *void2) {
  // Unlike qsort, std::sort passes the container elements themselves, so
  // the void*s ARE the BBC* pointers — a single cast recovers them.
  // (Double indirection here would misread the objects: UB.)
  const BBC *p1 = static_cast<const BBC *>(void1);
  const BBC *p2 = static_cast<const BBC *>(void2);
  int result = p1->bounding_box().left() - p2->bounding_box().left();
  if (result != 0)
    return result < 0;
  result = p1->bounding_box().right() - p2->bounding_box().right();
  if (result != 0)
    return result < 0;
  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (result != 0)
    return result < 0;
  return p1->bounding_box().top() < p2->bounding_box().top();
}
// Sort function to sort a BBC by bounding_box().right() in right-to-left order.
template <class BBC>
int SortRightToLeft(const void *void1, const void *void2) {
@ -402,6 +419,23 @@ int SortRightToLeft(const void *void1, const void *void2) {
return p1->bounding_box().top() - p2->bounding_box().top();
}
// Comparator for std::sort over a container of BBC*: orders by descending
// bounding_box().right() (right-to-left), breaking ties on descending left(),
// then ascending bottom() and top().
// Returns true if *void1 sorts strictly before *void2.
template <class BBC>
bool StdSortRightToLeft(const void *void1, const void *void2) {
  // Unlike qsort, std::sort passes the container elements themselves, so
  // the void*s ARE the BBC* pointers — a single cast recovers them.
  // (Double indirection here would misread the objects: UB.)
  const BBC *p1 = static_cast<const BBC *>(void1);
  const BBC *p2 = static_cast<const BBC *>(void2);
  int result = p2->bounding_box().right() - p1->bounding_box().right();
  if (result != 0)
    return result < 0;
  result = p2->bounding_box().left() - p1->bounding_box().left();
  if (result != 0)
    return result < 0;
  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (result != 0)
    return result < 0;
  return p1->bounding_box().top() < p2->bounding_box().top();
}
// Sort function to sort a BBC by bounding_box().bottom().
template <class BBC>
int SortByBoxBottom(const void *void1, const void *void2) {

View File

@ -18,7 +18,6 @@
///////////////////////////////////////////////////////////////////////
#include "cjkpitch.h"
#include "genericvector.h"
#include "topitch.h"
#include "tovars.h"
@ -109,7 +108,7 @@ public:
~LocalCorrelation() {}
// Sorts the accumulated samples by ascending x so subsequent lookups can
// assume ordered data, then marks the accumulator as finalized.
void Finish() {
  std::sort(values_.begin(), values_.end(), float_pair_compare);
  finalized_ = true;
}
@ -155,14 +154,12 @@ public:
}
private:
static int float_pair_compare(const void *a, const void *b) {
const auto *f_a = static_cast<const float_pair *>(a);
const auto *f_b = static_cast<const float_pair *>(b);
return (f_a->x > f_b->x) ? 1 : ((f_a->x < f_b->x) ? -1 : 0);
static bool float_pair_compare(const float_pair f_a, const float_pair f_b) {
return f_a.x < f_b.x;
}
bool finalized_;
GenericVector<struct float_pair> values_;
std::vector<struct float_pair> values_;
};
// Class to represent a character on a fixed pitch row. A FPChar may
@ -450,7 +447,7 @@ private:
index++;
}
}
characters_.truncate(index);
characters_.resize(index);
}
float pitch_ = 0.0f; // Character pitch.
@ -472,7 +469,7 @@ private:
SimpleStats heights_;
GenericVector<FPChar> characters_;
std::vector<FPChar> characters_;
TO_ROW *real_row_ = nullptr; // Underlying TD_ROW for this row.
};

View File

@ -101,7 +101,9 @@ ColumnFinder::ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &trig
}
ColumnFinder::~ColumnFinder() {
column_sets_.delete_data_pointers();
for (auto set : column_sets_) {
delete set;
}
delete[] best_columns_;
delete stroke_width_;
delete input_blobs_win_;
@ -552,7 +554,7 @@ bool ColumnFinder::MakeColumns(bool single_column) {
bool good_only = true;
do {
for (int i = 0; i < gridheight_; ++i) {
ColPartitionSet *line_set = part_sets.get(i);
ColPartitionSet *line_set = part_sets.at(i);
if (line_set != nullptr && line_set->LegalColumnCandidate()) {
ColPartitionSet *column_candidate = line_set->Copy(good_only);
if (column_candidate != nullptr)
@ -590,7 +592,7 @@ bool ColumnFinder::MakeColumns(bool single_column) {
ComputeMeanColumnGap(any_multi_column);
}
for (int i = 0; i < part_sets.size(); ++i) {
ColPartitionSet *line_set = part_sets.get(i);
ColPartitionSet *line_set = part_sets.at(i);
if (line_set != nullptr) {
line_set->RelinquishParts();
delete line_set;
@ -604,8 +606,9 @@ bool ColumnFinder::MakeColumns(bool single_column) {
// Src_sets may be equal to column_candidates, in which case it will
// use them as a source to improve themselves.
void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets) {
PartSetVector temp_cols;
temp_cols.move(column_sets);
// TODO: optimize.
PartSetVector temp_cols = *column_sets;
column_sets->clear();
if (src_sets == column_sets)
src_sets = &temp_cols;
int set_size = temp_cols.size();
@ -613,7 +616,7 @@ void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVecto
bool good_only = true;
do {
for (int i = 0; i < set_size; ++i) {
ColPartitionSet *column_candidate = temp_cols.get(i);
ColPartitionSet *column_candidate = temp_cols.at(i);
ASSERT_HOST(column_candidate != nullptr);
ColPartitionSet *improved = column_candidate->Copy(good_only);
if (improved != nullptr) {
@ -623,10 +626,15 @@ void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVecto
}
good_only = !good_only;
} while (column_sets->empty() && !good_only);
if (column_sets->empty())
column_sets->move(&temp_cols);
else
temp_cols.delete_data_pointers();
if (column_sets->empty()) {
// TODO: optimize.
column_sets = &temp_cols;
temp_cols.clear();
} else {
for (auto data : temp_cols) {
delete data;
}
}
}
// Prints debug information on the column candidates.
@ -635,7 +643,7 @@ void ColumnFinder::PrintColumnCandidates(const char *title) {
tprintf("Found %d %s:\n", set_size, title);
if (textord_debug_tabfind >= 3) {
for (int i = 0; i < set_size; ++i) {
ColPartitionSet *column_set = column_sets_.get(i);
ColPartitionSet *column_set = column_sets_.at(i);
column_set->Print();
}
}
@ -673,7 +681,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
// Set possible column_sets to indicate whether each set is compatible
// with each column.
for (int part_i = 0; part_i < set_count; ++part_i) {
ColPartitionSet *line_set = part_sets.get(part_i);
ColPartitionSet *line_set = part_sets.at(part_i);
bool debug = line_set != nullptr && WithinTestRegion(2, line_set->bounding_box().left(),
line_set->bounding_box().bottom());
column_set_costs[part_i] = new int[column_count];
@ -681,8 +689,8 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
assigned_costs[part_i] = INT32_MAX;
for (int col_i = 0; col_i < column_count; ++col_i) {
if (line_set != nullptr &&
column_sets_.get(col_i)->CompatibleColumns(debug, line_set, WidthCB())) {
column_set_costs[part_i][col_i] = column_sets_.get(col_i)->UnmatchedWidth(line_set);
column_sets_.at(col_i)->CompatibleColumns(debug, line_set, WidthCB())) {
column_set_costs[part_i][col_i] = column_sets_.at(col_i)->UnmatchedWidth(line_set);
any_columns_possible[part_i] = true;
} else {
column_set_costs[part_i][col_i] = INT32_MAX;
@ -702,7 +710,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
int column_set_id = RangeModalColumnSet(column_set_costs, assigned_costs, start, end);
if (textord_debug_tabfind >= 2) {
tprintf("Range modal column id = %d\n", column_set_id);
column_sets_.get(column_set_id)->Print();
column_sets_.at(column_set_id)->Print();
}
// Now find the longest run of the column_set_id in the range.
ShrinkRangeToLongestRun(column_set_costs, assigned_costs, any_columns_possible, column_set_id,
@ -722,7 +730,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
tprintf("Column id %d applies to range = %d - %d\n", column_set_id, start, end);
// Assign the column to the range, which now may overlap with other ranges.
AssignColumnToRange(column_set_id, start, end, column_set_costs, assigned_costs);
if (column_sets_.get(column_set_id)->GoodColumnCount() > 1)
if (column_sets_.at(column_set_id)->GoodColumnCount() > 1)
any_multi_column = true;
}
// If anything remains unassigned, the whole lot is unassigned, so
@ -879,7 +887,7 @@ void ColumnFinder::ExtendRangePastSmallGaps(int **column_set_costs, const int *a
// Assigns the given column_set_id to the given range.
void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
int **column_set_costs, int *assigned_costs) {
ColPartitionSet *column_set = column_sets_.get(column_set_id);
ColPartitionSet *column_set = column_sets_.at(column_set_id);
for (int i = start; i < end; ++i) {
assigned_costs[i] = column_set_costs[i][column_set_id];
best_columns_[i] = column_set;

View File

@ -1472,7 +1472,7 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction
ComputeSearchBoxAndScaling(direction, part_box, gridsize(), &search_box, &dist_scaling);
bool image_region =
ImageFind::CountPixelsInRotatedBox(search_box, im_box, rerotation, nontext_map) > 0;
GenericVector<int> dists[NPT_COUNT];
std::vector<int> dists[NPT_COUNT];
AccumulatePartDistances(part, dist_scaling, search_box, nontext_map, im_box, rerotation, debug,
dists);
// By iteratively including the next smallest distance across the vectors,
@ -1537,12 +1537,12 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction
// vectors in the dists array are sorted in increasing order.
// The nontext_map (+im_box, rerotation) is used to make text invisible if
// there is non-text in between.
// dists must be an array of GenericVectors of size NPT_COUNT.
// dists must be an array of vectors of size NPT_COUNT.
void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part,
const ICOORD &dist_scaling, const TBOX &search_box,
Pix *nontext_map, const TBOX &im_box,
const FCOORD &rerotation, bool debug,
GenericVector<int> *dists) {
std::vector<int> *dists) {
const TBOX &part_box = base_part.bounding_box();
ColPartitionGridSearch rsearch(this);
rsearch.SetUniqueMode(true);
@ -1571,7 +1571,7 @@ void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part,
// Truncate the number of boxes, so text doesn't get too much advantage.
int n_boxes = std::min(neighbour->boxes_count(), kSmoothDecisionMargin);
BlobTextFlowType n_flow = neighbour->flow();
GenericVector<int> *count_vector = nullptr;
std::vector<int> *count_vector = nullptr;
if (n_flow == BTFT_STRONG_CHAIN) {
if (n_type == BRT_TEXT)
count_vector = &dists[NPT_HTEXT];
@ -1602,8 +1602,9 @@ void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part,
neighbour->Print();
}
}
for (int i = 0; i < NPT_COUNT; ++i)
dists[i].sort();
for (int i = 0; i < NPT_COUNT; ++i) {
std::sort(dists[i].begin(), dists[i].end());
}
}
// Improves the margins of the part ColPartition by searching for

View File

@ -214,10 +214,10 @@ private:
// distance (scaled by dist_scaling) of the part from the base_part to the
// vector of the appropriate type for the partition. Prior to return, the
// vectors in the dists array are sorted in increasing order.
// dists must be an array of GenericVectors of size NPT_COUNT.
// dists must be an array of vectors of size NPT_COUNT.
void AccumulatePartDistances(const ColPartition &base_part, const ICOORD &dist_scaling,
const TBOX &search_box, Pix *nontext_map, const TBOX &im_box,
const FCOORD &rerotation, bool debug, GenericVector<int> *dists);
const FCOORD &rerotation, bool debug, std::vector<int> *dists);
// Improves the margins of the ColPartition by searching for
// neighbours that vertically overlap significantly.

View File

@ -93,7 +93,7 @@ void ColPartitionSet::ImproveColumnCandidate(WidthCallback cb, PartSetVector *sr
// Iterate over the provided column sets, as each one may have something
// to improve this.
for (int i = 0; i < set_size; ++i) {
ColPartitionSet *column_set = src_sets->get(i);
ColPartitionSet *column_set = src_sets->at(i);
if (column_set == nullptr)
continue;
// Iterate over the parts in this and column_set, adding bigger or
@ -184,7 +184,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthC
return;
}
for (int i = 0; i < column_sets->size(); ++i) {
ColPartitionSet *columns = column_sets->get(i);
ColPartitionSet *columns = column_sets->at(i);
// In ordering the column set candidates, good_coverage_ is king,
// followed by good_column_count_ and then bad_coverage_.
bool better = good_coverage_ > columns->good_coverage_;
@ -198,7 +198,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthC
// The new one is better so add it.
if (debug)
tprintf("Good one\n");
column_sets->insert(this, i);
column_sets->insert(column_sets->begin() + i, this);
return;
}
if (columns->CompatibleColumns(false, this, cb)) {

View File

@ -21,7 +21,6 @@
#define TESSERACT_TEXTORD_COLPARTITIONSET_H_
#include "colpartition.h" // For ColPartition_LIST.
#include "genericvector.h" // For GenericVector.
#include "rect.h" // For TBOX.
#include "tabvector.h" // For BLOBNBOX_CLIST.
@ -30,7 +29,7 @@ namespace tesseract {
class WorkingPartSet_LIST;
class ColSegment_LIST;
class ColPartitionSet;
using PartSetVector = GenericVector<ColPartitionSet *>;
using PartSetVector = std::vector<ColPartitionSet *>;
// ColPartitionSet is a class that holds a list of ColPartitions.
// Its main use is in holding a candidate partitioning of the width of the

View File

@ -516,7 +516,7 @@ ScrollView *TabFind::FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_g
#ifndef GRAPHICS_DISABLED
// Helper displays all the boxes in the given vector on the given window.
static void DisplayBoxVector(const GenericVector<BLOBNBOX *> &boxes, ScrollView *win) {
static void DisplayBoxVector(const std::vector<BLOBNBOX *> &boxes, ScrollView *win) {
for (int i = 0; i < boxes.size(); ++i) {
TBOX box = boxes[i]->bounding_box();
int left_x = box.left();
@ -552,8 +552,8 @@ ScrollView *TabFind::FindTabBoxes(int min_gutter_width, double tabfind_aligned_g
}
// Sort left tabs by left and right by right to see the outermost one first
// on a ragged tab.
left_tab_boxes_.sort(SortByBoxLeft<BLOBNBOX>);
right_tab_boxes_.sort(SortRightToLeft<BLOBNBOX>);
std::sort(left_tab_boxes_.begin(), left_tab_boxes_.end(), StdSortByBoxLeft<BLOBNBOX>);
std::sort(right_tab_boxes_.begin(), right_tab_boxes_.end(), StdSortRightToLeft<BLOBNBOX>);
ScrollView *tab_win = nullptr;
#ifndef GRAPHICS_DISABLED
if (textord_tabfind_show_initialtabs) {
@ -831,7 +831,7 @@ int TabFind::FindTabVectors(int search_size_multiple, TabAlignment alignment, in
int vector_count = 0;
// Search the right or left tab boxes, looking for tab vectors.
bool right = alignment == TA_RIGHT_ALIGNED || alignment == TA_RIGHT_RAGGED;
const GenericVector<BLOBNBOX *> &boxes = right ? right_tab_boxes_ : left_tab_boxes_;
const std::vector<BLOBNBOX *> &boxes = right ? right_tab_boxes_ : left_tab_boxes_;
for (int i = 0; i < boxes.size(); ++i) {
BLOBNBOX *bbox = boxes[i];
if ((!right && bbox->left_tab_type() == TT_MAYBE_ALIGNED) ||

View File

@ -354,8 +354,8 @@ private:
/** Callback to test an int for being a common width. */
WidthCallback width_cb_;
// Sets of bounding boxes that are candidate tab stops.
GenericVector<BLOBNBOX *> left_tab_boxes_;
GenericVector<BLOBNBOX *> right_tab_boxes_;
std::vector<BLOBNBOX *> left_tab_boxes_;
std::vector<BLOBNBOX *> right_tab_boxes_;
};
} // namespace tesseract.

View File

@ -156,12 +156,11 @@ bool StructuredTable::FindLinedStructure() {
if (cell_x_.size() < 3 || cell_y_.size() < 3)
return false;
cell_x_.sort();
cell_y_.sort();
// Remove duplicates that may have occurred due to split lines.
cell_x_.compact_sorted();
cell_y_.compact_sorted();
// Sort and remove duplicates that may have occurred due to split lines.
std::sort(cell_x_.begin(), cell_x_.end());
std::unique(cell_x_.begin(), cell_x_.end());
std::sort(cell_y_.begin(), cell_y_.end());
std::unique(cell_y_.begin(), cell_y_.end());
// The border should be the extents of line boxes, not middle.
cell_x_[0] = bounding_box_.left();
@ -170,8 +169,8 @@ bool StructuredTable::FindLinedStructure() {
cell_y_[cell_y_.size() - 1] = bounding_box_.top();
// Remove duplicates that may have occurred due to moving the borders.
cell_x_.compact_sorted();
cell_y_.compact_sorted();
std::unique(cell_x_.begin(), cell_x_.end());
std::unique(cell_y_.begin(), cell_y_.end());
CalculateMargins();
CalculateStats();
@ -347,8 +346,8 @@ bool StructuredTable::VerifyWhitespacedTable() {
// in the middle of the two nearest partitions.
void StructuredTable::FindWhitespacedColumns() {
// Set of the extents of all partitions on the page.
GenericVector<int> left_sides;
GenericVector<int> right_sides;
std::vector<int> left_sides;
std::vector<int> right_sides;
// Look at each text partition. We want to find the partitions
// that have extremal left/right sides. These will give us a basis
@ -371,8 +370,8 @@ void StructuredTable::FindWhitespacedColumns() {
return;
// Since data may be inserted in grid order, we sort the left/right sides.
left_sides.sort();
right_sides.sort();
std::sort(left_sides.begin(), left_sides.end());
std::sort(right_sides.begin(), right_sides.end());
// At this point, in the "merged list", we expect to have a left side,
// followed by either more left sides or a right side. The last number
@ -390,8 +389,8 @@ void StructuredTable::FindWhitespacedColumns() {
// in the middle of the two nearest partitions.
void StructuredTable::FindWhitespacedRows() {
// Set of the extents of all partitions on the page.
GenericVector<int> bottom_sides;
GenericVector<int> top_sides;
std::vector<int> bottom_sides;
std::vector<int> top_sides;
// We will be "shrinking" partitions, so keep the min/max around to
// make sure the bottom/top lines do not intersect text.
int min_bottom = INT32_MAX;
@ -435,8 +434,8 @@ void StructuredTable::FindWhitespacedRows() {
return;
// Since data may be inserted in grid order, we sort the bottom/top sides.
bottom_sides.sort();
top_sides.sort();
std::sort(bottom_sides.begin(), bottom_sides.end());
std::sort(top_sides.begin(), top_sides.end());
// At this point, in the "merged list", we expect to have a bottom side,
// followed by either more bottom sides or a top side. The last number
@ -573,17 +572,17 @@ void StructuredTable::AbsorbNearbyLines() {
// desired height.
// The first/last items are extremal values of the list and known.
// NOTE: This function assumes the lists are sorted!
void StructuredTable::FindCellSplitLocations(const GenericVector<int> &min_list,
const GenericVector<int> &max_list, int max_merged,
GenericVector<int> *locations) {
void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
const std::vector<int> &max_list, int max_merged,
std::vector<int> *locations) {
locations->clear();
ASSERT_HOST(min_list.size() == max_list.size());
if (min_list.size() == 0)
return;
ASSERT_HOST(min_list.get(0) < max_list.get(0));
ASSERT_HOST(min_list.get(min_list.size() - 1) < max_list.get(max_list.size() - 1));
ASSERT_HOST(min_list.at(0) < max_list.at(0));
ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
locations->push_back(min_list.get(0));
locations->push_back(min_list.at(0));
int min_index = 0;
int max_index = 0;
int stacked_partitions = 0;
@ -610,7 +609,7 @@ void StructuredTable::FindCellSplitLocations(const GenericVector<int> &min_list,
++max_index;
}
}
locations->push_back(max_list.get(max_list.size() - 1));
locations->push_back(max_list.at(max_list.size() - 1));
}
// Counts the number of partitions in the table

View File

@ -21,7 +21,6 @@
#define TABLERECOG_H_
#include "colpartitiongrid.h"
#include "genericvector.h"
namespace tesseract {
@ -209,9 +208,9 @@ protected:
// are inserted wherever space exists between partitions. If it is 2,
// lines may intersect 2 partitions at most, but you also need at least
// 2 partitions to generate a line.
static void FindCellSplitLocations(const GenericVector<int> &min_list,
const GenericVector<int> &max_list, int max_merged,
GenericVector<int> *locations);
static void FindCellSplitLocations(const std::vector<int> &min_list,
const std::vector<int> &max_list, int max_merged,
std::vector<int> *locations);
////////
//////// Utility function for table queries
@ -236,8 +235,8 @@ protected:
// bounding box is a convenient external representation.
// cell_x_ and cell_y_ indicate the grid lines.
TBOX bounding_box_; // Bounding box
GenericVector<int> cell_x_; // Locations of vertical divisions (sorted)
GenericVector<int> cell_y_; // Locations of horizontal divisions (sorted)
std::vector<int> cell_x_; // Locations of vertical divisions (sorted)
std::vector<int> cell_y_; // Locations of horizontal divisions (sorted)
bool is_lined_; // Is the table backed up by a line structure
// Table margins, set via CalculateMargins
int space_above_;

View File

@ -49,7 +49,7 @@
#include "tprintf.h" // for tprintf
#include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP
#include "genericvector.h" // for PointerVector, GenericVector
#include "genericvector.h" // for PointerVector
#include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate
@ -685,7 +685,7 @@ struct BlockGroup {
// Min xheight of the blocks.
float min_xheight;
// Collection of borrowed pointers to the blocks in the group.
GenericVector<BLOCK *> blocks;
std::vector<BLOCK *> blocks;
};
// Groups blocks by rotation, then, for each group, makes a WordGrid and calls

View File

@ -41,9 +41,6 @@
namespace tesseract {
template <typename T>
class GenericVector;
// Even though the limit on the number of chunks may now be removed, keep
// the same limit for repeatable behavior, and it may be a speed advantage.
static const int kMaxNumChunks = 64;
@ -79,7 +76,7 @@ static int check_blob(TBLOB *blob) {
*
* Return true if any of the splits share a point with this one.
*/
static int any_shared_split_points(const GenericVector<SEAM *> &seams, SEAM *seam) {
static int any_shared_split_points(const std::vector<SEAM *> &seams, SEAM *seam) {
int length;
int index;
@ -167,13 +164,13 @@ static int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {
// Helper runs all the checks on a seam to make sure it is valid.
// Returns the seam if OK, otherwise deletes the seam and returns nullptr.
static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob,
TBLOB *other_blob, const GenericVector<SEAM *> &seams, SEAM *seam) {
TBLOB *other_blob, const std::vector<SEAM *> &seams, SEAM *seam) {
if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||
total_containment(blob, other_blob) || check_blob(other_blob) ||
!seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
any_shared_split_points(seams, seam) ||
!seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
word->blobs.remove(blob_number + 1);
word->blobs.erase(word->blobs.begin() + blob_number + 1);
if (seam) {
seam->UndoSeam(blob, other_blob);
delete seam;
@ -200,12 +197,12 @@ static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB
* it was successful.
*/
SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams) {
const std::vector<SEAM *> &seams) {
if (repair_unchopped_blobs)
preserve_outline_tree(blob->outlines);
TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
// Insert it into the word.
word->blobs.insert(other_blob, blob_number + 1);
word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
SEAM *seam = nullptr;
if (prioritize_division) {
@ -235,7 +232,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number,
TPOINT location;
if (divisible_blob(blob, italic_blob, &location)) {
other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
word->blobs.insert(other_blob, blob_number + 1);
word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
seam = new SEAM(0.0f, location);
seam->ApplySeam(italic_blob, blob, other_blob);
seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
@ -250,7 +247,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number,
}
SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams) {
const std::vector<SEAM *> &seams) {
return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams);
}
@ -305,7 +302,7 @@ SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic
* word->seam_array and the resulting blobs are unclassified, so this function
* can be used by ApplyBox as well as during recognition.
*/
SEAM *Wordrec::improve_one_blob(const GenericVector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
int *blob_number) {
float rating_ceiling = FLT_MAX;
@ -347,7 +344,7 @@ SEAM *Wordrec::improve_one_blob(const GenericVector<BLOB_CHOICE *> &blob_choices
* Used for testing chopper.
*/
SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,
const GenericVector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
int *blob_number) {
if (prioritize_division) {
return chop_overlapping_blob(boxes, true, word_res, blob_number);
@ -427,12 +424,12 @@ void Wordrec::chop_word_main(WERD_RES *word) {
void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
LMPainPoints *pain_points,
GenericVector<SegSearchPending> *pending) {
std::vector<SegSearchPending> *pending) {
int blob_number;
do { // improvement loop.
// Make a simple vector of BLOB_CHOICEs to make it easy to pick which
// one to chop.
GenericVector<BLOB_CHOICE *> blob_choices;
std::vector<BLOB_CHOICE *> blob_choices;
int num_blobs = word->ratings->dimension();
for (int i = 0; i < num_blobs; ++i) {
BLOB_CHOICE_LIST *choices = word->ratings->get(i, i);
@ -460,7 +457,7 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
// Remap existing pain points.
pain_points->RemapForSplit(blob_number);
// Insert a new pending at the chop point.
pending->insert(SegSearchPending(), blob_number);
pending->insert(pending->begin() + blob_number, SegSearchPending());
// Classify the two newly created blobs using ProcessSegSearchPainPoint,
// as that updates the pending correctly and adds new pain points.
@ -501,7 +498,7 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
* These are the results of the last classification. Find a likely
* place to apply splits. If none, return -1.
**********************************************************************/
int Wordrec::select_blob_to_split(const GenericVector<BLOB_CHOICE *> &blob_choices,
int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
float rating_ceiling, bool split_next_to_fragment) {
BLOB_CHOICE *blob_choice;
int x;

View File

@ -34,8 +34,6 @@
#include "unicharset.h" // for UNICHARSET
#include "unicity_table.h" // for UnicityTable
template <typename T>
class GenericVector;
template <typename T>
class UnicityTable;

View File

@ -200,9 +200,10 @@ bool LMPainPoints::GeneratePainPoint(int col, int row, LMPainPointsType pp_type,
*/
void LMPainPoints::RemapForSplit(int index) {
for (auto &pain_points_heap : pain_points_heaps_) {
GenericVector<MatrixCoordPair> *heap = pain_points_heap.heap();
for (int j = 0; j < heap->size(); ++j)
(*heap)[j].data().MapForSplit(index);
std::vector<MatrixCoordPair> &heap = pain_points_heap.heap();
for (auto entry : heap) {
entry.data().MapForSplit(index);
}
}
}

View File

@ -23,6 +23,8 @@
#include <cstdio>
#include "bitvector.h"
#include "helpers.h" // for ClipToRange
#include "serialis.h" // for TFile
#include "tprintf.h"
namespace tesseract {
@ -103,8 +105,8 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
present.Init(PTRAIN_NUM_FEATURE_TYPES);
lang_ = lang;
// Load weights for passes with adaption on.
GenericVector<float> &weights = weights_vec_[pass_];
weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0);
std::vector<float> &weights = weights_vec_[pass_];
weights.resize(PTRAIN_NUM_FEATURE_TYPES, 0.0f);
while (fp->FGets(line, kMaxLineSize) != nullptr) {
char *key = nullptr;
@ -129,13 +131,13 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
}
}
lang_ = "";
weights.truncate(0);
weights.clear();
}
return complete;
}
bool ParamsModel::SaveToFile(const char *full_path) const {
const GenericVector<float> &weights = weights_vec_[pass_];
const std::vector<float> &weights = weights_vec_[pass_];
if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) {
tprintf("Refusing to save ParamsModel that has not been initialized.\n");
return false;

View File

@ -19,7 +19,7 @@
#ifndef TESSERACT_WORDREC_PARAMS_MODEL_H_
#define TESSERACT_WORDREC_PARAMS_MODEL_H_
#include "genericvector.h" // for GenericVector
#include <tesseract/export.h> // for TESS_API
#include "params_training_featdef.h" // for PTRAIN_NUM_FEATURE_TYPES
namespace tesseract {
@ -38,7 +38,7 @@ public:
};
ParamsModel() : pass_(PTRAIN_PASS1) {}
ParamsModel(const char *lang, const GenericVector<float> &weights)
ParamsModel(const char *lang, const std::vector<float> &weights)
: lang_(lang), pass_(PTRAIN_PASS1) {
weights_vec_[pass_] = weights;
}
@ -65,10 +65,10 @@ public:
// Returns true on success.
bool LoadFromFp(const char *lang, TFile *fp);
const GenericVector<float> &weights() const {
const std::vector<float> &weights() const {
return weights_vec_[pass_];
}
const GenericVector<float> &weights_for_pass(PassEnum pass) const {
const std::vector<float> &weights_for_pass(PassEnum pass) const {
return weights_vec_[pass];
}
void SetPass(PassEnum pass) {
@ -84,7 +84,7 @@ private:
PassEnum pass_;
// Several sets of weights for various OCR passes (e.g. pass1 with adaption,
// pass2 without adaption, etc).
GenericVector<float> weights_vec_[PTRAIN_NUM_PASSES];
std::vector<float> weights_vec_[PTRAIN_NUM_PASSES];
};
} // namespace tesseract

View File

@ -46,7 +46,7 @@ using tesseract::ScoredFont;
* the collection of small pieces un modified.
**********************************************************************/
namespace tesseract {
BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM *> &seams, int16_t start,
BLOB_CHOICE_LIST *Wordrec::classify_piece(const std::vector<SEAM *> &seams, int16_t start,
int16_t end, const char *description, TWERD *word,
BlamerBundle *blamer_bundle) {
if (end > start)

View File

@ -19,7 +19,6 @@
#include <cstdint> // for INT32_MAX
#include "blamer.h" // for BlamerBundle
#include "errcode.h" // for ASSERT_HOST
#include "genericvector.h" // for GenericVector
#include "lm_pain_points.h" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi...
#include "lm_state.h" // for BestChoiceBundle, ViterbiStateEntry
#include "matrix.h" // for MATRIX_COORD, MATRIX
@ -44,7 +43,7 @@ void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle
// Compute scaling factor that will help us recover blob outline length
// from classifier rating and certainty for the blob.
float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
GenericVector<SegSearchPending> pending;
std::vector<SegSearchPending> pending;
InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle, blamer_bundle);
if (!SegSearchDone(0)) { // find a better choice
@ -122,7 +121,7 @@ void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle
// without doing any additional chopping or joining.
// (Internal factored version that can be used as part of the main SegSearch.)
void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
GenericVector<SegSearchPending> *pending,
std::vector<SegSearchPending> *pending,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) {
if (segsearch_debug_level > 0) {
tprintf("Starting SegSearch on ratings matrix%s:\n",
@ -154,7 +153,7 @@ void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
// children are considered in the non-decreasing order of their column, since
// this guarantees that all the parents would be up to date before an update
// of a child is done.
pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
pending->resize(word_res->ratings->dimension(), SegSearchPending());
// Search the ratings matrix for the initial best path.
(*pending)[0].SetColumnClassified();
@ -163,7 +162,7 @@ void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
}
void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
GenericVector<SegSearchPending> *pending, WERD_RES *word_res,
std::vector<SegSearchPending> *pending, WERD_RES *word_res,
LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
BlamerBundle *blamer_bundle) {
MATRIX *ratings = word_res->ratings;
@ -223,7 +222,7 @@ void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
const char *pain_point_type,
GenericVector<SegSearchPending> *pending,
std::vector<SegSearchPending> *pending,
WERD_RES *word_res, LMPainPoints *pain_points,
BlamerBundle *blamer_bundle) {
if (segsearch_debug_level > 0) {
@ -279,7 +278,7 @@ void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_
// Needed when the n-gram model is enabled, as the multi-length comparison
// implementation will re-value existing paths to worse values.
void Wordrec::ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
GenericVector<SegSearchPending> *pending) {
std::vector<SegSearchPending> *pending) {
// TODO(rays) More refactoring required here.
// Delete existing viterbi states.
for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {

View File

@ -81,8 +81,6 @@ public:
# include "seam.h" // for SEAM (ptr only), PRIORITY
# include "stopper.h" // for DANGERR
# include "genericvector.h" // for GenericVector
# include <cstdint> // for int16_t, int32_t
namespace tesseract {
@ -329,7 +327,7 @@ public:
// without doing any additional chopping or joining.
// (Internal factored version that can be used as part of the main SegSearch.)
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
GenericVector<SegSearchPending> *pending,
std::vector<SegSearchPending> *pending,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
// Runs SegSearch() function (above) without needing a best_choice_bundle
@ -352,22 +350,22 @@ public:
// chopper.cpp
SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams);
const std::vector<SEAM *> &seams);
SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams);
const std::vector<SEAM *> &seams);
SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
int *blob_number);
SEAM *improve_one_blob(const GenericVector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
int *blob_number);
SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
const GenericVector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
int *blob_number);
void chop_word_main(WERD_RES *word);
void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
LMPainPoints *pain_points, GenericVector<SegSearchPending> *pending);
int select_blob_to_split(const GenericVector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
LMPainPoints *pain_points, std::vector<SegSearchPending> *pending);
int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
bool split_next_to_fragment);
int select_blob_to_split_from_fixpt(DANGERR *fixpt);
@ -391,7 +389,7 @@ public:
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt);
// pieces.cpp
virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector<SEAM *> &seams, int16_t start,
virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start,
int16_t end, const char *description, TWERD *word,
BlamerBundle *blamer_bundle);
// Try to merge fragments in the ratings matrix and put the result in
@ -466,7 +464,7 @@ protected:
// if a new best choice is found
//
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
GenericVector<SegSearchPending> *pending, WERD_RES *word_res,
std::vector<SegSearchPending> *pending, WERD_RES *word_res,
LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
BlamerBundle *blamer_bundle);
@ -474,13 +472,13 @@ protected:
// new pain points to join the newly classified blob with its neighbors.
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
const char *pain_point_type,
GenericVector<SegSearchPending> *pending, WERD_RES *word_res,
std::vector<SegSearchPending> *pending, WERD_RES *word_res,
LMPainPoints *pain_points, BlamerBundle *blamer_bundle);
// Resets enough of the results so that the Viterbi search is re-run.
// Needed when the n-gram model is enabled, as the multi-length comparison
// implementation will re-value existing paths to worse values.
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
GenericVector<SegSearchPending> *pending);
std::vector<SegSearchPending> *pending);
// Add pain points for classifying blobs on the correct segmentation path
// (so that we can evaluate correct segmentation path and discover the reason

View File

@ -92,15 +92,15 @@ public:
return ComputeForegroundDensity(tbox);
}
int RunCountAlignment(const GenericVector<int> &sorted_vec, const int val) {
int RunCountAlignment(const std::vector<int> &sorted_vec, const int val) {
return CountAlignment(sorted_vec, val);
}
void RunSplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) {
void RunSplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
SplitCPHorLite(part, splitted_boxes);
}
void RunSplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) {
void RunSplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
SplitCPHor(part, parts_splitted);
}
@ -377,7 +377,7 @@ TEST_F(EquationFinderTest, ComputeForegroundDensity) {
}
TEST_F(EquationFinderTest, CountAlignment) {
GenericVector<int> vec;
std::vector<int> vec;
vec.push_back(1);
vec.push_back(1);
vec.push_back(1);
@ -452,7 +452,7 @@ TEST_F(EquationFinderTest, SplitCPHorLite) {
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
part->DeleteBoxes();
part->set_median_width(10);
GenericVector<TBOX> splitted_boxes;
std::vector<TBOX> splitted_boxes;
// Test an empty part.
equation_det_->RunSplitCPHorLite(part, &splitted_boxes);
@ -486,7 +486,7 @@ TEST_F(EquationFinderTest, SplitCPHor) {
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
part->DeleteBoxes();
part->set_median_width(10);
GenericVector<ColPartition *> parts_splitted;
std::vector<ColPartition *> parts_splitted;
// Test an empty part.
equation_det_->RunSplitCPHor(part, &parts_splitted);
@ -512,7 +512,9 @@ TEST_F(EquationFinderTest, SplitCPHor) {
EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box());
EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box());
parts_splitted.delete_data_pointers();
for (auto part_splitted : parts_splitted) {
delete part_splitted;
}
part->DeleteBoxes();
delete (part);
}

View File

@ -107,7 +107,7 @@ void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo
// Given n rows of reference ground truth, evaluate whether the n rows
// of PARA * pointers yield the same paragraph breakpoints.
void EvaluateParagraphDetection(const TextAndModel *correct, int n,
const GenericVector<PARA *> &detector_output) {
const std::vector<PARA *> &detector_output) {
int incorrect_breaks = 0;
int missed_breaks = 0;
int poorly_matched_models = 0;
@ -186,7 +186,7 @@ void EvaluateParagraphDetection(const TextAndModel *correct, int n,
void TestParagraphDetection(const TextAndModel *correct, int num_rows) {
std::vector<RowInfo> row_infos;
GenericVector<PARA *> row_owners;
std::vector<PARA *> row_owners;
PARA_LIST paragraphs;
std::vector<ParagraphModel *> models;
@ -312,7 +312,7 @@ TEST(ParagraphsTest, TestSingleFullPageContinuation) {
const TextAndModel *correct = kSingleFullPageContinuation;
int num_rows = countof(kSingleFullPageContinuation);
std::vector<RowInfo> row_infos;
GenericVector<PARA *> row_owners;
std::vector<PARA *> row_owners;
PARA_LIST paragraphs;
std::vector<ParagraphModel *> models;
models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));

View File

@ -9,7 +9,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "genericvector.h"
#include "kdpair.h"
#include "statistc.h"
@ -42,8 +41,8 @@ TEST_F(STATSTest, BasicStats) {
// Tests the top_n_modes function.
TEST_F(STATSTest, TopNModes) {
GenericVector<tesseract::KDPairInc<float, int> > modes;
int num_modes = stats_.top_n_modes(3, &modes);
std::vector<tesseract::KDPairInc<float, int> > modes;
int num_modes = stats_.top_n_modes(3, modes);
EXPECT_EQ(3, num_modes);
// Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14.
EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key());

View File

@ -39,27 +39,27 @@ public:
void InjectCellY(int y) {
cell_y_.push_back(y);
cell_y_.sort();
std::sort(cell_y_.begin(), cell_y_.end());
}
void InjectCellX(int x) {
cell_x_.push_back(x);
cell_x_.sort();
std::sort(cell_x_.begin(), cell_x_.end());
}
void ExpectCellX(int x_min, int second, int add, int almost_done, int x_max) {
ASSERT_EQ(0, (almost_done - second) % add);
EXPECT_EQ(3 + (almost_done - second) / add, cell_x_.size());
EXPECT_EQ(x_min, cell_x_.get(0));
EXPECT_EQ(x_max, cell_x_.get(cell_x_.size() - 1));
EXPECT_EQ(x_min, cell_x_.at(0));
EXPECT_EQ(x_max, cell_x_.at(cell_x_.size() - 1));
for (int i = 1; i < cell_x_.size() - 1; ++i) {
EXPECT_EQ(second + add * (i - 1), cell_x_.get(i));
EXPECT_EQ(second + add * (i - 1), cell_x_.at(i));
}
}
void ExpectSortedX() {
EXPECT_GT(cell_x_.size(), 0);
for (int i = 1; i < cell_x_.size(); ++i) {
EXPECT_LT(cell_x_.get(i - 1), cell_x_.get(i));
EXPECT_LT(cell_x_.at(i - 1), cell_x_.at(i));
}
}
};

View File

@ -93,7 +93,7 @@ protected:
int len = compressed_.EncodeUnichar(u, &code);
// Check round-trip encoding.
int unichar_id;
GenericVector<UNICHAR_ID> normed_ids;
std::vector<UNICHAR_ID> normed_ids;
if (u == null_char_ || u == unicharset_.size()) {
unichar_id = null_char_;
} else {
@ -137,7 +137,7 @@ protected:
const std::vector<RecodedCharID> &times_seen) {
RecodedCharID extended = code;
int length = code.length();
const GenericVector<int> *final_codes = compressed_.GetFinalCodes(code);
const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);
if (final_codes != nullptr) {
for (int i = 0; i < final_codes->size(); ++i) {
int ending = (*final_codes)[i];
@ -147,7 +147,7 @@ protected:
EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
}
}
const GenericVector<int> *next_codes = compressed_.GetNextCodes(code);
const std::vector<int> *next_codes = compressed_.GetNextCodes(code);
if (next_codes != nullptr) {
for (int i = 0; i < next_codes->size(); ++i) {
int extension = (*next_codes)[i];