Merge pull request #3332 from stweil/vector
Replace some GenericVector by std::vector
Commit d604bf3c68
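The change is largely mechanical: Tesseract's legacy GenericVector bundled its own member helpers (sort(), remove(), insert(item, index), bool_binary_search(), truncate(), delete_data_pointers()), each of which maps onto std::vector plus <algorithm>. A minimal sketch of the recurring substitutions, as a standalone illustration rather than code from this commit:

#include <algorithm>
#include <vector>

int main() {
  std::vector<int> v = {3, 1, 2};

  v.insert(v.begin() + 1, 5);     // was: v.insert(5, 1);
  v.erase(v.begin() + 2);         // was: v.remove(2);
  std::sort(v.begin(), v.end());  // was: v.sort();
  bool found =                    // was: v.bool_binary_search(5);
      std::binary_search(v.begin(), v.end(), 5);
  v.resize(v.size() - 1);         // was: v.truncate(v.size() - 1);

  return found ? 0 : 1;
}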
@@ -24,7 +24,6 @@
 # include "boxread.h"
 #endif // ndef DISABLED_LEGACY_ENGINE
 #include <tesseract/unichar.h>
-#include "genericvector.h"
 #include "pageres.h"
 #include "tesseractclass.h"
 #include "unicharset.h"
@@ -240,7 +239,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
 tprintf("Maximally chopping word at:");
 word_res->word->bounding_box().print();
 }
-GenericVector<BLOB_CHOICE *> blob_choices;
+std::vector<BLOB_CHOICE *> blob_choices;
 ASSERT_HOST(!word_res->chopped_word->blobs.empty());
 auto rating = static_cast<float>(INT8_MAX);
 for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
@@ -271,7 +270,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
 // combine confidence w/ serial #
 auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
 0.0f, 0.0f, BCC_FAKE);
-blob_choices.insert(right_choice, blob_number + 1);
+blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
 }
 }
 word_res->CloneChoppedToRebuild();
@@ -374,8 +373,8 @@ bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const
 // Eliminated best_state and correct_text entries for the consumed
 // blobs.
 for (int j = 1; j < blob_count; ++j) {
-word_res->best_state.remove(i + 1);
-word_res->correct_text.remove(i + 1);
+word_res->best_state.erase(word_res->best_state.begin() + i + 1);
+word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
 }
 // Assume that no box spans multiple source words, so we are done with
 // this box.
@@ -489,7 +488,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
 if (word->text() == nullptr || word->text()[0] == '\0')
 continue; // Ignore words that have no text.
 // Convert the correct text to a vector of UNICHAR_ID
-GenericVector<UNICHAR_ID> target_text;
+std::vector<UNICHAR_ID> target_text;
 if (!ConvertStringToUnichars(word->text(), &target_text)) {
 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
 pr_it.DeleteCurrentWord();
@@ -505,7 +504,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {

 /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
 /// @return false if an invalid UNICHAR_ID is encountered.
-bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids) {
+bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
 for (int step = 0; *utf8 != '\0'; utf8 += step) {
 const char *next_space = strchr(utf8, ' ');
 if (next_space == nullptr)
@@ -528,10 +527,10 @@ bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_
 /// applies a full search on the classifier results to find the best classified
 /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
 /// substitutions ARE used.
-bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
+bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
 // Classify all required combinations of blobs and save results in choices.
 const int word_length = word_res->box_word->length();
-auto *choices = new GenericVector<BLOB_CHOICE_LIST *>[word_length];
+auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
 for (int i = 0; i < word_length; ++i) {
 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
 BLOB_CHOICE_LIST *match_result =
@@ -548,12 +547,15 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
 // match. Using wildcards makes it difficult to find the correct
 // segmentation even when it is there.
 word_res->best_state.clear();
-GenericVector<int> search_segmentation;
+std::vector<int> search_segmentation;
 float best_rating = 0.0f;
 SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
 &word_res->best_state);
-for (int i = 0; i < word_length; ++i)
-choices[i].delete_data_pointers();
+for (int i = 0; i < word_length; ++i) {
+for (auto choice : choices[i]) {
+delete choice;
+}
+}
 delete[] choices;
 if (word_res->best_state.empty()) {
 // Build the original segmentation and if it is the same length as the
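GenericVector::delete_data_pointers() deleted every element of a vector of owned raw pointers; std::vector has no equivalent, hence the explicit delete loop above. A standalone sketch of the same cleanup pattern (the Choice type here is invented for illustration):

#include <vector>

struct Choice {
  float rating = 0.0f;
};

int main() {
  std::vector<Choice *> choices;
  choices.push_back(new Choice());
  choices.push_back(new Choice());

  // Replacement for GenericVector::delete_data_pointers():
  for (auto *choice : choices) {
    delete choice;
  }
  choices.clear();
  return 0;
}

Switching the elements to std::unique_ptr would remove the manual loop entirely, but that would be a larger change than this PR's container swap.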
@@ -583,9 +585,9 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W

 /// Recursive helper to find a match to the target_text (from text_index
 /// position) in the choices (from choices_pos position).
-/// @param choices is an array of GenericVectors, of length choices_length,
+/// @param choices is an array of vectors of length choices_length,
 /// with each element representing a starting position in the word, and the
-/// #GenericVector holding classification results for a sequence of consecutive
+/// #vector holding classification results for a sequence of consecutive
 /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
 /// @param choices_pos
 /// @param choices_length
@@ -595,10 +597,10 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, W
 /// @param segmentation
 /// @param best_rating
 /// @param best_segmentation
-void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
-int choices_length, const GenericVector<UNICHAR_ID> &target_text,
-int text_index, float rating, GenericVector<int> *segmentation,
-float *best_rating, GenericVector<int> *best_segmentation) {
+void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
+int choices_length, const std::vector<UNICHAR_ID> &target_text,
+int text_index, float rating, std::vector<int> *segmentation,
+float *best_rating, std::vector<int> *best_segmentation) {
 const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
 for (int length = 1; length <= choices[choices_pos].size(); ++length) {
 // Rating of matching choice or worst choice if no match.
@@ -654,7 +656,7 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices,
 unicharset.id_to_unichar(target_text[text_index]));
 }
 }
-segmentation->truncate(segmentation->size() - 1);
+segmentation->resize(segmentation->size() - 1);
 }
 }

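A pattern that recurs across these files: GenericVector::remove(i) erased the element at index i and shifted the rest down, which on std::vector becomes erase(begin() + i); likewise truncate(n) becomes resize(n). A standalone sketch of the index-based removal (values invented):

#include <cassert>
#include <vector>

int main() {
  std::vector<int> best_state = {10, 20, 30, 40};
  const int i = 1;

  // was: best_state.remove(i + 1);
  best_state.erase(best_state.begin() + i + 1);

  assert((best_state == std::vector<int>{10, 20, 40}));
  return 0;
}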
@@ -461,8 +461,8 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
 continue;
 }
 // Two words sharing the same language model, excellent!
-GenericVector<WERD_CHOICE *> overrides_word1;
-GenericVector<WERD_CHOICE *> overrides_word2;
+std::vector<WERD_CHOICE *> overrides_word1;
+std::vector<WERD_CHOICE *> overrides_word2;

 const auto orig_w1_str = w_prev->best_choice->unichar_string();
 const auto orig_w2_str = w->best_choice->unichar_string();
@@ -768,7 +768,7 @@ static int SelectBestWords(double rating_ratio, double certainty_margin, bool de
 PointerVector<WERD_RES> *best_words) {
 // Process the smallest groups of words that have an overlapping word
 // boundary at the end.
-GenericVector<WERD_RES *> out_words;
+std::vector<WERD_RES *> out_words;
 // Index into each word vector (best, new).
 int b = 0, n = 0;
 int num_best = 0, num_new = 0;
@@ -893,19 +893,19 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
 return false;
 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
 // Get the noise outlines into a vector with matching bool map.
-GenericVector<C_OUTLINE *> outlines;
+std::vector<C_OUTLINE *> outlines;
 real_word->GetNoiseOutlines(&outlines);
-GenericVector<bool> word_wanted;
-GenericVector<bool> overlapped_any_blob;
-GenericVector<C_BLOB *> target_blobs;
+std::vector<bool> word_wanted;
+std::vector<bool> overlapped_any_blob;
+std::vector<C_BLOB *> target_blobs;
 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
 &overlapped_any_blob, &target_blobs);
 // Filter the outlines that overlapped any blob and put them into the word
 // now. This simplifies the remaining task and also makes it more accurate
 // as it has more completed blobs to work on.
-GenericVector<bool> wanted;
-GenericVector<C_BLOB *> wanted_blobs;
-GenericVector<C_OUTLINE *> wanted_outlines;
+std::vector<bool> wanted;
+std::vector<C_BLOB *> wanted_blobs;
+std::vector<C_OUTLINE *> wanted_outlines;
 int num_overlapped = 0;
 int num_overlapped_used = 0;
 for (int i = 0; i < overlapped_any_blob.size(); ++i) {
@@ -948,11 +948,11 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next
 // Output: word_wanted indicates which outlines are to be assigned to a blob,
 // target_blobs indicates which to assign to, and overlapped_any_blob is
 // true for all outlines that overlapped a blob.
-void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines,
+void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
 int pass, WERD *real_word, PAGE_RES_IT *pr_it,
-GenericVector<bool> *word_wanted,
-GenericVector<bool> *overlapped_any_blob,
-GenericVector<C_BLOB *> *target_blobs) {
+std::vector<bool> *word_wanted,
+std::vector<bool> *overlapped_any_blob,
+std::vector<C_BLOB *> *target_blobs) {
 std::vector<bool> blob_wanted;
 word_wanted->resize(outlines.size(), false);
 overlapped_any_blob->resize(outlines.size(), false);
@@ -999,10 +999,10 @@ void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE

 // Attempts to assign non-overlapping outlines to their nearest blobs or
 // make new blobs out of them.
-void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
+void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
 WERD *real_word, PAGE_RES_IT *pr_it,
-GenericVector<bool> *word_wanted,
-GenericVector<C_BLOB *> *target_blobs) {
+std::vector<bool> *word_wanted,
+std::vector<C_BLOB *> *target_blobs) {
 std::vector<bool> blob_wanted;
 word_wanted->resize(outlines.size(), false);
 target_blobs->resize(outlines.size(), nullptr);
@@ -1077,7 +1077,7 @@ void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &out
 // are desired, in which case ok_outlines indicates which ones.
 bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
 C_BLOB *blob,
-const GenericVector<C_OUTLINE *> &outlines,
+const std::vector<C_OUTLINE *> &outlines,
 int num_outlines, std::vector<bool> *ok_outlines) {
 std::string best_str;
 float target_cert = certainty_threshold;
@@ -1161,7 +1161,7 @@ bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
 // the inclusion of the outlines, and returns the certainty of the raw choice.
 float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
-const GenericVector<C_OUTLINE *> &outlines, int pass_n,
+const std::vector<C_OUTLINE *> &outlines, int pass_n,
 PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
 C_OUTLINE_IT ol_it;
 C_OUTLINE *first_to_keep = nullptr;
@@ -1865,8 +1865,7 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
 const int fontinfo_size = get_fontinfo_table().size();
 if (fontinfo_size == 0)
 return;
-GenericVector<int> font_total_score;
-font_total_score.init_to_size(fontinfo_size, 0);
+std::vector<int> font_total_score(fontinfo_size);

 // Compute the font scores for the word
 if (tessedit_debug_fonts) {
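One subtlety in the ReassignDiacritics hunks above: GenericVector<bool> becomes std::vector<bool>, the bit-packed specialization whose operator[] returns a proxy object rather than a bool&. The indexed reads, writes, and resize() calls used here are unaffected, but code that takes a reference or pointer to an element would not compile. A standalone sketch of the distinction:

#include <vector>

int main() {
  std::vector<bool> word_wanted(4, false);

  word_wanted[2] = true;       // indexed writes work as before
  bool copy = word_wanted[2];  // indexed reads yield a plain bool

  // bool &ref = word_wanted[2];  // would NOT compile: operator[]
  //                              // returns std::vector<bool>::reference

  return copy ? 0 : 1;
}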
@@ -131,7 +131,7 @@ int EquationDetect::LabelSpecialText(TO_BLOCK *to_block) {
 return -1;
 }

-GenericVector<BLOBNBOX_LIST *> blob_lists;
+std::vector<BLOBNBOX_LIST *> blob_lists;
 blob_lists.push_back(&(to_block->blobs));
 blob_lists.push_back(&(to_block->large_blobs));
 for (int i = 0; i < blob_lists.size(); ++i) {
@@ -223,16 +223,17 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &uni

 if (unicharset.get_ispunctuation(id)) {
 // Exclude some special texts that are likely to be confused as math symbol.
-static GenericVector<UNICHAR_ID> ids_to_exclude;
+static std::vector<UNICHAR_ID> ids_to_exclude;
 if (ids_to_exclude.empty()) {
 static const char *kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
 "〈", "〉", "《", "》", "」", "「"};
 for (auto i = 0; i < countof(kCharsToEx); i++) {
 ids_to_exclude.push_back(unicharset.unichar_to_id(kCharsToEx[i]));
 }
-ids_to_exclude.sort();
+std::sort(ids_to_exclude.begin(), ids_to_exclude.end());
 }
-return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
+auto found = std::binary_search(ids_to_exclude.begin(), ids_to_exclude.end(), id);
+return found ? BSTT_NONE : BSTT_MATH;
 }

 // Check if it is digit. In addition to the isdigit attribute, we also check
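The sort-then-search change above is the standard idiom for GenericVector's sort() + bool_binary_search() pair: std::binary_search requires the range to already be sorted in the same order it searches with. A standalone sketch (IDs invented):

#include <algorithm>
#include <vector>

int main() {
  std::vector<int> ids_to_exclude = {42, 7, 19};

  // Must sort first; std::binary_search on an unsorted range is undefined.
  std::sort(ids_to_exclude.begin(), ids_to_exclude.end());

  bool excluded =
      std::binary_search(ids_to_exclude.begin(), ids_to_exclude.end(), 19);
  return excluded ? 0 : 1;
}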
@@ -266,13 +267,13 @@ void EquationDetect::IdentifySpecialText() {
 IdentifyBlobsToSkip(part);
 BLOBNBOX_C_IT bbox_it(part->boxes());
 // Compute the height threshold.
-GenericVector<int> blob_heights;
+std::vector<int> blob_heights;
 for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
 if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
 blob_heights.push_back(bbox_it.data()->bounding_box().height());
 }
 }
-blob_heights.sort();
+std::sort(blob_heights.begin(), blob_heights.end());
 const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
 for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
 if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
@@ -377,7 +378,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS

 // Pass 3: expand block equation seeds.
 while (!cp_seeds_.empty()) {
-GenericVector<ColPartition *> seeds_expanded;
+std::vector<ColPartition *> seeds_expanded;
 for (int i = 0; i < cp_seeds_.size(); ++i) {
 if (ExpandSeed(cp_seeds_[i])) {
 // If this seed is expanded, then we add it into seeds_expanded. Note
@@ -407,14 +408,14 @@ void EquationDetect::MergePartsByLocation() {
 while (true) {
 ColPartition *part = nullptr;
 // partitions that have been updated.
-GenericVector<ColPartition *> parts_updated;
+std::vector<ColPartition *> parts_updated;
 ColPartitionGridSearch gsearch(part_grid_);
 gsearch.StartFullSearch();
 while ((part = gsearch.NextFullSearch()) != nullptr) {
 if (!IsTextOrEquationType(part->type())) {
 continue;
 }
-GenericVector<ColPartition *> parts_to_merge;
+std::vector<ColPartition *> parts_to_merge;
 SearchByOverlap(part, &parts_to_merge);
 if (parts_to_merge.empty()) {
 continue;
@@ -443,7 +444,7 @@ void EquationDetect::MergePartsByLocation() {
 }

 void EquationDetect::SearchByOverlap(ColPartition *seed,
-GenericVector<ColPartition *> *parts_overlap) {
+std::vector<ColPartition *> *parts_overlap) {
 ASSERT_HOST(seed != nullptr && parts_overlap != nullptr);
 if (!IsTextOrEquationType(seed->type())) {
 return;
@@ -457,7 +458,7 @@ void EquationDetect::SearchByOverlap(ColPartition *seed,

 // Search iteratively.
 ColPartition *part;
-GenericVector<ColPartition *> parts;
+std::vector<ColPartition *> parts;
 const float kLargeOverlapTh = 0.95;
 const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
 while ((part = search.NextRadSearch()) != nullptr) {
@@ -518,11 +519,11 @@ void EquationDetect::IdentifySeedParts() {
 ColPartition *part = nullptr;
 gsearch.StartFullSearch();

-GenericVector<ColPartition *> seeds1, seeds2;
+std::vector<ColPartition *> seeds1, seeds2;
 // The left coordinates of indented text partitions.
-GenericVector<int> indented_texts_left;
+std::vector<int> indented_texts_left;
 // The foreground density of text partitions.
-GenericVector<float> texts_foreground_density;
+std::vector<float> texts_foreground_density;
 while ((part = gsearch.NextFullSearch()) != nullptr) {
 if (!IsTextOrEquationType(part->type())) {
 continue;
@@ -552,8 +553,8 @@ void EquationDetect::IdentifySeedParts() {
 }

 // Sort the features collected from text regions.
-indented_texts_left.sort();
-texts_foreground_density.sort();
+std::sort(indented_texts_left.begin(), indented_texts_left.end());
+std::sort(texts_foreground_density.begin(), texts_foreground_density.end());
 float foreground_density_th = 0.15; // Default value.
 if (!texts_foreground_density.empty()) {
 // Use the median of the texts_foreground_density.
@@ -598,7 +599,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
 ASSERT_HOST(part);

 // Split part horizontall, and check for each sub part.
-GenericVector<TBOX> sub_boxes;
+std::vector<TBOX> sub_boxes;
 SplitCPHorLite(part, &sub_boxes);
 float parts_passed = 0.0;
 for (int i = 0; i < sub_boxes.size(); ++i) {
@@ -615,7 +616,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa
 return retval;
 }

-void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) {
+void EquationDetect::SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
 ASSERT_HOST(part && parts_splitted);
 if (part->median_width() == 0 || part->boxes_count() == 0) {
 return;
@@ -623,7 +624,9 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *

 // Make a copy of part, and reset parts_splitted.
 ColPartition *right_part = part->CopyButDontOwnBlobs();
-parts_splitted->delete_data_pointers();
+for (auto part : *parts_splitted) {
+delete part;
+}
 parts_splitted->clear();

 const double kThreshold = part->median_width() * 3.0;
@@ -663,7 +666,7 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVector<ColPartition *
 parts_splitted->push_back(right_part);
 }

-void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) {
+void EquationDetect::SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
 ASSERT_HOST(part && splitted_boxes);
 splitted_boxes->clear();
 if (part->median_width() == 0) {
@@ -701,7 +704,7 @@ void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *spl
 }
 }

-bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left,
+bool EquationDetect::CheckForSeed2(const std::vector<int> &indented_texts_left,
 const float foreground_density_th, ColPartition *part) {
 ASSERT_HOST(part);
 const TBOX &box = part->bounding_box();
@@ -720,22 +723,25 @@ bool EquationDetect::CheckForSeed2(const GenericVector<int> &indented_texts_left
 return true;
 }

-int EquationDetect::CountAlignment(const GenericVector<int> &sorted_vec, const int val) const {
+int EquationDetect::CountAlignment(const std::vector<int> &sorted_vec, const int val) const {
 if (sorted_vec.empty()) {
 return 0;
 }
-const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
-const int pos = sorted_vec.binary_search(val);
+const int kDistTh = static_cast<int>(round(0.03f * resolution_));
+auto pos = std::upper_bound(sorted_vec.begin(), sorted_vec.end(), val);
+if (pos > sorted_vec.begin()) {
+--pos;
+}
 int count = 0;

 // Search left side.
-int index = pos;
+auto index = pos - sorted_vec.begin();
 while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
 count++;
 }

 // Search right side.
-index = pos + 1;
+index = pos + 1 - sorted_vec.begin();
 while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
 count++;
 }
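GenericVector::binary_search returned the index of the largest element not greater than val; the rewrite above reproduces that with std::upper_bound (first element greater than val) stepped back one position, then converts the iterator into an index for the left/right scans. A standalone sketch of the equivalence, with invented values:

#include <algorithm>
#include <vector>

// Index of the largest element <= val in a sorted vector (0 if none),
// mirroring the upper_bound-then-decrement pattern used above.
static int FloorIndex(const std::vector<int> &sorted_vec, int val) {
  auto pos = std::upper_bound(sorted_vec.begin(), sorted_vec.end(), val);
  if (pos > sorted_vec.begin()) {
    --pos;
  }
  return static_cast<int>(pos - sorted_vec.begin());
}

int main() {
  const std::vector<int> v = {10, 20, 30};
  return FloorIndex(v, 25) == 1 ? 0 : 1; // 20 is the largest value <= 25
}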
@@ -764,9 +770,9 @@ void EquationDetect::ComputeCPsSuperBBox() {

 void EquationDetect::IdentifyInlinePartsHorizontal() {
 ASSERT_HOST(cps_super_bbox_);
-GenericVector<ColPartition *> new_seeds;
+std::vector<ColPartition *> new_seeds;
 const int kMarginDiffTh = IntCastRounded(0.5 * lang_tesseract_->source_resolution());
-const int kGapTh = static_cast<int>(roundf(1.0 * lang_tesseract_->source_resolution()));
+const int kGapTh = static_cast<int>(round(1.0f * lang_tesseract_->source_resolution()));
 ColPartitionGridSearch search(part_grid_);
 search.SetUniqueMode(true);
 // The center x coordinate of the cp_super_bbox_.
@@ -826,7 +832,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
 // Get the y gap between text partitions;
 ColPartition *current = nullptr, *prev = nullptr;
 gsearch.StartFullSearch();
-GenericVector<int> ygaps;
+std::vector<int> ygaps;
 while ((current = gsearch.NextFullSearch()) != nullptr) {
 if (!PTIsTextType(current->type())) {
 continue;
@@ -851,7 +857,7 @@ int EquationDetect::EstimateTextPartLineSpacing() {
 }

 // Compute the line spacing from ygaps: use the mean of the first half.
-ygaps.sort();
+std::sort(ygaps.begin(), ygaps.end());
 int spacing = 0, count;
 for (count = 0; count < ygaps.size() / 2; count++) {
 spacing += ygaps[count];
@@ -867,12 +873,12 @@ void EquationDetect::IdentifyInlinePartsVertical(const bool top_to_bottom,

 // Sort cp_seeds_.
 if (top_to_bottom) { // From top to bottom.
-cp_seeds_.sort(&SortCPByTopReverse);
+std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByTopReverse);
 } else { // From bottom to top.
-cp_seeds_.sort(&SortCPByBottom);
+std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByBottom);
 }

-GenericVector<ColPartition *> new_seeds;
+std::vector<ColPartition *> new_seeds;
 for (int i = 0; i < cp_seeds_.size(); ++i) {
 ColPartition *part = cp_seeds_[i];
 // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
@@ -918,8 +924,8 @@ bool EquationDetect::IsInline(const bool search_bottom, const int textparts_line
 // Check if neighbor and part is inline similar.
 const float kHeightRatioTh = 0.5;
 const int kYGapTh = textparts_linespacing > 0
-? textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_))
-: static_cast<int>(roundf(0.05 * resolution_)); // Default value.
+? textparts_linespacing + static_cast<int>(round(0.02f * resolution_))
+: static_cast<int>(round(0.05f * resolution_)); // Default value.
 if (part_box.x_overlap(neighbor_box) && // Location feature.
 part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
 // Geo feature.
@@ -973,9 +979,9 @@ EquationDetect::IndentType EquationDetect::IsIndented(ColPartition *part) {
 ColPartitionGridSearch search(part_grid_);
 ColPartition *neighbor = nullptr;
 const TBOX &part_box(part->bounding_box());
-const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
-const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
-const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
+const int kXGapTh = static_cast<int>(round(0.5f * resolution_));
+const int kRadiusTh = static_cast<int>(round(3.0f * resolution_));
+const int kYGapTh = static_cast<int>(round(0.5f * resolution_));

 // Here we use a simple approximation algorithm: from the center of part, We
 // perform the radius search, and check if we can find a neighboring partition
@@ -1036,7 +1042,7 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
 }

 // Expand in four directions.
-GenericVector<ColPartition *> parts_to_merge;
+std::vector<ColPartition *> parts_to_merge;
 ExpandSeedHorizontal(true, seed, &parts_to_merge);
 ExpandSeedHorizontal(false, seed, &parts_to_merge);
 ExpandSeedVertical(true, seed, &parts_to_merge);
@@ -1073,10 +1079,10 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) {
 }

 void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
-GenericVector<ColPartition *> *parts_to_merge) {
+std::vector<ColPartition *> *parts_to_merge) {
 ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr);
 const float kYOverlapTh = 0.6;
-const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
+const int kXGapTh = static_cast<int>(round(0.2f * resolution_));

 ColPartitionGridSearch search(part_grid_);
 const TBOX &seed_box(seed->bounding_box());
@@ -1125,10 +1131,10 @@ void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *
 }

 void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
-GenericVector<ColPartition *> *parts_to_merge) {
+std::vector<ColPartition *> *parts_to_merge) {
 ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr && cps_super_bbox_ != nullptr);
 const float kXOverlapTh = 0.4;
-const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
+const int kYGapTh = static_cast<int>(round(0.2f * resolution_));

 ColPartitionGridSearch search(part_grid_);
 const TBOX &seed_box(seed->bounding_box());
@@ -1138,7 +1144,7 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *

 // Search iteratively.
 ColPartition *part = nullptr;
-GenericVector<ColPartition *> parts;
+std::vector<ColPartition *> parts;
 int skipped_min_top = std::numeric_limits<int>::max(), skipped_max_bottom = -1;
 while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) {
 if (part == seed) {
@@ -1206,8 +1212,8 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *
 }

 bool EquationDetect::IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const {
-const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
-const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
+const int kXGapTh = static_cast<int>(round(0.25f * resolution_));
+const int kYGapTh = static_cast<int>(round(0.05f * resolution_));

 // Check geometric feature.
 if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) {
@@ -1244,7 +1250,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
 // Iterate over part_grid_, and find all parts that are text type but not
 // equation type.
 ColPartition *part = nullptr;
-GenericVector<ColPartition *> text_parts;
+std::vector<ColPartition *> text_parts;
 ColPartitionGridSearch gsearch(part_grid_);
 gsearch.StartFullSearch();
 while ((part = gsearch.NextFullSearch()) != nullptr) {
@@ -1257,12 +1263,12 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
 }

 // Compute the medium height of the text_parts.
-text_parts.sort(&SortCPByHeight);
+std::sort(text_parts.begin(), text_parts.end(), &SortCPByHeight);
 const TBOX &text_box = text_parts[text_parts.size() / 2]->bounding_box();
 int med_height = text_box.height();
 if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
 const TBOX &text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box();
-med_height = static_cast<int>(roundf(0.5 * (text_box.height() + med_height)));
+med_height = static_cast<int>(round(0.5f * (text_box.height() + med_height)));
 }

 // Iterate every text_parts and check if it is a math block satellite.
@@ -1271,7 +1277,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
 if (text_box.height() > med_height) {
 continue;
 }
-GenericVector<ColPartition *> math_blocks;
+std::vector<ColPartition *> math_blocks;
 if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
 continue;
 }
@@ -1288,7 +1294,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() {
 }

 bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
-GenericVector<ColPartition *> *math_blocks) {
+std::vector<ColPartition *> *math_blocks) {
 ASSERT_HOST(part != nullptr && math_blocks != nullptr);
 math_blocks->clear();
 const TBOX &part_box(part->bounding_box());
@@ -1344,7 +1350,7 @@ bool EquationDetect::IsMathBlockSatellite(ColPartition *part,
 ColPartition *EquationDetect::SearchNNVertical(const bool search_bottom, const ColPartition *part) {
 ASSERT_HOST(part);
 ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr;
-const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
+const int kYGapTh = static_cast<int>(round(resolution_ * 0.5f));

 ColPartitionGridSearch search(part_grid_);
 search.SetUniqueMode(true);
@@ -1379,7 +1385,7 @@ bool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *nei
 if (!neighbor) {
 return false;
 }
-const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
+const int kYGapTh = static_cast<int>(round(resolution_ * 0.1f));
 return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
 }

@@ -22,7 +22,6 @@
 #include <tesseract/unichar.h> // for UNICHAR_ID
 #include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText...
 #include "equationdetectbase.h" // for EquationDetectBase
-#include "genericvector.h" // for GenericVector
 #include "tesseractclass.h" // for Tesseract

 class TBOX;
@@ -86,7 +85,7 @@ protected:
 // parts_overlap. Note: this function may update the part_grid_, so if the
 // caller is also running ColPartitionGridSearch, use the RepositionIterator
 // to continue.
-void SearchByOverlap(ColPartition *seed, GenericVector<ColPartition *> *parts_overlap);
+void SearchByOverlap(ColPartition *seed, std::vector<ColPartition *> *parts_overlap);

 // Insert part back into part_grid_, after it absorbs some other parts.
 void InsertPartAfterAbsorb(ColPartition *part);
@@ -106,12 +105,12 @@ protected:
 // 1. If its left is aligned with any coordinates in indented_texts_left,
 // which we assume have been sorted.
 // 2. If its foreground density is over foreground_density_th.
-bool CheckForSeed2(const GenericVector<int> &indented_texts_left,
+bool CheckForSeed2(const std::vector<int> &indented_texts_left,
 const float foreground_density_th, ColPartition *part);

 // Count the number of values in sorted_vec that is close to val, used to
 // check if a partition is aligned with text partitions.
-int CountAlignment(const GenericVector<int> &sorted_vec, const int val) const;
+int CountAlignment(const std::vector<int> &sorted_vec, const int val) const;

 // Check for a seed candidate using the foreground pixel density. And we
 // return true if the density is below a certain threshold, because characters
@@ -120,14 +119,14 @@ protected:

 // A light version of SplitCPHor: instead of really doing the part split, we
 // simply compute the union bounding box of each split part.
-void SplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes);
+void SplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes);

 // Split the part (horizontally), and save the split result into
 // parts_splitted. Note that it is caller's responsibility to release the
 // memory owns by parts_splitted. On the other hand, the part is unchanged
 // during this process and still owns the blobs, so do NOT call DeleteBoxes
 // when freeing the colpartitions in parts_splitted.
-void SplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted);
+void SplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted);

 // Check the density for a seed candidate (part) using its math density and
 // italic density, returns true if the check passed.
@@ -167,9 +166,9 @@ protected:
 // merged with seed, remove them from part_grid_, and put them into
 // parts_to_merge.
 void ExpandSeedHorizontal(const bool search_left, ColPartition *seed,
-GenericVector<ColPartition *> *parts_to_merge);
+std::vector<ColPartition *> *parts_to_merge);
 void ExpandSeedVertical(const bool search_bottom, ColPartition *seed,
-GenericVector<ColPartition *> *parts_to_merge);
+std::vector<ColPartition *> *parts_to_merge);

 // Check if a part_box is the small neighbor of seed_box.
 bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const;
@@ -190,7 +189,7 @@ protected:

 // Check if part is the satellite of one/two math blocks. If it is, we return
 // true, and save the blocks into math_blocks.
-bool IsMathBlockSatellite(ColPartition *part, GenericVector<ColPartition *> *math_blocks);
+bool IsMathBlockSatellite(ColPartition *part, std::vector<ColPartition *> *math_blocks);

 // Search the nearest neighbor of part in one vertical direction as defined in
 // search_bottom. It returns the neighbor found that major x overlap with it,
@@ -237,7 +236,7 @@ protected:
 TBOX *cps_super_bbox_;

 // The seed ColPartition for equation region.
-GenericVector<ColPartition *> cp_seeds_;
+std::vector<ColPartition *> cp_seeds_;

 // The resolution (dpi) of the processing image.
 int resolution_;
@@ -18,7 +18,6 @@

 #include "paragraphs.h"

-#include "genericvector.h" // for GenericVector, GenericVectorEqEq
 #include "helpers.h" // for UpdateRange, ClipToRange
 #include "host.h" // for NearlyEqual
 #include "mutableiterator.h" // for MutableIterator
@@ -72,7 +71,7 @@ static int Epsilon(int space_pix) {
 }

 static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
-const GenericVector<RowScratchRegisters> *rows, int row_start,
+const std::vector<RowScratchRegisters> *rows, int row_start,
 int row_end) {
 if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
 tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", row_start, row_end,
@@ -134,7 +133,7 @@ static std::string RtlEmbed(const std::string &word, bool rtlify) {

 // Print the current thoughts of the paragraph detector.
 static void PrintDetectorState(const ParagraphTheory &theory,
-const GenericVector<RowScratchRegisters> &rows) {
+const std::vector<RowScratchRegisters> &rows) {
 std::vector<std::vector<std::string>> output;
 output.push_back(std::vector<std::string>());
 output.back().push_back("#row");
@@ -173,7 +172,7 @@ static void PrintDetectorState(const ParagraphTheory &theory,
 }

 static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
-const GenericVector<RowScratchRegisters> &rows) {
+const std::vector<RowScratchRegisters> &rows) {
 if (!should_print)
 return;
 tprintf("# %s\n", phase);
@@ -181,7 +180,7 @@ static void DebugDump(bool should_print, const char *phase, const ParagraphTheor
 }

 // Print out the text for rows[row_start, row_end)
-static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows, int row_start,
+static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
 int row_end) {
 tprintf("======================================\n");
 for (int row = row_start; row < row_end; row++) {
@@ -398,6 +397,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
 return pos == werd->length();
 }

+template<class T>
+void push_back_new(std::vector<T> &vector, const T &data) {
+if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
+vector.push_back(data);
+}
+}
+
 // ========= Brain Dead Language Model (combined entry points) ================

 // Given the leftmost word of a line either as a Tesseract unicharset + werd
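The template above is the commit's replacement for GenericVector::push_back_new(), which appended a value only if it was not already present; the remaining hunks in this file switch call sites from member syntax to this free function (and from contains()/get_index() to std::find plus a contains() helper). A standalone usage sketch:

#include <algorithm>
#include <vector>

template <class T>
void push_back_new(std::vector<T> &vector, const T &data) {
  // Append only if an equal element is not already present (linear scan).
  if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
    vector.push_back(data);
  }
}

int main() {
  std::vector<int> models;
  push_back_new(models, 7);
  push_back_new(models, 7); // duplicate: ignored
  return models.size() == 1 ? 0 : 1;
}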
@@ -581,7 +587,7 @@ void RowScratchRegisters::SetStartLine() {
 tprintf("Trying to set a line to be START when it's already BODY.\n");
 }
 if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
-hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr));
+push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
 }
 }

@@ -591,42 +597,44 @@ void RowScratchRegisters::SetBodyLine() {
 tprintf("Trying to set a line to be BODY when it's already START.\n");
 }
 if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
-hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr));
+push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
 }
 }

 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
-hypotheses_.push_back_new(LineHypothesis(LT_START, model));
-int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr));
-if (old_idx >= 0)
-hypotheses_.remove(old_idx);
+push_back_new(hypotheses_, LineHypothesis(LT_START, model));
+auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
+if (found != hypotheses_.end()) {
+hypotheses_.erase(found);
+}
 }

 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
-hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
-int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr));
-if (old_idx >= 0)
-hypotheses_.remove(old_idx);
+push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
+auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
+if (found != hypotheses_.end()) {
+hypotheses_.erase(found);
+}
 }

 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
 for (int h = 0; h < hypotheses_.size(); h++) {
 if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
-models->push_back_new(hypotheses_[h].model);
+push_back_new(*models, hypotheses_[h].model);
 }
 }

 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
 for (int h = 0; h < hypotheses_.size(); h++) {
 if (StrongModel(hypotheses_[h].model))
-models->push_back_new(hypotheses_[h].model);
+push_back_new(*models, hypotheses_[h].model);
 }
 }

 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
 for (int h = 0; h < hypotheses_.size(); h++) {
 if (hypotheses_[h].model != nullptr)
-models->push_back_new(hypotheses_[h].model);
+push_back_new(*models, hypotheses_[h].model);
 }
 }

@@ -647,8 +655,8 @@ void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models
 if (models.empty())
 return;
 for (int h = hypotheses_.size() - 1; h >= 0; h--) {
-if (!models.contains(hypotheses_[h].model)) {
-hypotheses_.remove(h);
+if (!contains(models, hypotheses_[h].model)) {
+hypotheses_.erase(hypotheses_.begin() + h);
 }
 }
 }
@@ -672,15 +680,15 @@ public:
 int size() const {
 return values_.size();
 }
-void GetClusters(GenericVector<Cluster> *clusters);
+void GetClusters(std::vector<Cluster> *clusters);

 private:
 int max_cluster_width_;
-GenericVector<int> values_;
+std::vector<int> values_;
 };

 // Return the index of the cluster closest to value.
-static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
+static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
 int best_index = 0;
 for (int i = 0; i < clusters.size(); i++) {
 if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center))
@@ -689,9 +697,9 @@ static int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
 return best_index;
 }

-void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
+void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
 clusters->clear();
-values_.sort();
+std::sort(values_.begin(), values_.end());
 for (int i = 0; i < values_.size();) {
 int orig_i = i;
 int lo = values_[i];
@@ -705,16 +713,16 @@ void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {

 // Calculate left- and right-indent tab stop values seen in
 // rows[row_start, row_end) given a tolerance of tolerance.
-static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
-int tolerance, GenericVector<Cluster> *left_tabs,
-GenericVector<Cluster> *right_tabs) {
+static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
+int tolerance, std::vector<Cluster> *left_tabs,
+std::vector<Cluster> *right_tabs) {
 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
 return;
 // First pass: toss all left and right indents into clusterers.
 SimpleClusterer initial_lefts(tolerance);
 SimpleClusterer initial_rights(tolerance);
-GenericVector<Cluster> initial_left_tabs;
-GenericVector<Cluster> initial_right_tabs;
+std::vector<Cluster> initial_left_tabs;
+std::vector<Cluster> initial_right_tabs;
 for (int i = row_start; i < row_end; i++) {
 initial_lefts.Add((*rows)[i].lindent_);
 initial_rights.Add((*rows)[i].rindent_);
@@ -782,7 +790,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
 }
 }
 if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
-left_tabs->remove(to_prune);
+left_tabs->erase(left_tabs->begin() + to_prune);
 }
 }
 if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
@@ -793,7 +801,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
 }
 }
 if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
-right_tabs->remove(to_prune);
+right_tabs->erase(right_tabs->begin() + to_prune);
 }
 }
 }
@@ -817,7 +825,7 @@ static void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, int row_
 // Case 2b: Fully Justified. (eop_threshold > 0)
 // We mark a line as short (end of paragraph) if the offside indent
 // is greater than eop_threshold.
-static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
+static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
 const ParagraphModel *model, bool ltr, int eop_threshold) {
 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
 return;
@@ -861,7 +869,7 @@ static void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, int row_
 // Further, this struct holds the data we amass for the (single) ParagraphModel
 // we'll assign to the text lines (assuming we get that far).
 struct GeometricClassifierState {
-GeometricClassifierState(int dbg_level, GenericVector<RowScratchRegisters> *r, int r_start,
+GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
 int r_end)
 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
 tolerance = InterwordSpace(*r, r_start, r_end);
@@ -886,7 +894,7 @@ struct GeometricClassifierState {
 }

 // Align tabs are the tab stops the text is aligned to.
-const GenericVector<Cluster> &AlignTabs() const {
+const std::vector<Cluster> &AlignTabs() const {
 if (just == tesseract::JUSTIFICATION_RIGHT)
 return right_tabs;
 return left_tabs;
@@ -897,7 +905,7 @@ struct GeometricClassifierState {
 // Note that for a left-to-right text which is aligned to the right such as
 // this function comment, the offside tabs are the horizontal tab stops
 // marking the beginning of ("Note", "this" and "marking").
-const GenericVector<Cluster> &OffsideTabs() const {
+const std::vector<Cluster> &OffsideTabs() const {
 if (just == tesseract::JUSTIFICATION_RIGHT)
 return left_tabs;
 return right_tabs;
@@ -940,7 +948,7 @@ struct GeometricClassifierState {

 // The Geometric Classifier was asked to find a single paragraph model
 // to fit the text rows (*rows)[row_start, row_end)
-GenericVector<RowScratchRegisters> *rows;
+std::vector<RowScratchRegisters> *rows;
 int row_start = 0;
 int row_end = 0;

@@ -953,8 +961,8 @@ struct GeometricClassifierState {

 // These left and right tab stops were determined to be the common tab
 // stops for the given text.
-GenericVector<Cluster> left_tabs;
-GenericVector<Cluster> right_tabs;
+std::vector<Cluster> left_tabs;
+std::vector<Cluster> right_tabs;

 // These are parameters we must determine to create a ParagraphModel.
 tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN;
@@ -1083,7 +1091,7 @@ static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricCla
 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
 // it's worth guessing that (A1b) is the correct interpretation if there are
 // far more "full" lines than "short" lines.
-static void GeometricClassify(int debug_level, GenericVector<RowScratchRegisters> *rows,
+static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
 int row_start, int row_end, ParagraphTheory *theory) {
 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
 return;
@@ -1223,7 +1231,7 @@ const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
 }
 auto *m = new ParagraphModel(model);
 models_->push_back(m);
-models_we_added_.push_back_new(m);
+push_back_new(models_we_added_, m);
 return m;
 }

@@ -1231,7 +1239,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
 size_t w = 0;
 for (size_t r = 0; r < models_->size(); r++) {
 ParagraphModel *m = (*models_)[r];
-if (!used_models.contains(m) && models_we_added_.contains(m)) {
+if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
 delete m;
 } else {
 if (r > w) {
@@ -1246,7 +1254,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
 // Examine rows[start, end) and try to determine if an existing non-centered
 // paragraph model would fit them perfectly. If so, return a pointer to it.
 // If not, return nullptr.
-const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegisters> *rows,
+const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
 int start, int end) const {
 for (const auto *model : *models_) {
 if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model))
@@ -1258,7 +1266,7 @@ const ParagraphModel *ParagraphTheory::Fits(const GenericVector<RowScratchRegist
 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
 for (const auto *model : *models_) {
 if (model->justification() != JUSTIFICATION_CENTER)
-models->push_back_new(model);
+push_back_new(*models, model);
 }
 }

@@ -1272,7 +1280,7 @@ int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
 return -1;
 }

-bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
+bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
 const ParagraphModel *model) {
 if (!StrongModel(model)) {
 tprintf("ValidFirstLine() should only be called with strong models!\n");
@@ -1281,7 +1289,7 @@ bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
 (*rows)[row].rindent_, (*rows)[row].rmargin_);
 }

-bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
+bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
 const ParagraphModel *model) {
 if (!StrongModel(model)) {
 tprintf("ValidBodyLine() should only be called with strong models!\n");
@@ -1290,7 +1298,7 @@ bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
 (*rows)[row].rindent_, (*rows)[row].rmargin_);
 }

-bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b,
+bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
 const ParagraphModel *model) {
 if (model != kCrownRight && model != kCrownLeft) {
 tprintf("CrownCompatible() should only be called with crown models!\n");
@@ -1308,7 +1316,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int

 // =============== Implementation of ParagraphModelSmearer ====================

-ParagraphModelSmearer::ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
+ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
 int row_start, int row_end, ParagraphTheory *theory)
 : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
@@ -1341,7 +1349,7 @@ void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
 // This is basic filtering; we check likely paragraph starty-ness down
 // below in Smear() -- you know, whether the first word would have fit
 // and such.
-still_open.push_back_new(opened[m]);
+push_back_new(still_open, opened[m]);
 }
 }
 OpenModels(row + 1) = still_open;
@@ -1449,7 +1457,7 @@ void ParagraphModelSmearer::Smear() {

 // Find out what ParagraphModels are actually used, and discard any
 // that are not.
-static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
+static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
 ParagraphTheory *theory) {
 SetOfModels used_models;
 for (int i = 0; i < rows.size(); i++) {
@@ -1483,7 +1491,7 @@ static void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
 // sequences of body lines of equivalent type abutted against the beginning
 // or a body or start line of a different type into a crown paragraph.
 static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
-GenericVector<RowScratchRegisters> *rows) {
+std::vector<RowScratchRegisters> *rows) {
 int start;
 for (int end = rows->size(); end > 0; end = start) {
 // Search back for a body line of a unique type.
@@ -1546,7 +1554,7 @@ static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
 // really just ignore it as an outlier. To express this, we allow the
 // user to specify the percentile (0..100) of indent values to use as
 // the common margin for each row in the run of rows[start, end).
-void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start,
+void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
 int end, int percentile) {
 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
 return;
@@ -1585,7 +1593,7 @@ void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows
 }

 // Return the median inter-word space in rows[row_start, row_end).
-int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end) {
+int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
 if (row_end < row_start + 1)
 return 1;
 int word_height =
@@ -1666,7 +1674,7 @@ static bool LikelyParagraphStart(const RowScratchRegisters &before,
 // If the rows given could be a consistent start to a paragraph, set *consistent
 // true.
 static ParagraphModel InternalParagraphModelByOutline(
-const GenericVector<RowScratchRegisters> *rows, int start, int end, int tolerance,
+const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
 bool *consistent) {
 int ltr_line_count = 0;
 for (int i = start; i < end; i++) {
@@ -1763,7 +1771,7 @@ static ParagraphModel InternalParagraphModelByOutline(
 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
 // output if we're debugging.
 static ParagraphModel ParagraphModelByOutline(int debug_level,
-const GenericVector<RowScratchRegisters> *rows,
+const std::vector<RowScratchRegisters> *rows,
 int start, int end, int tolerance) {
 bool unused_consistent;
 ParagraphModel retval =
@@ -1776,7 +1784,7 @@ static ParagraphModel ParagraphModelByOutline(int debug_level,
 }

 // Do rows[start, end) form a single instance of the given paragraph model?
-bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end,
+bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
 const ParagraphModel *model) {
 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
 return false;
@@ -1800,7 +1808,7 @@ bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int
 // We only take the very strongest signals, as we don't want to get
 // confused and marking up centered text, poetry, or source code as
 // clearly part of a typical paragraph.
-static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row_start,
+static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
 int row_end) {
 // Record patently obvious body text.
 for (int i = row_start + 1; i < row_end; i++) {
@@ -1862,7 +1870,7 @@ static void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, int row
 // Look for sequences of a start line followed by some body lines in
 // rows[row_start, row_end) and create ParagraphModels for them if
 // they seem coherent.
-static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegisters> *rows,
+static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
 int row_start, int row_end, bool allow_flush_models,
 ParagraphTheory *theory) {
 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
@@ -1951,7 +1959,7 @@ static void ModelStrongEvidence(int debug_level, GenericVector<RowScratchRegiste
 // clues.
 // (3) Form models for any sequence of start + continuation lines.
 // (4) Smear the paragraph models to cover surrounding text.
-static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegisters> *rows,
+static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
 int row_start, int row_end, ParagraphTheory *theory) {
 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
 return;
@@ -1979,7 +1987,7 @@ static void StrongEvidenceClassify(int debug_level, GenericVector<RowScratchRegi
 smearer.Smear();
 }

-static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows, int row_start,
+static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
 int row_end, ParagraphTheory *theory) {
 for (int i = row_start + 1; i < row_end - 1; i++) {
 if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
@@ -1994,8 +2002,8 @@ static void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
 // Collect sequences of unique hypotheses in row registers and create proper
 // paragraphs for them, referencing the paragraphs in row_owners.
 static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
-GenericVector<RowScratchRegisters> &rows,
-GenericVector<PARA *> *row_owners,
+std::vector<RowScratchRegisters> &rows,
+std::vector<PARA *> *row_owners,
 ParagraphTheory *theory) {
 int end = rows.size();
 int start;
@@ -2090,7 +2098,7 @@ struct Interval {
 // (1) If a line is surrounded by lines of unknown type, it's weak.
 // (2) If two lines in a row are start lines for a given paragraph type, but
 // after that the same paragraph type does not continue, they're weak.
-static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
+static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
|
||||
SetOfModels row_models;
|
||||
rows[row].StrongHypotheses(&row_models);
|
||||
|
||||
@ -2145,8 +2153,8 @@ static bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int ro
|
||||
// + Crown paragraphs not immediately followed by a strongly modeled line.
|
||||
// + Single line paragraphs surrounded by text that doesn't match the
|
||||
// model.
|
||||
static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
|
||||
GenericVector<Interval> *to_fix, int row_start, int row_end) {
|
||||
static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
|
||||
std::vector<Interval> *to_fix, int row_start, int row_end) {
|
||||
to_fix->clear();
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
bool needs_fixing = false;
|
||||
@ -2195,8 +2203,8 @@ static void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
|
||||
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
|
||||
// normalize each row_owner to point to an actual PARA, and output the
|
||||
// paragraphs in order onto paragraphs.
|
||||
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs) {
|
||||
GenericVector<PARA *> &rows = *row_owners;
|
||||
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
|
||||
std::vector<PARA *> &rows = *row_owners;
|
||||
paragraphs->clear();
|
||||
PARA_IT out(paragraphs);
|
||||
PARA *formerly_null = nullptr;
|
||||
@ -2226,16 +2234,16 @@ void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *
|
||||
// models - the list of paragraph models referenced by the PARA objects.
|
||||
// caller is responsible for deleting the models.
|
||||
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs,
|
||||
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
|
||||
std::vector<ParagraphModel *> *models) {
|
||||
GenericVector<RowScratchRegisters> rows;
|
||||
std::vector<RowScratchRegisters> rows;
|
||||
ParagraphTheory theory(models);
|
||||
|
||||
// Initialize row_owners to be a bunch of nullptr pointers.
|
||||
row_owners->init_to_size(row_infos->size(), nullptr);
|
||||
row_owners->resize(row_infos->size());
|
||||
|
||||
// Set up row scratch registers for the main algorithm.
|
||||
rows.init_to_size(row_infos->size(), RowScratchRegisters());
|
||||
rows.resize(row_infos->size(), RowScratchRegisters());
|
||||
for (int i = 0; i < row_infos->size(); i++) {
|
||||
rows[i].Init((*row_infos)[i]);
|
||||
}
|
||||
@ -2249,7 +2257,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
|
||||
DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
|
||||
|
||||
GenericVector<Interval> leftovers;
|
||||
std::vector<Interval> leftovers;
|
||||
LeftoverSegments(rows, &leftovers, 0, rows.size());
|
||||
for (int i = 0; i < leftovers.size(); i++) {
|
||||
// Pass 2a:
|
||||
@ -2263,7 +2271,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
// If we had any luck in pass 2a, we got part of the page and didn't
|
||||
// know how to classify a few runs of rows. Take the segments that
|
||||
// didn't find a model and reprocess them individually.
|
||||
GenericVector<Interval> leftovers2;
|
||||
std::vector<Interval> leftovers2;
|
||||
LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
|
||||
bool pass2a_was_useful =
|
||||
leftovers2.size() > 1 ||
|
||||
@ -2422,7 +2430,7 @@ static void InitializeRowInfo(bool after_recognition, const MutableIterator &it,
|
||||
}
|
||||
|
||||
PAGE_RES_IT page_res_it = *it.PageResIt();
|
||||
GenericVector<WERD_RES *> werds;
|
||||
std::vector<WERD_RES *> werds;
|
||||
WERD_RES *word_res = page_res_it.restart_row();
|
||||
ROW_RES *this_row = page_res_it.row();
|
||||
int num_leaders = 0;
|
||||
@ -2505,12 +2513,12 @@ void DetectParagraphs(int debug_level, bool after_text_recognition,
|
||||
}
|
||||
|
||||
// Run the paragraph detection algorithm.
|
||||
GenericVector<PARA *> row_owners;
|
||||
GenericVector<PARA *> the_paragraphs;
|
||||
std::vector<PARA *> row_owners;
|
||||
std::vector<PARA *> the_paragraphs;
|
||||
if (!is_image_block) {
|
||||
DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
|
||||
} else {
|
||||
row_owners.init_to_size(row_infos.size(), nullptr);
|
||||
row_owners.resize(row_infos.size());
|
||||
CanonicalizeDetectionResults(&row_owners, block->para_list());
|
||||
}
|
||||
|
||||
|
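One subtlety in the DetectParagraphs hunks above: GenericVector::init_to_size(n, value) fills every slot with the value, while std::vector::resize(n) only value-initializes slots beyond the current size. The swap is safe here because the vectors are freshly constructed, so every slot is new and a PARA * value-initializes to nullptr. A minimal sketch of the distinction (names are illustrative, not from the commit):

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int *> owners;  // freshly constructed, size 0
      owners.resize(4);           // new slots are value-initialized to nullptr
      assert(owners[0] == nullptr);

      int x = 0;
      owners[0] = &x;
      owners.resize(4);           // same size: existing contents are NOT reset
      assert(owners[0] == &x);

      owners.assign(4, nullptr);  // assign() is the exact init_to_size equivalent
      assert(owners[0] == nullptr);
      return 0;
    }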
@ -31,9 +31,6 @@ class ParagraphModel;
class PARA_LIST;
struct PARA;

template <typename T>
class GenericVector;

// This structure captures all information needed about a text line for the
// purposes of paragraph detection. It is meant to be exceedingly light-weight
// so that we can easily test paragraph detection independent of the rest of
@ -90,7 +87,7 @@ public:
// caller is responsible for deleting the models.
TESS_API
void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
std::vector<ParagraphModel *> *models);

// Given a MutableIterator to the start of a block, run DetectParagraphs on

@ -95,7 +95,7 @@ struct LineHypothesis {

class ParagraphTheory; // Forward Declaration

using SetOfModels = GenericVector<const ParagraphModel *>;
using SetOfModels = std::vector<const ParagraphModel *>;

// Row Scratch Registers are data generated by the paragraph detection
// algorithm based on a RowInfo input.
@ -123,7 +123,7 @@ public:

// Clear all hypotheses about this line.
void SetUnknown() {
hypotheses_.truncate(0);
hypotheses_.clear();
}

// Append all hypotheses of strong models that match this row as a start.
@ -190,7 +190,7 @@ public:

private:
// Hypotheses of either LT_START or LT_BODY
GenericVector<LineHypothesis> hypotheses_;
std::vector<LineHypothesis> hypotheses_;
};

// A collection of convenience functions for wrapping the set of
@ -219,21 +219,21 @@ public:

// If any of the non-centered paragraph models we know about fit
// rows[start, end), return it. Else nullptr.
const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows, int start,
const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
int end) const;

int IndexOf(const ParagraphModel *model) const;

private:
std::vector<ParagraphModel *> *models_;
GenericVector<ParagraphModel *> models_we_added_;
std::vector<ParagraphModel *> models_we_added_;
};

bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, int row,
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model);
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, int row,
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
const ParagraphModel *model);
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int b,
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
const ParagraphModel *model);

// A class for smearing Paragraph Model hypotheses to surrounding rows.
@ -245,7 +245,7 @@ bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, int a, int
// "smear" our models over the text.
class ParagraphModelSmearer {
public:
ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows, int row_start, int row_end,
ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
ParagraphTheory *theory);

// Smear forward paragraph models from existing row markings to subsequent
@ -266,7 +266,7 @@ private:
}

ParagraphTheory *theory_;
GenericVector<RowScratchRegisters> *rows_;
std::vector<RowScratchRegisters> *rows_;
int row_start_;
int row_end_;

@ -284,11 +284,11 @@ private:
// Clear all hypotheses about lines [start, end) and reset the margins to the
// percentile (0..100) value of the left and right row edges for this run of
// rows.
void RecomputeMarginsAndClearHypotheses(GenericVector<RowScratchRegisters> *rows, int start,
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
int end, int percentile);

// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, int row_start, int row_end);
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);

// Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read).
@ -300,13 +300,13 @@ bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRe
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);

// Do rows[start, end) form a single instance of the given paragraph model?
bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, int start, int end,
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
const ParagraphModel *model);

// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs.
void CanonicalizeDetectionResults(GenericVector<PARA *> *row_owners, PARA_LIST *paragraphs);
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);

} // namespace tesseract
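Taken together, the changes in these two headers and the .cpp above follow a small set of mechanical rewrites. A quick cheat sheet of the mappings used throughout this commit (the vector contents are placeholders):

    #include <vector>

    void migration_cheatsheet() {
      std::vector<int> v = {1, 2, 3};
      std::vector<int> other = {4, 5};

      v.clear();                                      // was: v.truncate(0);
      v = {1, 2, 3};
      v.resize(2);                                    // was: v.truncate(2); (shrink)
      v.insert(v.begin() + 1, 42);                    // was: v.insert(42, 1);
      v.erase(v.begin() + 1);                         // was: v.remove(1);
      v.insert(v.end(), other.begin(), other.end());  // was: v += other;
    }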
@ -45,7 +45,7 @@

#include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
#include <tesseract/unichar.h> // for UNICHAR_ID
#include "genericvector.h" // for GenericVector, PointerVector
#include "genericvector.h" // for PointerVector

#include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe...

@ -398,27 +398,27 @@ public:
// Input: a set of noisy outlines that probably belong to the real_word.
// Output: outlines that overlapped blobs are set to nullptr and put back into
// the word, either in the blobs or in the reject list.
void AssignDiacriticsToOverlappingBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
void AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted,
GenericVector<bool> *overlapped_any_blob,
GenericVector<C_BLOB *> *target_blobs);
std::vector<bool> *word_wanted,
std::vector<bool> *overlapped_any_blob,
std::vector<C_BLOB *> *target_blobs);
// Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them.
void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE *> &outlines, int pass,
void AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
WERD *real_word, PAGE_RES_IT *pr_it,
GenericVector<bool> *word_wanted,
GenericVector<C_BLOB *> *target_blobs);
std::vector<bool> *word_wanted,
std::vector<C_BLOB *> *target_blobs);
// Starting with ok_outlines set to indicate which outlines overlap the blob,
// chooses the optimal set (approximately) and returns true if any outlines
// are desired, in which case ok_outlines indicates which ones.
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
C_BLOB *blob, const GenericVector<C_OUTLINE *> &outlines,
C_BLOB *blob, const std::vector<C_OUTLINE *> &outlines,
int num_outlines, std::vector<bool> *ok_outlines);
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
float ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
const GenericVector<C_OUTLINE *> &outlines, int pass_n,
const std::vector<C_OUTLINE *> &outlines, int pass_n,
PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str);
// Classifies the given blob (part of word_data->word->word) as an individual
// word, using languages, chopper etc, returning only the certainty of the
@ -703,24 +703,24 @@ public:
void ReSegmentByClassification(PAGE_RES *page_res);
// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
// Returns false if an invalid UNICHAR_ID is encountered.
bool ConvertStringToUnichars(const char *utf8, GenericVector<UNICHAR_ID> *class_ids);
bool ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids);
// Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails.
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
// substitutions ARE used.
bool FindSegmentation(const GenericVector<UNICHAR_ID> &target_text, WERD_RES *word_res);
bool FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res);
// Recursive helper to find a match to the target_text (from text_index
// position) in the choices (from choices_pos position).
// Choices is an array of GenericVectors, of length choices_length, with each
// Choices is an array of vectors of length choices_length, with each
// element representing a starting position in the word, and the
// GenericVector holding classification results for a sequence of consecutive
// vector holding classification results for a sequence of consecutive
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
void SearchForText(const GenericVector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const GenericVector<UNICHAR_ID> &target_text,
int text_index, float rating, GenericVector<int> *segmentation,
float *best_rating, GenericVector<int> *best_segmentation);
void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
int choices_length, const std::vector<UNICHAR_ID> &target_text,
int text_index, float rating, std::vector<int> *segmentation,
float *best_rating, std::vector<int> *best_segmentation);
// Counts up the labelled words and the blobs within.
// Deletes all unused or emptied words, counting the unused ones.
// Resets W_BOL and W_EOL flags correctly.
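The SearchForText comment describes a ragged table: choices[i] holds results for blob runs starting at blob i, and inner index k covers k + 1 consecutive blobs. A hedged sketch of the recursion shape it implies, using int as a stand-in for BLOB_CHOICE_LIST * (the helper name is hypothetical):

    #include <cstddef>
    #include <vector>

    // Counts how many complete segmentations of a word exist, mirroring the way
    // SearchForText advances: consuming k + 1 blobs resumes at pos + k + 1.
    inline std::size_t CountSegmentations(const std::vector<std::vector<int>> &choices,
                                          std::size_t pos = 0) {
      if (pos >= choices.size()) {
        return 1;  // consumed every blob: one complete segmentation
      }
      std::size_t total = 0;
      for (std::size_t k = 0; k < choices[pos].size(); ++k) {
        total += CountSegmentations(choices, pos + k + 1);
      }
      return total;
    }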
@ -183,7 +183,7 @@ void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
chopped2->blobs.push_back(chopped->blobs[i]);
}
chopped->blobs.truncate(split_pt);
chopped->blobs.resize(split_pt);
word->chopped_word = nullptr;
delete word2->chopped_word;
word2->chopped_word = nullptr;
@ -223,8 +223,8 @@ void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_b
TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
// Tack the word2 outputs onto the end of the word outputs.
word->chopped_word->blobs += word2->chopped_word->blobs;
word->rebuild_word->blobs += word2->rebuild_word->blobs;
word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end());
word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end());
word2->chopped_word->blobs.clear();
word2->rebuild_word->blobs.clear();
TPOINT split_pt;
@ -234,17 +234,17 @@ void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_b
// Since the seam list is one element short, an empty seam marking the
// end of the last blob in the first word is needed first.
word->seam_array.push_back(new SEAM(0.0f, split_pt));
word->seam_array += word2->seam_array;
word2->seam_array.truncate(0);
word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end());
word2->seam_array.clear();
// Fix widths and gaps.
word->blob_widths += word2->blob_widths;
word->blob_gaps += word2->blob_gaps;
word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end());
word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end());
// Fix the ratings matrix.
int rat1 = word->ratings->dimension();
int rat2 = word2->ratings->dimension();
word->ratings->AttachOnCorner(word2->ratings);
ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
word->best_state += word2->best_state;
word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end());
// Append the word choices.
*word->raw_choice += *word2->raw_choice;
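The operator+= concatenations in join_words become the standard append idiom shown above. Since word2 surrenders its elements anyway (note the clear() calls that follow each append), a move-iterator variant would be equally valid for element types with expensive copies; a sketch under that assumption:

    #include <iterator>
    #include <string>
    #include <vector>

    void append_and_release(std::vector<std::string> &dst, std::vector<std::string> &src) {
      // Equivalent of the old dst += src; followed by src.truncate(0);
      dst.insert(dst.end(), std::make_move_iterator(src.begin()),
                 std::make_move_iterator(src.end()));
      src.clear();  // the source gives up its elements, as word2 does here
    }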
@ -826,7 +826,9 @@ void TWERD::CopyFrom(const TWERD &src) {

// Deletes owned data.
void TWERD::Clear() {
blobs.delete_data_pointers();
for (auto blob : blobs) {
delete blob;
}
blobs.clear();
}

@ -869,8 +871,9 @@ void TWERD::MergeBlobs(int start, int end) {
blobs[i] = nullptr;
}
// Remove dead blobs from the vector.
// TODO: optimize.
for (int i = start + 1; i < end && start + 1 < blobs.size(); ++i) {
blobs.remove(start + 1);
blobs.erase(blobs.begin() + start + 1);
}
}
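The rewritten MergeBlobs loop keeps the old one-erase-per-iteration behavior, which is quadratic (hence the TODO just above it). Since the merged slots were set to nullptr first, one plausible optimization is the erase-remove idiom, assuming nullptr marks only the dead slots; a sketch:

    #include <algorithm>
    #include <vector>

    template <typename Blob>
    void erase_dead_blobs(std::vector<Blob *> &blobs) {
      // One O(n) pass instead of repeated erase(): shift survivors left, then
      // chop off the tail.
      blobs.erase(std::remove(blobs.begin(), blobs.end(), nullptr), blobs.end());
    }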
@ -450,8 +450,8 @@ struct TWERD {

void plot(ScrollView *window);

GenericVector<TBLOB *> blobs; // Blobs in word.
bool latin_script; // This word is in a latin-based script.
std::vector<TBLOB *> blobs; // Blobs in word.
bool latin_script; // This word is in a latin-based script.
};

/*----------------------------------------------------------------------
@ -2,7 +2,6 @@
* File: linlsq.h (Formerly llsq.h)
* Description: Linear Least squares fitting code.
* Author: Ray Smith
* Created: Thu Sep 12 08:44:51 BST 1991
*
* (C) Copyright 1991, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
@ -22,13 +21,11 @@

#include "points.h" // for FCOORD

#include <algorithm> // for std::nth_element
#include <cstdint> // for int32_t

namespace tesseract {

template <typename T>
class GenericVector;

class TESS_API LLSQ {
public:
LLSQ() { // constructor
@ -111,29 +108,30 @@ private:
// An assumption is made that most of the values are spread over no more than
// half the range, but wrap-around is accounted for if the median is near
// the wrap-around point.
// Cannot be a member of GenericVector, as it makes heavy used of LLSQ.
// Cannot be a member of vector, as it makes heavy use of LLSQ.
// T must be an integer or float/double type.
template <typename T>
T MedianOfCircularValues(T modulus, GenericVector<T> *v) {
T MedianOfCircularValues(T modulus, std::vector<T> &v) {
LLSQ stats;
T halfrange = static_cast<T>(modulus / 2);
int num_elements = v->size();
for (int i = 0; i < num_elements; ++i) {
stats.add((*v)[i], (*v)[i] + halfrange);
auto num_elements = v.size();
for (auto i : v) {
stats.add(i, i + halfrange);
}
bool offset_needed = stats.y_variance() < stats.x_variance();
if (offset_needed) {
for (int i = 0; i < num_elements; ++i) {
(*v)[i] += halfrange;
for (auto &i : v) {
i += halfrange;
}
}
int median_index = v->choose_nth_item(num_elements / 2);
auto median_index = num_elements / 2;
std::nth_element(v.begin(), v.begin() + median_index, v.end());
if (offset_needed) {
for (int i = 0; i < num_elements; ++i) {
(*v)[i] -= halfrange;
for (auto &i : v) {
i -= halfrange;
}
}
return (*v)[median_index];
return v[median_index];
}

} // namespace tesseract
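For readers unfamiliar with the circular-median trick above: near the wrap-around point a plain median is meaningless (the median of angles 358 and 2 should be near 0, not 180), so the function shifts everything by half the modulus when the shifted data has lower variance. A usage sketch against the new reference-taking signature (assumes linlsq.h is included; note the offset loops must take their element by reference, as written above, or the shift would act on copies):

    #include <vector>

    void circular_median_demo() {
      std::vector<int> angles = {358, 359, 1, 2, 3};  // clustered around 0/360
      // With modulus 360 the wrap-around is detected; the result lands near 1
      // rather than near the naive mid-range value of ~180.
      int median = tesseract::MedianOfCircularValues(360, angles);
      (void)median;
    }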
@ -391,8 +391,8 @@ void WERD_RES::SetupBlamerBundle() {

// Computes the blob_widths and blob_gaps from the chopped_word.
void WERD_RES::SetupBlobWidthsAndGaps() {
blob_widths.truncate(0);
blob_gaps.truncate(0);
blob_widths.clear();
blob_gaps.clear();
int num_blobs = chopped_word->NumBlobs();
for (int b = 0; b < num_blobs; ++b) {
TBLOB *blob = chopped_word->blobs[b];
@ -410,7 +410,7 @@ void WERD_RES::SetupBlobWidthsAndGaps() {
void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {
// Insert the seam into the SEAMS array.
seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
seam_array.insert(seam, blob_number);
seam_array.insert(seam_array.begin() + blob_number, seam);
if (ratings != nullptr) {
// Expand the ratings matrix.
ratings = ratings->ConsumeAndMakeBigger(blob_number);
@ -753,13 +753,20 @@ void WERD_RES::ConsumeWordResults(WERD_RES *word) {
MovePointerData(&chopped_word, &word->chopped_word);
MovePointerData(&rebuild_word, &word->rebuild_word);
MovePointerData(&box_word, &word->box_word);
seam_array.delete_data_pointers();
for (auto data : seam_array) {
delete data;
}
seam_array = word->seam_array;
word->seam_array.clear();
best_state.move(&word->best_state);
correct_text.move(&word->correct_text);
blob_widths.move(&word->blob_widths);
blob_gaps.move(&word->blob_gaps);
// TODO: optimize moves.
best_state = word->best_state;
word->best_state.clear();
correct_text = word->correct_text;
word->correct_text.clear();
blob_widths = word->blob_widths;
word->blob_widths.clear();
blob_gaps = word->blob_gaps;
word->blob_gaps.clear();
if (ratings != nullptr)
ratings->delete_matrix_pointers();
MovePointerData(&ratings, &word->ratings);
@ -797,7 +804,7 @@ void WERD_RES::RebuildBestState() {
rebuild_word = new TWERD;
if (seam_array.empty())
start_seam_list(chopped_word, &seam_array);
best_state.truncate(0);
best_state.clear();
int start = 0;
for (int i = 0; i < best_choice->length(); ++i) {
int length = best_choice->state(i);
@ -873,7 +880,7 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE **choices) {
}
FakeWordFromRatings(TOP_CHOICE_PERM);
reject_map.initialise(blob_count);
best_state.init_to_size(blob_count, 1);
best_state.resize(blob_count, 1);
done = true;
}

@ -958,7 +965,7 @@ void WERD_RES::MergeAdjacentBlobs(int index) {
box_word->MergeBoxes(index, index + 2);
if (index + 1 < best_state.size()) {
best_state[index] += best_state[index + 1];
best_state.remove(index + 1);
best_state.erase(best_state.begin() + index + 1);
}
}

@ -1088,7 +1095,9 @@ void WERD_RES::ClearResults() {
box_word = nullptr;
best_state.clear();
correct_text.clear();
seam_array.delete_data_pointers();
for (auto data : seam_array) {
delete data;
}
seam_array.clear();
blob_widths.clear();
blob_gaps.clear();
@ -1204,7 +1213,7 @@ WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *ne
// are likely very poor, if they come from LSTM, where it only outputs the
// character at one pixel within it, so we find the midpoints between them.
static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
C_BLOB_LIST *next_word_blobs, GenericVector<int> *blob_ends) {
C_BLOB_LIST *next_word_blobs, std::vector<int> *blob_ends) {
C_BLOB_IT blob_it(word.word->cblob_list());
for (int i = 0; i < word.best_state.size(); ++i) {
int length = word.best_state[i];
@ -1341,7 +1350,7 @@ void PAGE_RES_IT::ReplaceCurrentWord(tesseract::PointerVector<WERD_RES> *words)
WERD_RES *word_w = (*words)[w];
clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
// Compute blob boundaries.
GenericVector<int> blob_ends;
std::vector<int> blob_ends;
C_BLOB_LIST *next_word_blobs =
w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
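The copy-then-clear pairs flagged `TODO: optimize moves` in ConsumeWordResults are exactly what vector move assignment expresses; a sketch of the likely follow-up, with a hypothetical helper name:

    #include <string>
    #include <utility>
    #include <vector>

    void consume_into(std::vector<std::string> &dst, std::vector<std::string> &src) {
      dst = std::move(src);  // steals the buffer: no per-element copies
      src.clear();           // moved-from state is valid but unspecified;
                             // clearing keeps the old copy + clear() contract
    }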
@ -31,7 +31,7 @@
#include "werd.h" // for WERD, W_BOL, W_EOL

#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
#include "genericvector.h" // for GenericVector, PointerVector (ptr only)
#include "genericvector.h" // for PointerVector (ptr only)

#include <sys/types.h> // for int8_t
#include <cstdint> // for int32_t, int16_t
@ -83,19 +83,19 @@ public:
// the next word. This pointer is not owned by PAGE_RES class.
WERD_CHOICE **prev_word_best_choice;
// Sums of blame reasons computed by the blamer.
GenericVector<int> blame_reasons;
std::vector<int> blame_reasons;
// Debug information about all the misadaptions on this page.
// Each BlamerBundle contains an index into this vector, so that words that
// caused misadaption could be marked. However, since words could be
// deleted/split/merged, the log is stored on the PAGE_RES level.
GenericVector<std::string> misadaption_log;
std::vector<std::string> misadaption_log;

inline void Init() {
char_count = 0;
rej_count = 0;
rejected = false;
prev_word_best_choice = nullptr;
blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
blame_reasons.resize(IRR_NUM_REASONS);
}

PAGE_RES() {
@ -207,12 +207,12 @@ public:
// The length of chopped_word matches length of seam_array + 1 (if set).
TWERD *chopped_word = nullptr; // BLN chopped fragments output.
// Vector of SEAM* holding chopping points matching chopped_word.
GenericVector<SEAM *> seam_array;
std::vector<SEAM *> seam_array;
// Widths of blobs in chopped_word.
GenericVector<int> blob_widths;
std::vector<int> blob_widths;
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
// blob i and blob i+1.
GenericVector<int> blob_gaps;
std::vector<int> blob_gaps;
// Stores the lstm choices of every timestep
std::vector<std::vector<std::pair<const char *, float>>> timesteps;
// Stores the lstm choices of every timestep segmented by character
@ -277,11 +277,11 @@ public:
// rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
// adjacent blobs in chopped_word. The seams in seam_array are hidden
// within a rebuild_word blob and revealed between them.
GenericVector<int> best_state; // Number of blobs in each best blob.
std::vector<int> best_state; // Number of blobs in each best blob.
// The correct_text is used during training and adaption to carry the
// text to the training system without the need for a unicharset. There
// is one entry in the vector for each blob in rebuild_word and box_word.
GenericVector<std::string> correct_text;
std::vector<std::string> correct_text;

// Less-well documented members.
// TODO(rays) Add more documentation here.

@ -19,6 +19,7 @@
#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_

#include <cstring> // for memset
#include <string>
#include <vector>
@ -51,8 +51,8 @@ bool SEAM::IsHealthy(const TBLOB &blob, int min_points, int min_area) const {
// seam, which is about to be inserted at insert_index. Returns false if
// any of the computations fails, as this indicates an invalid chop.
// widthn_/widthp_ are only changed if modify is true.
bool SEAM::PrepareToInsertSeam(const GenericVector<SEAM *> &seams,
const GenericVector<TBLOB *> &blobs, int insert_index, bool modify) {
bool SEAM::PrepareToInsertSeam(const std::vector<SEAM *> &seams,
const std::vector<TBLOB *> &blobs, int insert_index, bool modify) {
for (int s = 0; s < insert_index; ++s) {
if (!seams[s]->FindBlobWidth(blobs, s, modify))
return false;
@ -68,7 +68,7 @@ bool SEAM::PrepareToInsertSeam(const GenericVector<SEAM *> &seams,

// Computes the widthp_/widthn_ range. Returns false if not all the splits
// are accounted for. widthn_/widthp_ are only changed if modify is true.
bool SEAM::FindBlobWidth(const GenericVector<TBLOB *> &blobs, int index, bool modify) {
bool SEAM::FindBlobWidth(const std::vector<TBLOB *> &blobs, int index, bool modify) {
int num_found = 0;
if (modify) {
widthp_ = 0;
@ -147,7 +147,7 @@ void SEAM::Print(const char *label) const {

// Prints a collection of SEAMs.
/* static */
void SEAM::PrintSeams(const char *label, const GenericVector<SEAM *> &seams) {
void SEAM::PrintSeams(const char *label, const std::vector<SEAM *> &seams) {
if (!seams.empty()) {
tprintf("%s\n", label);
for (int x = 0; x < seams.size(); ++x) {
@ -169,7 +169,7 @@ void SEAM::Mark(ScrollView *window) const {
// Break up the blobs in this chain so that they are all independent.
// This operation should undo the effect of join_pieces.
/* static */
void SEAM::BreakPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
void SEAM::BreakPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last) {
for (int x = first; x < last; ++x)
seams[x]->Reveal();
@ -191,7 +191,7 @@ void SEAM::BreakPieces(const GenericVector<SEAM *> &seams, const GenericVector<T
// Join a group of base level pieces into a single blob that can then
// be classified.
/* static */
void SEAM::JoinPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
void SEAM::JoinPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last) {
TESSLINE *outline = blobs[first]->outlines;
if (!outline)
@ -245,8 +245,8 @@ float SEAM::FullPriority(int xmin, int xmax, double overlap_knob, int centered_m
* present in the starting segmentation. Each of the seams created
* by this routine have location information only.
*/
void start_seam_list(TWERD *word, GenericVector<SEAM *> *seam_array) {
seam_array->truncate(0);
void start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array) {
seam_array->clear();
TPOINT location;

for (int b = 1; b < word->NumBlobs(); ++b) {

@ -133,11 +133,11 @@ public:
// seam, which is about to be inserted at insert_index. Returns false if
// any of the computations fails, as this indicates an invalid chop.
// widthn_/widthp_ are only changed if modify is true.
bool PrepareToInsertSeam(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
bool PrepareToInsertSeam(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int insert_index, bool modify);
// Computes the widthp_/widthn_ range. Returns false if not all the splits
// are accounted for. widthn_/widthp_ are only changed if modify is true.
bool FindBlobWidth(const GenericVector<TBLOB *> &blobs, int index, bool modify);
bool FindBlobWidth(const std::vector<TBLOB *> &blobs, int index, bool modify);

// Splits this blob into two blobs by applying the splits included in
// *this SEAM
@ -149,7 +149,7 @@ public:
// Prints everything in *this SEAM.
void Print(const char *label) const;
// Prints a collection of SEAMs.
static void PrintSeams(const char *label, const GenericVector<SEAM *> &seams);
static void PrintSeams(const char *label, const std::vector<SEAM *> &seams);
#ifndef GRAPHICS_DISABLED
// Draws the seam in the given window.
void Mark(ScrollView *window) const;
@ -157,11 +157,11 @@ public:

// Break up the blobs in this chain so that they are all independent.
// This operation should undo the effect of join_pieces.
static void BreakPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
static void BreakPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last);
// Join a group of base level pieces into a single blob that can then
// be classified.
static void JoinPieces(const GenericVector<SEAM *> &seams, const GenericVector<TBLOB *> &blobs,
static void JoinPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
int first, int last);

// Hides the seam so the outlines appear not to be cut by it.
@ -193,7 +193,7 @@ private:
SPLIT splits_[kMaxNumSplits];
};

void start_seam_list(TWERD *word, GenericVector<SEAM *> *seam_array);
void start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array);

} // namespace tesseract
@ -462,13 +462,13 @@ static bool GatherPeak(int index, const int *src_buckets, int *used_buckets, int
// to sort on the output will re-sort by increasing mean of peak if that is
// more useful than decreasing total count.
// Returns the actual number of modes found.
int STATS::top_n_modes(int max_modes, GenericVector<KDPairInc<float, int>> *modes) const {
int STATS::top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const {
if (max_modes <= 0)
return 0;
int src_count = rangemax_ - rangemin_;
// Used copies the counts in buckets_ as they get used.
STATS used(rangemin_, rangemax_);
modes->truncate(0);
modes.clear();
// Total count of the smallest peak found so far.
int least_count = 1;
// Mode that is used as a seed for each peak
@ -502,21 +502,21 @@ int STATS::top_n_modes(int max_modes, GenericVector<KDPairInc<float, int>> *mode
&total_value))
break;
}
if (total_count > least_count || modes->size() < max_modes) {
if (total_count > least_count || modes.size() < max_modes) {
// We definitely want this mode, so if we have enough discard the least.
if (modes->size() == max_modes)
modes->truncate(max_modes - 1);
if (modes.size() == max_modes)
modes.resize(max_modes - 1);
int target_index = 0;
// Linear search for the target insertion point.
while (target_index < modes->size() && (*modes)[target_index].data() >= total_count)
while (target_index < modes.size() && modes[target_index].data() >= total_count)
++target_index;
auto peak_mean = static_cast<float>(total_value / total_count + rangemin_);
modes->insert(KDPairInc<float, int>(peak_mean, total_count), target_index);
least_count = modes->back().data();
modes.insert(modes.begin() + target_index, KDPairInc<float, int>(peak_mean, total_count));
least_count = modes.back().data();
}
}
} while (max_count > 0);
return modes->size();
return modes.size();
}

/**********************************************************************

@ -113,7 +113,7 @@ public:
// sort on the output will re-sort by increasing mean of peak if that is more
// useful than decreasing total count. Returns the actual number of modes
// found.
int top_n_modes(int max_modes, GenericVector<KDPairInc<float, int>> *modes) const;
int top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const;

// Prints a summary and table of the histogram.
void print() const;
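Because top_n_modes now takes its output vector by reference rather than by pointer, every caller changes shape as well; a hypothetical call site before and after (STATS and KDPairInc are the tesseract types from statistc.h and kdpair.h):

    #include <vector>

    void find_modes(const STATS &histogram) {
      std::vector<KDPairInc<float, int>> modes;
      // old call shape: int n = histogram.top_n_modes(3, &modes);
      int n = histogram.top_n_modes(3, modes);  // new: pass by reference
      (void)n;
    }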
@ -502,7 +502,7 @@ void WERD::CleanNoise(float size_threshold) {

// Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers.
void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) {
void WERD::GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines) {
C_BLOB_IT rej_it(&rej_cblobs);
for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
C_BLOB *blob = rej_it.extract();
@ -516,13 +516,13 @@ void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines) {
// back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// vector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool WERD::AddSelectedOutlines(const GenericVector<bool> &wanted,
const GenericVector<C_BLOB *> &target_blobs,
const GenericVector<C_OUTLINE *> &outlines,
bool WERD::AddSelectedOutlines(const std::vector<bool> &wanted,
const std::vector<C_BLOB *> &target_blobs,
const std::vector<C_OUTLINE *> &outlines,
bool *make_next_word_fuzzy) {
bool outline_added_to_start = false;
if (make_next_word_fuzzy != nullptr)

@ -21,7 +21,6 @@

#include "bits16.h"
#include "elst2.h"
#include "genericvector.h" // GenericVector
#include "params.h"
#include "stepblob.h"

@ -173,18 +172,18 @@ public:

// Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers.
void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
// Adds the selected outlines to the indicated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// vector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool AddSelectedOutlines(const GenericVector<bool> &wanted,
const GenericVector<C_BLOB *> &target_blobs,
const GenericVector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
bool AddSelectedOutlines(const std::vector<bool> &wanted,
const std::vector<C_BLOB *> &target_blobs,
const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);

private:
uint8_t blanks = 0; // no of blanks
@ -4,7 +4,6 @@
// File: genericheap.h
// Description: Template heap class.
// Author: Ray Smith, based on Dan Johnson's original code.
// Created: Wed Mar 14 08:13:00 PDT 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
@ -38,7 +37,7 @@ namespace tesseract {
// GenericHeap doesn't look inside it except for operator<.
//
// The heap is stored as a packed binary tree in an array hosted by a
// GenericVector<Pair>, with the invariant that the children of each node are
// vector<Pair>, with the invariant that the children of each node are
// both NOT Pair::operator< the parent node. KDPairInc defines Pair::operator<
// to use Key::operator< to generate a MIN heap and KDPairDec defines
// Pair::operator< to use Key::operator> to generate a MAX heap by reversing
@ -59,7 +58,7 @@ template <typename Pair>
class GenericHeap {
public:
GenericHeap() = default;
// The initial size is only a GenericVector::reserve. It is not enforced as
// The initial size is only a vector::reserve. It is not enforced as
// the size limit of the heap. Caller must implement their own enforcement.
explicit GenericHeap(int initial_size) {
heap_.reserve(initial_size);
@ -77,12 +76,12 @@ public:
}
void clear() {
// Clear truncates to 0 to keep the number reserved intact.
heap_.truncate(0);
heap_.clear();
}
// Provides access to the underlying vector.
// Caution! any changes that modify the keys will invalidate the heap!
GenericVector<Pair> *heap() {
return &heap_;
std::vector<Pair> &heap() {
return heap_;
}
// Provides read-only access to an element of the underlying vector.
const Pair &get(int index) const {
@ -128,11 +127,11 @@ public:
// Sift the hole at the start of the heap_ downwards to match the last
// element.
Pair hole_pair = heap_[new_size];
heap_.truncate(new_size);
heap_.resize(new_size);
int hole_index = SiftDown(0, hole_pair);
heap_[hole_index] = hole_pair;
} else {
heap_.truncate(new_size);
heap_.resize(new_size);
}
return true;
}
@ -154,7 +153,7 @@ public:
int hole_index = SiftUp(worst_index, hole_pair);
heap_[hole_index] = hole_pair;
}
heap_.truncate(heap_size);
heap_.resize(heap_size);
return true;
}

@ -179,7 +178,7 @@ public:
// The pointed-to Pair has changed its key value, so the location of pair
// is reshuffled to maintain the heap invariant.
// Must be a valid pointer to an element of the heap_!
// Caution! Since GenericHeap is based on GenericVector, reallocs may occur
// Caution! Since GenericHeap is based on vector, reallocs may occur
// whenever the vector is extended and elements may get shuffled by any
// Push or Pop operation. Therefore use this function only if Data in Pair is
// of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as
@ -235,7 +234,7 @@ private:
}

private:
GenericVector<Pair> heap_;
std::vector<Pair> heap_;
};

} // namespace tesseract
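All of the truncate-to-resize swaps in GenericHeap shrink the vector, where std::vector::resize simply drops the tail. The packed-binary-tree invariant the class documents is the same one the standard heap algorithms maintain over a plain std::vector; a sketch of the standard-library analogue of Push/Pop, for comparison only:

    #include <algorithm>
    #include <vector>

    void std_heap_analogue() {
      std::vector<int> heap;
      for (int v : {5, 1, 9, 3}) {
        heap.push_back(v);
        std::push_heap(heap.begin(), heap.end());  // sift the new element up
      }
      std::pop_heap(heap.begin(), heap.end());  // move the max to the back...
      heap.resize(heap.size() - 1);             // ...then shrink, as Pop() does
    }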
@ -225,16 +225,6 @@ public:
qsort(data_, size_used_, sizeof(*data_), comparator);
}

// Searches the array (assuming sorted in ascending order, using sort()) for
// an element equal to target and returns true if it is present.
// Use binary_search to get the index of target, or its nearest candidate.
bool bool_binary_search(const T &target) const {
int index = binary_search(target);
if (index >= size_used_) {
return false;
}
return data_[index] == target;
}
// Searches the array (assuming sorted in ascending order, using sort()) for
// an element equal to target and returns the index of the best candidate.
// The return value is conceptually the largest index i such that
@ -255,25 +245,6 @@ public:
return bottom;
}

// Compact the vector by deleting elements using operator!= on basic types.
// The vector must be sorted.
void compact_sorted() {
if (size_used_ == 0) {
return;
}

// First element is in no matter what, hence the i = 1.
int last_write = 0;
for (int i = 1; i < size_used_; ++i) {
// Finds next unique item and writes it.
if (data_[last_write] != data_[i]) {
data_[++last_write] = data_[i];
}
}
// last_write is the index of a valid data cell, so add 1.
size_used_ = last_write + 1;
}

// Returns the index of what would be the target_index_th item in the array
// if the members were sorted, without actually sorting. Members are
// shuffled around, but it takes O(n) time.
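Both helpers deleted here have direct standard-library counterparts, which is presumably what makes them safe to drop: bool_binary_search is std::binary_search, and compact_sorted is the unique/erase idiom. Sketches of the replacements:

    #include <algorithm>
    #include <vector>

    bool contains_sorted(const std::vector<int> &sorted, int target) {
      // stands in for bool_binary_search(); requires ascending order, as before
      return std::binary_search(sorted.begin(), sorted.end(), target);
    }

    void compact_sorted(std::vector<int> &sorted) {
      // stands in for compact_sorted(): drops adjacent duplicates in place
      sorted.erase(std::unique(sorted.begin(), sorted.end()), sorted.end());
    }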
@ -24,6 +24,7 @@
#include <cmath> // std::isfinite
#include <cstdio>
#include <cstring>
#include <algorithm> // for std::find
#include <functional>
#include <random>
#include <string>
@ -31,6 +32,11 @@

namespace tesseract {

template <class T>
inline bool contains(const std::vector<T> &data, const T &value) {
return std::find(data.begin(), data.end(), value) != data.end();
}

inline const std::vector<std::string> split(const std::string &s, char c) {
std::string buff;
std::vector<std::string> v;
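The new free function contains() replaces the old GenericVector::contains() member at call sites such as UnicharCompress::SetupDecoder in the next hunk; a trivial usage sketch:

    #include <vector>
    // relies on tesseract::contains as defined in the hunk above

    void add_if_absent(std::vector<int> &codes, int code) {
      if (!tesseract::contains(codes, code)) {
        codes.push_back(code);  // the same add-once pattern SetupDecoder uses
      }
    }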
@ -245,8 +245,7 @@ void UnicharCompress::DefragmentCodeValues(int encoded_null) {
// all codes are used. Likewise with the Han encoding, it is possible that not
// all numbers of strokes are used.
ComputeCodeRange();
GenericVector<int> offsets;
offsets.init_to_size(code_range_, 0);
std::vector<int> offsets(code_range_);
// Find which codes are used
for (int c = 0; c < encoder_.size(); ++c) {
const RecodedCharID &code = encoder_[c];
@ -390,26 +389,26 @@ void UnicharCompress::SetupDecoder() {
prefix.Truncate(len);
auto final_it = final_codes_.find(prefix);
if (final_it == final_codes_.end()) {
auto *code_list = new GenericVector<int>;
auto *code_list = new std::vector<int>;
code_list->push_back(code(len));
final_codes_[prefix] = code_list;
while (--len >= 0) {
prefix.Truncate(len);
auto next_it = next_codes_.find(prefix);
if (next_it == next_codes_.end()) {
auto *code_list = new GenericVector<int>;
auto *code_list = new std::vector<int>;
code_list->push_back(code(len));
next_codes_[prefix] = code_list;
} else {
// We still have to search the list as we may get here via multiple
// lengths of code.
if (!next_it->second->contains(code(len)))
if (!contains(*next_it->second, code(len)))
next_it->second->push_back(code(len));
break; // This prefix has been processed.
}
}
} else {
if (!final_it->second->contains(code(len)))
if (!contains(*final_it->second, code(len)))
final_it->second->push_back(code(len));
}
}

@ -22,7 +22,7 @@
#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_

#include <unordered_map>
#include "genericvector.h" // GenericVector
#include <vector>
#include "serialis.h"
#include "unicharset.h"

@ -178,13 +178,13 @@ public:
}
// Returns a list of valid non-final next codes for a given prefix code,
// which may be empty.
const GenericVector<int> *GetNextCodes(const RecodedCharID &code) const {
const std::vector<int> *GetNextCodes(const RecodedCharID &code) const {
auto it = next_codes_.find(code);
return it == next_codes_.end() ? nullptr : it->second;
}
// Returns a list of valid final codes for a given prefix code, which may
// be empty.
const GenericVector<int> *GetFinalCodes(const RecodedCharID &code) const {
const std::vector<int> *GetFinalCodes(const RecodedCharID &code) const {
auto it = final_codes_.find(code);
return it == final_codes_.end() ? nullptr : it->second;
}
@ -225,14 +225,14 @@ private:
// Decoder converts the output of encoder back to a unichar-id.
std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash> decoder_;
// True if the index is a valid single or start code.
GenericVector<bool> is_valid_start_;
std::vector<bool> is_valid_start_;
// Maps a prefix code to a list of valid next codes.
// The map owns the vectors.
std::unordered_map<RecodedCharID, GenericVector<int> *, RecodedCharID::RecodedCharIDHash>
std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
next_codes_;
// Maps a prefix code to a list of valid final codes.
// The map owns the vectors.
std::unordered_map<RecodedCharID, GenericVector<int> *, RecodedCharID::RecodedCharIDHash>
std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
final_codes_;
// Max of any value in encoder_ + 1.
int code_range_;

@ -57,9 +57,9 @@ struct NodeChild {
NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
};

using NodeChildVector = GenericVector<NodeChild>;
using SuccessorList = GenericVector<int>;
using SuccessorListsVector = GenericVector<SuccessorList *>;
using NodeChildVector = std::vector<NodeChild>;
using SuccessorList = std::vector<int>;
using SuccessorListsVector = std::vector<SuccessorList *>;

enum DawgType {
DAWG_TYPE_PUNCTUATION,
@ -176,7 +176,7 @@ public:
/// Fills vec with unichar ids that represent the character classes
/// of the given unichar_id.
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
(void)unichar_id;
(void)unicharset;
(void)vec;
@ -355,15 +355,16 @@ struct DawgPosition {
bool back_to_punc = false;
};

class DawgPositionVector : public GenericVector<DawgPosition> {
class DawgPositionVector : public std::vector<DawgPosition> {
public:
/// Adds an entry for the given dawg_index with the given node to the vec.
/// Returns false if the same entry already exists in the vector,
/// true otherwise.
inline bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg) {
for (int i = 0; i < size(); ++i) {
if (data_[i] == new_pos)
for (auto position : *this) {
if (position == new_pos) {
return false;
}
}
push_back(new_pos);
if (debug) {
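add_unique above now iterates its own base-class storage directly; the range-for copies each DawgPosition by value, which is cheap for this small struct. An equivalent formulation with std::find, relying on the same operator== the struct already provides (the free-function name is hypothetical):

    #include <algorithm>
    #include <vector>

    template <typename Pos>
    bool add_unique(std::vector<Pos> &positions, const Pos &new_pos) {
      if (std::find(positions.begin(), positions.end(), new_pos) != positions.end()) {
        return false;  // an equal entry already exists
      }
      positions.push_back(new_pos);
      return true;
    }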
@ -201,19 +201,19 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}
if (load_bigram_dawg) {
bigram_dawg_ =
@ -225,13 +225,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
freq_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
if (freq_dawg_)
dawgs_ += freq_dawg_;
dawgs_.push_back(freq_dawg_);
}
if (load_unambig_dawg) {
unambig_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
if (unambig_dawg_)
dawgs_ += unambig_dawg_;
dawgs_.push_back(unambig_dawg_);
}

std::string name;
@ -249,7 +249,7 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}

@ -267,13 +267,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}

document_words_ =
new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
dawgs_ += document_words_;
dawgs_.push_back(document_words_);

// This dawg is temporary and should not be searched by letter_is_ok.
pending_words_ =
@ -287,19 +287,19 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}

// stolen from Dict::Load (but needs params_ from Tesseract
@ -319,7 +319,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}

@ -337,7 +337,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
}
@ -358,9 +358,9 @@ bool Dict::FinishLoad() {
const Dawg *other = dawgs_[j];
if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
kDawgSuccessors[dawg->type()][other->type()])
*lst += j;
lst->push_back(j);
}
successors_ += lst;
successors_.push_back(lst);
}
return true;
}
|
||||
delete dawg_cache_;
|
||||
dawg_cache_ = nullptr;
|
||||
}
|
||||
successors_.delete_data_pointers();
|
||||
for (auto successor : successors_) {
|
||||
delete successor;
|
||||
}
|
||||
dawgs_.clear();
|
||||
successors_.clear();
|
||||
document_words_ = nullptr;
|
||||
@ -550,7 +552,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA
|
||||
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
|
||||
// Try to find the edge corresponding to the exact unichar_id and to all the
|
||||
// edges corresponding to the character class of unichar_id.
|
||||
GenericVector<UNICHAR_ID> unichar_id_patterns;
|
||||
std::vector<UNICHAR_ID> unichar_id_patterns;
|
||||
unichar_id_patterns.push_back(unichar_id);
|
||||
dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
|
||||
for (int i = 0; i < unichar_id_patterns.size(); ++i) {
|
||||
@ -605,12 +607,12 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern
|
||||
int dawg_ty = dawgs_[i]->type();
|
||||
bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
|
||||
if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
|
||||
*dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
|
||||
dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
|
||||
if (dawg_debug_level >= 3) {
|
||||
tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
|
||||
}
|
||||
} else if (!punc_dawg_available || !subsumed_by_punc) {
|
||||
*dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
|
||||
dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
|
||||
if (dawg_debug_level >= 3) {
|
||||
tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
|
||||
}
|
||||
|
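Two GenericVector idioms recur through Dict::Load, LoadLSTM and End: operator+= as append, now push_back, and delete_data_pointers(), now an explicit delete loop because std::vector never owns its pointees. A sketch of the teardown half of that pattern (hypothetical ClearOwningVector helper; Dict::End itself leaves cache-owned dawgs alone):

#include <vector>

struct Dawg { /* ... */ };

// Sketch: a std::vector<Dawg *> holding owning raw pointers must delete each
// element before clear() -- GenericVector::delete_data_pointers() did both.
void ClearOwningVector(std::vector<Dawg *> &dawgs) {
  for (auto *dawg : dawgs) {
    delete dawg;
  }
  dawgs.clear();
}

Using std::vector<std::unique_ptr<Dawg>> would remove the manual loop entirely, but the migration keeps raw pointers to stay source-compatible with the surrounding code.
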
@ -54,7 +54,7 @@ struct CHAR_FRAGMENT_INFO {
float certainty;
};

using DawgVector = GenericVector<Dawg *>;
using DawgVector = std::vector<Dawg *>;

//
// Constants
@ -495,7 +495,7 @@ private:
// matching. The first member of each list is taken as canonical. For
// example, the first list contains hyphens and dashes with the first symbol
// being the ASCII hyphen minus.
std::vector<GenericVector<UNICHAR_ID>> equivalent_symbols_;
std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
// Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
DawgCache *dawg_cache_;
bool dawg_cache_is_ours_; // we should delete our own dawg_cache_

@ -2,7 +2,6 @@
** Filename: stopper.h
** Purpose: Stopping criteria for word classifier.
** Author: Dan Johnson
** History: Wed May 1 09:42:57 1991, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +21,6 @@
#include "ratngs.h"

#include <tesseract/unichar.h>
#include "genericvector.h"

namespace tesseract {

@ -46,7 +44,7 @@ struct DANGERR_INFO {
UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
};

using DANGERR = GenericVector<DANGERR_INFO>;
using DANGERR = std::vector<DANGERR_INFO>;

} // namespace tesseract

@ -24,7 +24,6 @@

#include "dawg.h"
#include "dict.h"
#include "genericvector.h"
#include "helpers.h"
#include "kdpair.h"

@ -49,7 +48,9 @@ const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {

// Reset the Trie to empty.
void Trie::clear() {
nodes_.delete_data_pointers();
for (auto node : nodes_) {
delete node;
}
nodes_.clear();
root_back_freelist_.clear();
num_edges_ = 0;
@ -122,10 +123,11 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in
EDGE_RECORD edge_rec;
link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
EDGE_INDEX edge_index = root_back_freelist_.pop_back();
EDGE_INDEX edge_index = root_back_freelist_.back();
root_back_freelist_.pop_back();
(*vec)[edge_index] = edge_rec;
} else if (search_index < vec->size()) {
vec->insert(edge_rec, search_index);
vec->insert(vec->begin() + search_index, edge_rec);
} else {
vec->push_back(edge_rec);
}

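The freelist pop above needs two statements now: GenericVector::pop_back() returned the removed element, while std::vector::pop_back() returns void, so the value is read with back() first. The idiom in isolation (hypothetical take_back helper):

#include <cassert>
#include <vector>

// Sketch: take-and-remove the last element of a std::vector.
template <typename T>
T take_back(std::vector<T> &vec) {
  assert(!vec.empty());
  T value = vec.back();  // read the element first...
  vec.pop_back();        // ...then remove it; pop_back() returns void
  return value;
}
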
@ -153,7 +155,7 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool m
*edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}

bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions) {
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
if (word.length() <= 0)
return false; // can't add empty words
if (repetitions != nullptr)
@ -330,7 +332,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
}

void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
bool is_alpha = unicharset.get_isalpha(unichar_id);
if (is_alpha) {
vec->push_back(alpha_pattern_);
@ -388,7 +390,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset)
// Parse the pattern and construct a unichar id vector.
// Record the number of repetitions of each unichar in the parallel vector.
WERD_CHOICE word(&unicharset);
GenericVector<bool> repetitions_vec;
std::vector<bool> repetitions_vec;
const char *str_ptr = string;
int step = unicharset.step(str_ptr);
bool failed = false;
@ -462,12 +464,12 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
tprintf("\n");
}
if (direction == FORWARD_EDGE) {
nodes_[node1]->forward_edges.remove(edge_index);
nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
} else if (node1 == 0) {
KillEdge(&nodes_[node1]->backward_edges[edge_index]);
root_back_freelist_.push_back(edge_index);
} else {
nodes_[node1]->backward_edges.remove(edge_index);
nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
}
--num_edges_;
}
@ -476,7 +478,7 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
// 1 Avoid insertion sorting or bubble sorting the tail root node
// (back links on node 0, a list of all the leaves.). The node is
// huge, and sorting it with n^2 time is terrible.
// 2 Avoid using GenericVector::remove on the tail root node.
// 2 Avoid using vector::erase on the tail root node.
// (a) During add of words to the trie, zero-out the unichars and
// keep a freelist of spaces to re-use.
// (b) During reduction, just zero-out the unichars of deleted back
@ -624,13 +626,13 @@ void Trie::sort_edges(EDGE_VECTOR *edges) {
int num_edges = edges->size();
if (num_edges <= 1)
return;
GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
sort_vec.reserve(num_edges);
for (int i = 0; i < num_edges; ++i) {
sort_vec.push_back(
KDPairInc<UNICHAR_ID, EDGE_RECORD>(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
}
sort_vec.sort();
std::sort(sort_vec.begin(), sort_vec.end());
for (int i = 0; i < num_edges; ++i)
(*edges)[i] = sort_vec[i].data();
}

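sort_edges keeps its decorate-sort-undecorate shape; only the final sort call changes, since std::vector has no member sort(). A reduced sketch of the same sort-by-derived-key trick, with a hypothetical key extraction standing in for unichar_id_from_edge_rec:

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Sketch: sort 64-bit records by a key packed inside them. std::pair
// compares by .first, then .second, so std::sort needs no comparator.
void SortByKey(std::vector<uint64_t> &records) {
  std::vector<std::pair<int, uint64_t>> keyed;
  keyed.reserve(records.size());
  for (uint64_t rec : records) {
    keyed.emplace_back(static_cast<int>(rec & 0xff), rec);  // hypothetical key extraction
  }
  std::sort(keyed.begin(), keyed.end());
  for (size_t i = 0; i < records.size(); ++i) {
    records[i] = keyed[i].second;  // write back in key order
  }
}
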
@ -21,14 +21,12 @@

#include "dawg.h"

#include "genericvector.h"

namespace tesseract {

class UNICHARSET;

// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
// max int32, we will need to change GenericVector to use int64 for size
// max int32, we will need to change vector to use int64 for size
// and address indices. This does not seem to be needed immediately,
// since currently the largest number of edges limit used by tesseract
// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
@ -39,13 +37,13 @@ class UNICHARSET;
// the 64 bit EDGE_RECORD.
using EDGE_INDEX = int64_t; // index of an edge in a given node
using NODE_MARKER = bool *;
using EDGE_VECTOR = GenericVector<EDGE_RECORD>;
using EDGE_VECTOR = std::vector<EDGE_RECORD>;

struct TRIE_NODE_RECORD {
EDGE_VECTOR forward_edges;
EDGE_VECTOR backward_edges;
};
using TRIE_NODES = GenericVector<TRIE_NODE_RECORD *>;
using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;

/**
* Concrete class for Trie data structure that allows to store a list of
@ -88,7 +86,9 @@ public:
initialized_patterns_ = false;
}
~Trie() override {
nodes_.delete_data_pointers();
for (auto node : nodes_) {
delete node;
}
}

// Reset the Trie to empty.
@ -230,7 +230,7 @@ public:
// Fills in the given unichar id vector with the unichar ids that represent
// the patterns of the character classes of the given unichar_id.
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const override;
std::vector<UNICHAR_ID> *vec) const override;

// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
// a self loop and the given unichar_id matches the unichar_id stored in the
@ -256,7 +256,7 @@ public:
//
// Return true if add succeeded, false otherwise (e.g. when a word contained
// an invalid unichar id or the trie was getting too large and was cleared).
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions);
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
bool add_word_to_dawg(const WERD_CHOICE &word) {
return add_word_to_dawg(word, nullptr);
}
@ -395,7 +395,7 @@ protected:
// Member variables
TRIE_NODES nodes_; // vector of nodes in the Trie
// Freelist of edges in the root backwards node that were previously zeroed.
GenericVector<EDGE_INDEX> root_back_freelist_;
std::vector<EDGE_INDEX> root_back_freelist_;
uint64_t num_edges_; // sum of all edges (forward and backward)
uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction
uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index

@ -129,10 +129,8 @@ void FullyConnected::Forward(bool debug, const NetworkIO &input,
else
output->Resize(input, no_);
SetupForward(input, input_transpose);
GenericVector<NetworkScratch::FloatVec> temp_lines;
temp_lines.init_to_size(kNumThreads, NetworkScratch::FloatVec());
GenericVector<NetworkScratch::FloatVec> curr_input;
curr_input.init_to_size(kNumThreads, NetworkScratch::FloatVec());
std::vector<NetworkScratch::FloatVec> temp_lines(kNumThreads);
std::vector<NetworkScratch::FloatVec> curr_input(kNumThreads);
int ro = no_;
if (IntSimdMatrix::intSimdMatrix)
ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro);
@ -233,13 +231,12 @@ bool FullyConnected::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkSc
DisplayBackward(fwd_deltas);
#endif
back_deltas->Resize(fwd_deltas, ni_);
GenericVector<NetworkScratch::FloatVec> errors;
errors.init_to_size(kNumThreads, NetworkScratch::FloatVec());
std::vector<NetworkScratch::FloatVec> errors(kNumThreads);
for (int i = 0; i < kNumThreads; ++i)
errors[i].Init(no_, scratch);
GenericVector<NetworkScratch::FloatVec> temp_backprops;
std::vector<NetworkScratch::FloatVec> temp_backprops;
if (needs_to_backprop_) {
temp_backprops.init_to_size(kNumThreads, NetworkScratch::FloatVec());
temp_backprops.resize(kNumThreads);
for (int i = 0; i < kNumThreads; ++i)
temp_backprops[i].Init(ni_, scratch);
}

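init_to_size(n, v) with a default-constructed prototype maps onto either std::vector's sized constructor or resize(n); both value-initialize the new elements, so the two replacement forms above are equivalent. Side by side, under that assumption:

#include <vector>

struct FloatVecLike {};  // stand-in for NetworkScratch::FloatVec

void SizedInitExamples(int kNumThreads) {
  // GenericVector form was: v.init_to_size(kNumThreads, FloatVecLike());
  std::vector<FloatVecLike> temp_lines(kNumThreads);  // sized constructor
  std::vector<FloatVecLike> temp_backprops;
  temp_backprops.resize(kNumThreads);                 // resize after the fact
}
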
@ -297,10 +297,10 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
// for the other dimension, used only when working in true 2D mode. The width
// is enough to hold an entire strip of the major direction.
int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1;
GenericVector<NetworkScratch::FloatVec> states, outputs;
std::vector<NetworkScratch::FloatVec> states, outputs;
if (Is2D()) {
states.init_to_size(buf_width, NetworkScratch::FloatVec());
outputs.init_to_size(buf_width, NetworkScratch::FloatVec());
states.resize(buf_width);
outputs.resize(buf_width);
for (int i = 0; i < buf_width; ++i) {
states[i].Init(ns_, scratch);
ZeroVector<double>(ns_, states[i]);
@ -494,10 +494,10 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
// Rotating buffers of width buf_width allow storage of the recurrent time-
// steps used only for true 2-D. Stores one full strip of the major direction.
int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1;
GenericVector<NetworkScratch::FloatVec> stateerr, sourceerr;
std::vector<NetworkScratch::FloatVec> stateerr, sourceerr;
if (Is2D()) {
stateerr.init_to_size(buf_width, NetworkScratch::FloatVec());
sourceerr.init_to_size(buf_width, NetworkScratch::FloatVec());
stateerr.resize(buf_width);
sourceerr.resize(buf_width);
for (int t = 0; t < buf_width; ++t) {
stateerr[t].Init(ns_, scratch);
sourceerr[t].Init(na_, scratch);

@ -2,7 +2,6 @@
// File: networkio.cpp
// Description: Network input/output data, allowing float/int implementations.
// Author: Ray Smith
// Created: Thu Jun 19 13:01:31 PST 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
@ -507,7 +506,7 @@ int NetworkIO::BestLabel(int t, int not_this, int not_that, float *score) const

// Returns the best start position out of [start, end) (into which all labels
// must fit) to obtain the highest cumulative score for the given labels.
int NetworkIO::PositionOfBestMatch(const GenericVector<int> &labels, int start, int end) const {
int NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const {
int length = labels.size();
int last_start = end - length;
int best_start = -1;
@ -524,7 +523,7 @@ int NetworkIO::PositionOfBestMatch(const GenericVector<int> &labels, int start,

// Returns the cumulative score of the given labels starting at start, and
// using one label per time-step.
double NetworkIO::ScoreOfLabels(const GenericVector<int> &labels, int start) const {
double NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const {
int length = labels.size();
double score = 0.0;
for (int i = 0; i < length; ++i) {

@ -23,7 +23,6 @@
#include <cstdio>
#include <vector>

#include "genericvector.h"
#include "helpers.h"
#include "static_shape.h"
#include "stridemap.h"
@ -169,10 +168,10 @@ public:
int BestLabel(int t, int not_this, int not_that, float *score) const;
// Returns the best start position out of range (into which both start and end
// must fit) to obtain the highest cumulative score for the given labels.
int PositionOfBestMatch(const GenericVector<int> &labels, int start, int end) const;
int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const;
// Returns the cumulative score of the given labels starting at start, and
// using one label per time-step.
double ScoreOfLabels(const GenericVector<int> &labels, int start) const;
double ScoreOfLabels(const std::vector<int> &labels, int start) const;
// Helper function sets all the outputs for a single timestep, such that
// label has value ok_score, and the other labels share 1 - ok_score.
// Assumes float mode.

@ -20,14 +20,13 @@
#define TESSERACT_LSTM_NETWORKSCRATCH_H_

#include <mutex>
#include "genericvector.h"
#include "matrix.h"
#include "networkio.h"

namespace tesseract {

// Generic scratch space for network layers. Provides NetworkIO that can store
// a complete set (over time) of intermediates, and GenericVector<float>
// a complete set (over time) of intermediates, and vector<float>
// scratch space that auto-frees after use. The aim here is to provide a set
// of temporary buffers to network layers that can be reused between layers
// and don't have to be reallocated on each call.
@ -125,7 +124,7 @@ public:
}; // class IO.

// Class that acts like a fixed array of float, yet actually uses space
// from a GenericVector<float> in the source NetworkScratch, and knows how
// from a vector<float> in the source NetworkScratch, and knows how
// to unstack the borrowed vector on destruction.
class FloatVec {
public:
@ -145,12 +144,8 @@ public:
scratch_space_->vec_stack_.Return(vec_);
scratch_space_ = scratch;
vec_ = scratch_space_->vec_stack_.Borrow();
// Abuse vec_ here; first resize to 'reserve', which is larger
// than 'size' (i.e. it's size rounded up) then resize down again
// to the desired size. This assumes that the implementation does
// not shrink the storage on a resize.
vec_->resize_no_init(reserve);
vec_->resize_no_init(size);
vec_->reserve(reserve);
vec_->resize(size);
data_ = &(*vec_)[0];
}

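The replaced lines drop the resize-up-then-down trick: reserve() guarantees capacity without constructing elements, and a following resize() within that capacity is guaranteed not to reallocate, which keeps the cached data_ pointer valid. A sketch of the invariant:

#include <cassert>
#include <vector>

// Sketch: expose `size` elements while guaranteeing capacity of at least
// `reserve`, without relying on resize() keeping extra storage around.
void InitBuffer(std::vector<double> &vec, int size, int reserve) {
  assert(size <= reserve);
  vec.reserve(reserve);  // capacity >= reserve, no elements constructed yet
  vec.resize(size);      // size elements, no reallocation possible here
  assert(vec.capacity() >= static_cast<size_t>(reserve));
}
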
@ -169,7 +164,7 @@ public:

private:
// Vector borrowed from the scratch space. Use Return to free it.
GenericVector<double> *vec_;
std::vector<double> *vec_;
// Short-cut pointer to the underlying array.
double *data_;
// The source scratch_space_. Borrowed pointer, used to free the
@ -251,7 +246,7 @@ public:

private:
PointerVector<T> stack_;
GenericVector<bool> flags_;
std::vector<bool> flags_;
int stack_top_;
std::mutex mutex_;
}; // class Stack.
@ -259,11 +254,11 @@ public:
private:
// If true, the network weights are int8_t, if false, float.
bool int_mode_;
// Stacks of NetworkIO and GenericVector<float>. Once allocated, they are not
// Stacks of NetworkIO and vector<float>. Once allocated, they are not
// deleted until the NetworkScratch is deleted.
Stack<NetworkIO> int_stack_;
Stack<NetworkIO> float_stack_;
Stack<GenericVector<double>> vec_stack_;
Stack<std::vector<double>> vec_stack_;
Stack<TransposedArray> array_stack_;
};

@ -61,8 +61,7 @@ void Parallel::Forward(bool debug, const NetworkIO &input, const TransposedArray
int stack_size = stack_.size();
if (type_ == NT_PAR_2D_LSTM) {
// Special case, run parallel in parallel.
GenericVector<NetworkScratch::IO> results;
results.init_to_size(stack_size, NetworkScratch::IO());
std::vector<NetworkScratch::IO> results(stack_size);
for (int i = 0; i < stack_size; ++i) {
results[i].Resize(input, stack_[i]->NumOutputs(), scratch);
}
@ -124,9 +123,8 @@ bool Parallel::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch
int stack_size = stack_.size();
if (type_ == NT_PAR_2D_LSTM) {
// Special case, run parallel in parallel.
GenericVector<NetworkScratch::IO> in_deltas, out_deltas;
in_deltas.init_to_size(stack_size, NetworkScratch::IO());
out_deltas.init_to_size(stack_size, NetworkScratch::IO());
std::vector<NetworkScratch::IO> in_deltas(stack_size);
std::vector<NetworkScratch::IO> out_deltas(stack_size);
// Split the forward deltas for each stack element.
int feature_offset = 0;
for (int i = 0; i < stack_.size(); ++i) {

@ -190,7 +190,7 @@ bool Plumbing::Serialize(TFile *fp) const {
for (uint32_t i = 0; i < size; ++i)
if (!stack_[i]->Serialize(fp))
return false;
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !learning_rates_.Serialize(fp)) {
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->Serialize(learning_rates_)) {
return false;
}
return true;
@ -209,7 +209,7 @@ bool Plumbing::DeSerialize(TFile *fp) {
return false;
AddToStack(network);
}
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !learning_rates_.DeSerialize(fp)) {
if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->DeSerialize(learning_rates_)) {
return false;
}
return true;

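Vector I/O moves from a method on the container to a method on TFile taking the vector, so plain std::vector needs no serialization members. A sketch of what such a file wrapper can look like for trivially copyable element types (simplified signatures, not the real TFile, which also handles byte swapping and error reporting):

#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of a file wrapper that serializes vectors of trivially copyable
// types: element count first, then the raw elements.
class File {
public:
  explicit File(std::FILE *fp) : fp_(fp) {}
  template <typename T>
  bool Serialize(const std::vector<T> &vec) {
    uint32_t size = vec.size();
    return std::fwrite(&size, sizeof(size), 1, fp_) == 1 &&
           (size == 0 || std::fwrite(vec.data(), sizeof(T), size, fp_) == size);
  }
  template <typename T>
  bool DeSerialize(std::vector<T> &vec) {
    uint32_t size;
    if (std::fread(&size, sizeof(size), 1, fp_) != 1) return false;
    vec.resize(size);
    return size == 0 || std::fread(vec.data(), sizeof(T), size, fp_) == size;
  }

private:
  std::FILE *fp_;
};
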
@ -19,7 +19,6 @@
#ifndef TESSERACT_LSTM_PLUMBING_H_
#define TESSERACT_LSTM_PLUMBING_H_

#include "genericvector.h"
#include "matrix.h"
#include "network.h"

@ -139,7 +138,7 @@ protected:
PointerVector<Network> stack_;
// Layer-specific learning rate iff network_flags_ & NF_LAYER_SPECIFIC_LR.
// One element for each element of stack_.
GenericVector<float> learning_rates_;
std::vector<float> learning_rates_;
};

} // namespace tesseract.

@ -23,7 +23,7 @@
#include "pageres.h"
#include "unicharcompress.h"

#include <algorithm>
#include <algorithm> // for std::reverse
#include <deque>
#include <map>
#include <set>
@ -181,7 +181,7 @@ void RecodeBeamSearch::ExtractBestPathAsLabels(std::vector<int> *labels,
std::vector<int> *xcoords) const {
labels->clear();
xcoords->clear();
GenericVector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> best_nodes;
ExtractBestPaths(&best_nodes, nullptr);
// Now just run CTC on the best nodes.
int t = 0;
@ -205,7 +205,7 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET
std::vector<float> *certs,
std::vector<float> *ratings,
std::vector<int> *xcoords) const {
GenericVector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> best_nodes;
ExtractBestPaths(&best_nodes, nullptr);
ExtractPathAsUnicharIds(best_nodes, unichar_ids, certs, ratings, xcoords);
if (debug) {
@ -224,8 +224,8 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX &line_box, float scale_
std::vector<float> certs;
std::vector<float> ratings;
std::vector<int> xcoords;
GenericVector<const RecodeNode *> best_nodes;
GenericVector<const RecodeNode *> second_nodes;
std::vector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> second_nodes;
character_boundaries_.clear();
ExtractBestPaths(&best_nodes, &second_nodes);
if (debug) {
@ -306,10 +306,10 @@ void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET *
}
// fill the topology with depths first
for (int step = beam->size() - 1; step >= 0; --step) {
GenericVector<tesseract::RecodePair> *heaps = beam->get(step)->beams_->heap();
for (int node = 0; node < heaps->size(); ++node) {
std::vector<tesseract::RecodePair> &heaps = beam->get(step)->beams_->heap();
for (auto node : heaps) {
int backtracker = 0;
const RecodeNode *curr = &heaps->get(node).data();
const RecodeNode *curr = &node.data();
while (curr != nullptr && !visited.count(curr)) {
visited.insert(curr);
topology[step - backtracker].push_back(curr);
@ -371,7 +371,6 @@ void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET *
}

void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
GenericVector<tesseract::RecodePair> *heaps = nullptr;
PointerVector<RecodeBeam> *currentBeam = nullptr;
if (character_boundaries_.size() < 2)
return;
@ -389,14 +388,15 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
std::vector<float> ratings;
std::vector<int> xcoords;
int backpath = character_boundaries_[j] - character_boundaries_[j - 1];
heaps = currentBeam->get(character_boundaries_[j] - 1)->beams_->heap();
GenericVector<const RecodeNode *> best_nodes;
std::vector<tesseract::RecodePair> &heaps =
currentBeam->get(character_boundaries_[j] - 1)->beams_->heap();
std::vector<const RecodeNode *> best_nodes;
std::vector<const RecodeNode *> best;
// Scan the segmented node chain for valid unichar ids.
for (int i = 0; i < heaps->size(); ++i) {
for (auto entry : heaps) {
bool validChar = false;
int backcounter = 0;
const RecodeNode *node = &heaps->get(i).data();
const RecodeNode *node = &entry.data();
while (node != nullptr && backcounter < backpath) {
if (node->code != null_char_ && node->unichar_id != INVALID_UNICHAR_ID) {
validChar = true;
@ -406,7 +406,7 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
++backcounter;
}
if (validChar)
best.push_back(&heaps->get(i).data());
best.push_back(&entry.data());
}
// find the best rated segmented node chain and extract the unichar id.
if (!best.empty()) {
@ -488,8 +488,7 @@ void RecodeBeamSearch::DebugBeams(const UNICHARSET &unicharset) const {

// Generates debug output of the content of a single beam position.
void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHeap &heap) const {
GenericVector<const RecodeNode *> unichar_bests;
unichar_bests.init_to_size(unicharset.size(), nullptr);
std::vector<const RecodeNode *> unichar_bests(unicharset.size());
const RecodeNode *null_best = nullptr;
int heap_size = heap.size();
for (int i = 0; i < heap_size; ++i) {
@ -518,7 +517,7 @@ void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHe
// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
// duplicates, nulls and intermediate parts.
/* static */
void RecodeBeamSearch::ExtractPathAsUnicharIds(const GenericVector<const RecodeNode *> &best_nodes,
void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vector<const RecodeNode *> &best_nodes,
std::vector<int> *unichar_ids,
std::vector<float> *certs,
std::vector<float> *ratings,
@ -699,14 +698,14 @@ void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio
if (debug) {
int beam_index = BeamIndex(true, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Dawg beam %d:\n", t, i);
DebugPath(charset, path);
}
beam_index = BeamIndex(false, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Non-Dawg beam %d:\n", t, i);
DebugPath(charset, path);
@ -765,14 +764,14 @@ void RecodeBeamSearch::DecodeSecondaryStep(const float *outputs, int t, double d
if (debug) {
int beam_index = BeamIndex(true, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Dawg beam %d:\n", t, i);
DebugPath(charset, path);
}
beam_index = BeamIndex(false, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode *> path;
std::vector<const RecodeNode *> path;
ExtractPath(&prev->beams_[beam_index].get(i).data(), &path);
tprintf("Step %d: Non-Dawg beam %d:\n", t, i);
DebugPath(charset, path);
@ -858,7 +857,7 @@ void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const
dict_ratio, use_dawgs, NC_ANYTHING, prev, step);
}
}
const GenericVector<int> *final_codes = recoder_.GetFinalCodes(prefix);
const std::vector<int> *final_codes = recoder_.GetFinalCodes(prefix);
if (final_codes != nullptr) {
for (int i = 0; i < final_codes->size(); ++i) {
int code = (*final_codes)[i];
@ -892,7 +891,7 @@ void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const
}
}
}
const GenericVector<int> *next_codes = recoder_.GetNextCodes(prefix);
const std::vector<int> *next_codes = recoder_.GetNextCodes(prefix);
if (next_codes != nullptr) {
for (int i = 0; i < next_codes->size(); ++i) {
int code = (*next_codes)[i];
@ -1121,17 +1120,17 @@ bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *hea
// TODO(rays) consider hash map instead of linear search.
// It might not be faster because the hash map would have to be updated
// every time a heap reshuffle happens, and that would be a lot of overhead.
GenericVector<RecodePair> *nodes = heap->heap();
for (int i = 0; i < nodes->size(); ++i) {
RecodeNode &node = (*nodes)[i].data();
std::vector<RecodePair> &nodes = heap->heap();
for (int i = 0; i < nodes.size(); ++i) {
RecodeNode &node = nodes[i].data();
if (node.code == new_node->code && node.code_hash == new_node->code_hash &&
node.permuter == new_node->permuter && node.start_of_dawg == new_node->start_of_dawg) {
if (new_node->score > node.score) {
// The new one is better. Update the entire node in the heap and
// reshuffle.
node = *new_node;
(*nodes)[i].key() = node.score;
heap->Reshuffle(&(*nodes)[i]);
nodes[i].key() = node.score;
heap->Reshuffle(&nodes[i]);
}
return true;
}

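heap->heap() now returns a reference to the underlying std::vector<RecodePair>, so the linear scan indexes it directly; the better-score case still overwrites the node in place and calls Reshuffle to restore the heap invariant. A minimal sketch of that update-then-reshuffle step on a max-heap kept in a std::vector (hypothetical Entry type and sift-up, not the real GenericHeap):

#include <utility>
#include <vector>

// Sketch: a max-heap of (score, id) pairs stored level-order in a vector.
using Entry = std::pair<float, int>;

// Restore the heap property after element i's key was raised in place.
void SiftUp(std::vector<Entry> &heap, size_t i) {
  while (i > 0) {
    size_t parent = (i - 1) / 2;
    if (heap[parent].first >= heap[i].first) break;
    std::swap(heap[parent], heap[i]);
    i = parent;
  }
}

// The moral equivalent of "node = *new_node; Reshuffle(&nodes[i]);" above.
void IncreaseKey(std::vector<Entry> &heap, size_t i, float new_score) {
  if (new_score > heap[i].first) {
    heap[i].first = new_score;  // update in place...
    SiftUp(heap, i);            // ...then reshuffle upward
  }
}
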
@ -1156,8 +1155,8 @@ uint64_t RecodeBeamSearch::ComputeCodeHash(int code, bool dup, const RecodeNode
// during Decode. On return the best_nodes vector essentially contains the set
// of code, score pairs that make the optimal path with the constraint that
// the recoder can decode the code sequence back to a sequence of unichar-ids.
void RecodeBeamSearch::ExtractBestPaths(GenericVector<const RecodeNode *> *best_nodes,
GenericVector<const RecodeNode *> *second_nodes) const {
void RecodeBeamSearch::ExtractBestPaths(std::vector<const RecodeNode *> *best_nodes,
std::vector<const RecodeNode *> *second_nodes) const {
// Scan both beams to extract the best and second best paths.
const RecodeNode *best_node = nullptr;
const RecodeNode *second_best_node = nullptr;
@ -1201,30 +1200,30 @@ void RecodeBeamSearch::ExtractBestPaths(GenericVector<const RecodeNode *> *best_
// Helper backtracks through the lattice from the given node, storing the
// path and reversing it.
void RecodeBeamSearch::ExtractPath(const RecodeNode *node,
GenericVector<const RecodeNode *> *path) const {
path->truncate(0);
std::vector<const RecodeNode *> *path) const {
path->clear();
while (node != nullptr) {
path->push_back(node);
node = node->prev;
}
path->reverse();
std::reverse(path->begin(), path->end());
}

void RecodeBeamSearch::ExtractPath(const RecodeNode *node, GenericVector<const RecodeNode *> *path,
void RecodeBeamSearch::ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path,
int limiter) const {
int pathcounter = 0;
path->truncate(0);
path->clear();
while (node != nullptr && pathcounter < limiter) {
path->push_back(node);
node = node->prev;
++pathcounter;
}
path->reverse();
std::reverse(path->begin(), path->end());
}

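truncate(0) becomes clear() and the member reverse() becomes std::reverse; the backtrack itself is unchanged. The idiom reduced to its core, with a stand-in Node type:

#include <algorithm>
#include <vector>

// Sketch of the lattice backtrack above: follow prev pointers from the end
// node, then reverse to get the path in forward order.
struct Node {
  const Node *prev = nullptr;
};

void ExtractPath(const Node *node, std::vector<const Node *> *path) {
  path->clear();
  while (node != nullptr) {
    path->push_back(node);
    node = node->prev;
  }
  std::reverse(path->begin(), path->end());
}
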
// Helper prints debug information on the given lattice path.
void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset,
const GenericVector<const RecodeNode *> &path) const {
const std::vector<const RecodeNode *> &path) const {
for (int c = 0; c < path.size(); ++c) {
const RecodeNode &node = *path[c];
tprintf("%d ", c);
@ -1234,7 +1233,7 @@ void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset,

// Helper prints debug information on the given unichar path.
void RecodeBeamSearch::DebugUnicharPath(const UNICHARSET *unicharset,
const GenericVector<const RecodeNode *> &path,
const std::vector<const RecodeNode *> &path,
const std::vector<int> &unichar_ids,
const std::vector<float> &certs,
const std::vector<float> &ratings,

@ -301,7 +301,7 @@ private:

// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
// duplicates, nulls and intermediate parts.
static void ExtractPathAsUnicharIds(const GenericVector<const RecodeNode *> &best_nodes,
static void ExtractPathAsUnicharIds(const std::vector<const RecodeNode *> &best_nodes,
std::vector<int> *unichar_ids, std::vector<float> *certs,
std::vector<float> *ratings, std::vector<int> *xcoords,
std::vector<int> *character_boundaries = nullptr);
@ -380,17 +380,17 @@ private:
// during Decode. On return the best_nodes vector essentially contains the set
// of code, score pairs that make the optimal path with the constraint that
// the recoder can decode the code sequence back to a sequence of unichar-ids.
void ExtractBestPaths(GenericVector<const RecodeNode *> *best_nodes,
GenericVector<const RecodeNode *> *second_nodes) const;
void ExtractBestPaths(std::vector<const RecodeNode *> *best_nodes,
std::vector<const RecodeNode *> *second_nodes) const;
// Helper backtracks through the lattice from the given node, storing the
// path and reversing it.
void ExtractPath(const RecodeNode *node, GenericVector<const RecodeNode *> *path) const;
void ExtractPath(const RecodeNode *node, GenericVector<const RecodeNode *> *path,
void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path) const;
void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path,
int limiter) const;
// Helper prints debug information on the given lattice path.
void DebugPath(const UNICHARSET *unicharset, const GenericVector<const RecodeNode *> &path) const;
void DebugPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path) const;
// Helper prints debug information on the given unichar path.
void DebugUnicharPath(const UNICHARSET *unicharset, const GenericVector<const RecodeNode *> &path,
void DebugUnicharPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path,
const std::vector<int> &unichar_ids, const std::vector<float> &certs,
const std::vector<float> &ratings, const std::vector<int> &xcoords) const;

@ -44,7 +44,7 @@ bool TFNetwork::Serialize(TFile *fp) const {
return false;
std::string proto_str;
model_proto_.SerializeToString(&proto_str);
GenericVector<char> data;
std::vector<char> data;
data.resize_no_init(proto_str.size());
memcpy(&data[0], proto_str.data(), proto_str.size());
if (!data.Serialize(fp))
@ -55,7 +55,7 @@ bool TFNetwork::Serialize(TFile *fp) const {
// Reads from the given file. Returns false in case of error.
// Should be overridden by subclasses, but NOT called by their DeSerialize.
bool TFNetwork::DeSerialize(TFile *fp) {
GenericVector<char> data;
std::vector<char> data;
if (!data.DeSerialize(fp))
return false;
if (!model_proto_.ParseFromArray(&data[0], data.size())) {

@ -256,8 +256,8 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
if (int_mode_) {
if (!wi_.DeSerialize(fp))
return false;
GenericVector<float> old_scales;
if (!old_scales.DeSerialize(fp))
std::vector<float> old_scales;
if (!fp->DeSerialize(old_scales))
return false;
scales_.reserve(old_scales.size());
for (int i = 0; i < old_scales.size(); ++i) {

@ -277,8 +277,8 @@ double BaselineRow::AdjustBaselineToGrid(int debug, const FCOORD &direction, dou
void BaselineRow::SetupBlobDisplacements(const FCOORD &direction) {
// Set of perpendicular displacements of the blob bottoms from the required
// baseline direction.
GenericVector<double> perp_blob_dists;
displacement_modes_.truncate(0);
std::vector<double> perp_blob_dists;
displacement_modes_.clear();
// Gather the skew-corrected position of every blob.
double min_dist = FLT_MAX;
double max_dist = -FLT_MAX;
@ -310,8 +310,8 @@ void BaselineRow::SetupBlobDisplacements(const FCOORD &direction) {
for (int i = 0; i < perp_blob_dists.size(); ++i) {
dist_stats.add(IntCastRounded(perp_blob_dists[i] / disp_quant_factor_), 1);
}
GenericVector<KDPairInc<float, int>> scaled_modes;
dist_stats.top_n_modes(kMaxDisplacementsModes, &scaled_modes);
std::vector<KDPairInc<float, int>> scaled_modes;
dist_stats.top_n_modes(kMaxDisplacementsModes, scaled_modes);
#ifdef kDebugYCoord
if (debug) {
for (int i = 0; i < scaled_modes.size(); ++i) {
@ -428,7 +428,7 @@ double BaselineBlock::SpacingModelError(double perp_disp, double line_spacing, d
bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) {
if (non_text_block_)
return false;
GenericVector<double> angles;
std::vector<double> angles;
for (int r = 0; r < rows_.size(); ++r) {
BaselineRow *row = rows_[r];
if (row->FitBaseline(use_box_bottoms)) {
@ -440,7 +440,7 @@ bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) {
}

if (!angles.empty()) {
skew_angle_ = MedianOfCircularValues(M_PI, &angles);
skew_angle_ = MedianOfCircularValues(M_PI, angles);
good_skew_angle_ = true;
} else {
skew_angle_ = 0.0f;
@ -610,7 +610,7 @@ void BaselineBlock::DrawPixSpline(Pix *pix_in) {
// observations.
bool BaselineBlock::ComputeLineSpacing() {
FCOORD direction(cos(skew_angle_), sin(skew_angle_));
GenericVector<double> row_positions;
std::vector<double> row_positions;
ComputeBaselinePositions(direction, &row_positions);
if (row_positions.size() < 2)
return false;
@ -644,7 +644,7 @@ bool BaselineBlock::ComputeLineSpacing() {
// of the block baseline a line sits, hence the function and argument name
// positions not distances.
void BaselineBlock::ComputeBaselinePositions(const FCOORD &direction,
GenericVector<double> *positions) {
std::vector<double> *positions) {
positions->clear();
for (int r = 0; r < rows_.size(); ++r) {
BaselineRow *row = rows_[r];
@ -659,7 +659,7 @@ void BaselineBlock::ComputeBaselinePositions(const FCOORD &direction,
// Computes an estimate of the line spacing of the block from the median
// of the spacings between adjacent overlapping textlines.
void BaselineBlock::EstimateLineSpacing() {
GenericVector<float> spacings;
std::vector<float> spacings;
for (int r = 0; r < rows_.size(); ++r) {
BaselineRow *row = rows_[r];
// Exclude silly lines.
@ -682,7 +682,8 @@ void BaselineBlock::EstimateLineSpacing() {
// If we have at least one value, use it, otherwise leave the previous
// value unchanged.
if (!spacings.empty()) {
line_spacing_ = spacings[spacings.choose_nth_item(spacings.size() / 2)];
std::nth_element(spacings.begin(), spacings.begin() + spacings.size() / 2, spacings.end());
line_spacing_ = spacings[spacings.size() / 2];
if (debug_level_ > 1)
tprintf("Estimate of linespacing = %g\n", line_spacing_);
}

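choose_nth_item was GenericVector's selection routine; std::nth_element is the standard replacement, partitioning in O(N) average time so the element at the middle index is the median, with no full sort. The median use in sketch form:

#include <algorithm>
#include <vector>

// Sketch: median by selection, as in EstimateLineSpacing above. nth_element
// reorders the vector, so take a copy when the original order matters.
float Median(std::vector<float> spacings) {
  if (spacings.empty()) return 0.0f;
  auto middle = spacings.begin() + spacings.size() / 2;
  std::nth_element(spacings.begin(), middle, spacings.end());
  return *middle;  // the n-th smallest element now sits at the middle index
}
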
@ -692,7 +693,7 @@ void BaselineBlock::EstimateLineSpacing() {
// line to the deskewed y-position of each baseline as a function of its
// estimated line index, allowing for a small error in the initial linespacing
// and choosing the best available model.
void BaselineBlock::RefineLineSpacing(const GenericVector<double> &positions) {
void BaselineBlock::RefineLineSpacing(const std::vector<double> &positions) {
double spacings[3], offsets[3], errors[3];
int index_range;
errors[0] =
@ -727,7 +728,7 @@ void BaselineBlock::RefineLineSpacing(const GenericVector<double> &positions) {
// and the corresponding intercept in c_out, and the number of spacings seen
// in index_delta. Returns the error of fit to the line spacing model.
// Uses a simple linear regression, but optimized the offset using the median.
double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions, double m_in,
double BaselineBlock::FitLineSpacingModel(const std::vector<double> &positions, double m_in,
double *m_out, double *c_out, int *index_delta) {
if (m_in == 0.0f || positions.size() < 2) {
*m_out = m_in;
@ -736,12 +737,12 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions
*index_delta = 0;
return 0.0;
}
GenericVector<double> offsets;
std::vector<double> offsets;
// Get the offset (remainder) linespacing for each line and choose the median.
for (int i = 0; i < positions.size(); ++i)
offsets.push_back(fmod(positions[i], m_in));
// Get the median offset.
double median_offset = MedianOfCircularValues(m_in, &offsets);
double median_offset = MedianOfCircularValues(m_in, offsets);
// Now fit a line to quantized line number and offset.
LLSQ llsq;
int min_index = INT32_MAX;
@ -755,7 +756,7 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions
// Get the refined line spacing.
*m_out = llsq.m();
// Use the median offset rather than the mean.
offsets.truncate(0);
offsets.clear();
if (*m_out != 0.0) {
for (int i = 0; i < positions.size(); ++i) {
offsets.push_back(fmod(positions[i], *m_out));
@ -766,7 +767,7 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector<double> &positions
tprintf("%d: %g\n", i, offsets[i]);
}
}
*c_out = MedianOfCircularValues(*m_out, &offsets);
*c_out = MedianOfCircularValues(*m_out, offsets);
} else {
*c_out = 0.0;
}
@ -808,7 +809,7 @@ BaselineDetect::BaselineDetect(int debug_level, const FCOORD &page_skew, TO_BLOC
// block-wise and page-wise data to smooth small blocks/rows, and applies
// smoothing based on block/page-level skew and block-level linespacing.
void BaselineDetect::ComputeStraightBaselines(bool use_box_bottoms) {
GenericVector<double> block_skew_angles;
std::vector<double> block_skew_angles;
for (int i = 0; i < blocks_.size(); ++i) {
BaselineBlock *bl_block = blocks_[i];
if (debug_level_ > 0)
@ -820,7 +821,7 @@ void BaselineDetect::ComputeStraightBaselines(bool use_box_bottoms) {
// Compute a page-wide default skew for blocks with too little information.
double default_block_skew = page_skew_.angle();
if (!block_skew_angles.empty()) {
default_block_skew = MedianOfCircularValues(M_PI, &block_skew_angles);
default_block_skew = MedianOfCircularValues(M_PI, block_skew_angles);
}
if (debug_level_ > 0) {
tprintf("Page skew angle = %g\n", default_block_skew);

@ -23,8 +23,6 @@
#include "points.h"
#include "rect.h"

#include "genericvector.h"

struct Pix;

namespace tesseract {
@ -109,7 +107,7 @@ private:
FCOORD baseline_pt1_;
FCOORD baseline_pt2_;
// Set of modes of displacements. They indicate preferable baseline positions.
GenericVector<double> displacement_modes_;
std::vector<double> displacement_modes_;
// Quantization factor used for displacement_modes_.
double disp_quant_factor_;
// Half the acceptance range of blob displacements for computing the
@ -187,7 +185,7 @@ private:

// Computes the deskewed vertical position of each baseline in the block and
// stores them in the given vector.
void ComputeBaselinePositions(const FCOORD &direction, GenericVector<double> *positions);
void ComputeBaselinePositions(const FCOORD &direction, std::vector<double> *positions);

// Computes an estimate of the line spacing of the block from the median
// of the spacings between adjacent overlapping textlines.
@ -197,13 +195,13 @@ private:
// line to the deskewed y-position of each baseline as a function of its
// estimated line index, allowing for a small error in the initial linespacing
// and choosing the best available model.
void RefineLineSpacing(const GenericVector<double> &positions);
void RefineLineSpacing(const std::vector<double> &positions);

// Given an initial estimate of line spacing (m_in) and the positions of each
// baseline, computes the line spacing of the block more accurately in m_out,
// and the corresponding intercept in c_out, and the number of spacings seen
// in index_delta. Returns the error of fit to the line spacing model.
double FitLineSpacingModel(const GenericVector<double> &positions, double m_in, double *m_out,
double FitLineSpacingModel(const std::vector<double> &positions, double m_in, double *m_out,
double *c_out, int *index_delta);

// The block to which this class adds extra information used during baseline

@ -384,6 +384,23 @@ int SortByBoxLeft(const void *void1, const void *void2) {
return p1->bounding_box().top() - p2->bounding_box().top();
}

template <class BBC>
bool StdSortByBoxLeft(const void *void1, const void *void2) {
// The void*s are actually doubly indirected, so get rid of one level.
const BBC *p1 = *static_cast<const BBC *const *>(void1);
const BBC *p2 = *static_cast<const BBC *const *>(void2);
int result = p1->bounding_box().left() - p2->bounding_box().left();
if (result != 0)
return result < 0;
result = p1->bounding_box().right() - p2->bounding_box().right();
if (result != 0)
return result < 0;
result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
if (result != 0)
return result < 0;
return p1->bounding_box().top() < p2->bounding_box().top();
}

// Sort function to sort a BBC by bounding_box().right() in right-to-left order.
template <class BBC>
int SortRightToLeft(const void *void1, const void *void2) {
@ -402,6 +419,23 @@ int SortRightToLeft(const void *void1, const void *void2) {
return p1->bounding_box().top() - p2->bounding_box().top();
}

template <class BBC>
bool StdSortRightToLeft(const void *void1, const void *void2) {
// The void*s are actually doubly indirected, so get rid of one level.
const BBC *p1 = *static_cast<const BBC *const *>(void1);
const BBC *p2 = *static_cast<const BBC *const *>(void2);
int result = p2->bounding_box().right() - p1->bounding_box().right();
if (result != 0)
return result < 0;
result = p2->bounding_box().left() - p1->bounding_box().left();
if (result != 0)
return result < 0;
result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
if (result != 0)
return result < 0;
return p1->bounding_box().top() < p2->bounding_box().top();
}

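The new Std* comparators adapt the qsort-style convention (negative/zero/positive int) to what std::sort needs: a bool predicate forming a strict weak ordering, built here from the same left/right/bottom/top tie-breaks. A reduced sketch of that conversion on a plain box type (hypothetical Box, standing in for the BBC template parameter):

#include <algorithm>
#include <vector>

struct Box {
  int left, right, bottom, top;
};

// qsort-style: negative, zero or positive, with coordinate tie-breaks.
int CompareLeft(const Box &a, const Box &b) {
  if (int d = a.left - b.left) return d;
  if (int d = a.right - b.right) return d;
  if (int d = a.bottom - b.bottom) return d;
  return a.top - b.top;
}

// std::sort-style strict weak ordering built on the same comparison.
bool LessLeft(const Box &a, const Box &b) {
  return CompareLeft(a, b) < 0;
}

void SortBoxes(std::vector<Box> &boxes) {
  std::sort(boxes.begin(), boxes.end(), LessLeft);
}
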
// Sort function to sort a BBC by bounding_box().bottom().
template <class BBC>
int SortByBoxBottom(const void *void1, const void *void2) {

@ -18,7 +18,6 @@
///////////////////////////////////////////////////////////////////////

#include "cjkpitch.h"
#include "genericvector.h"
#include "topitch.h"
#include "tovars.h"

@ -109,7 +108,7 @@ public:
~LocalCorrelation() {}

void Finish() {
values_.sort(float_pair_compare);
std::sort(values_.begin(), values_.end(), float_pair_compare);
finalized_ = true;
}

@ -155,14 +154,12 @@ public:
}

private:
static int float_pair_compare(const void *a, const void *b) {
const auto *f_a = static_cast<const float_pair *>(a);
const auto *f_b = static_cast<const float_pair *>(b);
return (f_a->x > f_b->x) ? 1 : ((f_a->x < f_b->x) ? -1 : 0);
static bool float_pair_compare(const float_pair f_a, const float_pair f_b) {
return f_a.x < f_b.x;
}

bool finalized_;
GenericVector<struct float_pair> values_;
std::vector<struct float_pair> values_;
};

// Class to represent a character on a fixed pitch row. A FPChar may
@ -450,7 +447,7 @@ private:
index++;
}
}
characters_.truncate(index);
characters_.resize(index);
}

float pitch_ = 0.0f; // Character pitch.
@ -472,7 +469,7 @@ private:

SimpleStats heights_;

GenericVector<FPChar> characters_;
std::vector<FPChar> characters_;
TO_ROW *real_row_ = nullptr; // Underlying TD_ROW for this row.
};

@ -101,7 +101,9 @@ ColumnFinder::ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &trig
}

ColumnFinder::~ColumnFinder() {
column_sets_.delete_data_pointers();
for (auto set : column_sets_) {
delete set;
}
delete[] best_columns_;
delete stroke_width_;
delete input_blobs_win_;
@ -552,7 +554,7 @@ bool ColumnFinder::MakeColumns(bool single_column) {
bool good_only = true;
do {
for (int i = 0; i < gridheight_; ++i) {
ColPartitionSet *line_set = part_sets.get(i);
ColPartitionSet *line_set = part_sets.at(i);
if (line_set != nullptr && line_set->LegalColumnCandidate()) {
ColPartitionSet *column_candidate = line_set->Copy(good_only);
if (column_candidate != nullptr)
@ -590,7 +592,7 @@ bool ColumnFinder::MakeColumns(bool single_column) {
ComputeMeanColumnGap(any_multi_column);
}
for (int i = 0; i < part_sets.size(); ++i) {
ColPartitionSet *line_set = part_sets.get(i);
ColPartitionSet *line_set = part_sets.at(i);
if (line_set != nullptr) {
line_set->RelinquishParts();
delete line_set;
@ -604,8 +606,9 @@ bool ColumnFinder::MakeColumns(bool single_column) {
// Src_sets may be equal to column_candidates, in which case it will
// use them as a source to improve themselves.
void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets) {
PartSetVector temp_cols;
temp_cols.move(column_sets);
// TODO: optimize.
PartSetVector temp_cols = *column_sets;
column_sets->clear();
if (src_sets == column_sets)
src_sets = &temp_cols;
int set_size = temp_cols.size();
@ -613,7 +616,7 @@ void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVecto
bool good_only = true;
do {
for (int i = 0; i < set_size; ++i) {
ColPartitionSet *column_candidate = temp_cols.get(i);
ColPartitionSet *column_candidate = temp_cols.at(i);
ASSERT_HOST(column_candidate != nullptr);
ColPartitionSet *improved = column_candidate->Copy(good_only);
if (improved != nullptr) {
@ -623,10 +626,15 @@ void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVecto
}
good_only = !good_only;
} while (column_sets->empty() && !good_only);
if (column_sets->empty())
column_sets->move(&temp_cols);
else
temp_cols.delete_data_pointers();
if (column_sets->empty()) {
// TODO: optimize.
column_sets = &temp_cols;
temp_cols.clear();
} else {
for (auto data : temp_cols) {
delete data;
}
}
}

// Prints debug information on the column candidates.
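
Two details in the ImproveColumnCandidates hunks deserve a close look. First, GenericVector::move() transferred contents and emptied the source, which the copy-then-clear replacement at the top reproduces (the TODO notes the extra copy). Second, in the final hunk the displayed line column_sets = &temp_cols; only reseats the local pointer, so the caller's vector would stay empty, unlike the old column_sets->move(&temp_cols). Whether that is the actual commit or an artifact of how this page was captured, a content-transferring form such as *column_sets = std::move(temp_cols); matches the original semantics. An illustrative sketch with plain ints in place of the ColPartitionSet pointers:

#include <utility>
#include <vector>

// Mirrors what GenericVector::move(&src) used to do through a pointer.
void TakeContents(std::vector<int> *dest, std::vector<int> &src) {
  *dest = std::move(src); // dest now holds the elements
  src.clear();            // src is left empty but valid
}

int main() {
  std::vector<int> temp_cols{1, 2, 3};
  std::vector<int> column_sets;
  TakeContents(&column_sets, temp_cols); // like column_sets->move(&temp_cols)
  return 0;
}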

@ -635,7 +643,7 @@ void ColumnFinder::PrintColumnCandidates(const char *title) {
tprintf("Found %d %s:\n", set_size, title);
if (textord_debug_tabfind >= 3) {
for (int i = 0; i < set_size; ++i) {
ColPartitionSet *column_set = column_sets_.get(i);
ColPartitionSet *column_set = column_sets_.at(i);
column_set->Print();
}
}

@ -673,7 +681,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
// Set possible column_sets to indicate whether each set is compatible
// with each column.
for (int part_i = 0; part_i < set_count; ++part_i) {
ColPartitionSet *line_set = part_sets.get(part_i);
ColPartitionSet *line_set = part_sets.at(part_i);
bool debug = line_set != nullptr && WithinTestRegion(2, line_set->bounding_box().left(),
line_set->bounding_box().bottom());
column_set_costs[part_i] = new int[column_count];

@ -681,8 +689,8 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
assigned_costs[part_i] = INT32_MAX;
for (int col_i = 0; col_i < column_count; ++col_i) {
if (line_set != nullptr &&
column_sets_.get(col_i)->CompatibleColumns(debug, line_set, WidthCB())) {
column_set_costs[part_i][col_i] = column_sets_.get(col_i)->UnmatchedWidth(line_set);
column_sets_.at(col_i)->CompatibleColumns(debug, line_set, WidthCB())) {
column_set_costs[part_i][col_i] = column_sets_.at(col_i)->UnmatchedWidth(line_set);
any_columns_possible[part_i] = true;
} else {
column_set_costs[part_i][col_i] = INT32_MAX;

@ -702,7 +710,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
int column_set_id = RangeModalColumnSet(column_set_costs, assigned_costs, start, end);
if (textord_debug_tabfind >= 2) {
tprintf("Range modal column id = %d\n", column_set_id);
column_sets_.get(column_set_id)->Print();
column_sets_.at(column_set_id)->Print();
}
// Now find the longest run of the column_set_id in the range.
ShrinkRangeToLongestRun(column_set_costs, assigned_costs, any_columns_possible, column_set_id,

@ -722,7 +730,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) {
tprintf("Column id %d applies to range = %d - %d\n", column_set_id, start, end);
// Assign the column to the range, which now may overlap with other ranges.
AssignColumnToRange(column_set_id, start, end, column_set_costs, assigned_costs);
if (column_sets_.get(column_set_id)->GoodColumnCount() > 1)
if (column_sets_.at(column_set_id)->GoodColumnCount() > 1)
any_multi_column = true;
}
// If anything remains unassigned, the whole lot is unassigned, so

@ -879,7 +887,7 @@ void ColumnFinder::ExtendRangePastSmallGaps(int **column_set_costs, const int *a
// Assigns the given column_set_id to the given range.
void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
int **column_set_costs, int *assigned_costs) {
ColPartitionSet *column_set = column_sets_.get(column_set_id);
ColPartitionSet *column_set = column_sets_.at(column_set_id);
for (int i = start; i < end; ++i) {
assigned_costs[i] = column_set_costs[i][column_set_id];
best_columns_[i] = column_set;
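
The get(i) to at(i) rewrites in these hunks are a straight API mapping, but the semantics are worth noting: std::vector::at() bounds-checks and throws std::out_of_range on a bad index, while operator[] performs no check at all. A short sketch of the distinction:

#include <cstdio>
#include <stdexcept>
#include <vector>

int main() {
  std::vector<int> v{10, 20};
  int checked = v.at(1); // throws std::out_of_range on a bad index
  int unchecked = v[1];  // no check: an out-of-range index is undefined behavior
  try {
    v.at(2);
  } catch (const std::out_of_range &e) {
    std::printf("caught: %s\n", e.what());
  }
  return checked == unchecked ? 0 : 1;
}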

@ -1472,7 +1472,7 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction
ComputeSearchBoxAndScaling(direction, part_box, gridsize(), &search_box, &dist_scaling);
bool image_region =
ImageFind::CountPixelsInRotatedBox(search_box, im_box, rerotation, nontext_map) > 0;
GenericVector<int> dists[NPT_COUNT];
std::vector<int> dists[NPT_COUNT];
AccumulatePartDistances(part, dist_scaling, search_box, nontext_map, im_box, rerotation, debug,
dists);
// By iteratively including the next smallest distance across the vectors,

@ -1537,12 +1537,12 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction
// vectors in the dists array are sorted in increasing order.
// The nontext_map (+im_box, rerotation) is used to make text invisible if
// there is non-text in between.
// dists must be an array of GenericVectors of size NPT_COUNT.
// dists must be an array of vectors of size NPT_COUNT.
void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part,
const ICOORD &dist_scaling, const TBOX &search_box,
Pix *nontext_map, const TBOX &im_box,
const FCOORD &rerotation, bool debug,
GenericVector<int> *dists) {
std::vector<int> *dists) {
const TBOX &part_box = base_part.bounding_box();
ColPartitionGridSearch rsearch(this);
rsearch.SetUniqueMode(true);

@ -1571,7 +1571,7 @@ void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part,
// Truncate the number of boxes, so text doesn't get too much advantage.
int n_boxes = std::min(neighbour->boxes_count(), kSmoothDecisionMargin);
BlobTextFlowType n_flow = neighbour->flow();
GenericVector<int> *count_vector = nullptr;
std::vector<int> *count_vector = nullptr;
if (n_flow == BTFT_STRONG_CHAIN) {
if (n_type == BRT_TEXT)
count_vector = &dists[NPT_HTEXT];

@ -1602,8 +1602,9 @@ void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part,
neighbour->Print();
}
}
for (int i = 0; i < NPT_COUNT; ++i)
dists[i].sort();
for (int i = 0; i < NPT_COUNT; ++i) {
std::sort(dists[i].begin(), dists[i].end());
}
}

// Improves the margins of the part ColPartition by searching for

@ -214,10 +214,10 @@ private:
// distance (scaled by dist_scaling) of the part from the base_part to the
// vector of the appropriate type for the partition. Prior to return, the
// vectors in the dists array are sorted in increasing order.
// dists must be an array of GenericVectors of size NPT_COUNT.
// dists must be an array of vectors of size NPT_COUNT.
void AccumulatePartDistances(const ColPartition &base_part, const ICOORD &dist_scaling,
const TBOX &search_box, Pix *nontext_map, const TBOX &im_box,
const FCOORD &rerotation, bool debug, GenericVector<int> *dists);
const FCOORD &rerotation, bool debug, std::vector<int> *dists);

// Improves the margins of the ColPartition by searching for
// neighbours that vertically overlap significantly.

@ -93,7 +93,7 @@ void ColPartitionSet::ImproveColumnCandidate(WidthCallback cb, PartSetVector *sr
// Iterate over the provided column sets, as each one may have something
// to improve this.
for (int i = 0; i < set_size; ++i) {
ColPartitionSet *column_set = src_sets->get(i);
ColPartitionSet *column_set = src_sets->at(i);
if (column_set == nullptr)
continue;
// Iterate over the parts in this and column_set, adding bigger or

@ -184,7 +184,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthC
return;
}
for (int i = 0; i < column_sets->size(); ++i) {
ColPartitionSet *columns = column_sets->get(i);
ColPartitionSet *columns = column_sets->at(i);
// In ordering the column set candidates, good_coverage_ is king,
// followed by good_column_count_ and then bad_coverage_.
bool better = good_coverage_ > columns->good_coverage_;

@ -198,7 +198,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthC
// The new one is better so add it.
if (debug)
tprintf("Good one\n");
column_sets->insert(this, i);
column_sets->insert(column_sets->begin() + i, this);
return;
}
if (columns->CompatibleColumns(false, this, cb)) {
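
GenericVector::insert(item, index) took the element first and the position second; std::vector::insert() takes an iterator position first and the value second, which is why every call site above becomes insert(begin() + i, item). A minimal sketch of the translated call:

#include <string>
#include <vector>

int main() {
  std::vector<std::string> v{"a", "c"};
  // GenericVector style was: v.insert("b", 1);
  v.insert(v.begin() + 1, "b"); // std::vector style; v is now {"a", "b", "c"}
  return 0;
}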

@ -21,7 +21,6 @@
#define TESSERACT_TEXTORD_COLPARTITIONSET_H_

#include "colpartition.h" // For ColPartition_LIST.
#include "genericvector.h" // For GenericVector.
#include "rect.h" // For TBOX.
#include "tabvector.h" // For BLOBNBOX_CLIST.

@ -30,7 +29,7 @@ namespace tesseract {
class WorkingPartSet_LIST;
class ColSegment_LIST;
class ColPartitionSet;
using PartSetVector = GenericVector<ColPartitionSet *>;
using PartSetVector = std::vector<ColPartitionSet *>;

// ColPartitionSet is a class that holds a list of ColPartitions.
// Its main use is in holding a candidate partitioning of the width of the

@ -516,7 +516,7 @@ ScrollView *TabFind::FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_g
#ifndef GRAPHICS_DISABLED

// Helper displays all the boxes in the given vector on the given window.
static void DisplayBoxVector(const GenericVector<BLOBNBOX *> &boxes, ScrollView *win) {
static void DisplayBoxVector(const std::vector<BLOBNBOX *> &boxes, ScrollView *win) {
for (int i = 0; i < boxes.size(); ++i) {
TBOX box = boxes[i]->bounding_box();
int left_x = box.left();

@ -552,8 +552,8 @@ ScrollView *TabFind::FindTabBoxes(int min_gutter_width, double tabfind_aligned_g
}
// Sort left tabs by left and right by right to see the outermost one first
// on a ragged tab.
left_tab_boxes_.sort(SortByBoxLeft<BLOBNBOX>);
right_tab_boxes_.sort(SortRightToLeft<BLOBNBOX>);
std::sort(left_tab_boxes_.begin(), left_tab_boxes_.end(), StdSortByBoxLeft<BLOBNBOX>);
std::sort(right_tab_boxes_.begin(), right_tab_boxes_.end(), StdSortRightToLeft<BLOBNBOX>);
ScrollView *tab_win = nullptr;
#ifndef GRAPHICS_DISABLED
if (textord_tabfind_show_initialtabs) {
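
Sorting the tab-stop vectors now goes through std::sort with the Std-prefixed predicate templates instead of GenericVector::sort with the qsort-style ones. The left tabs sort ascending by the left edge and the right tabs descending by the right edge, so the outermost candidate comes first in both lists. A sketch of what such a predicate pair looks like, with an illustrative Box type standing in for BLOBNBOX:

#include <algorithm>
#include <vector>

struct Box {
  int left;
  int right;
};

// Ascending by left edge, for the left tab candidates.
static bool SortLeftAscending(const Box *b1, const Box *b2) {
  return b1->left < b2->left;
}

// Descending by right edge, so the outermost right tab comes first.
static bool SortRightDescending(const Box *b1, const Box *b2) {
  return b1->right > b2->right;
}

int main() {
  std::vector<Box *> boxes{new Box{5, 9}, new Box{2, 7}};
  std::sort(boxes.begin(), boxes.end(), SortLeftAscending);
  std::sort(boxes.begin(), boxes.end(), SortRightDescending);
  for (auto *box : boxes) {
    delete box;
  }
  return 0;
}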

@ -831,7 +831,7 @@ int TabFind::FindTabVectors(int search_size_multiple, TabAlignment alignment, in
int vector_count = 0;
// Search the right or left tab boxes, looking for tab vectors.
bool right = alignment == TA_RIGHT_ALIGNED || alignment == TA_RIGHT_RAGGED;
const GenericVector<BLOBNBOX *> &boxes = right ? right_tab_boxes_ : left_tab_boxes_;
const std::vector<BLOBNBOX *> &boxes = right ? right_tab_boxes_ : left_tab_boxes_;
for (int i = 0; i < boxes.size(); ++i) {
BLOBNBOX *bbox = boxes[i];
if ((!right && bbox->left_tab_type() == TT_MAYBE_ALIGNED) ||

@ -354,8 +354,8 @@ private:
/** Callback to test an int for being a common width. */
WidthCallback width_cb_;
// Sets of bounding boxes that are candidate tab stops.
GenericVector<BLOBNBOX *> left_tab_boxes_;
GenericVector<BLOBNBOX *> right_tab_boxes_;
std::vector<BLOBNBOX *> left_tab_boxes_;
std::vector<BLOBNBOX *> right_tab_boxes_;
};

} // namespace tesseract.

@ -156,12 +156,11 @@ bool StructuredTable::FindLinedStructure() {
if (cell_x_.size() < 3 || cell_y_.size() < 3)
return false;

cell_x_.sort();
cell_y_.sort();

// Remove duplicates that may have occurred due to split lines.
cell_x_.compact_sorted();
cell_y_.compact_sorted();
// Sort and remove duplicates that may have occurred due to split lines.
std::sort(cell_x_.begin(), cell_x_.end());
std::unique(cell_x_.begin(), cell_x_.end());
std::sort(cell_y_.begin(), cell_y_.end());
std::unique(cell_y_.begin(), cell_y_.end());

// The border should be the extents of line boxes, not middle.
cell_x_[0] = bounding_box_.left();

@ -170,8 +169,8 @@ bool StructuredTable::FindLinedStructure() {
cell_y_[cell_y_.size() - 1] = bounding_box_.top();

// Remove duplicates that may have occurred due to moving the borders.
cell_x_.compact_sorted();
cell_y_.compact_sorted();
std::unique(cell_x_.begin(), cell_x_.end());
std::unique(cell_y_.begin(), cell_y_.end());

CalculateMargins();
CalculateStats();
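
The compact_sorted() replacement in both hunks above has a trap: std::unique() only shifts the unique elements to the front and returns an iterator to the new logical end; it never changes the container's size. As displayed, the return value is discarded, so cell_x_ and cell_y_ would keep stale tail elements that compact_sorted() used to remove. The conventional erase-unique idiom closes that gap:

#include <algorithm>
#include <vector>

int main() {
  std::vector<int> cells{3, 1, 3, 2, 1};
  std::sort(cells.begin(), cells.end()); // {1, 1, 2, 3, 3}
  // std::unique compacts in place and returns the new logical end;
  // without the erase, the vector would still report size 5.
  cells.erase(std::unique(cells.begin(), cells.end()), cells.end());
  // cells is now {1, 2, 3}
  return 0;
}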

@ -347,8 +346,8 @@ bool StructuredTable::VerifyWhitespacedTable() {
// in the middle of the two nearest partitions.
void StructuredTable::FindWhitespacedColumns() {
// Set of the extents of all partitions on the page.
GenericVector<int> left_sides;
GenericVector<int> right_sides;
std::vector<int> left_sides;
std::vector<int> right_sides;

// Look at each text partition. We want to find the partitions
// that have extremal left/right sides. These will give us a basis

@ -371,8 +370,8 @@ void StructuredTable::FindWhitespacedColumns() {
return;

// Since data may be inserted in grid order, we sort the left/right sides.
left_sides.sort();
right_sides.sort();
std::sort(left_sides.begin(), left_sides.end());
std::sort(right_sides.begin(), right_sides.end());

// At this point, in the "merged list", we expect to have a left side,
// followed by either more left sides or a right side. The last number

@ -390,8 +389,8 @@ void StructuredTable::FindWhitespacedColumns() {
// in the middle of the two nearest partitions.
void StructuredTable::FindWhitespacedRows() {
// Set of the extents of all partitions on the page.
GenericVector<int> bottom_sides;
GenericVector<int> top_sides;
std::vector<int> bottom_sides;
std::vector<int> top_sides;
// We will be "shrinking" partitions, so keep the min/max around to
// make sure the bottom/top lines do not intersect text.
int min_bottom = INT32_MAX;

@ -435,8 +434,8 @@ void StructuredTable::FindWhitespacedRows() {
return;

// Since data may be inserted in grid order, we sort the bottom/top sides.
bottom_sides.sort();
top_sides.sort();
std::sort(bottom_sides.begin(), bottom_sides.end());
std::sort(top_sides.begin(), top_sides.end());

// At this point, in the "merged list", we expect to have a bottom side,
// followed by either more bottom sides or a top side. The last number

@ -573,17 +572,17 @@ void StructuredTable::AbsorbNearbyLines() {
// desired height.
// The first/last items are extremal values of the list and known.
// NOTE: This function assumes the lists are sorted!
void StructuredTable::FindCellSplitLocations(const GenericVector<int> &min_list,
const GenericVector<int> &max_list, int max_merged,
GenericVector<int> *locations) {
void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
const std::vector<int> &max_list, int max_merged,
std::vector<int> *locations) {
locations->clear();
ASSERT_HOST(min_list.size() == max_list.size());
if (min_list.size() == 0)
return;
ASSERT_HOST(min_list.get(0) < max_list.get(0));
ASSERT_HOST(min_list.get(min_list.size() - 1) < max_list.get(max_list.size() - 1));
ASSERT_HOST(min_list.at(0) < max_list.at(0));
ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));

locations->push_back(min_list.get(0));
locations->push_back(min_list.at(0));
int min_index = 0;
int max_index = 0;
int stacked_partitions = 0;

@ -610,7 +609,7 @@ void StructuredTable::FindCellSplitLocations(const GenericVector<int> &min_list,
++max_index;
}
}
locations->push_back(max_list.get(max_list.size() - 1));
locations->push_back(max_list.at(max_list.size() - 1));
}

// Counts the number of partitions in the table

@ -21,7 +21,6 @@
#define TABLERECOG_H_

#include "colpartitiongrid.h"
#include "genericvector.h"

namespace tesseract {

@ -209,9 +208,9 @@ protected:
// are inserted wherever space exists between partitions. If it is 2,
// lines may intersect 2 partitions at most, but you also need at least
// 2 partitions to generate a line.
static void FindCellSplitLocations(const GenericVector<int> &min_list,
const GenericVector<int> &max_list, int max_merged,
GenericVector<int> *locations);
static void FindCellSplitLocations(const std::vector<int> &min_list,
const std::vector<int> &max_list, int max_merged,
std::vector<int> *locations);

////////
//////// Utility function for table queries

@ -236,8 +235,8 @@ protected:
// bounding box is a convenient external representation.
// cell_x_ and cell_y_ indicate the grid lines.
TBOX bounding_box_; // Bounding box
GenericVector<int> cell_x_; // Locations of vertical divisions (sorted)
GenericVector<int> cell_y_; // Locations of horizontal divisions (sorted)
std::vector<int> cell_x_; // Locations of vertical divisions (sorted)
std::vector<int> cell_y_; // Locations of horizontal divisions (sorted)
bool is_lined_; // Is the table backed up by a line structure
// Table margins, set via CalculateMargins
int space_above_;

@ -49,7 +49,7 @@
#include "tprintf.h" // for tprintf
#include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP

#include "genericvector.h" // for PointerVector, GenericVector
#include "genericvector.h" // for PointerVector

#include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate

@ -685,7 +685,7 @@ struct BlockGroup {
// Min xheight of the blocks.
float min_xheight;
// Collection of borrowed pointers to the blocks in the group.
GenericVector<BLOCK *> blocks;
std::vector<BLOCK *> blocks;
};

// Groups blocks by rotation, then, for each group, makes a WordGrid and calls

@ -41,9 +41,6 @@

namespace tesseract {

template <typename T>
class GenericVector;

// Even though the limit on the number of chunks may now be removed, keep
// the same limit for repeatable behavior, and it may be a speed advantage.
static const int kMaxNumChunks = 64;

@ -79,7 +76,7 @@ static int check_blob(TBLOB *blob) {
*
* Return true if any of the splits share a point with this one.
*/
static int any_shared_split_points(const GenericVector<SEAM *> &seams, SEAM *seam) {
static int any_shared_split_points(const std::vector<SEAM *> &seams, SEAM *seam) {
int length;
int index;

@ -167,13 +164,13 @@ static int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {
// Helper runs all the checks on a seam to make sure it is valid.
// Returns the seam if OK, otherwise deletes the seam and returns nullptr.
static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob,
TBLOB *other_blob, const GenericVector<SEAM *> &seams, SEAM *seam) {
TBLOB *other_blob, const std::vector<SEAM *> &seams, SEAM *seam) {
if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||
total_containment(blob, other_blob) || check_blob(other_blob) ||
!seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
any_shared_split_points(seams, seam) ||
!seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
word->blobs.remove(blob_number + 1);
word->blobs.erase(word->blobs.begin() + blob_number + 1);
if (seam) {
seam->UndoSeam(blob, other_blob);
delete seam;

@ -200,12 +197,12 @@ static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB
* it was successful.
*/
SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams) {
const std::vector<SEAM *> &seams) {
if (repair_unchopped_blobs)
preserve_outline_tree(blob->outlines);
TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
// Insert it into the word.
word->blobs.insert(other_blob, blob_number + 1);
word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);

SEAM *seam = nullptr;
if (prioritize_division) {

@ -235,7 +232,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number,
TPOINT location;
if (divisible_blob(blob, italic_blob, &location)) {
other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
word->blobs.insert(other_blob, blob_number + 1);
word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
seam = new SEAM(0.0f, location);
seam->ApplySeam(italic_blob, blob, other_blob);
seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);

@ -250,7 +247,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number,
}

SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams) {
const std::vector<SEAM *> &seams) {
return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams);
}

@ -305,7 +302,7 @@ SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic
* word->seam_array and the resulting blobs are unclassified, so this function
* can be used by ApplyBox as well as during recognition.
*/
SEAM *Wordrec::improve_one_blob(const GenericVector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
int *blob_number) {
float rating_ceiling = FLT_MAX;

@ -347,7 +344,7 @@ SEAM *Wordrec::improve_one_blob(const GenericVector<BLOB_CHOICE *> &blob_choices
* Used for testing chopper.
*/
SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,
const GenericVector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
int *blob_number) {
if (prioritize_division) {
return chop_overlapping_blob(boxes, true, word_res, blob_number);

@ -427,12 +424,12 @@ void Wordrec::chop_word_main(WERD_RES *word) {
void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
LMPainPoints *pain_points,
GenericVector<SegSearchPending> *pending) {
std::vector<SegSearchPending> *pending) {
int blob_number;
do { // improvement loop.
// Make a simple vector of BLOB_CHOICEs to make it easy to pick which
// one to chop.
GenericVector<BLOB_CHOICE *> blob_choices;
std::vector<BLOB_CHOICE *> blob_choices;
int num_blobs = word->ratings->dimension();
for (int i = 0; i < num_blobs; ++i) {
BLOB_CHOICE_LIST *choices = word->ratings->get(i, i);

@ -460,7 +457,7 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
// Remap existing pain points.
pain_points->RemapForSplit(blob_number);
// Insert a new pending at the chop point.
pending->insert(SegSearchPending(), blob_number);
pending->insert(pending->begin() + blob_number, SegSearchPending());

// Classify the two newly created blobs using ProcessSegSearchPainPoint,
// as that updates the pending correctly and adds new pain points.

@ -501,7 +498,7 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
* These are the results of the last classification. Find a likely
* place to apply splits. If none, return -1.
**********************************************************************/
int Wordrec::select_blob_to_split(const GenericVector<BLOB_CHOICE *> &blob_choices,
int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
float rating_ceiling, bool split_next_to_fragment) {
BLOB_CHOICE *blob_choice;
int x;

@ -34,8 +34,6 @@
#include "unicharset.h" // for UNICHARSET
#include "unicity_table.h" // for UnicityTable

template <typename T>
class GenericVector;
template <typename T>
class UnicityTable;

@ -200,9 +200,10 @@ bool LMPainPoints::GeneratePainPoint(int col, int row, LMPainPointsType pp_type,
*/
void LMPainPoints::RemapForSplit(int index) {
for (auto &pain_points_heap : pain_points_heaps_) {
GenericVector<MatrixCoordPair> *heap = pain_points_heap.heap();
for (int j = 0; j < heap->size(); ++j)
(*heap)[j].data().MapForSplit(index);
std::vector<MatrixCoordPair> &heap = pain_points_heap.heap();
for (auto entry : heap) {
entry.data().MapForSplit(index);
}
}
}
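
The rewritten RemapForSplit loop has a subtlety: for (auto entry : heap) copies each MatrixCoordPair, so MapForSplit() updates the copy and the heap's own entries are left untouched, whereas the old indexed loop wrote through (*heap)[j]. Iterating by reference preserves the original behavior; a sketch with a simplified element type:

#include <vector>

struct Entry {
  int col = 0;
  void MapForSplit(int index) {
    if (col >= index) {
      ++col; // shift to account for the newly inserted column
    }
  }
};

int main() {
  std::vector<Entry> heap(3);
  for (auto entry : heap) { // by value: mutates a copy, heap is unchanged
    entry.MapForSplit(0);
  }
  for (auto &entry : heap) { // by reference: updates land in the vector
    entry.MapForSplit(0);
  }
  return 0;
}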

@ -23,6 +23,8 @@
#include <cstdio>

#include "bitvector.h"
#include "helpers.h" // for ClipToRange
#include "serialis.h" // for TFile
#include "tprintf.h"

namespace tesseract {

@ -103,8 +105,8 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
present.Init(PTRAIN_NUM_FEATURE_TYPES);
lang_ = lang;
// Load weights for passes with adaption on.
GenericVector<float> &weights = weights_vec_[pass_];
weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0);
std::vector<float> &weights = weights_vec_[pass_];
weights.resize(PTRAIN_NUM_FEATURE_TYPES, 0.0f);

while (fp->FGets(line, kMaxLineSize) != nullptr) {
char *key = nullptr;
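
init_to_size(n, v) reset a GenericVector to exactly n copies of v. std::vector::resize(n, v) keeps any existing elements and only fills newly created slots, so the two agree on a freshly constructed vector but diverge when the vector is reused, as weights_vec_[pass_] can be across loads. assign(n, v) is the exact equivalent when a full reset is intended:

#include <vector>

int main() {
  std::vector<float> weights{7.0f, 8.0f};
  weights.resize(4, 0.0f); // {7, 8, 0, 0}: existing values survive
  weights.assign(4, 0.0f); // {0, 0, 0, 0}: full reset, like init_to_size()
  return 0;
}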

@ -129,13 +131,13 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
}
}
lang_ = "";
weights.truncate(0);
weights.clear();
}
return complete;
}

bool ParamsModel::SaveToFile(const char *full_path) const {
const GenericVector<float> &weights = weights_vec_[pass_];
const std::vector<float> &weights = weights_vec_[pass_];
if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) {
tprintf("Refusing to save ParamsModel that has not been initialized.\n");
return false;

@ -19,7 +19,7 @@
#ifndef TESSERACT_WORDREC_PARAMS_MODEL_H_
#define TESSERACT_WORDREC_PARAMS_MODEL_H_

#include "genericvector.h" // for GenericVector
#include <tesseract/export.h> // for TESS_API
#include "params_training_featdef.h" // for PTRAIN_NUM_FEATURE_TYPES

namespace tesseract {

@ -38,7 +38,7 @@ public:
};

ParamsModel() : pass_(PTRAIN_PASS1) {}
ParamsModel(const char *lang, const GenericVector<float> &weights)
ParamsModel(const char *lang, const std::vector<float> &weights)
: lang_(lang), pass_(PTRAIN_PASS1) {
weights_vec_[pass_] = weights;
}

@ -65,10 +65,10 @@ public:
// Returns true on success.
bool LoadFromFp(const char *lang, TFile *fp);

const GenericVector<float> &weights() const {
const std::vector<float> &weights() const {
return weights_vec_[pass_];
}
const GenericVector<float> &weights_for_pass(PassEnum pass) const {
const std::vector<float> &weights_for_pass(PassEnum pass) const {
return weights_vec_[pass];
}
void SetPass(PassEnum pass) {

@ -84,7 +84,7 @@ private:
PassEnum pass_;
// Several sets of weights for various OCR passes (e.g. pass1 with adaption,
// pass2 without adaption, etc).
GenericVector<float> weights_vec_[PTRAIN_NUM_PASSES];
std::vector<float> weights_vec_[PTRAIN_NUM_PASSES];
};

} // namespace tesseract

@ -46,7 +46,7 @@ using tesseract::ScoredFont;
* the collection of small pieces un modified.
**********************************************************************/
namespace tesseract {
BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM *> &seams, int16_t start,
BLOB_CHOICE_LIST *Wordrec::classify_piece(const std::vector<SEAM *> &seams, int16_t start,
int16_t end, const char *description, TWERD *word,
BlamerBundle *blamer_bundle) {
if (end > start)

@ -19,7 +19,6 @@
#include <cstdint> // for INT32_MAX
#include "blamer.h" // for BlamerBundle
#include "errcode.h" // for ASSERT_HOST
#include "genericvector.h" // for GenericVector
#include "lm_pain_points.h" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi...
#include "lm_state.h" // for BestChoiceBundle, ViterbiStateEntry
#include "matrix.h" // for MATRIX_COORD, MATRIX

@ -44,7 +43,7 @@ void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle
// Compute scaling factor that will help us recover blob outline length
// from classifier rating and certainty for the blob.
float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
GenericVector<SegSearchPending> pending;
std::vector<SegSearchPending> pending;
InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle, blamer_bundle);

if (!SegSearchDone(0)) { // find a better choice

@ -122,7 +121,7 @@ void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle
// without doing any additional chopping or joining.
// (Internal factored version that can be used as part of the main SegSearch.)
void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
GenericVector<SegSearchPending> *pending,
std::vector<SegSearchPending> *pending,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) {
if (segsearch_debug_level > 0) {
tprintf("Starting SegSearch on ratings matrix%s:\n",

@ -154,7 +153,7 @@ void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
// children are considered in the non-decreasing order of their column, since
// this guarantees that all the parents would be up to date before an update
// of a child is done.
pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
pending->resize(word_res->ratings->dimension(), SegSearchPending());

// Search the ratings matrix for the initial best path.
(*pending)[0].SetColumnClassified();

@ -163,7 +162,7 @@ void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
}

void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
GenericVector<SegSearchPending> *pending, WERD_RES *word_res,
std::vector<SegSearchPending> *pending, WERD_RES *word_res,
LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
BlamerBundle *blamer_bundle) {
MATRIX *ratings = word_res->ratings;

@ -223,7 +222,7 @@ void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col,

void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
const char *pain_point_type,
GenericVector<SegSearchPending> *pending,
std::vector<SegSearchPending> *pending,
WERD_RES *word_res, LMPainPoints *pain_points,
BlamerBundle *blamer_bundle) {
if (segsearch_debug_level > 0) {

@ -279,7 +278,7 @@ void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_
// Needed when the n-gram model is enabled, as the multi-length comparison
// implementation will re-value existing paths to worse values.
void Wordrec::ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
GenericVector<SegSearchPending> *pending) {
std::vector<SegSearchPending> *pending) {
// TODO(rays) More refactoring required here.
// Delete existing viterbi states.
for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {

@ -81,8 +81,6 @@ public:
# include "seam.h" // for SEAM (ptr only), PRIORITY
# include "stopper.h" // for DANGERR

# include "genericvector.h" // for GenericVector

# include <cstdint> // for int16_t, int32_t

namespace tesseract {

@ -329,7 +327,7 @@ public:
// without doing any additional chopping or joining.
// (Internal factored version that can be used as part of the main SegSearch.)
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
GenericVector<SegSearchPending> *pending,
std::vector<SegSearchPending> *pending,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);

// Runs SegSearch() function (above) without needing a best_choice_bundle

@ -352,22 +350,22 @@ public:

// chopper.cpp
SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams);
const std::vector<SEAM *> &seams);
SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
const GenericVector<SEAM *> &seams);
const std::vector<SEAM *> &seams);
SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
int *blob_number);
SEAM *improve_one_blob(const GenericVector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
int *blob_number);
SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
const GenericVector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
int *blob_number);
void chop_word_main(WERD_RES *word);
void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
LMPainPoints *pain_points, GenericVector<SegSearchPending> *pending);
int select_blob_to_split(const GenericVector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
LMPainPoints *pain_points, std::vector<SegSearchPending> *pending);
int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
bool split_next_to_fragment);
int select_blob_to_split_from_fixpt(DANGERR *fixpt);

@ -391,7 +389,7 @@ public:
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt);

// pieces.cpp
virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector<SEAM *> &seams, int16_t start,
virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start,
int16_t end, const char *description, TWERD *word,
BlamerBundle *blamer_bundle);
// Try to merge fragments in the ratings matrix and put the result in

@ -466,7 +464,7 @@ protected:
// if a new best choice is found
//
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
GenericVector<SegSearchPending> *pending, WERD_RES *word_res,
std::vector<SegSearchPending> *pending, WERD_RES *word_res,
LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
BlamerBundle *blamer_bundle);

@ -474,13 +472,13 @@ protected:
// new pain points to join the newly classified blob with its neighbors.
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
const char *pain_point_type,
GenericVector<SegSearchPending> *pending, WERD_RES *word_res,
std::vector<SegSearchPending> *pending, WERD_RES *word_res,
LMPainPoints *pain_points, BlamerBundle *blamer_bundle);
// Resets enough of the results so that the Viterbi search is re-run.
// Needed when the n-gram model is enabled, as the multi-length comparison
// implementation will re-value existing paths to worse values.
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
GenericVector<SegSearchPending> *pending);
std::vector<SegSearchPending> *pending);

// Add pain points for classifying blobs on the correct segmentation path
// (so that we can evaluate correct segmentation path and discover the reason

@ -92,15 +92,15 @@ public:
return ComputeForegroundDensity(tbox);
}

int RunCountAlignment(const GenericVector<int> &sorted_vec, const int val) {
int RunCountAlignment(const std::vector<int> &sorted_vec, const int val) {
return CountAlignment(sorted_vec, val);
}

void RunSplitCPHorLite(ColPartition *part, GenericVector<TBOX> *splitted_boxes) {
void RunSplitCPHorLite(ColPartition *part, std::vector<TBOX> *splitted_boxes) {
SplitCPHorLite(part, splitted_boxes);
}

void RunSplitCPHor(ColPartition *part, GenericVector<ColPartition *> *parts_splitted) {
void RunSplitCPHor(ColPartition *part, std::vector<ColPartition *> *parts_splitted) {
SplitCPHor(part, parts_splitted);
}

@ -377,7 +377,7 @@ TEST_F(EquationFinderTest, ComputeForegroundDensity) {
}

TEST_F(EquationFinderTest, CountAlignment) {
GenericVector<int> vec;
std::vector<int> vec;
vec.push_back(1);
vec.push_back(1);
vec.push_back(1);

@ -452,7 +452,7 @@ TEST_F(EquationFinderTest, SplitCPHorLite) {
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
part->DeleteBoxes();
part->set_median_width(10);
GenericVector<TBOX> splitted_boxes;
std::vector<TBOX> splitted_boxes;

// Test an empty part.
equation_det_->RunSplitCPHorLite(part, &splitted_boxes);

@ -486,7 +486,7 @@ TEST_F(EquationFinderTest, SplitCPHor) {
ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE);
part->DeleteBoxes();
part->set_median_width(10);
GenericVector<ColPartition *> parts_splitted;
std::vector<ColPartition *> parts_splitted;

// Test an empty part.
equation_det_->RunSplitCPHor(part, &parts_splitted);

@ -512,7 +512,9 @@ TEST_F(EquationFinderTest, SplitCPHor) {
EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box());
EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box());

parts_splitted.delete_data_pointers();
for (auto part_splitted : parts_splitted) {
delete part_splitted;
}
part->DeleteBoxes();
delete (part);
}
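
As in the ColumnFinder destructor earlier, dropping delete_data_pointers() means the test must free each ColPartition by hand. When a vector is meant to own its elements, std::unique_ptr removes that manual step entirely; a sketch of both styles with a stand-in type (not the real ColPartition API):

#include <memory>
#include <vector>

struct Part {}; // stand-in element type for this sketch

int main() {
  // Owning raw pointers: every exit path needs the delete loop.
  std::vector<Part *> raw;
  raw.push_back(new Part);
  for (auto *part : raw) {
    delete part;
  }

  // unique_ptr elements are freed automatically when the vector dies.
  std::vector<std::unique_ptr<Part>> owned;
  owned.push_back(std::make_unique<Part>());
  return 0;
}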

@ -107,7 +107,7 @@ void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector<RowInfo
// Given n rows of reference ground truth, evaluate whether the n rows
// of PARA * pointers yield the same paragraph breakpoints.
void EvaluateParagraphDetection(const TextAndModel *correct, int n,
const GenericVector<PARA *> &detector_output) {
const std::vector<PARA *> &detector_output) {
int incorrect_breaks = 0;
int missed_breaks = 0;
int poorly_matched_models = 0;

@ -186,7 +186,7 @@ void EvaluateParagraphDetection(const TextAndModel *correct, int n,

void TestParagraphDetection(const TextAndModel *correct, int num_rows) {
std::vector<RowInfo> row_infos;
GenericVector<PARA *> row_owners;
std::vector<PARA *> row_owners;
PARA_LIST paragraphs;
std::vector<ParagraphModel *> models;

@ -312,7 +312,7 @@ TEST(ParagraphsTest, TestSingleFullPageContinuation) {
const TextAndModel *correct = kSingleFullPageContinuation;
int num_rows = countof(kSingleFullPageContinuation);
std::vector<RowInfo> row_infos;
GenericVector<PARA *> row_owners;
std::vector<PARA *> row_owners;
PARA_LIST paragraphs;
std::vector<ParagraphModel *> models;
models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10));

@ -9,7 +9,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "genericvector.h"
#include "kdpair.h"
#include "statistc.h"

@ -42,8 +41,8 @@ TEST_F(STATSTest, BasicStats) {

// Tests the top_n_modes function.
TEST_F(STATSTest, TopNModes) {
GenericVector<tesseract::KDPairInc<float, int> > modes;
int num_modes = stats_.top_n_modes(3, &modes);
std::vector<tesseract::KDPairInc<float, int> > modes;
int num_modes = stats_.top_n_modes(3, modes);
EXPECT_EQ(3, num_modes);
// Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14.
EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key());
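
The call site change from top_n_modes(3, &modes) to top_n_modes(3, modes) indicates that the STATS API now takes its output vector by reference instead of by pointer, a common companion change in this kind of migration. An illustrative before-and-after of the parameter shape (hypothetical helper names, not the real STATS signature):

#include <vector>

// Old shape: output through a pointer parameter, called as f(n, &out).
static void TopModesPtr(int n, std::vector<int> *out) {
  out->assign(n, 0);
}

// New shape: output through a reference, so the call site drops the '&'.
static void TopModesRef(int n, std::vector<int> &out) {
  out.assign(n, 0);
}

int main() {
  std::vector<int> modes;
  TopModesPtr(3, &modes);
  TopModesRef(3, modes);
  return 0;
}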

@ -39,27 +39,27 @@ public:

void InjectCellY(int y) {
cell_y_.push_back(y);
cell_y_.sort();
std::sort(cell_y_.begin(), cell_y_.end());
}
void InjectCellX(int x) {
cell_x_.push_back(x);
cell_x_.sort();
std::sort(cell_x_.begin(), cell_x_.end());
}

void ExpectCellX(int x_min, int second, int add, int almost_done, int x_max) {
ASSERT_EQ(0, (almost_done - second) % add);
EXPECT_EQ(3 + (almost_done - second) / add, cell_x_.size());
EXPECT_EQ(x_min, cell_x_.get(0));
EXPECT_EQ(x_max, cell_x_.get(cell_x_.size() - 1));
EXPECT_EQ(x_min, cell_x_.at(0));
EXPECT_EQ(x_max, cell_x_.at(cell_x_.size() - 1));
for (int i = 1; i < cell_x_.size() - 1; ++i) {
EXPECT_EQ(second + add * (i - 1), cell_x_.get(i));
EXPECT_EQ(second + add * (i - 1), cell_x_.at(i));
}
}

void ExpectSortedX() {
EXPECT_GT(cell_x_.size(), 0);
for (int i = 1; i < cell_x_.size(); ++i) {
EXPECT_LT(cell_x_.get(i - 1), cell_x_.get(i));
EXPECT_LT(cell_x_.at(i - 1), cell_x_.at(i));
}
}
};

@ -93,7 +93,7 @@ protected:
int len = compressed_.EncodeUnichar(u, &code);
// Check round-trip encoding.
int unichar_id;
GenericVector<UNICHAR_ID> normed_ids;
std::vector<UNICHAR_ID> normed_ids;
if (u == null_char_ || u == unicharset_.size()) {
unichar_id = null_char_;
} else {

@ -137,7 +137,7 @@ protected:
const std::vector<RecodedCharID> &times_seen) {
RecodedCharID extended = code;
int length = code.length();
const GenericVector<int> *final_codes = compressed_.GetFinalCodes(code);
const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);
if (final_codes != nullptr) {
for (int i = 0; i < final_codes->size(); ++i) {
int ending = (*final_codes)[i];

@ -147,7 +147,7 @@ protected:
EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
}
}
const GenericVector<int> *next_codes = compressed_.GetNextCodes(code);
const std::vector<int> *next_codes = compressed_.GetNextCodes(code);
if (next_codes != nullptr) {
for (int i = 0; i < next_codes->size(); ++i) {
int extension = (*next_codes)[i];