From bf42f8313d6258be3db481f8cea80995aaf117db Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 15 Mar 2021 12:58:24 +0100 Subject: [PATCH] Replace remaining GenericVector by std::vector for src/dict Signed-off-by: Stefan Weil --- src/ccstruct/params_training_featdef.h | 1 + src/dict/dawg.h | 15 +++++----- src/dict/dict.cpp | 40 ++++++++++++++------------ src/dict/dict.h | 4 +-- src/dict/stopper.h | 4 +-- src/dict/trie.cpp | 26 +++++++++-------- src/dict/trie.h | 18 ++++++------ src/wordrec/language_model.cpp | 2 -- src/wordrec/params_model.cpp | 10 ++++--- src/wordrec/params_model.h | 10 +++---- 10 files changed, 67 insertions(+), 63 deletions(-) diff --git a/src/ccstruct/params_training_featdef.h b/src/ccstruct/params_training_featdef.h index 631a5a68..18c9c726 100644 --- a/src/ccstruct/params_training_featdef.h +++ b/src/ccstruct/params_training_featdef.h @@ -19,6 +19,7 @@ #ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_ #define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_ +#include // for memset #include #include diff --git a/src/dict/dawg.h b/src/dict/dawg.h index 2f00885d..67cf0b1f 100644 --- a/src/dict/dawg.h +++ b/src/dict/dawg.h @@ -57,9 +57,9 @@ struct NodeChild { NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} }; -using NodeChildVector = GenericVector; -using SuccessorList = GenericVector; -using SuccessorListsVector = GenericVector; +using NodeChildVector = std::vector; +using SuccessorList = std::vector; +using SuccessorListsVector = std::vector; enum DawgType { DAWG_TYPE_PUNCTUATION, @@ -176,7 +176,7 @@ public: /// Fills vec with unichar ids that represent the character classes /// of the given unichar_id. virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, - GenericVector *vec) const { + std::vector *vec) const { (void)unichar_id; (void)unicharset; (void)vec; @@ -355,15 +355,16 @@ struct DawgPosition { bool back_to_punc = false; }; -class DawgPositionVector : public GenericVector { +class DawgPositionVector : public std::vector { public: /// Adds an entry for the given dawg_index with the given node to the vec. /// Returns false if the same entry already exists in the vector, /// true otherwise. inline bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg) { - for (int i = 0; i < size(); ++i) { - if (data_[i] == new_pos) + for (auto position : *this) { + if (position == new_pos) { return false; + } } push_back(new_pos); if (debug) { diff --git a/src/dict/dict.cpp b/src/dict/dict.cpp index 5ae7cc0f..122083f5 100644 --- a/src/dict/dict.cpp +++ b/src/dict/dict.cpp @@ -201,19 +201,19 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file); if (punc_dawg_) - dawgs_ += punc_dawg_; + dawgs_.push_back(punc_dawg_); } if (load_system_dawg) { Dawg *system_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file); if (system_dawg) - dawgs_ += system_dawg; + dawgs_.push_back(system_dawg); } if (load_number_dawg) { Dawg *number_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) - dawgs_ += number_dawg; + dawgs_.push_back(number_dawg); } if (load_bigram_dawg) { bigram_dawg_ = @@ -225,13 +225,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file); if (freq_dawg_) - dawgs_ += freq_dawg_; + dawgs_.push_back(freq_dawg_); } if (load_unambig_dawg) { unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file); if (unambig_dawg_) - dawgs_ += unambig_dawg_; + dawgs_.push_back(unambig_dawg_); } std::string name; @@ -249,7 +249,7 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } @@ -267,13 +267,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level); - dawgs_ += document_words_; + dawgs_.push_back(document_words_); // This dawg is temporary and should not be searched by letter_is_ok. pending_words_ = @@ -287,19 +287,19 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) { punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file); if (punc_dawg_) - dawgs_ += punc_dawg_; + dawgs_.push_back(punc_dawg_); } if (load_system_dawg) { Dawg *system_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file); if (system_dawg) - dawgs_ += system_dawg; + dawgs_.push_back(system_dawg); } if (load_number_dawg) { Dawg *number_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) - dawgs_ += number_dawg; + dawgs_.push_back(number_dawg); } // stolen from Dict::Load (but needs params_ from Tesseract @@ -319,7 +319,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } @@ -337,7 +337,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } } @@ -358,9 +358,9 @@ bool Dict::FinishLoad() { const Dawg *other = dawgs_[j]; if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) && kDawgSuccessors[dawg->type()][other->type()]) - *lst += j; + lst->push_back(j); } - successors_ += lst; + successors_.push_back(lst); } return true; } @@ -378,7 +378,9 @@ void Dict::End() { delete dawg_cache_; dawg_cache_ = nullptr; } - successors_.delete_data_pointers(); + for (auto successor : successors_) { + delete successor; + } dawgs_.clear(); successors_.clear(); document_words_ = nullptr; @@ -550,7 +552,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); // Try to find the edge corresponding to the exact unichar_id and to all the // edges corresponding to the character class of unichar_id. - GenericVector unichar_id_patterns; + std::vector unichar_id_patterns; unichar_id_patterns.push_back(unichar_id); dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns); for (int i = 0; i < unichar_id_patterns.size(); ++i) { @@ -605,12 +607,12 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern int dawg_ty = dawgs_[i]->type(); bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; if (dawg_ty == DAWG_TYPE_PUNCTUATION) { - *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false); + dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false)); if (dawg_debug_level >= 3) { tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); } } else if (!punc_dawg_available || !subsumed_by_punc) { - *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false); + dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false)); if (dawg_debug_level >= 3) { tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); } diff --git a/src/dict/dict.h b/src/dict/dict.h index 9a40afc6..37c14ce1 100644 --- a/src/dict/dict.h +++ b/src/dict/dict.h @@ -54,7 +54,7 @@ struct CHAR_FRAGMENT_INFO { float certainty; }; -using DawgVector = GenericVector; +using DawgVector = std::vector; // // Constants @@ -495,7 +495,7 @@ private: // matching. The first member of each list is taken as canonical. For // example, the first list contains hyphens and dashes with the first symbol // being the ASCII hyphen minus. - std::vector> equivalent_symbols_; + std::vector> equivalent_symbols_; // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs. DawgCache *dawg_cache_; bool dawg_cache_is_ours_; // we should delete our own dawg_cache_ diff --git a/src/dict/stopper.h b/src/dict/stopper.h index f675c0a6..23be742e 100644 --- a/src/dict/stopper.h +++ b/src/dict/stopper.h @@ -2,7 +2,6 @@ ** Filename: stopper.h ** Purpose: Stopping criteria for word classifier. ** Author: Dan Johnson - ** History: Wed May 1 09:42:57 1991, DSJ, Created. ** ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,7 +21,6 @@ #include "ratngs.h" #include -#include "genericvector.h" namespace tesseract { @@ -46,7 +44,7 @@ struct DANGERR_INFO { UNICHAR_ID leftmost; // in the replacement, what's the leftmost character? }; -using DANGERR = GenericVector; +using DANGERR = std::vector; } // namespace tesseract diff --git a/src/dict/trie.cpp b/src/dict/trie.cpp index e739c802..95917f41 100644 --- a/src/dict/trie.cpp +++ b/src/dict/trie.cpp @@ -24,7 +24,6 @@ #include "dawg.h" #include "dict.h" -#include "genericvector.h" #include "helpers.h" #include "kdpair.h" @@ -49,7 +48,9 @@ const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { // Reset the Trie to empty. void Trie::clear() { - nodes_.delete_data_pointers(); + for (auto node : nodes_) { + delete node; + } nodes_.clear(); root_back_freelist_.clear(); num_edges_ = 0; @@ -122,10 +123,11 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in EDGE_RECORD edge_rec; link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id); if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) { - EDGE_INDEX edge_index = root_back_freelist_.pop_back(); + EDGE_INDEX edge_index = root_back_freelist_.back(); + root_back_freelist_.pop_back(); (*vec)[edge_index] = edge_rec; } else if (search_index < vec->size()) { - vec->insert(edge_rec, search_index); + vec->insert(vec->begin() + search_index, edge_rec); } else { vec->push_back(edge_rec); } @@ -153,7 +155,7 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool m *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); } -bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector *repetitions) { +bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector *repetitions) { if (word.length() <= 0) return false; // can't add empty words if (repetitions != nullptr) @@ -330,7 +332,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) { } void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, - GenericVector *vec) const { + std::vector *vec) const { bool is_alpha = unicharset.get_isalpha(unichar_id); if (is_alpha) { vec->push_back(alpha_pattern_); @@ -388,7 +390,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset) // Parse the pattern and construct a unichar id vector. // Record the number of repetitions of each unichar in the parallel vector. WERD_CHOICE word(&unicharset); - GenericVector repetitions_vec; + std::vector repetitions_vec; const char *str_ptr = string; int step = unicharset.step(str_ptr); bool failed = false; @@ -462,12 +464,12 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo tprintf("\n"); } if (direction == FORWARD_EDGE) { - nodes_[node1]->forward_edges.remove(edge_index); + nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index); } else if (node1 == 0) { KillEdge(&nodes_[node1]->backward_edges[edge_index]); root_back_freelist_.push_back(edge_index); } else { - nodes_[node1]->backward_edges.remove(edge_index); + nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index); } --num_edges_; } @@ -476,7 +478,7 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo // 1 Avoid insertion sorting or bubble sorting the tail root node // (back links on node 0, a list of all the leaves.). The node is // huge, and sorting it with n^2 time is terrible. -// 2 Avoid using GenericVector::remove on the tail root node. +// 2 Avoid using vector::erase on the tail root node. // (a) During add of words to the trie, zero-out the unichars and // keep a freelist of spaces to re-use. // (b) During reduction, just zero-out the unichars of deleted back @@ -624,13 +626,13 @@ void Trie::sort_edges(EDGE_VECTOR *edges) { int num_edges = edges->size(); if (num_edges <= 1) return; - GenericVector> sort_vec; + std::vector> sort_vec; sort_vec.reserve(num_edges); for (int i = 0; i < num_edges; ++i) { sort_vec.push_back( KDPairInc(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i])); } - sort_vec.sort(); + std::sort(sort_vec.begin(), sort_vec.end()); for (int i = 0; i < num_edges; ++i) (*edges)[i] = sort_vec[i].data(); } diff --git a/src/dict/trie.h b/src/dict/trie.h index 7a76e008..f3c9850f 100644 --- a/src/dict/trie.h +++ b/src/dict/trie.h @@ -21,14 +21,12 @@ #include "dawg.h" -#include "genericvector.h" - namespace tesseract { class UNICHARSET; // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed -// max int32, we will need to change GenericVector to use int64 for size +// max int32, we will need to change vector to use int64 for size // and address indices. This does not seem to be needed immediately, // since currently the largest number of edges limit used by tesseract // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32. @@ -39,13 +37,13 @@ class UNICHARSET; // the 64 bit EDGE_RECORD. using EDGE_INDEX = int64_t; // index of an edge in a given node using NODE_MARKER = bool *; -using EDGE_VECTOR = GenericVector; +using EDGE_VECTOR = std::vector; struct TRIE_NODE_RECORD { EDGE_VECTOR forward_edges; EDGE_VECTOR backward_edges; }; -using TRIE_NODES = GenericVector; +using TRIE_NODES = std::vector; /** * Concrete class for Trie data structure that allows to store a list of @@ -88,7 +86,9 @@ public: initialized_patterns_ = false; } ~Trie() override { - nodes_.delete_data_pointers(); + for (auto node : nodes_) { + delete node; + } } // Reset the Trie to empty. @@ -230,7 +230,7 @@ public: // Fills in the given unichar id vector with the unichar ids that represent // the patterns of the character classes of the given unichar_id. void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, - GenericVector *vec) const override; + std::vector *vec) const override; // Returns the given EDGE_REF if the EDGE_RECORD that it points to has // a self loop and the given unichar_id matches the unichar_id stored in the @@ -256,7 +256,7 @@ public: // // Return true if add succeeded, false otherwise (e.g. when a word contained // an invalid unichar id or the trie was getting too large and was cleared). - bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector *repetitions); + bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector *repetitions); bool add_word_to_dawg(const WERD_CHOICE &word) { return add_word_to_dawg(word, nullptr); } @@ -395,7 +395,7 @@ protected: // Member variables TRIE_NODES nodes_; // vector of nodes in the Trie // Freelist of edges in the root backwards node that were previously zeroed. - GenericVector root_back_freelist_; + std::vector root_back_freelist_; uint64_t num_edges_; // sum of all edges (forward and backward) uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index diff --git a/src/wordrec/language_model.cpp b/src/wordrec/language_model.cpp index b7dd36c6..11a0abca 100644 --- a/src/wordrec/language_model.cpp +++ b/src/wordrec/language_model.cpp @@ -34,8 +34,6 @@ #include "unicharset.h" // for UNICHARSET #include "unicity_table.h" // for UnicityTable -template -class GenericVector; template class UnicityTable; diff --git a/src/wordrec/params_model.cpp b/src/wordrec/params_model.cpp index c32b37af..bc1c3aba 100644 --- a/src/wordrec/params_model.cpp +++ b/src/wordrec/params_model.cpp @@ -23,6 +23,8 @@ #include #include "bitvector.h" +#include "helpers.h" // for ClipToRange +#include "serialis.h" // for TFile #include "tprintf.h" namespace tesseract { @@ -103,8 +105,8 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) { present.Init(PTRAIN_NUM_FEATURE_TYPES); lang_ = lang; // Load weights for passes with adaption on. - GenericVector &weights = weights_vec_[pass_]; - weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0); + std::vector &weights = weights_vec_[pass_]; + weights.resize(PTRAIN_NUM_FEATURE_TYPES, 0.0f); while (fp->FGets(line, kMaxLineSize) != nullptr) { char *key = nullptr; @@ -129,13 +131,13 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) { } } lang_ = ""; - weights.truncate(0); + weights.clear(); } return complete; } bool ParamsModel::SaveToFile(const char *full_path) const { - const GenericVector &weights = weights_vec_[pass_]; + const std::vector &weights = weights_vec_[pass_]; if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) { tprintf("Refusing to save ParamsModel that has not been initialized.\n"); return false; diff --git a/src/wordrec/params_model.h b/src/wordrec/params_model.h index 24a80871..b679766d 100644 --- a/src/wordrec/params_model.h +++ b/src/wordrec/params_model.h @@ -19,7 +19,7 @@ #ifndef TESSERACT_WORDREC_PARAMS_MODEL_H_ #define TESSERACT_WORDREC_PARAMS_MODEL_H_ -#include "genericvector.h" // for GenericVector +#include // for TESS_API #include "params_training_featdef.h" // for PTRAIN_NUM_FEATURE_TYPES namespace tesseract { @@ -38,7 +38,7 @@ public: }; ParamsModel() : pass_(PTRAIN_PASS1) {} - ParamsModel(const char *lang, const GenericVector &weights) + ParamsModel(const char *lang, const std::vector &weights) : lang_(lang), pass_(PTRAIN_PASS1) { weights_vec_[pass_] = weights; } @@ -65,10 +65,10 @@ public: // Returns true on success. bool LoadFromFp(const char *lang, TFile *fp); - const GenericVector &weights() const { + const std::vector &weights() const { return weights_vec_[pass_]; } - const GenericVector &weights_for_pass(PassEnum pass) const { + const std::vector &weights_for_pass(PassEnum pass) const { return weights_vec_[pass]; } void SetPass(PassEnum pass) { @@ -84,7 +84,7 @@ private: PassEnum pass_; // Several sets of weights for various OCR passes (e.g. pass1 with adaption, // pass2 without adaption, etc). - GenericVector weights_vec_[PTRAIN_NUM_PASSES]; + std::vector weights_vec_[PTRAIN_NUM_PASSES]; }; } // namespace tesseract