Replace remaining GenericVector with std::vector for src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2021-03-15 12:58:24 +01:00
parent 17eee8648f
commit bf42f8313d
10 changed files with 67 additions and 63 deletions

View File

@ -19,6 +19,7 @@
#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
#include <cstring> // for memset
#include <string>
#include <vector>

View File

@ -57,9 +57,9 @@ struct NodeChild {
NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
};
using NodeChildVector = GenericVector<NodeChild>;
using SuccessorList = GenericVector<int>;
using SuccessorListsVector = GenericVector<SuccessorList *>;
using NodeChildVector = std::vector<NodeChild>;
using SuccessorList = std::vector<int>;
using SuccessorListsVector = std::vector<SuccessorList *>;
enum DawgType {
DAWG_TYPE_PUNCTUATION,
@ -176,7 +176,7 @@ public:
/// Fills vec with unichar ids that represent the character classes
/// of the given unichar_id.
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
(void)unichar_id;
(void)unicharset;
(void)vec;
@ -355,15 +355,16 @@ struct DawgPosition {
bool back_to_punc = false;
};
class DawgPositionVector : public GenericVector<DawgPosition> {
class DawgPositionVector : public std::vector<DawgPosition> {
public:
/// Adds an entry for the given dawg_index with the given node to the vec.
/// Returns false if the same entry already exists in the vector,
/// true otherwise.
inline bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg) {
for (int i = 0; i < size(); ++i) {
if (data_[i] == new_pos)
for (auto position : *this) {
if (position == new_pos) {
return false;
}
}
push_back(new_pos);
if (debug) {

View File

@ -201,19 +201,19 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}
if (load_bigram_dawg) {
bigram_dawg_ =
@ -225,13 +225,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
freq_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
if (freq_dawg_)
dawgs_ += freq_dawg_;
dawgs_.push_back(freq_dawg_);
}
if (load_unambig_dawg) {
unambig_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
if (unambig_dawg_)
dawgs_ += unambig_dawg_;
dawgs_.push_back(unambig_dawg_);
}
std::string name;
@ -249,7 +249,7 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
@ -267,13 +267,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
document_words_ =
new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
dawgs_ += document_words_;
dawgs_.push_back(document_words_);
// This dawg is temporary and should not be searched by letter_is_ok.
pending_words_ =
@ -287,19 +287,19 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}
// stolen from Dict::Load (but needs params_ from Tesseract
@ -319,7 +319,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
@ -337,7 +337,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
}
@ -358,9 +358,9 @@ bool Dict::FinishLoad() {
const Dawg *other = dawgs_[j];
if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
kDawgSuccessors[dawg->type()][other->type()])
*lst += j;
lst->push_back(j);
}
successors_ += lst;
successors_.push_back(lst);
}
return true;
}
@ -378,7 +378,9 @@ void Dict::End() {
delete dawg_cache_;
dawg_cache_ = nullptr;
}
successors_.delete_data_pointers();
for (auto successor : successors_) {
delete successor;
}
dawgs_.clear();
successors_.clear();
document_words_ = nullptr;
@ -550,7 +552,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
// Try to find the edge corresponding to the exact unichar_id and to all the
// edges corresponding to the character class of unichar_id.
GenericVector<UNICHAR_ID> unichar_id_patterns;
std::vector<UNICHAR_ID> unichar_id_patterns;
unichar_id_patterns.push_back(unichar_id);
dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
for (int i = 0; i < unichar_id_patterns.size(); ++i) {
@ -605,12 +607,12 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern
int dawg_ty = dawgs_[i]->type();
bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
*dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
if (dawg_debug_level >= 3) {
tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
}
} else if (!punc_dawg_available || !subsumed_by_punc) {
*dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
if (dawg_debug_level >= 3) {
tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
}

View File

@ -54,7 +54,7 @@ struct CHAR_FRAGMENT_INFO {
float certainty;
};
using DawgVector = GenericVector<Dawg *>;
using DawgVector = std::vector<Dawg *>;
//
// Constants
@ -495,7 +495,7 @@ private:
// matching. The first member of each list is taken as canonical. For
// example, the first list contains hyphens and dashes with the first symbol
// being the ASCII hyphen minus.
std::vector<GenericVector<UNICHAR_ID>> equivalent_symbols_;
std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
// Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
DawgCache *dawg_cache_;
bool dawg_cache_is_ours_; // we should delete our own dawg_cache_

View File

@ -2,7 +2,6 @@
** Filename: stopper.h
** Purpose: Stopping criteria for word classifier.
** Author: Dan Johnson
** History: Wed May 1 09:42:57 1991, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +21,6 @@
#include "ratngs.h"
#include <tesseract/unichar.h>
#include "genericvector.h"
namespace tesseract {
@ -46,7 +44,7 @@ struct DANGERR_INFO {
UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
};
using DANGERR = GenericVector<DANGERR_INFO>;
using DANGERR = std::vector<DANGERR_INFO>;
} // namespace tesseract

View File

@ -24,7 +24,6 @@
#include "dawg.h"
#include "dict.h"
#include "genericvector.h"
#include "helpers.h"
#include "kdpair.h"
@ -49,7 +48,9 @@ const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
// Reset the Trie to empty.
void Trie::clear() {
nodes_.delete_data_pointers();
for (auto node : nodes_) {
delete node;
}
nodes_.clear();
root_back_freelist_.clear();
num_edges_ = 0;
@ -122,10 +123,11 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in
EDGE_RECORD edge_rec;
link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
EDGE_INDEX edge_index = root_back_freelist_.pop_back();
EDGE_INDEX edge_index = root_back_freelist_.back();
root_back_freelist_.pop_back();
(*vec)[edge_index] = edge_rec;
} else if (search_index < vec->size()) {
vec->insert(edge_rec, search_index);
vec->insert(vec->begin() + search_index, edge_rec);
} else {
vec->push_back(edge_rec);
}
@ -153,7 +155,7 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool m
*edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions) {
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
if (word.length() <= 0)
return false; // can't add empty words
if (repetitions != nullptr)
@ -330,7 +332,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
}
void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
bool is_alpha = unicharset.get_isalpha(unichar_id);
if (is_alpha) {
vec->push_back(alpha_pattern_);
@ -388,7 +390,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset)
// Parse the pattern and construct a unichar id vector.
// Record the number of repetitions of each unichar in the parallel vector.
WERD_CHOICE word(&unicharset);
GenericVector<bool> repetitions_vec;
std::vector<bool> repetitions_vec;
const char *str_ptr = string;
int step = unicharset.step(str_ptr);
bool failed = false;
@ -462,12 +464,12 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
tprintf("\n");
}
if (direction == FORWARD_EDGE) {
nodes_[node1]->forward_edges.remove(edge_index);
nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
} else if (node1 == 0) {
KillEdge(&nodes_[node1]->backward_edges[edge_index]);
root_back_freelist_.push_back(edge_index);
} else {
nodes_[node1]->backward_edges.remove(edge_index);
nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
}
--num_edges_;
}
@ -476,7 +478,7 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
// 1 Avoid insertion sorting or bubble sorting the tail root node
// (back links on node 0, a list of all the leaves.). The node is
// huge, and sorting it with n^2 time is terrible.
// 2 Avoid using GenericVector::remove on the tail root node.
// 2 Avoid using vector::erase on the tail root node.
// (a) During add of words to the trie, zero-out the unichars and
// keep a freelist of spaces to re-use.
// (b) During reduction, just zero-out the unichars of deleted back
@ -624,13 +626,13 @@ void Trie::sort_edges(EDGE_VECTOR *edges) {
int num_edges = edges->size();
if (num_edges <= 1)
return;
GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
sort_vec.reserve(num_edges);
for (int i = 0; i < num_edges; ++i) {
sort_vec.push_back(
KDPairInc<UNICHAR_ID, EDGE_RECORD>(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
}
sort_vec.sort();
std::sort(sort_vec.begin(), sort_vec.end());
for (int i = 0; i < num_edges; ++i)
(*edges)[i] = sort_vec[i].data();
}

View File

@ -21,14 +21,12 @@
#include "dawg.h"
#include "genericvector.h"
namespace tesseract {
class UNICHARSET;
// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
// max int32, we will need to change GenericVector to use int64 for size
// max int32, we will need to switch from int to int64 for vector sizes
// and address indices. This does not seem to be needed immediately,
// since currently the largest number of edges limit used by tesseract
// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
@ -39,13 +37,13 @@ class UNICHARSET;
// the 64 bit EDGE_RECORD.
using EDGE_INDEX = int64_t; // index of an edge in a given node
using NODE_MARKER = bool *;
using EDGE_VECTOR = GenericVector<EDGE_RECORD>;
using EDGE_VECTOR = std::vector<EDGE_RECORD>;
struct TRIE_NODE_RECORD {
EDGE_VECTOR forward_edges;
EDGE_VECTOR backward_edges;
};
using TRIE_NODES = GenericVector<TRIE_NODE_RECORD *>;
using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;
/**
* Concrete class for Trie data structure that allows storing a list of
@ -88,7 +86,9 @@ public:
initialized_patterns_ = false;
}
~Trie() override {
nodes_.delete_data_pointers();
for (auto node : nodes_) {
delete node;
}
}
// Reset the Trie to empty.
@ -230,7 +230,7 @@ public:
// Fills in the given unichar id vector with the unichar ids that represent
// the patterns of the character classes of the given unichar_id.
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const override;
std::vector<UNICHAR_ID> *vec) const override;
// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
// a self loop and the given unichar_id matches the unichar_id stored in the
@ -256,7 +256,7 @@ public:
//
// Return true if add succeeded, false otherwise (e.g. when a word contained
// an invalid unichar id or the trie was getting too large and was cleared).
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions);
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
bool add_word_to_dawg(const WERD_CHOICE &word) {
return add_word_to_dawg(word, nullptr);
}
@ -395,7 +395,7 @@ protected:
// Member variables
TRIE_NODES nodes_; // vector of nodes in the Trie
// Freelist of edges in the root backwards node that were previously zeroed.
GenericVector<EDGE_INDEX> root_back_freelist_;
std::vector<EDGE_INDEX> root_back_freelist_;
uint64_t num_edges_; // sum of all edges (forward and backward)
uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction
uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index

View File

@ -34,8 +34,6 @@
#include "unicharset.h" // for UNICHARSET
#include "unicity_table.h" // for UnicityTable
template <typename T>
class GenericVector;
template <typename T>
class UnicityTable;

View File

@ -23,6 +23,8 @@
#include <cstdio>
#include "bitvector.h"
#include "helpers.h" // for ClipToRange
#include "serialis.h" // for TFile
#include "tprintf.h"
namespace tesseract {
@ -103,8 +105,8 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
present.Init(PTRAIN_NUM_FEATURE_TYPES);
lang_ = lang;
// Load weights for passes with adaption on.
GenericVector<float> &weights = weights_vec_[pass_];
weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0);
std::vector<float> &weights = weights_vec_[pass_];
weights.resize(PTRAIN_NUM_FEATURE_TYPES, 0.0f);
while (fp->FGets(line, kMaxLineSize) != nullptr) {
char *key = nullptr;
@ -129,13 +131,13 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
}
}
lang_ = "";
weights.truncate(0);
weights.clear();
}
return complete;
}
bool ParamsModel::SaveToFile(const char *full_path) const {
const GenericVector<float> &weights = weights_vec_[pass_];
const std::vector<float> &weights = weights_vec_[pass_];
if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) {
tprintf("Refusing to save ParamsModel that has not been initialized.\n");
return false;

View File

@ -19,7 +19,7 @@
#ifndef TESSERACT_WORDREC_PARAMS_MODEL_H_
#define TESSERACT_WORDREC_PARAMS_MODEL_H_
#include "genericvector.h" // for GenericVector
#include <tesseract/export.h> // for TESS_API
#include "params_training_featdef.h" // for PTRAIN_NUM_FEATURE_TYPES
namespace tesseract {
@ -38,7 +38,7 @@ public:
};
ParamsModel() : pass_(PTRAIN_PASS1) {}
ParamsModel(const char *lang, const GenericVector<float> &weights)
ParamsModel(const char *lang, const std::vector<float> &weights)
: lang_(lang), pass_(PTRAIN_PASS1) {
weights_vec_[pass_] = weights;
}
@ -65,10 +65,10 @@ public:
// Returns true on success.
bool LoadFromFp(const char *lang, TFile *fp);
const GenericVector<float> &weights() const {
const std::vector<float> &weights() const {
return weights_vec_[pass_];
}
const GenericVector<float> &weights_for_pass(PassEnum pass) const {
const std::vector<float> &weights_for_pass(PassEnum pass) const {
return weights_vec_[pass];
}
void SetPass(PassEnum pass) {
@ -84,7 +84,7 @@ private:
PassEnum pass_;
// Several sets of weights for various OCR passes (e.g. pass1 with adaption,
// pass2 without adaption, etc).
GenericVector<float> weights_vec_[PTRAIN_NUM_PASSES];
std::vector<float> weights_vec_[PTRAIN_NUM_PASSES];
};
} // namespace tesseract