Replace remaining GenericVector by std::vector for src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>
2024-11-30 23:49:05 +08:00 · 2021-03-15 12:58:24 +01:00 · 2021-03-15 12:58:24 +01:00 · bf42f8313d
commit bf42f8313d
parent 17eee8648f
10 changed files with 67 additions and 63 deletions
--- a/src/ccstruct/params_training_featdef.h
+++ b/src/ccstruct/params_training_featdef.h
@ -19,6 +19,7 @@
 #ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
 #define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_

+#include <cstring> // for memset
 #include <string>
 #include <vector>

--- a/src/dict/dawg.h
+++ b/src/dict/dawg.h
@ -57,9 +57,9 @@ struct NodeChild {
  NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
 };

-using NodeChildVector = GenericVector<NodeChild>;
-using SuccessorList = GenericVector<int>;
-using SuccessorListsVector = GenericVector<SuccessorList *>;
+using NodeChildVector = std::vector<NodeChild>;
+using SuccessorList = std::vector<int>;
+using SuccessorListsVector = std::vector<SuccessorList *>;

 enum DawgType {
  DAWG_TYPE_PUNCTUATION,
@ -176,7 +176,7 @@ public:
  /// Fills vec with unichar ids that represent the character classes
  /// of the given unichar_id.
  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
-                                      GenericVector<UNICHAR_ID> *vec) const {
+                                      std::vector<UNICHAR_ID> *vec) const {
    (void)unichar_id;
    (void)unicharset;
    (void)vec;
@ -355,15 +355,16 @@ struct DawgPosition {
  bool back_to_punc = false;
 };

-class DawgPositionVector : public GenericVector<DawgPosition> {
+class DawgPositionVector : public std::vector<DawgPosition> {
 public:
  /// Adds an entry for the given dawg_index with the given node to the vec.
  /// Returns false if the same entry already exists in the vector,
  /// true otherwise.
  inline bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg) {
-    for (int i = 0; i < size(); ++i) {
-      if (data_[i] == new_pos)
+    for (auto position : *this) {
+      if (position == new_pos) {
        return false;
+      }
    }
    push_back(new_pos);
    if (debug) {
--- a/src/dict/dict.cpp
+++ b/src/dict/dict.cpp
@ -201,19 +201,19 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
    punc_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
    if (punc_dawg_)
-      dawgs_ += punc_dawg_;
+      dawgs_.push_back(punc_dawg_);
  }
  if (load_system_dawg) {
    Dawg *system_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
    if (system_dawg)
-      dawgs_ += system_dawg;
+      dawgs_.push_back(system_dawg);
  }
  if (load_number_dawg) {
    Dawg *number_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
    if (number_dawg)
-      dawgs_ += number_dawg;
+      dawgs_.push_back(number_dawg);
  }
  if (load_bigram_dawg) {
    bigram_dawg_ =
@ -225,13 +225,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
    freq_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
    if (freq_dawg_)
-      dawgs_ += freq_dawg_;
+      dawgs_.push_back(freq_dawg_);
  }
  if (load_unambig_dawg) {
    unambig_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
    if (unambig_dawg_)
-      dawgs_ += unambig_dawg_;
+      dawgs_.push_back(unambig_dawg_);
  }

  std::string name;
@ -249,7 +249,7 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
-      dawgs_ += trie_ptr;
+      dawgs_.push_back(trie_ptr);
    }
  }

@ -267,13 +267,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
-      dawgs_ += trie_ptr;
+      dawgs_.push_back(trie_ptr);
    }
  }

  document_words_ =
      new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
-  dawgs_ += document_words_;
+  dawgs_.push_back(document_words_);

  // This dawg is temporary and should not be searched by letter_is_ok.
  pending_words_ =
@ -287,19 +287,19 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
    punc_dawg_ =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
    if (punc_dawg_)
-      dawgs_ += punc_dawg_;
+      dawgs_.push_back(punc_dawg_);
  }
  if (load_system_dawg) {
    Dawg *system_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
    if (system_dawg)
-      dawgs_ += system_dawg;
+      dawgs_.push_back(system_dawg);
  }
  if (load_number_dawg) {
    Dawg *number_dawg =
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
    if (number_dawg)
-      dawgs_ += number_dawg;
+      dawgs_.push_back(number_dawg);
  }

  // stolen from Dict::Load (but needs params_ from Tesseract
@ -319,7 +319,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
-      dawgs_ += trie_ptr;
+      dawgs_.push_back(trie_ptr);
    }
  }

@ -337,7 +337,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete trie_ptr;
    } else {
-      dawgs_ += trie_ptr;
+      dawgs_.push_back(trie_ptr);
    }
  }
 }
@ -358,9 +358,9 @@ bool Dict::FinishLoad() {
      const Dawg *other = dawgs_[j];
      if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
          kDawgSuccessors[dawg->type()][other->type()])
-        *lst += j;
+        lst->push_back(j);
    }
-    successors_ += lst;
+    successors_.push_back(lst);
  }
  return true;
 }
@ -378,7 +378,9 @@ void Dict::End() {
    delete dawg_cache_;
    dawg_cache_ = nullptr;
  }
-  successors_.delete_data_pointers();
+  for (auto successor : successors_) {
+    delete successor;
+  }
  dawgs_.clear();
  successors_.clear();
  document_words_ = nullptr;
@ -550,7 +552,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA
  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
  // Try to find the edge corresponding to the exact unichar_id and to all the
  // edges corresponding to the character class of unichar_id.
-  GenericVector<UNICHAR_ID> unichar_id_patterns;
+  std::vector<UNICHAR_ID> unichar_id_patterns;
  unichar_id_patterns.push_back(unichar_id);
  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
@ -605,12 +607,12 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern
      int dawg_ty = dawgs_[i]->type();
      bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
      if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
-        *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
+        dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
        if (dawg_debug_level >= 3) {
          tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
        }
      } else if (!punc_dawg_available || !subsumed_by_punc) {
-        *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
+        dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
        if (dawg_debug_level >= 3) {
          tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
        }
--- a/src/dict/dict.h
+++ b/src/dict/dict.h
@ -54,7 +54,7 @@ struct CHAR_FRAGMENT_INFO {
  float certainty;
 };

-using DawgVector = GenericVector<Dawg *>;
+using DawgVector = std::vector<Dawg *>;

 //
 // Constants
@ -495,7 +495,7 @@ private:
  // matching.  The first member of each list is taken as canonical.  For
  // example, the first list contains hyphens and dashes with the first symbol
  // being the ASCII hyphen minus.
-  std::vector<GenericVector<UNICHAR_ID>> equivalent_symbols_;
+  std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
  DawgCache *dawg_cache_;
  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
--- a/src/dict/stopper.h
+++ b/src/dict/stopper.h
@ -2,7 +2,6 @@
 ** Filename:    stopper.h
 ** Purpose:     Stopping criteria for word classifier.
 ** Author:      Dan Johnson
- ** History:     Wed May  1 09:42:57 1991, DSJ, Created.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
@ -22,7 +21,6 @@
 #include "ratngs.h"

 #include <tesseract/unichar.h>
-#include "genericvector.h"

 namespace tesseract {

@ -46,7 +44,7 @@ struct DANGERR_INFO {
  UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
 };

-using DANGERR = GenericVector<DANGERR_INFO>;
+using DANGERR = std::vector<DANGERR_INFO>;

 } // namespace tesseract

--- a/src/dict/trie.cpp
+++ b/src/dict/trie.cpp
@ -24,7 +24,6 @@

 #include "dawg.h"
 #include "dict.h"
-#include "genericvector.h"
 #include "helpers.h"
 #include "kdpair.h"

@ -49,7 +48,9 @@ const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {

 // Reset the Trie to empty.
 void Trie::clear() {
-  nodes_.delete_data_pointers();
+  for (auto node : nodes_) {
+    delete node;
+  }
  nodes_.clear();
  root_back_freelist_.clear();
  num_edges_ = 0;
@ -122,10 +123,11 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in
  EDGE_RECORD edge_rec;
  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
  if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
-    EDGE_INDEX edge_index = root_back_freelist_.pop_back();
+    EDGE_INDEX edge_index = root_back_freelist_.back();
+    root_back_freelist_.pop_back();
    (*vec)[edge_index] = edge_rec;
  } else if (search_index < vec->size()) {
-    vec->insert(edge_rec, search_index);
+    vec->insert(vec->begin() + search_index, edge_rec);
  } else {
    vec->push_back(edge_rec);
  }
@ -153,7 +155,7 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool m
  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
 }

-bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions) {
+bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
  if (word.length() <= 0)
    return false; // can't add empty words
  if (repetitions != nullptr)
@ -330,7 +332,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
 }

 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
-                                  GenericVector<UNICHAR_ID> *vec) const {
+                                  std::vector<UNICHAR_ID> *vec) const {
  bool is_alpha = unicharset.get_isalpha(unichar_id);
  if (is_alpha) {
    vec->push_back(alpha_pattern_);
@ -388,7 +390,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset)
    // Parse the pattern and construct a unichar id vector.
    // Record the number of repetitions of each unichar in the parallel vector.
    WERD_CHOICE word(&unicharset);
-    GenericVector<bool> repetitions_vec;
+    std::vector<bool> repetitions_vec;
    const char *str_ptr = string;
    int step = unicharset.step(str_ptr);
    bool failed = false;
@ -462,12 +464,12 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
    tprintf("\n");
  }
  if (direction == FORWARD_EDGE) {
-    nodes_[node1]->forward_edges.remove(edge_index);
+    nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
  } else if (node1 == 0) {
    KillEdge(&nodes_[node1]->backward_edges[edge_index]);
    root_back_freelist_.push_back(edge_index);
  } else {
-    nodes_[node1]->backward_edges.remove(edge_index);
+    nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
  }
  --num_edges_;
 }
@ -476,7 +478,7 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
 // 1 Avoid insertion sorting or bubble sorting the tail root node
 //   (back links on node 0, a list of all the leaves.). The node is
 //   huge, and sorting it with n^2 time is terrible.
-// 2 Avoid using GenericVector::remove on the tail root node.
+// 2 Avoid using std::vector::erase on the tail root node.
 //   (a) During add of words to the trie, zero-out the unichars and
 //       keep a freelist of spaces to re-use.
 //   (b) During reduction, just zero-out the unichars of deleted back
@ -624,13 +626,13 @@ void Trie::sort_edges(EDGE_VECTOR *edges) {
  int num_edges = edges->size();
  if (num_edges <= 1)
    return;
-  GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
+  std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
  sort_vec.reserve(num_edges);
  for (int i = 0; i < num_edges; ++i) {
    sort_vec.push_back(
        KDPairInc<UNICHAR_ID, EDGE_RECORD>(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
  }
-  sort_vec.sort();
+  std::sort(sort_vec.begin(), sort_vec.end());
  for (int i = 0; i < num_edges; ++i)
    (*edges)[i] = sort_vec[i].data();
 }
--- a/src/dict/trie.h
+++ b/src/dict/trie.h
@ -21,14 +21,12 @@

 #include "dawg.h"

-#include "genericvector.h"
-
 namespace tesseract {

 class UNICHARSET;

 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
-// max int32, we will need to change GenericVector to use int64 for size
+// max int32, we will need to make sure int64 is used for std::vector size
 // and address indices. This does not seem to be needed immediately,
 // since currently the largest number of edges limit used by tesseract
 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
@ -39,13 +37,13 @@ class UNICHARSET;
 // the 64 bit EDGE_RECORD.
 using EDGE_INDEX = int64_t; // index of an edge in a given node
 using NODE_MARKER = bool *;
-using EDGE_VECTOR = GenericVector<EDGE_RECORD>;
+using EDGE_VECTOR = std::vector<EDGE_RECORD>;

 struct TRIE_NODE_RECORD {
  EDGE_VECTOR forward_edges;
  EDGE_VECTOR backward_edges;
 };
-using TRIE_NODES = GenericVector<TRIE_NODE_RECORD *>;
+using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;

 /**
 * Concrete class for Trie data structure that allows to store a list of
@ -88,7 +86,9 @@ public:
    initialized_patterns_ = false;
  }
  ~Trie() override {
-    nodes_.delete_data_pointers();
+    for (auto node : nodes_) {
+      delete node;
+    }
  }

  // Reset the Trie to empty.
@ -230,7 +230,7 @@ public:
  // Fills in the given unichar id vector with the unichar ids that represent
  // the patterns of the character classes of the given unichar_id.
  void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
-                              GenericVector<UNICHAR_ID> *vec) const override;
+                              std::vector<UNICHAR_ID> *vec) const override;

  // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
  // a self loop and the given unichar_id matches the unichar_id stored in the
@ -256,7 +256,7 @@ public:
  //
  // Return true if add succeeded, false otherwise (e.g. when a word contained
  // an invalid unichar id or the trie was getting too large and was cleared).
-  bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions);
+  bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
  bool add_word_to_dawg(const WERD_CHOICE &word) {
    return add_word_to_dawg(word, nullptr);
  }
@ -395,7 +395,7 @@ protected:
  // Member variables
  TRIE_NODES nodes_; // vector of nodes in the Trie
  // Freelist of edges in the root backwards node that were previously zeroed.
-  GenericVector<EDGE_INDEX> root_back_freelist_;
+  std::vector<EDGE_INDEX> root_back_freelist_;
  uint64_t num_edges_;             // sum of all edges (forward and backward)
  uint64_t deref_direction_mask_;  // mask for EDGE_REF to extract direction
  uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index
--- a/src/wordrec/language_model.cpp
+++ b/src/wordrec/language_model.cpp
@ -34,8 +34,6 @@
 #include "unicharset.h"              // for UNICHARSET
 #include "unicity_table.h"           // for UnicityTable

-template <typename T>
-class GenericVector;
 template <typename T>
 class UnicityTable;

--- a/src/wordrec/params_model.cpp
+++ b/src/wordrec/params_model.cpp
@ -23,6 +23,8 @@
 #include <cstdio>

 #include "bitvector.h"
+#include "helpers.h"   // for ClipToRange
+#include "serialis.h"  // for TFile
 #include "tprintf.h"

 namespace tesseract {
@ -103,8 +105,8 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
  present.Init(PTRAIN_NUM_FEATURE_TYPES);
  lang_ = lang;
  // Load weights for passes with adaption on.
-  GenericVector<float> &weights = weights_vec_[pass_];
-  weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0);
+  std::vector<float> &weights = weights_vec_[pass_];
+  weights.assign(PTRAIN_NUM_FEATURE_TYPES, 0.0f); // unlike resize(), also zeroes entries left from an earlier load

  while (fp->FGets(line, kMaxLineSize) != nullptr) {
    char *key = nullptr;
@ -129,13 +131,13 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
      }
    }
    lang_ = "";
-    weights.truncate(0);
+    weights.clear();
  }
  return complete;
 }

 bool ParamsModel::SaveToFile(const char *full_path) const {
-  const GenericVector<float> &weights = weights_vec_[pass_];
+  const std::vector<float> &weights = weights_vec_[pass_];
  if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) {
    tprintf("Refusing to save ParamsModel that has not been initialized.\n");
    return false;
--- a/src/wordrec/params_model.h
+++ b/src/wordrec/params_model.h
@ -19,7 +19,7 @@
 #ifndef TESSERACT_WORDREC_PARAMS_MODEL_H_
 #define TESSERACT_WORDREC_PARAMS_MODEL_H_

-#include "genericvector.h"           // for GenericVector
+#include <tesseract/export.h>        // for TESS_API
 #include "params_training_featdef.h" // for PTRAIN_NUM_FEATURE_TYPES

 namespace tesseract {
@ -38,7 +38,7 @@ public:
  };

  ParamsModel() : pass_(PTRAIN_PASS1) {}
-  ParamsModel(const char *lang, const GenericVector<float> &weights)
+  ParamsModel(const char *lang, const std::vector<float> &weights)
      : lang_(lang), pass_(PTRAIN_PASS1) {
    weights_vec_[pass_] = weights;
  }
@ -65,10 +65,10 @@ public:
  // Returns true on success.
  bool LoadFromFp(const char *lang, TFile *fp);

-  const GenericVector<float> &weights() const {
+  const std::vector<float> &weights() const {
    return weights_vec_[pass_];
  }
-  const GenericVector<float> &weights_for_pass(PassEnum pass) const {
+  const std::vector<float> &weights_for_pass(PassEnum pass) const {
    return weights_vec_[pass];
  }
  void SetPass(PassEnum pass) {
@ -84,7 +84,7 @@ private:
  PassEnum pass_;
  // Several sets of weights for various OCR passes (e.g. pass1 with adaption,
  // pass2 without adaption, etc).
-  GenericVector<float> weights_vec_[PTRAIN_NUM_PASSES];
+  std::vector<float> weights_vec_[PTRAIN_NUM_PASSES];
 };

 } // namespace tesseract