From 2a3682a35e643cefb86eefa4c9a3deddc75295bd Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 15 Mar 2021 09:01:55 +0100 Subject: [PATCH 1/5] Replace remaining GenericVector by std::vector in src/lstm Signed-off-by: Stefan Weil --- src/ccutil/genericheap.h | 21 +++++---- src/ccutil/helpers.h | 6 +++ src/ccutil/unicharcompress.cpp | 11 +++-- src/ccutil/unicharcompress.h | 12 ++--- src/lstm/fullyconnected.cpp | 13 +++--- src/lstm/lstm.cpp | 12 ++--- src/lstm/networkio.cpp | 5 +-- src/lstm/networkio.h | 5 +-- src/lstm/networkscratch.h | 21 ++++----- src/lstm/parallel.cpp | 8 ++-- src/lstm/plumbing.cpp | 4 +- src/lstm/plumbing.h | 3 +- src/lstm/recodebeam.cpp | 75 ++++++++++++++++---------------- src/lstm/recodebeam.h | 14 +++--- src/lstm/tfnetwork.cpp | 4 +- src/lstm/weightmatrix.cpp | 4 +- src/wordrec/lm_pain_points.cpp | 7 +-- unittest/unicharcompress_test.cc | 6 +-- 18 files changed, 111 insertions(+), 120 deletions(-) diff --git a/src/ccutil/genericheap.h b/src/ccutil/genericheap.h index d8464418..e67b4e31 100644 --- a/src/ccutil/genericheap.h +++ b/src/ccutil/genericheap.h @@ -4,7 +4,6 @@ // File: genericheap.h // Description: Template heap class. // Author: Ray Smith, based on Dan Johnson's original code. -// Created: Wed Mar 14 08:13:00 PDT 2012 // // (C) Copyright 2012, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); @@ -38,7 +37,7 @@ namespace tesseract { // GenericHeap doesn't look inside it except for operator<. // // The heap is stored as a packed binary tree in an array hosted by a -// GenericVector, with the invariant that the children of each node are +// vector, with the invariant that the children of each node are // both NOT Pair::operator< the parent node. 
KDPairInc defines Pair::operator< // to use Key::operator< to generate a MIN heap and KDPairDec defines // Pair::operator< to use Key::operator> to generate a MAX heap by reversing @@ -59,7 +58,7 @@ template class GenericHeap { public: GenericHeap() = default; - // The initial size is only a GenericVector::reserve. It is not enforced as + // The initial size is only a vector::reserve. It is not enforced as // the size limit of the heap. Caller must implement their own enforcement. explicit GenericHeap(int initial_size) { heap_.reserve(initial_size); @@ -77,12 +76,12 @@ public: } void clear() { // Clear truncates to 0 to keep the number reserved in tact. - heap_.truncate(0); + heap_.clear(); } // Provides access to the underlying vector. // Caution! any changes that modify the keys will invalidate the heap! - GenericVector *heap() { - return &heap_; + std::vector &heap() { + return heap_; } // Provides read-only access to an element of the underlying vector. const Pair &get(int index) const { @@ -128,11 +127,11 @@ public: // Sift the hole at the start of the heap_ downwards to match the last // element. Pair hole_pair = heap_[new_size]; - heap_.truncate(new_size); + heap_.resize(new_size); int hole_index = SiftDown(0, hole_pair); heap_[hole_index] = hole_pair; } else { - heap_.truncate(new_size); + heap_.resize(new_size); } return true; } @@ -154,7 +153,7 @@ public: int hole_index = SiftUp(worst_index, hole_pair); heap_[hole_index] = hole_pair; } - heap_.truncate(heap_size); + heap_.resize(heap_size); return true; } @@ -179,7 +178,7 @@ public: // The pointed-to Pair has changed its key value, so the location of pair // is reshuffled to maintain the heap invariant. // Must be a valid pointer to an element of the heap_! - // Caution! Since GenericHeap is based on GenericVector, reallocs may occur + // Caution! Since GenericHeap is based on vector, reallocs may occur // whenever the vector is extended and elements may get shuffled by any // Push or Pop operation. 
Therefore use this function only if Data in Pair is // of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as @@ -235,7 +234,7 @@ private: } private: - GenericVector heap_; + std::vector heap_; }; } // namespace tesseract diff --git a/src/ccutil/helpers.h b/src/ccutil/helpers.h index a11af4b7..7988279f 100644 --- a/src/ccutil/helpers.h +++ b/src/ccutil/helpers.h @@ -24,6 +24,7 @@ #include // std::isfinite #include #include +#include // for std::find #include #include #include @@ -31,6 +32,11 @@ namespace tesseract { +template +inline bool contains(const std::vector &data, const T &value) { + return std::find(data.begin(), data.end(), value) != data.end(); +} + inline const std::vector split(const std::string &s, char c) { std::string buff; std::vector v; diff --git a/src/ccutil/unicharcompress.cpp b/src/ccutil/unicharcompress.cpp index 9bb5b71f..9c4a9509 100644 --- a/src/ccutil/unicharcompress.cpp +++ b/src/ccutil/unicharcompress.cpp @@ -245,8 +245,7 @@ void UnicharCompress::DefragmentCodeValues(int encoded_null) { // all codes are used. Likewise with the Han encoding, it is possible that not // all numbers of strokes are used. 
ComputeCodeRange(); - GenericVector offsets; - offsets.init_to_size(code_range_, 0); + std::vector offsets(code_range_); // Find which codes are used for (int c = 0; c < encoder_.size(); ++c) { const RecodedCharID &code = encoder_[c]; @@ -390,26 +389,26 @@ void UnicharCompress::SetupDecoder() { prefix.Truncate(len); auto final_it = final_codes_.find(prefix); if (final_it == final_codes_.end()) { - auto *code_list = new GenericVector; + auto *code_list = new std::vector; code_list->push_back(code(len)); final_codes_[prefix] = code_list; while (--len >= 0) { prefix.Truncate(len); auto next_it = next_codes_.find(prefix); if (next_it == next_codes_.end()) { - auto *code_list = new GenericVector; + auto *code_list = new std::vector; code_list->push_back(code(len)); next_codes_[prefix] = code_list; } else { // We still have to search the list as we may get here via multiple // lengths of code. - if (!next_it->second->contains(code(len))) + if (!contains(*next_it->second, code(len))) next_it->second->push_back(code(len)); break; // This prefix has been processed. } } } else { - if (!final_it->second->contains(code(len))) + if (!contains(*final_it->second, code(len))) final_it->second->push_back(code(len)); } } diff --git a/src/ccutil/unicharcompress.h b/src/ccutil/unicharcompress.h index 2db11936..94e7f093 100644 --- a/src/ccutil/unicharcompress.h +++ b/src/ccutil/unicharcompress.h @@ -22,7 +22,7 @@ #define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_ #include -#include "genericvector.h" // GenericVector +#include #include "serialis.h" #include "unicharset.h" @@ -178,13 +178,13 @@ public: } // Returns a list of valid non-final next codes for a given prefix code, // which may be empty. - const GenericVector *GetNextCodes(const RecodedCharID &code) const { + const std::vector *GetNextCodes(const RecodedCharID &code) const { auto it = next_codes_.find(code); return it == next_codes_.end() ? 
nullptr : it->second; } // Returns a list of valid final codes for a given prefix code, which may // be empty. - const GenericVector *GetFinalCodes(const RecodedCharID &code) const { + const std::vector *GetFinalCodes(const RecodedCharID &code) const { auto it = final_codes_.find(code); return it == final_codes_.end() ? nullptr : it->second; } @@ -225,14 +225,14 @@ private: // Decoder converts the output of encoder back to a unichar-id. std::unordered_map decoder_; // True if the index is a valid single or start code. - GenericVector is_valid_start_; + std::vector is_valid_start_; // Maps a prefix code to a list of valid next codes. // The map owns the vectors. - std::unordered_map *, RecodedCharID::RecodedCharIDHash> + std::unordered_map *, RecodedCharID::RecodedCharIDHash> next_codes_; // Maps a prefix code to a list of valid final codes. // The map owns the vectors. - std::unordered_map *, RecodedCharID::RecodedCharIDHash> + std::unordered_map *, RecodedCharID::RecodedCharIDHash> final_codes_; // Max of any value in encoder_ + 1. 
int code_range_; diff --git a/src/lstm/fullyconnected.cpp b/src/lstm/fullyconnected.cpp index fbef454b..f52fc87d 100644 --- a/src/lstm/fullyconnected.cpp +++ b/src/lstm/fullyconnected.cpp @@ -129,10 +129,8 @@ void FullyConnected::Forward(bool debug, const NetworkIO &input, else output->Resize(input, no_); SetupForward(input, input_transpose); - GenericVector temp_lines; - temp_lines.init_to_size(kNumThreads, NetworkScratch::FloatVec()); - GenericVector curr_input; - curr_input.init_to_size(kNumThreads, NetworkScratch::FloatVec()); + std::vector temp_lines(kNumThreads); + std::vector curr_input(kNumThreads); int ro = no_; if (IntSimdMatrix::intSimdMatrix) ro = IntSimdMatrix::intSimdMatrix->RoundOutputs(ro); @@ -233,13 +231,12 @@ bool FullyConnected::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkSc DisplayBackward(fwd_deltas); #endif back_deltas->Resize(fwd_deltas, ni_); - GenericVector errors; - errors.init_to_size(kNumThreads, NetworkScratch::FloatVec()); + std::vector errors(kNumThreads); for (int i = 0; i < kNumThreads; ++i) errors[i].Init(no_, scratch); - GenericVector temp_backprops; + std::vector temp_backprops; if (needs_to_backprop_) { - temp_backprops.init_to_size(kNumThreads, NetworkScratch::FloatVec()); + temp_backprops.resize(kNumThreads); for (int i = 0; i < kNumThreads; ++i) temp_backprops[i].Init(ni_, scratch); } diff --git a/src/lstm/lstm.cpp b/src/lstm/lstm.cpp index a8ec3f24..8633eb3f 100644 --- a/src/lstm/lstm.cpp +++ b/src/lstm/lstm.cpp @@ -297,10 +297,10 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in // for the other dimension, used only when working in true 2D mode. The width // is enough to hold an entire strip of the major direction. int buf_width = Is2D() ? 
input_map_.Size(FD_WIDTH) : 1; - GenericVector states, outputs; + std::vector states, outputs; if (Is2D()) { - states.init_to_size(buf_width, NetworkScratch::FloatVec()); - outputs.init_to_size(buf_width, NetworkScratch::FloatVec()); + states.resize(buf_width); + outputs.resize(buf_width); for (int i = 0; i < buf_width; ++i) { states[i].Init(ns_, scratch); ZeroVector(ns_, states[i]); @@ -494,10 +494,10 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr // Rotating buffers of width buf_width allow storage of the recurrent time- // steps used only for true 2-D. Stores one full strip of the major direction. int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1; - GenericVector stateerr, sourceerr; + std::vector stateerr, sourceerr; if (Is2D()) { - stateerr.init_to_size(buf_width, NetworkScratch::FloatVec()); - sourceerr.init_to_size(buf_width, NetworkScratch::FloatVec()); + stateerr.resize(buf_width); + sourceerr.resize(buf_width); for (int t = 0; t < buf_width; ++t) { stateerr[t].Init(ns_, scratch); sourceerr[t].Init(na_, scratch); diff --git a/src/lstm/networkio.cpp b/src/lstm/networkio.cpp index aab6d64d..20c3a56b 100644 --- a/src/lstm/networkio.cpp +++ b/src/lstm/networkio.cpp @@ -2,7 +2,6 @@ // File: networkio.cpp // Description: Network input/output data, allowing float/int implementations. // Author: Ray Smith -// Created: Thu Jun 19 13:01:31 PST 2014 // // (C) Copyright 2014, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); @@ -507,7 +506,7 @@ int NetworkIO::BestLabel(int t, int not_this, int not_that, float *score) const // Returns the best start position out of [start, end) (into which all labels // must fit) to obtain the highest cumulative score for the given labels. 
-int NetworkIO::PositionOfBestMatch(const GenericVector &labels, int start, int end) const { +int NetworkIO::PositionOfBestMatch(const std::vector &labels, int start, int end) const { int length = labels.size(); int last_start = end - length; int best_start = -1; @@ -524,7 +523,7 @@ int NetworkIO::PositionOfBestMatch(const GenericVector &labels, int start, // Returns the cumulative score of the given labels starting at start, and // using one label per time-step. -double NetworkIO::ScoreOfLabels(const GenericVector &labels, int start) const { +double NetworkIO::ScoreOfLabels(const std::vector &labels, int start) const { int length = labels.size(); double score = 0.0; for (int i = 0; i < length; ++i) { diff --git a/src/lstm/networkio.h b/src/lstm/networkio.h index 40339d4a..b996e05f 100644 --- a/src/lstm/networkio.h +++ b/src/lstm/networkio.h @@ -23,7 +23,6 @@ #include #include -#include "genericvector.h" #include "helpers.h" #include "static_shape.h" #include "stridemap.h" @@ -169,10 +168,10 @@ public: int BestLabel(int t, int not_this, int not_that, float *score) const; // Returns the best start position out of range (into which both start and end // must fit) to obtain the highest cumulative score for the given labels. - int PositionOfBestMatch(const GenericVector &labels, int start, int end) const; + int PositionOfBestMatch(const std::vector &labels, int start, int end) const; // Returns the cumulative score of the given labels starting at start, and // using one label per time-step. - double ScoreOfLabels(const GenericVector &labels, int start) const; + double ScoreOfLabels(const std::vector &labels, int start) const; // Helper function sets all the outputs for a single timestep, such that // label has value ok_score, and the other labels share 1 - ok_score. // Assumes float mode. 
diff --git a/src/lstm/networkscratch.h b/src/lstm/networkscratch.h index efb31f8d..1e3c5743 100644 --- a/src/lstm/networkscratch.h +++ b/src/lstm/networkscratch.h @@ -20,14 +20,13 @@ #define TESSERACT_LSTM_NETWORKSCRATCH_H_ #include -#include "genericvector.h" #include "matrix.h" #include "networkio.h" namespace tesseract { // Generic scratch space for network layers. Provides NetworkIO that can store -// a complete set (over time) of intermediates, and GenericVector +// a complete set (over time) of intermediates, and vector // scratch space that auto-frees after use. The aim here is to provide a set // of temporary buffers to network layers that can be reused between layers // and don't have to be reallocated on each call. @@ -125,7 +124,7 @@ public: }; // class IO. // Class that acts like a fixed array of float, yet actually uses space - // from a GenericVector in the source NetworkScratch, and knows how + // from a vector in the source NetworkScratch, and knows how // to unstack the borrowed vector on destruction. class FloatVec { public: @@ -145,12 +144,8 @@ public: scratch_space_->vec_stack_.Return(vec_); scratch_space_ = scratch; vec_ = scratch_space_->vec_stack_.Borrow(); - // Abuse vec_ here; first resize to 'reserve', which is larger - // than 'size' (i.e. it's size rounded up) then resize down again - // to the desired size. This assumes that the implementation does - // not shrink the storage on a resize. - vec_->resize_no_init(reserve); - vec_->resize_no_init(size); + vec_->reserve(reserve); + vec_->resize(size); data_ = &(*vec_)[0]; } @@ -169,7 +164,7 @@ public: private: // Vector borrowed from the scratch space. Use Return to free it. - GenericVector *vec_; + std::vector *vec_; // Short-cut pointer to the underlying array. double *data_; // The source scratch_space_. 
Borrowed pointer, used to free the @@ -251,7 +246,7 @@ public: private: PointerVector stack_; - GenericVector flags_; + std::vector flags_; int stack_top_; std::mutex mutex_; }; // class Stack. @@ -259,11 +254,11 @@ public: private: // If true, the network weights are int8_t, if false, float. bool int_mode_; - // Stacks of NetworkIO and GenericVector. Once allocated, they are not + // Stacks of NetworkIO and vector. Once allocated, they are not // deleted until the NetworkScratch is deleted. Stack int_stack_; Stack float_stack_; - Stack> vec_stack_; + Stack> vec_stack_; Stack array_stack_; }; diff --git a/src/lstm/parallel.cpp b/src/lstm/parallel.cpp index 811bc3bb..f2c889e9 100644 --- a/src/lstm/parallel.cpp +++ b/src/lstm/parallel.cpp @@ -61,8 +61,7 @@ void Parallel::Forward(bool debug, const NetworkIO &input, const TransposedArray int stack_size = stack_.size(); if (type_ == NT_PAR_2D_LSTM) { // Special case, run parallel in parallel. - GenericVector results; - results.init_to_size(stack_size, NetworkScratch::IO()); + std::vector results(stack_size); for (int i = 0; i < stack_size; ++i) { results[i].Resize(input, stack_[i]->NumOutputs(), scratch); } @@ -124,9 +123,8 @@ bool Parallel::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch int stack_size = stack_.size(); if (type_ == NT_PAR_2D_LSTM) { // Special case, run parallel in parallel. - GenericVector in_deltas, out_deltas; - in_deltas.init_to_size(stack_size, NetworkScratch::IO()); - out_deltas.init_to_size(stack_size, NetworkScratch::IO()); + std::vector in_deltas(stack_size); + std::vector out_deltas(stack_size); // Split the forward deltas for each stack element. 
int feature_offset = 0; for (int i = 0; i < stack_.size(); ++i) { diff --git a/src/lstm/plumbing.cpp b/src/lstm/plumbing.cpp index b5f2c807..c0b2501a 100644 --- a/src/lstm/plumbing.cpp +++ b/src/lstm/plumbing.cpp @@ -190,7 +190,7 @@ bool Plumbing::Serialize(TFile *fp) const { for (uint32_t i = 0; i < size; ++i) if (!stack_[i]->Serialize(fp)) return false; - if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !learning_rates_.Serialize(fp)) { + if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->Serialize(learning_rates_)) { return false; } return true; @@ -209,7 +209,7 @@ bool Plumbing::DeSerialize(TFile *fp) { return false; AddToStack(network); } - if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !learning_rates_.DeSerialize(fp)) { + if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && !fp->DeSerialize(learning_rates_)) { return false; } return true; diff --git a/src/lstm/plumbing.h b/src/lstm/plumbing.h index 1efabc34..a7cea1b8 100644 --- a/src/lstm/plumbing.h +++ b/src/lstm/plumbing.h @@ -19,7 +19,6 @@ #ifndef TESSERACT_LSTM_PLUMBING_H_ #define TESSERACT_LSTM_PLUMBING_H_ -#include "genericvector.h" #include "matrix.h" #include "network.h" @@ -139,7 +138,7 @@ protected: PointerVector stack_; // Layer-specific learning rate iff network_flags_ & NF_LAYER_SPECIFIC_LR. // One element for each element of stack_. - GenericVector learning_rates_; + std::vector learning_rates_; }; } // namespace tesseract. diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 18d595e9..ad128b0f 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -23,7 +23,7 @@ #include "pageres.h" #include "unicharcompress.h" -#include +#include // for std::reverse #include #include #include @@ -181,7 +181,7 @@ void RecodeBeamSearch::ExtractBestPathAsLabels(std::vector *labels, std::vector *xcoords) const { labels->clear(); xcoords->clear(); - GenericVector best_nodes; + std::vector best_nodes; ExtractBestPaths(&best_nodes, nullptr); // Now just run CTC on the best nodes. 
int t = 0; @@ -205,7 +205,7 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET std::vector *certs, std::vector *ratings, std::vector *xcoords) const { - GenericVector best_nodes; + std::vector best_nodes; ExtractBestPaths(&best_nodes, nullptr); ExtractPathAsUnicharIds(best_nodes, unichar_ids, certs, ratings, xcoords); if (debug) { @@ -224,8 +224,8 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX &line_box, float scale_ std::vector certs; std::vector ratings; std::vector xcoords; - GenericVector best_nodes; - GenericVector second_nodes; + std::vector best_nodes; + std::vector second_nodes; character_boundaries_.clear(); ExtractBestPaths(&best_nodes, &second_nodes); if (debug) { @@ -306,10 +306,10 @@ void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET * } // fill the topology with depths first for (int step = beam->size() - 1; step >= 0; --step) { - GenericVector *heaps = beam->get(step)->beams_->heap(); - for (int node = 0; node < heaps->size(); ++node) { + std::vector &heaps = beam->get(step)->beams_->heap(); + for (auto &node : heaps) { int backtracker = 0; - const RecodeNode *curr = &heaps->get(node).data(); + const RecodeNode *curr = &node.data(); while (curr != nullptr && !visited.count(curr)) { visited.insert(curr); topology[step - backtracker].push_back(curr); @@ -371,7 +371,6 @@ void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET * } void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) { - GenericVector *heaps = nullptr; PointerVector *currentBeam = nullptr; if (character_boundaries_.size() < 2) return; @@ -389,14 +388,15 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) { std::vector ratings; std::vector xcoords; int backpath = character_boundaries_[j] - character_boundaries_[j - 1]; - heaps = currentBeam->get(character_boundaries_[j] - 1)->beams_->heap(); - GenericVector best_nodes; + std::vector &heaps = +
currentBeam->get(character_boundaries_[j] - 1)->beams_->heap(); + std::vector best_nodes; std::vector best; // Scan the segmented node chain for valid unichar ids. - for (int i = 0; i < heaps->size(); ++i) { + for (auto &entry : heaps) { bool validChar = false; int backcounter = 0; - const RecodeNode *node = &heaps->get(i).data(); + const RecodeNode *node = &entry.data(); while (node != nullptr && backcounter < backpath) { if (node->code != null_char_ && node->unichar_id != INVALID_UNICHAR_ID) { validChar = true; @@ -406,7 +406,7 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) { ++backcounter; } if (validChar) - best.push_back(&heaps->get(i).data()); + best.push_back(&entry.data()); } // find the best rated segmented node chain and extract the unichar id. if (!best.empty()) { @@ -488,8 +488,7 @@ void RecodeBeamSearch::DebugBeams(const UNICHARSET &unicharset) const { // Generates debug output of the content of a single beam position. void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHeap &heap) const { - GenericVector unichar_bests; - unichar_bests.init_to_size(unicharset.size(), nullptr); + std::vector unichar_bests(unicharset.size()); const RecodeNode *null_best = nullptr; int heap_size = heap.size(); for (int i = 0; i < heap_size; ++i) { @@ -518,7 +517,7 @@ void RecodeBeamSearch::DebugBeamPos(const UNICHARSET &unicharset, const RecodeHe // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping // duplicates, nulls and intermediate parts. 
/* static */ -void RecodeBeamSearch::ExtractPathAsUnicharIds(const GenericVector &best_nodes, +void RecodeBeamSearch::ExtractPathAsUnicharIds(const std::vector &best_nodes, std::vector *unichar_ids, std::vector *certs, std::vector *ratings, @@ -699,14 +698,14 @@ void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio if (debug) { int beam_index = BeamIndex(true, NC_ANYTHING, 0); for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - GenericVector path; + std::vector path; ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); tprintf("Step %d: Dawg beam %d:\n", t, i); DebugPath(charset, path); } beam_index = BeamIndex(false, NC_ANYTHING, 0); for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - GenericVector path; + std::vector path; ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); tprintf("Step %d: Non-Dawg beam %d:\n", t, i); DebugPath(charset, path); @@ -765,14 +764,14 @@ void RecodeBeamSearch::DecodeSecondaryStep(const float *outputs, int t, double d if (debug) { int beam_index = BeamIndex(true, NC_ANYTHING, 0); for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - GenericVector path; + std::vector path; ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); tprintf("Step %d: Dawg beam %d:\n", t, i); DebugPath(charset, path); } beam_index = BeamIndex(false, NC_ANYTHING, 0); for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { - GenericVector path; + std::vector path; ExtractPath(&prev->beams_[beam_index].get(i).data(), &path); tprintf("Step %d: Non-Dawg beam %d:\n", t, i); DebugPath(charset, path); @@ -858,7 +857,7 @@ void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const dict_ratio, use_dawgs, NC_ANYTHING, prev, step); } } - const GenericVector *final_codes = recoder_.GetFinalCodes(prefix); + const std::vector *final_codes = recoder_.GetFinalCodes(prefix); if (final_codes != nullptr) { for (int i = 0; i < final_codes->size(); ++i) { int 
code = (*final_codes)[i]; @@ -892,7 +891,7 @@ void RecodeBeamSearch::ContinueContext(const RecodeNode *prev, int index, const } } } - const GenericVector *next_codes = recoder_.GetNextCodes(prefix); + const std::vector *next_codes = recoder_.GetNextCodes(prefix); if (next_codes != nullptr) { for (int i = 0; i < next_codes->size(); ++i) { int code = (*next_codes)[i]; @@ -1121,17 +1120,17 @@ bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *hea // TODO(rays) consider hash map instead of linear search. // It might not be faster because the hash map would have to be updated // every time a heap reshuffle happens, and that would be a lot of overhead. - GenericVector *nodes = heap->heap(); - for (int i = 0; i < nodes->size(); ++i) { - RecodeNode &node = (*nodes)[i].data(); + std::vector &nodes = heap->heap(); + for (int i = 0; i < nodes.size(); ++i) { + RecodeNode &node = nodes[i].data(); if (node.code == new_node->code && node.code_hash == new_node->code_hash && node.permuter == new_node->permuter && node.start_of_dawg == new_node->start_of_dawg) { if (new_node->score > node.score) { // The new one is better. Update the entire node in the heap and // reshuffle. node = *new_node; - (*nodes)[i].key() = node.score; - heap->Reshuffle(&(*nodes)[i]); + nodes[i].key() = node.score; + heap->Reshuffle(&nodes[i]); } return true; } @@ -1156,8 +1155,8 @@ uint64_t RecodeBeamSearch::ComputeCodeHash(int code, bool dup, const RecodeNode // during Decode. On return the best_nodes vector essentially contains the set // of code, score pairs that make the optimal path with the constraint that // the recoder can decode the code sequence back to a sequence of unichar-ids. -void RecodeBeamSearch::ExtractBestPaths(GenericVector *best_nodes, - GenericVector *second_nodes) const { +void RecodeBeamSearch::ExtractBestPaths(std::vector *best_nodes, + std::vector *second_nodes) const { // Scan both beams to extract the best and second best paths. 
const RecodeNode *best_node = nullptr; const RecodeNode *second_best_node = nullptr; @@ -1201,30 +1200,30 @@ void RecodeBeamSearch::ExtractBestPaths(GenericVector *best_ // Helper backtracks through the lattice from the given node, storing the // path and reversing it. void RecodeBeamSearch::ExtractPath(const RecodeNode *node, - GenericVector *path) const { - path->truncate(0); + std::vector *path) const { + path->clear(); while (node != nullptr) { path->push_back(node); node = node->prev; } - path->reverse(); + std::reverse(path->begin(), path->end()); } -void RecodeBeamSearch::ExtractPath(const RecodeNode *node, GenericVector *path, +void RecodeBeamSearch::ExtractPath(const RecodeNode *node, std::vector *path, int limiter) const { int pathcounter = 0; - path->truncate(0); + path->clear(); while (node != nullptr && pathcounter < limiter) { path->push_back(node); node = node->prev; ++pathcounter; } - path->reverse(); + std::reverse(path->begin(), path->end()); } // Helper prints debug information on the given lattice path. void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset, - const GenericVector &path) const { + const std::vector &path) const { for (int c = 0; c < path.size(); ++c) { const RecodeNode &node = *path[c]; tprintf("%d ", c); @@ -1234,7 +1233,7 @@ void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset, // Helper prints debug information on the given unichar path. void RecodeBeamSearch::DebugUnicharPath(const UNICHARSET *unicharset, - const GenericVector &path, + const std::vector &path, const std::vector &unichar_ids, const std::vector &certs, const std::vector &ratings, diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index cd5619ff..21394eb8 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -301,7 +301,7 @@ private: // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping // duplicates, nulls and intermediate parts. 
- static void ExtractPathAsUnicharIds(const GenericVector &best_nodes, + static void ExtractPathAsUnicharIds(const std::vector &best_nodes, std::vector *unichar_ids, std::vector *certs, std::vector *ratings, std::vector *xcoords, std::vector *character_boundaries = nullptr); @@ -380,17 +380,17 @@ private: // during Decode. On return the best_nodes vector essentially contains the set // of code, score pairs that make the optimal path with the constraint that // the recoder can decode the code sequence back to a sequence of unichar-ids. - void ExtractBestPaths(GenericVector *best_nodes, - GenericVector *second_nodes) const; + void ExtractBestPaths(std::vector *best_nodes, + std::vector *second_nodes) const; // Helper backtracks through the lattice from the given node, storing the // path and reversing it. - void ExtractPath(const RecodeNode *node, GenericVector *path) const; - void ExtractPath(const RecodeNode *node, GenericVector *path, + void ExtractPath(const RecodeNode *node, std::vector *path) const; + void ExtractPath(const RecodeNode *node, std::vector *path, int limiter) const; // Helper prints debug information on the given lattice path. - void DebugPath(const UNICHARSET *unicharset, const GenericVector &path) const; + void DebugPath(const UNICHARSET *unicharset, const std::vector &path) const; // Helper prints debug information on the given unichar path. 
- void DebugUnicharPath(const UNICHARSET *unicharset, const GenericVector &path, + void DebugUnicharPath(const UNICHARSET *unicharset, const std::vector &path, const std::vector &unichar_ids, const std::vector &certs, const std::vector &ratings, const std::vector &xcoords) const; diff --git a/src/lstm/tfnetwork.cpp b/src/lstm/tfnetwork.cpp index 6e05a3cb..b076b850 100644 --- a/src/lstm/tfnetwork.cpp +++ b/src/lstm/tfnetwork.cpp @@ -44,7 +44,7 @@ bool TFNetwork::Serialize(TFile *fp) const { return false; std::string proto_str; model_proto_.SerializeToString(&proto_str); - GenericVector data; + std::vector data; data.resize_no_init(proto_str.size()); memcpy(&data[0], proto_str.data(), proto_str.size()); if (!data.Serialize(fp)) @@ -55,7 +55,7 @@ bool TFNetwork::Serialize(TFile *fp) const { // Reads from the given file. Returns false in case of error. // Should be overridden by subclasses, but NOT called by their DeSerialize. bool TFNetwork::DeSerialize(TFile *fp) { - GenericVector data; + std::vector data; if (!data.DeSerialize(fp)) return false; if (!model_proto_.ParseFromArray(&data[0], data.size())) { diff --git a/src/lstm/weightmatrix.cpp b/src/lstm/weightmatrix.cpp index 10d8bac6..ebb72ff0 100644 --- a/src/lstm/weightmatrix.cpp +++ b/src/lstm/weightmatrix.cpp @@ -256,8 +256,8 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) { if (int_mode_) { if (!wi_.DeSerialize(fp)) return false; - GenericVector old_scales; - if (!old_scales.DeSerialize(fp)) + std::vector old_scales; + if (!fp->DeSerialize(old_scales)) return false; scales_.reserve(old_scales.size()); for (int i = 0; i < old_scales.size(); ++i) { diff --git a/src/wordrec/lm_pain_points.cpp b/src/wordrec/lm_pain_points.cpp index 0448e639..886b86f7 100644 --- a/src/wordrec/lm_pain_points.cpp +++ b/src/wordrec/lm_pain_points.cpp @@ -200,9 +200,10 @@ bool LMPainPoints::GeneratePainPoint(int col, int row, LMPainPointsType pp_type, */ void LMPainPoints::RemapForSplit(int index) { for (auto 
&pain_points_heap : pain_points_heaps_) { - GenericVector *heap = pain_points_heap.heap(); - for (int j = 0; j < heap->size(); ++j) - (*heap)[j].data().MapForSplit(index); + std::vector &heap = pain_points_heap.heap(); + for (auto &entry : heap) { + entry.data().MapForSplit(index); + } } } diff --git a/unittest/unicharcompress_test.cc b/unittest/unicharcompress_test.cc index 1f315a1b..c910d5c4 100644 --- a/unittest/unicharcompress_test.cc +++ b/unittest/unicharcompress_test.cc @@ -93,7 +93,7 @@ protected: int len = compressed_.EncodeUnichar(u, &code); // Check round-trip encoding. int unichar_id; - GenericVector normed_ids; + std::vector normed_ids; if (u == null_char_ || u == unicharset_.size()) { unichar_id = null_char_; } else { @@ -137,7 +137,7 @@ protected: const std::vector &times_seen) { RecodedCharID extended = code; int length = code.length(); - const GenericVector *final_codes = compressed_.GetFinalCodes(code); + const std::vector *final_codes = compressed_.GetFinalCodes(code); if (final_codes != nullptr) { for (int i = 0; i < final_codes->size(); ++i) { int ending = (*final_codes)[i]; @@ -147,7 +147,7 @@ protected: EXPECT_NE(INVALID_UNICHAR_ID, unichar_id); } } - const GenericVector *next_codes = compressed_.GetNextCodes(code); + const std::vector *next_codes = compressed_.GetNextCodes(code); if (next_codes != nullptr) { for (int i = 0; i < next_codes->size(); ++i) { int extension = (*next_codes)[i]; From 17eee8648fac199a2bb4cad83ba526f770872b71 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 15 Mar 2021 12:39:24 +0100 Subject: [PATCH 2/5] Replace more GenericVector by std::vector Signed-off-by: Stefan Weil --- src/ccmain/applybox.cpp | 16 ++++++++-------- src/ccmain/tesseractclass.h | 4 ++-- src/ccmain/tfacepp.cpp | 16 ++++++++-------- src/ccstruct/blobs.cpp | 7 +++++-- src/ccstruct/blobs.h | 4 ++-- src/ccstruct/pageres.cpp | 37 +++++++++++++++++++++++-------------- src/ccstruct/pageres.h | 18 +++++++++--------- src/ccstruct/seam.cpp | 16 
++++++++-------- src/ccstruct/seam.h | 12 ++++++------ src/wordrec/chopper.cpp | 29 +++++++++++++---------------- src/wordrec/pieces.cpp | 2 +- src/wordrec/segsearch.cpp | 13 ++++++------- src/wordrec/wordrec.h | 24 +++++++++++------------- 13 files changed, 102 insertions(+), 96 deletions(-) diff --git a/src/ccmain/applybox.cpp b/src/ccmain/applybox.cpp index 29aacf33..5776bca6 100644 --- a/src/ccmain/applybox.cpp +++ b/src/ccmain/applybox.cpp @@ -240,7 +240,7 @@ void Tesseract::MaximallyChopWord(const std::vector &boxes, BLOCK *block, tprintf("Maximally chopping word at:"); word_res->word->bounding_box().print(); } - GenericVector blob_choices; + std::vector blob_choices; ASSERT_HOST(!word_res->chopped_word->blobs.empty()); auto rating = static_cast(INT8_MAX); for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) { @@ -271,7 +271,7 @@ void Tesseract::MaximallyChopWord(const std::vector &boxes, BLOCK *block, // combine confidence w/ serial # auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE); - blob_choices.insert(right_choice, blob_number + 1); + blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice); } } word_res->CloneChoppedToRebuild(); @@ -374,8 +374,8 @@ bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const // Eliminated best_state and correct_text entries for the consumed // blobs. for (int j = 1; j < blob_count; ++j) { - word_res->best_state.remove(i + 1); - word_res->correct_text.remove(i + 1); + word_res->best_state.erase(word_res->best_state.begin() + i + 1); + word_res->correct_text.erase(word_res->correct_text.begin() + i + 1); } // Assume that no box spans multiple source words, so we are done with // this box. @@ -548,7 +548,7 @@ bool Tesseract::FindSegmentation(const GenericVector &target_text, W // match. Using wildcards makes it difficult to find the correct // segmentation even when it is there. 
word_res->best_state.clear(); - GenericVector search_segmentation; + std::vector search_segmentation; float best_rating = 0.0f; SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating, &word_res->best_state); @@ -597,8 +597,8 @@ bool Tesseract::FindSegmentation(const GenericVector &target_text, W /// @param best_segmentation void Tesseract::SearchForText(const GenericVector *choices, int choices_pos, int choices_length, const GenericVector &target_text, - int text_index, float rating, GenericVector *segmentation, - float *best_rating, GenericVector *best_segmentation) { + int text_index, float rating, std::vector *segmentation, + float *best_rating, std::vector *best_segmentation) { const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs(); for (int length = 1; length <= choices[choices_pos].size(); ++length) { // Rating of matching choice or worst choice if no match. @@ -654,7 +654,7 @@ void Tesseract::SearchForText(const GenericVector *choices, unicharset.id_to_unichar(target_text[text_index])); } } - segmentation->truncate(segmentation->size() - 1); + segmentation->resize(segmentation->size() - 1); } } diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index 585ebe63..67897500 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -719,8 +719,8 @@ public: // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. void SearchForText(const GenericVector *choices, int choices_pos, int choices_length, const GenericVector &target_text, - int text_index, float rating, GenericVector *segmentation, - float *best_rating, GenericVector *best_segmentation); + int text_index, float rating, std::vector *segmentation, + float *best_rating, std::vector *best_segmentation); // Counts up the labelled words and the blobs within. // Deletes all unused or emptied words, counting the unused ones. // Resets W_BOL and W_EOL flags correctly. 
diff --git a/src/ccmain/tfacepp.cpp b/src/ccmain/tfacepp.cpp index d602f2cb..edd30071 100644 --- a/src/ccmain/tfacepp.cpp +++ b/src/ccmain/tfacepp.cpp @@ -183,7 +183,7 @@ void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, for (int i = split_pt; i < chopped->NumBlobs(); ++i) { chopped2->blobs.push_back(chopped->blobs[i]); } - chopped->blobs.truncate(split_pt); + chopped->blobs.resize(split_pt); word->chopped_word = nullptr; delete word2->chopped_word; word2->chopped_word = nullptr; @@ -223,8 +223,8 @@ void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_b TBOX prev_box = word->chopped_word->blobs.back()->bounding_box(); TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box(); // Tack the word2 outputs onto the end of the word outputs. - word->chopped_word->blobs += word2->chopped_word->blobs; - word->rebuild_word->blobs += word2->rebuild_word->blobs; + word->chopped_word->blobs.insert(word->chopped_word->blobs.end(), word2->chopped_word->blobs.begin(), word2->chopped_word->blobs.end()); + word->rebuild_word->blobs.insert(word->rebuild_word->blobs.end(), word2->rebuild_word->blobs.begin(), word2->rebuild_word->blobs.end()); word2->chopped_word->blobs.clear(); word2->rebuild_word->blobs.clear(); TPOINT split_pt; @@ -234,17 +234,17 @@ void Tesseract::join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_b // Since the seam list is one element short, an empty seam marking the // end of the last blob in the first word is needed first. word->seam_array.push_back(new SEAM(0.0f, split_pt)); - word->seam_array += word2->seam_array; - word2->seam_array.truncate(0); + word->seam_array.insert(word->seam_array.end(), word2->seam_array.begin(), word2->seam_array.end()); + word2->seam_array.clear(); // Fix widths and gaps. 
- word->blob_widths += word2->blob_widths; - word->blob_gaps += word2->blob_gaps; + word->blob_widths.insert(word->blob_widths.end(), word2->blob_widths.begin(), word2->blob_widths.end()); + word->blob_gaps.insert(word->blob_gaps.end(), word2->blob_gaps.begin(), word2->blob_gaps.end()); // Fix the ratings matrix. int rat1 = word->ratings->dimension(); int rat2 = word2->ratings->dimension(); word->ratings->AttachOnCorner(word2->ratings); ASSERT_HOST(word->ratings->dimension() == rat1 + rat2); - word->best_state += word2->best_state; + word->best_state.insert(word->best_state.end(), word2->best_state.begin(), word2->best_state.end()); // Append the word choices. *word->raw_choice += *word2->raw_choice; diff --git a/src/ccstruct/blobs.cpp b/src/ccstruct/blobs.cpp index e0481d8b..178ff930 100644 --- a/src/ccstruct/blobs.cpp +++ b/src/ccstruct/blobs.cpp @@ -826,7 +826,9 @@ void TWERD::CopyFrom(const TWERD &src) { // Deletes owned data. void TWERD::Clear() { - blobs.delete_data_pointers(); + for (auto blob : blobs) { + delete blob; + } blobs.clear(); } @@ -869,8 +871,9 @@ void TWERD::MergeBlobs(int start, int end) { blobs[i] = nullptr; } // Remove dead blobs from the vector. + // TODO: optimize. for (int i = start + 1; i < end && start + 1 < blobs.size(); ++i) { - blobs.remove(start + 1); + blobs.erase(blobs.begin() + start + 1); } } diff --git a/src/ccstruct/blobs.h b/src/ccstruct/blobs.h index 1102b559..683129fc 100644 --- a/src/ccstruct/blobs.h +++ b/src/ccstruct/blobs.h @@ -450,8 +450,8 @@ struct TWERD { void plot(ScrollView *window); - GenericVector blobs; // Blobs in word. - bool latin_script; // This word is in a latin-based script. + std::vector blobs; // Blobs in word. + bool latin_script; // This word is in a latin-based script. 
}; /*---------------------------------------------------------------------- diff --git a/src/ccstruct/pageres.cpp b/src/ccstruct/pageres.cpp index 80bed0a3..e357f500 100644 --- a/src/ccstruct/pageres.cpp +++ b/src/ccstruct/pageres.cpp @@ -391,8 +391,8 @@ void WERD_RES::SetupBlamerBundle() { // Computes the blob_widths and blob_gaps from the chopped_word. void WERD_RES::SetupBlobWidthsAndGaps() { - blob_widths.truncate(0); - blob_gaps.truncate(0); + blob_widths.clear(); + blob_gaps.clear(); int num_blobs = chopped_word->NumBlobs(); for (int b = 0; b < num_blobs; ++b) { TBLOB *blob = chopped_word->blobs[b]; @@ -410,7 +410,7 @@ void WERD_RES::SetupBlobWidthsAndGaps() { void WERD_RES::InsertSeam(int blob_number, SEAM *seam) { // Insert the seam into the SEAMS array. seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true); - seam_array.insert(seam, blob_number); + seam_array.insert(seam_array.begin() + blob_number, seam); if (ratings != nullptr) { // Expand the ratings matrix. ratings = ratings->ConsumeAndMakeBigger(blob_number); @@ -753,13 +753,20 @@ void WERD_RES::ConsumeWordResults(WERD_RES *word) { MovePointerData(&chopped_word, &word->chopped_word); MovePointerData(&rebuild_word, &word->rebuild_word); MovePointerData(&box_word, &word->box_word); - seam_array.delete_data_pointers(); + for (auto data : seam_array) { + delete data; + } seam_array = word->seam_array; word->seam_array.clear(); - best_state.move(&word->best_state); - correct_text.move(&word->correct_text); - blob_widths.move(&word->blob_widths); - blob_gaps.move(&word->blob_gaps); + // TODO: optimize moves. 
+ best_state = word->best_state; + word->best_state.clear(); + correct_text = word->correct_text; + word->correct_text.clear(); + blob_widths = word->blob_widths; + word->blob_widths.clear(); + blob_gaps = word->blob_gaps; + word->blob_gaps.clear(); if (ratings != nullptr) ratings->delete_matrix_pointers(); MovePointerData(&ratings, &word->ratings); @@ -797,7 +804,7 @@ void WERD_RES::RebuildBestState() { rebuild_word = new TWERD; if (seam_array.empty()) start_seam_list(chopped_word, &seam_array); - best_state.truncate(0); + best_state.clear(); int start = 0; for (int i = 0; i < best_choice->length(); ++i) { int length = best_choice->state(i); @@ -873,7 +880,7 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE **choices) { } FakeWordFromRatings(TOP_CHOICE_PERM); reject_map.initialise(blob_count); - best_state.init_to_size(blob_count, 1); + best_state.resize(blob_count, 1); done = true; } @@ -958,7 +965,7 @@ void WERD_RES::MergeAdjacentBlobs(int index) { box_word->MergeBoxes(index, index + 2); if (index + 1 < best_state.size()) { best_state[index] += best_state[index + 1]; - best_state.remove(index + 1); + best_state.erase(best_state.begin() + index + 1); } } @@ -1088,7 +1095,9 @@ void WERD_RES::ClearResults() { box_word = nullptr; best_state.clear(); correct_text.clear(); - seam_array.delete_data_pointers(); + for (auto data : seam_array) { + delete data; + } seam_array.clear(); blob_widths.clear(); blob_gaps.clear(); @@ -1204,7 +1213,7 @@ WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *ne // are likely very poor, if they come from LSTM, where it only outputs the // character at one pixel within it, so we find the midpoints between them. 
static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box, - C_BLOB_LIST *next_word_blobs, GenericVector *blob_ends) { + C_BLOB_LIST *next_word_blobs, std::vector *blob_ends) { C_BLOB_IT blob_it(word.word->cblob_list()); for (int i = 0; i < word.best_state.size(); ++i) { int length = word.best_state[i]; @@ -1341,7 +1350,7 @@ void PAGE_RES_IT::ReplaceCurrentWord(tesseract::PointerVector *words) WERD_RES *word_w = (*words)[w]; clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word); // Compute blob boundaries. - GenericVector blob_ends; + std::vector blob_ends; C_BLOB_LIST *next_word_blobs = w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr; ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends); diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h index 3a5f8530..1597a315 100644 --- a/src/ccstruct/pageres.h +++ b/src/ccstruct/pageres.h @@ -31,7 +31,7 @@ #include "werd.h" // for WERD, W_BOL, W_EOL #include // for UNICHAR_ID, INVALID_UNICHAR_ID -#include "genericvector.h" // for GenericVector, PointerVector (ptr only) +#include "genericvector.h" // for PointerVector (ptr only) #include // for int8_t #include // for int32_t, int16_t @@ -83,19 +83,19 @@ public: // the next word. This pointer is not owned by PAGE_RES class. WERD_CHOICE **prev_word_best_choice; // Sums of blame reasons computed by the blamer. - GenericVector blame_reasons; + std::vector blame_reasons; // Debug information about all the misadaptions on this page. // Each BlamerBundle contains an index into this vector, so that words that // caused misadaption could be marked. However, since words could be // deleted/split/merged, the log is stored on the PAGE_RES level. 
- GenericVector misadaption_log; + std::vector misadaption_log; inline void Init() { char_count = 0; rej_count = 0; rejected = false; prev_word_best_choice = nullptr; - blame_reasons.init_to_size(IRR_NUM_REASONS, 0); + blame_reasons.resize(IRR_NUM_REASONS); } PAGE_RES() { @@ -207,12 +207,12 @@ public: // The length of chopped_word matches length of seam_array + 1 (if set). TWERD *chopped_word = nullptr; // BLN chopped fragments output. // Vector of SEAM* holding chopping points matching chopped_word. - GenericVector seam_array; + std::vector seam_array; // Widths of blobs in chopped_word. - GenericVector blob_widths; + std::vector blob_widths; // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between // blob i and blob i+1. - GenericVector blob_gaps; + std::vector blob_gaps; // Stores the lstm choices of every timestep std::vector>> timesteps; // Stores the lstm choices of every timestep segmented by character @@ -277,11 +277,11 @@ public: // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i] // adjacent blobs in chopped_word. The seams in seam_array are hidden // within a rebuild_word blob and revealed between them. - GenericVector best_state; // Number of blobs in each best blob. + std::vector best_state; // Number of blobs in each best blob. // The correct_text is used during training and adaption to carry the // text to the training system without the need for a unicharset. There // is one entry in the vector for each blob in rebuild_word and box_word. - GenericVector correct_text; + std::vector correct_text; // Less-well documented members. // TODO(rays) Add more documentation here. diff --git a/src/ccstruct/seam.cpp b/src/ccstruct/seam.cpp index 6a382304..0b1a2ca3 100644 --- a/src/ccstruct/seam.cpp +++ b/src/ccstruct/seam.cpp @@ -51,8 +51,8 @@ bool SEAM::IsHealthy(const TBLOB &blob, int min_points, int min_area) const { // seam, which is about to be inserted at insert_index. 
Returns false if // any of the computations fails, as this indicates an invalid chop. // widthn_/widthp_ are only changed if modify is true. -bool SEAM::PrepareToInsertSeam(const GenericVector &seams, - const GenericVector &blobs, int insert_index, bool modify) { +bool SEAM::PrepareToInsertSeam(const std::vector &seams, + const std::vector &blobs, int insert_index, bool modify) { for (int s = 0; s < insert_index; ++s) { if (!seams[s]->FindBlobWidth(blobs, s, modify)) return false; @@ -68,7 +68,7 @@ bool SEAM::PrepareToInsertSeam(const GenericVector &seams, // Computes the widthp_/widthn_ range. Returns false if not all the splits // are accounted for. widthn_/widthp_ are only changed if modify is true. -bool SEAM::FindBlobWidth(const GenericVector &blobs, int index, bool modify) { +bool SEAM::FindBlobWidth(const std::vector &blobs, int index, bool modify) { int num_found = 0; if (modify) { widthp_ = 0; @@ -147,7 +147,7 @@ void SEAM::Print(const char *label) const { // Prints a collection of SEAMs. /* static */ -void SEAM::PrintSeams(const char *label, const GenericVector &seams) { +void SEAM::PrintSeams(const char *label, const std::vector &seams) { if (!seams.empty()) { tprintf("%s\n", label); for (int x = 0; x < seams.size(); ++x) { @@ -169,7 +169,7 @@ void SEAM::Mark(ScrollView *window) const { // Break up the blobs in this chain so that they are all independent. // This operation should undo the affect of join_pieces. 
/* static */ -void SEAM::BreakPieces(const GenericVector &seams, const GenericVector &blobs, +void SEAM::BreakPieces(const std::vector &seams, const std::vector &blobs, int first, int last) { for (int x = first; x < last; ++x) seams[x]->Reveal(); @@ -191,7 +191,7 @@ void SEAM::BreakPieces(const GenericVector &seams, const GenericVector &seams, const GenericVector &blobs, +void SEAM::JoinPieces(const std::vector &seams, const std::vector &blobs, int first, int last) { TESSLINE *outline = blobs[first]->outlines; if (!outline) @@ -245,8 +245,8 @@ float SEAM::FullPriority(int xmin, int xmax, double overlap_knob, int centered_m * present in the starting segmentation. Each of the seams created * by this routine have location information only. */ -void start_seam_list(TWERD *word, GenericVector *seam_array) { - seam_array->truncate(0); +void start_seam_list(TWERD *word, std::vector *seam_array) { + seam_array->clear(); TPOINT location; for (int b = 1; b < word->NumBlobs(); ++b) { diff --git a/src/ccstruct/seam.h b/src/ccstruct/seam.h index d9a2e5f2..51b29a9c 100644 --- a/src/ccstruct/seam.h +++ b/src/ccstruct/seam.h @@ -133,11 +133,11 @@ public: // seam, which is about to be inserted at insert_index. Returns false if // any of the computations fails, as this indicates an invalid chop. // widthn_/widthp_ are only changed if modify is true. - bool PrepareToInsertSeam(const GenericVector &seams, const GenericVector &blobs, + bool PrepareToInsertSeam(const std::vector &seams, const std::vector &blobs, int insert_index, bool modify); // Computes the widthp_/widthn_ range. Returns false if not all the splits // are accounted for. widthn_/widthp_ are only changed if modify is true. - bool FindBlobWidth(const GenericVector &blobs, int index, bool modify); + bool FindBlobWidth(const std::vector &blobs, int index, bool modify); // Splits this blob into two blobs by applying the splits included in // *this SEAM @@ -149,7 +149,7 @@ public: // Prints everything in *this SEAM. 
void Print(const char *label) const; // Prints a collection of SEAMs. - static void PrintSeams(const char *label, const GenericVector &seams); + static void PrintSeams(const char *label, const std::vector &seams); #ifndef GRAPHICS_DISABLED // Draws the seam in the given window. void Mark(ScrollView *window) const; @@ -157,11 +157,11 @@ public: // Break up the blobs in this chain so that they are all independent. // This operation should undo the affect of join_pieces. - static void BreakPieces(const GenericVector &seams, const GenericVector &blobs, + static void BreakPieces(const std::vector &seams, const std::vector &blobs, int first, int last); // Join a group of base level pieces into a single blob that can then // be classified. - static void JoinPieces(const GenericVector &seams, const GenericVector &blobs, + static void JoinPieces(const std::vector &seams, const std::vector &blobs, int first, int last); // Hides the seam so the outlines appear not to be cut by it. @@ -193,7 +193,7 @@ private: SPLIT splits_[kMaxNumSplits]; }; -void start_seam_list(TWERD *word, GenericVector *seam_array); +void start_seam_list(TWERD *word, std::vector *seam_array); } // namespace tesseract diff --git a/src/wordrec/chopper.cpp b/src/wordrec/chopper.cpp index 955a7982..a7d17c56 100644 --- a/src/wordrec/chopper.cpp +++ b/src/wordrec/chopper.cpp @@ -41,9 +41,6 @@ namespace tesseract { -template -class GenericVector; - // Even though the limit on the number of chunks may now be removed, keep // the same limit for repeatable behavior, and it may be a speed advantage. static const int kMaxNumChunks = 64; @@ -79,7 +76,7 @@ static int check_blob(TBLOB *blob) { * * Return true if any of the splits share a point with this one. 
*/ -static int any_shared_split_points(const GenericVector &seams, SEAM *seam) { +static int any_shared_split_points(const std::vector &seams, SEAM *seam) { int length; int index; @@ -167,13 +164,13 @@ static int16_t total_containment(TBLOB *blob1, TBLOB *blob2) { // Helper runs all the checks on a seam to make sure it is valid. // Returns the seam if OK, otherwise deletes the seam and returns nullptr. static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob, - TBLOB *other_blob, const GenericVector &seams, SEAM *seam) { + TBLOB *other_blob, const std::vector &seams, SEAM *seam) { if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr || total_containment(blob, other_blob) || check_blob(other_blob) || !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) || any_shared_split_points(seams, seam) || !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) { - word->blobs.remove(blob_number + 1); + word->blobs.erase(word->blobs.begin() + blob_number + 1); if (seam) { seam->UndoSeam(blob, other_blob); delete seam; @@ -200,12 +197,12 @@ static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB * it was successful. */ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, - const GenericVector &seams) { + const std::vector &seams) { if (repair_unchopped_blobs) preserve_outline_tree(blob->outlines); TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */ // Insert it into the word. 
- word->blobs.insert(other_blob, blob_number + 1); + word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob); SEAM *seam = nullptr; if (prioritize_division) { @@ -235,7 +232,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */ - word->blobs.insert(other_blob, blob_number + 1); + word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob); seam = new SEAM(0.0f, location); seam->ApplySeam(italic_blob, blob, other_blob); seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam); @@ -250,7 +247,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, } SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, - const GenericVector &seams) { + const std::vector &seams) { return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams); } @@ -305,7 +302,7 @@ SEAM *Wordrec::chop_overlapping_blob(const std::vector &boxes, bool italic * word->seam_array and the resulting blobs are unclassified, so this function * can be used by ApplyBox as well as during recognition. */ -SEAM *Wordrec::improve_one_blob(const GenericVector &blob_choices, DANGERR *fixpt, +SEAM *Wordrec::improve_one_blob(const std::vector &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number) { float rating_ceiling = FLT_MAX; @@ -347,7 +344,7 @@ SEAM *Wordrec::improve_one_blob(const GenericVector &blob_choices * Used for testing chopper. 
*/ SEAM *Wordrec::chop_one_blob(const std::vector &boxes, - const GenericVector &blob_choices, WERD_RES *word_res, + const std::vector &blob_choices, WERD_RES *word_res, int *blob_number) { if (prioritize_division) { return chop_overlapping_blob(boxes, true, word_res, blob_number); @@ -427,12 +424,12 @@ void Wordrec::chop_word_main(WERD_RES *word) { void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, - GenericVector *pending) { + std::vector *pending) { int blob_number; do { // improvement loop. // Make a simple vector of BLOB_CHOICEs to make it easy to pick which // one to chop. - GenericVector blob_choices; + std::vector blob_choices; int num_blobs = word->ratings->dimension(); for (int i = 0; i < num_blobs; ++i) { BLOB_CHOICE_LIST *choices = word->ratings->get(i, i); @@ -460,7 +457,7 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word, // Remap existing pain points. pain_points->RemapForSplit(blob_number); // Insert a new pending at the chop point. - pending->insert(SegSearchPending(), blob_number); + pending->insert(pending->begin() + blob_number, SegSearchPending()); // Classify the two newly created blobs using ProcessSegSearchPainPoint, // as that updates the pending correctly and adds new pain points. @@ -501,7 +498,7 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word, * These are the results of the last classification. Find a likely * place to apply splits. If none, return -1. 
**********************************************************************/ -int Wordrec::select_blob_to_split(const GenericVector &blob_choices, +int Wordrec::select_blob_to_split(const std::vector &blob_choices, float rating_ceiling, bool split_next_to_fragment) { BLOB_CHOICE *blob_choice; int x; diff --git a/src/wordrec/pieces.cpp b/src/wordrec/pieces.cpp index ec5b31e0..84c1fcf6 100644 --- a/src/wordrec/pieces.cpp +++ b/src/wordrec/pieces.cpp @@ -46,7 +46,7 @@ using tesseract::ScoredFont; * the collection of small pieces un modified. **********************************************************************/ namespace tesseract { -BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector &seams, int16_t start, +BLOB_CHOICE_LIST *Wordrec::classify_piece(const std::vector &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle) { if (end > start) diff --git a/src/wordrec/segsearch.cpp b/src/wordrec/segsearch.cpp index f4ee21f4..29fec817 100644 --- a/src/wordrec/segsearch.cpp +++ b/src/wordrec/segsearch.cpp @@ -19,7 +19,6 @@ #include // for INT32_MAX #include "blamer.h" // for BlamerBundle #include "errcode.h" // for ASSERT_HOST -#include "genericvector.h" // for GenericVector #include "lm_pain_points.h" // for LMPainPoints, LM_PPTYPE_SHAPE, LMPainPoi... #include "lm_state.h" // for BestChoiceBundle, ViterbiStateEntry #include "matrix.h" // for MATRIX_COORD, MATRIX @@ -44,7 +43,7 @@ void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle // Compute scaling factor that will help us recover blob outline length // from classifier rating and certainty for the blob. 
float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale; - GenericVector pending; + std::vector pending; InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle, blamer_bundle); if (!SegSearchDone(0)) { // find a better choice @@ -122,7 +121,7 @@ void Wordrec::SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle // without doing any additional chopping or joining. // (Internal factored version that can be used as part of the main SegSearch.) void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, - GenericVector *pending, + std::vector *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) { if (segsearch_debug_level > 0) { tprintf("Starting SegSearch on ratings matrix%s:\n", @@ -154,7 +153,7 @@ void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, // children are considered in the non-decreasing order of their column, since // this guarantees that all the parents would be up to date before an update // of a child is done. - pending->init_to_size(word_res->ratings->dimension(), SegSearchPending()); + pending->resize(word_res->ratings->dimension(), SegSearchPending()); // Search the ratings matrix for the initial best path. 
(*pending)[0].SetColumnClassified(); @@ -163,7 +162,7 @@ void Wordrec::InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, } void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col, - GenericVector *pending, WERD_RES *word_res, + std::vector *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) { MATRIX *ratings = word_res->ratings; @@ -223,7 +222,7 @@ void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col, void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, - GenericVector *pending, + std::vector *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle) { if (segsearch_debug_level > 0) { @@ -279,7 +278,7 @@ void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_ // Needed when the n-gram model is enabled, as the multi-length comparison // implementation will re-value existing paths to worse values. void Wordrec::ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, - GenericVector *pending) { + std::vector *pending) { // TODO(rays) More refactoring required here. // Delete existing viterbi states. for (int col = 0; col < best_choice_bundle->beam.size(); ++col) { diff --git a/src/wordrec/wordrec.h b/src/wordrec/wordrec.h index 05119704..90f2e919 100644 --- a/src/wordrec/wordrec.h +++ b/src/wordrec/wordrec.h @@ -81,8 +81,6 @@ public: # include "seam.h" // for SEAM (ptr only), PRIORITY # include "stopper.h" // for DANGERR -# include "genericvector.h" // for GenericVector - # include // for int16_t, int32_t namespace tesseract { @@ -329,7 +327,7 @@ public: // without doing any additional chopping or joining. // (Internal factored version that can be used as part of the main SegSearch.) 
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, - GenericVector *pending, + std::vector *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); // Runs SegSearch() function (above) without needing a best_choice_bundle @@ -352,22 +350,22 @@ public: // chopper.cpp SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, - const GenericVector &seams); + const std::vector &seams); SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, - const GenericVector &seams); + const std::vector &seams); SEAM *chop_overlapping_blob(const std::vector &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number); - SEAM *improve_one_blob(const GenericVector &blob_choices, DANGERR *fixpt, + SEAM *improve_one_blob(const std::vector &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number); SEAM *chop_one_blob(const std::vector &boxes, - const GenericVector &blob_choices, WERD_RES *word_res, + const std::vector &blob_choices, WERD_RES *word_res, int *blob_number); void chop_word_main(WERD_RES *word); void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, - LMPainPoints *pain_points, GenericVector *pending); - int select_blob_to_split(const GenericVector &blob_choices, float rating_ceiling, + LMPainPoints *pain_points, std::vector *pending); + int select_blob_to_split(const std::vector &blob_choices, float rating_ceiling, bool split_next_to_fragment); int select_blob_to_split_from_fixpt(DANGERR *fixpt); @@ -391,7 +389,7 @@ public: bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt); // pieces.cpp - virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector &seams, int16_t start, + virtual BLOB_CHOICE_LIST *classify_piece(const std::vector &seams, int16_t start, int16_t end, const char *description, TWERD *word, 
BlamerBundle *blamer_bundle); // Try to merge fragments in the ratings matrix and put the result in @@ -466,7 +464,7 @@ protected: // if a new best choice is found // void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, - GenericVector *pending, WERD_RES *word_res, + std::vector *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); @@ -474,13 +472,13 @@ protected: // new pain points to join the newly classified blob with its neighbors. void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, - GenericVector *pending, WERD_RES *word_res, + std::vector *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle); // Resets enough of the results so that the Viterbi search is re-run. // Needed when the n-gram model is enabled, as the multi-length comparison // implementation will re-value existing paths to worse values. 
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, - GenericVector *pending); + std::vector *pending); // Add pain points for classifying blobs on the correct segmentation path // (so that we can evaluate correct segmentation path and discover the reason From bf42f8313d6258be3db481f8cea80995aaf117db Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 15 Mar 2021 12:58:24 +0100 Subject: [PATCH 3/5] Replace remaining GenericVector by std::vector for src/dict Signed-off-by: Stefan Weil --- src/ccstruct/params_training_featdef.h | 1 + src/dict/dawg.h | 15 +++++----- src/dict/dict.cpp | 40 ++++++++++++++------------ src/dict/dict.h | 4 +-- src/dict/stopper.h | 4 +-- src/dict/trie.cpp | 26 +++++++++-------- src/dict/trie.h | 18 ++++++------ src/wordrec/language_model.cpp | 2 -- src/wordrec/params_model.cpp | 10 ++++--- src/wordrec/params_model.h | 10 +++---- 10 files changed, 67 insertions(+), 63 deletions(-) diff --git a/src/ccstruct/params_training_featdef.h b/src/ccstruct/params_training_featdef.h index 631a5a68..18c9c726 100644 --- a/src/ccstruct/params_training_featdef.h +++ b/src/ccstruct/params_training_featdef.h @@ -19,6 +19,7 @@ #ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_ #define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_ +#include // for memset #include #include diff --git a/src/dict/dawg.h b/src/dict/dawg.h index 2f00885d..67cf0b1f 100644 --- a/src/dict/dawg.h +++ b/src/dict/dawg.h @@ -57,9 +57,9 @@ struct NodeChild { NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} }; -using NodeChildVector = GenericVector; -using SuccessorList = GenericVector; -using SuccessorListsVector = GenericVector; +using NodeChildVector = std::vector; +using SuccessorList = std::vector; +using SuccessorListsVector = std::vector; enum DawgType { DAWG_TYPE_PUNCTUATION, @@ -176,7 +176,7 @@ public: /// Fills vec with unichar ids that represent the character classes /// of the given unichar_id. 
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, - GenericVector *vec) const { + std::vector *vec) const { (void)unichar_id; (void)unicharset; (void)vec; @@ -355,15 +355,16 @@ struct DawgPosition { bool back_to_punc = false; }; -class DawgPositionVector : public GenericVector { +class DawgPositionVector : public std::vector { public: /// Adds an entry for the given dawg_index with the given node to the vec. /// Returns false if the same entry already exists in the vector, /// true otherwise. inline bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg) { - for (int i = 0; i < size(); ++i) { - if (data_[i] == new_pos) + for (auto position : *this) { + if (position == new_pos) { return false; + } } push_back(new_pos); if (debug) { diff --git a/src/dict/dict.cpp b/src/dict/dict.cpp index 5ae7cc0f..122083f5 100644 --- a/src/dict/dict.cpp +++ b/src/dict/dict.cpp @@ -201,19 +201,19 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file); if (punc_dawg_) - dawgs_ += punc_dawg_; + dawgs_.push_back(punc_dawg_); } if (load_system_dawg) { Dawg *system_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file); if (system_dawg) - dawgs_ += system_dawg; + dawgs_.push_back(system_dawg); } if (load_number_dawg) { Dawg *number_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) - dawgs_ += number_dawg; + dawgs_.push_back(number_dawg); } if (load_bigram_dawg) { bigram_dawg_ = @@ -225,13 +225,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file); if (freq_dawg_) - dawgs_ += freq_dawg_; + dawgs_.push_back(freq_dawg_); } if (load_unambig_dawg) { unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, 
TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file); if (unambig_dawg_) - dawgs_ += unambig_dawg_; + dawgs_.push_back(unambig_dawg_); } std::string name; @@ -249,7 +249,7 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } @@ -267,13 +267,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level); - dawgs_ += document_words_; + dawgs_.push_back(document_words_); // This dawg is temporary and should not be searched by letter_is_ok. pending_words_ = @@ -287,19 +287,19 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) { punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file); if (punc_dawg_) - dawgs_ += punc_dawg_; + dawgs_.push_back(punc_dawg_); } if (load_system_dawg) { Dawg *system_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file); if (system_dawg) - dawgs_ += system_dawg; + dawgs_.push_back(system_dawg); } if (load_number_dawg) { Dawg *number_dawg = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) - dawgs_ += number_dawg; + dawgs_.push_back(number_dawg); } // stolen from Dict::Load (but needs params_ from Tesseract @@ -319,7 +319,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } @@ -337,7 +337,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) { tprintf("Error: failed to load %s\n", 
name.c_str()); delete trie_ptr; } else { - dawgs_ += trie_ptr; + dawgs_.push_back(trie_ptr); } } } @@ -358,9 +358,9 @@ bool Dict::FinishLoad() { const Dawg *other = dawgs_[j]; if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) && kDawgSuccessors[dawg->type()][other->type()]) - *lst += j; + lst->push_back(j); } - successors_ += lst; + successors_.push_back(lst); } return true; } @@ -378,7 +378,9 @@ void Dict::End() { delete dawg_cache_; dawg_cache_ = nullptr; } - successors_.delete_data_pointers(); + for (auto successor : successors_) { + delete successor; + } dawgs_.clear(); successors_.clear(); document_words_ = nullptr; @@ -550,7 +552,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); // Try to find the edge corresponding to the exact unichar_id and to all the // edges corresponding to the character class of unichar_id. - GenericVector unichar_id_patterns; + std::vector unichar_id_patterns; unichar_id_patterns.push_back(unichar_id); dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns); for (int i = 0; i < unichar_id_patterns.size(); ++i) { @@ -605,12 +607,12 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern int dawg_ty = dawgs_[i]->type(); bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; if (dawg_ty == DAWG_TYPE_PUNCTUATION) { - *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false); + dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false)); if (dawg_debug_level >= 3) { tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); } } else if (!punc_dawg_available || !subsumed_by_punc) { - *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false); + dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false)); if (dawg_debug_level >= 3) { tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); } diff --git 
a/src/dict/dict.h b/src/dict/dict.h index 9a40afc6..37c14ce1 100644 --- a/src/dict/dict.h +++ b/src/dict/dict.h @@ -54,7 +54,7 @@ struct CHAR_FRAGMENT_INFO { float certainty; }; -using DawgVector = GenericVector; +using DawgVector = std::vector; // // Constants @@ -495,7 +495,7 @@ private: // matching. The first member of each list is taken as canonical. For // example, the first list contains hyphens and dashes with the first symbol // being the ASCII hyphen minus. - std::vector> equivalent_symbols_; + std::vector> equivalent_symbols_; // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs. DawgCache *dawg_cache_; bool dawg_cache_is_ours_; // we should delete our own dawg_cache_ diff --git a/src/dict/stopper.h b/src/dict/stopper.h index f675c0a6..23be742e 100644 --- a/src/dict/stopper.h +++ b/src/dict/stopper.h @@ -2,7 +2,6 @@ ** Filename: stopper.h ** Purpose: Stopping criteria for word classifier. ** Author: Dan Johnson - ** History: Wed May 1 09:42:57 1991, DSJ, Created. ** ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,7 +21,6 @@ #include "ratngs.h" #include -#include "genericvector.h" namespace tesseract { @@ -46,7 +44,7 @@ struct DANGERR_INFO { UNICHAR_ID leftmost; // in the replacement, what's the leftmost character? }; -using DANGERR = GenericVector; +using DANGERR = std::vector; } // namespace tesseract diff --git a/src/dict/trie.cpp b/src/dict/trie.cpp index e739c802..95917f41 100644 --- a/src/dict/trie.cpp +++ b/src/dict/trie.cpp @@ -24,7 +24,6 @@ #include "dawg.h" #include "dict.h" -#include "genericvector.h" #include "helpers.h" #include "kdpair.h" @@ -49,7 +48,9 @@ const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { // Reset the Trie to empty. 
void Trie::clear() { - nodes_.delete_data_pointers(); + for (auto node : nodes_) { + delete node; + } nodes_.clear(); root_back_freelist_.clear(); num_edges_ = 0; @@ -122,10 +123,11 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in EDGE_RECORD edge_rec; link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id); if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) { - EDGE_INDEX edge_index = root_back_freelist_.pop_back(); + EDGE_INDEX edge_index = root_back_freelist_.back(); + root_back_freelist_.pop_back(); (*vec)[edge_index] = edge_rec; } else if (search_index < vec->size()) { - vec->insert(edge_rec, search_index); + vec->insert(vec->begin() + search_index, edge_rec); } else { vec->push_back(edge_rec); } @@ -153,7 +155,7 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool m *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); } -bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector *repetitions) { +bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector *repetitions) { if (word.length() <= 0) return false; // can't add empty words if (repetitions != nullptr) @@ -330,7 +332,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) { } void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, - GenericVector *vec) const { + std::vector *vec) const { bool is_alpha = unicharset.get_isalpha(unichar_id); if (is_alpha) { vec->push_back(alpha_pattern_); @@ -388,7 +390,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset) // Parse the pattern and construct a unichar id vector. // Record the number of repetitions of each unichar in the parallel vector. 
WERD_CHOICE word(&unicharset); - GenericVector repetitions_vec; + std::vector repetitions_vec; const char *str_ptr = string; int step = unicharset.step(str_ptr); bool failed = false; @@ -462,12 +464,12 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo tprintf("\n"); } if (direction == FORWARD_EDGE) { - nodes_[node1]->forward_edges.remove(edge_index); + nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index); } else if (node1 == 0) { KillEdge(&nodes_[node1]->backward_edges[edge_index]); root_back_freelist_.push_back(edge_index); } else { - nodes_[node1]->backward_edges.remove(edge_index); + nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index); } --num_edges_; } @@ -476,7 +478,7 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo // 1 Avoid insertion sorting or bubble sorting the tail root node // (back links on node 0, a list of all the leaves.). The node is // huge, and sorting it with n^2 time is terrible. -// 2 Avoid using GenericVector::remove on the tail root node. +// 2 Avoid using vector::erase on the tail root node. // (a) During add of words to the trie, zero-out the unichars and // keep a freelist of spaces to re-use. 
// (b) During reduction, just zero-out the unichars of deleted back @@ -624,13 +626,13 @@ void Trie::sort_edges(EDGE_VECTOR *edges) { int num_edges = edges->size(); if (num_edges <= 1) return; - GenericVector> sort_vec; + std::vector> sort_vec; sort_vec.reserve(num_edges); for (int i = 0; i < num_edges; ++i) { sort_vec.push_back( KDPairInc(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i])); } - sort_vec.sort(); + std::sort(sort_vec.begin(), sort_vec.end()); for (int i = 0; i < num_edges; ++i) (*edges)[i] = sort_vec[i].data(); } diff --git a/src/dict/trie.h b/src/dict/trie.h index 7a76e008..f3c9850f 100644 --- a/src/dict/trie.h +++ b/src/dict/trie.h @@ -21,14 +21,12 @@ #include "dawg.h" -#include "genericvector.h" - namespace tesseract { class UNICHARSET; // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed -// max int32, we will need to change GenericVector to use int64 for size +// max int32, we will need to change vector to use int64 for size // and address indices. This does not seem to be needed immediately, // since currently the largest number of edges limit used by tesseract // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32. @@ -39,13 +37,13 @@ class UNICHARSET; // the 64 bit EDGE_RECORD. using EDGE_INDEX = int64_t; // index of an edge in a given node using NODE_MARKER = bool *; -using EDGE_VECTOR = GenericVector; +using EDGE_VECTOR = std::vector; struct TRIE_NODE_RECORD { EDGE_VECTOR forward_edges; EDGE_VECTOR backward_edges; }; -using TRIE_NODES = GenericVector; +using TRIE_NODES = std::vector; /** * Concrete class for Trie data structure that allows to store a list of @@ -88,7 +86,9 @@ public: initialized_patterns_ = false; } ~Trie() override { - nodes_.delete_data_pointers(); + for (auto node : nodes_) { + delete node; + } } // Reset the Trie to empty. @@ -230,7 +230,7 @@ public: // Fills in the given unichar id vector with the unichar ids that represent // the patterns of the character classes of the given unichar_id. 
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, - GenericVector *vec) const override; + std::vector *vec) const override; // Returns the given EDGE_REF if the EDGE_RECORD that it points to has // a self loop and the given unichar_id matches the unichar_id stored in the @@ -256,7 +256,7 @@ public: // // Return true if add succeeded, false otherwise (e.g. when a word contained // an invalid unichar id or the trie was getting too large and was cleared). - bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector *repetitions); + bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector *repetitions); bool add_word_to_dawg(const WERD_CHOICE &word) { return add_word_to_dawg(word, nullptr); } @@ -395,7 +395,7 @@ protected: // Member variables TRIE_NODES nodes_; // vector of nodes in the Trie // Freelist of edges in the root backwards node that were previously zeroed. - GenericVector root_back_freelist_; + std::vector root_back_freelist_; uint64_t num_edges_; // sum of all edges (forward and backward) uint64_t deref_direction_mask_; // mask for EDGE_REF to extract direction uint64_t deref_node_index_mask_; // mask for EDGE_REF to extract node index diff --git a/src/wordrec/language_model.cpp b/src/wordrec/language_model.cpp index b7dd36c6..11a0abca 100644 --- a/src/wordrec/language_model.cpp +++ b/src/wordrec/language_model.cpp @@ -34,8 +34,6 @@ #include "unicharset.h" // for UNICHARSET #include "unicity_table.h" // for UnicityTable -template -class GenericVector; template class UnicityTable; diff --git a/src/wordrec/params_model.cpp b/src/wordrec/params_model.cpp index c32b37af..bc1c3aba 100644 --- a/src/wordrec/params_model.cpp +++ b/src/wordrec/params_model.cpp @@ -23,6 +23,8 @@ #include #include "bitvector.h" +#include "helpers.h" // for ClipToRange +#include "serialis.h" // for TFile #include "tprintf.h" namespace tesseract { @@ -103,8 +105,8 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) { 
present.Init(PTRAIN_NUM_FEATURE_TYPES); lang_ = lang; // Load weights for passes with adaption on. - GenericVector &weights = weights_vec_[pass_]; - weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0); + std::vector &weights = weights_vec_[pass_]; + weights.resize(PTRAIN_NUM_FEATURE_TYPES, 0.0f); while (fp->FGets(line, kMaxLineSize) != nullptr) { char *key = nullptr; @@ -129,13 +131,13 @@ bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) { } } lang_ = ""; - weights.truncate(0); + weights.clear(); } return complete; } bool ParamsModel::SaveToFile(const char *full_path) const { - const GenericVector &weights = weights_vec_[pass_]; + const std::vector &weights = weights_vec_[pass_]; if (weights.size() != PTRAIN_NUM_FEATURE_TYPES) { tprintf("Refusing to save ParamsModel that has not been initialized.\n"); return false; diff --git a/src/wordrec/params_model.h b/src/wordrec/params_model.h index 24a80871..b679766d 100644 --- a/src/wordrec/params_model.h +++ b/src/wordrec/params_model.h @@ -19,7 +19,7 @@ #ifndef TESSERACT_WORDREC_PARAMS_MODEL_H_ #define TESSERACT_WORDREC_PARAMS_MODEL_H_ -#include "genericvector.h" // for GenericVector +#include // for TESS_API #include "params_training_featdef.h" // for PTRAIN_NUM_FEATURE_TYPES namespace tesseract { @@ -38,7 +38,7 @@ public: }; ParamsModel() : pass_(PTRAIN_PASS1) {} - ParamsModel(const char *lang, const GenericVector &weights) + ParamsModel(const char *lang, const std::vector &weights) : lang_(lang), pass_(PTRAIN_PASS1) { weights_vec_[pass_] = weights; } @@ -65,10 +65,10 @@ public: // Returns true on success. 
bool LoadFromFp(const char *lang, TFile *fp); - const GenericVector &weights() const { + const std::vector &weights() const { return weights_vec_[pass_]; } - const GenericVector &weights_for_pass(PassEnum pass) const { + const std::vector &weights_for_pass(PassEnum pass) const { return weights_vec_[pass]; } void SetPass(PassEnum pass) { @@ -84,7 +84,7 @@ private: PassEnum pass_; // Several sets of weights for various OCR passes (e.g. pass1 with adaption, // pass2 without adaption, etc). - GenericVector weights_vec_[PTRAIN_NUM_PASSES]; + std::vector weights_vec_[PTRAIN_NUM_PASSES]; }; } // namespace tesseract From 1f94d79c81e93d9bff92872938d86fa7517b1bb4 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 15 Mar 2021 17:53:11 +0100 Subject: [PATCH 4/5] Replace remaining GenericVector by std::vector for src/ccmain Signed-off-by: Stefan Weil --- src/ccmain/applybox.cpp | 24 ++--- src/ccmain/control.cpp | 41 ++++---- src/ccmain/equationdetect.cpp | 112 +++++++++++---------- src/ccmain/equationdetect.h | 19 ++-- src/ccmain/paragraphs.cpp | 162 ++++++++++++++++--------------- src/ccmain/paragraphs.h | 5 +- src/ccmain/paragraphs_internal.h | 28 +++--- src/ccmain/tesseractclass.h | 32 +++--- src/ccstruct/werd.cpp | 10 +- src/ccstruct/werd.h | 11 +-- src/ccutil/genericvector.h | 10 -- unittest/equationdetect_test.cc | 16 +-- unittest/paragraphs_test.cc | 6 +- 13 files changed, 239 insertions(+), 237 deletions(-) diff --git a/src/ccmain/applybox.cpp b/src/ccmain/applybox.cpp index 5776bca6..0f3e7773 100644 --- a/src/ccmain/applybox.cpp +++ b/src/ccmain/applybox.cpp @@ -24,7 +24,6 @@ # include "boxread.h" #endif // ndef DISABLED_LEGACY_ENGINE #include -#include "genericvector.h" #include "pageres.h" #include "tesseractclass.h" #include "unicharset.h" @@ -489,7 +488,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) { if (word->text() == nullptr || word->text()[0] == '\0') continue; // Ignore words that have no text. 
// Convert the correct text to a vector of UNICHAR_ID - GenericVector target_text; + std::vector target_text; if (!ConvertStringToUnichars(word->text(), &target_text)) { tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text()); pr_it.DeleteCurrentWord(); @@ -505,7 +504,7 @@ void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) { /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. /// @return false if an invalid UNICHAR_ID is encountered. -bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector *class_ids) { +bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector *class_ids) { for (int step = 0; *utf8 != '\0'; utf8 += step) { const char *next_space = strchr(utf8, ' '); if (next_space == nullptr) @@ -528,10 +527,10 @@ bool Tesseract::ConvertStringToUnichars(const char *utf8, GenericVector &target_text, WERD_RES *word_res) { +bool Tesseract::FindSegmentation(const std::vector &target_text, WERD_RES *word_res) { // Classify all required combinations of blobs and save results in choices. 
const int word_length = word_res->box_word->length(); - auto *choices = new GenericVector[word_length]; + auto *choices = new std::vector[word_length]; for (int i = 0; i < word_length; ++i) { for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { BLOB_CHOICE_LIST *match_result = @@ -552,8 +551,11 @@ bool Tesseract::FindSegmentation(const GenericVector &target_text, W float best_rating = 0.0f; SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating, &word_res->best_state); - for (int i = 0; i < word_length; ++i) - choices[i].delete_data_pointers(); + for (int i = 0; i < word_length; ++i) { + for (auto choice : choices[i]) { + delete choice; + } + } delete[] choices; if (word_res->best_state.empty()) { // Build the original segmentation and if it is the same length as the @@ -583,9 +585,9 @@ bool Tesseract::FindSegmentation(const GenericVector &target_text, W /// Recursive helper to find a match to the target_text (from text_index /// position) in the choices (from choices_pos position). -/// @param choices is an array of GenericVectors, of length choices_length, +/// @param choices is an array of vectors of length choices_length, /// with each element representing a starting position in the word, and the -/// #GenericVector holding classification results for a sequence of consecutive +/// #vector holding classification results for a sequence of consecutive /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc. 
/// @param choices_pos /// @param choices_length @@ -595,8 +597,8 @@ bool Tesseract::FindSegmentation(const GenericVector &target_text, W /// @param segmentation /// @param best_rating /// @param best_segmentation -void Tesseract::SearchForText(const GenericVector *choices, int choices_pos, - int choices_length, const GenericVector &target_text, +void Tesseract::SearchForText(const std::vector *choices, int choices_pos, + int choices_length, const std::vector &target_text, int text_index, float rating, std::vector *segmentation, float *best_rating, std::vector *best_segmentation) { const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs(); diff --git a/src/ccmain/control.cpp b/src/ccmain/control.cpp index 59a501a4..dac19e78 100644 --- a/src/ccmain/control.cpp +++ b/src/ccmain/control.cpp @@ -461,8 +461,8 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { continue; } // Two words sharing the same language model, excellent! - GenericVector overrides_word1; - GenericVector overrides_word2; + std::vector overrides_word1; + std::vector overrides_word2; const auto orig_w1_str = w_prev->best_choice->unichar_string(); const auto orig_w2_str = w->best_choice->unichar_string(); @@ -768,7 +768,7 @@ static int SelectBestWords(double rating_ratio, double certainty_margin, bool de PointerVector *best_words) { // Process the smallest groups of words that have an overlapping word // boundary at the end. - GenericVector out_words; + std::vector out_words; // Index into each word vector (best, new). int b = 0, n = 0; int num_best = 0, num_new = 0; @@ -893,19 +893,19 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next return false; real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); // Get the noise outlines into a vector with matching bool map. 
- GenericVector outlines; + std::vector outlines; real_word->GetNoiseOutlines(&outlines); - GenericVector word_wanted; - GenericVector overlapped_any_blob; - GenericVector target_blobs; + std::vector word_wanted; + std::vector overlapped_any_blob; + std::vector target_blobs; AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted, &overlapped_any_blob, &target_blobs); // Filter the outlines that overlapped any blob and put them into the word // now. This simplifies the remaining task and also makes it more accurate // as it has more completed blobs to work on. - GenericVector wanted; - GenericVector wanted_blobs; - GenericVector wanted_outlines; + std::vector wanted; + std::vector wanted_blobs; + std::vector wanted_outlines; int num_overlapped = 0; int num_overlapped_used = 0; for (int i = 0; i < overlapped_any_blob.size(); ++i) { @@ -948,11 +948,11 @@ bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next // Output: word_wanted indicates which outlines are to be assigned to a blob, // target_blobs indicates which to assign to, and overlapped_any_blob is // true for all outlines that overlapped a blob. 
-void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector &outlines, +void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, - GenericVector *word_wanted, - GenericVector *overlapped_any_blob, - GenericVector *target_blobs) { + std::vector *word_wanted, + std::vector *overlapped_any_blob, + std::vector *target_blobs) { std::vector blob_wanted; word_wanted->resize(outlines.size(), false); overlapped_any_blob->resize(outlines.size(), false); @@ -999,10 +999,10 @@ void Tesseract::AssignDiacriticsToOverlappingBlobs(const GenericVector &outlines, int pass, +void Tesseract::AssignDiacriticsToNewBlobs(const std::vector &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, - GenericVector *word_wanted, - GenericVector *target_blobs) { + std::vector *word_wanted, + std::vector *target_blobs) { std::vector blob_wanted; word_wanted->resize(outlines.size(), false); target_blobs->resize(outlines.size(), nullptr); @@ -1077,7 +1077,7 @@ void Tesseract::AssignDiacriticsToNewBlobs(const GenericVector &out // are desired, in which case ok_outlines indicates which ones. bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, - const GenericVector &outlines, + const std::vector &outlines, int num_outlines, std::vector *ok_outlines) { std::string best_str; float target_cert = certainty_threshold; @@ -1161,7 +1161,7 @@ bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, // Classifies the given blob plus the outlines flagged by ok_outlines, undoes // the inclusion of the outlines, and returns the certainty of the raw choice. 
float Tesseract::ClassifyBlobPlusOutlines(const std::vector &ok_outlines, - const GenericVector &outlines, int pass_n, + const std::vector &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) { C_OUTLINE_IT ol_it; C_OUTLINE *first_to_keep = nullptr; @@ -1865,8 +1865,7 @@ void Tesseract::set_word_fonts(WERD_RES *word) { const int fontinfo_size = get_fontinfo_table().size(); if (fontinfo_size == 0) return; - GenericVector font_total_score; - font_total_score.init_to_size(fontinfo_size, 0); + std::vector font_total_score(fontinfo_size); // Compute the font scores for the word if (tessedit_debug_fonts) { diff --git a/src/ccmain/equationdetect.cpp b/src/ccmain/equationdetect.cpp index 8b725009..0f5073b5 100644 --- a/src/ccmain/equationdetect.cpp +++ b/src/ccmain/equationdetect.cpp @@ -131,7 +131,7 @@ int EquationDetect::LabelSpecialText(TO_BLOCK *to_block) { return -1; } - GenericVector blob_lists; + std::vector blob_lists; blob_lists.push_back(&(to_block->blobs)); blob_lists.push_back(&(to_block->large_blobs)); for (int i = 0; i < blob_lists.size(); ++i) { @@ -223,16 +223,17 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &uni if (unicharset.get_ispunctuation(id)) { // Exclude some special texts that are likely to be confused as math symbol. - static GenericVector ids_to_exclude; + static std::vector ids_to_exclude; if (ids_to_exclude.empty()) { static const char *kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".", "〈", "〉", "《", "》", "」", "「"}; for (auto i = 0; i < countof(kCharsToEx); i++) { ids_to_exclude.push_back(unicharset.unichar_to_id(kCharsToEx[i])); } - ids_to_exclude.sort(); + std::sort(ids_to_exclude.begin(), ids_to_exclude.end()); } - return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH; + auto found = std::binary_search(ids_to_exclude.begin(), ids_to_exclude.end(), id); + return found ? BSTT_NONE : BSTT_MATH; } // Check if it is digit. 
In addition to the isdigit attribute, we also check @@ -266,13 +267,13 @@ void EquationDetect::IdentifySpecialText() { IdentifyBlobsToSkip(part); BLOBNBOX_C_IT bbox_it(part->boxes()); // Compute the height threshold. - GenericVector blob_heights; + std::vector blob_heights; for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) { if (bbox_it.data()->special_text_type() != BSTT_SKIP) { blob_heights.push_back(bbox_it.data()->bounding_box().height()); } } - blob_heights.sort(); + std::sort(blob_heights.begin(), blob_heights.end()); const int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2; for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) { if (bbox_it.data()->special_text_type() != BSTT_SKIP) { @@ -377,7 +378,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS // Pass 3: expand block equation seeds. while (!cp_seeds_.empty()) { - GenericVector seeds_expanded; + std::vector seeds_expanded; for (int i = 0; i < cp_seeds_.size(); ++i) { if (ExpandSeed(cp_seeds_[i])) { // If this seed is expanded, then we add it into seeds_expanded. Note @@ -407,14 +408,14 @@ void EquationDetect::MergePartsByLocation() { while (true) { ColPartition *part = nullptr; // partitions that have been updated. 
- GenericVector parts_updated; + std::vector parts_updated; ColPartitionGridSearch gsearch(part_grid_); gsearch.StartFullSearch(); while ((part = gsearch.NextFullSearch()) != nullptr) { if (!IsTextOrEquationType(part->type())) { continue; } - GenericVector parts_to_merge; + std::vector parts_to_merge; SearchByOverlap(part, &parts_to_merge); if (parts_to_merge.empty()) { continue; @@ -443,7 +444,7 @@ void EquationDetect::MergePartsByLocation() { } void EquationDetect::SearchByOverlap(ColPartition *seed, - GenericVector *parts_overlap) { + std::vector *parts_overlap) { ASSERT_HOST(seed != nullptr && parts_overlap != nullptr); if (!IsTextOrEquationType(seed->type())) { return; @@ -457,7 +458,7 @@ void EquationDetect::SearchByOverlap(ColPartition *seed, // Search iteratively. ColPartition *part; - GenericVector parts; + std::vector parts; const float kLargeOverlapTh = 0.95; const float kEquXOverlap = 0.4, kEquYOverlap = 0.5; while ((part = search.NextRadSearch()) != nullptr) { @@ -518,11 +519,11 @@ void EquationDetect::IdentifySeedParts() { ColPartition *part = nullptr; gsearch.StartFullSearch(); - GenericVector seeds1, seeds2; + std::vector seeds1, seeds2; // The left coordinates of indented text partitions. - GenericVector indented_texts_left; + std::vector indented_texts_left; // The foreground density of text partitions. - GenericVector texts_foreground_density; + std::vector texts_foreground_density; while ((part = gsearch.NextFullSearch()) != nullptr) { if (!IsTextOrEquationType(part->type())) { continue; @@ -552,8 +553,8 @@ void EquationDetect::IdentifySeedParts() { } // Sort the features collected from text regions. - indented_texts_left.sort(); - texts_foreground_density.sort(); + std::sort(indented_texts_left.begin(), indented_texts_left.end()); + std::sort(texts_foreground_density.begin(), texts_foreground_density.end()); float foreground_density_th = 0.15; // Default value. 
if (!texts_foreground_density.empty()) { // Use the median of the texts_foreground_density. @@ -598,7 +599,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa ASSERT_HOST(part); // Split part horizontall, and check for each sub part. - GenericVector sub_boxes; + std::vector sub_boxes; SplitCPHorLite(part, &sub_boxes); float parts_passed = 0.0; for (int i = 0; i < sub_boxes.size(); ++i) { @@ -615,7 +616,7 @@ bool EquationDetect::CheckSeedFgDensity(const float density_th, ColPartition *pa return retval; } -void EquationDetect::SplitCPHor(ColPartition *part, GenericVector *parts_splitted) { +void EquationDetect::SplitCPHor(ColPartition *part, std::vector *parts_splitted) { ASSERT_HOST(part && parts_splitted); if (part->median_width() == 0 || part->boxes_count() == 0) { return; @@ -623,7 +624,9 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVectorCopyButDontOwnBlobs(); - parts_splitted->delete_data_pointers(); + for (auto part : *parts_splitted) { + delete part; + } parts_splitted->clear(); const double kThreshold = part->median_width() * 3.0; @@ -663,7 +666,7 @@ void EquationDetect::SplitCPHor(ColPartition *part, GenericVectorpush_back(right_part); } -void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector *splitted_boxes) { +void EquationDetect::SplitCPHorLite(ColPartition *part, std::vector *splitted_boxes) { ASSERT_HOST(part && splitted_boxes); splitted_boxes->clear(); if (part->median_width() == 0) { @@ -701,7 +704,7 @@ void EquationDetect::SplitCPHorLite(ColPartition *part, GenericVector *spl } } -bool EquationDetect::CheckForSeed2(const GenericVector &indented_texts_left, +bool EquationDetect::CheckForSeed2(const std::vector &indented_texts_left, const float foreground_density_th, ColPartition *part) { ASSERT_HOST(part); const TBOX &box = part->bounding_box(); @@ -720,22 +723,25 @@ bool EquationDetect::CheckForSeed2(const GenericVector &indented_texts_left return true; } -int 
EquationDetect::CountAlignment(const GenericVector &sorted_vec, const int val) const { +int EquationDetect::CountAlignment(const std::vector &sorted_vec, const int val) const { if (sorted_vec.empty()) { return 0; } - const int kDistTh = static_cast(roundf(0.03 * resolution_)); - const int pos = sorted_vec.binary_search(val); + const int kDistTh = static_cast(round(0.03f * resolution_)); + auto pos = std::upper_bound(sorted_vec.begin(), sorted_vec.end(), val); + if (pos > sorted_vec.begin()) { + --pos; + } int count = 0; // Search left side. - int index = pos; + auto index = pos - sorted_vec.begin(); while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) { count++; } // Search right side. - index = pos + 1; + index = pos + 1 - sorted_vec.begin(); while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) { count++; } @@ -764,9 +770,9 @@ void EquationDetect::ComputeCPsSuperBBox() { void EquationDetect::IdentifyInlinePartsHorizontal() { ASSERT_HOST(cps_super_bbox_); - GenericVector new_seeds; + std::vector new_seeds; const int kMarginDiffTh = IntCastRounded(0.5 * lang_tesseract_->source_resolution()); - const int kGapTh = static_cast(roundf(1.0 * lang_tesseract_->source_resolution())); + const int kGapTh = static_cast(round(1.0f * lang_tesseract_->source_resolution())); ColPartitionGridSearch search(part_grid_); search.SetUniqueMode(true); // The center x coordinate of the cp_super_bbox_. @@ -826,7 +832,7 @@ int EquationDetect::EstimateTextPartLineSpacing() { // Get the y gap between text partitions; ColPartition *current = nullptr, *prev = nullptr; gsearch.StartFullSearch(); - GenericVector ygaps; + std::vector ygaps; while ((current = gsearch.NextFullSearch()) != nullptr) { if (!PTIsTextType(current->type())) { continue; @@ -851,7 +857,7 @@ int EquationDetect::EstimateTextPartLineSpacing() { } // Compute the line spacing from ygaps: use the mean of the first half. 
- ygaps.sort(); + std::sort(ygaps.begin(), ygaps.end()); int spacing = 0, count; for (count = 0; count < ygaps.size() / 2; count++) { spacing += ygaps[count]; @@ -867,12 +873,12 @@ void EquationDetect::IdentifyInlinePartsVertical(const bool top_to_bottom, // Sort cp_seeds_. if (top_to_bottom) { // From top to bottom. - cp_seeds_.sort(&SortCPByTopReverse); + std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByTopReverse); } else { // From bottom to top. - cp_seeds_.sort(&SortCPByBottom); + std::sort(cp_seeds_.begin(), cp_seeds_.end(), &SortCPByBottom); } - GenericVector new_seeds; + std::vector new_seeds; for (int i = 0; i < cp_seeds_.size(); ++i) { ColPartition *part = cp_seeds_[i]; // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look @@ -918,8 +924,8 @@ bool EquationDetect::IsInline(const bool search_bottom, const int textparts_line // Check if neighbor and part is inline similar. const float kHeightRatioTh = 0.5; const int kYGapTh = textparts_linespacing > 0 - ? textparts_linespacing + static_cast(roundf(0.02 * resolution_)) - : static_cast(roundf(0.05 * resolution_)); // Default value. + ? textparts_linespacing + static_cast(round(0.02f * resolution_)) + : static_cast(round(0.05f * resolution_)); // Default value. if (part_box.x_overlap(neighbor_box) && // Location feature. part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing. // Geo feature. 
@@ -973,9 +979,9 @@ EquationDetect::IndentType EquationDetect::IsIndented(ColPartition *part) { ColPartitionGridSearch search(part_grid_); ColPartition *neighbor = nullptr; const TBOX &part_box(part->bounding_box()); - const int kXGapTh = static_cast(roundf(0.5 * resolution_)); - const int kRadiusTh = static_cast(roundf(3.0 * resolution_)); - const int kYGapTh = static_cast(roundf(0.5 * resolution_)); + const int kXGapTh = static_cast(round(0.5f * resolution_)); + const int kRadiusTh = static_cast(round(3.0f * resolution_)); + const int kYGapTh = static_cast(round(0.5f * resolution_)); // Here we use a simple approximation algorithm: from the center of part, We // perform the radius search, and check if we can find a neighboring partition @@ -1036,7 +1042,7 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) { } // Expand in four directions. - GenericVector parts_to_merge; + std::vector parts_to_merge; ExpandSeedHorizontal(true, seed, &parts_to_merge); ExpandSeedHorizontal(false, seed, &parts_to_merge); ExpandSeedVertical(true, seed, &parts_to_merge); @@ -1073,10 +1079,10 @@ bool EquationDetect::ExpandSeed(ColPartition *seed) { } void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition *seed, - GenericVector *parts_to_merge) { + std::vector *parts_to_merge) { ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr); const float kYOverlapTh = 0.6; - const int kXGapTh = static_cast(roundf(0.2 * resolution_)); + const int kXGapTh = static_cast(round(0.2f * resolution_)); ColPartitionGridSearch search(part_grid_); const TBOX &seed_box(seed->bounding_box()); @@ -1125,10 +1131,10 @@ void EquationDetect::ExpandSeedHorizontal(const bool search_left, ColPartition * } void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition *seed, - GenericVector *parts_to_merge) { + std::vector *parts_to_merge) { ASSERT_HOST(seed != nullptr && parts_to_merge != nullptr && cps_super_bbox_ != nullptr); const float kXOverlapTh = 0.4; - const int 
kYGapTh = static_cast(roundf(0.2 * resolution_)); + const int kYGapTh = static_cast(round(0.2f * resolution_)); ColPartitionGridSearch search(part_grid_); const TBOX &seed_box(seed->bounding_box()); @@ -1138,7 +1144,7 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition * // Search iteratively. ColPartition *part = nullptr; - GenericVector parts; + std::vector parts; int skipped_min_top = std::numeric_limits::max(), skipped_max_bottom = -1; while ((part = search.NextVerticalSearch(search_bottom)) != nullptr) { if (part == seed) { @@ -1206,8 +1212,8 @@ void EquationDetect::ExpandSeedVertical(const bool search_bottom, ColPartition * } bool EquationDetect::IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const { - const int kXGapTh = static_cast(roundf(0.25 * resolution_)); - const int kYGapTh = static_cast(roundf(0.05 * resolution_)); + const int kXGapTh = static_cast(round(0.25f * resolution_)); + const int kYGapTh = static_cast(round(0.05f * resolution_)); // Check geometric feature. if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) { @@ -1244,7 +1250,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() { // Iterate over part_grid_, and find all parts that are text type but not // equation type. ColPartition *part = nullptr; - GenericVector text_parts; + std::vector text_parts; ColPartitionGridSearch gsearch(part_grid_); gsearch.StartFullSearch(); while ((part = gsearch.NextFullSearch()) != nullptr) { @@ -1257,12 +1263,12 @@ void EquationDetect::ProcessMathBlockSatelliteParts() { } // Compute the medium height of the text_parts. 
- text_parts.sort(&SortCPByHeight); + std::sort(text_parts.begin(), text_parts.end(), &SortCPByHeight); const TBOX &text_box = text_parts[text_parts.size() / 2]->bounding_box(); int med_height = text_box.height(); if (text_parts.size() % 2 == 0 && text_parts.size() > 1) { const TBOX &text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box(); - med_height = static_cast(roundf(0.5 * (text_box.height() + med_height))); + med_height = static_cast(round(0.5f * (text_box.height() + med_height))); } // Iterate every text_parts and check if it is a math block satellite. @@ -1271,7 +1277,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() { if (text_box.height() > med_height) { continue; } - GenericVector math_blocks; + std::vector math_blocks; if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) { continue; } @@ -1288,7 +1294,7 @@ void EquationDetect::ProcessMathBlockSatelliteParts() { } bool EquationDetect::IsMathBlockSatellite(ColPartition *part, - GenericVector *math_blocks) { + std::vector *math_blocks) { ASSERT_HOST(part != nullptr && math_blocks != nullptr); math_blocks->clear(); const TBOX &part_box(part->bounding_box()); @@ -1344,7 +1350,7 @@ bool EquationDetect::IsMathBlockSatellite(ColPartition *part, ColPartition *EquationDetect::SearchNNVertical(const bool search_bottom, const ColPartition *part) { ASSERT_HOST(part); ColPartition *nearest_neighbor = nullptr, *neighbor = nullptr; - const int kYGapTh = static_cast(roundf(resolution_ * 0.5)); + const int kYGapTh = static_cast(round(resolution_ * 0.5f)); ColPartitionGridSearch search(part_grid_); search.SetUniqueMode(true); @@ -1379,7 +1385,7 @@ bool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *nei if (!neighbor) { return false; } - const int kYGapTh = static_cast(roundf(resolution_ * 0.1)); + const int kYGapTh = static_cast(round(resolution_ * 0.1f)); return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh; } diff --git a/src/ccmain/equationdetect.h 
b/src/ccmain/equationdetect.h index 3e4594b1..551be847 100644 --- a/src/ccmain/equationdetect.h +++ b/src/ccmain/equationdetect.h @@ -22,7 +22,6 @@ #include // for UNICHAR_ID #include "blobbox.h" // for BLOBNBOX (ptr only), BlobSpecialText... #include "equationdetectbase.h" // for EquationDetectBase -#include "genericvector.h" // for GenericVector #include "tesseractclass.h" // for Tesseract class TBOX; @@ -86,7 +85,7 @@ protected: // parts_overlap. Note: this function may update the part_grid_, so if the // caller is also running ColPartitionGridSearch, use the RepositionIterator // to continue. - void SearchByOverlap(ColPartition *seed, GenericVector *parts_overlap); + void SearchByOverlap(ColPartition *seed, std::vector *parts_overlap); // Insert part back into part_grid_, after it absorbs some other parts. void InsertPartAfterAbsorb(ColPartition *part); @@ -106,12 +105,12 @@ protected: // 1. If its left is aligned with any coordinates in indented_texts_left, // which we assume have been sorted. // 2. If its foreground density is over foreground_density_th. - bool CheckForSeed2(const GenericVector &indented_texts_left, + bool CheckForSeed2(const std::vector &indented_texts_left, const float foreground_density_th, ColPartition *part); // Count the number of values in sorted_vec that is close to val, used to // check if a partition is aligned with text partitions. - int CountAlignment(const GenericVector &sorted_vec, const int val) const; + int CountAlignment(const std::vector &sorted_vec, const int val) const; // Check for a seed candidate using the foreground pixel density. And we // return true if the density is below a certain threshold, because characters @@ -120,14 +119,14 @@ protected: // A light version of SplitCPHor: instead of really doing the part split, we // simply compute the union bounding box of each split part. 
- void SplitCPHorLite(ColPartition *part, GenericVector *splitted_boxes); + void SplitCPHorLite(ColPartition *part, std::vector *splitted_boxes); // Split the part (horizontally), and save the split result into // parts_splitted. Note that it is caller's responsibility to release the // memory owns by parts_splitted. On the other hand, the part is unchanged // during this process and still owns the blobs, so do NOT call DeleteBoxes // when freeing the colpartitions in parts_splitted. - void SplitCPHor(ColPartition *part, GenericVector *parts_splitted); + void SplitCPHor(ColPartition *part, std::vector *parts_splitted); // Check the density for a seed candidate (part) using its math density and // italic density, returns true if the check passed. @@ -167,9 +166,9 @@ protected: // merged with seed, remove them from part_grid_, and put them into // parts_to_merge. void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, - GenericVector *parts_to_merge); + std::vector *parts_to_merge); void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, - GenericVector *parts_to_merge); + std::vector *parts_to_merge); // Check if a part_box is the small neighbor of seed_box. bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const; @@ -190,7 +189,7 @@ protected: // Check if part is the satellite of one/two math blocks. If it is, we return // true, and save the blocks into math_blocks. - bool IsMathBlockSatellite(ColPartition *part, GenericVector *math_blocks); + bool IsMathBlockSatellite(ColPartition *part, std::vector *math_blocks); // Search the nearest neighbor of part in one vertical direction as defined in // search_bottom. It returns the neighbor found that major x overlap with it, @@ -237,7 +236,7 @@ protected: TBOX *cps_super_bbox_; // The seed ColPartition for equation region. - GenericVector cp_seeds_; + std::vector cp_seeds_; // The resolution (dpi) of the processing image. 
int resolution_; diff --git a/src/ccmain/paragraphs.cpp b/src/ccmain/paragraphs.cpp index 8c881474..40240ab3 100644 --- a/src/ccmain/paragraphs.cpp +++ b/src/ccmain/paragraphs.cpp @@ -18,7 +18,6 @@ #include "paragraphs.h" -#include "genericvector.h" // for GenericVector, GenericVectorEqEq #include "helpers.h" // for UpdateRange, ClipToRange #include "host.h" // for NearlyEqual #include "mutableiterator.h" // for MutableIterator @@ -72,7 +71,7 @@ static int Epsilon(int space_pix) { } static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name, - const GenericVector *rows, int row_start, + const std::vector *rows, int row_start, int row_end) { if (row_start < 0 || row_end > rows->size() || row_start > row_end) { tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", row_start, row_end, @@ -134,7 +133,7 @@ static std::string RtlEmbed(const std::string &word, bool rtlify) { // Print the current thoughts of the paragraph detector. static void PrintDetectorState(const ParagraphTheory &theory, - const GenericVector &rows) { + const std::vector &rows) { std::vector> output; output.push_back(std::vector()); output.back().push_back("#row"); @@ -173,7 +172,7 @@ static void PrintDetectorState(const ParagraphTheory &theory, } static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory, - const GenericVector &rows) { + const std::vector &rows) { if (!should_print) return; tprintf("# %s\n", phase); @@ -181,7 +180,7 @@ static void DebugDump(bool should_print, const char *phase, const ParagraphTheor } // Print out the text for rows[row_start, row_end) -static void PrintRowRange(const GenericVector &rows, int row_start, +static void PrintRowRange(const std::vector &rows, int row_start, int row_end) { tprintf("======================================\n"); for (int row = row_start; row < row_end; row++) { @@ -398,6 +397,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) { return 
pos == werd->length(); } +template +void push_back_new(std::vector &vector, const T &data) { + if (std::find(vector.begin(), vector.end(), data) == vector.end()) { + vector.push_back(data); + } +} + // ========= Brain Dead Language Model (combined entry points) ================ // Given the leftmost word of a line either as a Tesseract unicharset + werd @@ -581,7 +587,7 @@ void RowScratchRegisters::SetStartLine() { tprintf("Trying to set a line to be START when it's already BODY.\n"); } if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { - hypotheses_.push_back_new(LineHypothesis(LT_START, nullptr)); + push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr)); } } @@ -591,42 +597,44 @@ void RowScratchRegisters::SetBodyLine() { tprintf("Trying to set a line to be BODY when it's already START.\n"); } if (current_lt == LT_UNKNOWN || current_lt == LT_START) { - hypotheses_.push_back_new(LineHypothesis(LT_BODY, nullptr)); + push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr)); } } void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { - hypotheses_.push_back_new(LineHypothesis(LT_START, model)); - int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, nullptr)); - if (old_idx >= 0) - hypotheses_.remove(old_idx); + push_back_new(hypotheses_, LineHypothesis(LT_START, model)); + auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr)); + if (found != hypotheses_.end()) { + hypotheses_.erase(found); + } } void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { - hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); - int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, nullptr)); - if (old_idx >= 0) - hypotheses_.remove(old_idx); + push_back_new(hypotheses_, LineHypothesis(LT_BODY, model)); + auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr)); + if (found != hypotheses_.end()) { + hypotheses_.erase(found); + } } void 
RowScratchRegisters::StartHypotheses(SetOfModels *models) const { for (int h = 0; h < hypotheses_.size(); h++) { if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) - models->push_back_new(hypotheses_[h].model); + push_back_new(*models, hypotheses_[h].model); } } void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { for (int h = 0; h < hypotheses_.size(); h++) { if (StrongModel(hypotheses_[h].model)) - models->push_back_new(hypotheses_[h].model); + push_back_new(*models, hypotheses_[h].model); } } void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { for (int h = 0; h < hypotheses_.size(); h++) { if (hypotheses_[h].model != nullptr) - models->push_back_new(hypotheses_[h].model); + push_back_new(*models, hypotheses_[h].model); } } @@ -647,8 +655,8 @@ void RowScratchRegisters::DiscardNonMatchingHypotheses(const SetOfModels &models if (models.empty()) return; for (int h = hypotheses_.size() - 1; h >= 0; h--) { - if (!models.contains(hypotheses_[h].model)) { - hypotheses_.remove(h); + if (!contains(models, hypotheses_[h].model)) { + hypotheses_.erase(hypotheses_.begin() + h); } } } @@ -672,15 +680,15 @@ public: int size() const { return values_.size(); } - void GetClusters(GenericVector *clusters); + void GetClusters(std::vector *clusters); private: int max_cluster_width_; - GenericVector values_; + std::vector values_; }; // Return the index of the cluster closest to value. 
-static int ClosestCluster(const GenericVector &clusters, int value) { +static int ClosestCluster(const std::vector &clusters, int value) { int best_index = 0; for (int i = 0; i < clusters.size(); i++) { if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) @@ -689,9 +697,9 @@ static int ClosestCluster(const GenericVector &clusters, int value) { return best_index; } -void SimpleClusterer::GetClusters(GenericVector *clusters) { +void SimpleClusterer::GetClusters(std::vector *clusters) { clusters->clear(); - values_.sort(); + std::sort(values_.begin(), values_.end()); for (int i = 0; i < values_.size();) { int orig_i = i; int lo = values_[i]; @@ -705,16 +713,16 @@ void SimpleClusterer::GetClusters(GenericVector *clusters) { // Calculate left- and right-indent tab stop values seen in // rows[row_start, row_end) given a tolerance of tolerance. -static void CalculateTabStops(GenericVector *rows, int row_start, int row_end, - int tolerance, GenericVector *left_tabs, - GenericVector *right_tabs) { +static void CalculateTabStops(std::vector *rows, int row_start, int row_end, + int tolerance, std::vector *left_tabs, + std::vector *right_tabs) { if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) return; // First pass: toss all left and right indents into clusterers. 
SimpleClusterer initial_lefts(tolerance); SimpleClusterer initial_rights(tolerance); - GenericVector initial_left_tabs; - GenericVector initial_right_tabs; + std::vector initial_left_tabs; + std::vector initial_right_tabs; for (int i = row_start; i < row_end; i++) { initial_lefts.Add((*rows)[i].lindent_); initial_rights.Add((*rows)[i].rindent_); @@ -782,7 +790,7 @@ static void CalculateTabStops(GenericVector *rows, int row_ } } if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) { - left_tabs->remove(to_prune); + left_tabs->erase(left_tabs->begin() + to_prune); } } if (right_tabs->size() == 3 && left_tabs->size() >= 4) { @@ -793,7 +801,7 @@ static void CalculateTabStops(GenericVector *rows, int row_ } } if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) { - right_tabs->remove(to_prune); + right_tabs->erase(right_tabs->begin() + to_prune); } } } @@ -817,7 +825,7 @@ static void CalculateTabStops(GenericVector *rows, int row_ // Case 2b: Fully Justified. (eop_threshold > 0) // We mark a line as short (end of paragraph) if the offside indent // is greater than eop_threshold. -static void MarkRowsWithModel(GenericVector *rows, int row_start, int row_end, +static void MarkRowsWithModel(std::vector *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold) { if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) return; @@ -861,7 +869,7 @@ static void MarkRowsWithModel(GenericVector *rows, int row_ // Further, this struct holds the data we amass for the (single) ParagraphModel // we'll assign to the text lines (assuming we get that far). 
struct GeometricClassifierState { - GeometricClassifierState(int dbg_level, GenericVector *r, int r_start, + GeometricClassifierState(int dbg_level, std::vector *r, int r_start, int r_end) : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) { tolerance = InterwordSpace(*r, r_start, r_end); @@ -886,7 +894,7 @@ struct GeometricClassifierState { } // Align tabs are the tab stops the text is aligned to. - const GenericVector &AlignTabs() const { + const std::vector &AlignTabs() const { if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs; return left_tabs; @@ -897,7 +905,7 @@ struct GeometricClassifierState { // Note that for a left-to-right text which is aligned to the right such as // this function comment, the offside tabs are the horizontal tab stops // marking the beginning of ("Note", "this" and "marking"). - const GenericVector &OffsideTabs() const { + const std::vector &OffsideTabs() const { if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs; return right_tabs; @@ -940,7 +948,7 @@ struct GeometricClassifierState { // The Geometric Classifier was asked to find a single paragraph model // to fit the text rows (*rows)[row_start, row_end) - GenericVector *rows; + std::vector *rows; int row_start = 0; int row_end = 0; @@ -953,8 +961,8 @@ struct GeometricClassifierState { // These left and right tab stops were determined to be the common tab // stops for the given text. - GenericVector left_tabs; - GenericVector right_tabs; + std::vector left_tabs; + std::vector right_tabs; // These are parameters we must determine to create a ParagraphModel. tesseract::ParagraphJustification just = JUSTIFICATION_UNKNOWN; @@ -1083,7 +1091,7 @@ static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricCla // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), // it's worth guessing that (A1b) is the correct interpretation if there are // far more "full" lines than "short" lines. 
-static void GeometricClassify(int debug_level, GenericVector *rows, +static void GeometricClassify(int debug_level, std::vector *rows, int row_start, int row_end, ParagraphTheory *theory) { if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) return; @@ -1223,7 +1231,7 @@ const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { } auto *m = new ParagraphModel(model); models_->push_back(m); - models_we_added_.push_back_new(m); + push_back_new(models_we_added_, m); return m; } @@ -1231,7 +1239,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { size_t w = 0; for (size_t r = 0; r < models_->size(); r++) { ParagraphModel *m = (*models_)[r]; - if (!used_models.contains(m) && models_we_added_.contains(m)) { + if (!contains(used_models, static_cast(m)) && contains(models_we_added_, m)) { delete m; } else { if (r > w) { @@ -1246,7 +1254,7 @@ void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { // Examine rows[start, end) and try to determine if an existing non-centered // paragraph model would fit them perfectly. If so, return a pointer to it. // If not, return nullptr. 
-const ParagraphModel *ParagraphTheory::Fits(const GenericVector *rows, +const ParagraphModel *ParagraphTheory::Fits(const std::vector *rows, int start, int end) const { for (const auto *model : *models_) { if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) @@ -1258,7 +1266,7 @@ const ParagraphModel *ParagraphTheory::Fits(const GenericVectorjustification() != JUSTIFICATION_CENTER) - models->push_back_new(model); + push_back_new(*models, model); } } @@ -1272,7 +1280,7 @@ int ParagraphTheory::IndexOf(const ParagraphModel *model) const { return -1; } -bool ValidFirstLine(const GenericVector *rows, int row, +bool ValidFirstLine(const std::vector *rows, int row, const ParagraphModel *model) { if (!StrongModel(model)) { tprintf("ValidFirstLine() should only be called with strong models!\n"); @@ -1281,7 +1289,7 @@ bool ValidFirstLine(const GenericVector *rows, int row, (*rows)[row].rindent_, (*rows)[row].rmargin_); } -bool ValidBodyLine(const GenericVector *rows, int row, +bool ValidBodyLine(const std::vector *rows, int row, const ParagraphModel *model) { if (!StrongModel(model)) { tprintf("ValidBodyLine() should only be called with strong models!\n"); @@ -1290,7 +1298,7 @@ bool ValidBodyLine(const GenericVector *rows, int row, (*rows)[row].rindent_, (*rows)[row].rmargin_); } -bool CrownCompatible(const GenericVector *rows, int a, int b, +bool CrownCompatible(const std::vector *rows, int a, int b, const ParagraphModel *model) { if (model != kCrownRight && model != kCrownLeft) { tprintf("CrownCompatible() should only be called with crown models!\n"); @@ -1308,7 +1316,7 @@ bool CrownCompatible(const GenericVector *rows, int a, int // =============== Implementation of ParagraphModelSmearer ==================== -ParagraphModelSmearer::ParagraphModelSmearer(GenericVector *rows, +ParagraphModelSmearer::ParagraphModelSmearer(std::vector *rows, int row_start, int row_end, ParagraphTheory *theory) : theory_(theory), rows_(rows), 
row_start_(row_start), row_end_(row_end) { if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { @@ -1341,7 +1349,7 @@ void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) { // This is basic filtering; we check likely paragraph starty-ness down // below in Smear() -- you know, whether the first word would have fit // and such. - still_open.push_back_new(opened[m]); + push_back_new(still_open, opened[m]); } } OpenModels(row + 1) = still_open; @@ -1449,7 +1457,7 @@ void ParagraphModelSmearer::Smear() { // Find out what ParagraphModels are actually used, and discard any // that are not. -static void DiscardUnusedModels(const GenericVector &rows, +static void DiscardUnusedModels(const std::vector &rows, ParagraphTheory *theory) { SetOfModels used_models; for (int i = 0; i < rows.size(); i++) { @@ -1483,7 +1491,7 @@ static void DiscardUnusedModels(const GenericVector &rows, // sequences of body lines of equivalent type abutted against the beginning // or a body or start line of a different type into a crown paragraph. static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, - GenericVector *rows) { + std::vector *rows) { int start; for (int end = rows->size(); end > 0; end = start) { // Search back for a body line of a unique type. @@ -1546,7 +1554,7 @@ static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory, // really just ignore it as an outlier. To express this, we allow the // user to specify the percentile (0..100) of indent values to use as // the common margin for each row in the run of rows[start, end). 
-void RecomputeMarginsAndClearHypotheses(GenericVector *rows, int start, +void RecomputeMarginsAndClearHypotheses(std::vector *rows, int start, int end, int percentile) { if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) return; @@ -1585,7 +1593,7 @@ void RecomputeMarginsAndClearHypotheses(GenericVector *rows } // Return the median inter-word space in rows[row_start, row_end). -int InterwordSpace(const GenericVector &rows, int row_start, int row_end) { +int InterwordSpace(const std::vector &rows, int row_start, int row_end) { if (row_end < row_start + 1) return 1; int word_height = @@ -1666,7 +1674,7 @@ static bool LikelyParagraphStart(const RowScratchRegisters &before, // If the rows given could be a consistent start to a paragraph, set *consistent // true. static ParagraphModel InternalParagraphModelByOutline( - const GenericVector *rows, int start, int end, int tolerance, + const std::vector *rows, int start, int end, int tolerance, bool *consistent) { int ltr_line_count = 0; for (int i = start; i < end; i++) { @@ -1763,7 +1771,7 @@ static ParagraphModel InternalParagraphModelByOutline( // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug // output if we're debugging. static ParagraphModel ParagraphModelByOutline(int debug_level, - const GenericVector *rows, + const std::vector *rows, int start, int end, int tolerance) { bool unused_consistent; ParagraphModel retval = @@ -1776,7 +1784,7 @@ static ParagraphModel ParagraphModelByOutline(int debug_level, } // Do rows[start, end) form a single instance of the given paragraph model? 
-bool RowsFitModel(const GenericVector *rows, int start, int end, +bool RowsFitModel(const std::vector *rows, int start, int end, const ParagraphModel *model) { if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) return false; @@ -1800,7 +1808,7 @@ bool RowsFitModel(const GenericVector *rows, int start, int // We only take the very strongest signals, as we don't want to get // confused and marking up centered text, poetry, or source code as // clearly part of a typical paragraph. -static void MarkStrongEvidence(GenericVector *rows, int row_start, +static void MarkStrongEvidence(std::vector *rows, int row_start, int row_end) { // Record patently obvious body text. for (int i = row_start + 1; i < row_end; i++) { @@ -1862,7 +1870,7 @@ static void MarkStrongEvidence(GenericVector *rows, int row // Look for sequences of a start line followed by some body lines in // rows[row_start, row_end) and create ParagraphModels for them if // they seem coherent. -static void ModelStrongEvidence(int debug_level, GenericVector *rows, +static void ModelStrongEvidence(int debug_level, std::vector *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory) { if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) @@ -1951,7 +1959,7 @@ static void ModelStrongEvidence(int debug_level, GenericVector *rows, +static void StrongEvidenceClassify(int debug_level, std::vector *rows, int row_start, int row_end, ParagraphTheory *theory) { if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) return; @@ -1979,7 +1987,7 @@ static void StrongEvidenceClassify(int debug_level, GenericVector *rows, int row_start, +static void SeparateSimpleLeaderLines(std::vector *rows, int row_start, int row_end, ParagraphTheory *theory) { for (int i = row_start + 1; i < row_end - 1; i++) { if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders && @@ -1994,8 +2002,8 @@ static void SeparateSimpleLeaderLines(GenericVector *rows, // 
Collect sequences of unique hypotheses in row registers and create proper // paragraphs for them, referencing the paragraphs in row_owners. static void ConvertHypothesizedModelRunsToParagraphs(int debug_level, - GenericVector &rows, - GenericVector *row_owners, + std::vector &rows, + std::vector *row_owners, ParagraphTheory *theory) { int end = rows.size(); int start; @@ -2090,7 +2098,7 @@ struct Interval { // (1) If a line is surrounded by lines of unknown type, it's weak. // (2) If two lines in a row are start lines for a given paragraph type, but // after that the same paragraph type does not continue, they're weak. -static bool RowIsStranded(const GenericVector &rows, int row) { +static bool RowIsStranded(const std::vector &rows, int row) { SetOfModels row_models; rows[row].StrongHypotheses(&row_models); @@ -2145,8 +2153,8 @@ static bool RowIsStranded(const GenericVector &rows, int ro // + Crown paragraphs not immediately followed by a strongly modeled line. // + Single line paragraphs surrounded by text that doesn't match the // model. -static void LeftoverSegments(const GenericVector &rows, - GenericVector *to_fix, int row_start, int row_end) { +static void LeftoverSegments(const std::vector &rows, + std::vector *to_fix, int row_start, int row_end) { to_fix->clear(); for (int i = row_start; i < row_end; i++) { bool needs_fixing = false; @@ -2195,8 +2203,8 @@ static void LeftoverSegments(const GenericVector &rows, // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), // normalize each row_owner to point to an actual PARA, and output the // paragraphs in order onto paragraphs. 
-void CanonicalizeDetectionResults(GenericVector *row_owners, PARA_LIST *paragraphs) { - GenericVector &rows = *row_owners; +void CanonicalizeDetectionResults(std::vector *row_owners, PARA_LIST *paragraphs) { + std::vector &rows = *row_owners; paragraphs->clear(); PARA_IT out(paragraphs); PARA *formerly_null = nullptr; @@ -2226,16 +2234,16 @@ void CanonicalizeDetectionResults(GenericVector *row_owners, PARA_LIST * // models - the list of paragraph models referenced by the PARA objects. // caller is responsible for deleting the models. void DetectParagraphs(int debug_level, std::vector *row_infos, - GenericVector *row_owners, PARA_LIST *paragraphs, + std::vector *row_owners, PARA_LIST *paragraphs, std::vector *models) { - GenericVector rows; + std::vector rows; ParagraphTheory theory(models); // Initialize row_owners to be a bunch of nullptr pointers. - row_owners->init_to_size(row_infos->size(), nullptr); + row_owners->assign(row_infos->size(), nullptr); // assign, not resize: also resets entries the caller passed in // Set up row scratch registers for the main algorithm. - rows.init_to_size(row_infos->size(), RowScratchRegisters()); + rows.resize(row_infos->size(), RowScratchRegisters()); for (int i = 0; i < row_infos->size(); i++) { rows[i].Init((*row_infos)[i]); } @@ -2249,7 +2257,7 @@ void DetectParagraphs(int debug_level, std::vector *row_infos, DebugDump(debug_level > 1, "End of Pass 1", theory, rows); - GenericVector leftovers; + std::vector leftovers; LeftoverSegments(rows, &leftovers, 0, rows.size()); for (int i = 0; i < leftovers.size(); i++) { // Pass 2a: @@ -2263,7 +2271,7 @@ void DetectParagraphs(int debug_level, std::vector *row_infos, // If we had any luck in pass 2a, we got part of the page and didn't // know how to classify a few runs of rows. Take the segments that // didn't find a model and reprocess them individually. 
- GenericVector leftovers2; + std::vector leftovers2; LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end); bool pass2a_was_useful = leftovers2.size() > 1 || @@ -2422,7 +2430,7 @@ static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, } PAGE_RES_IT page_res_it = *it.PageResIt(); - GenericVector werds; + std::vector werds; WERD_RES *word_res = page_res_it.restart_row(); ROW_RES *this_row = page_res_it.row(); int num_leaders = 0; @@ -2505,12 +2513,12 @@ void DetectParagraphs(int debug_level, bool after_text_recognition, } // Run the paragraph detection algorithm. - GenericVector row_owners; - GenericVector the_paragraphs; + std::vector row_owners; + std::vector the_paragraphs; if (!is_image_block) { DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models); } else { - row_owners.init_to_size(row_infos.size(), nullptr); + row_owners.resize(row_infos.size()); CanonicalizeDetectionResults(&row_owners, block->para_list()); } diff --git a/src/ccmain/paragraphs.h b/src/ccmain/paragraphs.h index bfbb7ff4..0d7b7ba0 100644 --- a/src/ccmain/paragraphs.h +++ b/src/ccmain/paragraphs.h @@ -31,9 +31,6 @@ class ParagraphModel; class PARA_LIST; struct PARA; -template -class GenericVector; - // This structure captures all information needed about a text line for the // purposes of paragraph detection. It is meant to be exceedingly light-weight // so that we can easily test paragraph detection independent of the rest of @@ -90,7 +87,7 @@ public: // caller is responsible for deleting the models. 
TESS_API void DetectParagraphs(int debug_level, std::vector *row_infos, - GenericVector *row_owners, PARA_LIST *paragraphs, + std::vector *row_owners, PARA_LIST *paragraphs, std::vector *models); // Given a MutableIterator to the start of a block, run DetectParagraphs on diff --git a/src/ccmain/paragraphs_internal.h b/src/ccmain/paragraphs_internal.h index e4ae847a..3c6c0162 100644 --- a/src/ccmain/paragraphs_internal.h +++ b/src/ccmain/paragraphs_internal.h @@ -95,7 +95,7 @@ struct LineHypothesis { class ParagraphTheory; // Forward Declaration -using SetOfModels = GenericVector; +using SetOfModels = std::vector; // Row Scratch Registers are data generated by the paragraph detection // algorithm based on a RowInfo input. @@ -123,7 +123,7 @@ public: // Clear all hypotheses about this line. void SetUnknown() { - hypotheses_.truncate(0); + hypotheses_.clear(); } // Append all hypotheses of strong models that match this row as a start. @@ -190,7 +190,7 @@ public: private: // Hypotheses of either LT_START or LT_BODY - GenericVector hypotheses_; + std::vector hypotheses_; }; // A collection of convenience functions for wrapping the set of @@ -219,21 +219,21 @@ public: // If any of the non-centered paragraph models we know about fit // rows[start, end), return it. Else nullptr. 
- const ParagraphModel *Fits(const GenericVector *rows, int start, + const ParagraphModel *Fits(const std::vector *rows, int start, int end) const; int IndexOf(const ParagraphModel *model) const; private: std::vector *models_; - GenericVector models_we_added_; + std::vector models_we_added_; }; -bool ValidFirstLine(const GenericVector *rows, int row, +bool ValidFirstLine(const std::vector *rows, int row, const ParagraphModel *model); -bool ValidBodyLine(const GenericVector *rows, int row, +bool ValidBodyLine(const std::vector *rows, int row, const ParagraphModel *model); -bool CrownCompatible(const GenericVector *rows, int a, int b, +bool CrownCompatible(const std::vector *rows, int a, int b, const ParagraphModel *model); // A class for smearing Paragraph Model hypotheses to surrounding rows. @@ -245,7 +245,7 @@ bool CrownCompatible(const GenericVector *rows, int a, int // "smear" our models over the text. class ParagraphModelSmearer { public: - ParagraphModelSmearer(GenericVector *rows, int row_start, int row_end, + ParagraphModelSmearer(std::vector *rows, int row_start, int row_end, ParagraphTheory *theory); // Smear forward paragraph models from existing row markings to subsequent @@ -266,7 +266,7 @@ private: } ParagraphTheory *theory_; - GenericVector *rows_; + std::vector *rows_; int row_start_; int row_end_; @@ -284,11 +284,11 @@ private: // Clear all hypotheses about lines [start, end) and reset the margins to the // percentile (0..100) value of the left and right row edges for this run of // rows. -void RecomputeMarginsAndClearHypotheses(GenericVector *rows, int start, +void RecomputeMarginsAndClearHypotheses(std::vector *rows, int start, int end, int percentile); // Return the median inter-word space in rows[row_start, row_end). 
-int InterwordSpace(const GenericVector &rows, int row_start, int row_end); +int InterwordSpace(const std::vector &rows, int row_start, int row_end); // Return whether the first word on the after line can fit in the space at // the end of the before line (knowing which way the text is aligned and read). @@ -300,13 +300,13 @@ bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRe bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after); // Do rows[start, end) form a single instance of the given paragraph model? -bool RowsFitModel(const GenericVector *rows, int start, int end, +bool RowsFitModel(const std::vector *rows, int start, int end, const ParagraphModel *model); // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), // normalize each row_owner to point to an actual PARA, and output the // paragraphs in order onto paragraphs. -void CanonicalizeDetectionResults(GenericVector *row_owners, PARA_LIST *paragraphs); +void CanonicalizeDetectionResults(std::vector *row_owners, PARA_LIST *paragraphs); } // namespace tesseract diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index 67897500..04b45d79 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -45,7 +45,7 @@ #include // for OcrEngineMode, PageSegMode, OEM_L... #include // for UNICHAR_ID -#include "genericvector.h" // for GenericVector, PointerVector +#include "genericvector.h" // for PointerVector #include // for pixDestroy, pixGetWidth, pixGetHe... @@ -398,27 +398,27 @@ public: // Input: a set of noisy outlines that probably belong to the real_word. // Output: outlines that overlapped blobs are set to nullptr and put back into // the word, either in the blobs or in the reject list. 
- void AssignDiacriticsToOverlappingBlobs(const GenericVector &outlines, int pass, + void AssignDiacriticsToOverlappingBlobs(const std::vector &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, - GenericVector *word_wanted, - GenericVector *overlapped_any_blob, - GenericVector *target_blobs); + std::vector *word_wanted, + std::vector *overlapped_any_blob, + std::vector *target_blobs); // Attempts to assign non-overlapping outlines to their nearest blobs or // make new blobs out of them. - void AssignDiacriticsToNewBlobs(const GenericVector &outlines, int pass, + void AssignDiacriticsToNewBlobs(const std::vector &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, - GenericVector *word_wanted, - GenericVector *target_blobs); + std::vector *word_wanted, + std::vector *target_blobs); // Starting with ok_outlines set to indicate which outlines overlap the blob, // chooses the optimal set (approximately) and returns true if any outlines // are desired, in which case ok_outlines indicates which ones. bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, - C_BLOB *blob, const GenericVector &outlines, + C_BLOB *blob, const std::vector &outlines, int num_outlines, std::vector *ok_outlines); // Classifies the given blob plus the outlines flagged by ok_outlines, undoes // the inclusion of the outlines, and returns the certainty of the raw choice. float ClassifyBlobPlusOutlines(const std::vector &ok_outlines, - const GenericVector &outlines, int pass_n, + const std::vector &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str); // Classifies the given blob (part of word_data->word->word) as an individual // word, using languages, chopper etc, returning only the certainty of the @@ -703,22 +703,22 @@ public: void ReSegmentByClassification(PAGE_RES *page_res); // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. // Returns false if an invalid UNICHAR_ID is encountered. 
- bool ConvertStringToUnichars(const char *utf8, GenericVector *class_ids); + bool ConvertStringToUnichars(const char *utf8, std::vector *class_ids); // Resegments the word to achieve the target_text from the classifier. // Returns false if the re-segmentation fails. // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and // applies a full search on the classifier results to find the best classified // segmentation. As a compromise to obtain better recall, 1-1 ambigiguity // substitutions ARE used. - bool FindSegmentation(const GenericVector &target_text, WERD_RES *word_res); + bool FindSegmentation(const std::vector &target_text, WERD_RES *word_res); // Recursive helper to find a match to the target_text (from text_index // position) in the choices (from choices_pos position). - // Choices is an array of GenericVectors, of length choices_length, with each + // Choices is an array of vectors of length choices_length, with each // element representing a starting position in the word, and the - // GenericVector holding classification results for a sequence of consecutive + // vector holding classification results for a sequence of consecutive // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. - void SearchForText(const GenericVector *choices, int choices_pos, - int choices_length, const GenericVector &target_text, + void SearchForText(const std::vector *choices, int choices_pos, + int choices_length, const std::vector &target_text, int text_index, float rating, std::vector *segmentation, float *best_rating, std::vector *best_segmentation); // Counts up the labelled words and the blobs within. diff --git a/src/ccstruct/werd.cpp b/src/ccstruct/werd.cpp index d7d60fc7..6f51a9bc 100644 --- a/src/ccstruct/werd.cpp +++ b/src/ccstruct/werd.cpp @@ -502,7 +502,7 @@ void WERD::CleanNoise(float size_threshold) { // Extracts all the noise outlines and stuffs the pointers into the given // vector of outlines. 
Afterwards, the outlines vector owns the pointers. -void WERD::GetNoiseOutlines(GenericVector *outlines) { +void WERD::GetNoiseOutlines(std::vector *outlines) { C_BLOB_IT rej_it(&rej_cblobs); for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { C_BLOB *blob = rej_it.extract(); @@ -516,13 +516,13 @@ void WERD::GetNoiseOutlines(GenericVector *outlines) { // back in rej_cblobs where they came from. Where the target_blobs entry is // nullptr, a run of wanted outlines is put into a single new blob. // Ownership of the outlines is transferred back to the word. (Hence -// GenericVector and not PointerVector.) +// vector and not PointerVector.) // Returns true if any new blob was added to the start of the word, which // suggests that it might need joining to the word before it, and likewise // sets make_next_word_fuzzy true if any new blob was added to the end. -bool WERD::AddSelectedOutlines(const GenericVector &wanted, - const GenericVector &target_blobs, - const GenericVector &outlines, +bool WERD::AddSelectedOutlines(const std::vector &wanted, + const std::vector &target_blobs, + const std::vector &outlines, bool *make_next_word_fuzzy) { bool outline_added_to_start = false; if (make_next_word_fuzzy != nullptr) diff --git a/src/ccstruct/werd.h b/src/ccstruct/werd.h index be567b58..e13c6c1c 100644 --- a/src/ccstruct/werd.h +++ b/src/ccstruct/werd.h @@ -21,7 +21,6 @@ #include "bits16.h" #include "elst2.h" -#include "genericvector.h" // GenericVector #include "params.h" #include "stepblob.h" @@ -173,18 +172,18 @@ public: // Extracts all the noise outlines and stuffs the pointers into the given // vector of outlines. Afterwards, the outlines vector owns the pointers. - void GetNoiseOutlines(GenericVector *outlines); + void GetNoiseOutlines(std::vector *outlines); // Adds the selected outlines to the indcated real blobs, and puts the rest // back in rej_cblobs where they came from. 
Where the target_blobs entry is // nullptr, a run of wanted outlines is put into a single new blob. // Ownership of the outlines is transferred back to the word. (Hence - // GenericVector and not PointerVector.) + // vector and not PointerVector.) // Returns true if any new blob was added to the start of the word, which // suggests that it might need joining to the word before it, and likewise // sets make_next_word_fuzzy true if any new blob was added to the end. - bool AddSelectedOutlines(const GenericVector &wanted, - const GenericVector &target_blobs, - const GenericVector &outlines, bool *make_next_word_fuzzy); + bool AddSelectedOutlines(const std::vector &wanted, + const std::vector &target_blobs, + const std::vector &outlines, bool *make_next_word_fuzzy); private: uint8_t blanks = 0; // no of blanks diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h index dc261f1a..efec0b96 100644 --- a/src/ccutil/genericvector.h +++ b/src/ccutil/genericvector.h @@ -225,16 +225,6 @@ public: qsort(data_, size_used_, sizeof(*data_), comparator); } - // Searches the array (assuming sorted in ascending order, using sort()) for - // an element equal to target and returns true if it is present. - // Use binary_search to get the index of target, or its nearest candidate. - bool bool_binary_search(const T &target) const { - int index = binary_search(target); - if (index >= size_used_) { - return false; - } - return data_[index] == target; - } // Searches the array (assuming sorted in ascending order, using sort()) for // an element equal to target and returns the index of the best candidate. 
// The return value is conceptually the largest index i such that diff --git a/unittest/equationdetect_test.cc b/unittest/equationdetect_test.cc index 5768c14a..0769a782 100644 --- a/unittest/equationdetect_test.cc +++ b/unittest/equationdetect_test.cc @@ -92,15 +92,15 @@ public: return ComputeForegroundDensity(tbox); } - int RunCountAlignment(const GenericVector &sorted_vec, const int val) { + int RunCountAlignment(const std::vector &sorted_vec, const int val) { return CountAlignment(sorted_vec, val); } - void RunSplitCPHorLite(ColPartition *part, GenericVector *splitted_boxes) { + void RunSplitCPHorLite(ColPartition *part, std::vector *splitted_boxes) { SplitCPHorLite(part, splitted_boxes); } - void RunSplitCPHor(ColPartition *part, GenericVector *parts_splitted) { + void RunSplitCPHor(ColPartition *part, std::vector *parts_splitted) { SplitCPHor(part, parts_splitted); } @@ -377,7 +377,7 @@ TEST_F(EquationFinderTest, ComputeForegroundDensity) { } TEST_F(EquationFinderTest, CountAlignment) { - GenericVector vec; + std::vector vec; vec.push_back(1); vec.push_back(1); vec.push_back(1); @@ -452,7 +452,7 @@ TEST_F(EquationFinderTest, SplitCPHorLite) { ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part->DeleteBoxes(); part->set_median_width(10); - GenericVector splitted_boxes; + std::vector splitted_boxes; // Test an empty part. equation_det_->RunSplitCPHorLite(part, &splitted_boxes); @@ -486,7 +486,7 @@ TEST_F(EquationFinderTest, SplitCPHor) { ColPartition *part = ColPartition::FakePartition(box, PT_FLOWING_TEXT, BRT_TEXT, BTFT_NONE); part->DeleteBoxes(); part->set_median_width(10); - GenericVector parts_splitted; + std::vector parts_splitted; // Test an empty part. 
equation_det_->RunSplitCPHor(part, &parts_splitted); @@ -512,7 +512,9 @@ TEST_F(EquationFinderTest, SplitCPHor) { EXPECT_TRUE(TBOX(100, 0, 140, 45) == parts_splitted[1]->bounding_box()); EXPECT_TRUE(TBOX(500, 0, 540, 35) == parts_splitted[2]->bounding_box()); - parts_splitted.delete_data_pointers(); + for (auto part_splitted : parts_splitted) { + delete part_splitted; + } part->DeleteBoxes(); delete (part); } diff --git a/unittest/paragraphs_test.cc b/unittest/paragraphs_test.cc index 7fc677e9..ec4c4d1e 100644 --- a/unittest/paragraphs_test.cc +++ b/unittest/paragraphs_test.cc @@ -107,7 +107,7 @@ void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector &detector_output) { + const std::vector &detector_output) { int incorrect_breaks = 0; int missed_breaks = 0; int poorly_matched_models = 0; @@ -186,7 +186,7 @@ void EvaluateParagraphDetection(const TextAndModel *correct, int n, void TestParagraphDetection(const TextAndModel *correct, int num_rows) { std::vector row_infos; - GenericVector row_owners; + std::vector row_owners; PARA_LIST paragraphs; std::vector models; @@ -312,7 +312,7 @@ TEST(ParagraphsTest, TestSingleFullPageContinuation) { const TextAndModel *correct = kSingleFullPageContinuation; int num_rows = countof(kSingleFullPageContinuation); std::vector row_infos; - GenericVector row_owners; + std::vector row_owners; PARA_LIST paragraphs; std::vector models; models.push_back(new ParagraphModel(kLeft, 0, 20, 0, 10)); From 5db92b26aa4cab45f3da6714328c2fcd80891441 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 15 Mar 2021 20:00:56 +0100 Subject: [PATCH 5/5] Replace remaining GenericVector by std::vector for src/textord Signed-off-by: Stefan Weil --- src/ccstruct/linlsq.h | 28 +++++++++----------- src/ccstruct/statistc.cpp | 18 ++++++------- src/ccstruct/statistc.h | 2 +- src/ccutil/genericvector.h | 19 -------------- src/textord/baselinedetect.cpp | 37 +++++++++++++------------- src/textord/baselinedetect.h | 10 +++---- 
src/textord/bbgrid.h | 34 ++++++++++++++++++++++++ src/textord/cjkpitch.cpp | 15 +++++------ src/textord/colfind.cpp | 42 +++++++++++++++++------------ src/textord/colpartitiongrid.cpp | 13 ++++----- src/textord/colpartitiongrid.h | 4 +-- src/textord/colpartitionset.cpp | 6 ++--- src/textord/colpartitionset.h | 3 +-- src/textord/tabfind.cpp | 8 +++--- src/textord/tabfind.h | 4 +-- src/textord/tablerecog.cpp | 45 ++++++++++++++++---------------- src/textord/tablerecog.h | 11 ++++---- src/textord/tordmain.cpp | 4 +-- unittest/stats_test.cc | 5 ++-- unittest/tablerecog_test.cc | 12 ++++----- 20 files changed, 167 insertions(+), 153 deletions(-) diff --git a/src/ccstruct/linlsq.h b/src/ccstruct/linlsq.h index c13e08fa..862f0549 100644 --- a/src/ccstruct/linlsq.h +++ b/src/ccstruct/linlsq.h @@ -2,7 +2,6 @@ * File: linlsq.h (Formerly llsq.h) * Description: Linear Least squares fitting code. * Author: Ray Smith - * Created: Thu Sep 12 08:44:51 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,13 +21,11 @@ #include "points.h" // for FCOORD +#include // for std::nth_element #include // for int32_t namespace tesseract { -template -class GenericVector; - class TESS_API LLSQ { public: LLSQ() { // constructor @@ -111,29 +108,30 @@ private: // An assumption is made that most of the values are spread over no more than // half the range, but wrap-around is accounted for if the median is near // the wrap-around point. -// Cannot be a member of GenericVector, as it makes heavy used of LLSQ. +// Cannot be a member of vector, as it makes heavy use of LLSQ. // T must be an integer or float/double type. 
template -T MedianOfCircularValues(T modulus, GenericVector *v) { +T MedianOfCircularValues(T modulus, std::vector &v) { LLSQ stats; T halfrange = static_cast(modulus / 2); - int num_elements = v->size(); - for (int i = 0; i < num_elements; ++i) { - stats.add((*v)[i], (*v)[i] + halfrange); + auto num_elements = v.size(); + for (auto i : v) { + stats.add(i, i + halfrange); } bool offset_needed = stats.y_variance() < stats.x_variance(); if (offset_needed) { - for (int i = 0; i < num_elements; ++i) { - (*v)[i] += halfrange; + for (auto &i : v) { + i += halfrange; } } - int median_index = v->choose_nth_item(num_elements / 2); + auto median_index = num_elements / 2; + std::nth_element(v.begin(), v.begin() + median_index, v.end()); if (offset_needed) { - for (int i = 0; i < num_elements; ++i) { - (*v)[i] -= halfrange; + for (auto &i : v) { + i -= halfrange; } } - return (*v)[median_index]; + return v[median_index]; } } // namespace tesseract diff --git a/src/ccstruct/statistc.cpp b/src/ccstruct/statistc.cpp index b323f7de..8a7e1640 100644 --- a/src/ccstruct/statistc.cpp +++ b/src/ccstruct/statistc.cpp @@ -462,13 +462,13 @@ static bool GatherPeak(int index, const int *src_buckets, int *used_buckets, int // to sort on the output will re-sort by increasing mean of peak if that is // more useful than decreasing total count. // Returns the actual number of modes found. -int STATS::top_n_modes(int max_modes, GenericVector> *modes) const { +int STATS::top_n_modes(int max_modes, std::vector> &modes) const { if (max_modes <= 0) return 0; int src_count = rangemax_ - rangemin_; // Used copies the counts in buckets_ as they get used. STATS used(rangemin_, rangemax_); - modes->truncate(0); + modes.clear(); // Total count of the smallest peak found so far.
int least_count = 1; // Mode that is used as a seed for each peak @@ -502,21 +502,21 @@ int STATS::top_n_modes(int max_modes, GenericVector> *mode &total_value)) break; } - if (total_count > least_count || modes->size() < max_modes) { + if (total_count > least_count || modes.size() < max_modes) { // We definitely want this mode, so if we have enough discard the least. - if (modes->size() == max_modes) - modes->truncate(max_modes - 1); + if (modes.size() == max_modes) + modes.resize(max_modes - 1); int target_index = 0; // Linear search for the target insertion point. - while (target_index < modes->size() && (*modes)[target_index].data() >= total_count) + while (target_index < modes.size() && modes[target_index].data() >= total_count) ++target_index; auto peak_mean = static_cast(total_value / total_count + rangemin_); - modes->insert(KDPairInc(peak_mean, total_count), target_index); - least_count = modes->back().data(); + modes.insert(modes.begin() + target_index, KDPairInc(peak_mean, total_count)); + least_count = modes.back().data(); } } } while (max_count > 0); - return modes->size(); + return modes.size(); } /********************************************************************** diff --git a/src/ccstruct/statistc.h b/src/ccstruct/statistc.h index cb820bc5..df806608 100644 --- a/src/ccstruct/statistc.h +++ b/src/ccstruct/statistc.h @@ -113,7 +113,7 @@ public: // sort on the output will re-sort by increasing mean of peak if that is more // useful than decreasing total count. Returns the actual number of modes // found. - int top_n_modes(int max_modes, GenericVector> *modes) const; + int top_n_modes(int max_modes, std::vector> &modes) const; // Prints a summary and table of the histogram. 
void print() const; diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h index efec0b96..89373c8b 100644 --- a/src/ccutil/genericvector.h +++ b/src/ccutil/genericvector.h @@ -245,25 +245,6 @@ public: return bottom; } - // Compact the vector by deleting elements using operator!= on basic types. - // The vector must be sorted. - void compact_sorted() { - if (size_used_ == 0) { - return; - } - - // First element is in no matter what, hence the i = 1. - int last_write = 0; - for (int i = 1; i < size_used_; ++i) { - // Finds next unique item and writes it. - if (data_[last_write] != data_[i]) { - data_[++last_write] = data_[i]; - } - } - // last_write is the index of a valid data cell, so add 1. - size_used_ = last_write + 1; - } - // Returns the index of what would be the target_index_th item in the array // if the members were sorted, without actually sorting. Members are // shuffled around, but it takes O(n) time. diff --git a/src/textord/baselinedetect.cpp b/src/textord/baselinedetect.cpp index fbfd0d57..44144904 100644 --- a/src/textord/baselinedetect.cpp +++ b/src/textord/baselinedetect.cpp @@ -277,8 +277,8 @@ double BaselineRow::AdjustBaselineToGrid(int debug, const FCOORD &direction, dou void BaselineRow::SetupBlobDisplacements(const FCOORD &direction) { // Set of perpendicular displacements of the blob bottoms from the required // baseline direction. - GenericVector perp_blob_dists; - displacement_modes_.truncate(0); + std::vector perp_blob_dists; + displacement_modes_.clear(); // Gather the skew-corrected position of every blob. 
double min_dist = FLT_MAX; double max_dist = -FLT_MAX; @@ -310,8 +310,8 @@ void BaselineRow::SetupBlobDisplacements(const FCOORD &direction) { for (int i = 0; i < perp_blob_dists.size(); ++i) { dist_stats.add(IntCastRounded(perp_blob_dists[i] / disp_quant_factor_), 1); } - GenericVector> scaled_modes; - dist_stats.top_n_modes(kMaxDisplacementsModes, &scaled_modes); + std::vector> scaled_modes; + dist_stats.top_n_modes(kMaxDisplacementsModes, scaled_modes); #ifdef kDebugYCoord if (debug) { for (int i = 0; i < scaled_modes.size(); ++i) { @@ -428,7 +428,7 @@ double BaselineBlock::SpacingModelError(double perp_disp, double line_spacing, d bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) { if (non_text_block_) return false; - GenericVector angles; + std::vector angles; for (int r = 0; r < rows_.size(); ++r) { BaselineRow *row = rows_[r]; if (row->FitBaseline(use_box_bottoms)) { @@ -440,7 +440,7 @@ bool BaselineBlock::FitBaselinesAndFindSkew(bool use_box_bottoms) { } if (!angles.empty()) { - skew_angle_ = MedianOfCircularValues(M_PI, &angles); + skew_angle_ = MedianOfCircularValues(M_PI, angles); good_skew_angle_ = true; } else { skew_angle_ = 0.0f; @@ -610,7 +610,7 @@ void BaselineBlock::DrawPixSpline(Pix *pix_in) { // observations. bool BaselineBlock::ComputeLineSpacing() { FCOORD direction(cos(skew_angle_), sin(skew_angle_)); - GenericVector row_positions; + std::vector row_positions; ComputeBaselinePositions(direction, &row_positions); if (row_positions.size() < 2) return false; @@ -644,7 +644,7 @@ bool BaselineBlock::ComputeLineSpacing() { // of the block baseline a line sits, hence the function and argument name // positions not distances. 
void BaselineBlock::ComputeBaselinePositions(const FCOORD &direction, - GenericVector *positions) { + std::vector *positions) { positions->clear(); for (int r = 0; r < rows_.size(); ++r) { BaselineRow *row = rows_[r]; @@ -659,7 +659,7 @@ void BaselineBlock::ComputeBaselinePositions(const FCOORD &direction, // Computes an estimate of the line spacing of the block from the median // of the spacings between adjacent overlapping textlines. void BaselineBlock::EstimateLineSpacing() { - GenericVector spacings; + std::vector spacings; for (int r = 0; r < rows_.size(); ++r) { BaselineRow *row = rows_[r]; // Exclude silly lines. @@ -682,7 +682,8 @@ void BaselineBlock::EstimateLineSpacing() { // If we have at least one value, use it, otherwise leave the previous // value unchanged. if (!spacings.empty()) { - line_spacing_ = spacings[spacings.choose_nth_item(spacings.size() / 2)]; + std::nth_element(spacings.begin(), spacings.begin() + spacings.size() / 2, spacings.end()); + line_spacing_ = spacings[spacings.size() / 2]; if (debug_level_ > 1) tprintf("Estimate of linespacing = %g\n", line_spacing_); } @@ -692,7 +693,7 @@ void BaselineBlock::EstimateLineSpacing() { // line to the deskewed y-position of each baseline as a function of its // estimated line index, allowing for a small error in the initial linespacing // and choosing the best available model. -void BaselineBlock::RefineLineSpacing(const GenericVector &positions) { +void BaselineBlock::RefineLineSpacing(const std::vector &positions) { double spacings[3], offsets[3], errors[3]; int index_range; errors[0] = @@ -727,7 +728,7 @@ void BaselineBlock::RefineLineSpacing(const GenericVector &positions) { // and the corresponding intercept in c_out, and the number of spacings seen // in index_delta. Returns the error of fit to the line spacing model. // Uses a simple linear regression, but optimized the offset using the median. 
-double BaselineBlock::FitLineSpacingModel(const GenericVector &positions, double m_in, +double BaselineBlock::FitLineSpacingModel(const std::vector &positions, double m_in, double *m_out, double *c_out, int *index_delta) { if (m_in == 0.0f || positions.size() < 2) { *m_out = m_in; @@ -736,12 +737,12 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector &positions *index_delta = 0; return 0.0; } - GenericVector offsets; + std::vector offsets; // Get the offset (remainder) linespacing for each line and choose the median. for (int i = 0; i < positions.size(); ++i) offsets.push_back(fmod(positions[i], m_in)); // Get the median offset. - double median_offset = MedianOfCircularValues(m_in, &offsets); + double median_offset = MedianOfCircularValues(m_in, offsets); // Now fit a line to quantized line number and offset. LLSQ llsq; int min_index = INT32_MAX; @@ -755,7 +756,7 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector &positions // Get the refined line spacing. *m_out = llsq.m(); // Use the median offset rather than the mean. - offsets.truncate(0); + offsets.clear(); if (*m_out != 0.0) { for (int i = 0; i < positions.size(); ++i) { offsets.push_back(fmod(positions[i], *m_out)); @@ -766,7 +767,7 @@ double BaselineBlock::FitLineSpacingModel(const GenericVector &positions tprintf("%d: %g\n", i, offsets[i]); } } - *c_out = MedianOfCircularValues(*m_out, &offsets); + *c_out = MedianOfCircularValues(*m_out, offsets); } else { *c_out = 0.0; } @@ -808,7 +809,7 @@ BaselineDetect::BaselineDetect(int debug_level, const FCOORD &page_skew, TO_BLOC // block-wise and page-wise data to smooth small blocks/rows, and applies // smoothing based on block/page-level skew and block-level linespacing. 
void BaselineDetect::ComputeStraightBaselines(bool use_box_bottoms) { - GenericVector block_skew_angles; + std::vector block_skew_angles; for (int i = 0; i < blocks_.size(); ++i) { BaselineBlock *bl_block = blocks_[i]; if (debug_level_ > 0) @@ -820,7 +821,7 @@ void BaselineDetect::ComputeStraightBaselines(bool use_box_bottoms) { // Compute a page-wide default skew for blocks with too little information. double default_block_skew = page_skew_.angle(); if (!block_skew_angles.empty()) { - default_block_skew = MedianOfCircularValues(M_PI, &block_skew_angles); + default_block_skew = MedianOfCircularValues(M_PI, block_skew_angles); } if (debug_level_ > 0) { tprintf("Page skew angle = %g\n", default_block_skew); diff --git a/src/textord/baselinedetect.h b/src/textord/baselinedetect.h index df9218fd..4e276d5d 100644 --- a/src/textord/baselinedetect.h +++ b/src/textord/baselinedetect.h @@ -23,8 +23,6 @@ #include "points.h" #include "rect.h" -#include "genericvector.h" - struct Pix; namespace tesseract { @@ -109,7 +107,7 @@ private: FCOORD baseline_pt1_; FCOORD baseline_pt2_; // Set of modes of displacements. They indicate preferable baseline positions. - GenericVector displacement_modes_; + std::vector displacement_modes_; // Quantization factor used for displacement_modes_. double disp_quant_factor_; // Half the acceptance range of blob displacements for computing the @@ -187,7 +185,7 @@ private: // Computes the deskewed vertical position of each baseline in the block and // stores them in the given vector. - void ComputeBaselinePositions(const FCOORD &direction, GenericVector *positions); + void ComputeBaselinePositions(const FCOORD &direction, std::vector *positions); // Computes an estimate of the line spacing of the block from the median // of the spacings between adjacent overlapping textlines. 
@@ -197,13 +195,13 @@ private: // line to the deskewed y-position of each baseline as a function of its // estimated line index, allowing for a small error in the initial linespacing // and choosing the best available model. - void RefineLineSpacing(const GenericVector &positions); + void RefineLineSpacing(const std::vector &positions); // Given an initial estimate of line spacing (m_in) and the positions of each // baseline, computes the line spacing of the block more accurately in m_out, // and the corresponding intercept in c_out, and the number of spacings seen // in index_delta. Returns the error of fit to the line spacing model. - double FitLineSpacingModel(const GenericVector &positions, double m_in, double *m_out, + double FitLineSpacingModel(const std::vector &positions, double m_in, double *m_out, double *c_out, int *index_delta); // The block to which this class adds extra information used during baseline diff --git a/src/textord/bbgrid.h b/src/textord/bbgrid.h index 9b95443d..9687c15c 100644 --- a/src/textord/bbgrid.h +++ b/src/textord/bbgrid.h @@ -384,6 +384,23 @@ int SortByBoxLeft(const void *void1, const void *void2) { return p1->bounding_box().top() - p2->bounding_box().top(); } +template +bool StdSortByBoxLeft(const void *void1, const void *void2) { + // std::sort passes the vector elements (the BBC pointers) themselves, so cast directly. + const BBC *p1 = static_cast<const BBC *>(void1); + const BBC *p2 = static_cast<const BBC *>(void2); + int result = p1->bounding_box().left() - p2->bounding_box().left(); + if (result != 0) + return result < 0; + result = p1->bounding_box().right() - p2->bounding_box().right(); + if (result != 0) + return result < 0; + result = p1->bounding_box().bottom() - p2->bounding_box().bottom(); + if (result != 0) + return result < 0; + return p1->bounding_box().top() < p2->bounding_box().top(); +} + // Sort function to sort a BBC by bounding_box().right() in right-to-left order.
template int SortRightToLeft(const void *void1, const void *void2) { @@ -402,6 +419,23 @@ int SortRightToLeft(const void *void1, const void *void2) { return p1->bounding_box().top() - p2->bounding_box().top(); } +template +bool StdSortRightToLeft(const void *void1, const void *void2) { + // std::sort passes the vector elements (the BBC pointers) themselves, so cast directly. + const BBC *p1 = static_cast<const BBC *>(void1); + const BBC *p2 = static_cast<const BBC *>(void2); + int result = p2->bounding_box().right() - p1->bounding_box().right(); + if (result != 0) + return result < 0; + result = p2->bounding_box().left() - p1->bounding_box().left(); + if (result != 0) + return result < 0; + result = p1->bounding_box().bottom() - p2->bounding_box().bottom(); + if (result != 0) + return result < 0; + return p1->bounding_box().top() < p2->bounding_box().top(); +} + // Sort function to sort a BBC by bounding_box().bottom(). template int SortByBoxBottom(const void *void1, const void *void2) { diff --git a/src/textord/cjkpitch.cpp b/src/textord/cjkpitch.cpp index 4fe652a7..4f284b24 100644 --- a/src/textord/cjkpitch.cpp +++ b/src/textord/cjkpitch.cpp @@ -18,7 +18,6 @@ /////////////////////////////////////////////////////////////////////// #include "cjkpitch.h" -#include "genericvector.h" #include "topitch.h" #include "tovars.h" @@ -109,7 +108,7 @@ public: ~LocalCorrelation() {} void Finish() { - values_.sort(float_pair_compare); + std::sort(values_.begin(), values_.end(), float_pair_compare); finalized_ = true; } @@ -155,14 +154,12 @@ public: } private: - static int float_pair_compare(const void *a, const void *b) { - const auto *f_a = static_cast(a); - const auto *f_b = static_cast(b); - return (f_a->x > f_b->x) ? 1 : ((f_a->x < f_b->x) ? -1 : 0); + static bool float_pair_compare(const float_pair f_a, const float_pair f_b) { + return f_a.x < f_b.x; } bool finalized_; - GenericVector values_; + std::vector values_; }; // Class to represent a character on a fixed pitch row.
A FPChar may @@ -450,7 +447,7 @@ private: index++; } } - characters_.truncate(index); + characters_.resize(index); } float pitch_ = 0.0f; // Character pitch. @@ -472,7 +469,7 @@ private: SimpleStats heights_; - GenericVector characters_; + std::vector characters_; TO_ROW *real_row_ = nullptr; // Underlying TD_ROW for this row. }; diff --git a/src/textord/colfind.cpp b/src/textord/colfind.cpp index 19fcb57f..bd39cb97 100644 --- a/src/textord/colfind.cpp +++ b/src/textord/colfind.cpp @@ -101,7 +101,9 @@ ColumnFinder::ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &trig } ColumnFinder::~ColumnFinder() { - column_sets_.delete_data_pointers(); + for (auto set : column_sets_) { + delete set; + } delete[] best_columns_; delete stroke_width_; delete input_blobs_win_; @@ -552,7 +554,7 @@ bool ColumnFinder::MakeColumns(bool single_column) { bool good_only = true; do { for (int i = 0; i < gridheight_; ++i) { - ColPartitionSet *line_set = part_sets.get(i); + ColPartitionSet *line_set = part_sets.at(i); if (line_set != nullptr && line_set->LegalColumnCandidate()) { ColPartitionSet *column_candidate = line_set->Copy(good_only); if (column_candidate != nullptr) @@ -590,7 +592,7 @@ bool ColumnFinder::MakeColumns(bool single_column) { ComputeMeanColumnGap(any_multi_column); } for (int i = 0; i < part_sets.size(); ++i) { - ColPartitionSet *line_set = part_sets.get(i); + ColPartitionSet *line_set = part_sets.at(i); if (line_set != nullptr) { line_set->RelinquishParts(); delete line_set; @@ -604,8 +606,9 @@ bool ColumnFinder::MakeColumns(bool single_column) { // Src_sets may be equal to column_candidates, in which case it will // use them as a source to improve themselves. void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets) { - PartSetVector temp_cols; - temp_cols.move(column_sets); + // TODO: optimize. 
+ PartSetVector temp_cols = *column_sets; + column_sets->clear(); if (src_sets == column_sets) src_sets = &temp_cols; int set_size = temp_cols.size(); @@ -613,7 +616,7 @@ void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVecto bool good_only = true; do { for (int i = 0; i < set_size; ++i) { - ColPartitionSet *column_candidate = temp_cols.get(i); + ColPartitionSet *column_candidate = temp_cols.at(i); ASSERT_HOST(column_candidate != nullptr); ColPartitionSet *improved = column_candidate->Copy(good_only); if (improved != nullptr) { @@ -623,10 +626,15 @@ void ColumnFinder::ImproveColumnCandidates(PartSetVector *src_sets, PartSetVecto } good_only = !good_only; } while (column_sets->empty() && !good_only); - if (column_sets->empty()) - column_sets->move(&temp_cols); - else - temp_cols.delete_data_pointers(); + if (column_sets->empty()) { + // TODO: optimize. + *column_sets = temp_cols; + temp_cols.clear(); + } else { + for (auto data : temp_cols) { + delete data; + } + } } // Prints debug information on the column candidates. @@ -635,7 +643,7 @@ void ColumnFinder::PrintColumnCandidates(const char *title) { tprintf("Found %d %s:\n", set_size, title); if (textord_debug_tabfind >= 3) { for (int i = 0; i < set_size; ++i) { - ColPartitionSet *column_set = column_sets_.get(i); + ColPartitionSet *column_set = column_sets_.at(i); column_set->Print(); } } @@ -673,7 +681,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) { // Set possible column_sets to indicate whether each set is compatible // with each column.
for (int part_i = 0; part_i < set_count; ++part_i) { - ColPartitionSet *line_set = part_sets.get(part_i); + ColPartitionSet *line_set = part_sets.at(part_i); bool debug = line_set != nullptr && WithinTestRegion(2, line_set->bounding_box().left(), line_set->bounding_box().bottom()); column_set_costs[part_i] = new int[column_count]; @@ -681,8 +689,8 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) { assigned_costs[part_i] = INT32_MAX; for (int col_i = 0; col_i < column_count; ++col_i) { if (line_set != nullptr && - column_sets_.get(col_i)->CompatibleColumns(debug, line_set, WidthCB())) { - column_set_costs[part_i][col_i] = column_sets_.get(col_i)->UnmatchedWidth(line_set); + column_sets_.at(col_i)->CompatibleColumns(debug, line_set, WidthCB())) { + column_set_costs[part_i][col_i] = column_sets_.at(col_i)->UnmatchedWidth(line_set); any_columns_possible[part_i] = true; } else { column_set_costs[part_i][col_i] = INT32_MAX; @@ -702,7 +710,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) { int column_set_id = RangeModalColumnSet(column_set_costs, assigned_costs, start, end); if (textord_debug_tabfind >= 2) { tprintf("Range modal column id = %d\n", column_set_id); - column_sets_.get(column_set_id)->Print(); + column_sets_.at(column_set_id)->Print(); } // Now find the longest run of the column_set_id in the range. ShrinkRangeToLongestRun(column_set_costs, assigned_costs, any_columns_possible, column_set_id, @@ -722,7 +730,7 @@ bool ColumnFinder::AssignColumns(const PartSetVector &part_sets) { tprintf("Column id %d applies to range = %d - %d\n", column_set_id, start, end); // Assign the column to the range, which now may overlap with other ranges. 
AssignColumnToRange(column_set_id, start, end, column_set_costs, assigned_costs); - if (column_sets_.get(column_set_id)->GoodColumnCount() > 1) + if (column_sets_.at(column_set_id)->GoodColumnCount() > 1) any_multi_column = true; } // If anything remains unassigned, the whole lot is unassigned, so @@ -879,7 +887,7 @@ void ColumnFinder::ExtendRangePastSmallGaps(int **column_set_costs, const int *a // Assigns the given column_set_id to the given range. void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end, int **column_set_costs, int *assigned_costs) { - ColPartitionSet *column_set = column_sets_.get(column_set_id); + ColPartitionSet *column_set = column_sets_.at(column_set_id); for (int i = start; i < end; ++i) { assigned_costs[i] = column_set_costs[i][column_set_id]; best_columns_[i] = column_set; diff --git a/src/textord/colpartitiongrid.cpp b/src/textord/colpartitiongrid.cpp index 55b0f494..f9729e86 100644 --- a/src/textord/colpartitiongrid.cpp +++ b/src/textord/colpartitiongrid.cpp @@ -1472,7 +1472,7 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction ComputeSearchBoxAndScaling(direction, part_box, gridsize(), &search_box, &dist_scaling); bool image_region = ImageFind::CountPixelsInRotatedBox(search_box, im_box, rerotation, nontext_map) > 0; - GenericVector dists[NPT_COUNT]; + std::vector dists[NPT_COUNT]; AccumulatePartDistances(part, dist_scaling, search_box, nontext_map, im_box, rerotation, debug, dists); // By iteratively including the next smallest distance across the vectors, @@ -1537,12 +1537,12 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction // vectors in the dists array are sorted in increasing order. // The nontext_map (+im_box, rerotation) is used to make text invisible if // there is non-text in between. -// dists must be an array of GenericVectors of size NPT_COUNT. +// dists must be an array of vectors of size NPT_COUNT. 
void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part, const ICOORD &dist_scaling, const TBOX &search_box, Pix *nontext_map, const TBOX &im_box, const FCOORD &rerotation, bool debug, - GenericVector *dists) { + std::vector *dists) { const TBOX &part_box = base_part.bounding_box(); ColPartitionGridSearch rsearch(this); rsearch.SetUniqueMode(true); @@ -1571,7 +1571,7 @@ void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part, // Truncate the number of boxes, so text doesn't get too much advantage. int n_boxes = std::min(neighbour->boxes_count(), kSmoothDecisionMargin); BlobTextFlowType n_flow = neighbour->flow(); - GenericVector *count_vector = nullptr; + std::vector *count_vector = nullptr; if (n_flow == BTFT_STRONG_CHAIN) { if (n_type == BRT_TEXT) count_vector = &dists[NPT_HTEXT]; @@ -1602,8 +1602,9 @@ void ColPartitionGrid::AccumulatePartDistances(const ColPartition &base_part, neighbour->Print(); } } - for (int i = 0; i < NPT_COUNT; ++i) - dists[i].sort(); + for (int i = 0; i < NPT_COUNT; ++i) { + std::sort(dists[i].begin(), dists[i].end()); + } } // Improves the margins of the part ColPartition by searching for diff --git a/src/textord/colpartitiongrid.h b/src/textord/colpartitiongrid.h index 15e6cfca..f147827d 100644 --- a/src/textord/colpartitiongrid.h +++ b/src/textord/colpartitiongrid.h @@ -214,10 +214,10 @@ private: // distance (scaled by dist_scaling) of the part from the base_part to the // vector of the appropriate type for the partition. Prior to return, the // vectors in the dists array are sorted in increasing order. - // dists must be an array of GenericVectors of size NPT_COUNT. + // dists must be an array of vectors of size NPT_COUNT. 
void AccumulatePartDistances(const ColPartition &base_part, const ICOORD &dist_scaling, const TBOX &search_box, Pix *nontext_map, const TBOX &im_box, - const FCOORD &rerotation, bool debug, GenericVector *dists); + const FCOORD &rerotation, bool debug, std::vector *dists); // Improves the margins of the ColPartition by searching for // neighbours that vertically overlap significantly. diff --git a/src/textord/colpartitionset.cpp b/src/textord/colpartitionset.cpp index b681e76d..eeddc60e 100644 --- a/src/textord/colpartitionset.cpp +++ b/src/textord/colpartitionset.cpp @@ -93,7 +93,7 @@ void ColPartitionSet::ImproveColumnCandidate(WidthCallback cb, PartSetVector *sr // Iterate over the provided column sets, as each one may have something // to improve this. for (int i = 0; i < set_size; ++i) { - ColPartitionSet *column_set = src_sets->get(i); + ColPartitionSet *column_set = src_sets->at(i); if (column_set == nullptr) continue; // Iterate over the parts in this and column_set, adding bigger or @@ -184,7 +184,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthC return; } for (int i = 0; i < column_sets->size(); ++i) { - ColPartitionSet *columns = column_sets->get(i); + ColPartitionSet *columns = column_sets->at(i); // In ordering the column set candidates, good_coverage_ is king, // followed by good_column_count_ and then bad_coverage_. bool better = good_coverage_ > columns->good_coverage_; @@ -198,7 +198,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthC // The new one is better so add it. 
if (debug) tprintf("Good one\n"); - column_sets->insert(this, i); + column_sets->insert(column_sets->begin() + i, this); return; } if (columns->CompatibleColumns(false, this, cb)) { diff --git a/src/textord/colpartitionset.h b/src/textord/colpartitionset.h index 89852f41..7c384183 100644 --- a/src/textord/colpartitionset.h +++ b/src/textord/colpartitionset.h @@ -21,7 +21,6 @@ #define TESSERACT_TEXTORD_COLPARTITIONSET_H_ #include "colpartition.h" // For ColPartition_LIST. -#include "genericvector.h" // For GenericVector. #include "rect.h" // For TBOX. #include "tabvector.h" // For BLOBNBOX_CLIST. @@ -30,7 +29,7 @@ namespace tesseract { class WorkingPartSet_LIST; class ColSegment_LIST; class ColPartitionSet; -using PartSetVector = GenericVector; +using PartSetVector = std::vector; // ColPartitionSet is a class that holds a list of ColPartitions. // Its main use is in holding a candidate partitioning of the width of the diff --git a/src/textord/tabfind.cpp b/src/textord/tabfind.cpp index 88e3449d..629dda6f 100644 --- a/src/textord/tabfind.cpp +++ b/src/textord/tabfind.cpp @@ -516,7 +516,7 @@ ScrollView *TabFind::FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_g #ifndef GRAPHICS_DISABLED // Helper displays all the boxes in the given vector on the given window. -static void DisplayBoxVector(const GenericVector &boxes, ScrollView *win) { +static void DisplayBoxVector(const std::vector &boxes, ScrollView *win) { for (int i = 0; i < boxes.size(); ++i) { TBOX box = boxes[i]->bounding_box(); int left_x = box.left(); @@ -552,8 +552,8 @@ ScrollView *TabFind::FindTabBoxes(int min_gutter_width, double tabfind_aligned_g } // Sort left tabs by left and right by right to see the outermost one first // on a ragged tab. 
- left_tab_boxes_.sort(SortByBoxLeft); - right_tab_boxes_.sort(SortRightToLeft); + std::sort(left_tab_boxes_.begin(), left_tab_boxes_.end(), StdSortByBoxLeft); + std::sort(right_tab_boxes_.begin(), right_tab_boxes_.end(), StdSortRightToLeft); ScrollView *tab_win = nullptr; #ifndef GRAPHICS_DISABLED if (textord_tabfind_show_initialtabs) { @@ -831,7 +831,7 @@ int TabFind::FindTabVectors(int search_size_multiple, TabAlignment alignment, in int vector_count = 0; // Search the right or left tab boxes, looking for tab vectors. bool right = alignment == TA_RIGHT_ALIGNED || alignment == TA_RIGHT_RAGGED; - const GenericVector &boxes = right ? right_tab_boxes_ : left_tab_boxes_; + const std::vector &boxes = right ? right_tab_boxes_ : left_tab_boxes_; for (int i = 0; i < boxes.size(); ++i) { BLOBNBOX *bbox = boxes[i]; if ((!right && bbox->left_tab_type() == TT_MAYBE_ALIGNED) || diff --git a/src/textord/tabfind.h b/src/textord/tabfind.h index c7df84b1..2a3ae592 100644 --- a/src/textord/tabfind.h +++ b/src/textord/tabfind.h @@ -354,8 +354,8 @@ private: /** Callback to test an int for being a common width. */ WidthCallback width_cb_; // Sets of bounding boxes that are candidate tab stops. - GenericVector left_tab_boxes_; - GenericVector right_tab_boxes_; + std::vector left_tab_boxes_; + std::vector right_tab_boxes_; }; } // namespace tesseract. diff --git a/src/textord/tablerecog.cpp b/src/textord/tablerecog.cpp index f2226fca..40191272 100644 --- a/src/textord/tablerecog.cpp +++ b/src/textord/tablerecog.cpp @@ -156,12 +156,11 @@ bool StructuredTable::FindLinedStructure() { if (cell_x_.size() < 3 || cell_y_.size() < 3) return false; - cell_x_.sort(); - cell_y_.sort(); - - // Remove duplicates that may have occurred due to split lines. - cell_x_.compact_sorted(); - cell_y_.compact_sorted(); + // Sort and remove duplicates that may have occurred due to split lines. 
+ std::sort(cell_x_.begin(), cell_x_.end()); + cell_x_.erase(std::unique(cell_x_.begin(), cell_x_.end()), cell_x_.end()); + std::sort(cell_y_.begin(), cell_y_.end()); + cell_y_.erase(std::unique(cell_y_.begin(), cell_y_.end()), cell_y_.end()); // The border should be the extents of line boxes, not middle. cell_x_[0] = bounding_box_.left(); @@ -170,8 +169,8 @@ bool StructuredTable::FindLinedStructure() { cell_y_[cell_y_.size() - 1] = bounding_box_.top(); // Remove duplicates that may have occurred due to moving the borders. - cell_x_.compact_sorted(); - cell_y_.compact_sorted(); + cell_x_.erase(std::unique(cell_x_.begin(), cell_x_.end()), cell_x_.end()); + cell_y_.erase(std::unique(cell_y_.begin(), cell_y_.end()), cell_y_.end()); CalculateMargins(); CalculateStats(); @@ -347,8 +346,8 @@ bool StructuredTable::VerifyWhitespacedTable() { // in the middle of the two nearest partitions. void StructuredTable::FindWhitespacedColumns() { // Set of the extents of all partitions on the page. - GenericVector left_sides; - GenericVector right_sides; + std::vector left_sides; + std::vector right_sides; // Look at each text partition. We want to find the partitions // that have extremal left/right sides. These will give us a basis @@ -371,8 +370,8 @@ void StructuredTable::FindWhitespacedColumns() { return; // Since data may be inserted in grid order, we sort the left/right sides. - left_sides.sort(); - right_sides.sort(); + std::sort(left_sides.begin(), left_sides.end()); + std::sort(right_sides.begin(), right_sides.end()); // At this point, in the "merged list", we expect to have a left side, // followed by either more left sides or a right side. The last number @@ -390,8 +389,8 @@ void StructuredTable::FindWhitespacedColumns() { // in the middle of the two nearest partitions. void StructuredTable::FindWhitespacedRows() { // Set of the extents of all partitions on the page.
- GenericVector bottom_sides; - GenericVector top_sides; + std::vector bottom_sides; + std::vector top_sides; // We will be "shrinking" partitions, so keep the min/max around to // make sure the bottom/top lines do not intersect text. int min_bottom = INT32_MAX; @@ -435,8 +434,8 @@ void StructuredTable::FindWhitespacedRows() { return; // Since data may be inserted in grid order, we sort the bottom/top sides. - bottom_sides.sort(); - top_sides.sort(); + std::sort(bottom_sides.begin(), bottom_sides.end()); + std::sort(top_sides.begin(), top_sides.end()); // At this point, in the "merged list", we expect to have a bottom side, // followed by either more bottom sides or a top side. The last number @@ -573,17 +572,17 @@ void StructuredTable::AbsorbNearbyLines() { // desired height. // The first/last items are extremal values of the list and known. // NOTE: This function assumes the lists are sorted! -void StructuredTable::FindCellSplitLocations(const GenericVector &min_list, - const GenericVector &max_list, int max_merged, - GenericVector *locations) { +void StructuredTable::FindCellSplitLocations(const std::vector &min_list, + const std::vector &max_list, int max_merged, + std::vector *locations) { locations->clear(); ASSERT_HOST(min_list.size() == max_list.size()); if (min_list.size() == 0) return; - ASSERT_HOST(min_list.get(0) < max_list.get(0)); - ASSERT_HOST(min_list.get(min_list.size() - 1) < max_list.get(max_list.size() - 1)); + ASSERT_HOST(min_list.at(0) < max_list.at(0)); + ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1)); - locations->push_back(min_list.get(0)); + locations->push_back(min_list.at(0)); int min_index = 0; int max_index = 0; int stacked_partitions = 0; @@ -610,7 +609,7 @@ void StructuredTable::FindCellSplitLocations(const GenericVector &min_list, ++max_index; } } - locations->push_back(max_list.get(max_list.size() - 1)); + locations->push_back(max_list.at(max_list.size() - 1)); } // Counts the number of partitions 
in the table diff --git a/src/textord/tablerecog.h b/src/textord/tablerecog.h index 3e905390..8790f709 100644 --- a/src/textord/tablerecog.h +++ b/src/textord/tablerecog.h @@ -21,7 +21,6 @@ #define TABLERECOG_H_ #include "colpartitiongrid.h" -#include "genericvector.h" namespace tesseract { @@ -209,9 +208,9 @@ protected: // are inserted wherever space exists between partitions. If it is 2, // lines may intersect 2 partitions at most, but you also need at least // 2 partitions to generate a line. - static void FindCellSplitLocations(const GenericVector &min_list, - const GenericVector &max_list, int max_merged, - GenericVector *locations); + static void FindCellSplitLocations(const std::vector &min_list, + const std::vector &max_list, int max_merged, + std::vector *locations); //////// //////// Utility function for table queries @@ -236,8 +235,8 @@ protected: // bounding box is a convenient external representation. // cell_x_ and cell_y_ indicate the grid lines. TBOX bounding_box_; // Bounding box - GenericVector cell_x_; // Locations of vertical divisions (sorted) - GenericVector cell_y_; // Locations of horizontal divisions (sorted) + std::vector cell_x_; // Locations of vertical divisions (sorted) + std::vector cell_y_; // Locations of horizontal divisions (sorted) bool is_lined_; // Is the table backed up by a line structure // Table margins, set via CalculateMargins int space_above_; diff --git a/src/textord/tordmain.cpp b/src/textord/tordmain.cpp index 2d8df612..658eca26 100644 --- a/src/textord/tordmain.cpp +++ b/src/textord/tordmain.cpp @@ -49,7 +49,7 @@ #include "tprintf.h" // for tprintf #include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP -#include "genericvector.h" // for PointerVector, GenericVector +#include "genericvector.h" // for PointerVector #include // for pixDestroy, pixGetHeight, boxCreate @@ -685,7 +685,7 @@ struct BlockGroup { // Min xheight of the blocks. 
float min_xheight; // Collection of borrowed pointers to the blocks in the group. - GenericVector blocks; + std::vector blocks; }; // Groups blocks by rotation, then, for each group, makes a WordGrid and calls diff --git a/unittest/stats_test.cc b/unittest/stats_test.cc index 25373aa7..91b3d924 100644 --- a/unittest/stats_test.cc +++ b/unittest/stats_test.cc @@ -9,7 +9,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "genericvector.h" #include "kdpair.h" #include "statistc.h" @@ -42,8 +41,8 @@ TEST_F(STATSTest, BasicStats) { // Tests the top_n_modes function. TEST_F(STATSTest, TopNModes) { - GenericVector > modes; - int num_modes = stats_.top_n_modes(3, &modes); + std::vector > modes; + int num_modes = stats_.top_n_modes(3, modes); EXPECT_EQ(3, num_modes); // Mode0 is 12 1 1 = 14 total count with a mean of 2 3/14. EXPECT_FLOAT_EQ(2.0f + 3.0f / 14, modes[0].key()); diff --git a/unittest/tablerecog_test.cc b/unittest/tablerecog_test.cc index 1bafbda4..b738098a 100644 --- a/unittest/tablerecog_test.cc +++ b/unittest/tablerecog_test.cc @@ -39,27 +39,27 @@ public: void InjectCellY(int y) { cell_y_.push_back(y); - cell_y_.sort(); + std::sort(cell_y_.begin(), cell_y_.end()); } void InjectCellX(int x) { cell_x_.push_back(x); - cell_x_.sort(); + std::sort(cell_x_.begin(), cell_x_.end()); } void ExpectCellX(int x_min, int second, int add, int almost_done, int x_max) { ASSERT_EQ(0, (almost_done - second) % add); EXPECT_EQ(3 + (almost_done - second) / add, cell_x_.size()); - EXPECT_EQ(x_min, cell_x_.get(0)); - EXPECT_EQ(x_max, cell_x_.get(cell_x_.size() - 1)); + EXPECT_EQ(x_min, cell_x_.at(0)); + EXPECT_EQ(x_max, cell_x_.at(cell_x_.size() - 1)); for (int i = 1; i < cell_x_.size() - 1; ++i) { - EXPECT_EQ(second + add * (i - 1), cell_x_.get(i)); + EXPECT_EQ(second + add * (i - 1), cell_x_.at(i)); } } void ExpectSortedX() { EXPECT_GT(cell_x_.size(), 0); for (int i = 1; i < cell_x_.size(); ++i) { - 
EXPECT_LT(cell_x_.get(i - 1), cell_x_.get(i)); + EXPECT_LT(cell_x_.at(i - 1), cell_x_.at(i)); } } };