/////////////////////////////////////////////////////////////////////// // File: networkio.h // Description: Network input/output data, allowing float/int implementations. // Author: Ray Smith // Created: Tue Jun 17 08:43:11 PST 2014 // // (C) Copyright 2014, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_LSTM_NETWORKIO_H_ #define TESSERACT_LSTM_NETWORKIO_H_ #include #include #include #include "genericvector.h" #include "helpers.h" #include "static_shape.h" #include "stridemap.h" #include "weightmatrix.h" struct Pix; namespace tesseract { // Class to contain all the input/output of a network, allowing for fixed or // variable-strided 2d to 1d mapping, and float or inT8 values. Provides // enough calculating functions to hide the detail of the implementation. class NetworkIO { public: NetworkIO() : int_mode_(false) {} // Resizes the array (and stride), avoiding realloc if possible, to the given // size from various size specs: // Same stride size, but given number of features. void Resize(const NetworkIO& src, int num_features) { ResizeToMap(src.int_mode(), src.stride_map(), num_features); } // Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim. void Resize2d(bool int_mode, int width, int num_features); // Resizes forcing a float representation with the stridemap of src and the // given number of features. void ResizeFloat(const NetworkIO& src, int num_features) { ResizeToMap(false, src.stride_map(), num_features); } // Resizes to a specific stride_map. void ResizeToMap(bool int_mode, const StrideMap& stride_map, int num_features); // Shrinks image size by x_scale,y_scale, and use given number of features. void ResizeScaled(const NetworkIO& src, int x_scale, int y_scale, int num_features); // Resizes to just 1 x-coord, whatever the input. void ResizeXTo1(const NetworkIO& src, int num_features); // Initialize all the array to zero. void Zero(); // Initializes to zero all elements of the array that do not correspond to // valid image positions. (If a batch of different-sized images are packed // together, then there will be padding pixels.) void ZeroInvalidElements(); // Sets up the array from the given image, using the currently set int_mode_. // If the image width doesn't match the shape, the image is truncated or // padded with noise to match. void FromPix(const StaticShape& shape, const Pix* pix, TRand* randomizer); // Sets up the array from the given set of images, using the currently set // int_mode_. If the image width doesn't match the shape, the images are // truncated or padded with noise to match. void FromPixes(const StaticShape& shape, const std::vector& pixes, TRand* randomizer); // Copies the given pix to *this at the given batch index, stretching and // clipping the pixel values so that [black, black + 2*contrast] maps to the // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int. // This is a 2-d operation in the sense that the output depth is the number // of input channels, the height is the height of the image, and the width // is the width of the image, or truncated/padded with noise if the width // is a fixed size. void Copy2DImage(int batch, Pix* pix, float black, float contrast, TRand* randomizer); // Copies the given pix to *this at the given batch index, as Copy2DImage // above, except that the output depth is the height of the input image, the // output height is 1, and the output width as for Copy2DImage. // The image is thus treated as a 1-d set of vertical pixel strips. void Copy1DGreyImage(int batch, Pix* pix, float black, float contrast, TRand* randomizer); // Helper stores the pixel value in i_ or f_ according to int_mode_. // t: is the index from the StrideMap corresponding to the current // [batch,y,x] position // f: is the index into the depth/channel // pixel: the value of the pixel from the image (in one channel) // black: the pixel value to map to the lowest of the range of *this // contrast: the range of pixel values to stretch to half the range of *this. void SetPixel(int t, int f, int pixel, float black, float contrast); // Converts the array to a Pix. Must be pixDestroyed after use. Pix* ToPix() const; // Prints the first and last num timesteps of the array for each feature. void Print(int num) const; // Returns the timestep width. int Width() const { return int_mode_ ? i_.dim1() : f_.dim1(); } // Returns the number of features. int NumFeatures() const { return int_mode_ ? i_.dim2() : f_.dim2(); } // Accessor to a timestep of the float matrix. float* f(int t) { ASSERT_HOST(!int_mode_); return f_[t]; } const float* f(int t) const { ASSERT_HOST(!int_mode_); return f_[t]; } const inT8* i(int t) const { ASSERT_HOST(int_mode_); return i_[t]; } bool int_mode() const { return int_mode_; } void set_int_mode(bool is_quantized) { int_mode_ = is_quantized; } const StrideMap& stride_map() const { return stride_map_; } void set_stride_map(const StrideMap& map) { stride_map_ = map; } const GENERIC_2D_ARRAY& float_array() const { return f_; } GENERIC_2D_ARRAY* mutable_float_array() { return &f_; } // Copies a single time step from src. void CopyTimeStepFrom(int dest_t, const NetworkIO& src, int src_t); // Copies a part of single time step from src. void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, const NetworkIO& src, int src_t, int src_offset); // Zeroes a single time step. void ZeroTimeStep(int t) { ZeroTimeStepGeneral(t, 0, NumFeatures()); } void ZeroTimeStepGeneral(int t, int offset, int num_features); // Sets the given range to random values. void Randomize(int t, int offset, int num_features, TRand* randomizer); // Helper returns the label and score of the best choice over a range. int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float* rating, float* certainty) const; // Helper returns the rating and certainty of the choice over a range in t. void ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float* rating, float* certainty) const; // Returns the index (label) of the best value at the given timestep, // and if not null, sets the score to the log of the corresponding value. int BestLabel(int t, float* score) const { return BestLabel(t, -1, -1, score); } // Returns the index (label) of the best value at the given timestep, // excluding not_this and not_that, and if not null, sets the score to the // log of the corresponding value. int BestLabel(int t, int not_this, int not_that, float* score) const; // Returns the best start position out of range (into which both start and end // must fit) to obtain the highest cumulative score for the given labels. int PositionOfBestMatch(const GenericVector& labels, int start, int end) const; // Returns the cumulative score of the given labels starting at start, and // using one label per time-step. double ScoreOfLabels(const GenericVector& labels, int start) const; // Helper function sets all the outputs for a single timestep, such that // label has value ok_score, and the other labels share 1 - ok_score. // Assumes float mode. void SetActivations(int t, int label, float ok_score); // Modifies the values, only if needed, so that the given label is // the winner at the given time step t. // Assumes float mode. void EnsureBestLabel(int t, int label); // Helper function converts prob to certainty taking the minimum into account. static float ProbToCertainty(float prob); // Returns true if there is any bad value that is suspiciously like a GT // error. Assuming that *this is the difference(gradient) between target // and forward output, returns true if there is a large negative value // (correcting a very confident output) for which there is no corresponding // positive value in an adjacent timestep for the same feature index. This // allows the box-truthed samples to make fine adjustments to position while // stopping other disagreements of confident output with ground truth. bool AnySuspiciousTruth(float confidence_thr) const; // Reads a single timestep to floats in the range [-1, 1]. void ReadTimeStep(int t, double* output) const; // Adds a single timestep to floats. void AddTimeStep(int t, double* inout) const; // Adds part of a single timestep to floats. void AddTimeStepPart(int t, int offset, int num_features, float* inout) const; // Writes a single timestep from floats in the range [-1, 1]. void WriteTimeStep(int t, const double* input); // Writes a single timestep from floats in the range [-1, 1] writing only // num_features elements of input to (*this)[t], starting at offset. void WriteTimeStepPart(int t, int offset, int num_features, const double* input); // Maxpools a single time step from src. void MaxpoolTimeStep(int dest_t, const NetworkIO& src, int src_t, int* max_line); // Runs maxpool backward, using maxes to index timesteps in *this. void MaxpoolBackward(const NetworkIO& fwd, const GENERIC_2D_ARRAY& maxes); // Returns the min over time of the maxes over features of the outputs. float MinOfMaxes() const; // Returns the min over time. float Max() const { return int_mode_ ? i_.Max() : f_.Max(); } // Computes combined results for a combiner that chooses between an existing // input and itself, with an additional output to indicate the choice. void CombineOutputs(const NetworkIO& base_output, const NetworkIO& combiner_output); // Computes deltas for a combiner that chooses between 2 sets of inputs. void ComputeCombinerDeltas(const NetworkIO& fwd_deltas, const NetworkIO& base_output); // Copies the array checking that the types match. void CopyAll(const NetworkIO& src); // Adds the array to a float array, with scaling to [-1, 1] if the src is int. void AddAllToFloat(const NetworkIO& src); // Subtracts the array from a float array. src must also be float. void SubtractAllFromFloat(const NetworkIO& src); // Copies src to *this, with maxabs normalization to match scale. void CopyWithNormalization(const NetworkIO& src, const NetworkIO& scale); // Multiplies the float data by the given factor. void ScaleFloatBy(float factor) { f_ *= factor; } // Copies src to *this with independent reversal of the y dimension. void CopyWithYReversal(const NetworkIO& src); // Copies src to *this with independent reversal of the x dimension. void CopyWithXReversal(const NetworkIO& src); // Copies src to *this with independent transpose of the x and y dimensions. void CopyWithXYTranspose(const NetworkIO& src); // Copies src to *this, at the given feature_offset, returning the total // feature offset after the copy. Multiple calls will stack outputs from // multiple sources in feature space. int CopyPacking(const NetworkIO& src, int feature_offset); // Opposite of CopyPacking, fills *this with a part of src, starting at // feature_offset, and picking num_features. Resizes *this to match. void CopyUnpacking(const NetworkIO& src, int feature_offset, int num_features); // Transposes the float part of *this into dest. void Transpose(TransposedArray* dest) const; // Clips the content of a single time-step to +/-range. void ClipVector(int t, float range); // Applies Func to timestep t of *this (u) and multiplies the result by v // component-wise, putting the product in *product. // *this and v may be int or float, but must match. The outputs are double. template void FuncMultiply(const NetworkIO& v_io, int t, double* product) { Func f; ASSERT_HOST(!int_mode_); ASSERT_HOST(!v_io.int_mode_); int dim = f_.dim2(); if (int_mode_) { const inT8* u = i_[t]; const inT8* v = v_io.i_[t]; for (int i = 0; i < dim; ++i) { product[i] = f(u[i] / static_cast(MAX_INT8)) * v[i] / static_cast(MAX_INT8); } } else { const float* u = f_[t]; const float* v = v_io.f_[t]; for (int i = 0; i < dim; ++i) { product[i] = f(u[i]) * v[i]; } } } // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, // component-wise, putting the product in *product. // All NetworkIOs are assumed to be float. template void FuncMultiply3(int u_t, const NetworkIO& v_io, int v_t, const double* w, double* product) const { ASSERT_HOST(!int_mode_); ASSERT_HOST(!v_io.int_mode_); Func f; const float* u = f_[u_t]; const float* v = v_io.f_[v_t]; int dim = f_.dim2(); for (int i = 0; i < dim; ++i) { product[i] = f(u[i]) * v[i] * w[i]; } } // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, // component-wise, adding the product to *product. // All NetworkIOs are assumed to be float. template void FuncMultiply3Add(const NetworkIO& v_io, int t, const double* w, double* product) const { ASSERT_HOST(!int_mode_); ASSERT_HOST(!v_io.int_mode_); Func f; const float* u = f_[t]; const float* v = v_io.f_[t]; int dim = f_.dim2(); for (int i = 0; i < dim; ++i) { product[i] += f(u[i]) * v[i] * w[i]; } } // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w, // component-wise, putting the product in product, all at timestep t, except // w, which is a simple array. All NetworkIOs are assumed to be float. template void Func2Multiply3(const NetworkIO& v_io, int t, const double* w, double* product) const { ASSERT_HOST(!int_mode_); ASSERT_HOST(!v_io.int_mode_); Func1 f; Func2 g; const float* u = f_[t]; const float* v = v_io.f_[t]; int dim = f_.dim2(); for (int i = 0; i < dim; ++i) { product[i] = f(u[i]) * g(v[i]) * w[i]; } } private: // Choice of float vs 8 bit int for data. GENERIC_2D_ARRAY f_; GENERIC_2D_ARRAY i_; // Which of f_ and i_ are we actually using. bool int_mode_; // Stride for 2d input data. StrideMap stride_map_; }; } // namespace tesseract. #endif // TESSERACT_LSTM_NETWORKIO_H_