mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-18 11:28:51 +08:00
342 lines
15 KiB
C++
342 lines
15 KiB
C++
///////////////////////////////////////////////////////////////////////
|
|
// File: networkio.h
|
|
// Description: Network input/output data, allowing float/int implementations.
|
|
// Author: Ray Smith
|
|
// Created: Tue Jun 17 08:43:11 PST 2014
|
|
//
|
|
// (C) Copyright 2014, Google Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef TESSERACT_LSTM_NETWORKIO_H_
|
|
#define TESSERACT_LSTM_NETWORKIO_H_
|
|
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <vector>
|
|
|
|
#include "genericvector.h"
|
|
#include "helpers.h"
|
|
#include "static_shape.h"
|
|
#include "stridemap.h"
|
|
#include "weightmatrix.h"
|
|
|
|
struct Pix;
|
|
|
|
namespace tesseract {
|
|
|
|
// Class to contain all the input/output of a network, allowing for fixed or
|
|
// variable-strided 2d to 1d mapping, and float or inT8 values. Provides
|
|
// enough calculating functions to hide the detail of the implementation.
|
|
class NetworkIO {
|
|
public:
|
|
NetworkIO() : int_mode_(false) {}
|
|
// Resizes the array (and stride), avoiding realloc if possible, to the given
|
|
// size from various size specs:
|
|
// Same stride size, but given number of features.
|
|
void Resize(const NetworkIO& src, int num_features) {
|
|
ResizeToMap(src.int_mode(), src.stride_map(), num_features);
|
|
}
|
|
// Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim.
|
|
void Resize2d(bool int_mode, int width, int num_features);
|
|
// Resizes forcing a float representation with the stridemap of src and the
|
|
// given number of features.
|
|
void ResizeFloat(const NetworkIO& src, int num_features) {
|
|
ResizeToMap(false, src.stride_map(), num_features);
|
|
}
|
|
// Resizes to a specific stride_map.
|
|
void ResizeToMap(bool int_mode, const StrideMap& stride_map,
|
|
int num_features);
|
|
// Shrinks image size by x_scale,y_scale, and use given number of features.
|
|
void ResizeScaled(const NetworkIO& src, int x_scale, int y_scale,
|
|
int num_features);
|
|
// Resizes to just 1 x-coord, whatever the input.
|
|
void ResizeXTo1(const NetworkIO& src, int num_features);
|
|
// Initialize all the array to zero.
|
|
void Zero();
|
|
// Initializes to zero all elements of the array that do not correspond to
|
|
// valid image positions. (If a batch of different-sized images are packed
|
|
// together, then there will be padding pixels.)
|
|
void ZeroInvalidElements();
|
|
// Sets up the array from the given image, using the currently set int_mode_.
|
|
// If the image width doesn't match the shape, the image is truncated or
|
|
// padded with noise to match.
|
|
void FromPix(const StaticShape& shape, const Pix* pix, TRand* randomizer);
|
|
// Sets up the array from the given set of images, using the currently set
|
|
// int_mode_. If the image width doesn't match the shape, the images are
|
|
// truncated or padded with noise to match.
|
|
void FromPixes(const StaticShape& shape, const std::vector<const Pix*>& pixes,
|
|
TRand* randomizer);
|
|
// Copies the given pix to *this at the given batch index, stretching and
|
|
// clipping the pixel values so that [black, black + 2*contrast] maps to the
|
|
// dynamic range of *this, ie [-1,1] for a float and (-127,127) for int.
|
|
// This is a 2-d operation in the sense that the output depth is the number
|
|
// of input channels, the height is the height of the image, and the width
|
|
// is the width of the image, or truncated/padded with noise if the width
|
|
// is a fixed size.
|
|
void Copy2DImage(int batch, Pix* pix, float black, float contrast,
|
|
TRand* randomizer);
|
|
// Copies the given pix to *this at the given batch index, as Copy2DImage
|
|
// above, except that the output depth is the height of the input image, the
|
|
// output height is 1, and the output width as for Copy2DImage.
|
|
// The image is thus treated as a 1-d set of vertical pixel strips.
|
|
void Copy1DGreyImage(int batch, Pix* pix, float black, float contrast,
|
|
TRand* randomizer);
|
|
// Helper stores the pixel value in i_ or f_ according to int_mode_.
|
|
// t: is the index from the StrideMap corresponding to the current
|
|
// [batch,y,x] position
|
|
// f: is the index into the depth/channel
|
|
// pixel: the value of the pixel from the image (in one channel)
|
|
// black: the pixel value to map to the lowest of the range of *this
|
|
// contrast: the range of pixel values to stretch to half the range of *this.
|
|
void SetPixel(int t, int f, int pixel, float black, float contrast);
|
|
// Converts the array to a Pix. Must be pixDestroyed after use.
|
|
Pix* ToPix() const;
|
|
// Prints the first and last num timesteps of the array for each feature.
|
|
void Print(int num) const;
|
|
|
|
// Returns the timestep width.
|
|
int Width() const {
|
|
return int_mode_ ? i_.dim1() : f_.dim1();
|
|
}
|
|
// Returns the number of features.
|
|
int NumFeatures() const {
|
|
return int_mode_ ? i_.dim2() : f_.dim2();
|
|
}
|
|
// Accessor to a timestep of the float matrix.
|
|
float* f(int t) {
|
|
ASSERT_HOST(!int_mode_);
|
|
return f_[t];
|
|
}
|
|
const float* f(int t) const {
|
|
ASSERT_HOST(!int_mode_);
|
|
return f_[t];
|
|
}
|
|
const inT8* i(int t) const {
|
|
ASSERT_HOST(int_mode_);
|
|
return i_[t];
|
|
}
|
|
bool int_mode() const {
|
|
return int_mode_;
|
|
}
|
|
void set_int_mode(bool is_quantized) {
|
|
int_mode_ = is_quantized;
|
|
}
|
|
const StrideMap& stride_map() const {
|
|
return stride_map_;
|
|
}
|
|
void set_stride_map(const StrideMap& map) {
|
|
stride_map_ = map;
|
|
}
|
|
const GENERIC_2D_ARRAY<float>& float_array() const { return f_; }
|
|
GENERIC_2D_ARRAY<float>* mutable_float_array() { return &f_; }
|
|
|
|
// Copies a single time step from src.
|
|
void CopyTimeStepFrom(int dest_t, const NetworkIO& src, int src_t);
|
|
// Copies a part of single time step from src.
|
|
void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features,
|
|
const NetworkIO& src, int src_t, int src_offset);
|
|
// Zeroes a single time step.
|
|
void ZeroTimeStep(int t) { ZeroTimeStepGeneral(t, 0, NumFeatures()); }
|
|
void ZeroTimeStepGeneral(int t, int offset, int num_features);
|
|
// Sets the given range to random values.
|
|
void Randomize(int t, int offset, int num_features, TRand* randomizer);
|
|
|
|
// Helper returns the label and score of the best choice over a range.
|
|
int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch,
|
|
float* rating, float* certainty) const;
|
|
// Helper returns the rating and certainty of the choice over a range in t.
|
|
void ScoresOverRange(int t_start, int t_end, int choice, int null_ch,
|
|
float* rating, float* certainty) const;
|
|
// Returns the index (label) of the best value at the given timestep,
|
|
// and if not null, sets the score to the log of the corresponding value.
|
|
int BestLabel(int t, float* score) const {
|
|
return BestLabel(t, -1, -1, score);
|
|
}
|
|
// Returns the index (label) of the best value at the given timestep,
|
|
// excluding not_this and not_that, and if not null, sets the score to the
|
|
// log of the corresponding value.
|
|
int BestLabel(int t, int not_this, int not_that, float* score) const;
|
|
// Returns the best start position out of range (into which both start and end
|
|
// must fit) to obtain the highest cumulative score for the given labels.
|
|
int PositionOfBestMatch(const GenericVector<int>& labels, int start,
|
|
int end) const;
|
|
// Returns the cumulative score of the given labels starting at start, and
|
|
// using one label per time-step.
|
|
double ScoreOfLabels(const GenericVector<int>& labels, int start) const;
|
|
// Helper function sets all the outputs for a single timestep, such that
|
|
// label has value ok_score, and the other labels share 1 - ok_score.
|
|
// Assumes float mode.
|
|
void SetActivations(int t, int label, float ok_score);
|
|
// Modifies the values, only if needed, so that the given label is
|
|
// the winner at the given time step t.
|
|
// Assumes float mode.
|
|
void EnsureBestLabel(int t, int label);
|
|
// Helper function converts prob to certainty taking the minimum into account.
|
|
static float ProbToCertainty(float prob);
|
|
// Returns true if there is any bad value that is suspiciously like a GT
|
|
// error. Assuming that *this is the difference(gradient) between target
|
|
// and forward output, returns true if there is a large negative value
|
|
// (correcting a very confident output) for which there is no corresponding
|
|
// positive value in an adjacent timestep for the same feature index. This
|
|
// allows the box-truthed samples to make fine adjustments to position while
|
|
// stopping other disagreements of confident output with ground truth.
|
|
bool AnySuspiciousTruth(float confidence_thr) const;
|
|
|
|
// Reads a single timestep to floats in the range [-1, 1].
|
|
void ReadTimeStep(int t, double* output) const;
|
|
// Adds a single timestep to floats.
|
|
void AddTimeStep(int t, double* inout) const;
|
|
// Adds part of a single timestep to floats.
|
|
void AddTimeStepPart(int t, int offset, int num_features, float* inout) const;
|
|
// Writes a single timestep from floats in the range [-1, 1].
|
|
void WriteTimeStep(int t, const double* input);
|
|
// Writes a single timestep from floats in the range [-1, 1] writing only
|
|
// num_features elements of input to (*this)[t], starting at offset.
|
|
void WriteTimeStepPart(int t, int offset, int num_features,
|
|
const double* input);
|
|
// Maxpools a single time step from src.
|
|
void MaxpoolTimeStep(int dest_t, const NetworkIO& src, int src_t,
|
|
int* max_line);
|
|
// Runs maxpool backward, using maxes to index timesteps in *this.
|
|
void MaxpoolBackward(const NetworkIO& fwd,
|
|
const GENERIC_2D_ARRAY<int>& maxes);
|
|
// Returns the min over time of the maxes over features of the outputs.
|
|
float MinOfMaxes() const;
|
|
// Returns the min over time.
|
|
float Max() const { return int_mode_ ? i_.Max() : f_.Max(); }
|
|
// Computes combined results for a combiner that chooses between an existing
|
|
// input and itself, with an additional output to indicate the choice.
|
|
void CombineOutputs(const NetworkIO& base_output,
|
|
const NetworkIO& combiner_output);
|
|
// Computes deltas for a combiner that chooses between 2 sets of inputs.
|
|
void ComputeCombinerDeltas(const NetworkIO& fwd_deltas,
|
|
const NetworkIO& base_output);
|
|
|
|
// Copies the array checking that the types match.
|
|
void CopyAll(const NetworkIO& src);
|
|
// Adds the array to a float array, with scaling to [-1, 1] if the src is int.
|
|
void AddAllToFloat(const NetworkIO& src);
|
|
// Subtracts the array from a float array. src must also be float.
|
|
void SubtractAllFromFloat(const NetworkIO& src);
|
|
|
|
// Copies src to *this, with maxabs normalization to match scale.
|
|
void CopyWithNormalization(const NetworkIO& src, const NetworkIO& scale);
|
|
// Multiplies the float data by the given factor.
|
|
void ScaleFloatBy(float factor) { f_ *= factor; }
|
|
// Copies src to *this with independent reversal of the y dimension.
|
|
void CopyWithYReversal(const NetworkIO& src);
|
|
// Copies src to *this with independent reversal of the x dimension.
|
|
void CopyWithXReversal(const NetworkIO& src);
|
|
// Copies src to *this with independent transpose of the x and y dimensions.
|
|
void CopyWithXYTranspose(const NetworkIO& src);
|
|
// Copies src to *this, at the given feature_offset, returning the total
|
|
// feature offset after the copy. Multiple calls will stack outputs from
|
|
// multiple sources in feature space.
|
|
int CopyPacking(const NetworkIO& src, int feature_offset);
|
|
// Opposite of CopyPacking, fills *this with a part of src, starting at
|
|
// feature_offset, and picking num_features. Resizes *this to match.
|
|
void CopyUnpacking(const NetworkIO& src, int feature_offset,
|
|
int num_features);
|
|
// Transposes the float part of *this into dest.
|
|
void Transpose(TransposedArray* dest) const;
|
|
|
|
// Clips the content of a single time-step to +/-range.
|
|
void ClipVector(int t, float range);
|
|
|
|
// Applies Func to timestep t of *this (u) and multiplies the result by v
|
|
// component-wise, putting the product in *product.
|
|
// *this and v may be int or float, but must match. The outputs are double.
|
|
template <class Func>
|
|
void FuncMultiply(const NetworkIO& v_io, int t, double* product) {
|
|
Func f;
|
|
ASSERT_HOST(!int_mode_);
|
|
ASSERT_HOST(!v_io.int_mode_);
|
|
int dim = f_.dim2();
|
|
if (int_mode_) {
|
|
const inT8* u = i_[t];
|
|
const inT8* v = v_io.i_[t];
|
|
for (int i = 0; i < dim; ++i) {
|
|
product[i] = f(u[i] / static_cast<double>(MAX_INT8)) * v[i] /
|
|
static_cast<double>(MAX_INT8);
|
|
}
|
|
} else {
|
|
const float* u = f_[t];
|
|
const float* v = v_io.f_[t];
|
|
for (int i = 0; i < dim; ++i) {
|
|
product[i] = f(u[i]) * v[i];
|
|
}
|
|
}
|
|
}
|
|
// Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,
|
|
// component-wise, putting the product in *product.
|
|
// All NetworkIOs are assumed to be float.
|
|
template <class Func>
|
|
void FuncMultiply3(int u_t, const NetworkIO& v_io, int v_t, const double* w,
|
|
double* product) const {
|
|
ASSERT_HOST(!int_mode_);
|
|
ASSERT_HOST(!v_io.int_mode_);
|
|
Func f;
|
|
const float* u = f_[u_t];
|
|
const float* v = v_io.f_[v_t];
|
|
int dim = f_.dim2();
|
|
for (int i = 0; i < dim; ++i) {
|
|
product[i] = f(u[i]) * v[i] * w[i];
|
|
}
|
|
}
|
|
// Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w,
|
|
// component-wise, adding the product to *product.
|
|
// All NetworkIOs are assumed to be float.
|
|
template <class Func>
|
|
void FuncMultiply3Add(const NetworkIO& v_io, int t, const double* w,
|
|
double* product) const {
|
|
ASSERT_HOST(!int_mode_);
|
|
ASSERT_HOST(!v_io.int_mode_);
|
|
Func f;
|
|
const float* u = f_[t];
|
|
const float* v = v_io.f_[t];
|
|
int dim = f_.dim2();
|
|
for (int i = 0; i < dim; ++i) {
|
|
product[i] += f(u[i]) * v[i] * w[i];
|
|
}
|
|
}
|
|
// Applies Func1 to *this (u), Func2 to v, and multiplies the result by w,
|
|
// component-wise, putting the product in product, all at timestep t, except
|
|
// w, which is a simple array. All NetworkIOs are assumed to be float.
|
|
template <class Func1, class Func2>
|
|
void Func2Multiply3(const NetworkIO& v_io, int t, const double* w,
|
|
double* product) const {
|
|
ASSERT_HOST(!int_mode_);
|
|
ASSERT_HOST(!v_io.int_mode_);
|
|
Func1 f;
|
|
Func2 g;
|
|
const float* u = f_[t];
|
|
const float* v = v_io.f_[t];
|
|
int dim = f_.dim2();
|
|
for (int i = 0; i < dim; ++i) {
|
|
product[i] = f(u[i]) * g(v[i]) * w[i];
|
|
}
|
|
}
|
|
|
|
private:
|
|
// Choice of float vs 8 bit int for data.
|
|
GENERIC_2D_ARRAY<float> f_;
|
|
GENERIC_2D_ARRAY<inT8> i_;
|
|
// Which of f_ and i_ are we actually using.
|
|
bool int_mode_;
|
|
// Stride for 2d input data.
|
|
StrideMap stride_map_;
|
|
};
|
|
|
|
} // namespace tesseract.
|
|
|
|
#endif // TESSERACT_LSTM_NETWORKIO_H_
|