Prepare for using float instead of double for LSTM calculations

The new header file ccutil/tesstypes.h also prepares support
for larger images by introducing a new data type for image
sizes and coordinates (still unused).

FloatToDouble is now a local function.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Stefan Weil 2021-07-22 20:02:48 +02:00
parent c3fb050daa
commit 66b77e6639
27 changed files with 265 additions and 221 deletions


@ -150,10 +150,12 @@ endif
if MARCH_NATIVE_OPT
libtesseract_native_la_CXXFLAGS += -march=native -mtune=native
endif
libtesseract_native_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_native_la_SOURCES = src/arch/dotproduct.cpp
if HAVE_AVX
libtesseract_avx_la_CXXFLAGS = -mavx
libtesseract_avx_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx_la_SOURCES = src/arch/dotproductavx.cpp
libtesseract_la_LIBADD += libtesseract_avx.la
noinst_LTLIBRARIES += libtesseract_avx.la
@ -161,6 +163,7 @@ endif
if HAVE_AVX2
libtesseract_avx2_la_CXXFLAGS = -mavx2
libtesseract_avx2_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_avx2_la_SOURCES = src/arch/intsimdmatrixavx2.cpp
libtesseract_la_LIBADD += libtesseract_avx2.la
noinst_LTLIBRARIES += libtesseract_avx2.la
@ -168,6 +171,7 @@ endif
if HAVE_FMA
libtesseract_fma_la_CXXFLAGS = -mfma
libtesseract_fma_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_fma_la_SOURCES = src/arch/dotproductfma.cpp
libtesseract_la_LIBADD += libtesseract_fma.la
noinst_LTLIBRARIES += libtesseract_fma.la
@ -175,6 +179,7 @@ endif
if HAVE_SSE4_1
libtesseract_sse_la_CXXFLAGS = -msse4.1
libtesseract_sse_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_sse_la_SOURCES = src/arch/dotproductsse.cpp src/arch/intsimdmatrixsse.cpp
libtesseract_la_LIBADD += libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_sse.la
@ -182,6 +187,7 @@ endif
if HAVE_NEON
libtesseract_neon_la_CXXFLAGS = $(NEON_CXXFLAGS)
libtesseract_neon_la_CXXFLAGS += -I$(top_srcdir)/src/ccutil
libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp
libtesseract_la_LIBADD += libtesseract_neon.la
noinst_LTLIBRARIES += libtesseract_neon.la


@ -19,12 +19,12 @@
namespace tesseract {
// Computes and returns the dot product of the two n-vectors u and v.
double DotProductNative(const double *u, const double *v, int n) {
double total = 0.0;
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0;
#if defined(OPENMP_SIMD) || defined(_OPENMP)
#pragma omp simd reduction(+:total)
#endif
for (int k = 0; k < n; ++k) {
for (int k = 0; k < n; k++) {
total += u[k] * v[k];
}
return total;


@ -17,19 +17,21 @@
#ifndef TESSERACT_ARCH_DOTPRODUCT_H_
#define TESSERACT_ARCH_DOTPRODUCT_H_
#include "tesstypes.h"
namespace tesseract {
// Computes and returns the dot product of the n-vectors u and v.
double DotProductNative(const double *u, const double *v, int n);
TFloat DotProductNative(const TFloat *u, const TFloat *v, int n);
// Uses Intel AVX intrinsics to access the SIMD instruction set.
double DotProductAVX(const double *u, const double *v, int n);
TFloat DotProductAVX(const TFloat *u, const TFloat *v, int n);
// Use Intel FMA.
double DotProductFMA(const double *u, const double *v, int n);
TFloat DotProductFMA(const TFloat *u, const TFloat *v, int n);
// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double *u, const double *v, int n);
TFloat DotProductSSE(const TFloat *u, const TFloat *v, int n);
} // namespace tesseract.


@ -76,7 +76,7 @@ void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t>
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w,
const std::vector<double> &scales, const int8_t *u, double *v) {
const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) {
int num_out = w.dim1();
int num_in = w.dim2() - 1;
// Base implementation.
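The hunk stops where the scalar fallback begins. For orientation, a hedged sketch of what that base path computes, consistent with the bias handling and per-row scaling visible in the SSE variant further down (illustrative shape only, not the committed code):

#include <cstdint>
#include <vector>

// Standalone sketch; TFloat is the alias introduced in ccutil/tesstypes.h.
using TFloat = double;

// Plausible shape of the scalar fallback: v = W.u for int8 weights, where u
// conceptually has a trailing 1 to implement the bias (stored as wi[num_in]),
// and each output row is scaled back to floating point by scales[i].
static void MatrixDotVectorRef(int num_out, int num_in,
                               const int8_t *w, // num_out x (num_in + 1), row-major
                               const std::vector<TFloat> &scales,
                               const int8_t *u, TFloat *v) {
  for (int i = 0; i < num_out; ++i) {
    const int8_t *wi = w + i * (num_in + 1);
    int32_t total = 0;
    for (int j = 0; j < num_in; ++j) {
      total += wi[j] * u[j]; // int8 * int8 accumulated in int32
    }
    // Add the bias and correct for integer values, as in the SSE variant below.
    v[i] = (total + wi[num_in] * INT8_MAX) * scales[i];
  }
}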


@ -23,6 +23,8 @@
#include <cstdint>
#include <vector>
#include "tesstypes.h"
namespace tesseract {
template <class T>
@ -78,8 +80,8 @@ struct TESS_API IntSimdMatrix {
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Computes the base C++ implementation.
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<double> &scales,
const int8_t *u, double *v);
static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<TFloat> &scales,
const int8_t *u, TFloat *v);
// Rounds the input up to a multiple of the given factor.
static int Roundup(int input, int factor) {
@ -95,8 +97,8 @@ struct TESS_API IntSimdMatrix {
// RoundInputs above.
// The input will be over-read to the extent of the padding. There are no
// alignment requirements.
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const double *, const int8_t *,
double *);
using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const TFloat *, const int8_t *,
TFloat *);
MatrixDotVectorFunction matrixDotVectorFunction;
// Number of 32 bit outputs held in each register.


@ -19,6 +19,7 @@
#if defined(__ARM_NEON)
# include "intsimdmatrix.h"
# include "tesstypes.h"
# include <algorithm>
# include <cstdint>
@ -52,9 +53,9 @@ constexpr int kNumInputsPerGroup = 8;
// u must be padded out with zeros to
// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
static inline void PartialMatrixDotVector8(const int8_t *__restrict wi,
const double *__restrict scales,
const TFloat *__restrict scales,
const int8_t *__restrict u, int num_in,
double *__restrict v, int num_out) {
TFloat *__restrict v, int num_out) {
// Initialize all the results to 0.
int32x4_t result0123 = {0, 0, 0, 0};
int32x4_t result4567 = {0, 0, 0, 0};
@ -163,8 +164,8 @@ static inline void PartialMatrixDotVector8(const int8_t *__restrict wi,
}
}
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
const int8_t *u, double *v) {
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
const int8_t *u, TFloat *v) {
const int num_out = dim1;
const int num_in = dim2 - 1;
// Each call to a partial_func_ produces group_size outputs, except the
@ -196,7 +197,8 @@ const IntSimdMatrix IntSimdMatrix::intSimdMatrixNEON = {
// Number of 8 bit inputs in the inputs register.
kNumInputsPerRegister,
// Number of inputs in each weight group.
kNumInputsPerGroup};
kNumInputsPerGroup
};
} // namespace tesseract.


@ -69,15 +69,15 @@ static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) {
}
// Computes part of matrix.vector v = Wu. Computes 1 result.
static void PartialMatrixDotVector1(const int8_t *wi, const double *scales, const int8_t *u,
int num_in, double *v) {
double total = IntDotProductSSE(u, wi, num_in);
static void PartialMatrixDotVector1(const int8_t *wi, const TFloat *scales, const int8_t *u,
int num_in, TFloat *v) {
TFloat total = IntDotProductSSE(u, wi, num_in);
// Add in the bias and correct for integer values.
*v = (total + wi[num_in] * INT8_MAX) * *scales;
}
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const double *scales,
const int8_t *u, double *v) {
static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales,
const int8_t *u, TFloat *v) {
const int num_out = dim1;
const int num_in = dim2 - 1;
int output = 0;
@ -99,7 +99,8 @@ const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = {
// Number of 8 bit inputs in the inputs register.
1,
// Number of inputs in each weight group.
1};
1
};
} // namespace tesseract.


@ -93,8 +93,8 @@ bool SIMDDetect::sse_available_;
#endif
#if defined(HAVE_FRAMEWORK_ACCELERATE)
static double DotProductAccelerate(const double* u, const double* v, int n) {
double total = 0.0;
static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
TFloat total = 0;
const int stride = 1;
vDSP_dotprD(u, stride, v, stride, &total, n);
return total;
@ -102,8 +102,8 @@ static double DotProductAccelerate(const double* u, const double* v, int n) {
#endif
// Computes and returns the dot product of the two n-vectors u and v.
static double DotProductGeneric(const double *u, const double *v, int n) {
double total = 0.0;
static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
TFloat total = 0;
for (int k = 0; k < n; ++k) {
total += u[k] * v[k];
}
@ -111,8 +111,8 @@ static double DotProductGeneric(const double *u, const double *v, int n) {
}
// Compute dot product using std::inner_product.
static double DotProductStdInnerProduct(const double *u, const double *v, int n) {
return std::inner_product(u, u + n, v, 0.0);
static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
}
static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {


@ -18,11 +18,12 @@
#define TESSERACT_ARCH_SIMDDETECT_H_
#include <tesseract/export.h>
#include "tesstypes.h"
namespace tesseract {
// Function pointer for best calculation of dot product.
using DotProductFunction = double (*)(const double *, const double *, int);
using DotProductFunction = TFloat (*)(const TFloat *, const TFloat *, int);
extern DotProductFunction DotProduct;
// Architecture detector. Add code here to detect any other architectures for
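DotProduct is a function pointer that SIMD detection points at the best available implementation (native, SSE, AVX, FMA, Accelerate, or std::inner_product, as seen in simddetect.cpp above). A minimal standalone sketch of that dispatch pattern, using hypothetical local names:

using TFloat = double; // stand-in for tesseract::TFloat

// Function-pointer dispatch in the spirit of SIMDDetect: callers always go
// through the DotProduct pointer, and detection code installs the best
// implementation once at startup via SetDotProduct.
using DotProductFunction = TFloat (*)(const TFloat *, const TFloat *, int);

static TFloat DotProductGenericSketch(const TFloat *u, const TFloat *v, int n) {
  TFloat total = 0;
  for (int k = 0; k < n; ++k) {
    total += u[k] * v[k];
  }
  return total;
}

// In the real code this is the extern DotProduct declared above; here it is a
// local stand-in initialized to the portable fallback.
static DotProductFunction DotProduct = DotProductGenericSketch;

static void SetDotProduct(DotProductFunction f) {
  DotProduct = f; // the real SetDotProduct also records an IntSimdMatrix*
}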

src/ccutil/tesstypes.h (new file, 32 lines)

@ -0,0 +1,32 @@
///////////////////////////////////////////////////////////////////////
// File: tesstypes.h
// Description: Simple data types used by Tesseract code.
// Author: Stefan Weil
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_TESSTYPES_H
#define TESSERACT_TESSTYPES_H
#include <cstdint> // for int16_t
namespace tesseract {
// Image dimensions (width and height, coordinates).
using TDimension = int16_t;
// Floating point data type used for LSTM calculations.
using TFloat = double;
}
#endif // TESSERACT_TESSTYPES_H
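Downstream code now spells its math type as TFloat, so switching the whole LSTM engine to single precision later becomes a one-line change in this header. A hedged sketch of how that switch and the still-unused TDimension might be consumed; the FAST_FLOAT macro and the TBox struct are assumptions for illustration, not part of this commit:

#include <cstdint>

namespace tesseract {

// Hypothetical future form of the switch; in this commit TFloat is simply
// 'using TFloat = double;'. The FAST_FLOAT macro name is an assumption.
#if defined(FAST_FLOAT)
using TFloat = float;
#else
using TFloat = double;
#endif

// TDimension (int16_t here) is still unused; widening it to int32_t later is
// how "support for larger images" could be enabled without touching callers.
using TDimension = int16_t;

// Purely illustrative consumer of the aliases.
struct TBox {
  TDimension left, top, right, bottom;
};

inline TFloat Area(const TBox &b) {
  return static_cast<TFloat>(b.right - b.left) * (b.bottom - b.top);
}

} // namespace tesseract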


@ -156,7 +156,7 @@ void FullyConnected::Forward(bool debug, const NetworkIO &input,
// Thread-local pointer to temporary storage.
int thread_id = 0;
#endif
double *temp_line = temp_lines[thread_id];
TFloat *temp_line = temp_lines[thread_id];
if (input.int_mode()) {
ForwardTimeStep(input.i(t), t, temp_line);
} else {
@ -200,7 +200,7 @@ void FullyConnected::SetupForward(const NetworkIO &input, const TransposedArray
}
}
void FullyConnected::ForwardTimeStep(int t, double *output_line) {
void FullyConnected::ForwardTimeStep(int t, TFloat *output_line) {
if (type_ == NT_TANH) {
FuncInplace<GFunc>(no_, output_line);
} else if (type_ == NT_LOGISTIC) {
@ -218,7 +218,7 @@ void FullyConnected::ForwardTimeStep(int t, double *output_line) {
}
}
void FullyConnected::ForwardTimeStep(const double *d_input, int t, double *output_line) {
void FullyConnected::ForwardTimeStep(const TFloat *d_input, int t, TFloat *output_line) {
// input is copied to source_ line-by-line for cache coherency.
if (IsTraining() && external_source_ == nullptr) {
source_t_.WriteStrided(t, d_input);
@ -227,7 +227,7 @@ void FullyConnected::ForwardTimeStep(const double *d_input, int t, double *outpu
ForwardTimeStep(t, output_line);
}
void FullyConnected::ForwardTimeStep(const int8_t *i_input, int t, double *output_line) {
void FullyConnected::ForwardTimeStep(const int8_t *i_input, int t, TFloat *output_line) {
// input is copied to source_ line-by-line for cache coherency.
weights_.MatrixDotVector(i_input, output_line);
ForwardTimeStep(t, output_line);
@ -265,11 +265,11 @@ bool FullyConnected::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkSc
for (int t = 0; t < width; ++t) {
int thread_id = 0;
#endif
double *backprop = nullptr;
TFloat *backprop = nullptr;
if (needs_to_backprop_) {
backprop = temp_backprops[thread_id];
}
double *curr_errors = errors[thread_id];
TFloat *curr_errors = errors[thread_id];
BackwardTimeStep(fwd_deltas, t, curr_errors, errors_t.get(), backprop);
if (backprop != nullptr) {
back_deltas->WriteTimeStep(t, backprop);
@ -287,8 +287,8 @@ bool FullyConnected::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkSc
return false; // No point going further back.
}
void FullyConnected::BackwardTimeStep(const NetworkIO &fwd_deltas, int t, double *curr_errors,
TransposedArray *errors_t, double *backprop) {
void FullyConnected::BackwardTimeStep(const NetworkIO &fwd_deltas, int t, TFloat *curr_errors,
TransposedArray *errors_t, TFloat *backprop) {
if (type_ == NT_TANH) {
acts_.FuncMultiply<GPrime>(fwd_deltas, t, curr_errors);
} else if (type_ == NT_LOGISTIC) {
@ -328,7 +328,7 @@ void FullyConnected::Update(float learning_rate, float momentum, float adam_beta
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void FullyConnected::CountAlternators(const Network &other, double *same, double *changed) const {
void FullyConnected::CountAlternators(const Network &other, TFloat *same, TFloat *changed) const {
ASSERT_HOST(other.type() == type_);
const auto *fc = static_cast<const FullyConnected *>(&other);
weights_.CountAlternators(fc->weights_, same, changed);


@ -20,6 +20,7 @@
#include "network.h"
#include "networkscratch.h"
#include "tesstypes.h"
namespace tesseract {
@ -90,17 +91,17 @@ public:
NetworkScratch *scratch, NetworkIO *output) override;
// Components of Forward so FullyConnected can be reused inside LSTM.
void SetupForward(const NetworkIO &input, const TransposedArray *input_transpose);
void ForwardTimeStep(int t, double *output_line);
void ForwardTimeStep(const double *d_input, int t, double *output_line);
void ForwardTimeStep(const int8_t *i_input, int t, double *output_line);
void ForwardTimeStep(int t, TFloat *output_line);
void ForwardTimeStep(const TFloat *d_input, int t, TFloat *output_line);
void ForwardTimeStep(const int8_t *i_input, int t, TFloat *output_line);
// Runs backward propagation of errors on the deltas line.
// See Network for a detailed discussion of the arguments.
bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch,
NetworkIO *back_deltas) override;
// Components of Backward so FullyConnected can be reused inside LSTM.
void BackwardTimeStep(const NetworkIO &fwd_deltas, int t, double *curr_errors,
TransposedArray *errors_t, double *backprop);
void BackwardTimeStep(const NetworkIO &fwd_deltas, int t, TFloat *curr_errors,
TransposedArray *errors_t, TFloat *backprop);
void FinishBackward(const TransposedArray &errors_t);
// Updates the weights using the given learning rate, momentum and adam_beta.
@ -109,7 +110,7 @@ public:
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void CountAlternators(const Network &other, double *same, double *changed) const override;
void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const override;
protected:
// Weight arrays of size [no, ni + 1].


@ -1,7 +1,7 @@
// Generated code with lookup tables
#include "functions.h"
namespace tesseract {
const double TanhTable[] = {
const TFloat TanhTable[] = {
0.0,
0.00390623013190634,
0.007812341058161014,
@ -4099,7 +4099,7 @@ const double TanhTable[] = {
0.9999999999999742,
0.9999999999999745,
};
const double LogisticTable[] = {
const TFloat LogisticTable[] = {
0.5,
0.5009765612582384,
0.5019531150659532,


@ -19,6 +19,7 @@
#define TESSERACT_LSTM_FUNCTIONS_H_
#include "helpers.h"
#include "tesstypes.h"
// Setting this to 1 or more causes massive dumps of debug data: weights,
// updates, internal calculations etc, and reduces the number of test iterations
@ -33,14 +34,14 @@ namespace tesseract {
// Size of static tables.
constexpr int kTableSize = 4096;
// Scale factor for float arg to int index.
constexpr double kScaleFactor = 256.0;
constexpr TFloat kScaleFactor = 256.0;
// Generated lookup tables.
extern const double TanhTable[];
extern const double LogisticTable[];
extern const TFloat TanhTable[];
extern const TFloat LogisticTable[];
// Non-linearity (sigmoid) functions with cache tables and clipping.
inline double Tanh(double x) {
inline TFloat Tanh(TFloat x) {
if (x < 0.0) {
return -Tanh(-x);
}
@ -49,13 +50,13 @@ inline double Tanh(double x) {
if (index >= (kTableSize - 1)) {
return 1.0;
}
double tanh_i0 = TanhTable[index];
double tanh_i1 = TanhTable[index + 1];
TFloat tanh_i0 = TanhTable[index];
TFloat tanh_i1 = TanhTable[index + 1];
// Linear interpolation.
return tanh_i0 + (tanh_i1 - tanh_i0) * (x - index);
}
inline double Logistic(double x) {
inline TFloat Logistic(TFloat x) {
if (x < 0.0) {
return 1.0 - Logistic(-x);
}
@ -64,25 +65,25 @@ inline double Logistic(double x) {
if (index >= (kTableSize - 1)) {
return 1.0;
}
double l0 = LogisticTable[index];
double l1 = LogisticTable[index + 1];
TFloat l0 = LogisticTable[index];
TFloat l1 = LogisticTable[index + 1];
// Linear interpolation.
return l0 + (l1 - l0) * (x - index);
}
// Non-linearity (sigmoid) functions and their derivatives.
struct FFunc {
inline double operator()(double x) const {
inline TFloat operator()(TFloat x) const {
return Logistic(x);
}
};
struct FPrime {
inline double operator()(double y) const {
inline TFloat operator()(TFloat y) const {
return y * (1.0 - y);
}
};
struct ClipFFunc {
inline double operator()(double x) const {
inline TFloat operator()(TFloat x) const {
if (x <= 0.0) {
return 0.0;
}
@ -93,12 +94,12 @@ struct ClipFFunc {
}
};
struct ClipFPrime {
inline double operator()(double y) const {
inline TFloat operator()(TFloat y) const {
return 0.0 < y && y < 1.0 ? 1.0 : 0.0;
}
};
struct Relu {
inline double operator()(double x) const {
inline TFloat operator()(TFloat x) const {
if (x <= 0.0) {
return 0.0;
}
@ -106,22 +107,22 @@ struct Relu {
}
};
struct ReluPrime {
inline double operator()(double y) const {
inline TFloat operator()(TFloat y) const {
return 0.0 < y ? 1.0 : 0.0;
}
};
struct GFunc {
inline double operator()(double x) const {
inline TFloat operator()(TFloat x) const {
return Tanh(x);
}
};
struct GPrime {
inline double operator()(double y) const {
inline TFloat operator()(TFloat y) const {
return 1.0 - y * y;
}
};
struct ClipGFunc {
inline double operator()(double x) const {
inline TFloat operator()(TFloat x) const {
if (x <= -1.0) {
return -1.0;
}
@ -132,35 +133,35 @@ struct ClipGFunc {
}
};
struct ClipGPrime {
inline double operator()(double y) const {
inline TFloat operator()(TFloat y) const {
return -1.0 < y && y < 1.0 ? 1.0 : 0.0;
}
};
struct HFunc {
inline double operator()(double x) const {
inline TFloat operator()(TFloat x) const {
return Tanh(x);
}
};
struct HPrime {
inline double operator()(double y) const {
double u = Tanh(y);
return 1.0 - u * u;
inline TFloat operator()(TFloat y) const {
TFloat u = Tanh(y);
return 1 - u * u;
}
};
struct UnityFunc {
inline double operator()(double /*x*/) const {
inline TFloat operator()(TFloat /*x*/) const {
return 1.0;
}
};
struct IdentityFunc {
inline double operator()(double x) const {
inline TFloat operator()(TFloat x) const {
return x;
}
};
// Applies Func in-place to inout, of size n.
template <class Func>
inline void FuncInplace(int n, double *inout) {
inline void FuncInplace(int n, TFloat *inout) {
Func f;
for (int i = 0; i < n; ++i) {
inout[i] = f(inout[i]);
@ -169,7 +170,7 @@ inline void FuncInplace(int n, double *inout) {
// Applies Func to u and multiplies the result by v component-wise,
// putting the product in out, all of size n.
template <class Func>
inline void FuncMultiply(const double *u, const double *v, int n, double *out) {
inline void FuncMultiply(const TFloat *u, const TFloat *v, int n, TFloat *out) {
Func f;
for (int i = 0; i < n; ++i) {
out[i] = f(u[i]) * v[i];
@ -206,34 +207,34 @@ inline void SoftmaxInPlace(int n, T *inout) {
}
// Copies n values of the given src vector to dest.
inline void CopyVector(int n, const double *src, double *dest) {
inline void CopyVector(int n, const TFloat *src, TFloat *dest) {
memcpy(dest, src, n * sizeof(dest[0]));
}
// Adds n values of the given src vector to dest.
inline void AccumulateVector(int n, const double *src, double *dest) {
inline void AccumulateVector(int n, const TFloat *src, TFloat *dest) {
for (int i = 0; i < n; ++i) {
dest[i] += src[i];
}
}
// Multiplies n values of inout in-place element-wise by the given src vector.
inline void MultiplyVectorsInPlace(int n, const double *src, double *inout) {
inline void MultiplyVectorsInPlace(int n, const TFloat *src, TFloat *inout) {
for (int i = 0; i < n; ++i) {
inout[i] *= src[i];
}
}
// Multiplies n values of u by v, element-wise, accumulating to out.
inline void MultiplyAccumulate(int n, const double *u, const double *v, double *out) {
inline void MultiplyAccumulate(int n, const TFloat *u, const TFloat *v, TFloat *out) {
for (int i = 0; i < n; i++) {
out[i] += u[i] * v[i];
}
}
// Sums the given 5 n-vectors putting the result into sum.
inline void SumVectors(int n, const double *v1, const double *v2, const double *v3,
const double *v4, const double *v5, double *sum) {
inline void SumVectors(int n, const TFloat *v1, const TFloat *v2, const TFloat *v3,
const TFloat *v4, const TFloat *v5, TFloat *sum) {
for (int i = 0; i < n; ++i) {
sum[i] = v1[i] + v2[i] + v3[i] + v4[i] + v5[i];
}
@ -255,12 +256,12 @@ inline void ClipVector(int n, T lower, T upper, T *vec) {
// Converts the given n-vector to a binary encoding of the maximum value,
// encoded as vector of nf binary values.
inline void CodeInBinary(int n, int nf, double *vec) {
inline void CodeInBinary(int n, int nf, TFloat *vec) {
if (nf <= 0 || n < nf) {
return;
}
int index = 0;
double best_score = vec[0];
TFloat best_score = vec[0];
for (int i = 1; i < n; ++i) {
if (vec[i] > best_score) {
best_score = vec[i];
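The Tanh and Logistic helpers above avoid std::tanh/exp by indexing a 4096-entry table (kTableSize) and interpolating linearly; kScaleFactor = 256 maps the argument to a fractional index, so the tables cover roughly [0, 16). The scaling step itself lies outside the shown hunk, so the following is a self-contained sketch of the same scheme with the table built at startup rather than the generated one in functions.cpp:

#include <cmath>
#include <vector>

using TFloat = double; // stand-in for tesseract::TFloat

constexpr int kTableSize = 4096;
constexpr TFloat kScaleFactor = 256.0;

// Table-based tanh with linear interpolation, mirroring the lookup scheme in
// functions.h; the real TanhTable is generated source in functions.cpp.
inline TFloat TableTanh(TFloat x) {
  static const std::vector<TFloat> table = [] {
    std::vector<TFloat> t(kTableSize);
    for (int i = 0; i < kTableSize; ++i) {
      t[i] = std::tanh(i / kScaleFactor);
    }
    return t;
  }();
  if (x < 0.0) {
    return -TableTanh(-x); // odd symmetry, as in the original
  }
  x *= kScaleFactor; // map the argument to a fractional table index
  const int index = static_cast<int>(x);
  if (index >= kTableSize - 1) {
    return 1.0; // saturate beyond the table range
  }
  const TFloat t0 = table[index];
  const TFloat t1 = table[index + 1];
  return t0 + (t1 - t0) * (x - index); // linear interpolation
}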


@ -68,9 +68,9 @@ namespace tesseract {
// Max absolute value of state_. It is reasonably high to enable the state
// to count things.
const double kStateClip = 100.0;
const TFloat kStateClip = 100.0;
// Max absolute value of gate_errors (the gradients).
const double kErrClip = 1.0f;
const TFloat kErrClip = 1.0f;
// Calculate ceil(log2(n)).
static inline uint32_t ceil_log2(uint32_t n) {
@ -312,9 +312,9 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
// Single timestep buffers for the current/recurrent output and state.
NetworkScratch::FloatVec curr_state, curr_output;
curr_state.Init(ns_, scratch);
ZeroVector<double>(ns_, curr_state);
ZeroVector<TFloat>(ns_, curr_state);
curr_output.Init(ns_, scratch);
ZeroVector<double>(ns_, curr_output);
ZeroVector<TFloat>(ns_, curr_output);
// Rotating buffers of width buf_width allow storage of the state and output
// for the other dimension, used only when working in true 2D mode. The width
// is enough to hold an entire strip of the major direction.
@ -325,9 +325,9 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
outputs.resize(buf_width);
for (int i = 0; i < buf_width; ++i) {
states[i].Init(ns_, scratch);
ZeroVector<double>(ns_, states[i]);
ZeroVector<TFloat>(ns_, states[i]);
outputs[i].Init(ns_, scratch);
ZeroVector<double>(ns_, outputs[i]);
ZeroVector<TFloat>(ns_, outputs[i]);
}
}
// Used only if a softmax LSTM.
@ -335,7 +335,7 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
NetworkScratch::IO int_output;
if (softmax_ != nullptr) {
softmax_output.Init(no_, scratch);
ZeroVector<double>(no_, softmax_output);
ZeroVector<TFloat>(no_, softmax_output);
int rounded_softmax_inputs = gate_weights_[CI].RoundInputs(ns_);
if (input.int_mode()) {
int_output.Resize2d(true, 1, rounded_softmax_inputs, scratch);
@ -429,7 +429,7 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
int8_t *which_fg_col = which_fg_[t];
memset(which_fg_col, 1, ns_ * sizeof(which_fg_col[0]));
if (valid_2d) {
const double *stepped_state = states[mod_t];
const TFloat *stepped_state = states[mod_t];
for (int i = 0; i < ns_; ++i) {
if (temp_lines[GF1][i] < temp_lines[GFS][i]) {
curr_state[i] = temp_lines[GFS][i] * stepped_state[i];
@ -440,7 +440,7 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
}
MultiplyAccumulate(ns_, temp_lines[CI], temp_lines[GI], curr_state);
// Clip curr_state to a sane range.
ClipVector<double>(ns_, -kStateClip, kStateClip, curr_state);
ClipVector<TFloat>(ns_, -kStateClip, kStateClip, curr_state);
if (IsTraining()) {
// Save the gate node values.
node_values_[CI].WriteTimeStep(t, temp_lines[CI]);
@ -483,8 +483,8 @@ void LSTM::Forward(bool debug, const NetworkIO &input, const TransposedArray *in
// Always zero the states at the end of every row, but only for the major
// direction. The 2-D state remains intact.
if (src_index.IsLast(FD_WIDTH)) {
ZeroVector<double>(ns_, curr_state);
ZeroVector<double>(ns_, curr_output);
ZeroVector<TFloat>(ns_, curr_state);
ZeroVector<TFloat>(ns_, curr_output);
}
} while (src_index.Increment());
#if DEBUG_DETAIL > 0
@ -520,8 +520,8 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
NetworkScratch::FloatVec curr_stateerr, curr_sourceerr;
curr_stateerr.Init(ns_, scratch);
curr_sourceerr.Init(na_, scratch);
ZeroVector<double>(ns_, curr_stateerr);
ZeroVector<double>(na_, curr_sourceerr);
ZeroVector<TFloat>(ns_, curr_stateerr);
ZeroVector<TFloat>(na_, curr_sourceerr);
// Errors in the gates.
NetworkScratch::FloatVec gate_errors[WT_COUNT];
for (auto &gate_error : gate_errors) {
@ -537,8 +537,8 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
for (int t = 0; t < buf_width; ++t) {
stateerr[t].Init(ns_, scratch);
sourceerr[t].Init(na_, scratch);
ZeroVector<double>(ns_, stateerr[t]);
ZeroVector<double>(na_, sourceerr[t]);
ZeroVector<TFloat>(ns_, stateerr[t]);
ZeroVector<TFloat>(na_, sourceerr[t]);
}
}
// Parallel-generated sourceerr from each of the gates.
@ -559,7 +559,7 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
softmax_errors.Init(no_, scratch);
softmax_errors_t.Init(no_, width, scratch);
}
double state_clip = Is2D() ? 9.0 : 4.0;
TFloat state_clip = Is2D() ? 9.0 : 4.0;
#if DEBUG_DETAIL > 1
tprintf("fwd_deltas:%s\n", name_.c_str());
fwd_deltas.Print(10);
@ -594,8 +594,8 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
int mod_t = Modulo(t, buf_width); // Current timestep.
// Zero the state in the major direction only at the end of every row.
if (at_last_x) {
ZeroVector<double>(na_, curr_sourceerr);
ZeroVector<double>(ns_, curr_stateerr);
ZeroVector<TFloat>(na_, curr_sourceerr);
ZeroVector<TFloat>(ns_, curr_stateerr);
}
// Setup the outputerr.
if (type_ == NT_LSTM_SUMMARY) {
@ -603,7 +603,7 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
fwd_deltas.ReadTimeStep(src_index.t(), outputerr);
src_index.Decrement();
} else {
ZeroVector<double>(ns_, outputerr);
ZeroVector<TFloat>(ns_, outputerr);
}
} else if (softmax_ == nullptr) {
fwd_deltas.ReadTimeStep(t, outputerr);
@ -631,7 +631,7 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
}
if (down_pos >= 0) {
const float *right_node_gfs = node_values_[GFS].f(down_pos);
const double *right_stateerr = stateerr[mod_t];
const TFloat *right_stateerr = stateerr[mod_t];
for (int i = 0; i < ns_; ++i) {
if (which_fg_[down_pos][i] == 2) {
curr_stateerr[i] += right_stateerr[i] * right_node_gfs[i];
@ -641,7 +641,7 @@ bool LSTM::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scr
}
state_.FuncMultiply3Add<HPrime>(node_values_[GO], t, outputerr, curr_stateerr);
// Clip stateerr_ to a sane range.
ClipVector<double>(ns_, -state_clip, state_clip, curr_stateerr);
ClipVector<TFloat>(ns_, -state_clip, state_clip, curr_stateerr);
#if DEBUG_DETAIL > 1
if (t + 10 > width) {
tprintf("t=%d, stateerr=", t);
@ -758,7 +758,7 @@ void LSTM::Update(float learning_rate, float momentum, float adam_beta, int num_
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void LSTM::CountAlternators(const Network &other, double *same, double *changed) const {
void LSTM::CountAlternators(const Network &other, TFloat *same, TFloat *changed) const {
ASSERT_HOST(other.type() == type_);
const LSTM *lstm = static_cast<const LSTM *>(&other);
for (int w = 0; w < WT_COUNT; ++w) {


@ -109,7 +109,7 @@ public:
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void CountAlternators(const Network &other, double *same, double *changed) const override;
void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const override;
// Prints the weights for debug purposes.
void PrintW();
// Prints the weight deltas for debug purposes.


@ -321,7 +321,7 @@ Network *Network::CreateFromFile(TFile *fp) {
}
// Returns a random number in [-range, range].
double Network::Random(double range) {
TFloat Network::Random(TFloat range) {
ASSERT_HOST(randomizer_ != nullptr);
return randomizer_->SignedRand(range);
}


@ -235,7 +235,7 @@ public:
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
virtual void CountAlternators(const Network &other, double *same, double *changed) const {}
virtual void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const {}
// Reads from the given file. Returns nullptr in case of error.
// Determines the type of the serialized class and calls its DeSerialize
@ -287,7 +287,7 @@ public:
protected:
// Returns a random number in [-range, range].
double Random(double range);
TFloat Random(TFloat range);
protected:
NetworkType type_; // Type of the derived network class.


@ -529,9 +529,9 @@ int NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, in
int length = labels.size();
int last_start = end - length;
int best_start = -1;
double best_score = 0.0;
TFloat best_score = 0;
for (int s = start; s <= last_start; ++s) {
double score = ScoreOfLabels(labels, s);
TFloat score = ScoreOfLabels(labels, s);
if (score > best_score || best_start < 0) {
best_score = score;
best_start = s;
@ -542,9 +542,9 @@ int NetworkIO::PositionOfBestMatch(const std::vector<int> &labels, int start, in
// Returns the cumulative score of the given labels starting at start, and
// using one label per time-step.
double NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const {
TFloat NetworkIO::ScoreOfLabels(const std::vector<int> &labels, int start) const {
int length = labels.size();
double score = 0.0;
TFloat score = 0;
for (int i = 0; i < length; ++i) {
score += f_(start + i, labels[i]);
}
@ -615,27 +615,27 @@ bool NetworkIO::AnySuspiciousTruth(float confidence_thr) const {
}
// Reads a single timestep to floats in the range [-1, 1].
void NetworkIO::ReadTimeStep(int t, double *output) const {
void NetworkIO::ReadTimeStep(int t, TFloat *output) const {
if (int_mode_) {
const int8_t *line = i_[t];
for (int i = 0; i < i_.dim2(); ++i) {
output[i] = static_cast<double>(line[i]) / INT8_MAX;
output[i] = static_cast<TFloat>(line[i]) / INT8_MAX;
}
} else {
const float *line = f_[t];
for (int i = 0; i < f_.dim2(); ++i) {
output[i] = static_cast<double>(line[i]);
output[i] = static_cast<TFloat>(line[i]);
}
}
}
// Adds a single timestep to floats.
void NetworkIO::AddTimeStep(int t, double *inout) const {
void NetworkIO::AddTimeStep(int t, TFloat *inout) const {
int num_features = NumFeatures();
if (int_mode_) {
const int8_t *line = i_[t];
for (int i = 0; i < num_features; ++i) {
inout[i] += static_cast<double>(line[i]) / INT8_MAX;
inout[i] += static_cast<TFloat>(line[i]) / INT8_MAX;
}
} else {
const float *line = f_[t];
@ -661,13 +661,13 @@ void NetworkIO::AddTimeStepPart(int t, int offset, int num_features, float *inou
}
// Writes a single timestep from floats in the range [-1, 1].
void NetworkIO::WriteTimeStep(int t, const double *input) {
void NetworkIO::WriteTimeStep(int t, const TFloat *input) {
WriteTimeStepPart(t, 0, NumFeatures(), input);
}
// Writes a single timestep from floats in the range [-1, 1] writing only
// num_features elements of input to (*this)[t], starting at offset.
void NetworkIO::WriteTimeStepPart(int t, int offset, int num_features, const double *input) {
void NetworkIO::WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input) {
if (int_mode_) {
int8_t *line = i_[t] + offset;
for (int i = 0; i < num_features; ++i) {


@ -172,7 +172,7 @@ public:
int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const;
// Returns the cumulative score of the given labels starting at start, and
// using one label per time-step.
double ScoreOfLabels(const std::vector<int> &labels, int start) const;
TFloat ScoreOfLabels(const std::vector<int> &labels, int start) const;
// Helper function sets all the outputs for a single timestep, such that
// label has value ok_score, and the other labels share 1 - ok_score.
// Assumes float mode.
@ -193,16 +193,16 @@ public:
bool AnySuspiciousTruth(float confidence_thr) const;
// Reads a single timestep to floats in the range [-1, 1].
void ReadTimeStep(int t, double *output) const;
void ReadTimeStep(int t, TFloat *output) const;
// Adds a single timestep to floats.
void AddTimeStep(int t, double *inout) const;
void AddTimeStep(int t, TFloat *inout) const;
// Adds part of a single timestep to floats.
void AddTimeStepPart(int t, int offset, int num_features, float *inout) const;
// Writes a single timestep from floats in the range [-1, 1].
void WriteTimeStep(int t, const double *input);
void WriteTimeStep(int t, const TFloat *input);
// Writes a single timestep from floats in the range [-1, 1] writing only
// num_features elements of input to (*this)[t], starting at offset.
void WriteTimeStepPart(int t, int offset, int num_features, const double *input);
void WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input);
// Maxpools a single time step from src.
void MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line);
// Runs maxpool backward, using maxes to index timesteps in *this.
@ -253,9 +253,9 @@ public:
// Applies Func to timestep t of *this (u) and multiplies the result by v
// component-wise, putting the product in *product.
// *this and v may be int or float, but must match. The outputs are double.
// *this and v may be int or float, but must match. The outputs are TFloat.
template <class Func>
void FuncMultiply(const NetworkIO &v_io, int t, double *product) {
void FuncMultiply(const NetworkIO &v_io, int t, TFloat *product) {
Func f;
ASSERT_HOST(!int_mode_);
ASSERT_HOST(!v_io.int_mode_);
@ -264,7 +264,7 @@ public:
const int8_t *u = i_[t];
const int8_t *v = v_io.i_[t];
for (int i = 0; i < dim; ++i) {
product[i] = f(u[i] / static_cast<double>(INT8_MAX)) * v[i] / static_cast<double>(INT8_MAX);
product[i] = f(u[i] / static_cast<TFloat>(INT8_MAX)) * v[i] / INT8_MAX;
}
} else {
const float *u = f_[t];
@ -278,8 +278,8 @@ public:
// component-wise, putting the product in *product.
// All NetworkIOs are assumed to be float.
template <class Func>
void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const double *w,
double *product) const {
void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const TFloat *w,
TFloat *product) const {
ASSERT_HOST(!int_mode_);
ASSERT_HOST(!v_io.int_mode_);
Func f;
@ -294,7 +294,7 @@ public:
// component-wise, adding the product to *product.
// All NetworkIOs are assumed to be float.
template <class Func>
void FuncMultiply3Add(const NetworkIO &v_io, int t, const double *w, double *product) const {
void FuncMultiply3Add(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const {
ASSERT_HOST(!int_mode_);
ASSERT_HOST(!v_io.int_mode_);
Func f;
@ -309,7 +309,7 @@ public:
// component-wise, putting the product in product, all at timestep t, except
// w, which is a simple array. All NetworkIOs are assumed to be float.
template <class Func1, class Func2>
void Func2Multiply3(const NetworkIO &v_io, int t, const double *w, double *product) const {
void Func2Multiply3(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const {
ASSERT_HOST(!int_mode_);
ASSERT_HOST(!v_io.int_mode_);
Func1 f;


@ -156,25 +156,25 @@ public:
}
// Use the cast operator instead of operator[] so the FloatVec can be used
// as a double* argument to a function call.
operator double *() const {
// as a TFloat* argument to a function call.
operator TFloat *() const {
return data_;
}
double *get() {
TFloat *get() {
return data_;
}
private:
// Vector borrowed from the scratch space. Use Return to free it.
std::vector<double> *vec_;
std::vector<TFloat> *vec_;
// Short-cut pointer to the underlying array.
double *data_;
TFloat *data_;
// The source scratch_space_. Borrowed pointer, used to free the
// vector. Don't delete!
NetworkScratch *scratch_space_;
}; // class FloatVec
// Class that acts like a 2-D array of double, yet actually uses space
// Class that acts like a 2-D array of TFloat, yet actually uses space
// from the source NetworkScratch, and knows how to unstack the borrowed
// array on destruction.
class GradientStore {
@ -270,7 +270,7 @@ private:
// deleted until the NetworkScratch is deleted.
Stack<NetworkIO> int_stack_;
Stack<NetworkIO> float_stack_;
Stack<std::vector<double>> vec_stack_;
Stack<std::vector<TFloat>> vec_stack_;
Stack<TransposedArray> array_stack_;
};


@ -255,7 +255,7 @@ void Plumbing::Update(float learning_rate, float momentum, float adam_beta, int
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void Plumbing::CountAlternators(const Network &other, double *same, double *changed) const {
void Plumbing::CountAlternators(const Network &other, TFloat *same, TFloat *changed) const {
ASSERT_HOST(other.type() == type_);
const auto *plumbing = static_cast<const Plumbing *>(&other);
ASSERT_HOST(plumbing->stack_.size() == stack_.size());


@ -143,7 +143,7 @@ public:
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void CountAlternators(const Network &other, double *same, double *changed) const override;
void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const override;
protected:
// The networks.


@ -26,7 +26,7 @@
namespace tesseract {
#if defined(ANDROID)
static inline double log2(double n) {
static inline TFloat log2(TFloat n) {
return log(n) / log(2.0);
}
#endif // ANDROID
@ -34,7 +34,22 @@ static inline double log2(double n) {
// Number of iterations after which the correction effectively becomes unity.
const int kAdamCorrectionIterations = 200000;
// Epsilon in Adam to prevent division by zero.
const double kAdamEpsilon = 1e-8;
const TFloat kAdamEpsilon = 1e-8;
// Utility function converts an array of float to the corresponding array
// of double.
static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<double> &dst) {
const auto dim1 = src.dim1();
const auto dim2 = src.dim2();
dst.ResizeNoInit(dim1, dim2);
for (int i = 0; i < dim1; ++i) {
const auto *src_i = src[i];
auto *dst_i = dst[i];
for (int j = 0; j < dim2; ++j) {
dst_i[j] = static_cast<double>(src_i[j]);
}
}
}
// Computes matrix.vector v = Wu.
// u is of size W.dim2() - add_bias_fwd and the output v is of size
@ -44,13 +59,13 @@ const double kAdamEpsilon = 1e-8;
// If skip_bias_back, we are actually performing the backwards product on a
// transposed matrix, so we need to drop the v output corresponding to the last
// element in dim1.
static inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double> &w, bool add_bias_fwd,
bool skip_bias_back, const double *u, double *v) {
static inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<TFloat> &w, bool add_bias_fwd,
bool skip_bias_back, const TFloat *u, TFloat *v) {
int num_results = w.dim1() - skip_bias_back;
int extent = w.dim2() - add_bias_fwd;
for (int i = 0; i < num_results; ++i) {
const double *wi = w[i];
double total = DotProduct(wi, u, extent);
const TFloat *wi = w[i];
TFloat total = DotProduct(wi, u, extent);
if (add_bias_fwd) {
total += wi[extent]; // The bias value.
}
@ -58,8 +73,8 @@ static inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double> &w, bo
}
}
// Copies the whole input transposed, converted to double, into *this.
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<double> &input) {
// Copies the whole input transposed, converted to TFloat, into *this.
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<TFloat> &input) {
int width = input.dim1();
int num_features = input.dim2();
ResizeNoInit(num_features, width);
@ -97,25 +112,25 @@ int WeightMatrix::InitWeightsFloat(int no, int ni, bool use_adam, float weight_r
// for all outputs with negative code_map entries. Returns the new number of
// weights.
int WeightMatrix::RemapOutputs(const std::vector<int> &code_map) {
GENERIC_2D_ARRAY<double> old_wf(wf_);
GENERIC_2D_ARRAY<TFloat> old_wf(wf_);
int old_no = wf_.dim1();
int new_no = code_map.size();
int ni = wf_.dim2();
std::vector<double> means(ni, 0.0);
std::vector<TFloat> means(ni, 0.0);
for (int c = 0; c < old_no; ++c) {
const double *weights = wf_[c];
const TFloat *weights = wf_[c];
for (int i = 0; i < ni; ++i) {
means[i] += weights[i];
}
}
for (double &mean : means) {
for (auto &mean : means) {
mean /= old_no;
}
wf_.Resize(new_no, ni, 0.0);
InitBackward();
for (int dest = 0; dest < new_no; ++dest) {
int src = code_map[dest];
const double *src_data = src >= 0 ? old_wf[src] : means.data();
const TFloat *src_data = src >= 0 ? old_wf[src] : means.data();
memcpy(wf_[dest], src_data, ni * sizeof(*src_data));
}
return ni * new_no;
@ -126,23 +141,23 @@ int WeightMatrix::RemapOutputs(const std::vector<int> &code_map) {
// Compute the max absolute value of the weight set.
// Scale so the max absolute value becomes INT8_MAX.
// Round to integer.
// Store a multiplicative scale factor (as a double) that will reproduce
// Store a multiplicative scale factor (as a TFloat) that will reproduce
// the original value, subject to rounding errors.
void WeightMatrix::ConvertToInt() {
wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());
scales_.reserve(wi_.dim1());
int dim2 = wi_.dim2();
for (int t = 0; t < wi_.dim1(); ++t) {
double *f_line = wf_[t];
TFloat *f_line = wf_[t];
int8_t *i_line = wi_[t];
double max_abs = 0.0;
TFloat max_abs = 0;
for (int f = 0; f < dim2; ++f) {
double abs_val = fabs(f_line[f]);
TFloat abs_val = fabs(f_line[f]);
if (abs_val > max_abs) {
max_abs = abs_val;
}
}
double scale = max_abs / INT8_MAX;
TFloat scale = max_abs / INT8_MAX;
scales_.push_back(scale / INT8_MAX);
if (scale == 0.0) {
scale = 1.0;
@ -291,14 +306,14 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
if (!float_array.DeSerialize(fp)) {
return false;
}
FloatToDouble(float_array, &wf_);
FloatToDouble(float_array, wf_);
}
if (training) {
InitBackward();
if (!float_array.DeSerialize(fp)) {
return false;
}
FloatToDouble(float_array, &updates_);
FloatToDouble(float_array, updates_);
// Errs was only used in int training, which is now dead.
if (!float_array.DeSerialize(fp)) {
return false;
@ -312,12 +327,12 @@ bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void WeightMatrix::MatrixDotVector(const double *u, double *v) const {
void WeightMatrix::MatrixDotVector(const TFloat *u, TFloat *v) const {
assert(!int_mode_);
MatrixDotVectorInternal(wf_, true, false, u, v);
}
void WeightMatrix::MatrixDotVector(const int8_t *u, double *v) const {
void WeightMatrix::MatrixDotVector(const int8_t *u, TFloat *v) const {
assert(int_mode_);
if (IntSimdMatrix::intSimdMatrix) {
IntSimdMatrix::intSimdMatrix->matrixDotVectorFunction(wi_.dim1(), wi_.dim2(), &shaped_w_[0],
@ -329,11 +344,11 @@ void WeightMatrix::MatrixDotVector(const int8_t *u, double *v) const {
// MatrixDotVector for peep weights, MultiplyAccumulate adds the
// component-wise products of *this[0] and v to inout.
void WeightMatrix::MultiplyAccumulate(const double *v, double *inout) {
void WeightMatrix::MultiplyAccumulate(const TFloat *v, TFloat *inout) {
assert(!int_mode_);
assert(wf_.dim1() == 1);
int n = wf_.dim2();
const double *u = wf_[0];
const TFloat *u = wf_[0];
for (int i = 0; i < n; ++i) {
inout[i] += u[i] * v[i];
}
@ -343,7 +358,7 @@ void WeightMatrix::MultiplyAccumulate(const double *v, double *inout) {
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void WeightMatrix::VectorDotMatrix(const double *u, double *v) const {
void WeightMatrix::VectorDotMatrix(const TFloat *u, TFloat *v) const {
assert(!int_mode_);
MatrixDotVectorInternal(wf_t_, false, true, u, v);
}
@ -367,13 +382,13 @@ void WeightMatrix::SumOuterTransposed(const TransposedArray &u, const Transposed
# pragma omp parallel for num_threads(4) if (in_parallel)
#endif
for (int i = 0; i < num_outputs; ++i) {
double *dwi = dw_[i];
const double *ui = u[i];
TFloat *dwi = dw_[i];
const TFloat *ui = u[i];
for (int j = 0; j < num_inputs; ++j) {
dwi[j] = DotProduct(ui, v[j], num_samples);
}
// The last element of v is missing, presumed 1.0f.
double total = 0.0;
TFloat total = 0;
for (int k = 0; k < num_samples; ++k) {
total += ui[k];
}
@ -419,17 +434,17 @@ void WeightMatrix::AddDeltas(const WeightMatrix &other) {
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void WeightMatrix::CountAlternators(const WeightMatrix &other, double *same,
double *changed) const {
void WeightMatrix::CountAlternators(const WeightMatrix &other, TFloat *same,
TFloat *changed) const {
int num_outputs = updates_.dim1();
int num_inputs = updates_.dim2();
assert(num_outputs == other.updates_.dim1());
assert(num_inputs == other.updates_.dim2());
for (int i = 0; i < num_outputs; ++i) {
const double *this_i = updates_[i];
const double *other_i = other.updates_[i];
const TFloat *this_i = updates_[i];
const TFloat *other_i = other.updates_[i];
for (int j = 0; j < num_inputs; ++j) {
double product = this_i[j] * other_i[j];
TFloat product = this_i[j] * other_i[j];
if (product < 0.0) {
*changed -= product;
} else {
@ -442,10 +457,10 @@ void WeightMatrix::CountAlternators(const WeightMatrix &other, double *same,
// Helper computes an integer histogram bucket for a weight and adds it
// to the histogram.
const int kHistogramBuckets = 16;
static void HistogramWeight(double weight, STATS *histogram) {
static void HistogramWeight(TFloat weight, STATS *histogram) {
int bucket = kHistogramBuckets - 1;
if (weight != 0.0) {
double logval = -log2(fabs(weight));
TFloat logval = -log2(fabs(weight));
bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);
}
histogram->add(bucket, 1);
@ -470,20 +485,4 @@ void WeightMatrix::Debug2D(const char *msg) {
histogram.print();
}
// Utility function converts an array of float to the corresponding array
// of double.
/* static */
void WeightMatrix::FloatToDouble(const GENERIC_2D_ARRAY<float> &wf, GENERIC_2D_ARRAY<double> *wd) {
int dim1 = wf.dim1();
int dim2 = wf.dim2();
wd->ResizeNoInit(dim1, dim2);
for (int i = 0; i < dim1; ++i) {
const float *wfi = wf[i];
double *wdi = (*wd)[i];
for (int j = 0; j < dim2; ++j) {
wdi[j] = static_cast<double>(wfi[j]);
}
}
}
} // namespace tesseract.
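ConvertToInt above stores scales_[t] = max_abs / INT8_MAX / INT8_MAX per row: one factor undoes the weight quantization and the other undoes the input scaling, since NetworkIO treats an int8 value i as the float i / INT8_MAX. A rough round-trip sketch of that arithmetic, using illustrative helper names rather than the WeightMatrix API:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

using TFloat = double; // stand-in for tesseract::TFloat

// One quantized weight row in the spirit of ConvertToInt: map max |w| to
// INT8_MAX and remember a per-row factor that undoes both the weight and the
// input quantization in a single multiply.
struct QuantizedRow {
  std::vector<int8_t> wi;
  TFloat scale; // corresponds to one entry of scales_
};

static QuantizedRow Quantize(const std::vector<TFloat> &wf) {
  TFloat max_abs = 0;
  for (TFloat w : wf) {
    max_abs = std::max(max_abs, std::fabs(w));
  }
  TFloat scale = max_abs / INT8_MAX;
  if (scale == 0) {
    scale = 1;
  }
  QuantizedRow q;
  q.scale = scale / INT8_MAX; // extra /INT8_MAX undoes the input scaling
  for (TFloat w : wf) {
    q.wi.push_back(static_cast<int8_t>(std::round(w / scale)));
  }
  return q;
}

// Integer dot product followed by one rescale approximates the float product
// of wf with inputs uf in [-1, 1], stored as ui = round(uf * INT8_MAX).
static TFloat DotQuantized(const QuantizedRow &q, const std::vector<int8_t> &ui) {
  int32_t total = 0;
  const int n = static_cast<int>(ui.size());
  for (int k = 0; k < n; ++k) {
    total += q.wi[k] * ui[k];
  }
  return total * q.scale; // ~= sum_k wf[k] * uf[k]
}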


@ -22,17 +22,18 @@
#include <vector>
#include "intsimdmatrix.h"
#include "matrix.h"
#include "tesstypes.h"
#include "tprintf.h"
namespace tesseract {
// Convenience instantiation of GENERIC_2D_ARRAY<double> with additional
// Convenience instantiation of GENERIC_2D_ARRAY<TFloat> with additional
// operations to write a strided vector, so the transposed form of the input
// is memory-contiguous.
class TransposedArray : public GENERIC_2D_ARRAY<double> {
class TransposedArray : public GENERIC_2D_ARRAY<TFloat> {
public:
// Copies the whole input transposed, converted to double, into *this.
void Transpose(const GENERIC_2D_ARRAY<double> &input);
// Copies the whole input transposed, converted to TFloat, into *this.
void Transpose(const GENERIC_2D_ARRAY<TFloat> &input);
// Writes a vector of data representing a timestep (gradients or sources).
// The data is assumed to be of size1 in size (the strided dimension).
~TransposedArray() override;
@ -107,11 +108,11 @@ public:
return int_mode_ ? wi_.dim1() : wf_.dim1();
}
// Provides one set of weights. Only used by peep weight maxpool.
const double *GetWeights(int index) const {
const TFloat *GetWeights(int index) const {
return wf_[index];
}
// Provides access to the deltas (dw_).
double GetDW(int i, int j) const {
TFloat GetDW(int i, int j) const {
return dw_(i, j);
}
@ -132,16 +133,16 @@ public:
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void MatrixDotVector(const double *u, double *v) const;
void MatrixDotVector(const int8_t *u, double *v) const;
void MatrixDotVector(const TFloat *u, TFloat *v) const;
void MatrixDotVector(const int8_t *u, TFloat *v) const;
// MatrixDotVector for peep weights, MultiplyAccumulate adds the
// component-wise products of *this[0] and v to inout.
void MultiplyAccumulate(const double *v, double *inout);
void MultiplyAccumulate(const TFloat *v, TFloat *inout);
// Computes vector.matrix v = uW.
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void VectorDotMatrix(const double *u, double *v) const;
void VectorDotMatrix(const TFloat *u, TFloat *v) const;
// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements
// from u and v, starting with u[i][offset] and v[j][offset].
// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.
@ -155,17 +156,13 @@ public:
// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void CountAlternators(const WeightMatrix &other, double *same, double *changed) const;
void CountAlternators(const WeightMatrix &other, TFloat *same, TFloat *changed) const;
void Debug2D(const char *msg);
// Utility function converts an array of float to the corresponding array
// of double.
static void FloatToDouble(const GENERIC_2D_ARRAY<float> &wf, GENERIC_2D_ARRAY<double> *wd);
private:
// Choice between float and 8 bit int implementations.
GENERIC_2D_ARRAY<double> wf_;
GENERIC_2D_ARRAY<TFloat> wf_;
GENERIC_2D_ARRAY<int8_t> wi_;
// Transposed copy of wf_, used only for Backward, and set with each Update.
TransposedArray wf_t_;
@ -175,14 +172,14 @@ private:
bool use_adam_;
// If we are using wi_, then scales_ is a factor to restore the row product
// with a vector to the correct range.
std::vector<double> scales_;
std::vector<TFloat> scales_;
// Weight deltas. dw_ is the new delta, and updates_ the momentum-decaying
// amount to be added to wf_/wi_.
GENERIC_2D_ARRAY<double> dw_;
GENERIC_2D_ARRAY<double> updates_;
GENERIC_2D_ARRAY<TFloat> dw_;
GENERIC_2D_ARRAY<TFloat> updates_;
// Iff use_adam_, the sum of squares of dw_. The number of samples is
// given to Update(). Serialized iff use_adam_.
GENERIC_2D_ARRAY<double> dw_sq_sum_;
GENERIC_2D_ARRAY<TFloat> dw_sq_sum_;
// The weights matrix reorganized in whatever way suits this instance.
std::vector<int8_t> shaped_w_;
};


@ -661,7 +661,7 @@ void LSTMTrainer::ReduceLearningRates(LSTMTrainer *samples_trainer, std::string
// Even if it looks like all weights should remain the same, an adjustment
// will be made to guarantee a different result when reverting to an old best.
// Returns the number of layer learning rates that were reduced.
int LSTMTrainer::ReduceLayerLearningRates(double factor, int num_samples,
int LSTMTrainer::ReduceLayerLearningRates(TFloat factor, int num_samples,
LSTMTrainer *samples_trainer) {
enum WhichWay {
LR_DOWN, // Learning rate will go down by factor.
@ -671,13 +671,13 @@ int LSTMTrainer::ReduceLayerLearningRates(double factor, int num_samples,
std::vector<std::string> layers = EnumerateLayers();
int num_layers = layers.size();
std::vector<int> num_weights(num_layers);
std::vector<double> bad_sums[LR_COUNT];
std::vector<double> ok_sums[LR_COUNT];
std::vector<TFloat> bad_sums[LR_COUNT];
std::vector<TFloat> ok_sums[LR_COUNT];
for (int i = 0; i < LR_COUNT; ++i) {
bad_sums[i].resize(num_layers, 0.0);
ok_sums[i].resize(num_layers, 0.0);
}
double momentum_factor = 1.0 / (1.0 - momentum_);
auto momentum_factor = 1 / (1 - momentum_);
std::vector<char> orig_trainer;
samples_trainer->SaveTrainingDump(LIGHT, *this, &orig_trainer);
for (int i = 0; i < num_layers; ++i) {
@ -689,7 +689,7 @@ int LSTMTrainer::ReduceLayerLearningRates(double factor, int num_samples,
// Which way will we modify the learning rate?
for (int ww = 0; ww < LR_COUNT; ++ww) {
// Transfer momentum to learning rate and adjust by the ww factor.
float ww_factor = momentum_factor;
auto ww_factor = momentum_factor;
if (ww == LR_DOWN) {
ww_factor *= factor;
}
@ -748,10 +748,10 @@ int LSTMTrainer::ReduceLayerLearningRates(double factor, int num_samples,
}
Network *layer = GetLayer(layers[i]);
float lr = GetLayerLearningRate(layers[i]);
double total_down = bad_sums[LR_DOWN][i] + ok_sums[LR_DOWN][i];
double total_same = bad_sums[LR_SAME][i] + ok_sums[LR_SAME][i];
double frac_down = bad_sums[LR_DOWN][i] / total_down;
double frac_same = bad_sums[LR_SAME][i] / total_same;
TFloat total_down = bad_sums[LR_DOWN][i] + ok_sums[LR_DOWN][i];
TFloat total_same = bad_sums[LR_SAME][i] + ok_sums[LR_SAME][i];
TFloat frac_down = bad_sums[LR_DOWN][i] / total_down;
TFloat frac_same = bad_sums[LR_SAME][i] / total_same;
tprintf("Layer %d=%s: lr %g->%g%%, lr %g->%g%%", i, layer->name().c_str(), lr * factor,
100.0 * frac_down, lr, 100.0 * frac_same);
if (frac_down < frac_same * kImprovementFraction) {
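The comparison at the end of the hunk decides whether a layer's learning rate is reduced, based on how often weight updates flipped sign under the two trial rates. A toy calculation of that test; the 0.85 used for kImprovementFraction is an assumed value for illustration, not taken from this diff:

#include <cstdio>

using TFloat = double; // stand-in for tesseract::TFloat

int main() {
  // Per-layer sums produced by CountAlternators under the two trials:
  // "bad" = weight updates that changed direction, "ok" = kept direction.
  TFloat bad_down = 30, ok_down = 70; // trial with the reduced learning rate
  TFloat bad_same = 45, ok_same = 55; // trial with the unchanged learning rate

  TFloat frac_down = bad_down / (bad_down + ok_down); // 0.30
  TFloat frac_same = bad_same / (bad_same + ok_same); // 0.45

  const TFloat kImprovementFraction = 0.85; // assumed value for illustration
  if (frac_down < frac_same * kImprovementFraction) { // 0.30 < 0.3825
    std::printf("reduce this layer's learning rate\n");
  }
  return 0;
}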


@ -237,7 +237,7 @@ public:
// Even if it looks like all weights should remain the same, an adjustment
// will be made to guarantee a different result when reverting to an old best.
// Returns the number of layer learning rates that were reduced.
int ReduceLayerLearningRates(double factor, int num_samples, LSTMTrainer *samples_trainer);
int ReduceLayerLearningRates(TFloat factor, int num_samples, LSTMTrainer *samples_trainer);
// Converts the string to integer class labels, with appropriate null_char_s
// in between if not in SimpleTextOutput mode. Returns false on failure.