From 92e2ad04710bfd1caad8d38c494bcbc3b63315cc Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Fri, 6 Jul 2018 12:08:13 +0200
Subject: [PATCH 1/5] Fix CID 1164703 (Untrusted value as argument)

Wrong file data could give a large value for the number of vector elements
resulting in very large memory allocations.

Limit the allowed data range to UINT16_MAX (65535) elements
which hopefully should be sufficient for all use cases.

Changing the data type of the related member variables from int to
uint32_t allowed removing several type casts.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/ccutil/genericvector.h       |  6 ++++--
 src/classify/adaptmatch.cpp      |  2 +-
 src/classify/mastertrainer.cpp   |  8 +++-----
 src/classify/picofeat.cpp        |  4 ++--
 src/classify/shapeclassifier.cpp |  4 ++--
 src/classify/trainingsample.cpp  | 28 ++++++++++++++--------------
 src/classify/trainingsample.h    |  8 ++++----
 7 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h
index 64c5c9e4..2c0c7a34 100644
--- a/src/ccutil/genericvector.h
+++ b/src/ccutil/genericvector.h
@@ -564,12 +564,14 @@ class PointerVector : public GenericVector<T*> {
   // Also needs T::T(), as new T is used in this function.
   // Returns false in case of error.
   bool DeSerialize(bool swap, FILE* fp) {
-    int32_t reserved;
+    uint32_t reserved;
     if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false;
     if (swap) Reverse32(&reserved);
+    // Arbitrarily limit the number of elements to protect against bad data.
+    if (reserved > UINT16_MAX) return false;
     GenericVector<T*>::reserve(reserved);
     truncate(0);
-    for (int i = 0; i < reserved; ++i) {
+    for (uint32_t i = 0; i < reserved; ++i) {
       int8_t non_null;
       if (fread(&non_null, sizeof(non_null), 1, fp) != 1) return false;
       T* item = nullptr;
diff --git a/src/classify/adaptmatch.cpp b/src/classify/adaptmatch.cpp
index 074612ba..21bd07a0 100644
--- a/src/classify/adaptmatch.cpp
+++ b/src/classify/adaptmatch.cpp
@@ -1339,7 +1339,7 @@ int Classify::CharNormTrainingSample(bool pruner_only,
   ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS();
   adapt_results->Initialize();
   // Compute the bounding box of the features.
-  int num_features = sample.num_features();
+  uint32_t num_features = sample.num_features();
   // Only the top and bottom of the blob_box are used by MasterMatcher, so
   // fabricate right and left using top and bottom.
   TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
diff --git a/src/classify/mastertrainer.cpp b/src/classify/mastertrainer.cpp
index 90ecd12b..57e0fe11 100644
--- a/src/classify/mastertrainer.cpp
+++ b/src/classify/mastertrainer.cpp
@@ -1,5 +1,3 @@
-// Copyright 2010 Google Inc. All Rights Reserved.
-// Author: rays@google.com (Ray Smith)
 ///////////////////////////////////////////////////////////////////////
 // File:        mastertrainer.cpp
 // Description: Trainer to build the MasterClassifier.
@@ -552,8 +550,8 @@ CLUSTERER* MasterTrainer::SetupForClustering(
   int sample_id = 0;
   for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
     const TrainingSample* sample = sample_ptrs[i];
-    int num_features = sample->num_micro_features();
-    for (int f = 0; f < num_features; ++f)
+    uint32_t num_features = sample->num_micro_features();
+    for (uint32_t f = 0; f < num_features; ++f)
       MakeSample(clusterer, sample->micro_features()[f], sample_id);
     ++sample_id;
   }
@@ -706,7 +704,7 @@ void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
   if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
     const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
                                                                class_id2);
-    for (int f = 0; f < sample->num_features(); ++f) {
+    for (uint32_t f = 0; f < sample->num_features(); ++f) {
       RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
     }
   }
diff --git a/src/classify/picofeat.cpp b/src/classify/picofeat.cpp
index 60e0a681..53bb8f3b 100644
--- a/src/classify/picofeat.cpp
+++ b/src/classify/picofeat.cpp
@@ -224,10 +224,10 @@ FEATURE_SET Classify::ExtractIntCNFeatures(
       blob, false, &local_fx_info, &bl_features);
   if (sample == nullptr) return nullptr;
 
-  int num_features = sample->num_features();
+  uint32_t num_features = sample->num_features();
   const INT_FEATURE_STRUCT* features = sample->features();
   FEATURE_SET feature_set = NewFeatureSet(num_features);
-  for (int f = 0; f < num_features; ++f) {
+  for (uint32_t f = 0; f < num_features; ++f) {
     FEATURE feature = NewFeature(&IntFeatDesc);
 
     feature->Params[IntX] = features[f].X;
diff --git a/src/classify/shapeclassifier.cpp b/src/classify/shapeclassifier.cpp
index 7c10d6fe..ac899dc5 100644
--- a/src/classify/shapeclassifier.cpp
+++ b/src/classify/shapeclassifier.cpp
@@ -109,8 +109,8 @@ void ShapeClassifier::DebugDisplay(const TrainingSample& sample,
   popup_menu->BuildMenu(debug_win, false);
   // Display the features in green.
   const INT_FEATURE_STRUCT* features = sample.features();
-  int num_features = sample.num_features();
-  for (int f = 0; f < num_features; ++f) {
+  uint32_t num_features = sample.num_features();
+  for (uint32_t f = 0; f < num_features; ++f) {
     RenderIntFeature(debug_win, &features[f], ScrollView::GREEN);
   }
   debug_win->Update();
diff --git a/src/classify/trainingsample.cpp b/src/classify/trainingsample.cpp
index 688de335..65fcb3ff 100644
--- a/src/classify/trainingsample.cpp
+++ b/src/classify/trainingsample.cpp
@@ -61,12 +61,10 @@ bool TrainingSample::Serialize(FILE* fp) const {
     return false;
   if (fwrite(&outline_length_, sizeof(outline_length_), 1, fp) != 1)
     return false;
-  if (static_cast<int>(fwrite(features_, sizeof(*features_), num_features_, fp))
-      != num_features_)
+  if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_)
     return false;
-  if (static_cast<int>(fwrite(micro_features_, sizeof(*micro_features_),
-                              num_micro_features_,
-                              fp)) != num_micro_features_)
+  if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_,
+             fp) != num_micro_features_)
     return false;
   if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
       kNumCNParams) return false;
@@ -102,16 +100,18 @@ bool TrainingSample::DeSerialize(bool swap, FILE* fp) {
     ReverseN(&num_micro_features_, sizeof(num_micro_features_));
     ReverseN(&outline_length_, sizeof(outline_length_));
   }
+  // Arbitrarily limit the number of elements to protect against bad data.
+  if (num_features_ > UINT16_MAX) return false;
+  if (num_micro_features_ > UINT16_MAX) return false;
   delete [] features_;
   features_ = new INT_FEATURE_STRUCT[num_features_];
-  if (static_cast<int>(fread(features_, sizeof(*features_), num_features_, fp))
+  if (fread(features_, sizeof(*features_), num_features_, fp)
       != num_features_)
     return false;
   delete [] micro_features_;
   micro_features_ = new MicroFeature[num_micro_features_];
-  if (static_cast<int>(fread(micro_features_, sizeof(*micro_features_),
-                             num_micro_features_,
-                             fp)) != num_micro_features_)
+  if (fread(micro_features_, sizeof(*micro_features_), num_micro_features_,
+            fp) != num_micro_features_)
     return false;
   if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
             kNumCNParams) return false;
@@ -165,7 +165,7 @@ TrainingSample* TrainingSample::RandomizedCopy(int index) const {
     ++index;  // Remove the first combination.
     const int yshift = kYShiftValues[index / kSampleScaleSize];
     double scaling = kScaleValues[index % kSampleScaleSize];
-    for (int i = 0; i < num_features_; ++i) {
+    for (uint32_t i = 0; i < num_features_; ++i) {
       double result = (features_[i].X - kRandomizingCenter) * scaling;
       result += kRandomizingCenter;
       sample->features_[i].X = ClipToRange<int>(result + 0.5, 0, UINT8_MAX);
@@ -217,7 +217,7 @@ void TrainingSample::ExtractCharDesc(int int_feature_type,
   } else {
     num_features_ = char_features->NumFeatures;
     features_ = new INT_FEATURE_STRUCT[num_features_];
-    for (int f = 0; f < num_features_; ++f) {
+    for (uint32_t f = 0; f < num_features_; ++f) {
       features_[f].X =
           static_cast<uint8_t>(char_features->Features[f]->Params[IntX]);
       features_[f].Y =
@@ -238,7 +238,7 @@ void TrainingSample::ExtractCharDesc(int int_feature_type,
   } else {
     num_micro_features_ = char_features->NumFeatures;
     micro_features_ = new MicroFeature[num_micro_features_];
-    for (int f = 0; f < num_micro_features_; ++f) {
+    for (uint32_t f = 0; f < num_micro_features_; ++f) {
       for (int d = 0; d < MFCount; ++d) {
         micro_features_[f][d] = char_features->Features[f]->Params[d];
       }
@@ -294,7 +294,7 @@ void TrainingSample::MapFeatures(const IntFeatureMap& feature_map) {
 // Returns a pix representing the sample. (Int features only.)
 Pix* TrainingSample::RenderToPix(const UNICHARSET* unicharset) const {
   Pix* pix = pixCreate(kIntFeatureExtent, kIntFeatureExtent, 1);
-  for (int f = 0; f < num_features_; ++f) {
+  for (uint32_t f = 0; f < num_features_; ++f) {
     int start_x = features_[f].X;
     int start_y = kIntFeatureExtent - features_[f].Y;
     double dx = cos((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI);
@@ -315,7 +315,7 @@ Pix* TrainingSample::RenderToPix(const UNICHARSET* unicharset) const {
 void TrainingSample::DisplayFeatures(ScrollView::Color color,
                                      ScrollView* window) const {
   #ifndef GRAPHICS_DISABLED
-  for (int f = 0; f < num_features_; ++f) {
+  for (uint32_t f = 0; f < num_features_; ++f) {
     RenderIntFeature(window, &features_[f], color);
   }
   #endif  // GRAPHICS_DISABLED
diff --git a/src/classify/trainingsample.h b/src/classify/trainingsample.h
index d74ea99a..0964e2be 100644
--- a/src/classify/trainingsample.h
+++ b/src/classify/trainingsample.h
@@ -137,13 +137,13 @@ class TrainingSample : public ELIST_LINK {
   void set_bounding_box(const TBOX& box) {
     bounding_box_ = box;
   }
-  int num_features() const {
+  uint32_t num_features() const {
     return num_features_;
   }
   const INT_FEATURE_STRUCT* features() const {
     return features_;
   }
-  int num_micro_features() const {
+  uint32_t num_micro_features() const {
     return num_micro_features_;
   }
   const MicroFeature* micro_features() const {
@@ -206,9 +206,9 @@ class TrainingSample : public ELIST_LINK {
   // Bounding box of sample in original image.
   TBOX bounding_box_;
   // Number of INT_FEATURE_STRUCT in features_ array.
-  int num_features_;
+  uint32_t num_features_;
   // Number of MicroFeature in micro_features_ array.
-  int num_micro_features_;
+  uint32_t num_micro_features_;
   // Total length of outline in the baseline normalized coordinate space.
   // See comment in WERD_RES class definition for a discussion of coordinate
   // spaces.

From 8871f4d6223013935a45fa2e0ea27011dbc1fa41 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Fri, 6 Jul 2018 14:51:04 +0200
Subject: [PATCH 2/5] Fix CID 1164686 (Use of untrusted scalar value)

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/ccutil/strngs.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/ccutil/strngs.cpp b/src/ccutil/strngs.cpp
index 44e36546..1fd650e0 100644
--- a/src/ccutil/strngs.cpp
+++ b/src/ccutil/strngs.cpp
@@ -161,13 +161,14 @@ bool STRING::Serialize(TFile* fp) const {
 // Reads from the given file. Returns false in case of error.
 // If swap is true, assumes a big/little-endian swap is needed.
 bool STRING::DeSerialize(bool swap, FILE* fp) {
-  int32_t len;
+  uint32_t len;
   if (fread(&len, sizeof(len), 1, fp) != 1) return false;
   if (swap)
     ReverseN(&len, sizeof(len));
+  // Arbitrarily limit the number of characters to protect against bad data.
+  if (len > UINT16_MAX) return false;
   truncate_at(len);
-  if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false;
-  return true;
+  return fread(GetCStr(), 1, len, fp) == len;
 }
 // Reads from the given file. Returns false in case of error.
 // If swap is true, assumes a big/little-endian swap is needed.

From c1da5fbac434bdb0b7e593cd39671c4b65cc2998 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Fri, 6 Jul 2018 14:53:53 +0200
Subject: [PATCH 3/5] Fix CID 1164704 (Untrusted value as argument)

Limit the matrix to UINT16_MAX x UINT16_MAX.

Larger dimensions could also result in an arithmetic overflow
when multiplying the two dimensions.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/ccstruct/matrix.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/ccstruct/matrix.h b/src/ccstruct/matrix.h
index 9efde3c3..03a31c12 100644
--- a/src/ccstruct/matrix.h
+++ b/src/ccstruct/matrix.h
@@ -1,6 +1,6 @@
 /* -*-C-*-
  ******************************************************************************
- * File:         matrix.h  (Formerly matrix.h)
+ * File:         matrix.h
  * Description:  Generic 2-d array/matrix and banded triangular matrix class.
  * Author:       Ray Smith
  * TODO(rays) Separate from ratings matrix, which it also contains:
@@ -10,9 +10,6 @@
  * Author:       Mark Seaman, OCR Technology
  * Created:      Wed May 16 13:22:06 1990
  * Modified:     Tue Mar 19 16:00:20 1991 (Mark Seaman) marks@hpgrlt
- * Language:     C
- * Package:      N/A
- * Status:       Experimental (Do Not Distribute)
  *
  * (c) Copyright 1990, Hewlett-Packard Company.
  ** Licensed under the Apache License, Version 2.0 (the "License");
@@ -492,6 +489,9 @@ class GENERIC_2D_ARRAY {
       ReverseN(&size1, sizeof(size1));
       ReverseN(&size2, sizeof(size2));
     }
+    // Arbitrarily limit the number of elements to protect against bad data.
+    if (size1 > UINT16_MAX) return false;
+    if (size2 > UINT16_MAX) return false;
     Resize(size1, size2, empty_);
     return true;
   }
@@ -499,6 +499,9 @@ class GENERIC_2D_ARRAY {
     int32_t size1, size2;
     if (fp->FReadEndian(&size1, sizeof(size1), 1) != 1) return false;
     if (fp->FReadEndian(&size2, sizeof(size2), 1) != 1) return false;
+    // Arbitrarily limit the number of elements to protect against bad data.
+    if (size1 > UINT16_MAX) return false;
+    if (size2 > UINT16_MAX) return false;
     Resize(size1, size2, empty_);
     return true;
   }

From 992031e8248c3b95cda23315bf390303c7c7955c Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Fri, 6 Jul 2018 14:54:37 +0200
Subject: [PATCH 4/5] Fix CID 1164702 (Untrusted value as argument)

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/ccutil/genericvector.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h
index 2c0c7a34..0a6238b8 100644
--- a/src/ccutil/genericvector.h
+++ b/src/ccutil/genericvector.h
@@ -945,9 +945,11 @@ bool GenericVector<T>::Serialize(tesseract::TFile* fp) const {
 // If swap is true, assumes a big/little-endian swap is needed.
 template <typename T>
 bool GenericVector<T>::DeSerialize(bool swap, FILE* fp) {
-  int32_t reserved;
+  uint32_t reserved;
   if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false;
   if (swap) Reverse32(&reserved);
+  // Arbitrarily limit the number of elements to protect against bad data.
+  if (reserved > UINT16_MAX) return false;
   reserve(reserved);
   size_used_ = reserved;
   if (fread(data_, sizeof(T), size_used_, fp) != unsigned_size()) return false;
@@ -959,15 +961,17 @@ bool GenericVector<T>::DeSerialize(bool swap, FILE* fp) {
 }
 template <typename T>
 bool GenericVector<T>::DeSerialize(tesseract::TFile* fp) {
-  int32_t reserved;
+  uint32_t reserved;
   if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false;
+  // Arbitrarily limit the number of elements to protect against bad data.
+  if (reserved > UINT16_MAX) return false;
   reserve(reserved);
   size_used_ = reserved;
   return fp->FReadEndian(data_, sizeof(T), size_used_) == size_used_;
 }
 template <typename T>
 bool GenericVector<T>::SkipDeSerialize(tesseract::TFile* fp) {
-  int32_t reserved;
+  uint32_t reserved;
   if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false;
   return fp->FRead(nullptr, sizeof(T), reserved) == reserved;
 }

From 4bb41b8952e9517d1a920bca9570225f98b23c31 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Fri, 6 Jul 2018 15:13:13 +0200
Subject: [PATCH 5/5] Fix CID 1164693 (Untrusted value as argument)

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 src/ccutil/indexmapbidi.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ccutil/indexmapbidi.cpp b/src/ccutil/indexmapbidi.cpp
index fe9e083e..5d969f07 100644
--- a/src/ccutil/indexmapbidi.cpp
+++ b/src/ccutil/indexmapbidi.cpp
@@ -50,10 +50,12 @@ bool IndexMap::Serialize(FILE* fp) const {
 // Reads from the given file. Returns false in case of error.
 // If swap is true, assumes a big/little-endian swap is needed.
 bool IndexMap::DeSerialize(bool swap, FILE* fp) {
-  int32_t sparse_size;
+  uint32_t sparse_size;
   if (fread(&sparse_size, sizeof(sparse_size), 1, fp) != 1) return false;
   if (swap)
     ReverseN(&sparse_size, sizeof(sparse_size));
+  // Arbitrarily limit the number of elements to protect against bad data.
+  if (sparse_size > UINT16_MAX) return false;
   sparse_size_ = sparse_size;
   if (!compact_map_.DeSerialize(swap, fp)) return false;
   return true;