From 92e2ad04710bfd1caad8d38c494bcbc3b63315cc Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 6 Jul 2018 12:08:13 +0200 Subject: [PATCH 1/5] Fix CID 1164703 (Untrusted value as argument) Wrong file data could give a large value for the number of vector elements resulting in very large memory allocations. Limit the allowed data range to UINT16_MAX (65535) elements which hopefully should be sufficient for all use cases. Changing the data type of the related member variables from int to uint32_t allowed removing several type casts. Signed-off-by: Stefan Weil --- src/ccutil/genericvector.h | 6 ++++-- src/classify/adaptmatch.cpp | 2 +- src/classify/mastertrainer.cpp | 8 +++----- src/classify/picofeat.cpp | 4 ++-- src/classify/shapeclassifier.cpp | 4 ++-- src/classify/trainingsample.cpp | 28 ++++++++++++++-------------- src/classify/trainingsample.h | 8 ++++---- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h index 64c5c9e4..2c0c7a34 100644 --- a/src/ccutil/genericvector.h +++ b/src/ccutil/genericvector.h @@ -564,12 +564,14 @@ class PointerVector : public GenericVector { // Also needs T::T(), as new T is used in this function. // Returns false in case of error. bool DeSerialize(bool swap, FILE* fp) { - int32_t reserved; + uint32_t reserved; if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false; if (swap) Reverse32(&reserved); + // Arbitrarily limit the number of elements to protect against bad data. + if (reserved > UINT16_MAX) return false; GenericVector::reserve(reserved); truncate(0); - for (int i = 0; i < reserved; ++i) { + for (uint32_t i = 0; i < reserved; ++i) { int8_t non_null; if (fread(&non_null, sizeof(non_null), 1, fp) != 1) return false; T* item = nullptr; diff --git a/src/classify/adaptmatch.cpp b/src/classify/adaptmatch.cpp index 074612ba..21bd07a0 100644 --- a/src/classify/adaptmatch.cpp +++ b/src/classify/adaptmatch.cpp @@ -1339,7 +1339,7 @@ int Classify::CharNormTrainingSample(bool pruner_only, ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS(); adapt_results->Initialize(); // Compute the bounding box of the features. - int num_features = sample.num_features(); + uint32_t num_features = sample.num_features(); // Only the top and bottom of the blob_box are used by MasterMatcher, so // fabricate right and left using top and bottom. TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom), diff --git a/src/classify/mastertrainer.cpp b/src/classify/mastertrainer.cpp index 90ecd12b..57e0fe11 100644 --- a/src/classify/mastertrainer.cpp +++ b/src/classify/mastertrainer.cpp @@ -1,5 +1,3 @@ -// Copyright 2010 Google Inc. All Rights Reserved. -// Author: rays@google.com (Ray Smith) /////////////////////////////////////////////////////////////////////// // File: mastertrainer.cpp // Description: Trainer to build the MasterClassifier. @@ -552,8 +550,8 @@ CLUSTERER* MasterTrainer::SetupForClustering( int sample_id = 0; for (int i = sample_ptrs.size() - 1; i >= 0; --i) { const TrainingSample* sample = sample_ptrs[i]; - int num_features = sample->num_micro_features(); - for (int f = 0; f < num_features; ++f) + uint32_t num_features = sample->num_micro_features(); + for (uint32_t f = 0; f < num_features; ++f) MakeSample(clusterer, sample->micro_features()[f], sample_id); ++sample_id; } @@ -706,7 +704,7 @@ void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font, if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) { const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font, class_id2); - for (int f = 0; f < sample->num_features(); ++f) { + for (uint32_t f = 0; f < sample->num_features(); ++f) { RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED); } } diff --git a/src/classify/picofeat.cpp b/src/classify/picofeat.cpp index 60e0a681..53bb8f3b 100644 --- a/src/classify/picofeat.cpp +++ b/src/classify/picofeat.cpp @@ -224,10 +224,10 @@ FEATURE_SET Classify::ExtractIntCNFeatures( blob, false, &local_fx_info, &bl_features); if (sample == nullptr) return nullptr; - int num_features = sample->num_features(); + uint32_t num_features = sample->num_features(); const INT_FEATURE_STRUCT* features = sample->features(); FEATURE_SET feature_set = NewFeatureSet(num_features); - for (int f = 0; f < num_features; ++f) { + for (uint32_t f = 0; f < num_features; ++f) { FEATURE feature = NewFeature(&IntFeatDesc); feature->Params[IntX] = features[f].X; diff --git a/src/classify/shapeclassifier.cpp b/src/classify/shapeclassifier.cpp index 7c10d6fe..ac899dc5 100644 --- a/src/classify/shapeclassifier.cpp +++ b/src/classify/shapeclassifier.cpp @@ -109,8 +109,8 @@ void ShapeClassifier::DebugDisplay(const TrainingSample& sample, popup_menu->BuildMenu(debug_win, false); // Display the features in green. const INT_FEATURE_STRUCT* features = sample.features(); - int num_features = sample.num_features(); - for (int f = 0; f < num_features; ++f) { + uint32_t num_features = sample.num_features(); + for (uint32_t f = 0; f < num_features; ++f) { RenderIntFeature(debug_win, &features[f], ScrollView::GREEN); } debug_win->Update(); diff --git a/src/classify/trainingsample.cpp b/src/classify/trainingsample.cpp index 688de335..65fcb3ff 100644 --- a/src/classify/trainingsample.cpp +++ b/src/classify/trainingsample.cpp @@ -61,12 +61,10 @@ bool TrainingSample::Serialize(FILE* fp) const { return false; if (fwrite(&outline_length_, sizeof(outline_length_), 1, fp) != 1) return false; - if (static_cast(fwrite(features_, sizeof(*features_), num_features_, fp)) - != num_features_) + if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_) return false; - if (static_cast(fwrite(micro_features_, sizeof(*micro_features_), - num_micro_features_, - fp)) != num_micro_features_) + if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_, + fp) != num_micro_features_) return false; if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) return false; @@ -102,16 +100,18 @@ bool TrainingSample::DeSerialize(bool swap, FILE* fp) { ReverseN(&num_micro_features_, sizeof(num_micro_features_)); ReverseN(&outline_length_, sizeof(outline_length_)); } + // Arbitrarily limit the number of elements to protect against bad data. + if (num_features_ > UINT16_MAX) return false; + if (num_micro_features_ > UINT16_MAX) return false; delete [] features_; features_ = new INT_FEATURE_STRUCT[num_features_]; - if (static_cast(fread(features_, sizeof(*features_), num_features_, fp)) + if (fread(features_, sizeof(*features_), num_features_, fp) != num_features_) return false; delete [] micro_features_; micro_features_ = new MicroFeature[num_micro_features_]; - if (static_cast(fread(micro_features_, sizeof(*micro_features_), - num_micro_features_, - fp)) != num_micro_features_) + if (fread(micro_features_, sizeof(*micro_features_), num_micro_features_, + fp) != num_micro_features_) return false; if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) return false; @@ -165,7 +165,7 @@ TrainingSample* TrainingSample::RandomizedCopy(int index) const { ++index; // Remove the first combination. const int yshift = kYShiftValues[index / kSampleScaleSize]; double scaling = kScaleValues[index % kSampleScaleSize]; - for (int i = 0; i < num_features_; ++i) { + for (uint32_t i = 0; i < num_features_; ++i) { double result = (features_[i].X - kRandomizingCenter) * scaling; result += kRandomizingCenter; sample->features_[i].X = ClipToRange(result + 0.5, 0, UINT8_MAX); @@ -217,7 +217,7 @@ void TrainingSample::ExtractCharDesc(int int_feature_type, } else { num_features_ = char_features->NumFeatures; features_ = new INT_FEATURE_STRUCT[num_features_]; - for (int f = 0; f < num_features_; ++f) { + for (uint32_t f = 0; f < num_features_; ++f) { features_[f].X = static_cast(char_features->Features[f]->Params[IntX]); features_[f].Y = @@ -238,7 +238,7 @@ void TrainingSample::ExtractCharDesc(int int_feature_type, } else { num_micro_features_ = char_features->NumFeatures; micro_features_ = new MicroFeature[num_micro_features_]; - for (int f = 0; f < num_micro_features_; ++f) { + for (uint32_t f = 0; f < num_micro_features_; ++f) { for (int d = 0; d < MFCount; ++d) { micro_features_[f][d] = char_features->Features[f]->Params[d]; } @@ -294,7 +294,7 @@ void TrainingSample::MapFeatures(const IntFeatureMap& feature_map) { // Returns a pix representing the sample. (Int features only.) Pix* TrainingSample::RenderToPix(const UNICHARSET* unicharset) const { Pix* pix = pixCreate(kIntFeatureExtent, kIntFeatureExtent, 1); - for (int f = 0; f < num_features_; ++f) { + for (uint32_t f = 0; f < num_features_; ++f) { int start_x = features_[f].X; int start_y = kIntFeatureExtent - features_[f].Y; double dx = cos((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI); @@ -315,7 +315,7 @@ Pix* TrainingSample::RenderToPix(const UNICHARSET* unicharset) const { void TrainingSample::DisplayFeatures(ScrollView::Color color, ScrollView* window) const { #ifndef GRAPHICS_DISABLED - for (int f = 0; f < num_features_; ++f) { + for (uint32_t f = 0; f < num_features_; ++f) { RenderIntFeature(window, &features_[f], color); } #endif // GRAPHICS_DISABLED diff --git a/src/classify/trainingsample.h b/src/classify/trainingsample.h index d74ea99a..0964e2be 100644 --- a/src/classify/trainingsample.h +++ b/src/classify/trainingsample.h @@ -137,13 +137,13 @@ class TrainingSample : public ELIST_LINK { void set_bounding_box(const TBOX& box) { bounding_box_ = box; } - int num_features() const { + uint32_t num_features() const { return num_features_; } const INT_FEATURE_STRUCT* features() const { return features_; } - int num_micro_features() const { + uint32_t num_micro_features() const { return num_micro_features_; } const MicroFeature* micro_features() const { @@ -206,9 +206,9 @@ class TrainingSample : public ELIST_LINK { // Bounding box of sample in original image. TBOX bounding_box_; // Number of INT_FEATURE_STRUCT in features_ array. - int num_features_; + uint32_t num_features_; // Number of MicroFeature in micro_features_ array. - int num_micro_features_; + uint32_t num_micro_features_; // Total length of outline in the baseline normalized coordinate space. // See comment in WERD_RES class definition for a discussion of coordinate // spaces. From 8871f4d6223013935a45fa2e0ea27011dbc1fa41 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 6 Jul 2018 14:51:04 +0200 Subject: [PATCH 2/5] Fix CID 1164686 (Use of untrusted scalar value) Signed-off-by: Stefan Weil --- src/ccutil/strngs.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ccutil/strngs.cpp b/src/ccutil/strngs.cpp index 44e36546..1fd650e0 100644 --- a/src/ccutil/strngs.cpp +++ b/src/ccutil/strngs.cpp @@ -161,13 +161,14 @@ bool STRING::Serialize(TFile* fp) const { // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool STRING::DeSerialize(bool swap, FILE* fp) { - int32_t len; + uint32_t len; if (fread(&len, sizeof(len), 1, fp) != 1) return false; if (swap) ReverseN(&len, sizeof(len)); + // Arbitrarily limit the number of characters to protect against bad data. + if (len > UINT16_MAX) return false; truncate_at(len); - if (static_cast(fread(GetCStr(), 1, len, fp)) != len) return false; - return true; + return fread(GetCStr(), 1, len, fp) == len; } // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. From c1da5fbac434bdb0b7e593cd39671c4b65cc2998 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 6 Jul 2018 14:53:53 +0200 Subject: [PATCH 3/5] Fix CID 1164704 (Untrusted value as argument) Limit the matrix to UINT16_MAX x UINT16_MAX. Larger dimensions could also result in an arithmetic overflow when multiplying the two dimensions. Signed-off-by: Stefan Weil --- src/ccstruct/matrix.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ccstruct/matrix.h b/src/ccstruct/matrix.h index 9efde3c3..03a31c12 100644 --- a/src/ccstruct/matrix.h +++ b/src/ccstruct/matrix.h @@ -1,6 +1,6 @@ /* -*-C-*- ****************************************************************************** - * File: matrix.h (Formerly matrix.h) + * File: matrix.h * Description: Generic 2-d array/matrix and banded triangular matrix class. * Author: Ray Smith * TODO(rays) Separate from ratings matrix, which it also contains: @@ -10,9 +10,6 @@ * Author: Mark Seaman, OCR Technology * Created: Wed May 16 13:22:06 1990 * Modified: Tue Mar 19 16:00:20 1991 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Experimental (Do Not Distribute) * * (c) Copyright 1990, Hewlett-Packard Company. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -492,6 +489,9 @@ class GENERIC_2D_ARRAY { ReverseN(&size1, sizeof(size1)); ReverseN(&size2, sizeof(size2)); } + // Arbitrarily limit the number of elements to protect against bad data. + if (size1 > UINT16_MAX) return false; + if (size2 > UINT16_MAX) return false; Resize(size1, size2, empty_); return true; } @@ -499,6 +499,9 @@ class GENERIC_2D_ARRAY { int32_t size1, size2; if (fp->FReadEndian(&size1, sizeof(size1), 1) != 1) return false; if (fp->FReadEndian(&size2, sizeof(size2), 1) != 1) return false; + // Arbitrarily limit the number of elements to protect against bad data. + if (size1 > UINT16_MAX) return false; + if (size2 > UINT16_MAX) return false; Resize(size1, size2, empty_); return true; } From 992031e8248c3b95cda23315bf390303c7c7955c Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 6 Jul 2018 14:54:37 +0200 Subject: [PATCH 4/5] Fix CID 1164702 (Untrusted value as argument) Signed-off-by: Stefan Weil --- src/ccutil/genericvector.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h index 2c0c7a34..0a6238b8 100644 --- a/src/ccutil/genericvector.h +++ b/src/ccutil/genericvector.h @@ -945,9 +945,11 @@ bool GenericVector::Serialize(tesseract::TFile* fp) const { // If swap is true, assumes a big/little-endian swap is needed. template bool GenericVector::DeSerialize(bool swap, FILE* fp) { - int32_t reserved; + uint32_t reserved; if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false; if (swap) Reverse32(&reserved); + // Arbitrarily limit the number of elements to protect against bad data. + if (reserved > UINT16_MAX) return false; reserve(reserved); size_used_ = reserved; if (fread(data_, sizeof(T), size_used_, fp) != unsigned_size()) return false; @@ -959,15 +961,17 @@ bool GenericVector::DeSerialize(bool swap, FILE* fp) { } template bool GenericVector::DeSerialize(tesseract::TFile* fp) { - int32_t reserved; + uint32_t reserved; if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false; + // Arbitrarily limit the number of elements to protect against bad data. + if (reserved > UINT16_MAX) return false; reserve(reserved); size_used_ = reserved; return fp->FReadEndian(data_, sizeof(T), size_used_) == size_used_; } template bool GenericVector::SkipDeSerialize(tesseract::TFile* fp) { - int32_t reserved; + uint32_t reserved; if (fp->FReadEndian(&reserved, sizeof(reserved), 1) != 1) return false; return fp->FRead(nullptr, sizeof(T), reserved) == reserved; } From 4bb41b8952e9517d1a920bca9570225f98b23c31 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 6 Jul 2018 15:13:13 +0200 Subject: [PATCH 5/5] Fix CID 1164693 (Untrusted value as argument) Signed-off-by: Stefan Weil --- src/ccutil/indexmapbidi.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ccutil/indexmapbidi.cpp b/src/ccutil/indexmapbidi.cpp index fe9e083e..5d969f07 100644 --- a/src/ccutil/indexmapbidi.cpp +++ b/src/ccutil/indexmapbidi.cpp @@ -50,10 +50,12 @@ bool IndexMap::Serialize(FILE* fp) const { // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool IndexMap::DeSerialize(bool swap, FILE* fp) { - int32_t sparse_size; + uint32_t sparse_size; if (fread(&sparse_size, sizeof(sparse_size), 1, fp) != 1) return false; if (swap) ReverseN(&sparse_size, sizeof(sparse_size)); + // Arbitrarily limit the number of elements to protect against bad data. + if (sparse_size > UINT16_MAX) return false; sparse_size_ = sparse_size; if (!compact_map_.DeSerialize(swap, fp)) return false; return true;