From f24ef67df41d36ce9238a1fc8191d18e7d0e3a76 Mon Sep 17 00:00:00 2001
From: Ray Smith <rays@google.com>
Date: Tue, 8 Nov 2016 14:01:04 -0800
Subject: [PATCH] Limited max height to 48 even in variable height input,
 enabled neural nets via ocr engine mode

---
 ChangeLog                    |  5 +++++
 api/tesseractmain.cpp        | 36 ++++++++++++++++++++++++++++++------
 ccmain/tessedit.cpp          |  6 +++++-
 ccstruct/imagedata.cpp       | 10 +++++++---
 ccstruct/imagedata.h         |  5 +++--
 lstm/input.cpp               |  7 +++++--
 lstm/lstmtrainer.cpp         | 10 ++++------
 training/pango_font_info.cpp |  3 ++-
 8 files changed, 61 insertions(+), 21 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 492d6984..b54a3bed 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2016-11-11 - V4.00.00
+  * Added new neural network system based on LSTMs, with major accuracy gains.
+  * Improvements to PDF rendering.
+  * Fixes to trainingdata rendering.
+
 2016-02-17 - V3.04.01
   * Added OSD renderer for psm 0. Works for single page and multi-page images.
   * Improve tesstrain.sh script.
diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp
index fe07e2af..7eab3d77 100644
--- a/api/tesseractmain.cpp
+++ b/api/tesseractmain.cpp
@@ -90,7 +90,7 @@ void PrintVersionInfo() {
 void PrintUsage(const char* program) {
   printf(
       "Usage:\n"
-      "  %s --help | --help-psm | --version\n"
+      "  %s --help | --help-psm | --help-oem | --version\n"
       "  %s --list-langs [--tessdata-dir PATH]\n"
       "  %s --print-parameters [options...] [configfile...]\n"
       "  %s imagename|stdin outputbase|stdout [options...] [configfile...]\n",
@@ -120,6 +120,18 @@ void PrintHelpForPSM() {
   printf("%s", msg);
 }
 
+void PrintHelpForOEM() {  // Prints the OCR engine mode table to stdout.
+  const char* msg =
+      "OCR Engine modes:\n"
+      "  0    Original Tesseract only.\n"
+      "  1    Cube only.\n"
+      "  2    Tesseract + cube.\n"
+      "  3    Default, based on what is available.\n"
+      "  4    Neural nets (LSTM) only.\n";
+
+  printf("%s", msg);  // "%s": msg is printed verbatim, not as a format.
+}
+
 void PrintHelpMessage(const char* program) {
   PrintUsage(program);
 
@@ -132,15 +144,18 @@ void PrintHelpMessage(const char* program) {
       "  -c VAR=VALUE          Set value for config variables.\n"
       "                        Multiple -c arguments are allowed.\n"
       "  -psm NUM              Specify page segmentation mode.\n"
+      "  -oem NUM              Specify OCR Engine mode.\n"
       "NOTE: These options must occur before any configfile.\n";
 
   printf("\n%s\n", ocr_options);
   PrintHelpForPSM();
+  PrintHelpForOEM();
 
   const char* single_options =
       "Single options:\n"
       "  -h, --help            Show this help message.\n"
       "  --help-psm            Show page segmentation modes.\n"
+      "  --help-oem            Show OCR Engine modes.\n"
       "  -v, --version         Show version information.\n"
       "  --list-langs          List available languages for tesseract engine.\n"
       "  --print-parameters    Print tesseract parameters to stdout.\n";
@@ -214,7 +229,8 @@ void ParseArgs(const int argc, char** argv, const char** lang,
                const char** datapath, bool* list_langs, bool* print_parameters,
                GenericVector<STRING>* vars_vec,
                GenericVector<STRING>* vars_values, int* arg_i,
-               tesseract::PageSegMode* pagesegmode) {
+               tesseract::PageSegMode* pagesegmode,
+               tesseract::OcrEngineMode* enginemode) {
   if (argc == 1) {
     PrintHelpMessage(argv[0]);
     exit(0);
@@ -229,6 +245,10 @@ void ParseArgs(const int argc, char** argv, const char** lang,
       PrintHelpForPSM();
       exit(0);
     }
+    if ((strcmp(argv[1], "--help-oem") == 0)) {
+      PrintHelpForOEM();
+      exit(0);
+    }
     if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) {
       PrintVersionInfo();
       exit(0);
@@ -258,6 +278,9 @@ void ParseArgs(const int argc, char** argv, const char** lang,
     } else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) {
       *pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
       ++i;
+    } else if (strcmp(argv[i], "-oem") == 0 && i + 1 < argc) {
+      *enginemode = static_cast<tesseract::OcrEngineMode>(atoi(argv[i + 1]));
+      ++i;
     } else if (strcmp(argv[i], "--print-parameters") == 0) {
       noocr = true;
       *print_parameters = true;
@@ -355,6 +378,7 @@ int main(int argc, char** argv) {
   bool print_parameters = false;
   int arg_i = 1;
   tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
+  tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT;
   /* main() calls functions like ParseArgs which call exit().
    * This results in memory leaks if vars_vec and vars_values are
    * declared as auto variables (destructor is not called then). */
@@ -367,7 +391,8 @@ int main(int argc, char** argv) {
 #endif /* HAVE_TIFFIO_H &&  _WIN32 */
 
   ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
-            &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode);
+            &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
+            &enginemode);
 
   bool banner = false;
   if (outputbase != NULL && strcmp(outputbase, "-") &&
@@ -380,9 +405,8 @@ int main(int argc, char** argv) {
 
   api.SetOutputName(outputbase);
 
-  int init_failed =
-      api.Init(datapath, lang, tesseract::OEM_DEFAULT, &(argv[arg_i]),
-               argc - arg_i, &vars_vec, &vars_values, false);
+  int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
+                             argc - arg_i, &vars_vec, &vars_values, false);
   if (init_failed) {
     fprintf(stderr, "Could not initialize tesseract.\n");
     exit(1);
diff --git a/ccmain/tessedit.cpp b/ccmain/tessedit.cpp
index cf6b8b67..9a7e6081 100644
--- a/ccmain/tessedit.cpp
+++ b/ccmain/tessedit.cpp
@@ -218,7 +218,11 @@ bool Tesseract::init_tesseract_lang_data(
     if (tessdata_manager_debug_level)
       tprintf("Loaded Cube with combiner\n");
   } else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
-    if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
+    if (tessdata_manager.swap()) {
+      tprintf("Error: LSTM requested on big-endian hardware!!\n");
+      tprintf("Big-endian not yet supported! Loading tesseract.\n");
+      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
+    } else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
       lstm_recognizer_ = new LSTMRecognizer;
       TFile fp;
       fp.Open(tessdata_manager.GetDataFilePtr(), -1);
diff --git a/ccstruct/imagedata.cpp b/ccstruct/imagedata.cpp
index 77e49693..2100aaf1 100644
--- a/ccstruct/imagedata.cpp
+++ b/ccstruct/imagedata.cpp
@@ -217,7 +217,7 @@ Pix* ImageData::GetPix() const {
 // The return value is the scaled Pix, which must be pixDestroyed after use,
 // and scale_factor (if not NULL) is set to the scale factor that was applied
 // to the image to achieve the target_height.
-Pix* ImageData::PreScale(int target_height, float* scale_factor,
+Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
                          int* scaled_width, int* scaled_height,
                          GenericVector<TBOX>* boxes) const {
   int input_width = 0;
@@ -226,8 +226,12 @@ Pix* ImageData::PreScale(int target_height, float* scale_factor,
   ASSERT_HOST(src_pix != NULL);
   input_width = pixGetWidth(src_pix);
   input_height = pixGetHeight(src_pix);
-  if (target_height == 0)
-    target_height = input_height;
+  if (target_height == 0) {
+    if (input_height > max_height)
+      target_height = max_height;
+    else
+      target_height = input_height;
+  }
   float im_factor = static_cast<float>(target_height) / input_height;
   if (scaled_width != NULL)
     *scaled_width = IntCastRounded(im_factor * input_width);
diff --git a/ccstruct/imagedata.h b/ccstruct/imagedata.h
index 7ffca76f..ae672293 100644
--- a/ccstruct/imagedata.h
+++ b/ccstruct/imagedata.h
@@ -165,8 +165,9 @@ class ImageData {
   // The return value is the scaled Pix, which must be pixDestroyed after use,
   // and scale_factor (if not NULL) is set to the scale factor that was applied
   // to the image to achieve the target_height.
-  Pix* PreScale(int target_height, float* scale_factor, int* scaled_width,
-                int* scaled_height, GenericVector<TBOX>* boxes) const;
+  Pix* PreScale(int target_height, int max_height, float* scale_factor,
+                int* scaled_width, int* scaled_height,
+                GenericVector<TBOX>* boxes) const;
 
   int MemoryUsed() const;
 
diff --git a/lstm/input.cpp b/lstm/input.cpp
index c0f61781..c283d6b1 100644
--- a/lstm/input.cpp
+++ b/lstm/input.cpp
@@ -25,6 +25,9 @@
 
 namespace tesseract {
 
+// Max height for variable height inputs before scaling anyway.
+const int kMaxInputHeight = 48;
+
 Input::Input(const STRING& name, int ni, int no)
     : Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {}
 Input::Input(const STRING& name, const StaticShape& shape)
@@ -92,8 +95,8 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
   // Note that NumInputs() is defined as input image height.
   int target_height = network->NumInputs();
   int width, height;
-  Pix* pix =
-      image_data.PreScale(target_height, image_scale, &width, &height, nullptr);
+  Pix* pix = image_data.PreScale(target_height, kMaxInputHeight, image_scale,
+                                 &width, &height, nullptr);
   if (pix == nullptr) {
     tprintf("Bad pix from ImageData!\n");
     return nullptr;
diff --git a/lstm/lstmtrainer.cpp b/lstm/lstmtrainer.cpp
index 009aa413..9e91dde4 100644
--- a/lstm/lstmtrainer.cpp
+++ b/lstm/lstmtrainer.cpp
@@ -34,8 +34,6 @@
 
 #include "callcpp.h"
 
-using std::string;
-
 namespace tesseract {
 
 // Min actual error rate increase to constitute divergence.
@@ -203,7 +201,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
 
 // Initializes a trainer from a serialized TFNetworkModel proto.
 // Returns the global step of TensorFlow graph or 0 if failed.
-int LSTMTrainer::InitTensorFlowNetwork(const string& tf_proto) {
+int LSTMTrainer::InitTensorFlowNetwork(const std::string& tf_proto) {
 #ifdef INCLUDE_TENSORFLOW
   delete network_;
   TFNetwork* tf_net = new TFNetwork("TensorFlow");
@@ -1199,14 +1197,14 @@ double LSTMTrainer::ComputeCharError(const GenericVector<int>& truth_str,
 // Computes a very simple bag of words word recall error rate.
 // NOTE that this is destructive on both input strings.
 double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
-  typedef TessHashMap<string, int, std::hash<string> > StrMap;
+  typedef TessHashMap<std::string, int, std::hash<std::string> > StrMap;
   GenericVector<STRING> truth_words, ocr_words;
   truth_str->split(' ', &truth_words);
   if (truth_words.empty()) return 0.0;
   ocr_str->split(' ', &ocr_words);
   StrMap word_counts;
   for (int i = 0; i < truth_words.size(); ++i) {
-    string truth_word(truth_words[i].string());
+    std::string truth_word(truth_words[i].string());
     StrMap::iterator it = word_counts.find(truth_word);
     if (it == word_counts.end())
       word_counts.insert(make_pair(truth_word, 1));
@@ -1214,7 +1212,7 @@ double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
       ++it->second;
   }
   for (int i = 0; i < ocr_words.size(); ++i) {
-    string ocr_word(ocr_words[i].string());
+    std::string ocr_word(ocr_words[i].string());
     StrMap::iterator it = word_counts.find(ocr_word);
     if (it == word_counts.end())
       word_counts.insert(make_pair(ocr_word, -1));
diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp
index 2a26d700..41e352ea 100644
--- a/training/pango_font_info.cpp
+++ b/training/pango_font_info.cpp
@@ -127,7 +127,8 @@ string PangoFontInfo::DescriptionName() const {
 /* static */
 void PangoFontInfo::SoftInitFontConfig() {
   if (fonts_dir_.empty()) {
-    HardInitFontConfig(FLAGS_fonts_dir.c_str(), FLAGS_fontconfig_tmpdir.c_str());
+    HardInitFontConfig(FLAGS_fonts_dir.c_str(),
+                       FLAGS_fontconfig_tmpdir.c_str());
   }
 }