From f24ef67df41d36ce9238a1fc8191d18e7d0e3a76 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 8 Nov 2016 14:01:04 -0800 Subject: [PATCH] Limited max height to 48 even in variable height input, enabled neural nets via ocr engine mode --- ChangeLog | 5 +++++ api/tesseractmain.cpp | 36 ++++++++++++++++++++++++++++++------ ccmain/tessedit.cpp | 6 +++++- ccstruct/imagedata.cpp | 10 +++++++--- ccstruct/imagedata.h | 5 +++-- lstm/input.cpp | 7 +++++-- lstm/lstmtrainer.cpp | 10 ++++------ training/pango_font_info.cpp | 3 ++- 8 files changed, 61 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index 492d6984..b54a3bed 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2016-11-11 - V4.00.00 + * Added new neural network system based on LSTMs, with major accuracy gains. + * Improvements to PDF rendering. + * Fixes to trainingdata rendering. + 2016-02-17 - V3.04.01 * Added OSD renderer for psm 0. Works for single page and multi-page images. * Improve tesstrain.sh script. diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index fe07e2af..7eab3d77 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -90,7 +90,7 @@ void PrintVersionInfo() { void PrintUsage(const char* program) { printf( "Usage:\n" - " %s --help | --help-psm | --version\n" + " %s --help | --help-psm | --help-oem | --version\n" " %s --list-langs [--tessdata-dir PATH]\n" " %s --print-parameters [options...] [configfile...]\n" " %s imagename|stdin outputbase|stdout [options...] 
[configfile...]\n", @@ -120,6 +120,18 @@ void PrintHelpForPSM() { printf("%s", msg); } +void PrintHelpForOEM() { + const char* msg = + "OCR Engine modes:\n" + " 0 Original Tesseract only.\n" + " 1 Cube only.\n" + " 2 Tesseract + cube.\n" + " 3 Default, based on what is available.\n" + " 4 Neural nets (LSTM) only.\n"; + + printf("%s", msg); +} + void PrintHelpMessage(const char* program) { PrintUsage(program); @@ -132,15 +144,18 @@ void PrintHelpMessage(const char* program) { " -c VAR=VALUE Set value for config variables.\n" " Multiple -c arguments are allowed.\n" " -psm NUM Specify page segmentation mode.\n" + " -oem NUM Specify OCR Engine mode.\n" "NOTE: These options must occur before any configfile.\n"; printf("\n%s\n", ocr_options); PrintHelpForPSM(); + PrintHelpForOEM(); const char* single_options = "Single options:\n" " -h, --help Show this help message.\n" " --help-psm Show page segmentation modes.\n" + " --help-oem Show OCR Engine modes.\n" " -v, --version Show version information.\n" " --list-langs List available languages for tesseract engine.\n" " --print-parameters Print tesseract parameters to stdout.\n"; @@ -214,7 +229,8 @@ void ParseArgs(const int argc, char** argv, const char** lang, const char** datapath, bool* list_langs, bool* print_parameters, GenericVector* vars_vec, GenericVector* vars_values, int* arg_i, - tesseract::PageSegMode* pagesegmode) { + tesseract::PageSegMode* pagesegmode, + tesseract::OcrEngineMode* enginemode) { if (argc == 1) { PrintHelpMessage(argv[0]); exit(0); @@ -229,6 +245,10 @@ void ParseArgs(const int argc, char** argv, const char** lang, PrintHelpForPSM(); exit(0); } + if ((strcmp(argv[1], "--help-oem") == 0)) { + PrintHelpForOEM(); + exit(0); + } if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) { PrintVersionInfo(); exit(0); @@ -258,6 +278,9 @@ void ParseArgs(const int argc, char** argv, const char** lang, } else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) { *pagesegmode = 
static_cast<tesseract::PageSegMode>(atoi(argv[i + 1])); ++i; + } else if (strcmp(argv[i], "-oem") == 0 && i + 1 < argc) { + *enginemode = static_cast<tesseract::OcrEngineMode>(atoi(argv[i + 1])); + ++i; } else if (strcmp(argv[i], "--print-parameters") == 0) { noocr = true; *print_parameters = true; @@ -355,6 +378,7 @@ int main(int argc, char** argv) { bool print_parameters = false; int arg_i = 1; tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; + tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT; /* main() calls functions like ParseArgs which call exit(). * This results in memory leaks if vars_vec and vars_values are * declared as auto variables (destructor is not called then). */ @@ -367,7 +391,8 @@ int main(int argc, char** argv) { #endif /* HAVE_TIFFIO_H && _WIN32 */ ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs, - &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode); + &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode, + &enginemode); bool banner = false; if (outputbase != NULL && strcmp(outputbase, "-") && @@ -380,9 +405,8 @@ int main(int argc, char** argv) { api.SetOutputName(outputbase); - int init_failed = - api.Init(datapath, lang, tesseract::OEM_DEFAULT, &(argv[arg_i]), - argc - arg_i, &vars_vec, &vars_values, false); + int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]), + argc - arg_i, &vars_vec, &vars_values, false); if (init_failed) { fprintf(stderr, "Could not initialize tesseract.\n"); exit(1); diff --git a/ccmain/tessedit.cpp b/ccmain/tessedit.cpp index cf6b8b67..9a7e6081 100644 --- a/ccmain/tessedit.cpp +++ b/ccmain/tessedit.cpp @@ -218,7 +218,11 @@ bool Tesseract::init_tesseract_lang_data( if (tessdata_manager_debug_level) tprintf("Loaded Cube with combiner\n"); } else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { - if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) { + if (tessdata_manager.swap()) { + tprintf("Error: LSTM requested on big-endian hardware!!\n"); + tprintf("Big-endian not yet supported! 
Loading tesseract.\n"); + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) { lstm_recognizer_ = new LSTMRecognizer; TFile fp; fp.Open(tessdata_manager.GetDataFilePtr(), -1); diff --git a/ccstruct/imagedata.cpp b/ccstruct/imagedata.cpp index 77e49693..2100aaf1 100644 --- a/ccstruct/imagedata.cpp +++ b/ccstruct/imagedata.cpp @@ -217,7 +217,7 @@ Pix* ImageData::GetPix() const { // The return value is the scaled Pix, which must be pixDestroyed after use, // and scale_factor (if not NULL) is set to the scale factor that was applied // to the image to achieve the target_height. -Pix* ImageData::PreScale(int target_height, float* scale_factor, +Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor, int* scaled_width, int* scaled_height, GenericVector* boxes) const { int input_width = 0; @@ -226,8 +226,12 @@ Pix* ImageData::PreScale(int target_height, float* scale_factor, ASSERT_HOST(src_pix != NULL); input_width = pixGetWidth(src_pix); input_height = pixGetHeight(src_pix); - if (target_height == 0) - target_height = input_height; + if (target_height == 0) { + if (input_height > max_height) + target_height = max_height; + else + target_height = input_height; + } float im_factor = static_cast<float>(target_height) / input_height; if (scaled_width != NULL) *scaled_width = IntCastRounded(im_factor * input_width); diff --git a/ccstruct/imagedata.h b/ccstruct/imagedata.h index 7ffca76f..ae672293 100644 --- a/ccstruct/imagedata.h +++ b/ccstruct/imagedata.h @@ -165,8 +165,9 @@ class ImageData { // The return value is the scaled Pix, which must be pixDestroyed after use, // and scale_factor (if not NULL) is set to the scale factor that was applied // to the image to achieve the target_height. 
- Pix* PreScale(int target_height, float* scale_factor, int* scaled_width, - int* scaled_height, GenericVector* boxes) const; + Pix* PreScale(int target_height, int max_height, float* scale_factor, + int* scaled_width, int* scaled_height, + GenericVector* boxes) const; int MemoryUsed() const; diff --git a/lstm/input.cpp b/lstm/input.cpp index c0f61781..c283d6b1 100644 --- a/lstm/input.cpp +++ b/lstm/input.cpp @@ -25,6 +25,9 @@ namespace tesseract { +// Max height for variable height inputs before scaling anyway. +const int kMaxInputHeight = 48; + Input::Input(const STRING& name, int ni, int no) : Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {} Input::Input(const STRING& name, const StaticShape& shape) @@ -92,8 +95,8 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data, // Note that NumInputs() is defined as input image height. int target_height = network->NumInputs(); int width, height; - Pix* pix = - image_data.PreScale(target_height, image_scale, &width, &height, nullptr); + Pix* pix = image_data.PreScale(target_height, kMaxInputHeight, image_scale, + &width, &height, nullptr); if (pix == nullptr) { tprintf("Bad pix from ImageData!\n"); return nullptr; diff --git a/lstm/lstmtrainer.cpp b/lstm/lstmtrainer.cpp index 009aa413..9e91dde4 100644 --- a/lstm/lstmtrainer.cpp +++ b/lstm/lstmtrainer.cpp @@ -34,8 +34,6 @@ #include "callcpp.h" -using std::string; - namespace tesseract { // Min actual error rate increase to constitute divergence. @@ -203,7 +201,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index, // Initializes a trainer from a serialized TFNetworkModel proto. // Returns the global step of TensorFlow graph or 0 if failed. 
-int LSTMTrainer::InitTensorFlowNetwork(const string& tf_proto) { +int LSTMTrainer::InitTensorFlowNetwork(const std::string& tf_proto) { #ifdef INCLUDE_TENSORFLOW delete network_; TFNetwork* tf_net = new TFNetwork("TensorFlow"); @@ -1199,14 +1197,14 @@ double LSTMTrainer::ComputeCharError(const GenericVector& truth_str, // Computes a very simple bag of words word recall error rate. // NOTE that this is destructive on both input strings. double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) { - typedef TessHashMap > StrMap; + typedef TessHashMap > StrMap; GenericVector truth_words, ocr_words; truth_str->split(' ', &truth_words); if (truth_words.empty()) return 0.0; ocr_str->split(' ', &ocr_words); StrMap word_counts; for (int i = 0; i < truth_words.size(); ++i) { - string truth_word(truth_words[i].string()); + std::string truth_word(truth_words[i].string()); StrMap::iterator it = word_counts.find(truth_word); if (it == word_counts.end()) word_counts.insert(make_pair(truth_word, 1)); @@ -1214,7 +1212,7 @@ double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) { ++it->second; } for (int i = 0; i < ocr_words.size(); ++i) { - string ocr_word(ocr_words[i].string()); + std::string ocr_word(ocr_words[i].string()); StrMap::iterator it = word_counts.find(ocr_word); if (it == word_counts.end()) word_counts.insert(make_pair(ocr_word, -1)); diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp index 2a26d700..41e352ea 100644 --- a/training/pango_font_info.cpp +++ b/training/pango_font_info.cpp @@ -127,7 +127,8 @@ string PangoFontInfo::DescriptionName() const { /* static */ void PangoFontInfo::SoftInitFontConfig() { if (fonts_dir_.empty()) { - HardInitFontConfig(FLAGS_fonts_dir.c_str(), FLAGS_fontconfig_tmpdir.c_str()); + HardInitFontConfig(FLAGS_fonts_dir.c_str(), + FLAGS_fontconfig_tmpdir.c_str()); } }