mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 18:02:40 +08:00
Limited max height to 48 even in variable height input, enabled neural nets via ocr engine mode
This commit is contained in:
parent
c1c1e426b3
commit
f24ef67df4
@ -1,3 +1,8 @@
|
|||||||
|
2016-11-11 - V4.00.00
|
||||||
|
* Added new neural network system based on LSTMs, with major accuracy gains.
|
||||||
|
* Improvements to PDF rendering.
|
||||||
|
* Fixes to trainingdata rendering.
|
||||||
|
|
||||||
2016-02-17 - V3.04.01
|
2016-02-17 - V3.04.01
|
||||||
* Added OSD renderer for psm 0. Works for single page and multi-page images.
|
* Added OSD renderer for psm 0. Works for single page and multi-page images.
|
||||||
* Improve tesstrain.sh script.
|
* Improve tesstrain.sh script.
|
||||||
|
@ -90,7 +90,7 @@ void PrintVersionInfo() {
|
|||||||
void PrintUsage(const char* program) {
|
void PrintUsage(const char* program) {
|
||||||
printf(
|
printf(
|
||||||
"Usage:\n"
|
"Usage:\n"
|
||||||
" %s --help | --help-psm | --version\n"
|
" %s --help | --help-psm | --help-oem | --version\n"
|
||||||
" %s --list-langs [--tessdata-dir PATH]\n"
|
" %s --list-langs [--tessdata-dir PATH]\n"
|
||||||
" %s --print-parameters [options...] [configfile...]\n"
|
" %s --print-parameters [options...] [configfile...]\n"
|
||||||
" %s imagename|stdin outputbase|stdout [options...] [configfile...]\n",
|
" %s imagename|stdin outputbase|stdout [options...] [configfile...]\n",
|
||||||
@ -120,6 +120,18 @@ void PrintHelpForPSM() {
|
|||||||
printf("%s", msg);
|
printf("%s", msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void PrintHelpForOEM() {
|
||||||
|
const char* msg =
|
||||||
|
"OCR Engine modes:\n"
|
||||||
|
" 0 Original Tesseract only.\n"
|
||||||
|
" 1 Cube only.\n"
|
||||||
|
" 2 Tesseract + cube.\n"
|
||||||
|
" 3 Default, based on what is available.\n"
|
||||||
|
" 4 Neural nets (LSTM) only.\n";
|
||||||
|
|
||||||
|
printf("%s", msg);
|
||||||
|
}
|
||||||
|
|
||||||
void PrintHelpMessage(const char* program) {
|
void PrintHelpMessage(const char* program) {
|
||||||
PrintUsage(program);
|
PrintUsage(program);
|
||||||
|
|
||||||
@ -132,15 +144,18 @@ void PrintHelpMessage(const char* program) {
|
|||||||
" -c VAR=VALUE Set value for config variables.\n"
|
" -c VAR=VALUE Set value for config variables.\n"
|
||||||
" Multiple -c arguments are allowed.\n"
|
" Multiple -c arguments are allowed.\n"
|
||||||
" -psm NUM Specify page segmentation mode.\n"
|
" -psm NUM Specify page segmentation mode.\n"
|
||||||
|
" -oem NUM Specify OCR Engine mode.\n"
|
||||||
"NOTE: These options must occur before any configfile.\n";
|
"NOTE: These options must occur before any configfile.\n";
|
||||||
|
|
||||||
printf("\n%s\n", ocr_options);
|
printf("\n%s\n", ocr_options);
|
||||||
PrintHelpForPSM();
|
PrintHelpForPSM();
|
||||||
|
PrintHelpForOEM();
|
||||||
|
|
||||||
const char* single_options =
|
const char* single_options =
|
||||||
"Single options:\n"
|
"Single options:\n"
|
||||||
" -h, --help Show this help message.\n"
|
" -h, --help Show this help message.\n"
|
||||||
" --help-psm Show page segmentation modes.\n"
|
" --help-psm Show page segmentation modes.\n"
|
||||||
|
" --help-oem Show OCR Engine modes.\n"
|
||||||
" -v, --version Show version information.\n"
|
" -v, --version Show version information.\n"
|
||||||
" --list-langs List available languages for tesseract engine.\n"
|
" --list-langs List available languages for tesseract engine.\n"
|
||||||
" --print-parameters Print tesseract parameters to stdout.\n";
|
" --print-parameters Print tesseract parameters to stdout.\n";
|
||||||
@ -214,7 +229,8 @@ void ParseArgs(const int argc, char** argv, const char** lang,
|
|||||||
const char** datapath, bool* list_langs, bool* print_parameters,
|
const char** datapath, bool* list_langs, bool* print_parameters,
|
||||||
GenericVector<STRING>* vars_vec,
|
GenericVector<STRING>* vars_vec,
|
||||||
GenericVector<STRING>* vars_values, int* arg_i,
|
GenericVector<STRING>* vars_values, int* arg_i,
|
||||||
tesseract::PageSegMode* pagesegmode) {
|
tesseract::PageSegMode* pagesegmode,
|
||||||
|
tesseract::OcrEngineMode* enginemode) {
|
||||||
if (argc == 1) {
|
if (argc == 1) {
|
||||||
PrintHelpMessage(argv[0]);
|
PrintHelpMessage(argv[0]);
|
||||||
exit(0);
|
exit(0);
|
||||||
@ -229,6 +245,10 @@ void ParseArgs(const int argc, char** argv, const char** lang,
|
|||||||
PrintHelpForPSM();
|
PrintHelpForPSM();
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
if ((strcmp(argv[1], "--help-oem") == 0)) {
|
||||||
|
PrintHelpForOEM();
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) {
|
if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) {
|
||||||
PrintVersionInfo();
|
PrintVersionInfo();
|
||||||
exit(0);
|
exit(0);
|
||||||
@ -258,6 +278,9 @@ void ParseArgs(const int argc, char** argv, const char** lang,
|
|||||||
} else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) {
|
} else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) {
|
||||||
*pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
|
*pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
|
||||||
++i;
|
++i;
|
||||||
|
} else if (strcmp(argv[i], "-oem") == 0 && i + 1 < argc) {
|
||||||
|
*enginemode = static_cast<tesseract::OcrEngineMode>(atoi(argv[i + 1]));
|
||||||
|
++i;
|
||||||
} else if (strcmp(argv[i], "--print-parameters") == 0) {
|
} else if (strcmp(argv[i], "--print-parameters") == 0) {
|
||||||
noocr = true;
|
noocr = true;
|
||||||
*print_parameters = true;
|
*print_parameters = true;
|
||||||
@ -355,6 +378,7 @@ int main(int argc, char** argv) {
|
|||||||
bool print_parameters = false;
|
bool print_parameters = false;
|
||||||
int arg_i = 1;
|
int arg_i = 1;
|
||||||
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
|
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
|
||||||
|
tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT;
|
||||||
/* main() calls functions like ParseArgs which call exit().
|
/* main() calls functions like ParseArgs which call exit().
|
||||||
* This results in memory leaks if vars_vec and vars_values are
|
* This results in memory leaks if vars_vec and vars_values are
|
||||||
* declared as auto variables (destructor is not called then). */
|
* declared as auto variables (destructor is not called then). */
|
||||||
@ -367,7 +391,8 @@ int main(int argc, char** argv) {
|
|||||||
#endif /* HAVE_TIFFIO_H && _WIN32 */
|
#endif /* HAVE_TIFFIO_H && _WIN32 */
|
||||||
|
|
||||||
ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
|
ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
|
||||||
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode);
|
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
|
||||||
|
&enginemode);
|
||||||
|
|
||||||
bool banner = false;
|
bool banner = false;
|
||||||
if (outputbase != NULL && strcmp(outputbase, "-") &&
|
if (outputbase != NULL && strcmp(outputbase, "-") &&
|
||||||
@ -380,9 +405,8 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
api.SetOutputName(outputbase);
|
api.SetOutputName(outputbase);
|
||||||
|
|
||||||
int init_failed =
|
int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
|
||||||
api.Init(datapath, lang, tesseract::OEM_DEFAULT, &(argv[arg_i]),
|
argc - arg_i, &vars_vec, &vars_values, false);
|
||||||
argc - arg_i, &vars_vec, &vars_values, false);
|
|
||||||
if (init_failed) {
|
if (init_failed) {
|
||||||
fprintf(stderr, "Could not initialize tesseract.\n");
|
fprintf(stderr, "Could not initialize tesseract.\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -218,7 +218,11 @@ bool Tesseract::init_tesseract_lang_data(
|
|||||||
if (tessdata_manager_debug_level)
|
if (tessdata_manager_debug_level)
|
||||||
tprintf("Loaded Cube with combiner\n");
|
tprintf("Loaded Cube with combiner\n");
|
||||||
} else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
|
} else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
|
||||||
if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
|
if (tessdata_manager.swap()) {
|
||||||
|
tprintf("Error: LSTM requested on big-endian hardware!!\n");
|
||||||
|
tprintf("Big-endian not yet supported! Loading tesseract.\n");
|
||||||
|
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
|
||||||
|
} else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
|
||||||
lstm_recognizer_ = new LSTMRecognizer;
|
lstm_recognizer_ = new LSTMRecognizer;
|
||||||
TFile fp;
|
TFile fp;
|
||||||
fp.Open(tessdata_manager.GetDataFilePtr(), -1);
|
fp.Open(tessdata_manager.GetDataFilePtr(), -1);
|
||||||
|
@ -217,7 +217,7 @@ Pix* ImageData::GetPix() const {
|
|||||||
// The return value is the scaled Pix, which must be pixDestroyed after use,
|
// The return value is the scaled Pix, which must be pixDestroyed after use,
|
||||||
// and scale_factor (if not NULL) is set to the scale factor that was applied
|
// and scale_factor (if not NULL) is set to the scale factor that was applied
|
||||||
// to the image to achieve the target_height.
|
// to the image to achieve the target_height.
|
||||||
Pix* ImageData::PreScale(int target_height, float* scale_factor,
|
Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
|
||||||
int* scaled_width, int* scaled_height,
|
int* scaled_width, int* scaled_height,
|
||||||
GenericVector<TBOX>* boxes) const {
|
GenericVector<TBOX>* boxes) const {
|
||||||
int input_width = 0;
|
int input_width = 0;
|
||||||
@ -226,8 +226,12 @@ Pix* ImageData::PreScale(int target_height, float* scale_factor,
|
|||||||
ASSERT_HOST(src_pix != NULL);
|
ASSERT_HOST(src_pix != NULL);
|
||||||
input_width = pixGetWidth(src_pix);
|
input_width = pixGetWidth(src_pix);
|
||||||
input_height = pixGetHeight(src_pix);
|
input_height = pixGetHeight(src_pix);
|
||||||
if (target_height == 0)
|
if (target_height == 0) {
|
||||||
target_height = input_height;
|
if (input_height > max_height)
|
||||||
|
target_height = max_height;
|
||||||
|
else
|
||||||
|
target_height = input_height;
|
||||||
|
}
|
||||||
float im_factor = static_cast<float>(target_height) / input_height;
|
float im_factor = static_cast<float>(target_height) / input_height;
|
||||||
if (scaled_width != NULL)
|
if (scaled_width != NULL)
|
||||||
*scaled_width = IntCastRounded(im_factor * input_width);
|
*scaled_width = IntCastRounded(im_factor * input_width);
|
||||||
|
@ -165,8 +165,9 @@ class ImageData {
|
|||||||
// The return value is the scaled Pix, which must be pixDestroyed after use,
|
// The return value is the scaled Pix, which must be pixDestroyed after use,
|
||||||
// and scale_factor (if not NULL) is set to the scale factor that was applied
|
// and scale_factor (if not NULL) is set to the scale factor that was applied
|
||||||
// to the image to achieve the target_height.
|
// to the image to achieve the target_height.
|
||||||
Pix* PreScale(int target_height, float* scale_factor, int* scaled_width,
|
Pix* PreScale(int target_height, int max_height, float* scale_factor,
|
||||||
int* scaled_height, GenericVector<TBOX>* boxes) const;
|
int* scaled_width, int* scaled_height,
|
||||||
|
GenericVector<TBOX>* boxes) const;
|
||||||
|
|
||||||
int MemoryUsed() const;
|
int MemoryUsed() const;
|
||||||
|
|
||||||
|
@ -25,6 +25,9 @@
|
|||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
|
// Max height for variable height inputs before scaling anyway.
|
||||||
|
const int kMaxInputHeight = 48;
|
||||||
|
|
||||||
Input::Input(const STRING& name, int ni, int no)
|
Input::Input(const STRING& name, int ni, int no)
|
||||||
: Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {}
|
: Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {}
|
||||||
Input::Input(const STRING& name, const StaticShape& shape)
|
Input::Input(const STRING& name, const StaticShape& shape)
|
||||||
@ -92,8 +95,8 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
|
|||||||
// Note that NumInputs() is defined as input image height.
|
// Note that NumInputs() is defined as input image height.
|
||||||
int target_height = network->NumInputs();
|
int target_height = network->NumInputs();
|
||||||
int width, height;
|
int width, height;
|
||||||
Pix* pix =
|
Pix* pix = image_data.PreScale(target_height, kMaxInputHeight, image_scale,
|
||||||
image_data.PreScale(target_height, image_scale, &width, &height, nullptr);
|
&width, &height, nullptr);
|
||||||
if (pix == nullptr) {
|
if (pix == nullptr) {
|
||||||
tprintf("Bad pix from ImageData!\n");
|
tprintf("Bad pix from ImageData!\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -34,8 +34,6 @@
|
|||||||
|
|
||||||
#include "callcpp.h"
|
#include "callcpp.h"
|
||||||
|
|
||||||
using std::string;
|
|
||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
// Min actual error rate increase to constitute divergence.
|
// Min actual error rate increase to constitute divergence.
|
||||||
@ -203,7 +201,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
|
|||||||
|
|
||||||
// Initializes a trainer from a serialized TFNetworkModel proto.
|
// Initializes a trainer from a serialized TFNetworkModel proto.
|
||||||
// Returns the global step of TensorFlow graph or 0 if failed.
|
// Returns the global step of TensorFlow graph or 0 if failed.
|
||||||
int LSTMTrainer::InitTensorFlowNetwork(const string& tf_proto) {
|
int LSTMTrainer::InitTensorFlowNetwork(const std::string& tf_proto) {
|
||||||
#ifdef INCLUDE_TENSORFLOW
|
#ifdef INCLUDE_TENSORFLOW
|
||||||
delete network_;
|
delete network_;
|
||||||
TFNetwork* tf_net = new TFNetwork("TensorFlow");
|
TFNetwork* tf_net = new TFNetwork("TensorFlow");
|
||||||
@ -1199,14 +1197,14 @@ double LSTMTrainer::ComputeCharError(const GenericVector<int>& truth_str,
|
|||||||
// Computes a very simple bag of words word recall error rate.
|
// Computes a very simple bag of words word recall error rate.
|
||||||
// NOTE that this is destructive on both input strings.
|
// NOTE that this is destructive on both input strings.
|
||||||
double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
|
double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
|
||||||
typedef TessHashMap<string, int, std::hash<string> > StrMap;
|
typedef TessHashMap<std::string, int, std::hash<std::string> > StrMap;
|
||||||
GenericVector<STRING> truth_words, ocr_words;
|
GenericVector<STRING> truth_words, ocr_words;
|
||||||
truth_str->split(' ', &truth_words);
|
truth_str->split(' ', &truth_words);
|
||||||
if (truth_words.empty()) return 0.0;
|
if (truth_words.empty()) return 0.0;
|
||||||
ocr_str->split(' ', &ocr_words);
|
ocr_str->split(' ', &ocr_words);
|
||||||
StrMap word_counts;
|
StrMap word_counts;
|
||||||
for (int i = 0; i < truth_words.size(); ++i) {
|
for (int i = 0; i < truth_words.size(); ++i) {
|
||||||
string truth_word(truth_words[i].string());
|
std::string truth_word(truth_words[i].string());
|
||||||
StrMap::iterator it = word_counts.find(truth_word);
|
StrMap::iterator it = word_counts.find(truth_word);
|
||||||
if (it == word_counts.end())
|
if (it == word_counts.end())
|
||||||
word_counts.insert(make_pair(truth_word, 1));
|
word_counts.insert(make_pair(truth_word, 1));
|
||||||
@ -1214,7 +1212,7 @@ double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
|
|||||||
++it->second;
|
++it->second;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < ocr_words.size(); ++i) {
|
for (int i = 0; i < ocr_words.size(); ++i) {
|
||||||
string ocr_word(ocr_words[i].string());
|
std::string ocr_word(ocr_words[i].string());
|
||||||
StrMap::iterator it = word_counts.find(ocr_word);
|
StrMap::iterator it = word_counts.find(ocr_word);
|
||||||
if (it == word_counts.end())
|
if (it == word_counts.end())
|
||||||
word_counts.insert(make_pair(ocr_word, -1));
|
word_counts.insert(make_pair(ocr_word, -1));
|
||||||
|
@ -127,7 +127,8 @@ string PangoFontInfo::DescriptionName() const {
|
|||||||
/* static */
|
/* static */
|
||||||
void PangoFontInfo::SoftInitFontConfig() {
|
void PangoFontInfo::SoftInitFontConfig() {
|
||||||
if (fonts_dir_.empty()) {
|
if (fonts_dir_.empty()) {
|
||||||
HardInitFontConfig(FLAGS_fonts_dir.c_str(), FLAGS_fontconfig_tmpdir.c_str());
|
HardInitFontConfig(FLAGS_fonts_dir.c_str(),
|
||||||
|
FLAGS_fontconfig_tmpdir.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user