From 349de8b7397454681d1af7f174c8c7b48efe61eb Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 19 Feb 2018 07:30:38 +0100 Subject: [PATCH] Support different help texts for normal and advanced users and restore legacy mode (#1325) * Restore support for the legacy engine It is still needed to get text attributes which are unsupported by the LSTM engine, and it also has better recognition rates for some texts. Signed-off-by: Stefan Weil * tesseractmain: Add missing 'static' attributes Signed-off-by: Stefan Weil * Support different help texts for normal and advanced users The old option --help now shows a very basic help text. The new option --help-extra shows the full help information. It now also includes a hint that Tesseract supports lists of images. Fix also the indentation in the PSM help and use a more neutral text in the OEM help. Signed-off-by: Stefan Weil * Add missing line feed in error message Signed-off-by: Stefan Weil --- api/tesseractmain.cpp | 116 ++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 49 deletions(-) diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index bc6e03b0..a3bd891d 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -51,7 +51,7 @@ static void Win32WarningHandler(const char* module, const char* fmt, #endif /* HAVE_TIFFIO_H && _WIN32 */ -void PrintVersionInfo() { +static void PrintVersionInfo() { char* versionStrP; printf("tesseract %s\n", tesseract::TessBaseAPI::Version()); @@ -103,17 +103,7 @@ void PrintVersionInfo() { if (SIMDDetect::IsSSEAvailable()) printf(" Found SSE\n"); } -void PrintUsage(const char* program) { - printf( - "Usage:\n" - " %s --help | --help-psm | --help-oem | --version\n" - " %s --list-langs [--tessdata-dir PATH]\n" - " %s --print-parameters [options...] [configfile...]\n" - " %s imagename|stdin outputbase|stdout [options...] [configfile...]\n", - program, program, program, program); -} - -void PrintHelpForPSM() { +static void PrintHelpForPSM() { const char* msg = "Page segmentation modes:\n" " 0 Orientation and script detection (OSD) only.\n" @@ -131,26 +121,30 @@ void PrintHelpForPSM() { " particular order.\n" " 12 Sparse text with OSD.\n" " 13 Raw line. Treat the image as a single text line,\n" - "\t\t\tbypassing hacks that are Tesseract-specific.\n"; + " bypassing hacks that are Tesseract-specific.\n"; printf("%s", msg); } -void PrintHelpForOEM() { +static void PrintHelpForOEM() { const char* msg = "OCR Engine modes:\n" - " 0 Original Tesseract only (unsupported).\n" + " 0 Legacy Tesseract only.\n" " 1 Neural nets LSTM only.\n" - " 2 Tesseract + LSTM (unsupported).\n" + " 2 Legacy + LSTM Tesseract.\n" " 3 Default, based on what is available.\n"; printf("%s", msg); } -void PrintHelpMessage(const char* program) { - PrintUsage(program); - - const char* ocr_options = +static void PrintHelpExtra(const char* program) { + printf( + "Usage:\n" + " %s --help | --help-extra | --help-psm | --help-oem | --version\n" + " %s --list-langs [--tessdata-dir PATH]\n" + " %s --print-parameters [options...] [configfile...]\n" + " %s imagename|imagelist|stdin outputbase|stdout [options...] [configfile...]\n" + "\n" "OCR options:\n" " --tessdata-dir PATH Specify the location of tessdata path.\n" " --user-words PATH Specify the location of user words file.\n" @@ -160,26 +154,50 @@ void PrintHelpMessage(const char* program) { " Multiple -c arguments are allowed.\n" " --psm NUM Specify page segmentation mode.\n" " --oem NUM Specify OCR Engine mode.\n" - "NOTE: These options must occur before any configfile.\n"; + "NOTE: These options must occur before any configfile.\n" + "\n", + program, program, program, program + ); - printf("\n%s\n", ocr_options); PrintHelpForPSM(); + printf("\n"); PrintHelpForOEM(); - const char* single_options = + printf( + "\n" "Single options:\n" - " -h, --help Show this help message.\n" + " -h, --help Show minimal help message.\n" + " --help-extra Show extra help for advanced users.\n" " --help-psm Show page segmentation modes.\n" " --help-oem Show OCR Engine modes.\n" " -v, --version Show version information.\n" " --list-langs List available languages for tesseract engine.\n" - " --print-parameters Print tesseract parameters.\n"; - - printf("\n%s", single_options); + " --print-parameters Print tesseract parameters.\n" + ); } -void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc, - char** argv) { +static void PrintHelpMessage(const char* program) { + printf( + "Usage:\n" + " %s --help | --help-extra | --version\n" + " %s --list-langs\n" + " %s imagename outputbase [options...] [configfile...]\n" + "\n" + "OCR options:\n" + " -l LANG[+LANG] Specify language(s) used for OCR.\n" + "NOTE: These options must occur before any configfile.\n" + "\n" + "Single options:\n" + " --help Show this help message.\n" + " --help-extra Show extra help for advanced users.\n" + " --version Show version information.\n" + " --list-langs List available languages for tesseract engine.\n", + program, program, program + ); +} + +static void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc, + char** argv) { char opt1[256], opt2[255]; for (int i = 0; i < argc; i++) { if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) { @@ -202,7 +220,7 @@ void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc, } } -void PrintLangsList(tesseract::TessBaseAPI* api) { +static void PrintLangsList(tesseract::TessBaseAPI* api) { GenericVector languages; api->GetAvailableLanguagesAsVector(&languages); printf("List of available languages (%d):\n", languages.size()); @@ -213,7 +231,7 @@ void PrintLangsList(tesseract::TessBaseAPI* api) { api->End(); } -void PrintBanner() { +static void PrintBanner() { tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n", tesseract::TessBaseAPI::Version()); } @@ -232,27 +250,28 @@ void PrintBanner() { * It would be simpler if we could set the value before Init, * but that doesn't work. */ -void FixPageSegMode(tesseract::TessBaseAPI* api, - tesseract::PageSegMode pagesegmode) { +static void FixPageSegMode(tesseract::TessBaseAPI* api, + tesseract::PageSegMode pagesegmode) { if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) api->SetPageSegMode(pagesegmode); } -void checkArgValues (int arg, const char* mode, int count) { +static void checkArgValues(int arg, const char* mode, int count) { if (arg >= count || arg < 0) { - printf("Invalid %s value, please enter a number between 0-%d", mode, count - 1); - exit(0); + printf("Invalid %s value, please enter a number between 0-%d\n", mode, count - 1); + exit(0); } } // NOTE: arg_i is used here to avoid ugly *i so many times in this function -void ParseArgs(const int argc, char** argv, const char** lang, - const char** image, const char** outputbase, - const char** datapath, bool* list_langs, bool* print_parameters, - GenericVector* vars_vec, - GenericVector* vars_values, int* arg_i, - tesseract::PageSegMode* pagesegmode, - tesseract::OcrEngineMode* enginemode) { +static void ParseArgs(const int argc, char** argv, const char** lang, + const char** image, const char** outputbase, + const char** datapath, + bool* list_langs, bool* print_parameters, + GenericVector* vars_vec, + GenericVector* vars_values, int* arg_i, + tesseract::PageSegMode* pagesegmode, + tesseract::OcrEngineMode* enginemode) { if (argc == 1) { PrintHelpMessage(argv[0]); exit(0); @@ -263,6 +282,10 @@ void ParseArgs(const int argc, char** argv, const char** lang, PrintHelpMessage(argv[0]); exit(0); } + if (strcmp(argv[1], "--help-extra") == 0) { + PrintHelpExtra(argv[0]); + exit(0); + } if ((strcmp(argv[1], "--help-psm") == 0)) { PrintHelpForPSM(); exit(0); @@ -310,11 +333,6 @@ void ParseArgs(const int argc, char** argv, const char** lang, } else if (strcmp(argv[i], "--oem") == 0 && i + 1 < argc) { int oem = atoi(argv[i + 1]); checkArgValues(oem, "OEM", tesseract::OEM_COUNT); - if (oem == tesseract::OEM_TESSERACT_ONLY || - oem == tesseract::OEM_TESSERACT_LSTM_COMBINED) { - printf("Legacy OCR Engine is not supported anymore.\n"); - exit(2); - } *enginemode = static_cast(oem); ++i; } else if (strcmp(argv[i], "--print-parameters") == 0) { @@ -344,7 +362,7 @@ void ParseArgs(const int argc, char** argv, const char** lang, } } -void PreloadRenderers( +static void PreloadRenderers( tesseract::TessBaseAPI* api, tesseract::PointerVector* renderers, tesseract::PageSegMode pagesegmode, const char* outputbase) {