From c2f5e9b8498df42db955cf755c513e8e3bb34112 Mon Sep 17 00:00:00 2001 From: amitdo Date: Fri, 11 Dec 2015 19:06:49 +0200 Subject: [PATCH 1/2] If there is no explicit renderer(s), default to TessTextRenderer Revert fd429c32, 43834da7, 05de195e. See #49, #59. The code in this commit solves the issue in a more elegant way, IMHO. Now you can use: * `tesseract eurotext.tif eurotext txt pdf` * `tesseract eurotext.tif eurotext txt hocr` * `tesseract eurotext.tif eurotext txt hocr pdf` NOTE: With `tesseract eurotext.tif eurotext` or `tesseract eurotext.tif eurotext txt` the psm will be set to '3', but... With `tesseract eurotext.tif eurotext txt pdf` or `tesseract eurotext.tif eurotext txt hocr` the psm will be set to '1'. --- api/tesseractmain.cpp | 48 +++++++++++++++++++++++++++------------ ccmain/tesseractclass.cpp | 2 +- ccmain/tesseractclass.h | 2 +- tessdata/configs/hocr | 1 - tessdata/configs/makebox | 1 - tessdata/configs/pdf | 1 - tessdata/configs/txt | 3 +++ tessdata/configs/unlv | 1 - 8 files changed, 38 insertions(+), 21 deletions(-) create mode 100644 tessdata/configs/txt diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index 822e868e..4698c90e 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -176,16 +176,16 @@ void PrintLangsList(tesseract::TessBaseAPI* api) { /** * We have 2 possible sources of pagesegmode: a config file and * the command line. For backwards compatibility reasons, the - * default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the - * default for this program is tesseract::PSM_AUTO. We will let - * the config file take priority, so the command-line default - * can take priority over the tesseract default, so we use the - * value from the command line only if the retrieved mode - * is still tesseract::PSM_SINGLE_BLOCK, indicating no change - * in any config file. Therefore the only way to force - * tesseract::PSM_SINGLE_BLOCK is from the command line. 
- * It would be simpler if we could set the value before Init, - * but that doesn't work. + * default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the + * default for this program is tesseract::PSM_AUTO. We will let + * the config file take priority, so the command-line default + * can take priority over the tesseract default, so we use the + * value from the command line only if the retrieved mode + * is still tesseract::PSM_SINGLE_BLOCK, indicating no change + * in any config file. Therefore the only way to force + * tesseract::PSM_SINGLE_BLOCK is from the command line. + * It would be simpler if we could set the value before Init, + * but that doesn't work. */ void FixPageSegMode(tesseract::TessBaseAPI* api, tesseract::PageSegMode pagesegmode) { @@ -295,19 +295,37 @@ void PreloadRenderers(tesseract::TessBaseAPI* api, if (b) { bool font_info; api->GetBoolVariable("hocr_font_info", &font_info); - renderers->push_back(new tesseract::TessHOcrRenderer(outputbase, font_info)); + renderers->push_back( + new tesseract::TessHOcrRenderer(outputbase, font_info)); } + api->GetBoolVariable("tessedit_create_pdf", &b); if (b) { renderers->push_back(new tesseract::TessPDFRenderer(outputbase, - api->GetDatapath())); + api->GetDatapath())); } + api->GetBoolVariable("tessedit_write_unlv", &b); - if (b) renderers->push_back(new tesseract::TessUnlvRenderer(outputbase)); + if (b) { + renderers->push_back(new tesseract::TessUnlvRenderer(outputbase)); + } + api->GetBoolVariable("tessedit_create_boxfile", &b); - if (b) renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase)); + if (b) { + renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase)); + } + + // disable text renderer when using one of these configs: + // ambigs.train, box.train, box.train.stderr, linebox, rebox + bool disable_text_renderer = + (api->GetBoolVariable("tessedit_ambigs_training", &b) && b) || + (api->GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) || + 
(api->GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b); + api->GetBoolVariable("tessedit_create_txt", &b); - if (b) renderers->push_back(new tesseract::TessTextRenderer(outputbase)); + if (b || (renderers->empty() && !disable_text_renderer) { + renderers->push_back(new tesseract::TessTextRenderer(outputbase)); + } } if (!renderers->empty()) { diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index e348c93f..04a4b1b0 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -381,7 +381,7 @@ Tesseract::Tesseract() this->params()), BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params()), - BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file", + BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params()), BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params()), diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 6666dec3..b6976a2d 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -1001,7 +1001,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_write_rep_codes, false, "Write repetition char code"); BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); - BOOL_VAR_H(tessedit_create_txt, true, "Write .txt output file"); + BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file"); BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); STRING_VAR_H(unrecognised_char, "|", diff --git a/tessdata/configs/hocr b/tessdata/configs/hocr index eba5c1dd..2bf1d848 100644 --- a/tessdata/configs/hocr +++ b/tessdata/configs/hocr @@ -1,3 +1,2 @@ -tessedit_create_txt 0 tessedit_create_hocr 1 tessedit_pageseg_mode 1 diff --git a/tessdata/configs/makebox b/tessdata/configs/makebox index 48506ca4..3d90ac26 100644 --- a/tessdata/configs/makebox +++ b/tessdata/configs/makebox @@ -1,2 +1 @@ -tessedit_create_txt 0 
tessedit_create_boxfile 1 diff --git a/tessdata/configs/pdf b/tessdata/configs/pdf index cc75e694..0d5f0f14 100644 --- a/tessdata/configs/pdf +++ b/tessdata/configs/pdf @@ -1,3 +1,2 @@ -tessedit_create_txt 0 tessedit_create_pdf 1 tessedit_pageseg_mode 1 diff --git a/tessdata/configs/txt b/tessdata/configs/txt new file mode 100644 index 00000000..5046f0b0 --- /dev/null +++ b/tessdata/configs/txt @@ -0,0 +1,3 @@ +# This config file should be used with other config files that create renderers. +# usage example: tesseract eurotext.tif eurotext txt hocr pdf +tessedit_create_txt 1 diff --git a/tessdata/configs/unlv b/tessdata/configs/unlv index b3eea318..87c111bd 100644 --- a/tessdata/configs/unlv +++ b/tessdata/configs/unlv @@ -1,3 +1,2 @@ -tessedit_create_txt 0 tessedit_write_unlv 1 tessedit_pageseg_mode 6 From a20156fc67de372dd542c9f3ddef1fdda1c12d4c Mon Sep 17 00:00:00 2001 From: amitdo Date: Fri, 11 Dec 2015 19:42:16 +0200 Subject: [PATCH 2/2] Add missing ')'_to make the code compile --- api/tesseractmain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index 4698c90e..1246bcbb 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -323,7 +323,7 @@ void PreloadRenderers(tesseract::TessBaseAPI* api, (api->GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b); api->GetBoolVariable("tessedit_create_txt", &b); - if (b || (renderers->empty() && !disable_text_renderer) { + if (b || (renderers->empty() && !disable_text_renderer)) { renderers->push_back(new tesseract::TessTextRenderer(outputbase)); } }