If there are no explicit renderers, default to TessTextRenderer

Revert fd429c32, 43834da7, 05de195e.

See #49, #59.

The code in this commit solves the issue in a more elegant way, IMHO.

Now you can use:
  * `tesseract eurotext.tif eurotext txt pdf`
  * `tesseract eurotext.tif eurotext txt hocr`
  * `tesseract eurotext.tif eurotext txt hocr pdf`

NOTE:
  With `tesseract eurotext.tif eurotext`
  or `tesseract eurotext.tif eurotext txt`
  the psm will be set to '3', but...
  With `tesseract eurotext.tif eurotext txt pdf`
  or `tesseract eurotext.tif eurotext txt hocr`
  the psm will be set to '1'.
This commit is contained in:
amitdo 2015-12-11 19:06:49 +02:00
parent d4e0c6459a
commit c2f5e9b849
8 changed files with 38 additions and 21 deletions

View File

@ -176,16 +176,16 @@ void PrintLangsList(tesseract::TessBaseAPI* api) {
/**
* We have 2 possible sources of pagesegmode: a config file and
* the command line. For backwards compatibility reasons, the
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
* default for this program is tesseract::PSM_AUTO. We will let
* the config file take priority, so the command-line default
* can take priority over the tesseract default, so we use the
* value from the command line only if the retrieved mode
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
* in any config file. Therefore the only way to force
* tesseract::PSM_SINGLE_BLOCK is from the command line.
* It would be simpler if we could set the value before Init,
* but that doesn't work.
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
* default for this program is tesseract::PSM_AUTO. We will let
* the config file take priority, so the command-line default
* can take priority over the tesseract default, so we use the
* value from the command line only if the retrieved mode
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
* in any config file. Therefore the only way to force
* tesseract::PSM_SINGLE_BLOCK is from the command line.
* It would be simpler if we could set the value before Init,
* but that doesn't work.
*/
void FixPageSegMode(tesseract::TessBaseAPI* api,
tesseract::PageSegMode pagesegmode) {
@ -295,19 +295,37 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
if (b) {
bool font_info;
api->GetBoolVariable("hocr_font_info", &font_info);
renderers->push_back(new tesseract::TessHOcrRenderer(outputbase, font_info));
renderers->push_back(
new tesseract::TessHOcrRenderer(outputbase, font_info));
}
api->GetBoolVariable("tessedit_create_pdf", &b);
if (b) {
renderers->push_back(new tesseract::TessPDFRenderer(outputbase,
api->GetDatapath()));
api->GetDatapath()));
}
api->GetBoolVariable("tessedit_write_unlv", &b);
if (b) renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
if (b) {
renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
}
api->GetBoolVariable("tessedit_create_boxfile", &b);
if (b) renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
if (b) {
renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
}
// disable text renderer when using one of these configs:
// ambigs.train, box.train, box.train.stderr, linebox, rebox
bool disable_text_renderer =
(api->GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
(api->GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
(api->GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
api->GetBoolVariable("tessedit_create_txt", &b);
if (b) renderers->push_back(new tesseract::TessTextRenderer(outputbase));
// Add the text renderer when tessedit_create_txt is set, or as the fallback
// when no other renderer has been added and disable_text_renderer is false.
if (b || (renderers->empty() && !disable_text_renderer)) {
renderers->push_back(new tesseract::TessTextRenderer(outputbase));
}
}
if (!renderers->empty()) {

View File

@ -381,7 +381,7 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
this->params()),
BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),

View File

@ -1001,7 +1001,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_write_rep_codes, false,
"Write repetition char code");
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
BOOL_VAR_H(tessedit_create_txt, true, "Write .txt output file");
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
STRING_VAR_H(unrecognised_char, "|",

View File

@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_create_hocr 1
tessedit_pageseg_mode 1

View File

@ -1,2 +1 @@
tessedit_create_txt 0
tessedit_create_boxfile 1

View File

@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_create_pdf 1
tessedit_pageseg_mode 1

3
tessdata/configs/txt Normal file
View File

@ -0,0 +1,3 @@
# This config file should be used with other config files that create renderers.
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
tessedit_create_txt 1

View File

@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_write_unlv 1
tessedit_pageseg_mode 6