mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-23 18:49:08 +08:00
If there is no explicit renderer(s), default to TessTextRenderer
Revert fd429c32, 43834da7, 05de195e. See #49, #59.
The code in this commit solves the issue in a more elegant way, IMHO.
Now you can use:
* `tesseract eurotext.tif eurotext txt pdf`
* `tesseract eurotext.tif eurotext txt hocr`
* `tesseract eurotext.tif eurotext txt hocr pdf`
NOTE: With `tesseract eurotext.tif eurotext` or `tesseract eurotext.tif eurotext txt` the psm will be set to '3', but with `tesseract eurotext.tif eurotext txt pdf` or `tesseract eurotext.tif eurotext txt hocr` the psm will be set to '1'.
This commit is contained in:
parent
d4e0c6459a
commit
c2f5e9b849
@ -176,16 +176,16 @@ void PrintLangsList(tesseract::TessBaseAPI* api) {
|
||||
/**
|
||||
* We have 2 possible sources of pagesegmode: a config file and
|
||||
* the command line. For backwards compatibility reasons, the
|
||||
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
|
||||
* default for this program is tesseract::PSM_AUTO. We will let
|
||||
* the config file take priority, so the command-line default
|
||||
* can take priority over the tesseract default, so we use the
|
||||
* value from the command line only if the retrieved mode
|
||||
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
|
||||
* in any config file. Therefore the only way to force
|
||||
* tesseract::PSM_SINGLE_BLOCK is from the command line.
|
||||
* It would be simpler if we could set the value before Init,
|
||||
* but that doesn't work.
|
||||
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
|
||||
* default for this program is tesseract::PSM_AUTO. We will let
|
||||
* the config file take priority, so the command-line default
|
||||
* can take priority over the tesseract default, so we use the
|
||||
* value from the command line only if the retrieved mode
|
||||
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
|
||||
* in any config file. Therefore the only way to force
|
||||
* tesseract::PSM_SINGLE_BLOCK is from the command line.
|
||||
* It would be simpler if we could set the value before Init,
|
||||
* but that doesn't work.
|
||||
*/
|
||||
void FixPageSegMode(tesseract::TessBaseAPI* api,
|
||||
tesseract::PageSegMode pagesegmode) {
|
||||
@ -295,19 +295,37 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
|
||||
if (b) {
|
||||
bool font_info;
|
||||
api->GetBoolVariable("hocr_font_info", &font_info);
|
||||
renderers->push_back(new tesseract::TessHOcrRenderer(outputbase, font_info));
|
||||
renderers->push_back(
|
||||
new tesseract::TessHOcrRenderer(outputbase, font_info));
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_pdf", &b);
|
||||
if (b) {
|
||||
renderers->push_back(new tesseract::TessPDFRenderer(outputbase,
|
||||
api->GetDatapath()));
|
||||
api->GetDatapath()));
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_write_unlv", &b);
|
||||
if (b) renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
|
||||
if (b) {
|
||||
renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_boxfile", &b);
|
||||
if (b) renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
|
||||
if (b) {
|
||||
renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
|
||||
}
|
||||
|
||||
// disable text renderer when using one of these configs:
|
||||
// ambigs.train, box.train, box.train.stderr, linebox, rebox
|
||||
bool disable_text_renderer =
|
||||
(api->GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
|
||||
(api->GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
|
||||
(api->GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
|
||||
|
||||
api->GetBoolVariable("tessedit_create_txt", &b);
|
||||
if (b) renderers->push_back(new tesseract::TessTextRenderer(outputbase));
|
||||
if (b || (renderers->empty() && !disable_text_renderer) {
|
||||
renderers->push_back(new tesseract::TessTextRenderer(outputbase));
|
||||
}
|
||||
}
|
||||
|
||||
if (!renderers->empty()) {
|
||||
|
@ -381,7 +381,7 @@ Tesseract::Tesseract()
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
|
||||
BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
|
||||
this->params()),
|
||||
|
@ -1001,7 +1001,7 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(tessedit_write_rep_codes, false,
|
||||
"Write repetition char code");
|
||||
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
|
||||
BOOL_VAR_H(tessedit_create_txt, true, "Write .txt output file");
|
||||
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
|
||||
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
|
||||
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
||||
STRING_VAR_H(unrecognised_char, "|",
|
||||
|
@ -1,3 +1,2 @@
|
||||
tessedit_create_txt 0
|
||||
tessedit_create_hocr 1
|
||||
tessedit_pageseg_mode 1
|
||||
|
@ -1,2 +1 @@
|
||||
tessedit_create_txt 0
|
||||
tessedit_create_boxfile 1
|
||||
|
@ -1,3 +1,2 @@
|
||||
tessedit_create_txt 0
|
||||
tessedit_create_pdf 1
|
||||
tessedit_pageseg_mode 1
|
||||
|
3
tessdata/configs/txt
Normal file
3
tessdata/configs/txt
Normal file
@ -0,0 +1,3 @@
|
||||
# This config file should be used together with other config files that create renderers.
|
||||
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
|
||||
tessedit_create_txt 1
|
@ -1,3 +1,2 @@
|
||||
tessedit_create_txt 0
|
||||
tessedit_write_unlv 1
|
||||
tessedit_pageseg_mode 6
|
||||
|
Loading…
Reference in New Issue
Block a user