Implement the min_characters_to_try parameter: the minimum number of characters to try during orientation and script detection; a page with too few characters is skipped entirely.

fixes #1729
This commit is contained in:
zdenop 2018-10-05 19:05:28 +02:00
parent 2cb609d202
commit 660dbaa9d5
3 changed files with 10 additions and 6 deletions

View File

@ -36,9 +36,6 @@
#include <algorithm>
#include <memory>
const int kMinCharactersToTry = 50;
const int kMaxCharactersToTry = 5 * kMinCharactersToTry;
const float kSizeRatioToReject = 2.0;
const int kMinAcceptableBlobHeight = 10;
@ -278,6 +275,8 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
BLOBNBOX_CLIST* blob_list, OSResults* osr,
tesseract::Tesseract* tess) {
OSResults osr_;
int minCharactersToTry = tess->min_characters_to_try;
int maxCharactersToTry = 5 * minCharactersToTry;
if (osr == nullptr)
osr = &osr_;
@ -286,13 +285,13 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
ScriptDetector s(allowed_scripts, osr, tess);
BLOBNBOX_C_IT filtered_it(blob_list);
int real_max = std::min(filtered_it.length(), kMaxCharactersToTry);
int real_max = std::min(filtered_it.length(), maxCharactersToTry);
// tprintf("Total blobs found = %d\n", blobs_total);
// tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
// tprintf("Number of blobs to try = %d\n", real_max);
// If there are too few characters, skip this page entirely.
if (real_max < kMinCharactersToTry / 2) {
if (real_max < minCharactersToTry / 2) {
tprintf("Too few characters. Skipping this page\n");
return 0;
}
@ -307,7 +306,7 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
int num_blobs_evaluated = 0;
for (int i = 0; i < real_max; ++i) {
if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
&& i > kMinCharactersToTry) {
&& i > minCharactersToTry) {
break;
}
++num_blobs_evaluated;

View File

@ -397,6 +397,9 @@ Tesseract::Tesseract()
INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
this->params()),
INT_MEMBER(min_characters_to_try, 50,
"Specify minimum characters to try to skip page entirely",
this->params()),
STRING_MEMBER(unrecognised_char, "|",
"Output char for unidentified blobs", this->params()),
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),

View File

@ -1043,6 +1043,8 @@ class Tesseract : public Wordrec {
"Create PDF with only one invisible text layer");
INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
INT_VAR_H(min_characters_to_try, 50,
"Specify minimum characters to try to skip page entirely");
STRING_VAR_H(unrecognised_char, "|",
"Output char for unidentified blobs");
INT_VAR_H(suspect_level, 99, "Suspect marker level");