mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Implement the parameter min_characters_to_try, which sets the minimum number of characters to try and determines when a page with too few characters is skipped entirely.
fixes #1729
This commit is contained in:
parent
2cb609d202
commit
660dbaa9d5
@ -36,9 +36,6 @@
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
||||
const int kMinCharactersToTry = 50;
|
||||
const int kMaxCharactersToTry = 5 * kMinCharactersToTry;
|
||||
|
||||
const float kSizeRatioToReject = 2.0;
|
||||
const int kMinAcceptableBlobHeight = 10;
|
||||
|
||||
@ -278,6 +275,8 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
|
||||
BLOBNBOX_CLIST* blob_list, OSResults* osr,
|
||||
tesseract::Tesseract* tess) {
|
||||
OSResults osr_;
|
||||
int minCharactersToTry = tess->min_characters_to_try;
|
||||
int maxCharactersToTry = 5 * minCharactersToTry;
|
||||
if (osr == nullptr)
|
||||
osr = &osr_;
|
||||
|
||||
@ -286,13 +285,13 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
|
||||
ScriptDetector s(allowed_scripts, osr, tess);
|
||||
|
||||
BLOBNBOX_C_IT filtered_it(blob_list);
|
||||
int real_max = std::min(filtered_it.length(), kMaxCharactersToTry);
|
||||
int real_max = std::min(filtered_it.length(), maxCharactersToTry);
|
||||
// tprintf("Total blobs found = %d\n", blobs_total);
|
||||
// tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
|
||||
// tprintf("Number of blobs to try = %d\n", real_max);
|
||||
|
||||
// If there are too few characters, skip this page entirely.
|
||||
if (real_max < kMinCharactersToTry / 2) {
|
||||
if (real_max < minCharactersToTry / 2) {
|
||||
tprintf("Too few characters. Skipping this page\n");
|
||||
return 0;
|
||||
}
|
||||
@ -307,7 +306,7 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
|
||||
int num_blobs_evaluated = 0;
|
||||
for (int i = 0; i < real_max; ++i) {
|
||||
if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
|
||||
&& i > kMinCharactersToTry) {
|
||||
&& i > minCharactersToTry) {
|
||||
break;
|
||||
}
|
||||
++num_blobs_evaluated;
|
||||
|
@ -397,6 +397,9 @@ Tesseract::Tesseract()
|
||||
INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
|
||||
INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
|
||||
this->params()),
|
||||
INT_MEMBER(min_characters_to_try, 50,
|
||||
"Specify minimum characters to try to skip page entirely",
|
||||
this->params()),
|
||||
STRING_MEMBER(unrecognised_char, "|",
|
||||
"Output char for unidentified blobs", this->params()),
|
||||
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
|
||||
|
@ -1043,6 +1043,8 @@ class Tesseract : public Wordrec {
|
||||
"Create PDF with only one invisible text layer");
|
||||
INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
|
||||
INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
|
||||
INT_VAR_H(min_characters_to_try, 50,
|
||||
"Specify minimum characters to try to skip page entirely");
|
||||
STRING_VAR_H(unrecognised_char, "|",
|
||||
"Output char for unidentified blobs");
|
||||
INT_VAR_H(suspect_level, 99, "Suspect marker level");
|
||||
|
Loading…
Reference in New Issue
Block a user