mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-11 12:43:17 +08:00
Fixed issue 1207
This commit is contained in:
parent
d0cb1071b2
commit
f927728169
@ -1,7 +1,23 @@
|
|||||||
///////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////
|
||||||
// File: tesseractclass.cpp
|
// File: tesseractclass.cpp
|
||||||
// Description: An instance of Tesseract. For thread safety, *every*
|
// Description: The Tesseract class. It holds/owns everything needed
|
||||||
// global variable goes in here, directly, or indirectly.
|
// to run Tesseract on a single language, and also a set of
|
||||||
|
// sub-Tesseracts to run sub-languages. For thread safety, *every*
|
||||||
|
// variable that was previously global or static (except for
|
||||||
|
// constant data, and some visual debugging flags) has been moved
|
||||||
|
// in here, directly, or indirectly.
|
||||||
|
// This makes it safe to run multiple Tesseracts in different
|
||||||
|
// threads in parallel, and keeps the different language
|
||||||
|
// instances separate.
|
||||||
|
// Some global functions remain, but they are isolated re-entrant
|
||||||
|
// functions that operate on their arguments. Functions that work
|
||||||
|
// on variable data have been moved to an appropriate class based
|
||||||
|
// mostly on the directory hierarchy. For more information see
|
||||||
|
// slide 6 of "2ArchitectureAndDataStructures" in
|
||||||
|
// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
|
||||||
|
// Some global data and related functions still exist in the
|
||||||
|
// training-related code, but they don't interfere with normal
|
||||||
|
// recognition operation.
|
||||||
// Author: Ray Smith
|
// Author: Ray Smith
|
||||||
// Created: Fri Mar 07 08:17:01 PST 2008
|
// Created: Fri Mar 07 08:17:01 PST 2008
|
||||||
//
|
//
|
||||||
@ -65,6 +81,9 @@ Tesseract::Tesseract()
|
|||||||
"Blacklist of chars not to recognize", this->params()),
|
"Blacklist of chars not to recognize", this->params()),
|
||||||
STRING_MEMBER(tessedit_char_whitelist, "",
|
STRING_MEMBER(tessedit_char_whitelist, "",
|
||||||
"Whitelist of chars to recognize", this->params()),
|
"Whitelist of chars to recognize", this->params()),
|
||||||
|
STRING_MEMBER(tessedit_char_unblacklist, "",
|
||||||
|
"List of chars to override tessedit_char_blacklist",
|
||||||
|
this->params()),
|
||||||
BOOL_MEMBER(tessedit_ambigs_training, false,
|
BOOL_MEMBER(tessedit_ambigs_training, false,
|
||||||
"Perform training for ambiguities", this->params()),
|
"Perform training for ambiguities", this->params()),
|
||||||
INT_MEMBER(pageseg_devanagari_split_strategy,
|
INT_MEMBER(pageseg_devanagari_split_strategy,
|
||||||
@ -578,11 +597,13 @@ void Tesseract::ResetDocumentDictionary() {
|
|||||||
void Tesseract::SetBlackAndWhitelist() {
|
void Tesseract::SetBlackAndWhitelist() {
|
||||||
// Set the white and blacklists (if any)
|
// Set the white and blacklists (if any)
|
||||||
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
|
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
|
||||||
tessedit_char_whitelist.string());
|
tessedit_char_whitelist.string(),
|
||||||
|
tessedit_char_unblacklist.string());
|
||||||
// Black and white lists should apply to all loaded classifiers.
|
// Black and white lists should apply to all loaded classifiers.
|
||||||
for (int i = 0; i < sub_langs_.size(); ++i) {
|
for (int i = 0; i < sub_langs_.size(); ++i) {
|
||||||
sub_langs_[i]->unicharset.set_black_and_whitelist(
|
sub_langs_[i]->unicharset.set_black_and_whitelist(
|
||||||
tessedit_char_blacklist.string(), tessedit_char_whitelist.string());
|
tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
|
||||||
|
tessedit_char_unblacklist.string());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,7 +1,12 @@
|
|||||||
///////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////
|
||||||
// File: tesseractclass.h
|
// File: tesseractclass.h
|
||||||
// Description: An instance of Tesseract. For thread safety, *every*
|
// Description: The Tesseract class. It holds/owns everything needed
|
||||||
|
// to run Tesseract on a single language, and also a set of
|
||||||
|
// sub-Tesseracts to run sub-languages. For thread safety, *every*
|
||||||
// global variable goes in here, directly, or indirectly.
|
// global variable goes in here, directly, or indirectly.
|
||||||
|
// This makes it safe to run multiple Tesseracts in different
|
||||||
|
// threads in parallel, and keeps the different language
|
||||||
|
// instances separate.
|
||||||
// Author: Ray Smith
|
// Author: Ray Smith
|
||||||
// Created: Fri Mar 07 08:17:01 PST 2008
|
// Created: Fri Mar 07 08:17:01 PST 2008
|
||||||
//
|
//
|
||||||
@ -743,6 +748,8 @@ class Tesseract : public Wordrec {
|
|||||||
"Blacklist of chars not to recognize");
|
"Blacklist of chars not to recognize");
|
||||||
STRING_VAR_H(tessedit_char_whitelist, "",
|
STRING_VAR_H(tessedit_char_whitelist, "",
|
||||||
"Whitelist of chars to recognize");
|
"Whitelist of chars to recognize");
|
||||||
|
STRING_VAR_H(tessedit_char_unblacklist, "",
|
||||||
|
"List of chars to override tessedit_char_blacklist");
|
||||||
BOOL_VAR_H(tessedit_ambigs_training, false,
|
BOOL_VAR_H(tessedit_ambigs_training, false,
|
||||||
"Perform training for ambiguities");
|
"Perform training for ambiguities");
|
||||||
INT_VAR_H(pageseg_devanagari_split_strategy,
|
INT_VAR_H(pageseg_devanagari_split_strategy,
|
||||||
|
@ -985,8 +985,10 @@ bool UNICHARSET::major_right_to_left() const {
|
|||||||
// Set a whitelist and/or blacklist of characters to recognize.
|
// Set a whitelist and/or blacklist of characters to recognize.
|
||||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||||
// An empty or NULL blacklist disables nothing.
|
// An empty or NULL blacklist disables nothing.
|
||||||
|
// An empty or NULL unblacklist has no effect.
|
||||||
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
||||||
const char* whitelist) {
|
const char* whitelist,
|
||||||
|
const char* unblacklist) {
|
||||||
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
|
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
|
||||||
// Set everything to default
|
// Set everything to default
|
||||||
for (int ch = 0; ch < size_used; ++ch)
|
for (int ch = 0; ch < size_used; ++ch)
|
||||||
@ -1009,6 +1011,15 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
|||||||
unichars[encoding[i]].properties.enabled = false;
|
unichars[encoding[i]].properties.enabled = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (unblacklist != NULL && unblacklist[0] != '\0') {
|
||||||
|
// Re-enable the unblacklist.
|
||||||
|
GenericVector<UNICHAR_ID> encoding;
|
||||||
|
encode_string(unblacklist, false, &encoding, NULL, NULL);
|
||||||
|
for (int i = 0; i < encoding.size(); ++i) {
|
||||||
|
if (encoding[i] != INVALID_UNICHAR_ID)
|
||||||
|
unichars[encoding[i]].properties.enabled = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int UNICHARSET::add_script(const char* script) {
|
int UNICHARSET::add_script(const char* script) {
|
||||||
|
@ -381,11 +381,14 @@ class UNICHARSET {
|
|||||||
// Set a whitelist and/or blacklist of characters to recognize.
|
// Set a whitelist and/or blacklist of characters to recognize.
|
||||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||||
// An empty or NULL blacklist disables nothing.
|
// An empty or NULL blacklist disables nothing.
|
||||||
|
// An empty or NULL unblacklist has no effect.
|
||||||
// The blacklist overrides the whitelist.
|
// The blacklist overrides the whitelist.
|
||||||
|
// The unblacklist overrides the blacklist.
|
||||||
// Each list is a string of utf8 character strings. Boundaries between
|
// Each list is a string of utf8 character strings. Boundaries between
|
||||||
// unicharset units are worked out automatically, and characters not in
|
// unicharset units are worked out automatically, and characters not in
|
||||||
// the unicharset are silently ignored.
|
// the unicharset are silently ignored.
|
||||||
void set_black_and_whitelist(const char* blacklist, const char* whitelist);
|
void set_black_and_whitelist(const char* blacklist, const char* whitelist,
|
||||||
|
const char* unblacklist);
|
||||||
|
|
||||||
// Set the isalpha property of the given unichar to the given value.
|
// Set the isalpha property of the given unichar to the given value.
|
||||||
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
|
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
|
||||||
|
Loading…
Reference in New Issue
Block a user