mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-11 12:43:17 +08:00
Fixed issue 1207
This commit is contained in:
parent
d0cb1071b2
commit
f927728169
@ -1,7 +1,23 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: tesseractclass.cpp
|
||||
// Description: An instance of Tesseract. For thread safety, *every*
|
||||
// global variable goes in here, directly, or indirectly.
|
||||
// Description: The Tesseract class. It holds/owns everything needed
|
||||
// to run Tesseract on a single language, and also a set of
|
||||
// sub-Tesseracts to run sub-languages. For thread safety, *every*
|
||||
// variable that was previously global or static (except for
|
||||
// constant data, and some visual debugging flags) has been moved
|
||||
// in here, directly, or indirectly.
|
||||
// This makes it safe to run multiple Tesseracts in different
|
||||
// threads in parallel, and keeps the different language
|
||||
// instances separate.
|
||||
// Some global functions remain, but they are isolated re-entrant
|
||||
// functions that operate on their arguments. Functions that work
|
||||
// on variable data have been moved to an appropriate class based
|
||||
// mostly on the directory hierarchy. For more information see
|
||||
// slide 6 of "2ArchitectureAndDataStructures" in
|
||||
// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
|
||||
// Some global data and related functions still exist in the
|
||||
// training-related code, but they don't interfere with normal
|
||||
// recognition operation.
|
||||
// Author: Ray Smith
|
||||
// Created: Fri Mar 07 08:17:01 PST 2008
|
||||
//
|
||||
@ -65,6 +81,9 @@ Tesseract::Tesseract()
|
||||
"Blacklist of chars not to recognize", this->params()),
|
||||
STRING_MEMBER(tessedit_char_whitelist, "",
|
||||
"Whitelist of chars to recognize", this->params()),
|
||||
STRING_MEMBER(tessedit_char_unblacklist, "",
|
||||
"List of chars to override tessedit_char_blacklist",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_ambigs_training, false,
|
||||
"Perform training for ambiguities", this->params()),
|
||||
INT_MEMBER(pageseg_devanagari_split_strategy,
|
||||
@ -578,11 +597,13 @@ void Tesseract::ResetDocumentDictionary() {
|
||||
void Tesseract::SetBlackAndWhitelist() {
|
||||
// Set the white and blacklists (if any)
|
||||
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
|
||||
tessedit_char_whitelist.string());
|
||||
tessedit_char_whitelist.string(),
|
||||
tessedit_char_unblacklist.string());
|
||||
// Black and white lists should apply to all loaded classifiers.
|
||||
for (int i = 0; i < sub_langs_.size(); ++i) {
|
||||
sub_langs_[i]->unicharset.set_black_and_whitelist(
|
||||
tessedit_char_blacklist.string(), tessedit_char_whitelist.string());
|
||||
tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
|
||||
tessedit_char_unblacklist.string());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,12 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: tesseractclass.h
|
||||
// Description: An instance of Tesseract. For thread safety, *every*
|
||||
// Description: The Tesseract class. It holds/owns everything needed
|
||||
// to run Tesseract on a single language, and also a set of
|
||||
// sub-Tesseracts to run sub-languages. For thread safety, *every*
|
||||
// global variable goes in here, directly, or indirectly.
|
||||
// This makes it safe to run multiple Tesseracts in different
|
||||
// threads in parallel, and keeps the different language
|
||||
// instances separate.
|
||||
// Author: Ray Smith
|
||||
// Created: Fri Mar 07 08:17:01 PST 2008
|
||||
//
|
||||
@ -743,6 +748,8 @@ class Tesseract : public Wordrec {
|
||||
"Blacklist of chars not to recognize");
|
||||
STRING_VAR_H(tessedit_char_whitelist, "",
|
||||
"Whitelist of chars to recognize");
|
||||
STRING_VAR_H(tessedit_char_unblacklist, "",
|
||||
"List of chars to override tessedit_char_blacklist");
|
||||
BOOL_VAR_H(tessedit_ambigs_training, false,
|
||||
"Perform training for ambiguities");
|
||||
INT_VAR_H(pageseg_devanagari_split_strategy,
|
||||
|
@ -985,8 +985,10 @@ bool UNICHARSET::major_right_to_left() const {
|
||||
// Set a whitelist and/or blacklist of characters to recognize.
|
||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||
// An empty or NULL blacklist disables nothing.
|
||||
// An empty or NULL unblacklist has no effect.
|
||||
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
||||
const char* whitelist) {
|
||||
const char* whitelist,
|
||||
const char* unblacklist) {
|
||||
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
|
||||
// Set everything to default
|
||||
for (int ch = 0; ch < size_used; ++ch)
|
||||
@ -1009,6 +1011,15 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
||||
unichars[encoding[i]].properties.enabled = false;
|
||||
}
|
||||
}
|
||||
if (unblacklist != NULL && unblacklist[0] != '\0') {
|
||||
// Re-enable the unblacklist.
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
encode_string(unblacklist, false, &encoding, NULL, NULL);
|
||||
for (int i = 0; i < encoding.size(); ++i) {
|
||||
if (encoding[i] != INVALID_UNICHAR_ID)
|
||||
unichars[encoding[i]].properties.enabled = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int UNICHARSET::add_script(const char* script) {
|
||||
|
@ -381,11 +381,14 @@ class UNICHARSET {
|
||||
// Set a whitelist and/or blacklist of characters to recognize.
|
||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||
// An empty or NULL blacklist disables nothing.
|
||||
// An empty or NULL unblacklist has no effect.
|
||||
// The blacklist overrides the whitelist.
|
||||
// The unblacklist overrides the blacklist.
|
||||
// Each list is a string of utf8 character strings. Boundaries between
|
||||
// unicharset units are worked out automatically, and characters not in
|
||||
// the unicharset are silently ignored.
|
||||
void set_black_and_whitelist(const char* blacklist, const char* whitelist);
|
||||
void set_black_and_whitelist(const char* blacklist, const char* whitelist,
|
||||
const char* unblacklist);
|
||||
|
||||
// Set the isalpha property of the given unichar to the given value.
|
||||
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
|
||||
|
Loading…
Reference in New Issue
Block a user