mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 18:02:40 +08:00
Fix issue 123: allow the user-words (and user-patterns) file to be specified on the command line
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1093 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
bc09cd9040
commit
ee73e3b107
@ -90,6 +90,7 @@ int main(int argc, char **argv) {
|
|||||||
bool noocr = false;
|
bool noocr = false;
|
||||||
bool list_langs = false;
|
bool list_langs = false;
|
||||||
bool print_parameters = false;
|
bool print_parameters = false;
|
||||||
|
GenericVector<STRING> vars_vec, vars_values;
|
||||||
|
|
||||||
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
|
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
|
||||||
int arg = 1;
|
int arg = 1;
|
||||||
@ -100,6 +101,14 @@ int main(int argc, char **argv) {
|
|||||||
} else if (strcmp(argv[arg], "--tessdata-dir") == 0 && arg + 1 < argc) {
|
} else if (strcmp(argv[arg], "--tessdata-dir") == 0 && arg + 1 < argc) {
|
||||||
datapath = argv[arg + 1];
|
datapath = argv[arg + 1];
|
||||||
++arg;
|
++arg;
|
||||||
|
} else if (strcmp(argv[arg], "--user-words") == 0 && arg + 1 < argc) {
|
||||||
|
vars_vec.push_back("user_words_file");
|
||||||
|
vars_values.push_back(argv[arg + 1]);
|
||||||
|
++arg;
|
||||||
|
} else if (strcmp(argv[arg], "--user-patterns") == 0 && arg + 1 < argc) {
|
||||||
|
vars_vec.push_back("user_patterns_file");
|
||||||
|
vars_values.push_back(argv[arg + 1]);
|
||||||
|
++arg;
|
||||||
} else if (strcmp(argv[arg], "--list-langs") == 0) {
|
} else if (strcmp(argv[arg], "--list-langs") == 0) {
|
||||||
noocr = true;
|
noocr = true;
|
||||||
list_langs = true;
|
list_langs = true;
|
||||||
@ -130,8 +139,12 @@ int main(int argc, char **argv) {
|
|||||||
"[options...] [configfile...]\n\n", argv[0]);
|
"[options...] [configfile...]\n\n", argv[0]);
|
||||||
|
|
||||||
fprintf(stderr, "OCR options:\n");
|
fprintf(stderr, "OCR options:\n");
|
||||||
fprintf(stderr, " --tessdata-dir /path\tspecify location of tessdata"
|
fprintf(stderr, " --tessdata-dir /path\tspecify the location of tessdata"
|
||||||
" path\n");
|
" path\n");
|
||||||
|
fprintf(stderr, " --user-words /path/to/file\tspecify the location of user"
|
||||||
|
" words file\n");
|
||||||
|
fprintf(stderr, " --user-patterns /path/to/file\tspecify the location of"
|
||||||
|
" user patterns file\n");
|
||||||
fprintf(stderr, " -l lang[+lang]\tspecify language(s) used for OCR\n");
|
fprintf(stderr, " -l lang[+lang]\tspecify language(s) used for OCR\n");
|
||||||
fprintf(stderr, " -c configvar=value\tset value for control parameter.\n"
|
fprintf(stderr, " -c configvar=value\tset value for control parameter.\n"
|
||||||
"\t\t\tMultiple -c arguments are allowed.\n");
|
"\t\t\tMultiple -c arguments are allowed.\n");
|
||||||
@ -168,7 +181,7 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
api.SetOutputName(output);
|
api.SetOutputName(output);
|
||||||
int rc = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
|
int rc = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
|
||||||
&(argv[arg]), argc - arg, NULL, NULL, false);
|
&(argv[arg]), argc - arg, &vars_vec, &vars_values, false);
|
||||||
|
|
||||||
if (rc) {
|
if (rc) {
|
||||||
fprintf(stderr, "Could not initialize tesseract.\n");
|
fprintf(stderr, "Could not initialize tesseract.\n");
|
||||||
|
@ -35,11 +35,18 @@ Dict::Dict(CCUtil* ccutil)
|
|||||||
probability_in_context_(&tesseract::Dict::def_probability_in_context),
|
probability_in_context_(&tesseract::Dict::def_probability_in_context),
|
||||||
params_model_classify_(NULL),
|
params_model_classify_(NULL),
|
||||||
ccutil_(ccutil),
|
ccutil_(ccutil),
|
||||||
|
STRING_MEMBER(user_words_file, "",
|
||||||
|
"A filename of user-provided words.",
|
||||||
|
getCCUtil()->params()),
|
||||||
STRING_INIT_MEMBER(user_words_suffix, "",
|
STRING_INIT_MEMBER(user_words_suffix, "",
|
||||||
"A list of user-provided words.",
|
"A suffix of user-provided words located in tessdata.",
|
||||||
getCCUtil()->params()),
|
getCCUtil()->params()),
|
||||||
|
STRING_MEMBER(user_patterns_file, "",
|
||||||
|
"A filename of user-provided patterns.",
|
||||||
|
getCCUtil()->params()),
|
||||||
STRING_INIT_MEMBER(user_patterns_suffix, "",
|
STRING_INIT_MEMBER(user_patterns_suffix, "",
|
||||||
"A list of user-provided patterns.",
|
"A suffix of user-provided patterns located in "
|
||||||
|
"tessdata.",
|
||||||
getCCUtil()->params()),
|
getCCUtil()->params()),
|
||||||
BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
|
BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
|
||||||
getCCUtil()->params()),
|
getCCUtil()->params()),
|
||||||
@ -237,11 +244,16 @@ void Dict::Load(DawgCache *dawg_cache) {
|
|||||||
if (unambig_dawg_) dawgs_ += unambig_dawg_;
|
if (unambig_dawg_) dawgs_ += unambig_dawg_;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (((STRING &)user_words_suffix).length() > 0) {
|
if (((STRING &)user_words_suffix).length() > 0 ||
|
||||||
|
((STRING &)user_words_file).length() > 0) {
|
||||||
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
|
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
|
||||||
getUnicharset().size(), dawg_debug_level);
|
getUnicharset().size(), dawg_debug_level);
|
||||||
name = getCCUtil()->language_data_path_prefix;
|
if (((STRING &)user_words_file).length() > 0) {
|
||||||
name += user_words_suffix;
|
name = user_words_file;
|
||||||
|
} else {
|
||||||
|
name = getCCUtil()->language_data_path_prefix;
|
||||||
|
name += user_words_suffix;
|
||||||
|
}
|
||||||
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
|
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
|
||||||
Trie::RRP_REVERSE_IF_HAS_RTL)) {
|
Trie::RRP_REVERSE_IF_HAS_RTL)) {
|
||||||
tprintf("Error: failed to load %s\n", name.string());
|
tprintf("Error: failed to load %s\n", name.string());
|
||||||
@ -251,12 +263,17 @@ void Dict::Load(DawgCache *dawg_cache) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (((STRING &)user_patterns_suffix).length() > 0) {
|
if (((STRING &)user_patterns_suffix).length() > 0 ||
|
||||||
|
((STRING &)user_patterns_file).length() > 0) {
|
||||||
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
|
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
|
||||||
getUnicharset().size(), dawg_debug_level);
|
getUnicharset().size(), dawg_debug_level);
|
||||||
trie_ptr->initialize_patterns(&(getUnicharset()));
|
trie_ptr->initialize_patterns(&(getUnicharset()));
|
||||||
name = getCCUtil()->language_data_path_prefix;
|
if (((STRING &)user_patterns_file).length() > 0) {
|
||||||
name += user_patterns_suffix;
|
name = user_patterns_file;
|
||||||
|
} else {
|
||||||
|
name = getCCUtil()->language_data_path_prefix;
|
||||||
|
name += user_patterns_suffix;
|
||||||
|
}
|
||||||
if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
|
if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
|
||||||
tprintf("Error: failed to load %s\n", name.string());
|
tprintf("Error: failed to load %s\n", name.string());
|
||||||
delete trie_ptr;
|
delete trie_ptr;
|
||||||
|
@ -544,9 +544,13 @@ class Dict {
|
|||||||
/// Variable members.
|
/// Variable members.
|
||||||
/// These have to be declared and initialized after image_ptr_, which contains
|
/// These have to be declared and initialized after image_ptr_, which contains
|
||||||
/// the pointer to the params vector - the member of its base CCUtil class.
|
/// the pointer to the params vector - the member of its base CCUtil class.
|
||||||
STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
|
STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
|
||||||
|
STRING_VAR_H(user_words_suffix, "",
|
||||||
|
"A suffix of user-provided words located in tessdata.");
|
||||||
|
STRING_VAR_H(user_patterns_file, "",
|
||||||
|
"A filename of user-provided patterns.");
|
||||||
STRING_VAR_H(user_patterns_suffix, "",
|
STRING_VAR_H(user_patterns_suffix, "",
|
||||||
"A list of user-provided patterns.");
|
"A suffix of user-provided patterns located in tessdata.");
|
||||||
BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
|
BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
|
||||||
BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
|
BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
|
||||||
BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
|
BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
|
||||||
|
Loading…
Reference in New Issue
Block a user