mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
Fix issue 123: allow the user-words (and user-patterns) file to be specified on the command line
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1093 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
bc09cd9040
commit
ee73e3b107
@ -90,6 +90,7 @@ int main(int argc, char **argv) {
|
||||
bool noocr = false;
|
||||
bool list_langs = false;
|
||||
bool print_parameters = false;
|
||||
GenericVector<STRING> vars_vec, vars_values;
|
||||
|
||||
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
|
||||
int arg = 1;
|
||||
@ -100,6 +101,14 @@ int main(int argc, char **argv) {
|
||||
} else if (strcmp(argv[arg], "--tessdata-dir") == 0 && arg + 1 < argc) {
|
||||
datapath = argv[arg + 1];
|
||||
++arg;
|
||||
} else if (strcmp(argv[arg], "--user-words") == 0 && arg + 1 < argc) {
|
||||
vars_vec.push_back("user_words_file");
|
||||
vars_values.push_back(argv[arg + 1]);
|
||||
++arg;
|
||||
} else if (strcmp(argv[arg], "--user-patterns") == 0 && arg + 1 < argc) {
|
||||
vars_vec.push_back("user_patterns_file");
|
||||
vars_values.push_back(argv[arg + 1]);
|
||||
++arg;
|
||||
} else if (strcmp(argv[arg], "--list-langs") == 0) {
|
||||
noocr = true;
|
||||
list_langs = true;
|
||||
@ -130,8 +139,12 @@ int main(int argc, char **argv) {
|
||||
"[options...] [configfile...]\n\n", argv[0]);
|
||||
|
||||
fprintf(stderr, "OCR options:\n");
|
||||
fprintf(stderr, " --tessdata-dir /path\tspecify location of tessdata"
|
||||
fprintf(stderr, " --tessdata-dir /path\tspecify the location of tessdata"
|
||||
" path\n");
|
||||
fprintf(stderr, " --user-words /path/to/file\tspecify the location of user"
|
||||
" words file\n");
|
||||
fprintf(stderr, " --user-patterns /path/to/file\tspecify the location of"
|
||||
" user patterns file\n");
|
||||
fprintf(stderr, " -l lang[+lang]\tspecify language(s) used for OCR\n");
|
||||
fprintf(stderr, " -c configvar=value\tset value for control parameter.\n"
|
||||
"\t\t\tMultiple -c arguments are allowed.\n");
|
||||
@ -168,7 +181,7 @@ int main(int argc, char **argv) {
|
||||
|
||||
api.SetOutputName(output);
|
||||
int rc = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
|
||||
&(argv[arg]), argc - arg, NULL, NULL, false);
|
||||
&(argv[arg]), argc - arg, &vars_vec, &vars_values, false);
|
||||
|
||||
if (rc) {
|
||||
fprintf(stderr, "Could not initialize tesseract.\n");
|
||||
|
@ -35,11 +35,18 @@ Dict::Dict(CCUtil* ccutil)
|
||||
probability_in_context_(&tesseract::Dict::def_probability_in_context),
|
||||
params_model_classify_(NULL),
|
||||
ccutil_(ccutil),
|
||||
STRING_MEMBER(user_words_file, "",
|
||||
"A filename of user-provided words.",
|
||||
getCCUtil()->params()),
|
||||
STRING_INIT_MEMBER(user_words_suffix, "",
|
||||
"A list of user-provided words.",
|
||||
"A suffix of user-provided words located in tessdata.",
|
||||
getCCUtil()->params()),
|
||||
STRING_MEMBER(user_patterns_file, "",
|
||||
"A filename of user-provided patterns.",
|
||||
getCCUtil()->params()),
|
||||
STRING_INIT_MEMBER(user_patterns_suffix, "",
|
||||
"A list of user-provided patterns.",
|
||||
"A suffix of user-provided patterns located in "
|
||||
"tessdata.",
|
||||
getCCUtil()->params()),
|
||||
BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
|
||||
getCCUtil()->params()),
|
||||
@ -237,11 +244,16 @@ void Dict::Load(DawgCache *dawg_cache) {
|
||||
if (unambig_dawg_) dawgs_ += unambig_dawg_;
|
||||
}
|
||||
|
||||
if (((STRING &)user_words_suffix).length() > 0) {
|
||||
if (((STRING &)user_words_suffix).length() > 0 ||
|
||||
((STRING &)user_words_file).length() > 0) {
|
||||
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
|
||||
getUnicharset().size(), dawg_debug_level);
|
||||
if (((STRING &)user_words_file).length() > 0) {
|
||||
name = user_words_file;
|
||||
} else {
|
||||
name = getCCUtil()->language_data_path_prefix;
|
||||
name += user_words_suffix;
|
||||
}
|
||||
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
|
||||
Trie::RRP_REVERSE_IF_HAS_RTL)) {
|
||||
tprintf("Error: failed to load %s\n", name.string());
|
||||
@ -251,12 +263,17 @@ void Dict::Load(DawgCache *dawg_cache) {
|
||||
}
|
||||
}
|
||||
|
||||
if (((STRING &)user_patterns_suffix).length() > 0) {
|
||||
if (((STRING &)user_patterns_suffix).length() > 0 ||
|
||||
((STRING &)user_patterns_file).length() > 0) {
|
||||
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
|
||||
getUnicharset().size(), dawg_debug_level);
|
||||
trie_ptr->initialize_patterns(&(getUnicharset()));
|
||||
if (((STRING &)user_patterns_file).length() > 0) {
|
||||
name = user_patterns_file;
|
||||
} else {
|
||||
name = getCCUtil()->language_data_path_prefix;
|
||||
name += user_patterns_suffix;
|
||||
}
|
||||
if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
|
||||
tprintf("Error: failed to load %s\n", name.string());
|
||||
delete trie_ptr;
|
||||
|
@ -544,9 +544,13 @@ class Dict {
|
||||
/// Variable members.
|
||||
/// These have to be declared and initialized after image_ptr_, which contains
|
||||
/// the pointer to the params vector - the member of its base CCUtil class.
|
||||
STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
|
||||
STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
|
||||
STRING_VAR_H(user_words_suffix, "",
|
||||
"A suffix of user-provided words located in tessdata.");
|
||||
STRING_VAR_H(user_patterns_file, "",
|
||||
"A filename of user-provided patterns.");
|
||||
STRING_VAR_H(user_patterns_suffix, "",
|
||||
"A list of user-provided patterns.");
|
||||
"A suffix of user-provided patterns located in tessdata.");
|
||||
BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
|
||||
BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
|
||||
BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
|
||||
|
Loading…
Reference in New Issue
Block a user