Fix issue 123: allow the user-words (and user-patterns) file to be specified on the command line via --user-words and --user-patterns

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1093 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
zdenop 2014-05-04 21:11:00 +00:00
parent bc09cd9040
commit ee73e3b107
3 changed files with 46 additions and 12 deletions

View File

@ -90,6 +90,7 @@ int main(int argc, char **argv) {
bool noocr = false;
bool list_langs = false;
bool print_parameters = false;
GenericVector<STRING> vars_vec, vars_values;
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
int arg = 1;
@ -100,6 +101,14 @@ int main(int argc, char **argv) {
} else if (strcmp(argv[arg], "--tessdata-dir") == 0 && arg + 1 < argc) {
datapath = argv[arg + 1];
++arg;
} else if (strcmp(argv[arg], "--user-words") == 0 && arg + 1 < argc) {
vars_vec.push_back("user_words_file");
vars_values.push_back(argv[arg + 1]);
++arg;
} else if (strcmp(argv[arg], "--user-patterns") == 0 && arg + 1 < argc) {
vars_vec.push_back("user_patterns_file");
vars_values.push_back(argv[arg + 1]);
++arg;
} else if (strcmp(argv[arg], "--list-langs") == 0) {
noocr = true;
list_langs = true;
@ -130,8 +139,12 @@ int main(int argc, char **argv) {
"[options...] [configfile...]\n\n", argv[0]);
fprintf(stderr, "OCR options:\n");
fprintf(stderr, " --tessdata-dir /path\tspecify location of tessdata"
fprintf(stderr, " --tessdata-dir /path\tspecify the location of tessdata"
" path\n");
fprintf(stderr, " --user-words /path/to/file\tspecify the location of user"
" words file\n");
fprintf(stderr, " --user-patterns /path/to/file\tspecify the location of"
" user patterns file\n");
fprintf(stderr, " -l lang[+lang]\tspecify language(s) used for OCR\n");
fprintf(stderr, " -c configvar=value\tset value for control parameter.\n"
"\t\t\tMultiple -c arguments are allowed.\n");
@ -168,7 +181,7 @@ int main(int argc, char **argv) {
api.SetOutputName(output);
int rc = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
&(argv[arg]), argc - arg, NULL, NULL, false);
&(argv[arg]), argc - arg, &vars_vec, &vars_values, false);
if (rc) {
fprintf(stderr, "Could not initialize tesseract.\n");

View File

@ -35,11 +35,18 @@ Dict::Dict(CCUtil* ccutil)
probability_in_context_(&tesseract::Dict::def_probability_in_context),
params_model_classify_(NULL),
ccutil_(ccutil),
STRING_MEMBER(user_words_file, "",
"A filename of user-provided words.",
getCCUtil()->params()),
STRING_INIT_MEMBER(user_words_suffix, "",
"A list of user-provided words.",
"A suffix of user-provided words located in tessdata.",
getCCUtil()->params()),
STRING_MEMBER(user_patterns_file, "",
"A filename of user-provided patterns.",
getCCUtil()->params()),
STRING_INIT_MEMBER(user_patterns_suffix, "",
"A list of user-provided patterns.",
"A suffix of user-provided patterns located in "
"tessdata.",
getCCUtil()->params()),
BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
getCCUtil()->params()),
@ -237,11 +244,16 @@ void Dict::Load(DawgCache *dawg_cache) {
if (unambig_dawg_) dawgs_ += unambig_dawg_;
}
if (((STRING &)user_words_suffix).length() > 0) {
if (((STRING &)user_words_suffix).length() > 0 ||
((STRING &)user_words_file).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
getUnicharset().size(), dawg_debug_level);
if (((STRING &)user_words_file).length() > 0) {
name = user_words_file;
} else {
name = getCCUtil()->language_data_path_prefix;
name += user_words_suffix;
}
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
Trie::RRP_REVERSE_IF_HAS_RTL)) {
tprintf("Error: failed to load %s\n", name.string());
@ -251,12 +263,17 @@ void Dict::Load(DawgCache *dawg_cache) {
}
}
if (((STRING &)user_patterns_suffix).length() > 0) {
if (((STRING &)user_patterns_suffix).length() > 0 ||
((STRING &)user_patterns_file).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
getUnicharset().size(), dawg_debug_level);
trie_ptr->initialize_patterns(&(getUnicharset()));
if (((STRING &)user_patterns_file).length() > 0) {
name = user_patterns_file;
} else {
name = getCCUtil()->language_data_path_prefix;
name += user_patterns_suffix;
}
if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
tprintf("Error: failed to load %s\n", name.string());
delete trie_ptr;

View File

@ -544,9 +544,13 @@ class Dict {
/// Variable members.
/// These have to be declared and initialized after image_ptr_, which contains
/// the pointer to the params vector - the member of its base CCUtil class.
STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
STRING_VAR_H(user_words_file, "", "A filename of user-provided words.");
STRING_VAR_H(user_words_suffix, "",
"A suffix of user-provided words located in tessdata.");
STRING_VAR_H(user_patterns_file, "",
"A filename of user-provided patterns.");
STRING_VAR_H(user_patterns_suffix, "",
"A list of user-provided patterns.");
"A suffix of user-provided patterns located in tessdata.");
BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");