Adding --print-fonts-table parameter & tessedit_font_id configuration option

This commit is contained in:
Lucas Cimon 2021-04-29 09:57:59 +02:00
parent 2e2a5b3ef4
commit b852d658cb
6 changed files with 67 additions and 5 deletions

View File

@ -150,6 +150,11 @@ public:
*/
const char *GetStringVariable(const char *name) const;
/**
* Print Tesseract fonts table to the given file.
*
* Writes one text line per font to fp, containing the numeric font ID, the
* font name and the is_italic / is_bold / is_fixed_pitch / is_serif /
* is_fraktur flags rendered as "true" or "false".
* Listing starts at ID 1; the entry at index 0 is not printed.
*
* @param fp Open, writable stream that receives the listing.
*/
void PrintFontsTable(FILE* fp) const;
/**
* Print Tesseract parameters to the given file.
*/

View File

@ -330,6 +330,22 @@ bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const
return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
}
/** Print Tesseract fonts table to the given file. */
void TessBaseAPI::PrintFontsTable(FILE *fp) const {
const int fontinfo_size = tesseract_->get_fontinfo_table().size();
for (int font_index = 1; font_index < fontinfo_size; ++font_index) {
FontInfo font = tesseract_->get_fontinfo_table().at(font_index);
fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
" is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
font_index, font.name,
font.is_italic() ? "true" : "false",
font.is_bold() ? "true" : "false",
font.is_fixed_pitch() ? "true" : "false",
font.is_serif() ? "true" : "false",
font.is_fraktur() ? "true" : "false");
}
}
/** Print Tesseract parameters to the given file. */
void TessBaseAPI::PrintVariables(FILE *fp) const {
ParamUtils::PrintParams(fp, tesseract_->params());

View File

@ -226,6 +226,9 @@ static void PrintHelpExtra(const char *program) {
#endif
"--version\n"
" %s --list-langs [--tessdata-dir PATH]\n"
#ifndef DISABLED_LEGACY_ENGINE
" %s --print-fonts-table [options...] [configfile...]\n"
#endif // ndef DISABLED_LEGACY_ENGINE
" %s --print-parameters [options...] [configfile...]\n"
" %s imagename|imagelist|stdin outputbase|stdout [options...] "
"[configfile...]\n"
@ -244,7 +247,11 @@ static void PrintHelpExtra(const char *program) {
#endif
"NOTE: These options must occur before any configfile.\n"
"\n",
program, program, program, program);
program, program, program, program
#ifndef DISABLED_LEGACY_ENGINE
, program
#endif // ndef DISABLED_LEGACY_ENGINE
);
PrintHelpForPSM();
#ifndef DISABLED_LEGACY_ENGINE
@ -263,6 +270,9 @@ static void PrintHelpExtra(const char *program) {
#endif
" -v, --version Show version information.\n"
" --list-langs List available languages for tesseract engine.\n"
#ifndef DISABLED_LEGACY_ENGINE
" --print-fonts-table Print tesseract fonts table.\n"
#endif // ndef DISABLED_LEGACY_ENGINE
" --print-parameters Print tesseract parameters.\n");
}
@ -358,7 +368,7 @@ static bool checkArgValues(int arg, const char *mode, int count) {
// NOTE: arg_i is used here to avoid ugly *i so many times in this function
static bool ParseArgs(int argc, char **argv, const char **lang, const char **image,
const char **outputbase, const char **datapath, l_int32 *dpi,
bool *list_langs, bool *print_parameters, std::vector<std::string> *vars_vec,
bool *list_langs, bool *print_parameters, bool* print_fonts_table, std::vector<std::string> *vars_vec,
std::vector<std::string> *vars_values, l_int32 *arg_i,
tesseract::PageSegMode *pagesegmode, tesseract::OcrEngineMode *enginemode) {
bool noocr = false;
@ -422,6 +432,11 @@ static bool ParseArgs(int argc, char **argv, const char **lang, const char **ima
} else if (strcmp(argv[i], "--print-parameters") == 0) {
noocr = true;
*print_parameters = true;
#ifndef DISABLED_LEGACY_ENGINE
} else if (strcmp(argv[i], "--print-fonts-table") == 0) {
noocr = true;
*print_fonts_table = true;
#endif // ndef DISABLED_LEGACY_ENGINE
} else if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) {
// handled properly after api init
++i;
@ -608,6 +623,7 @@ int main(int argc, char **argv) {
const char *datapath = nullptr;
bool list_langs = false;
bool print_parameters = false;
bool print_fonts_table = false;
l_int32 dpi = 0;
int arg_i = 1;
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
@ -634,7 +650,7 @@ int main(int argc, char **argv) {
#endif // HAVE_TIFFIO_H && _WIN32
if (!ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &dpi, &list_langs,
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode, &enginemode)) {
&print_parameters, &print_fonts_table, &vars_vec, &vars_values, &arg_i, &pagesegmode, &enginemode)) {
return EXIT_FAILURE;
}
@ -643,7 +659,7 @@ int main(int argc, char **argv) {
lang = "eng";
}
if (image == nullptr && !list_langs && !print_parameters) {
if (image == nullptr && !list_langs && !print_parameters && !print_fonts_table) {
return EXIT_SUCCESS;
}
@ -684,6 +700,16 @@ int main(int argc, char **argv) {
return EXIT_SUCCESS;
}
#ifndef DISABLED_LEGACY_ENGINE
if (print_fonts_table) {
FILE* fout = stdout;
fprintf(stdout, "Tesseract fonts table:\n");
api.PrintFontsTable(fout);
api.End();
return EXIT_SUCCESS;
}
#endif // ndef DISABLED_LEGACY_ENGINE
FixPageSegMode(api, pagesegmode);
if (dpi) {

View File

@ -1922,10 +1922,22 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
ASSERT_HOST(word->best_choice != nullptr);
#ifndef DISABLED_LEGACY_ENGINE
const int fontinfo_size = get_fontinfo_table().size();
const int fontinfo_size = fontinfo_table_.size();
if (fontinfo_size == 0) {
return;
}
if (tessedit_font_id > 0) {
if (tessedit_font_id >= fontinfo_size) {
tprintf("Error, invalid font ID provided: must be below %d.\n"
"Falling back to font auto-detection.\n", fontinfo_size);
} else {
word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
word->fontinfo2 = nullptr;
word->fontinfo_id_count = INT8_MAX;
word->fontinfo_id2_count = 0;
return;
}
}
std::vector<int> font_total_score(fontinfo_size);
// Compute the font scores for the word

View File

@ -129,6 +129,7 @@ Tesseract::Tesseract()
, BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary",
this->params())
, BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params())
, INT_MEMBER(tessedit_font_id, 0, "Font ID to use or zero", this->params())
, BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params())
, BOOL_MEMBER(tessedit_enable_bigram_correction, true,
"Enable correction based on the word bigram dictionary.", this->params())

View File

@ -798,6 +798,8 @@ public:
BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
BOOL_VAR_H(tessedit_enable_doc_dict, true, "Add words to the document dictionary");
BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
INT_VAR_H(tessedit_font_id, 0, "Disable font detection and use the font"
" corresponding to the ID specified instead");
BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
BOOL_VAR_H(tessedit_enable_bigram_correction, true,
"Enable correction based on the word bigram dictionary.");