mirror of https://github.com/tesseract-ocr/tesseract.git

Major updates to training system as a result of extensive testing on 100 languages

parent 21805e63a4
commit 6be25156f7

1131  training/language-specific.sh  (new file)
File diff suppressed because it is too large
@@ -43,7 +43,7 @@ static string EncodeAsUTF8(const char32 ch32) {
 // from. Note that this range does not contain the custom ligatures that we
 // encode in the private use area.
 const int kMinLigature = 0xfb00;
-const int kMaxLigature = 0xfb4f;
+const int kMaxLigature = 0xfb17;  // Don't put the wide Hebrew letters in.
 
 /* static */
 SmartPtr<LigatureTable> LigatureTable::instance_;
@@ -51,6 +51,12 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
 BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
                 "Does a one-time deletion of cache files from the "
                 "fontconfig_tmpdir before initializing fontconfig.");
+BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true,
+                "Does a one-time reset of the fontconfig config file to point"
+                " to fonts_dir before initializing fontconfig. Set to true"
+                " if fontconfig_refresh_cache is true. Set it to false to use"
+                " multiple instances in separate processes without having to"
+                " rescan the fonts_dir, using a previously setup font cache");
 
 #ifndef USE_STD_NAMESPACE
 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
@@ -67,6 +73,8 @@ namespace tesseract {
 // in pixels.
 const int kDefaultResolution = 300;
 
+bool PangoFontInfo::fontconfig_initialized_ = false;
+
 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
   Clear();
 }
@@ -103,34 +111,35 @@ string PangoFontInfo::DescriptionName() const {
 
 // Initializes Fontconfig for use by writing a fake fonts.conf file into the
 // FLAGS_fontconfigs_tmpdir directory, that points to the supplied
-// FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
-// to point to this fonts.conf file.
-static void InitFontconfig() {
-  static bool init_fontconfig = false;
-  if (init_fontconfig || FLAGS_fonts_dir.empty()) {
-    init_fontconfig = true;
+// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
+// to point to this fonts.conf file. If force_clear, the cache is refreshed
+// even if it has already been initialized.
+void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) {
+  if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
+    fontconfig_initialized_ = true;
     return;
   }
-  if (FLAGS_fontconfig_refresh_cache) {
-    tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str());
+  if (FLAGS_fontconfig_refresh_cache || force_clear) {
     File::DeleteMatchingFiles(File::JoinPath(
-        FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str());
+        FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str());
+  }
+  if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
+      force_clear) {
+    const int MAX_FONTCONF_FILESIZE = 1024;
+    char fonts_conf_template[MAX_FONTCONF_FILESIZE];
+    snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
+             "<?xml version=\"1.0\"?>\n"
+             "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
+             "<fontconfig>\n"
+             "<dir>%s</dir>\n"
+             "<cachedir>%s</cachedir>\n"
+             "<config></config>\n"
+             "</fontconfig>", fonts_dir.c_str(),
+             FLAGS_fontconfig_tmpdir.c_str());
+    string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
+                                            "fonts.conf");
+    File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
   }
-  tprintf("Initializing fontconfig\n");
-  const int MAX_FONTCONF_FILESIZE = 1024;
-  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
-  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
-           "<?xml version=\"1.0\"?>\n"
-           "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
-           "<fontconfig>\n"
-           "<dir>%s</dir>\n"
-           "<cachedir>%s</cachedir>\n"
-           "<config></config>\n"
-           "</fontconfig>", FLAGS_fonts_dir.c_str(),
-           FLAGS_fontconfig_tmpdir.c_str());
-  string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
-                                          "fonts.conf");
-  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
 #ifdef _WIN32
   std::string env("FONTCONFIG_PATH=");
   env.append(FLAGS_fontconfig_tmpdir.c_str());
@@ -141,12 +150,18 @@ static void InitFontconfig() {
   // Fix the locale so that the reported font names are consistent.
   setenv("LANG", "en_US.utf8", true);
 #endif  // _WIN32
-  init_fontconfig = true;
+  if (!fontconfig_initialized_ || force_clear) {
+    if (FcInitReinitialize() != FcTrue) {
+      tprintf("FcInitiReinitialize failed!!\n");
+    }
+  }
+  fontconfig_initialized_ = true;
+  FontUtils::ReInit();
 }
 
 static void ListFontFamilies(PangoFontFamily*** families,
                              int* n_families) {
-  InitFontconfig();
+  PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
   PangoFontMap* font_map = pango_cairo_font_map_get_default();
   DISABLE_HEAP_LEAK_CHECK;
   pango_font_map_list_families(font_map, families, n_families);
@@ -220,7 +235,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
 // in the font map. Note that if the font is wholly missing, this could
 // correspond to a completely different font family and face.
 PangoFont* PangoFontInfo::ToPangoFont() const {
-  InitFontconfig();
+  InitFontConfig(false, FLAGS_fonts_dir);
   PangoFontMap* font_map = pango_cairo_font_map_get_default();
   PangoContext* context = pango_context_new();
   pango_cairo_context_set_resolution(context, resolution_);
@@ -253,6 +268,28 @@ bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
   return true;
 }
 
+// This variant of strncpy permits src and dest to overlap. It will copy the
+// first byte first.
+static char* my_strnmove(char* dest, const char* src, size_t n) {
+  char* ret = dest;
+
+  // Copy characters until n reaches zero or the src byte is a nul.
+  do {
+    *dest = *src;
+    --n;
+    ++dest;
+    ++src;
+  } while (n && src[0]);
+
+  // If we reached a nul byte and there are more 'n' left, zero them out.
+  while (n) {
+    *dest = '\0';
+    --n;
+    ++dest;
+  }
+  return ret;
+}
+
 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
   PangoFont* font = ToPangoFont();
   PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
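The new my_strnmove exists because DropUncoveredChars now compacts the string in place, so the source and destination ranges overlap; strncpy has undefined behavior for overlapping buffers, while copying the first byte first makes a left shift safe. A minimal standalone sketch (not part of this commit) of the same logic:

    #include <cstddef>
    #include <cstdio>

    // Same forward-copying logic as the my_strnmove added above.
    static char* my_strnmove(char* dest, const char* src, size_t n) {
      char* ret = dest;
      do {
        *dest = *src;
        --n;
        ++dest;
        ++src;
      } while (n && src[0]);
      while (n) {
        *dest = '\0';
        --n;
        ++dest;
      }
      return ret;
    }

    int main() {
      char buf[] = "abcdef";
      // Shift "cdef" left over "ab". strncpy(buf, buf + 2, 4) would be
      // undefined behavior here because the ranges overlap; copying the
      // first byte first makes the left shift safe.
      my_strnmove(buf, buf + 2, 4);
      buf[4] = '\0';
      printf("%s\n", buf);  // prints "cdef"
      return 0;
    }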
@@ -265,23 +302,30 @@ int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
       UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
   const UNICHAR::const_iterator it_end =
       UNICHAR::end(utf8_text->c_str(), utf8_text->length());
-  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
+  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
     // Skip bad utf-8.
-    if (!it.is_legal())
-      continue;  // One suitable error message will still be issued.
-    if (!IsWhitespace(*it) && !pango_is_zero_width(*it) &&
-        pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
+    if (!it.is_legal()) {
+      ++it;  // One suitable error message will still be issued.
+      continue;
+    }
+    int unicode = *it;
+    int utf8_len = it.utf8_len();
+    const char* utf8_char = it.utf8_data();
+    // Move it forward before the data gets modified.
+    ++it;
+    if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
+        pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
       if (TLOG_IS_ON(2)) {
-        char tmp[5];
-        int len = it.get_utf8(tmp);
-        tmp[len] = '\0';
-        tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
+        UNICHAR unichar(unicode);
+        char* str = unichar.utf8_str();
+        tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
+        delete[] str;
       }
       ++num_dropped_chars;
       continue;
     }
-    strncpy(out, it.utf8_data(), it.utf8_len());
-    out += it.utf8_len();
+    my_strnmove(out, utf8_char, utf8_len);
+    out += utf8_len;
   }
   utf8_text->resize(out - utf8_text->c_str());
   return num_dropped_chars;
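The rewritten loop also fixes a subtle ordering bug: the old code still dereferenced the iterator after writing into the very buffer it iterates over. The pattern adopted above is: capture everything you need, advance the read position, and only then mutate. A hypothetical isolated sketch (DropSpacesInPlace is illustrative, not from this commit):

    #include <string>

    // Drop spaces from s in place: read first, advance second, write last,
    // so the write can never invalidate the current read position.
    void DropSpacesInPlace(std::string* s) {
      char* out = &(*s)[0];
      for (std::string::size_type i = 0; i < s->size();) {
        char c = (*s)[i];  // 1. capture the data
        ++i;               // 2. advance the read position
        if (c != ' ') {    // 3. only now write to the buffer
          *out++ = c;
        }
      }
      s->resize(out - s->c_str());
    }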
@@ -438,6 +482,7 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
 
 
 // ------------------------  FontUtils  ------------------------------------
+vector<string> FontUtils::available_fonts_;  // cache list
 
 // Returns whether the specified font description is available in the fonts
 // directory.
@@ -449,7 +494,8 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
 // from the font_map, and then check what we loaded to see if it has the
 // description we expected. If it is not, then the font is deemed unavailable.
 /* static */
-bool FontUtils::IsAvailableFont(const char* input_query_desc) {
+bool FontUtils::IsAvailableFont(const char* input_query_desc,
+                                string* best_match) {
   string query_desc(input_query_desc);
   if (PANGO_VERSION <= 12005) {
     // Strip commas and any ' Medium' substring in the name.
@@ -466,7 +512,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
                     query_desc.c_str());
   PangoFont* selected_font = NULL;
   {
-    InitFontconfig();
+    PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
     PangoFontMap* font_map = pango_cairo_font_map_get_default();
     PangoContext* context = pango_context_new();
     pango_context_set_font_map(context, font_map);
@@ -490,7 +536,16 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
   char* selected_desc_str = pango_font_description_to_string(selected_desc);
   tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc.c_str(),
        selected_desc_str);
+  if (!equal && best_match != NULL) {
+    *best_match = selected_desc_str;
+    // Clip the ending ' 0' if there is one. It seems that, if there is no
+    // point size on the end of the fontname, then Pango always appends ' 0'.
+    int len = best_match->size();
+    if (len > 2 && best_match->at(len - 1) == '0' &&
+        best_match->at(len - 2) == ' ') {
+      *best_match = best_match->substr(0, len - 2);
+    }
+  }
   g_free(selected_desc_str);
   pango_font_description_free(selected_desc);
   g_object_unref(selected_font);
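A hypothetical caller sketch (CheckFont and the log text are illustrative, not from this commit) showing how the new out-parameter can report the substitution Pango would make for a missing font:

    #include <cstdio>
    #include <string>

    void CheckFont(const char* desc) {
      std::string best_match;
      // New overload: on failure, best_match receives the closest font that
      // Pango actually loaded in place of the requested description.
      if (!tesseract::FontUtils::IsAvailableFont(desc, &best_match)) {
        printf("Font '%s' is unavailable; closest loadable match is '%s'\n",
               desc, best_match.c_str());
      }
    }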
@@ -512,7 +567,6 @@ static bool ShouldIgnoreFontFamilyName(const char* query) {
 // Outputs description names of available fonts.
 /* static */
 const vector<string>& FontUtils::ListAvailableFonts() {
-  static vector<string> available_fonts_;  // cache list
   if (available_fonts_.size()) {
     return available_fonts_;
   }
@@ -536,8 +590,9 @@ const vector<string>& FontUtils::ListAvailableFonts() {
   for (int i = 0; i < n_families; ++i) {
     const char* family_name = pango_font_family_get_name(families[i]);
     tlog(2, "Listing family %s\n", family_name);
-    if (ShouldIgnoreFontFamilyName(family_name))
+    if (ShouldIgnoreFontFamilyName(family_name)) {
       continue;
+    }
 
     int n_faces;
     PangoFontFace** faces = NULL;
@@ -733,4 +788,8 @@ bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
   return false;
 }
 
+// PangoFontInfo is reinitialized, so clear the static list of fonts.
+/* static */
+void FontUtils::ReInit() { available_fonts_.clear(); }
+
 }  // namespace tesseract
@@ -83,6 +83,11 @@ class PangoFontInfo {
   bool GetSpacingProperties(const string& utf8_char,
                             int* x_bearing, int* x_advance) const;
 
+  // Initializes FontConfig by setting its environment variable and creating
+  // a fonts.conf file that points to the given fonts_dir. Once initialized,
+  // it is not re-initialized unless force_clear is true.
+  static void InitFontConfig(bool force_clear, const string& fonts_dir);
+
   // Accessors
   string DescriptionName() const;
   // Font Family name eg. "Arial"
@@ -123,6 +128,10 @@ class PangoFontInfo {
   // Default output resolution to assume for GetSpacingProperties() and any
   // other methods that returns pixel values.
   int resolution_;
+  // Fontconfig operates through an environment variable, so it intrinsically
+  // cannot be thread-friendly, but you can serialize multiple independent
+  // font configurations by calling InitFontConfig(true, path).
+  static bool fontconfig_initialized_;
 
  private:
   PangoFontInfo(const PangoFontInfo&);
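The comment above implies a usage pattern for serializing independent font configurations within one process. A hypothetical sketch (ProcessTwoFontDirs is illustrative, not from this commit), using only the InitFontConfig signature declared above:

    // Process two independent font directories sequentially: force_clear=true
    // rewrites fonts.conf and reinitializes fontconfig for each directory.
    void ProcessTwoFontDirs(const string& fonts_dir_a,
                            const string& fonts_dir_b) {
      PangoFontInfo::InitFontConfig(true, fonts_dir_a);
      // ... render/measure with fonts from fonts_dir_a ...
      PangoFontInfo::InitFontConfig(true, fonts_dir_b);
      // ... render/measure with fonts from fonts_dir_b ...
    }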
@@ -135,7 +144,13 @@ class FontUtils {
  public:
   // Returns true if the font of the given description name is available in the
   // target directory specified by --fonts_dir
-  static bool IsAvailableFont(const char* font_desc);
+  static bool IsAvailableFont(const char* font_desc) {
+    return IsAvailableFont(font_desc, NULL);
+  }
+  // Returns true if the font of the given description name is available in the
+  // target directory specified by --fonts_dir. If false is returned, and
+  // best_match is not NULL, the closest matching font is returned there.
+  static bool IsAvailableFont(const char* font_desc, string* best_match);
   // Outputs description names of available fonts.
   static const vector<string>& ListAvailableFonts();
 
@@ -181,6 +196,12 @@ class FontUtils {
   static int FontScore(const unordered_map<char32, inT64>& ch_map,
                        const string& fontname, int* raw_score,
                        vector<bool>* ch_flags);
 
+  // PangoFontInfo is reinitialized, so clear the static list of fonts.
+  static void ReInit();
+
+ private:
+  static vector<string> available_fonts_;  // cache list
+
 };
 }  // namespace tesseract
 
@@ -7,14 +7,8 @@
 #include <string>
 
 #include "commandlineflags.h"
-#include "fileio.h"
-#include "genericvector.h"
-#include "icuerrorcode.h"
-#include "normstrngs.h"
-#include "strngs.h"
-#include "unicharset.h"
-#include "unicode/uchar.h"    // from libicu
-#include "unicode/uscript.h"  // from libicu
+#include "tprintf.h"
+#include "unicharset_training_utils.h"
 
 // The directory that is searched for universal script unicharsets.
 STRING_PARAM_FLAG(script_dir, "",
@@ -25,157 +19,6 @@ DECLARE_STRING_PARAM_FLAG(U);
 DECLARE_STRING_PARAM_FLAG(O);
 DECLARE_STRING_PARAM_FLAG(X);
 
-namespace tesseract {
-
-// Helper sets the character attribute properties and sets up the script table.
-// Does not set tops and bottoms.
-static void SetupBasicProperties(UNICHARSET* unicharset) {
-  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
-    // Convert any custom ligatures.
-    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
-    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
-      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
-        unichar_str = UNICHARSET::kCustomLigatures[i][0];
-        break;
-      }
-    }
-
-    // Convert the unichar to UTF32 representation
-    GenericVector<char32> uni_vector;
-    tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
-
-    // Assume that if the property is true for any character in the string,
-    // then it holds for the whole "character".
-    bool unichar_isalpha = false;
-    bool unichar_islower = false;
-    bool unichar_isupper = false;
-    bool unichar_isdigit = false;
-    bool unichar_ispunct = false;
-
-    for (int i = 0; i < uni_vector.size(); ++i) {
-      if (u_isalpha(uni_vector[i]))
-        unichar_isalpha = true;
-      if (u_islower(uni_vector[i]))
-        unichar_islower = true;
-      if (u_isupper(uni_vector[i]))
-        unichar_isupper = true;
-      if (u_isdigit(uni_vector[i]))
-        unichar_isdigit = true;
-      if (u_ispunct(uni_vector[i]))
-        unichar_ispunct = true;
-    }
-
-    unicharset->set_isalpha(unichar_id, unichar_isalpha);
-    unicharset->set_islower(unichar_id, unichar_islower);
-    unicharset->set_isupper(unichar_id, unichar_isupper);
-    unicharset->set_isdigit(unichar_id, unichar_isdigit);
-    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
-
-    tesseract::IcuErrorCode err;
-    unicharset->set_script(unichar_id, uscript_getName(
-        uscript_getScript(uni_vector[0], err)));
-
-    const int num_code_points = uni_vector.size();
-    // Obtain the lower/upper case if needed and record it in the properties.
-    unicharset->set_other_case(unichar_id, unichar_id);
-    if (unichar_islower || unichar_isupper) {
-      GenericVector<char32> other_case(num_code_points, 0);
-      for (int i = 0; i < num_code_points; ++i) {
-        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
-        // However since they deal with UChars (so need a conversion function
-        // from char32 or UTF8string) and require a meaningful locale string,
-        // for now u_tolower()/u_toupper() are used.
-        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
-            u_tolower(uni_vector[i]);
-      }
-      STRING other_case_uch;
-      tesseract::UTF32ToUTF8(other_case, &other_case_uch);
-      UNICHAR_ID other_case_id =
-          unicharset->unichar_to_id(other_case_uch.c_str());
-      if (other_case_id != INVALID_UNICHAR_ID) {
-        unicharset->set_other_case(unichar_id, other_case_id);
-      } else {
-        tprintf("Other case %s of %s is not in unicharset\n",
-                other_case_uch.c_str(), unichar_str);
-      }
-    }
-
-    // Set RTL property and obtain mirror unichar ID from ICU.
-    GenericVector<char32> mirrors(num_code_points, 0);
-    for (int i = 0; i < num_code_points; ++i) {
-      mirrors[i] = u_charMirror(uni_vector[i]);
-      if (i == 0) {  // set directionality to that of the 1st code point
-        unicharset->set_direction(unichar_id,
-                                  static_cast<UNICHARSET::Direction>(
-                                      u_charDirection(uni_vector[i])));
-      }
-    }
-    STRING mirror_uch;
-    tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
-    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
-    if (mirror_uch_id != INVALID_UNICHAR_ID) {
-      unicharset->set_mirror(unichar_id, mirror_uch_id);
-    } else {
-      tprintf("Mirror %s of %s is not in unicharset\n",
-              mirror_uch.c_str(), unichar_str);
-    }
-
-    // Record normalized version of this unichar.
-    STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
-    if (unichar_id != 0 && normed_str.length() > 0) {
-      unicharset->set_normed(unichar_id, normed_str.c_str());
-    } else {
-      unicharset->set_normed(unichar_id, unichar_str);
-    }
-  }
-  unicharset->post_load_setup();
-}
-
-// Helper to set the properties for an input unicharset file, writes to the
-// output file. If an appropriate script unicharset can be found in the
-// script_dir directory, then the tops and bottoms are expanded using the
-// script unicharset.
-// If non-empty, xheight data for the fonts are written to the xheights_file.
-static void SetPropertiesForInputFile(const string& script_dir,
-                                      const string& input_unicharset_file,
-                                      const string& output_unicharset_file,
-                                      const string& output_xheights_file) {
-  UNICHARSET unicharset;
-
-  // Load the input unicharset
-  unicharset.load_from_file(input_unicharset_file.c_str());
-  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
-          input_unicharset_file.c_str());
-
-  // Set unichar properties
-  tprintf("Setting unichar properties\n");
-  SetupBasicProperties(&unicharset);
-  string xheights_str;
-  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
-    // Load the unicharset for the script if available.
-    string filename = script_dir + "/" +
-        unicharset.get_script_from_script_id(s) + ".unicharset";
-    UNICHARSET script_set;
-    if (script_set.load_from_file(filename.c_str())) {
-      unicharset.SetPropertiesFromOther(script_set);
-    }
-    // Load the xheights for the script if available.
-    filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
-        ".xheights";
-    string script_heights;
-    if (File::ReadFileToString(filename, &script_heights))
-      xheights_str += script_heights;
-  }
-  if (!output_xheights_file.empty())
-    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
-
-  // Write the output unicharset
-  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
-  unicharset.save_to_file(output_unicharset_file.c_str());
-}
-}  // namespace tesseract
-
-
 int main(int argc, char** argv) {
   tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
 
@@ -819,6 +819,7 @@ int StringRenderer::RenderToImage(const char* text, int text_length,
 int StringRenderer::RenderAllFontsToImage(double min_coverage,
                                           const char* text, int text_length,
                                           string* font_used, Pix** image) {
+  *image = NULL;
   // Select a suitable font to render the title with.
   const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
   string title_font;
@@ -882,10 +883,9 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage,
            all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
     }
   }
-  *image = NULL;
   font_index_ = 0;
   char_map_.clear();
-  return last_offset_;
+  return last_offset_ == 0 ? -1 : last_offset_;
 }
 
 }  // namespace tesseract
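Together with the *image = NULL moved to function entry (previous hunk), the new return value lets a caller tell "rendered something" apart from "nothing left": a 0 return used to be ambiguous. A hypothetical driver sketch, not from this commit; renderer, text, text_length, and min_coverage are assumed, and Pix/pixDestroy come from Leptonica:

    Pix* pix = NULL;
    string font_used;
    // Assumed driver shape: render repeatedly until a negative return,
    // which now unambiguously signals completion.
    while (renderer.RenderAllFontsToImage(min_coverage, text, text_length,
                                          &font_used, &pix) >= 0) {
      if (pix != NULL) {
        // ... consume the page rendered with font_used ...
        pixDestroy(&pix);
        pix = NULL;
      }
    }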
@@ -44,516 +44,7 @@
 # appropriate --fonts_dir path.
 
 
-FONTS=(
-    "Arial" \
-    "Times New Roman," \
-)
-FONTS_DIR="/usr/share/fonts/truetype/"
-OUTPUT_DIR="/tmp/tesstrain/tessdata"
-OVERWRITE=0
-RUN_SHAPE_CLUSTERING=0
-EXTRACT_FONT_PROPERTIES=1
-WORKSPACE_DIR="/tmp/tesstrain"
-
-
-# Logging helper functions.
-tlog() {
-    echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE}
-}
-
-err() {
-    echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE}
-    exit 1
-}
-
-# Helper function to run a command and append its output to a log. Aborts early
-# if the program file is not found.
-# Usage: run_cmd CMD ARG1 ARG2...
-run_cmd() {
-    local cmd=$1
-    shift
-    if [[ ! -x ${cmd} ]]; then
-        err "File ${cmd} not found"
-    fi
-    tlog "[$(date)] ${cmd} $@"
-    ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
-    # check completion status
-    if [[ $? -gt 0 ]]; then
-        err "Program $(basename ${cmd}) failed. Abort."
-    fi
-}
-
-# Check if all the given files exist, or exit otherwise.
-# Used to check required input files and produced output files in each phase.
-# Usage: check_file_readable FILE1 FILE2...
-check_file_readable() {
-    for file in $@; do
-        if [[ ! -r ${file} ]]; then
-            err "${file} does not exist or is not readable"
-        fi
-    done
-}
-
-
-# Write a file (with name specified in $2) with records that account for
-# n% (specified in $3) of the total weights of records in the input file
-# (input file name specified in $1). The input file should have one record
-# per line along with its weight separated by \t. The records should be
-# sorted in non-ascending order of frequency.
-# If $4 is true the first record is skipped.
-# USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE
-discard_tail() {
-    local infile=$1
-    local outfile=$2
-    local pct=$3
-    local skip_first=$4
-
-    local more_arg="1";
-    if [[ ${skip_first} ]]; then
-        more_arg="2"
-    fi
-    local sum=$(tail -n +${more_arg} ${infile} \
-        | awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}')
-    if [[ ${sum} == "" ]]; then sum=0
-    fi
-    local limit=$((${sum}*${pct}/100))
-    tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"}
-        {if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \
-        >> ${outfile}
-}
-
-
-# Set global path variables that are based on parsed flags.
-set_prog_paths() {
-    if [[ -z ${BINDIR} ]]; then
-        err "Need to specify location of program files"
-    fi
-    CN_TRAINING_EXE=${BINDIR}/cntraining
-    COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
-    MF_TRAINING_EXE=${BINDIR}/mftraining
-    SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
-    SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
-    TESSERACT_EXE=${BINDIR}/tesseract
-    TEXT2IMAGE_EXE=${BINDIR}/text2image
-    UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
-    WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
-}
-
-# Sets the named variable to given value. Aborts if the value is missing or
-# if it looks like a flag.
-# Usage: parse_value VAR_NAME VALUE
-parse_value() {
-    local val="$2"
-    if [[ -z $val ]]; then
-        err "Missing value for variable $1"
-        exit
-    fi
-    if [[ ${val:0:2} == "--" ]]; then
-        err "Invalid value $val passed for variable $1"
-        exit
-    fi
-    eval $1=\"$val\"
-}
-
-# Does simple command-line parsing and initialization.
-parse_flags() {
-    local i=0
-    while test $i -lt ${#ARGV[@]}; do
-        local j=$((i+1))
-        case ${ARGV[$i]} in
-            --)
-                break;;
-            --bin_dir)
-                parse_value "BINDIR" ${ARGV[$j]}
-                i=$j ;;
-            --fontlist)   # Expect a plus-separated list of names
-                if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
-                    err "Invalid value passed to --fontlist"
-                fi
-                local ofs=$IFS
-                IFS='+'
-                FONTS=( ${ARGV[$j]} )
-                IFS=$ofs
-                i=$j ;;
-            --fonts_dir)
-                parse_value "FONTS_DIR" ${ARGV[$j]}
-                i=$j ;;
-            --lang)
-                parse_value "LANG_CODE" ${ARGV[$j]}
-                i=$j ;;
-            --langdata_dir)
-                parse_value "LANGDATA_ROOT" ${ARGV[$j]}
-                i=$j ;;
-            --output_dir)
-                parse_value "OUTPUT_DIR" ${ARGV[$j]}
-                i=$j ;;
-            --overwrite)
-                OVERWRITE=1 ;;
-            --extract_font_properties)
-                EXTRACT_FONT_PROPERTIES=1 ;;
-            --noextract_font_properties)
-                EXTRACT_FONT_PROPERTIES=0 ;;
-            --run_shape_clustering)
-                RUN_SHAPE_CLUSTERING=1 ;;
-            --tessdata_dir)
-                parse_value "TESSDATA_DIR" ${ARGV[$j]}
-                i=$j ;;
-            --training_text)
-                parse_value "TRAINING_TEXT" "${ARGV[$j]}"
-                i=$j ;;
-            --wordlist)
-                parse_value "WORDLIST_FILE" ${ARGV[$j]}
-                i=$j ;;
-            *)
-                err "Unrecognized argument ${ARGV[$i]}" ;;
-        esac
-        i=$((i+1))
-    done
-    if [[ -z ${LANG_CODE} ]]; then
-        err "Need to specify a language --lang"
-    fi
-    if [[ -z ${BINDIR} ]]; then
-        err "Need to specify path to built binaries --bin_dir"
-    fi
-    if [[ -z ${LANGDATA_ROOT} ]]; then
-        err "Need to specify path to language files --langdata_dir"
-    fi
-    if [[ -z ${TESSDATA_DIR} ]]; then
-        if [[ -z ${TESSDATA_PREFIX} ]]; then
-            err "Need to specify a --tessdata_dir or have a "\
-                "TESSDATA_PREFIX variable defined in your environment"
-        else
-            TESSDATA_DIR="${TESSDATA_PREFIX}"
-        fi
-    fi
-
-    set_prog_paths
-
-    # Location where intermediate files will be created.
-    TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
-    # Location of log file for the whole run.
-    LOG_FILE=${TRAINING_DIR}/tesstrain.log
-
-    # Take training text and wordlist from the langdata directory if not
-    # specified in the commend-line.
-    if [[ -z ${TRAINING_TEXT} ]]; then
-        TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text
-    fi
-    if [[ -z ${WORDLIST_FILE} ]]; then
-        WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean
-    fi
-    WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean
-    NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
-    PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
-    BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
-    UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
-    TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
-}
-
-# Phase I : Generate (I)mages from training text for each font.
-phaseI_generate_image() {
-    tlog "\n=== Phase I: Generating training images ==="
-    if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then
-        err "Could not find training text file ${TRAINING_TEXT}"
-    fi
-    BOX_PADDING="0"
-    CHAR_SPACING="0.0"
-    EXPOSURE="0"
-    LEADING="32"
-    NGRAM_CHAR_SPACING="0.0"
-
-    if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS} ]]; then
-        # Parse .bigram_freqs file and compose a .train_ngrams file with text
-        # for tesseract to recognize during training. Take only the ngrams whose
-        # combined weight accounts for 95% of all the bigrams in the language.
-        TMP_FILE="${TRAINING_DIR}/_tmp"
-        cat ${BIGRAM_FREQS_FILE} > ${TMP_FILE}
-        NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
-            | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
-        cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
-            | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
-            x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
-        check_file_readable ${TRAIN_NGRAMS_FILE}
-    fi
-
-    for font in "${FONTS[@]}"; do
-        tlog "Rendering using ${font}"
-        fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
-        outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
-
-        common_args="--leading=${LEADING} --fonts_dir=${FONTS_DIR} "
-        common_args+=" --box_padding=${BOX_PADDING} --strip_unrenderable_words"
-
-        run_cmd ${TEXT2IMAGE_EXE} ${common_args} \
-            --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE} \
-            --font="${font}" --outputbase=${outbase} --text=${TRAINING_TEXT}
-        check_file_readable ${outbase}.box ${outbase}.tif
-
-        if (( ${EXTRACT_FONT_PROPERTIES} )) &&
-            [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
-            tlog "Rendering ngrams using ${font}"
-            outbase=${TRAINING_DIR}/ngrams/${LANG_CODE}.ngrams.${fontname}.exp${EXPOSURE}
-            run_cmd ${TEXT2IMAGE_EXE} ${common_args} \
-                --char_spacing=${NGRAM_CHAR_SPACING} --exposure=${EXPOSURE} \
-                --font="${font}" --outputbase=${outbase} \
-                --box_padding=${BOX_PADDING} --render_ngrams=1 \
-                --text=${TRAIN_NGRAMS_FILE}
-            check_file_readable ${outbase}.box ${outbase}.tif
-        fi
-    done
-}
-
-
-# Phase UP : Generate (U)nicharset and (P)roperties file.
-phaseUP_generate_unicharset() {
-    tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
-
-    box_files=$(ls ${TRAINING_DIR}/*.box)
-    run_cmd ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
-    outfile=${TRAINING_DIR}/unicharset
-    UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
-    check_file_readable ${outfile}
-    mv ${outfile} ${UNICHARSET_FILE}
-
-    XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
-    check_file_readable ${UNICHARSET_FILE}
-    run_cmd ${SET_UNICHARSET_PROPERTIES_EXE} \
-        -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
-        --script_dir=${LANGDATA_ROOT}
-    check_file_readable ${XHEIGHTS_FILE}
-}
-
-# Phase D : Generate (D)awg files from unicharset file and wordlist files
-phaseD_generate_dawg() {
-    tlog "\n=== Phase D: Generating Dawg files ==="
-    # Output files
-    WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
-    FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
-    PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
-    NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
-    BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
-
-    # Word DAWG
-    local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
-    if [[ -r ${WORDLIST_FILE} ]]; then
-        tlog "Generating word Dawg"
-        check_file_readable ${UNICHARSET_FILE}
-        run_cmd ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
-            ${UNICHARSET_FILE}
-        check_file_readable ${WORD_DAWG}
-
-        FREQ_DAWG_SIZE=100
-        head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
-    fi
-
-    # Freq-word DAWG
-    if [[ -r ${freq_wordlist_file} ]]; then
-        check_file_readable ${UNICHARSET_FILE}
-        tlog "Generating frequent-word Dawg"
-        run_cmd ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \
-            ${UNICHARSET_FILE}
-        check_file_readable ${FREQ_DAWG}
-    fi
-
-    # Punctuation DAWG
-    local punc_clean="${LANGDATA_ROOT}/common.punc"
-    if [[ -r ${PUNC_FILE} ]]; then
-        local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top
-        head -n 1 ${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
-            > ${top_punc_file}
-        discard_tail ${PUNC_FILE} ${top_punc_file} 99 1
-        punc_clean="${top_punc_file}"
-    fi
-    # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
-    # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
-    # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
-    # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
-    # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
-    local punc_reverse_policy=0;
-    if [[ ${LANG_CODE} == "heb" || ${LANG_CODE} == "ara" ]]; then
-        punc_reverse_policy=2
-    fi
-    if [[ -r ${punc_clean} ]]; then
-        run_cmd ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
-            ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${PUNC_DAWG}
-    fi
-
-    # Numbers DAWG
-    if [[ -r ${NUMBERS_FILE} ]]; then
-        local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top
-        head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
-            > ${top_num_file}
-        discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1
-        run_cmd ${WORDLIST2DAWG_EXE} -r 0 \
-            ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${NUMBER_DAWG}
-    fi
-
-    # Bigram dawg
-    if [[ -r ${WORD_BIGRAMS_FILE} ]]; then
-        run_cmd ${WORDLIST2DAWG_EXE} -r 1 \
-            ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${BIGRAM_DAWG}
-    fi
-}
-
-# Phase E : (E)xtract .tr feature files from .tif/.box files
-phaseE_extract_features() {
-    tlog "\n=== Phase E: Extracting features ==="
-    local box_config="box.train"
-    TRAIN_EXPOSURES='0'
-
-    for exposure in ${TRAIN_EXPOSURES}; do
-        img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
-    done
-
-    # Use any available language-specific configs.
-    local config=""
-    if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
-        config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
-    fi
-
-    OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
-    export TESSDATA_PREFIX=${TESSDATA_DIR}
-    tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
-    for img_file in ${img_files}; do
-        run_cmd ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
-            ${box_config} ${config}
-    done
-    export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
-}
-
-# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
-# phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
-phaseC_cluster_prototypes() {
-    tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
-    local out_normproto=${TRAINING_DIR}/${LANG_CODE}.normproto
-
-    run_cmd ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
-        $(ls ${TRAINING_DIR}/*.tr)
-
-    check_file_readable ${TRAINING_DIR}/normproto
-    mv ${TRAINING_DIR}/normproto ${out_normproto}
-}
-
-# Phase S : (S)hape clustering
-phaseS_cluster_shapes() {
-    if (( ! ${RUN_SHAPE_CLUSTERING} )); then
-        return
-    fi
-    check_file_readable ${LANGDATA_ROOT}/font_properties
-    local font_props=${LANGDATA_ROOT}/font_properties
-    if [[ -r ${font_props} ]]; then
-        font_props="-F ${font_props}"
-    else
-        font_props=""
-    fi
-    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
-        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
-        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
-    fi
-
-    run_cmd ${SHAPE_TRAINING_EXE} \
-        -D "${TRAINING_DIR}/" \
-        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
-        ${font_props} \
-        $(ls ${TRAINING_DIR}/*.tr)
-    check_file_readable ${TRAINING_DIR}/shapetable \
-        ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
-}
-
-# Phase M : Clustering microfeatures (mfTraining)
-phaseM_cluster_microfeatures() {
-    tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
-
-    font_props=${LANGDATA_ROOT}/font_properties
-    if [[ -r ${font_props} ]]; then
-        font_props="-F ${font_props}"
-    else
-        font_props=""
-    fi
-    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
-        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
-        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
-    fi
-
-    run_cmd ${MF_TRAINING_EXE} \
-        -D "${TRAINING_DIR}/" \
-        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
-        ${font_props} \
-        $(ls ${TRAINING_DIR}/*.tr)
-    check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
-        ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
-    mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
-    mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
-    mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
-    mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
-}
-
-phaseB_generate_ambiguities() {
-    tlog "\n=== Phase B : ambiguities training ==="
-
-    # Check for manually created ambiguities data.
-    if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
-        tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
-        cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
-            ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
-        # Make it writable, as it may be read-only in the client.
-        chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
-        return
-    else
-        tlog "No unicharambigs file found!"
-    fi
-
-    # TODO: Add support for generating ambiguities automatically.
-}
-
-
-make_traineddata() {
-    tlog "\n=== Making final traineddata file ==="
-    local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
-
-    # Combine available files for this language from the langdata dir.
-    if [[ -r ${lang_prefix}.config ]]; then
-        tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
-        cp ${lang_prefix}.config ${TRAINING_DIR}
-        chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
-    fi
-    if [[ -r ${lang_prefix}.cube-unicharset ]]; then
-        tlog "Copying ${lang_prefix}.cube-unicharset to ${TRAINING_DIR}"
-        cp ${lang_prefix}.cube-unicharset ${TRAINING_DIR}
-        chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-unicharset
-    fi
-    if [[ -r ${lang_prefix}.cube-word-dawg ]]; then
-        tlog "Copying ${lang_prefix}.cube-word-dawg to ${TRAINING_DIR}"
-        cp ${lang_prefix}.cube-word-dawg ${TRAINING_DIR}
-        chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-word-dawg
-    fi
-    if [[ -r ${lang_prefix}.params-model ]]; then
-        tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
-        cp ${lang_prefix}.params-model ${TRAINING_DIR}
-        chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
-    fi
-
-    # Compose the traineddata file.
-    run_cmd ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
-
-    # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
-    if [[ ! -d ${OUTPUT_DIR} ]]; then
-        tlog "Creating new directory ${OUTPUT_DIR}"
-        mkdir -p ${OUTPUT_DIR}
-    fi
-    local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
-    if [[ -f ${destfile} ]] && (( ! ${OVERWRITE} )); then
-        err "File ${destfile} exists and no --overwrite specified";
-    fi
-    tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
-    cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
-}
+source `dirname $0`/tesstrain_utils.sh
 
 
 ARGV=("$@")
 parse_flags
@@ -564,14 +55,21 @@ tlog "Cleaning workspace directory ${TRAINING_DIR}..."
 mkdir -p ${TRAINING_DIR}
 rm -fr ${TRAINING_DIR}/*
 
-phaseI_generate_image
-phaseUP_generate_unicharset
-phaseD_generate_dawg
-phaseE_extract_features
-phaseC_cluster_prototypes
-phaseS_cluster_shapes
-phaseM_cluster_microfeatures
-phaseB_generate_ambiguities
-make_traineddata
+source `dirname $0`/language-specific.sh
+set_lang_specific_parameters ${LANG_CODE}
+
+initialize_fontconfig
+
+phase_I_generate_image 8
+phase_UP_generate_unicharset
+phase_D_generate_dawg
+phase_E_extract_features "box.train" 8
+phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"
+if [[ "${ENABLE_SHAPE_CLUSTERING}" == "y" ]]; then
+  phase_S_cluster_shapes
+fi
+phase_M_cluster_microfeatures
+phase_B_generate_ambiguities
+make__traineddata
 
 tlog "\nCompleted training for language '${LANG_CODE}'\n"
578  training/tesstrain_utils.sh  (new executable file)
@ -0,0 +1,578 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# (C) Copyright 2014, Google Inc.
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
# This script defines functions that are used by tesstrain.sh
|
||||||
|
# For a detailed description of the phases, see
|
||||||
|
# https://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3
|
||||||
|
#
|
||||||
|
# USAGE: source tesstrain_utils.sh
|
||||||
|
|
||||||
|
FONTS=(
|
||||||
|
"Arial" \
|
||||||
|
"Times New Roman," \
|
||||||
|
)
|
||||||
|
FONTS_DIR="/usr/share/fonts/truetype/"
|
||||||
|
OUTPUT_DIR="/tmp/tesstrain/tessdata"
|
||||||
|
OVERWRITE=0
|
||||||
|
RUN_SHAPE_CLUSTERING=0
|
||||||
|
EXTRACT_FONT_PROPERTIES=1
|
||||||
|
WORKSPACE_DIR="/tmp/tesstrain"
|
||||||
|
|
||||||
|
# Logging helper functions.
|
||||||
|
tlog() {
|
||||||
|
echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE}
|
||||||
|
}
|
||||||
|
|
||||||
|
err_exit() {
|
||||||
|
echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE}
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# Helper function to run a command and append its output to a log. Aborts early
|
||||||
|
# if the program file is not found.
|
||||||
|
# Usage: run_command CMD ARG1 ARG2...
|
||||||
|
run_command() {
|
||||||
|
local cmd=$1
|
||||||
|
shift
|
||||||
|
if [[ ! -x ${cmd} ]]; then
|
||||||
|
err_exit "File ${cmd} not found"
|
||||||
|
fi
|
||||||
|
tlog "[$(date)] ${cmd} $@"
|
||||||
|
${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
|
||||||
|
# check completion status
|
||||||
|
if [[ $? -gt 0 ]]; then
|
||||||
|
err_exit "Program $(basename ${cmd}) failed. Abort."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Check if all the given files exist, or exit otherwise.
|
||||||
|
# Used to check required input files and produced output files in each phase.
|
||||||
|
# Usage: check_file_readable FILE1 FILE2...
|
||||||
|
check_file_readable() {
|
||||||
|
for file in $@; do
|
||||||
|
if [[ ! -r ${file} ]]; then
|
||||||
|
err_exit "${file} does not exist or is not readable"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# Write a file (with name specified in $2) with records that account for
|
||||||
|
# n% (specified in $3) of the total weights of records in the input file
|
||||||
|
# (input file name specified in $1). The input file should have one record
|
||||||
|
# per line along with its weight separated by \t. The records should be
|
||||||
|
# sorted in non-ascending order of frequency.
|
||||||
|
# If $4 is true the first record is skipped.
|
||||||
|
# USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE
|
||||||
|
discard_tail() {
|
||||||
|
local infile=$1
|
||||||
|
local outfile=$2
|
||||||
|
local pct=$3
|
||||||
|
local skip_first=$4
|
||||||
|
|
||||||
|
local more_arg="1";
|
||||||
|
if [[ ${skip_first} ]]; then
|
||||||
|
more_arg="2"
|
||||||
|
fi
|
||||||
|
local sum=$(tail -n +${more_arg} ${infile} \
|
||||||
|
| awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}')
|
||||||
|
if [[ ${sum} == "" ]]; then sum=0
|
||||||
|
fi
|
||||||
|
local limit=$((${sum}*${pct}/100))
|
||||||
|
tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"}
|
||||||
|
{if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \
|
||||||
|
>> ${outfile}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Set global path variables that are based on parsed flags.
|
||||||
|
set_prog_paths() {
|
||||||
|
if [[ -z ${BINDIR} ]]; then
|
||||||
|
err_exit "Need to specify location of program files"
|
||||||
|
fi
|
||||||
|
CN_TRAINING_EXE=${BINDIR}/cntraining
|
||||||
|
COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
|
||||||
|
MF_TRAINING_EXE=${BINDIR}/mftraining
|
||||||
|
SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
|
||||||
|
SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
|
||||||
|
TESSERACT_EXE=${BINDIR}/tesseract
|
||||||
|
TEXT2IMAGE_EXE=${BINDIR}/text2image
|
||||||
|
UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
|
||||||
|
WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
|
||||||
|
}
|
||||||
|
|
||||||
|
# Sets the named variable to the given value. Aborts if the value is missing
# or if it looks like a flag.
# Usage: parse_value VAR_NAME VALUE
parse_value() {
  local val="$2"
  if [[ -z $val ]]; then
    err_exit "Missing value for variable $1"
    exit
  fi
  if [[ ${val:0:2} == "--" ]]; then
    err_exit "Invalid value $val passed for variable $1"
    exit
  fi
  eval $1=\"$val\"
}

# Does simple command-line parsing and initialization.
parse_flags() {
  local i=0
  while test $i -lt ${#ARGV[@]}; do
    local j=$((i+1))
    case ${ARGV[$i]} in
      --)
        break;;
      --bin_dir)
        parse_value "BINDIR" ${ARGV[$j]}
        i=$j ;;
      --fontlist)   # Expect a plus-separated list of names
        if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
          err_exit "Invalid value passed to --fontlist"
        fi
        local ofs=$IFS
        IFS='+'
        FONTS=( ${ARGV[$j]} )
        IFS=$ofs
        i=$j ;;
      --fonts_dir)
        parse_value "FONTS_DIR" ${ARGV[$j]}
        i=$j ;;
      --lang)
        parse_value "LANG_CODE" ${ARGV[$j]}
        i=$j ;;
      --langdata_dir)
        parse_value "LANGDATA_ROOT" ${ARGV[$j]}
        i=$j ;;
      --output_dir)
        parse_value "OUTPUT_DIR" ${ARGV[$j]}
        i=$j ;;
      --overwrite)
        OVERWRITE=1 ;;
      --extract_font_properties)
        EXTRACT_FONT_PROPERTIES=1 ;;
      --noextract_font_properties)
        EXTRACT_FONT_PROPERTIES=0 ;;
      --tessdata_dir)
        parse_value "TESSDATA_DIR" ${ARGV[$j]}
        i=$j ;;
      --training_text)
        parse_value "TRAINING_TEXT" "${ARGV[$j]}"
        i=$j ;;
      --wordlist)
        parse_value "WORDLIST_FILE" ${ARGV[$j]}
        i=$j ;;
      *)
        err_exit "Unrecognized argument ${ARGV[$i]}" ;;
    esac
    i=$((i+1))
  done
  if [[ -z ${LANG_CODE} ]]; then
    err_exit "Need to specify a language --lang"
  fi
  if [[ -z ${BINDIR} ]]; then
    err_exit "Need to specify path to built binaries --bin_dir"
  fi
  if [[ -z ${LANGDATA_ROOT} ]]; then
    err_exit "Need to specify path to language files --langdata_dir"
  fi
  if [[ -z ${TESSDATA_DIR} ]]; then
    if [[ -z ${TESSDATA_PREFIX} ]]; then
      err_exit "Need to specify a --tessdata_dir or have a "\
        "TESSDATA_PREFIX variable defined in your environment"
    else
      TESSDATA_DIR="${TESSDATA_PREFIX}"
    fi
  fi

  set_prog_paths

  # Location where intermediate files will be created.
  TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
  # Location of the log file for the whole run.
  LOG_FILE=${TRAINING_DIR}/tesstrain.log

  # Take the training text and wordlist from the langdata directory if not
  # specified on the command line.
  if [[ -z ${TRAINING_TEXT} ]]; then
    TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text
  fi
  if [[ -z ${WORDLIST_FILE} ]]; then
    WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean
  fi
  WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean
  NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
  PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
  BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
  UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
  TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
  GENERATE_DAWGS=1
}
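
# Illustrative invocation (editor's sketch; the driver script name and all
# paths are assumed example values, not taken from this commit):
#   tesstrain.sh --lang heb --bin_dir /usr/local/bin \
#     --langdata_dir ~/langdata --tessdata_dir /usr/local/share/tessdata \
#     --fontlist "Arial+FreeSerif" --output_dir /tmp/out --overwrite
# parse_flags then splits the plus-separated fontlist into
# FONTS=("Arial" "FreeSerif"), and intermediate files land under
# ${WORKSPACE_DIR}/heb with the log at ${WORKSPACE_DIR}/heb/tesstrain.log.
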
# Function initializes fontconfig with a unique font cache dir.
initialize_fontconfig() {
  export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
  local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
  echo "Text" > ${sample_path}
  run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \
    --font="Arial" --outputbase=${sample_path} --text=${sample_path} \
    --fontconfig_tmpdir=${FONT_CONFIG_CACHE}
}
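
# Editor's note: the one-off render of "Text" in Arial above is a cache
# warm-up. It makes fontconfig scan ${FONTS_DIR} once and write its cache
# under ${FONT_CONFIG_CACHE}, so the parallel per-font renders below can pass
# --fontconfig_refresh_config_file=false and reuse the prepared cache instead
# of rescanning the fonts directory in every process.
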
# Helper function for phase_I_generate_image. Generates the image for a single
# language/font combination in a way that can be run in parallel.
generate_font_image() {
  local font="$1"
  tlog "Rendering using ${font}"
  local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
  local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}

  local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}"
  common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
  common_args+=" --fontconfig_refresh_config_file=false --leading=${LEADING}"
  common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
  common_args+=" --outputbase=${outbase}"

  # Add --writing_mode=vertical-upright to common_args if the font is
  # specified to be rendered vertically.
  for vfont in "${VERTICAL_FONTS[@]}"; do
    if [[ "${font}" == "${vfont}" ]]; then
      common_args+=" --writing_mode=vertical-upright "
      break
    fi
  done

  run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
    --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
  check_file_readable ${outbase}.box ${outbase}.tif

  if (( ${EXTRACT_FONT_PROPERTIES} )) &&
      [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
    tlog "Extracting font properties of ${font}"
    run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
      --ligatures=false --text=${TRAIN_NGRAMS_FILE} \
      --only_extract_font_properties --ptsize=32
    check_file_readable ${outbase}.fontinfo
  fi
}
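
# Illustrative expansion (editor's sketch; LANG_CODE, LEADING and the font
# name are assumed example values): for font="Times New Roman" and
# LANG_CODE=eng, fontname becomes "Times_New_Roman" and the render call
# expands to roughly
#   text2image --fontconfig_tmpdir=${FONT_CONFIG_CACHE} \
#     --fonts_dir=${FONTS_DIR} --strip_unrenderable_words \
#     --fontconfig_refresh_config_file=false --leading=${LEADING} \
#     --char_spacing=0.0 --exposure=0 \
#     --outputbase=${TRAINING_DIR}/eng.Times_New_Roman.exp0 \
#     --font="Times New Roman" --text=${TRAINING_TEXT}
# producing eng.Times_New_Roman.exp0.tif and eng.Times_New_Roman.exp0.box.
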
# Phase I : Generate (I)mages from training text for each font.
phase_I_generate_image() {
  local par_factor=$1
  if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
    par_factor=1
  fi
  tlog "\n=== Phase I: Generating training images ==="
  if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then
    err_exit "Could not find training text file ${TRAINING_TEXT}"
  fi
  CHAR_SPACING="0.0"
  EXPOSURE="0"

  if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
    # Parse the .bigram_freqs file and compose a .train_ngrams file with text
    # for tesseract to recognize during training. Take only the ngrams whose
    # combined weight accounts for 99% of all the bigrams in the language.
    NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
      | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
    cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
      | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
      x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
    check_file_readable ${TRAIN_NGRAMS_FILE}
  fi

  local counter=0
  for font in "${FONTS[@]}"; do
    generate_font_image "${font}" &
    let counter=counter+1
    let rem=counter%par_factor
    if [[ "${rem}" -eq 0 ]]; then
      wait
    fi
  done
  wait
  # Check that each process was successful.
  for font in "${FONTS[@]}"; do
    local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
    local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
    check_file_readable ${outbase}.box ${outbase}.tif
  done
}
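
# Worked example for the ngram selection above (editor's illustration):
# if the weights in the .bigram_freqs file sum to 2000, then
# NGRAM_FRAC = (2000/100)*99 = 1980, and the sort|awk pipeline emits the
# most frequent bigrams until their cumulative weight would exceed 1980,
# discarding roughly the rarest 1% of the total bigram mass.
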
# Phase UP : Generate (U)nicharset and (P)roperties file.
phase_UP_generate_unicharset() {
  tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="

  local box_files=$(ls ${TRAINING_DIR}/*.box)
  run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
  local outfile=${TRAINING_DIR}/unicharset
  UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
  check_file_readable ${outfile}
  mv ${outfile} ${UNICHARSET_FILE}

  XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
  check_file_readable ${UNICHARSET_FILE}
  run_command ${SET_UNICHARSET_PROPERTIES_EXE} \
    -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
    --script_dir=${LANGDATA_ROOT}
  check_file_readable ${XHEIGHTS_FILE}
}

# Phase D : Generate (D)awg files from unicharset file and wordlist files
phase_D_generate_dawg() {
  tlog "\n=== Phase D: Generating Dawg files ==="

  # Skip if requested
  if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
    tlog "Skipping Dawg generation"
    return
  fi

  # Output files
  WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
  FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
  PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
  NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
  BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg

  # Word DAWG
  local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
  if [[ -r ${WORDLIST_FILE} ]]; then
    tlog "Generating word Dawg"
    check_file_readable ${UNICHARSET_FILE}
    run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
      ${UNICHARSET_FILE}
    check_file_readable ${WORD_DAWG}

    FREQ_DAWG_SIZE=100
    head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
  fi

  # Freq-word DAWG
  if [[ -r ${freq_wordlist_file} ]]; then
    check_file_readable ${UNICHARSET_FILE}
    tlog "Generating frequent-word Dawg"
    run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \
      ${UNICHARSET_FILE}
    check_file_readable ${FREQ_DAWG}
  fi

  # Punctuation DAWG
  local punc_clean="${LANGDATA_ROOT}/common.punc"
  if [[ -r ${PUNC_FILE} ]]; then
    local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top
    head -n 1 ${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
      > ${top_punc_file}
    discard_tail ${PUNC_FILE} ${top_punc_file} 99 1
    punc_clean="${top_punc_file}"
  fi
  # -r arguments to WORDLIST2DAWG_EXE denote the RTL reverse policy
  # (see the Trie::RTLReversePolicy enum in dict/trie.h).
  # We specify 0/RRP_DO_NO_REVERSE when generating the number DAWG,
  # 1/RRP_REVERSE_IF_HAS_RTL for the freq and word DAWGs, and
  # 2/RRP_FORCE_REVERSE for the punctuation DAWG of RTL languages
  # (0 for all others).
  local punc_reverse_policy=0
  case ${LANG_CODE} in
    ara | div | fas | pus | snd | syr | uig | urd | heb | yid )
      punc_reverse_policy=2 ;;
    * ) ;;
  esac
  if [[ -r ${punc_clean} ]]; then
    run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
      ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE}
    check_file_readable ${PUNC_DAWG}
  fi

  # Numbers DAWG
  if [[ -r ${NUMBERS_FILE} ]]; then
    local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top
    head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
      > ${top_num_file}
    discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1
    run_command ${WORDLIST2DAWG_EXE} -r 0 \
      ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE}
    check_file_readable ${NUMBER_DAWG}
  fi

  # Bigram dawg
  if [[ -r ${WORD_BIGRAMS_FILE} ]]; then
    run_command ${WORDLIST2DAWG_EXE} -r 1 \
      ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
    check_file_readable ${BIGRAM_DAWG}
  fi
}
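
# Editor's note on the calls above: for an RTL language such as heb, the
# punctuation patterns are compiled with "wordlist2dawg -r 2" (force
# reverse), while words and frequent words always use "-r 1" (reverse only
# if the entry contains RTL characters) and numbers use "-r 0" (never
# reverse).
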
# Phase E : (E)xtract .tr feature files from .tif/.box files
phase_E_extract_features() {
  local box_config=$1
  local par_factor=$2
  if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
    par_factor=1
  fi
  tlog "\n=== Phase E: Extracting features ==="
  TRAIN_EXPOSURES='0'

  local img_files=""
  for exposure in ${TRAIN_EXPOSURES}; do
    img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
  done

  # Use any available language-specific configs.
  local config=""
  if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
    config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
  fi

  OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
  export TESSDATA_PREFIX=${TESSDATA_DIR}
  tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
  local counter=0
  for img_file in ${img_files}; do
    run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
      ${box_config} ${config} &
    let counter=counter+1
    let rem=counter%par_factor
    if [[ "${rem}" -eq 0 ]]; then
      wait
    fi
  done
  wait
  export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
  # Check that all the output files were produced.
  for img_file in ${img_files}; do
    check_file_readable ${img_file%.*}.tr
  done
}
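
# Illustrative expansion (editor's sketch; box_config is supplied by the
# caller, and the file names are assumed example values):
#   tesseract ${TRAINING_DIR}/eng.Times_New_Roman.exp0.tif \
#     ${TRAINING_DIR}/eng.Times_New_Roman.exp0 ${box_config} ${config}
# which writes the feature file eng.Times_New_Roman.exp0.tr next to the
# image, reading its traineddata via the exported TESSDATA_PREFIX.
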
# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
# phase_C_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
phase_C_cluster_prototypes() {
  tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
  local out_normproto=$1

  run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
    $(ls ${TRAINING_DIR}/*.tr)

  check_file_readable ${TRAINING_DIR}/normproto
  mv ${TRAINING_DIR}/normproto ${out_normproto}
}

# Phase S : (S)hape clustering
phase_S_cluster_shapes() {
  if (( ! ${RUN_SHAPE_CLUSTERING} )); then
    tlog "\n=== Shape Clustering disabled ==="
    return
  fi
  check_file_readable ${LANGDATA_ROOT}/font_properties
  local font_props="-F ${LANGDATA_ROOT}/font_properties"
  if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
     [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
    font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
  fi

  run_command ${SHAPE_TRAINING_EXE} \
    -D "${TRAINING_DIR}/" \
    -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
    -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
    ${font_props} \
    $(ls ${TRAINING_DIR}/*.tr)
  check_file_readable ${TRAINING_DIR}/shapetable \
    ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
}

# Phase M : Clustering microfeatures (mfTraining)
phase_M_cluster_microfeatures() {
  tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="

  check_file_readable ${LANGDATA_ROOT}/font_properties
  font_props="-F ${LANGDATA_ROOT}/font_properties"
  if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
     [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
    font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
  fi

  run_command ${MF_TRAINING_EXE} \
    -D "${TRAINING_DIR}/" \
    -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
    -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
    ${font_props} \
    $(ls ${TRAINING_DIR}/*.tr)
  check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
    ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
  mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
  mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
  mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
  mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
}

phase_B_generate_ambiguities() {
  tlog "\n=== Phase B : ambiguities training ==="

  # Check for manually created ambiguities data.
  if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
    tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
    cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
      ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
    # Make it writable, as it may be read-only in the client.
    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
    return
  else
    tlog "No unicharambigs file found!"
  fi

  # TODO: Add support for generating ambiguities automatically.
}

make__traineddata() {
  tlog "\n=== Making final traineddata file ==="
  local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}

  # Combine available files for this language from the langdata dir.
  if [[ -r ${lang_prefix}.config ]]; then
    tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
    cp ${lang_prefix}.config ${TRAINING_DIR}
    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
  fi
  if [[ -r ${lang_prefix}.cube-unicharset ]]; then
    tlog "Copying ${lang_prefix}.cube-unicharset to ${TRAINING_DIR}"
    cp ${lang_prefix}.cube-unicharset ${TRAINING_DIR}
    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-unicharset
  fi
  if [[ -r ${lang_prefix}.cube-word-dawg ]]; then
    tlog "Copying ${lang_prefix}.cube-word-dawg to ${TRAINING_DIR}"
    cp ${lang_prefix}.cube-word-dawg ${TRAINING_DIR}
    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-word-dawg
  fi
  if [[ -r ${lang_prefix}.params-model ]]; then
    tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
    cp ${lang_prefix}.params-model ${TRAINING_DIR}
    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
  fi

  # Compose the traineddata file.
  run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.

  # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
  if [[ ! -d ${OUTPUT_DIR} ]]; then
    tlog "Creating new directory ${OUTPUT_DIR}"
    mkdir -p ${OUTPUT_DIR}
  fi
  local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata
  if [[ -f ${destfile} ]] && (( ! ${OVERWRITE} )); then
    err_exit "File ${destfile} exists and no --overwrite specified"
  fi
  tlog "Copying ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
  cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
}
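
# Editor's note: combine_tessdata receives the prefix
# "${TRAINING_DIR}/${LANG_CODE}." (note the trailing dot), so it packs every
# ${LANG_CODE}.* component present in ${TRAINING_DIR} (unicharset, inttemp,
# shapetable, pffmtable, normproto, the *-dawg files, and any copied
# config/cube/params-model files) into a single ${LANG_CODE}.traineddata.
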
@@ -115,7 +115,7 @@ STRING_PARAM_FLAG(writing_mode, "horizontal",

 INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes");

-BOOL_PARAM_FLAG(strip_unrenderable_words, false,
+BOOL_PARAM_FLAG(strip_unrenderable_words, true,
                 "Remove unrenderable words from source text");

 // Font name.
@@ -618,9 +618,9 @@ int main(int argc, char** argv) {
       }
       pixDestroy(&binary);
     }
-    if (FLAGS_find_fonts && !FLAGS_render_per_font && !font_names.empty()) {
-      // We just want a list of names, so we don't need to render any more
-      // of the text.
+    if (FLAGS_find_fonts && offset != 0) {
+      // We just want a list of names, or some sample images so we don't need
+      // to render more than the first page of the text.
       break;
     }
   }
@@ -630,8 +630,7 @@ int main(int argc, char** argv) {
     box_name += ".box";
     render.WriteAllBoxes(box_name);
   } else if (!FLAGS_render_per_font && !font_names.empty()) {
-    string filename = FLAGS_outputbase.c_str();
-    filename += ".fontlist.txt";
+    string filename = FLAGS_outputbase + ".fontlist.txt";
     FILE* fp = fopen(filename.c_str(), "wb");
     if (fp == NULL) {
       tprintf("Failed to create output font list %s\n", filename.c_str());
193
training/unicharset_training_utils.cpp
Normal file
@@ -0,0 +1,193 @@
///////////////////////////////////////////////////////////////////////
// File:        unicharset_training_utils.cpp
// Description: Training utilities for UNICHARSET.
// Author:      Ray Smith
// Created:     Fri Oct 17 17:09:01 PDT 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "unicharset_training_utils.h"

#include <stdlib.h>
#include <string.h>
#include <string>

#include "fileio.h"
#include "genericvector.h"
#include "icuerrorcode.h"
#include "normstrngs.h"
#include "statistc.h"
#include "strngs.h"
#include "unicharset.h"
#include "unicode/uchar.h"    // from libicu
#include "unicode/uscript.h"  // from libicu

namespace tesseract {

// Helper sets the character attribute properties and sets up the script table.
// Does not set tops and bottoms.
void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }

    // Convert the unichar to UTF32 representation.
    GenericVector<char32> uni_vector;
    tesseract::UTF8ToUTF32(unichar_str, &uni_vector);

    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;

    for (int i = 0; i < uni_vector.size(); ++i) {
      if (u_isalpha(uni_vector[i]))
        unichar_isalpha = true;
      if (u_islower(uni_vector[i]))
        unichar_islower = true;
      if (u_isupper(uni_vector[i]))
        unichar_isupper = true;
      if (u_isdigit(uni_vector[i]))
        unichar_isdigit = true;
      if (u_ispunct(uni_vector[i]))
        unichar_ispunct = true;
    }

    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);

    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));

    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      GenericVector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/u_strToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      STRING other_case_uch;
      tesseract::UTF32ToUTF8(other_case, &other_case_uch);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }

    // Set RTL property and obtain mirror unichar ID from ICU.
    GenericVector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    STRING mirror_uch;
    tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else if (report_errors) {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }

    // Record normalized version of this unichar.
    STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
    if (unichar_id != 0 && normed_str.length() > 0) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
  }
  unicharset->post_load_setup();
}
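
// Editor's illustration (not part of the original source): for the unichar
// "A", the loop above sets isalpha/isupper, records other_case "a", script
// "Latin" and direction U_LEFT_TO_RIGHT; for "(", it sets ispunct and, via
// u_charMirror, records the mirror ")".
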
// Helper to set the properties for an input unicharset file, writes to the
// output file. If an appropriate script unicharset can be found in the
// script_dir directory, then the tops and bottoms are expanded using the
// script unicharset.
// If non-empty, xheight data for the fonts are written to the xheights_file.
void SetPropertiesForInputFile(const string& script_dir,
                               const string& input_unicharset_file,
                               const string& output_unicharset_file,
                               const string& output_xheights_file) {
  UNICHARSET unicharset;

  // Load the input unicharset.
  unicharset.load_from_file(input_unicharset_file.c_str());
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());

  // Set unichar properties.
  tprintf("Setting unichar properties\n");
  SetupBasicProperties(true, &unicharset);
  string xheights_str;
  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
    // Load the unicharset for the script if available.
    string filename = script_dir + "/" +
        unicharset.get_script_from_script_id(s) + ".unicharset";
    UNICHARSET script_set;
    if (script_set.load_from_file(filename.c_str())) {
      unicharset.SetPropertiesFromOther(script_set);
    }
    // Load the xheights for the script if available.
    filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
        ".xheights";
    string script_heights;
    if (File::ReadFileToString(filename, &script_heights))
      xheights_str += script_heights;
  }
  if (!output_xheights_file.empty())
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
    if (unicharset.PropertiesIncomplete(c)) {
      tprintf("Warning: properties incomplete for index %d = %s\n",
              c, unicharset.id_to_unichar(c));
    }
  }

  // Write the output unicharset.
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  unicharset.save_to_file(output_unicharset_file.c_str());
}

}  // namespace tesseract

50
training/unicharset_training_utils.h
Normal file
@@ -0,0 +1,50 @@
///////////////////////////////////////////////////////////////////////
// File:        unicharset_training_utils.h
// Description: Training utilities for UNICHARSET.
// Author:      Ray Smith
// Created:     Fri Oct 17 17:14:01 PDT 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_
#define TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_

#include <string>

#ifdef USE_STD_NAMESPACE
using std::string;
#endif

class STATS;
class UNICHARSET;

namespace tesseract {

// Helper sets the character attribute properties and sets up the script table.
// Does not set tops and bottoms.
void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset);

// Helper to set the properties for an input unicharset file, writes to the
// output file. If an appropriate script unicharset can be found in the
// script_dir directory, then the tops and bottoms are expanded using the
// script unicharset.
// If non-empty, xheight data for the fonts are written to the xheights_file.
void SetPropertiesForInputFile(const string& script_dir,
                               const string& input_unicharset_file,
                               const string& output_unicharset_file,
                               const string& output_xheights_file);

}  // namespace tesseract

#endif  // TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_