Major updates to training system as a result of extensive testing on 100 languages

2025-06-07 18:02:40 +08:00 · 2015-05-12 18:04:31 -07:00 · 2015-05-12 18:04:31 -07:00 · 6be25156f7
commit 6be25156f7
parent 21805e63a4
11 changed files with 2103 additions and 731 deletions
--- a/training/language-specific.sh
+++ b/training/language-specific.sh
--- a/training/ligature_table.cpp
+++ b/training/ligature_table.cpp
@ -43,7 +43,7 @@ static string EncodeAsUTF8(const char32 ch32) {
 // from. Note that this range does not contain the custom ligatures that we
 // encode in the private use area.
 const int kMinLigature = 0xfb00;
-const int kMaxLigature = 0xfb4f;
+const int kMaxLigature = 0xfb17;  // Don't put the wide Hebrew letters in.
 /* static */
 SmartPtr<LigatureTable> LigatureTable::instance_;
--- a/training/pango_font_info.cpp
+++ b/training/pango_font_info.cpp
@ -51,6 +51,12 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
 BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
                "Does a one-time deletion of cache files from the "
                "fontconfig_tmpdir before initializing fontconfig.");
 BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true,
                "Does a one-time reset of the fontconfig config file to point"
                " to fonts_dir before initializing fontconfig. Set to true"
                " if fontconfig_refresh_cache is true. Set it to false to use"
                " multiple instances in separate processes without having to"
                " rescan the fonts_dir, using a previously setup font cache");
 #ifndef USE_STD_NAMESPACE
 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
@ -67,6 +73,8 @@ namespace tesseract {
 // in pixels.
 const int kDefaultResolution = 300;
 bool PangoFontInfo::fontconfig_initialized_ = false;
 // Default constructor: starts with no font description and the default
 // resolution, then calls Clear().
 PangoFontInfo::PangoFontInfo()
    : desc_(NULL),
      resolution_(kDefaultResolution) {
  Clear();
 }
@ -103,34 +111,35 @@ string PangoFontInfo::DescriptionName() const {
 // Initializes Fontconfig for use by writing a fake fonts.conf file into the
 // FLAGS_fontconfig_tmpdir directory, that points to the supplied
-// FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
+// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
-// to point to this fonts.conf file.
+// to point to this fonts.conf file. If force_clear, the cache is refreshed
-static void InitFontconfig() {
+// even if it has already been initialized.
-  static bool init_fontconfig = false;
+void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) {
-  if (init_fontconfig || FLAGS_fonts_dir.empty()) {
+  if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
-    init_fontconfig = true;
+    fontconfig_initialized_ = true;
    return;
  }
-  if (FLAGS_fontconfig_refresh_cache) {
+  if (FLAGS_fontconfig_refresh_cache || force_clear) {
    tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str());
    File::DeleteMatchingFiles(File::JoinPath(
-        FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str());
+        FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str());
  }
  if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
      force_clear) {
    const int MAX_FONTCONF_FILESIZE = 1024;
    char fonts_conf_template[MAX_FONTCONF_FILESIZE];
    snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
             "<?xml version=\"1.0\"?>\n"
             "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
             "<fontconfig>\n"
             "<dir>%s</dir>\n"
             "<cachedir>%s</cachedir>\n"
             "<config></config>\n"
             "</fontconfig>", fonts_dir.c_str(),
             FLAGS_fontconfig_tmpdir.c_str());
    string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
                                            "fonts.conf");
    File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
  }
  tprintf("Initializing fontconfig\n");
  const int MAX_FONTCONF_FILESIZE = 1024;
  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
           "<?xml version=\"1.0\"?>\n"
           "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
           "<fontconfig>\n"
           "<dir>%s</dir>\n"
           "<cachedir>%s</cachedir>\n"
           "<config></config>\n"
           "</fontconfig>", FLAGS_fonts_dir.c_str(),
           FLAGS_fontconfig_tmpdir.c_str());
  string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
                                          "fonts.conf");
  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
 #ifdef _WIN32
  std::string env("FONTCONFIG_PATH=");
  env.append(FLAGS_fontconfig_tmpdir.c_str());
@ -141,12 +150,18 @@ static void InitFontconfig() {
  // Fix the locale so that the reported font names are consistent.
  setenv("LANG", "en_US.utf8", true);
 #endif  // _WIN32
-  init_fontconfig = true;
+  if (!fontconfig_initialized_ || force_clear) {
    if (FcInitReinitialize() != FcTrue) {
      tprintf("FcInitiReinitialize failed!!\n");
    }
  }
  fontconfig_initialized_ = true;
  FontUtils::ReInit();
 }
 static void ListFontFamilies(PangoFontFamily*** families,
                             int* n_families) {
-  InitFontconfig();
+  PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
  PangoFontMap* font_map = pango_cairo_font_map_get_default();
  DISABLE_HEAP_LEAK_CHECK;
  pango_font_map_list_families(font_map, families, n_families);
@ -220,7 +235,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
 // in the font map. Note that if the font is wholly missing, this could
 // correspond to a completely different font family and face.
 PangoFont* PangoFontInfo::ToPangoFont() const {
-  InitFontconfig();
+  InitFontConfig(false, FLAGS_fonts_dir);
  PangoFontMap* font_map = pango_cairo_font_map_get_default();
  PangoContext* context = pango_context_new();
  pango_cairo_context_set_resolution(context, resolution_);
@ -253,6 +268,28 @@ bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const
  return true;
 }
 // This variant of strncpy permits src and dest to overlap as long as dest does
 // not start after src: bytes are copied front to back, first byte first.
 // Copies at most n bytes from src, stopping at the first nul byte. If src holds
 // fewer than n bytes, the rest of the n bytes in dest are set to nul. As with
 // strncpy, dest is not nul-terminated when src holds n or more bytes.
 // Returns dest.
 static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* ret = dest;
  // Test before copying, so that n == 0 is a no-op instead of wrapping the
  // unsigned counter, and so that src is never read past its terminating nul.
  while (n && *src) {
    *dest = *src;
    --n;
    ++dest;
    ++src;
  }
  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n) {
    *dest = '\0';
    --n;
    ++dest;
  }
  return ret;
 }
 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
  PangoFont* font = ToPangoFont();
  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
@ -265,23 +302,30 @@ int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
      UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
  const UNICHAR::const_iterator it_end =
      UNICHAR::end(utf8_text->c_str(), utf8_text->length());
-  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
+  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
    // Skip bad utf-8.
-    if (!it.is_legal())
+    if (!it.is_legal()) {
-      continue;  // One suitable error message will still be issued.
+      ++it;  // One suitable error message will still be issued.
-    if (!IsWhitespace(*it) && !pango_is_zero_width(*it) &&
+      continue;
-        pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
+    }
    int unicode = *it;
    int utf8_len = it.utf8_len();
    const char* utf8_char = it.utf8_data();
    // Move it forward before the data gets modified.
    ++it;
    if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
        pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
      if (TLOG_IS_ON(2)) {
-        char tmp[5];
+        UNICHAR unichar(unicode);
-        int len = it.get_utf8(tmp);
+        char* str = unichar.utf8_str();
-        tmp[len] = '\0';
+        tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
-        tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
+        delete[] str;
      }
      ++num_dropped_chars;
      continue;
    }
-    strncpy(out, it.utf8_data(), it.utf8_len());
+    my_strnmove(out, utf8_char, utf8_len);
-    out += it.utf8_len();
+    out += utf8_len;
  }
  utf8_text->resize(out - utf8_text->c_str());
  return num_dropped_chars;
@ -438,6 +482,7 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
 // ------------------------ FontUtils ------------------------------------
 vector<string> FontUtils::available_fonts_;  // cache list
 // Returns whether the specified font description is available in the fonts
 // directory.
@ -449,7 +494,8 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
 // from the font_map, and then check what we loaded to see if it has the
 // description we expected. If it is not, then the font is deemed unavailable.
 /* static */
-bool FontUtils::IsAvailableFont(const char* input_query_desc) {
+bool FontUtils::IsAvailableFont(const char* input_query_desc,
                                string* best_match) {
  string query_desc(input_query_desc);
  if (PANGO_VERSION <= 12005) {
    // Strip commas and any ' Medium' substring in the name.
@ -466,7 +512,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
      query_desc.c_str());
  PangoFont* selected_font = NULL;
  {
-    InitFontconfig();
+    PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
    PangoFontMap* font_map = pango_cairo_font_map_get_default();
    PangoContext* context = pango_context_new();
    pango_context_set_font_map(context, font_map);
@ -490,7 +536,16 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
  char* selected_desc_str = pango_font_description_to_string(selected_desc);
  // Log both the requested description and the one that was actually selected.
  tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(),
       selected_desc_str);
-
+  if (!equal && best_match != NULL) {
    *best_match = selected_desc_str;
    // Clip the ending ' 0' if there is one. It seems that, if there is no
    // point size on the end of the fontname, then Pango always appends ' 0'.
    int len = best_match->size();
    if (len > 2 && best_match->at(len - 1) == '0' &&
        best_match->at(len - 2) == ' ') {
      *best_match = best_match->substr(0, len - 2);
    }
  }
  g_free(selected_desc_str);
  pango_font_description_free(selected_desc);
  g_object_unref(selected_font);
@ -512,7 +567,6 @@ static bool ShouldIgnoreFontFamilyName(const char* query) {
 // Outputs description names of available fonts.
 /* static */
 const vector<string>& FontUtils::ListAvailableFonts() {
  static vector<string> available_fonts_;  // cache list
  if (available_fonts_.size()) {
    return available_fonts_;
  }
@ -536,8 +590,9 @@ const vector<string>& FontUtils::ListAvailableFonts() {
  for (int i = 0; i < n_families; ++i) {
    const char* family_name = pango_font_family_get_name(families[i]);
    tlog(2, "Listing family %s\n", family_name);
-    if (ShouldIgnoreFontFamilyName(family_name))
+    if (ShouldIgnoreFontFamilyName(family_name)) {
      continue;
    }
    int n_faces;
    PangoFontFace** faces = NULL;
@ -733,4 +788,8 @@ bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
  return false;
 }
 // Empties the cached list of available fonts. Called after PangoFontInfo has
 // been reinitialized, when the cached list may no longer be valid.
 /* static */
 void FontUtils::ReInit() {
  available_fonts_.clear();
 }
 }  // namespace tesseract
--- a/training/pango_font_info.h
+++ b/training/pango_font_info.h
@ -83,6 +83,11 @@ class PangoFontInfo {
  bool GetSpacingProperties(const string& utf8_char,
                            int* x_bearing, int* x_advance) const;
  // Initializes FontConfig by setting its environment variable and creating
  // a fonts.conf file that points to the given fonts_dir. Once initialized,
  // it is not re-initialized unless force_clear is true.
  static void InitFontConfig(bool force_clear, const string& fonts_dir);
  // Accessors
  string DescriptionName() const;
  // Font Family name eg. "Arial"
@ -123,6 +128,10 @@ class PangoFontInfo {
  // Default output resolution to assume for GetSpacingProperties() and any
  // other methods that returns pixel values.
  int resolution_;
  // Fontconfig operates through an environment variable, so it intrinsically
  // cannot be thread-friendly, but you can serialize multiple independent
  // font configurations by calling InitFontConfig(true, path).
  static bool fontconfig_initialized_;
 private:
  PangoFontInfo(const PangoFontInfo&);
@ -135,7 +144,13 @@ class FontUtils {
 public:
  // Returns true if the font of the given description name is available in the
  // target directory specified by --fonts_dir
-  static bool IsAvailableFont(const char* font_desc);
+  static bool IsAvailableFont(const char* font_desc) {
    return IsAvailableFont(font_desc, NULL);
  }
  // Returns true if the font of the given description name is available in the
  // target directory specified by --fonts_dir. If false is returned, and
  // best_match is not NULL, the closest matching font is returned there.
  static bool IsAvailableFont(const char* font_desc, string* best_match);
  // Outputs description names of available fonts.
  static const vector<string>& ListAvailableFonts();
@ -181,6 +196,12 @@ class FontUtils {
  static int FontScore(const unordered_map<char32, inT64>& ch_map,
                       const string& fontname, int* raw_score,
                       vector<bool>* ch_flags);
  // PangoFontInfo is reinitialized, so clear the static list of fonts.
  static void ReInit();
 private:
  static vector<string> available_fonts_;  // cache list
 };
 }  // namespace tesseract
--- a/training/set_unicharset_properties.cpp
+++ b/training/set_unicharset_properties.cpp
@ -7,14 +7,8 @@
 #include <string>
 #include "commandlineflags.h"
-#include "fileio.h"
+#include "tprintf.h"
-#include "genericvector.h"
+#include "unicharset_training_utils.h"
 #include "icuerrorcode.h"
 #include "normstrngs.h"
 #include "strngs.h"
 #include "unicharset.h"
 #include "unicode/uchar.h"    // from libicu
 #include "unicode/uscript.h"  // from libicu
 // The directory that is searched for universal script unicharsets.
 STRING_PARAM_FLAG(script_dir, "",
@ -25,157 +19,6 @@ DECLARE_STRING_PARAM_FLAG(U);
 DECLARE_STRING_PARAM_FLAG(O);
 DECLARE_STRING_PARAM_FLAG(X);
 namespace tesseract {
 // Helper sets the character attribute properties and sets up the script table.
 // Does not set tops and bottoms.
 // For every unichar id in the set this fills in, using ICU: the
 // alpha/lower/upper/digit/punctuation flags, the script name, the other-case
 // id, the text direction, the mirror id and the normalized string. It finishes
 // by calling post_load_setup() on the unicharset.
 static void SetupBasicProperties(UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    // If the unichar matches entry [1] of a kCustomLigatures row, entry [0] of
    // that row is used for all of the property lookups below.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }
    // Convert the unichar to UTF32 representation
    GenericVector<char32> uni_vector;
    tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;
    for (int i = 0; i < uni_vector.size(); ++i) {
      if (u_isalpha(uni_vector[i]))
        unichar_isalpha = true;
      if (u_islower(uni_vector[i]))
        unichar_islower = true;
      if (u_isupper(uni_vector[i]))
        unichar_isupper = true;
      if (u_isdigit(uni_vector[i]))
        unichar_isdigit = true;
      if (u_ispunct(uni_vector[i]))
        unichar_ispunct = true;
    }
    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
    // The script is taken from the first code point only.
    // NOTE(review): uni_vector[0] is read without checking that the conversion
    // produced any code points -- confirm unichar_str can never be empty here.
    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));
    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    // The other case defaults to the unichar itself and is only replaced when
    // the case-converted string is found in the unicharset.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      GenericVector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      STRING other_case_uch;
      tesseract::UTF32ToUTF8(other_case, &other_case_uch);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }
    // Set RTL property and obtain mirror unichar ID from ICU.
    // Every code point is mirrored individually; the mirror id is only stored
    // when the resulting string is itself in the unicharset.
    GenericVector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    STRING mirror_uch;
    tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }
    // Record normalized version of this unichar.
    // Id 0, and any unichar whose normalized form is empty, keeps its own
    // string as the normalized form.
    STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
    if (unichar_id != 0 && normed_str.length() > 0) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
  }
  unicharset->post_load_setup();
 }
 // Helper to set the properties for an input unicharset file, writes to the
 // output file. If an appropriate script unicharset can be found in the
 // script_dir directory, then the tops and bottoms are expanded using the
 // script unicharset.
 // If non-empty, xheight data for the fonts are written to the xheights_file.
 static void SetPropertiesForInputFile(const string& script_dir,
                                      const string& input_unicharset_file,
                                      const string& output_unicharset_file,
                                      const string& output_xheights_file) {
  UNICHARSET unicharset;
  // Load the input unicharset
  unicharset.load_from_file(input_unicharset_file.c_str());
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());
  // Set unichar properties
  tprintf("Setting unichar properties\n");
  SetupBasicProperties(&unicharset);
  string xheights_str;
  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
    // Load the unicharset for the script if available.
    string filename = script_dir + "/" +
        unicharset.get_script_from_script_id(s) + ".unicharset";
    UNICHARSET script_set;
    if (script_set.load_from_file(filename.c_str())) {
      unicharset.SetPropertiesFromOther(script_set);
    }
    // Load the xheights for the script if available.
    filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
        ".xheights";
    string script_heights;
    if (File::ReadFileToString(filename, &script_heights))
      xheights_str += script_heights;
  }
  if (!output_xheights_file.empty())
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  // Write the output unicharset
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  unicharset.save_to_file(output_unicharset_file.c_str());
 }
 }  // namespace tesseract
 int main(int argc, char** argv) {
  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
--- a/training/stringrenderer.cpp
+++ b/training/stringrenderer.cpp
@ -819,6 +819,7 @@ int StringRenderer::RenderToImage(const char* text, int text_length,
 int StringRenderer::RenderAllFontsToImage(double min_coverage,
                                          const char* text, int text_length,
                                          string* font_used, Pix** image) {
  *image = NULL;
  // Select a suitable font to render the title with.
  const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
  string title_font;
@ -882,10 +883,9 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage,
              all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
    }
  }
  *image = NULL;
  font_index_ = 0;
  char_map_.clear();
-  return last_offset_;
+  return last_offset_ == 0 ? -1 : last_offset_;
 }
 }  // namespace tesseract
--- a/training/tesstrain.sh
+++ b/training/tesstrain.sh
@ -44,516 +44,7 @@
 # appropriate --fonts_dir path.
-FONTS=(
+source `dirname $0`/tesstrain_utils.sh
    "Arial" \
    "Times New Roman," \
 )
 # Default settings; parse_flags overrides most of them from the command line.
 # Directory passed to text2image as --fonts_dir (flag: --fonts_dir).
 FONTS_DIR="/usr/share/fonts/truetype/"
 # Output directory (flag: --output_dir).
 OUTPUT_DIR="/tmp/tesstrain/tessdata"
 # Set to 1 by --overwrite.
 OVERWRITE=0
 # Set to 1 by --run_shape_clustering.
 RUN_SHAPE_CLUSTERING=0
 # Toggled by --extract_font_properties / --noextract_font_properties.
 EXTRACT_FONT_PROPERTIES=1
 # Parent of the per-language training directory ${WORKSPACE_DIR}/${LANG_CODE}.
 WORKSPACE_DIR="/tmp/tesstrain"
 # Logging helper functions.
 # tlog echoes its arguments (backslash escapes are interpreted because of -e)
 # and pipes the text through tee, which appends it to ${LOG_FILE}.
 # Usage: tlog MESSAGE...
 tlog() {
    echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE}
 }
 # Logs the given message prefixed with "ERROR: " (echoed and appended to
 # ${LOG_FILE} through tee), then terminates the script with exit status 1.
 # Usage: err MESSAGE...
 err() {
    echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE}
    exit 1
 }
 # Helper function to run a command and append its output to a log. Aborts early
 # if the program file is not found (or is not executable), and aborts when the
 # program itself exits with a non-zero status.
 # Usage: run_cmd CMD ARG1 ARG2...
 run_cmd() {
    local cmd=$1
    shift
    if [[ ! -x ${cmd} ]]; then
        err "File ${cmd} not found"
    fi
    tlog "[$(date)] ${cmd} $@"
    ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
    # Check completion status. $? would be the status of tee, the last command
    # of the pipeline, so take the program's own status from PIPESTATUS. This
    # must be the first command after the pipeline.
    local cmd_status=${PIPESTATUS[0]}
    if [[ ${cmd_status} -gt 0 ]]; then
        err "Program $(basename ${cmd}) failed. Abort."
    fi
 }
 # Check if all the given files exist, or exit otherwise.
 # Used to check required input files and produced output files in each phase.
 # Each argument is tested as given: "$@" keeps a file name that contains
 # whitespace or glob characters as a single name.
 # Usage: check_file_readable FILE1 FILE2...
 check_file_readable() {
    # Keep the loop variable out of the global namespace.
    local file
    for file in "$@"; do
        if [[ ! -r ${file} ]]; then
            err "${file} does not exist or is not readable"
        fi
    done
 }
 # Write a file (with name specified in $2) with records that account for
 # n% (specified in $3) of the total weights of records in the input file
 # (input file name specified in $1). The input file should have one record
 # per line along with its weight separated by \t. The records should be
 # sorted in non-ascending order of frequency.
 # If $4 is non-empty the first record (line) of the input is skipped.
 # Records whose first field is a single space are printed but do not count
 # towards the weights. Output is appended to the output file, not truncated.
 # USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE [SKIP_FIRST]
 discard_tail() {
    local infile=$1
    local outfile=$2
    local pct=$3
    local skip_first=$4
    # Line number at which tail starts reading: 1 = whole file, 2 = skip line 1.
    local more_arg="1";
    if [[ ${skip_first} ]]; then
        more_arg="2"
    fi
    # Total weight of the records considered (empty when there are no records).
    local sum=$(tail -n +${more_arg} ${infile} \
        | awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}')
    if [[ ${sum} == "" ]]; then sum=0
    fi
    # Weight budget: records are printed until it has been used up.
    local limit=$((${sum}*${pct}/100))
    tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"}
        {if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \
            >> ${outfile}
 }
 # Set global path variables that are based on parsed flags.
 # Requires BINDIR to be non-empty (aborts through err otherwise) and derives
 # the path of each training executable from it.
 set_prog_paths() {
    if [[ -z ${BINDIR} ]]; then
        err "Need to specify location of program files"
    fi
    CN_TRAINING_EXE=${BINDIR}/cntraining
    COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
    MF_TRAINING_EXE=${BINDIR}/mftraining
    SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
    SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
    TESSERACT_EXE=${BINDIR}/tesseract
    TEXT2IMAGE_EXE=${BINDIR}/text2image
    UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
    WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
 }
 # Sets the named variable to given value. Aborts if the value is missing or
 # if it looks like a flag (starts with "--").
 # The value is stored literally: it is not re-evaluated by the shell, so
 # quotes, '$' and backticks inside it are kept as plain characters.
 # Usage: parse_value VAR_NAME VALUE
 parse_value() {
    local val="$2"
    # err terminates the script, so nothing after it runs.
    if [[ -z $val ]]; then
        err "Missing value for variable $1"
    fi
    if [[ ${val:0:2} == "--" ]]; then
        err "Invalid value $val passed for variable $1"
    fi
    # printf -v assigns to the variable named by $1 without going through eval.
    printf -v "$1" '%s' "$val"
 }
 # Does simple command-line parsing and initialization.
 # Reads the arguments from the global ARGV array, stores the flag values in
 # global variables, aborts (through err) on an unrecognized argument or when
 # a required flag (--lang, --bin_dir, --langdata_dir, and --tessdata_dir
 # unless TESSDATA_PREFIX is set) is missing, and finally derives the program
 # paths and the per-language input/output file names.
 parse_flags() {
    local i=0
    while test $i -lt ${#ARGV[@]}; do
        # Index of the value that follows the current flag.
        local j=$((i+1))
        case ${ARGV[$i]} in
            --)
                break;;
            --bin_dir)
                # Values are quoted so that ones containing whitespace reach
                # parse_value as a single argument.
                parse_value "BINDIR" "${ARGV[$j]}"
                i=$j ;;
            --fontlist)   # Expect a plus-separated list of names
                if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
                    err "Invalid value passed to --fontlist"
                fi
                local ofs=$IFS
                IFS='+'
                FONTS=( ${ARGV[$j]} )
                IFS=$ofs
                i=$j ;;
            --fonts_dir)
                parse_value "FONTS_DIR" "${ARGV[$j]}"
                i=$j ;;
            --lang)
                parse_value "LANG_CODE" "${ARGV[$j]}"
                i=$j ;;
            --langdata_dir)
                parse_value "LANGDATA_ROOT" "${ARGV[$j]}"
                i=$j ;;
            --output_dir)
                parse_value "OUTPUT_DIR" "${ARGV[$j]}"
                i=$j ;;
            --overwrite)
                OVERWRITE=1 ;;
            --extract_font_properties)
                EXTRACT_FONT_PROPERTIES=1 ;;
            --noextract_font_properties)
                EXTRACT_FONT_PROPERTIES=0 ;;
            --run_shape_clustering)
                RUN_SHAPE_CLUSTERING=1 ;;
            --tessdata_dir)
                parse_value "TESSDATA_DIR" "${ARGV[$j]}"
                i=$j ;;
            --training_text)
                parse_value "TRAINING_TEXT" "${ARGV[$j]}"
                i=$j ;;
            --wordlist)
                parse_value "WORDLIST_FILE" "${ARGV[$j]}"
                i=$j ;;
            *)
                err "Unrecognized argument ${ARGV[$i]}" ;;
        esac
        i=$((i+1))
    done
    if [[ -z ${LANG_CODE} ]]; then
        err "Need to specify a language --lang"
    fi
    if [[ -z ${BINDIR} ]]; then
        err "Need to specify path to built binaries --bin_dir"
    fi
    if [[ -z ${LANGDATA_ROOT} ]]; then
        err "Need to specify path to language files --langdata_dir"
    fi
    if [[ -z ${TESSDATA_DIR} ]]; then
        if [[ -z ${TESSDATA_PREFIX} ]]; then
            err "Need to specify a --tessdata_dir or have a "\
        "TESSDATA_PREFIX variable defined in your environment"
        else
            TESSDATA_DIR="${TESSDATA_PREFIX}"
        fi
    fi

    set_prog_paths

    # Location where intermediate files will be created.
    TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
    # Location of log file for the whole run.
    LOG_FILE=${TRAINING_DIR}/tesstrain.log

    # Take training text and wordlist from the langdata directory if not
    # specified in the command-line.
    if [[ -z ${TRAINING_TEXT} ]]; then
        TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text
    fi
    if [[ -z ${WORDLIST_FILE} ]]; then
        WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean
    fi
    WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean
    NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
    PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
    BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
    UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
    TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
 }
 # Phase I : Generate (I)mages from training text for each font.
 # Aborts if the training text is missing or unreadable. When font properties
 # are being extracted and a bigram frequency file is available, first builds
 # the n-gram training text from it. Then, for every font in FONTS, renders the
 # training text (and the n-gram text, when present) with text2image and checks
 # that the .box and .tif outputs were produced.
 phaseI_generate_image() {
    tlog "\n=== Phase I: Generating training images ==="
    if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then
        err "Could not find training text file ${TRAINING_TEXT}"
    fi
    BOX_PADDING="0"
    CHAR_SPACING="0.0"
    EXPOSURE="0"
    LEADING="32"
    NGRAM_CHAR_SPACING="0.0"

    # The readable-file test has to use BIGRAM_FREQS_FILE, the variable that is
    # actually defined and that the body reads.
    if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
        # Parse .bigram_freqs file and compose a .train_ngrams file with text
        # for tesseract to recognize during training. Take only the ngrams whose
        # combined weight accounts for 99% of all the bigrams in the language.
        TMP_FILE="${TRAINING_DIR}/_tmp"
        cat ${BIGRAM_FREQS_FILE} > ${TMP_FILE}
        NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
            | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
        cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
            | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
            x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
        check_file_readable ${TRAIN_NGRAMS_FILE}
    fi

    for font in "${FONTS[@]}"; do
        tlog "Rendering using ${font}"
        # File-name friendly font name: spaces become '_', commas are dropped.
        fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
        outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}

        common_args="--leading=${LEADING} --fonts_dir=${FONTS_DIR} "
        common_args+=" --box_padding=${BOX_PADDING} --strip_unrenderable_words"

        run_cmd ${TEXT2IMAGE_EXE} ${common_args} \
            --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE} \
            --font="${font}" --outputbase=${outbase} --text=${TRAINING_TEXT}
        check_file_readable ${outbase}.box ${outbase}.tif

        if (( ${EXTRACT_FONT_PROPERTIES} )) &&
            [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
            tlog "Rendering ngrams using ${font}"
            # Make sure the directory for the n-gram output exists.
            mkdir -p ${TRAINING_DIR}/ngrams
            outbase=${TRAINING_DIR}/ngrams/${LANG_CODE}.ngrams.${fontname}.exp${EXPOSURE}
            run_cmd ${TEXT2IMAGE_EXE} ${common_args} \
                --char_spacing=${NGRAM_CHAR_SPACING} --exposure=${EXPOSURE} \
                --font="${font}" --outputbase=${outbase} \
                --box_padding=${BOX_PADDING} --render_ngrams=1 \
                --text=${TRAIN_NGRAMS_FILE}
            check_file_readable ${outbase}.box ${outbase}.tif
        fi
    done
 }
 # Phase UP : Generate (U)nicharset and (P)roperties file.
 # Runs the unicharset extractor over every .box file in TRAINING_DIR, renames
 # the resulting ${TRAINING_DIR}/unicharset to ${LANG_CODE}.unicharset, then
 # runs set_unicharset_properties on it in place (the same file is passed as
 # -U and -O) with XHEIGHTS_FILE passed as -X. Sets the globals UNICHARSET_FILE
 # and XHEIGHTS_FILE. Aborts if an expected file is not readable.
 phaseUP_generate_unicharset() {
    tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
    box_files=$(ls ${TRAINING_DIR}/*.box)
    run_cmd ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
    # The extractor is expected to have left a file named "unicharset" here.
    outfile=${TRAINING_DIR}/unicharset
    UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
    check_file_readable ${outfile}
    mv ${outfile} ${UNICHARSET_FILE}
    XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
    check_file_readable ${UNICHARSET_FILE}
    run_cmd ${SET_UNICHARSET_PROPERTIES_EXE} \
        -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
        --script_dir=${LANGDATA_ROOT}
    check_file_readable ${XHEIGHTS_FILE}
 }
 # Phase D : Generate (D)awg files from unicharset file and wordlist files
 # Builds up to five DAWGs with ${WORDLIST2DAWG_EXE}: word, frequent-word,
 # punctuation, number and bigram. Each one is only built when its input list
 # is readable. Sets the globals WORD_DAWG, FREQ_DAWG, PUNC_DAWG, NUMBER_DAWG,
 # BIGRAM_DAWG and FREQ_DAWG_SIZE, and reads UNICHARSET_FILE.
 phaseD_generate_dawg() {
    tlog "\n=== Phase D: Generating Dawg files ==="
    # Output files
    WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
    FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
    PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
    NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
    BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
    # Word DAWG
    local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
    if [[ -r ${WORDLIST_FILE} ]]; then
        tlog "Generating word Dawg"
        check_file_readable ${UNICHARSET_FILE}
        run_cmd ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
            ${UNICHARSET_FILE}
        check_file_readable ${WORD_DAWG}
        # The first FREQ_DAWG_SIZE lines of the wordlist become the input of
        # the frequent-word DAWG below (presumably the list is sorted with the
        # most frequent words first -- TODO confirm).
        FREQ_DAWG_SIZE=100
        head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
    fi
    # Freq-word DAWG
    if [[ -r ${freq_wordlist_file} ]]; then
        check_file_readable ${UNICHARSET_FILE}
        tlog "Generating frequent-word Dawg"
        run_cmd ${WORDLIST2DAWG_EXE}  -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \
            ${UNICHARSET_FILE}
        check_file_readable ${FREQ_DAWG}
    fi
    # Punctuation DAWG
    # Defaults to the shared common.punc list; a readable language-specific
    # punctuation file replaces it with a trimmed ".punc.top" file.
    local punc_clean="${LANGDATA_ROOT}/common.punc"
    if [[ -r ${PUNC_FILE} ]]; then
        local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top
        # Seed the output with the first tab-separated field of line 1, then
        # let discard_tail add further records (called with 99 and 1).
        head -n 1 ${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
            > ${top_punc_file}
        discard_tail ${PUNC_FILE} ${top_punc_file} 99 1
        punc_clean="${top_punc_file}"
    fi
    # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
    # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
    # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
    # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
    # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
    # Note: policy 2 is only selected for "heb" and "ara"; every other
    # language builds its punctuation DAWG with policy 0.
    local punc_reverse_policy=0;
    if [[ ${LANG_CODE} == "heb" || ${LANG_CODE} == "ara" ]]; then
        punc_reverse_policy=2
    fi
    if [[ -r ${punc_clean} ]]; then
        run_cmd ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
            ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE}
        check_file_readable ${PUNC_DAWG}
    fi
    # Numbers DAWG
    if [[ -r ${NUMBERS_FILE} ]]; then
        local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top
        # Same seeding scheme as for punctuation, with discard_tail called
        # with 85 instead of 99.
        head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
            > ${top_num_file}
        discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1
        run_cmd ${WORDLIST2DAWG_EXE} -r 0 \
            ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE}
        check_file_readable ${NUMBER_DAWG}
    fi
    # Bigram dawg
    if [[ -r ${WORD_BIGRAMS_FILE} ]]; then
        run_cmd ${WORDLIST2DAWG_EXE} -r 1 \
            ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
        check_file_readable ${BIGRAM_DAWG}
    fi
 }
 # Phase E : (E)xtract .tr feature files from .tif/.box files
 # Runs ${TESSERACT_EXE} with the "box.train" config (plus the language's own
 # .config file when one is readable) on every rendered .tif image in
 # ${TRAINING_DIR}. TESSDATA_PREFIX is pointed at ${TESSDATA_DIR} for the
 # duration of the runs and put back to its previous value afterwards.
 phaseE_extract_features() {
    tlog "\n=== Phase E: Extracting features ==="
    local box_config="box.train"
    TRAIN_EXPOSURES='0'
    # Start from an empty, function-local list. Without this the list would be
    # appended to whatever img_files already held (an inherited environment
    # variable or the result of an earlier call).
    local img_files=""
    for exposure in ${TRAIN_EXPOSURES}; do
        img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
    done
    # Use any available language-specific configs.
    local config=""
    if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
        config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
    fi
    OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
    export TESSDATA_PREFIX=${TESSDATA_DIR}
    tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
    for img_file in ${img_files}; do
        # The output base is the image path with its last extension removed.
        run_cmd ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
            ${box_config} ${config}
    done
    export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
 }
 # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
 # Runs ${CN_TRAINING_EXE} over every .tr file in ${TRAINING_DIR} and renames
 # the resulting "normproto" to ${TRAINING_DIR}/${LANG_CODE}.normproto.
 phaseC_cluster_prototypes() {
    tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
    local raw_normproto=${TRAINING_DIR}/normproto
    local tr_files=$(ls ${TRAINING_DIR}/*.tr)
    run_cmd ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" ${tr_files}
    check_file_readable ${raw_normproto}
    mv ${raw_normproto} ${TRAINING_DIR}/${LANG_CODE}.normproto
 }
 # Phase S : (S)hape clustering
 # Optional phase: does nothing unless RUN_SHAPE_CLUSTERING is non-zero.
 # Runs ${SHAPE_TRAINING_EXE} over every .tr file in ${TRAINING_DIR} and checks
 # that a shapetable and ${LANG_CODE}.mfunicharset were produced.
 phaseS_cluster_shapes() {
    if (( ! ${RUN_SHAPE_CLUSTERING} )); then
        return
    fi
    check_file_readable ${LANGDATA_ROOT}/font_properties
    # NOTE(review): if check_file_readable aborts on a missing file, the else
    # branch below can never run -- confirm against its definition.
    local font_props=${LANGDATA_ROOT}/font_properties
    if [[ -r ${font_props} ]]; then
        font_props="-F ${font_props}"
    else
        font_props=""
    fi
    # Only pass -X when the xheights file is readable and non-empty.
    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
     [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
    fi
    # ${font_props} is deliberately left unquoted so that it splits into
    # separate "-F file" / "-X file" arguments.
    run_cmd ${SHAPE_TRAINING_EXE} \
        -D "${TRAINING_DIR}/" \
        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
        ${font_props} \
        $(ls ${TRAINING_DIR}/*.tr)
    check_file_readable ${TRAINING_DIR}/shapetable \
        ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
 }
 # Phase M : Clustering microfeatures (mfTraining)
 # Runs ${MF_TRAINING_EXE} over every .tr file in ${TRAINING_DIR}, then gives
 # inttemp, shapetable and pffmtable a ${LANG_CODE}. prefix and replaces the
 # language unicharset with the .mfunicharset written by the tool.
 phaseM_cluster_microfeatures() {
    tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
    local lang_base=${TRAINING_DIR}/${LANG_CODE}
    # Optional "-F <font_properties>" and "-X <xheights>" arguments; each is
    # only added when its file is usable.
    font_props=""
    if [[ -r ${LANGDATA_ROOT}/font_properties ]]; then
        font_props="-F ${LANGDATA_ROOT}/font_properties"
    fi
    if [[ -r ${lang_base}.xheights && -s ${lang_base}.xheights ]]; then
        font_props="${font_props} -X ${lang_base}.xheights"
    fi
    run_cmd ${MF_TRAINING_EXE} \
        -D "${TRAINING_DIR}/" \
        -U ${lang_base}.unicharset \
        -O ${lang_base}.mfunicharset \
        ${font_props} \
        $(ls ${TRAINING_DIR}/*.tr)
    check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
        ${TRAINING_DIR}/pffmtable ${lang_base}.mfunicharset
    # Prefix the un-named outputs with the language code.
    local table
    for table in inttemp shapetable pffmtable; do
        mv ${TRAINING_DIR}/${table} ${lang_base}.${table}
    done
    mv ${lang_base}.mfunicharset ${lang_base}.unicharset
 }
 # Phase B : copies a manually created unicharambigs file from the langdata
 # directory into ${TRAINING_DIR}, when the language provides one.
 phaseB_generate_ambiguities() {
  tlog "\n=== Phase B : ambiguities training ==="
  local src_ambigs=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs
  local dst_ambigs=${TRAINING_DIR}/${LANG_CODE}.unicharambigs
  # TODO: Add support for generating ambiguities automatically.
  if [[ ! -r ${src_ambigs} ]]; then
      tlog "No unicharambigs file found!"
      return
  fi
  tlog "Found file ${src_ambigs}"
  cp ${src_ambigs} ${dst_ambigs}
  # Make it writable, as it may be read-only in the client.
  chmod u+w ${dst_ambigs}
 }
 # Assembles ${LANG_CODE}.traineddata from the files in ${TRAINING_DIR} (plus
 # any optional components found in the langdata directory) and copies it to
 # ${OUTPUT_DIR}.
 make_traineddata() {
  tlog "\n=== Making final traineddata file ==="
  local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
  # Combine available files for this language from the langdata dir. Each
  # copy is made user-writable after it has been copied.
  local component
  for component in config cube-unicharset cube-word-dawg params-model; do
    if [[ -r ${lang_prefix}.${component} ]]; then
      tlog "Copying ${lang_prefix}.${component} to ${TRAINING_DIR}"
      cp ${lang_prefix}.${component} ${TRAINING_DIR}
      chmod u+w ${TRAINING_DIR}/${LANG_CODE}.${component}
    fi
  done
  # Compose the traineddata file.
  run_cmd ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
  # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
  if [[ ! -d ${OUTPUT_DIR} ]]; then
      tlog "Creating new directory ${OUTPUT_DIR}"
      mkdir -p ${OUTPUT_DIR}
  fi
  local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata
  if [[ -f ${destfile} ]]; then
      if (( ! ${OVERWRITE} )); then
          err "File ${destfile} exists and no --overwrite specified"
      fi
  fi
  tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
  cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
 }
 ARGV=("$@")
 parse_flags
@ -564,14 +55,21 @@ tlog "Cleaning workspace directory ${TRAINING_DIR}..."
 mkdir -p ${TRAINING_DIR}
 rm -fr ${TRAINING_DIR}/*
-phaseI_generate_image
+source `dirname $0`/language-specific.sh
-phaseUP_generate_unicharset
+set_lang_specific_parameters ${LANG_CODE}
-phaseD_generate_dawg
+
-phaseE_extract_features
+initialize_fontconfig
-phaseC_cluster_prototypes
+
-phaseS_cluster_shapes
+phase_I_generate_image 8
-phaseM_cluster_microfeatures
+phase_UP_generate_unicharset
-phaseB_generate_ambiguities
+phase_D_generate_dawg
-make_traineddata
+phase_E_extract_features "box.train" 8
 # Remaining training phases, run in order: prototype clustering, optional
 # shape clustering, microfeature clustering, ambiguities, final packaging.
 phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"
 # NOTE(review): ENABLE_SHAPE_CLUSTERING is not assigned anywhere in the
 # visible code, and phase_S_cluster_shapes itself tests RUN_SHAPE_CLUSTERING
 # -- confirm where this switch is meant to come from.
 if [[ "${ENABLE_SHAPE_CLUSTERING}" == "y" ]]; then
    phase_S_cluster_shapes
 fi
 phase_M_cluster_microfeatures
 phase_B_generate_ambiguities
 make__traineddata
 tlog "\nCompleted training for language '${LANG_CODE}'\n"
--- a/training/tesstrain_utils.sh
+++ b/training/tesstrain_utils.sh
@ -0,0 +1,578 @@
 #!/bin/bash
 # (C) Copyright 2014, Google Inc.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 # http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # This script defines functions that are used by tesstrain.sh
 # For a detailed description of the phases, see
 # https://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3
 #
 # USAGE: source tesstrain_utils.sh
 # Default settings; parse_flags overrides FONTS, FONTS_DIR, OUTPUT_DIR,
 # OVERWRITE and EXTRACT_FONT_PROPERTIES from the command line.
 # Fonts to render the training text with (one image set per entry).
 FONTS=(
    "Arial" \
    "Times New Roman," \
 )
 # Directory passed to text2image as --fonts_dir.
 FONTS_DIR="/usr/share/fonts/truetype/"
 # Directory that receives the final ${LANG_CODE}.traineddata file.
 OUTPUT_DIR="/tmp/tesstrain/tessdata"
 # Non-zero allows make__traineddata to replace an existing traineddata file.
 OVERWRITE=0
 # Non-zero enables the body of phase_S_cluster_shapes.
 RUN_SHAPE_CLUSTERING=0
 # Non-zero enables the ngram / font-property extraction in Phase I.
 EXTRACT_FONT_PROPERTIES=1
 # Parent directory of the per-language TRAINING_DIR set by parse_flags.
 WORKSPACE_DIR="/tmp/tesstrain"
 # Logging helper functions.
 # tlog: prints its arguments with "echo -e" (so "\n" escapes are interpreted)
 # and appends the same text to ${LOG_FILE} through tee.
 # Usage: tlog MESSAGE...
 tlog() {
    # "2>&1 1>&2" points both stdout and stderr of echo at the pipe into tee.
    # $* is unquoted, so runs of whitespace in the message are collapsed.
    echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE}
 }
 # err_exit: logs its arguments prefixed with "ERROR: " in the same way as
 # tlog, then terminates the current shell with exit status 1.
 # Usage: err_exit MESSAGE...
 err_exit() {
    echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE}
    exit 1
 }
 # Helper function to run a command and append its output to a log. Aborts early
 # if the program file is not found, and aborts (via err_exit) when the program
 # itself exits with a non-zero status.
 # Usage: run_command CMD ARG1 ARG2...
 run_command() {
    local cmd=$1
    local status
    shift
    if [[ ! -x ${cmd} ]]; then
        err_exit "File ${cmd} not found"
    fi
    tlog "[$(date)] ${cmd} $@"
    ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
    # $? after a pipeline is the status of its LAST command (tee), which
    # succeeds even when the program failed. PIPESTATUS[0] is the status of
    # the program; it must be read before any other command runs.
    status=${PIPESTATUS[0]}
    if [[ ${status} -ne 0 ]]; then
        err_exit "Program $(basename ${cmd}) failed. Abort."
    fi
 }
 # Check if all the given files exist, or exit otherwise.
 # Used to check required input files and produced output files in each phase.
 # Each argument is tested as one path, so a path passed as a single quoted
 # argument may contain spaces or glob characters.
 # Usage: check_file_readable FILE1 FILE2...
 check_file_readable() {
    # Keep the loop variable out of the caller's (global) scope.
    local file
    for file in "$@"; do
        if [[ ! -r ${file} ]]; then
            err_exit "${file} does not exist or is not readable"
        fi
    done
 }
 # Append to a file (with name specified in $2) the records that account for
 # n% (specified in $3) of the total weights of records in the input file
 # (input file name specified in $1). The input file should have one record
 # per line along with its weight separated by \t. The records should be
 # sorted in non-ascending order of frequency.
 # If $4 is true (non-empty and not "0") the first record is skipped.
 # Note that the output file is appended to, not truncated.
 # USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE [SKIP_FIRST]
 discard_tail() {
    local infile=$1
    local outfile=$2
    local pct=$3
    local skip_first=$4
    # Line number at which "tail -n +N" starts reading the input.
    local more_arg="1";
    # A plain [[ ${skip_first} ]] is true for ANY non-empty string, including
    # "0", so test for the false value explicitly.
    if [[ -n ${skip_first} && ${skip_first} != "0" ]]; then
        more_arg="2"
    fi
    # Total weight of all records; records whose text is a single space are
    # not counted.
    local sum=$(tail -n +${more_arg} ${infile} \
        | awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}')
    if [[ ${sum} == "" ]]; then sum=0
    fi
    local limit=$((${sum}*${pct}/100))
    # Emit records until the weight budget is used up.
    tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"}
        {if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \
            >> ${outfile}
 }
 # Derives the global *_EXE program paths from ${BINDIR}, which parse_flags
 # fills in from --bin_dir. Aborts when BINDIR is empty.
 set_prog_paths() {
    [[ -n ${BINDIR} ]] || err_exit "Need to specify location of program files"
    local tool_dir=${BINDIR}
    # Rendering and recognition tools.
    TEXT2IMAGE_EXE=${tool_dir}/text2image
    TESSERACT_EXE=${tool_dir}/tesseract
    # Unicharset and dictionary tools.
    UNICHARSET_EXTRACTOR_EXE=${tool_dir}/unicharset_extractor
    SET_UNICHARSET_PROPERTIES_EXE=${tool_dir}/set_unicharset_properties
    WORDLIST2DAWG_EXE=${tool_dir}/wordlist2dawg
    # Classifier training tools.
    CN_TRAINING_EXE=${tool_dir}/cntraining
    SHAPE_TRAINING_EXE=${tool_dir}/shapeclustering
    MF_TRAINING_EXE=${tool_dir}/mftraining
    # Final packaging.
    COMBINE_TESSDATA_EXE=${tool_dir}/combine_tessdata
 }
 # Sets the named variable to given value. Aborts if the value is missing or
 # if it looks like a flag (starts with "--").
 # The value is stored verbatim: quotes, "$" and backticks in it are kept as
 # literal characters and are never evaluated.
 # Usage: parse_value VAR_NAME VALUE
 parse_value() {
    local val="$2"
    if [[ -z $val ]]; then
        err_exit "Missing value for variable $1"
    fi
    if [[ ${val:0:2} == "--" ]]; then
        err_exit "Invalid value $val passed for variable $1"
    fi
    # printf -v assigns without re-parsing the value, unlike eval, which
    # would run any command substitution contained in a user-supplied value.
    printf -v "$1" '%s' "$val"
 }
 # Does simple command-line parsing and initialization.
 # Reads the flags from the global ARGV array, checks that the mandatory ones
 # (--lang, --bin_dir, --langdata_dir and a tessdata location) are present,
 # and derives the program paths, TRAINING_DIR, LOG_FILE and the input file
 # names used by the training phases.
 parse_flags() {
    local i=0
    while test $i -lt ${#ARGV[@]}; do
        # Index of the value that belongs to a value-taking flag.
        local j=$((i+1))
        case ${ARGV[$i]} in
            --)
                break;;
            --bin_dir)
                parse_value "BINDIR" "${ARGV[$j]}"
                i=$j ;;
            --fontlist)   # Expect a plus-separated list of names
                if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
                    err_exit "Invalid value passed to --fontlist"
                fi
                # Split on '+' only, so font names may contain spaces.
                local ofs=$IFS
                IFS='+'
                FONTS=( ${ARGV[$j]} )
                IFS=$ofs
                i=$j ;;
            --fonts_dir)
                parse_value "FONTS_DIR" "${ARGV[$j]}"
                i=$j ;;
            --lang)
                parse_value "LANG_CODE" "${ARGV[$j]}"
                i=$j ;;
            --langdata_dir)
                parse_value "LANGDATA_ROOT" "${ARGV[$j]}"
                i=$j ;;
            --output_dir)
                parse_value "OUTPUT_DIR" "${ARGV[$j]}"
                i=$j ;;
            --overwrite)
                OVERWRITE=1 ;;
            --extract_font_properties)
                EXTRACT_FONT_PROPERTIES=1 ;;
            --noextract_font_properties)
                EXTRACT_FONT_PROPERTIES=0 ;;
            --tessdata_dir)
                parse_value "TESSDATA_DIR" "${ARGV[$j]}"
                i=$j ;;
            --training_text)
                parse_value "TRAINING_TEXT" "${ARGV[$j]}"
                i=$j ;;
            --wordlist)
                parse_value "WORDLIST_FILE" "${ARGV[$j]}"
                i=$j ;;
            *)
                err_exit "Unrecognized argument ${ARGV[$i]}" ;;
        esac
        i=$((i+1))
    done
    if [[ -z ${LANG_CODE} ]]; then
        err_exit "Need to specify a language --lang"
    fi
    if [[ -z ${BINDIR} ]]; then
        err_exit "Need to specify path to built binaries --bin_dir"
    fi
    if [[ -z ${LANGDATA_ROOT} ]]; then
        err_exit "Need to specify path to language files --langdata_dir"
    fi
    # Fall back to the TESSDATA_PREFIX environment variable.
    if [[ -z ${TESSDATA_DIR} ]]; then
        if [[ -z ${TESSDATA_PREFIX} ]]; then
            err_exit "Need to specify a --tessdata_dir or have a "\
        "TESSDATA_PREFIX variable defined in your environment"
        else
            TESSDATA_DIR="${TESSDATA_PREFIX}"
        fi
    fi
    set_prog_paths
    # Location where intermediate files will be created.
    TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
    # Location of log file for the whole run.
    LOG_FILE=${TRAINING_DIR}/tesstrain.log
    # Take training text and wordlist from the langdata directory if not
    # specified in the command-line.
    if [[ -z ${TRAINING_TEXT} ]]; then
        TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text
    fi
    if [[ -z ${WORDLIST_FILE} ]]; then
        WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean
    fi
    WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean
    NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
    PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
    BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
    UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
    TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
    # phase_D_generate_dawg is skipped when this is 0.
    GENERATE_DAWGS=1
 }
 # Creates a fresh temporary directory, exports it as FONT_CONFIG_CACHE and
 # runs text2image once on a one-word sample with that directory passed as
 # --fontconfig_tmpdir.
 initialize_fontconfig() {
    FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
    export FONT_CONFIG_CACHE
    # The sample file is used both as the input text and as the output base.
    local probe_file=${FONT_CONFIG_CACHE}/sample_text.txt
    echo "Text" >${probe_file}
    run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \
        --font="Arial" --outputbase=${probe_file} --text=${probe_file} \
        --fontconfig_tmpdir=${FONT_CONFIG_CACHE}
 }
 # Helper function for phase_I_generate_image. Renders the training text with
 # one font (passed as $1) so that several fonts can be rendered in parallel,
 # and optionally extracts that font's properties from the ngram text.
 generate_font_image() {
    local font="$1"
    tlog "Rendering using ${font}"
    # File-name form of the font: spaces become underscores, commas are dropped.
    local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
    local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
    # Arguments shared by both text2image invocations below.
    local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}"
    common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
    common_args+=" --fontconfig_refresh_config_file=false --leading=${LEADING}"
    common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
    common_args+=" --outputbase=${outbase}"
    # Fonts listed in VERTICAL_FONTS are rendered top-to-bottom.
    for vfont in "${VERTICAL_FONTS[@]}"; do
      if [[ "${font}" != "${vfont}" ]]; then
        continue
      fi
      common_args+=" --writing_mode=vertical-upright "
      break
    done
    run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
        --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
    check_file_readable ${outbase}.box ${outbase}.tif
    # Second pass: font properties only, and only if an ngram file exists.
    if (( ${EXTRACT_FONT_PROPERTIES} )); then
        if [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
            tlog "Extracting font properties of ${font}"
            run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
                --ligatures=false --text=${TRAIN_NGRAMS_FILE} \
                --only_extract_font_properties --ptsize=32
            check_file_readable ${outbase}.fontinfo
        fi
    fi
 }
 # Phase I : Generate (I)mages from training text for each font.
 # $1 is the number of fonts to render concurrently; a missing or
 # non-positive value means one at a time.
 phase_I_generate_image() {
    local par_factor=$1
    if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
        par_factor=1
    fi
    tlog "\n=== Phase I: Generating training images ==="
    if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then
        err_exit "Could not find training text file ${TRAINING_TEXT}"
    fi
    CHAR_SPACING="0.0"
    EXPOSURE="0"
    if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
        # Build the .train_ngrams text from the .bigram_freqs file: NGRAM_FRAC
        # is 99% (p=99) of the summed bigram weights, and bigrams are written
        # out in descending weight order while the running total stays within
        # that budget.
        NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
            | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
        cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
            | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
            x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
        check_file_readable ${TRAIN_NGRAMS_FILE}
    fi
    # Start one background job per font, pausing after every par_factor jobs
    # until the whole batch has finished.
    local counter=0
    for font in "${FONTS[@]}"; do
        generate_font_image "${font}" &
        counter=$((counter + 1))
        rem=$((counter % par_factor))
        if (( rem == 0 )); then
          wait
        fi
    done
    wait
    # Check that each process was successful: every font must have left
    # behind its .box and .tif file.
    local fontname
    local outbase
    for font in "${FONTS[@]}"; do
        fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
        outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
        check_file_readable ${outbase}.box ${outbase}.tif
    done
 }
 # Phase UP : Generate (U)nicharset and (P)roperties file.
 # Extracts a unicharset from every .box file in ${TRAINING_DIR}, renames it to
 # the language-qualified name and then fills in the unichar properties.
 # Sets the globals UNICHARSET_FILE and XHEIGHTS_FILE.
 phase_UP_generate_unicharset() {
    tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
    # Language-qualified names of the two files this phase publishes.
    UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
    XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
    # Un-prefixed file that is expected in the -D directory after extraction.
    local raw_unicharset=${TRAINING_DIR}/unicharset
    local boxes=$(ls ${TRAINING_DIR}/*.box)
    run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${boxes}
    check_file_readable ${raw_unicharset}
    mv ${raw_unicharset} ${UNICHARSET_FILE}
    check_file_readable ${UNICHARSET_FILE}
    # The same file is passed as input (-U) and output (-O).
    run_command ${SET_UNICHARSET_PROPERTIES_EXE} \
        -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
        --script_dir=${LANGDATA_ROOT}
    check_file_readable ${XHEIGHTS_FILE}
 }
 # Phase D : Generate (D)awg files from unicharset file and wordlist files
 # Builds up to five DAWGs with ${WORDLIST2DAWG_EXE}: word, frequent-word,
 # punctuation, number and bigram. Each one is only built when its input list
 # is readable. The whole phase is skipped when GENERATE_DAWGS is 0.
 # Sets the globals WORD_DAWG, FREQ_DAWG, PUNC_DAWG, NUMBER_DAWG, BIGRAM_DAWG
 # and FREQ_DAWG_SIZE, and reads UNICHARSET_FILE.
 phase_D_generate_dawg() {
    tlog "\n=== Phase D: Generating Dawg files ==="
    # Skip if requested
    if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
      # phase_name is not assigned by this function; fall back to a fixed
      # label so the log line is never just "Skipping ".
      tlog "Skipping ${phase_name:-Phase D}"
      return
    fi
    # Output files
    WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
    FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
    PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
    NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
    BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
    # Word DAWG
    local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
    if [[ -r ${WORDLIST_FILE} ]]; then
        tlog "Generating word Dawg"
        check_file_readable ${UNICHARSET_FILE}
        run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
            ${UNICHARSET_FILE}
        check_file_readable ${WORD_DAWG}
        # The first FREQ_DAWG_SIZE lines of the wordlist become the input of
        # the frequent-word DAWG below.
        FREQ_DAWG_SIZE=100
        head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
    fi
    # Freq-word DAWG
    if [[ -r ${freq_wordlist_file} ]]; then
        check_file_readable ${UNICHARSET_FILE}
        tlog "Generating frequent-word Dawg"
        run_command ${WORDLIST2DAWG_EXE}  -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \
            ${UNICHARSET_FILE}
        check_file_readable ${FREQ_DAWG}
    fi
    # Punctuation DAWG
    # Defaults to the shared common.punc list; a readable language-specific
    # punctuation file replaces it with a trimmed ".punc.top" file.
    local punc_clean="${LANGDATA_ROOT}/common.punc"
    if [[ -r ${PUNC_FILE} ]]; then
        local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top
        # Seed the output with the first record, then let discard_tail append
        # the records that make up 99% of the remaining weight.
        head -n 1 ${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
            > ${top_punc_file}
        discard_tail ${PUNC_FILE} ${top_punc_file} 99 1
        punc_clean="${top_punc_file}"
    fi
    # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
    # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
    # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
    # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
    # 2/RRP_FORCE_REVERSE for the punctuation DAWG of the languages listed
    # below (every other language uses 0 for punctuation).
    local punc_reverse_policy=0;
    case ${LANG_CODE} in
      ara | div| fas | pus | snd | syr | uig | urd | heb | yid )
        punc_reverse_policy=2 ;;
      * ) ;;
    esac
    if [[ -r ${punc_clean} ]]; then
        run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
            ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE}
        check_file_readable ${PUNC_DAWG}
    fi
    # Numbers DAWG
    if [[ -r ${NUMBERS_FILE} ]]; then
        local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top
        # Same seeding scheme as for punctuation, keeping 85% of the weight.
        head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
            > ${top_num_file}
        discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1
        run_command ${WORDLIST2DAWG_EXE} -r 0 \
            ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE}
        check_file_readable ${NUMBER_DAWG}
    fi
    # Bigram dawg
    if [[ -r ${WORD_BIGRAMS_FILE} ]]; then
        run_command ${WORDLIST2DAWG_EXE} -r 1 \
            ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
        check_file_readable ${BIGRAM_DAWG}
    fi
 }
 # Phase E : (E)xtract .tr feature files from .tif/.box files
 # $1 is the tesseract config to run (e.g. "box.train"); $2 is the number of
 # tesseract processes to run concurrently (missing or non-positive means 1).
 phase_E_extract_features() {
    local box_config=$1
    local par_factor=$2
    if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
        par_factor=1
    fi
    tlog "\n=== Phase E: Extracting features ==="
    TRAIN_EXPOSURES='0'
    # Space-separated list of every rendered image for the chosen exposures.
    local img_files=""
    for exposure in ${TRAIN_EXPOSURES}; do
        img_files+=" $(ls ${TRAINING_DIR}/*.exp${exposure}.tif)"
    done
    # Use any available language-specific configs.
    local lang_config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
    local config=""
    if [[ -r ${lang_config} ]]; then
        config=${lang_config}
    fi
    # Point tesseract at the requested tessdata for the duration of the runs.
    OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
    export TESSDATA_PREFIX=${TESSDATA_DIR}
    tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
    # Start one background job per image, pausing after every par_factor jobs
    # until the whole batch has finished.
    local counter=0
    for img_file in ${img_files}; do
        run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
            ${box_config} ${config} &
        counter=$((counter + 1))
        rem=$((counter % par_factor))
        if (( rem == 0 )); then
            wait
        fi
    done
    wait
    export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
    # Check that all the output files were produced.
    for img_file in ${img_files}; do
        check_file_readable ${img_file%.*}.tr
    done
 }
 # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
 # $1 is the path the resulting normproto file is moved to.
 # Usage: phase_C_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
 phase_C_cluster_prototypes() {
    tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
    local target_normproto=$1
    local raw_normproto=${TRAINING_DIR}/normproto
    local tr_files=$(ls ${TRAINING_DIR}/*.tr)
    run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" ${tr_files}
    check_file_readable ${raw_normproto}
    mv ${raw_normproto} ${target_normproto}
 }
 # Phase S : (S)hape clustering
 # Optional phase: only logs a notice and returns unless RUN_SHAPE_CLUSTERING
 # is non-zero. Otherwise runs ${SHAPE_TRAINING_EXE} over every .tr file in
 # ${TRAINING_DIR} and checks that the shapetable and the .mfunicharset exist.
 phase_S_cluster_shapes() {
    if (( ! ${RUN_SHAPE_CLUSTERING} )); then
        tlog "\n=== Shape Clustering disabled ==="
        return
    fi
    local lang_base=${TRAINING_DIR}/${LANG_CODE}
    check_file_readable ${LANGDATA_ROOT}/font_properties
    local font_props="-F ${LANGDATA_ROOT}/font_properties"
    # Only pass -X when the xheights file is readable and non-empty.
    if [[ -r ${lang_base}.xheights && -s ${lang_base}.xheights ]]; then
        font_props+=" -X ${lang_base}.xheights"
    fi
    # ${font_props} stays unquoted so it splits into separate arguments.
    run_command ${SHAPE_TRAINING_EXE} \
        -D "${TRAINING_DIR}/" \
        -U ${lang_base}.unicharset \
        -O ${lang_base}.mfunicharset \
        ${font_props} \
        $(ls ${TRAINING_DIR}/*.tr)
    check_file_readable ${TRAINING_DIR}/shapetable \
        ${lang_base}.mfunicharset
 }
 # Phase M : Clustering microfeatures (mfTraining)
 # Runs ${MF_TRAINING_EXE} over every .tr file in ${TRAINING_DIR}, then gives
 # inttemp, shapetable and pffmtable a ${LANG_CODE}. prefix and replaces the
 # language unicharset with the .mfunicharset written by the tool.
 # Requires ${LANGDATA_ROOT}/font_properties to be readable.
 phase_M_cluster_microfeatures() {
    tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
    check_file_readable ${LANGDATA_ROOT}/font_properties
    # Declared local, as in phase_S_cluster_shapes, so the argument string
    # does not leak into the caller's global scope.
    local font_props="-F ${LANGDATA_ROOT}/font_properties"
    # Only pass -X when the xheights file is readable and non-empty.
    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
       [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
    fi
    # ${font_props} stays unquoted so it splits into separate arguments.
    run_command ${MF_TRAINING_EXE} \
        -D "${TRAINING_DIR}/" \
        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
        ${font_props} \
        $(ls ${TRAINING_DIR}/*.tr)
    check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
        ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
    mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
    mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
    mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
    mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
 }
 # Phase B : copies a manually created unicharambigs file from the langdata
 # directory into ${TRAINING_DIR}, when the language provides one.
 phase_B_generate_ambiguities() {
  tlog "\n=== Phase B : ambiguities training ==="
  local src_ambigs=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs
  local dst_ambigs=${TRAINING_DIR}/${LANG_CODE}.unicharambigs
  # TODO: Add support for generating ambiguities automatically.
  if [[ ! -r ${src_ambigs} ]]; then
      tlog "No unicharambigs file found!"
      return
  fi
  tlog "Found file ${src_ambigs}"
  cp ${src_ambigs} ${dst_ambigs}
  # Make it writable, as it may be read-only in the client.
  chmod u+w ${dst_ambigs}
 }
 # Assembles ${LANG_CODE}.traineddata from the files in ${TRAINING_DIR} (plus
 # any optional components found in the langdata directory) and copies it to
 # ${OUTPUT_DIR}. Aborts if the destination exists and OVERWRITE is 0.
 make__traineddata() {
  tlog "\n=== Making final traineddata file ==="
  local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
  # Combine available files for this language from the langdata dir. Each
  # copy is made user-writable after it has been copied.
  local component
  for component in config cube-unicharset cube-word-dawg params-model; do
    if [[ -r ${lang_prefix}.${component} ]]; then
      tlog "Copying ${lang_prefix}.${component} to ${TRAINING_DIR}"
      cp ${lang_prefix}.${component} ${TRAINING_DIR}
      chmod u+w ${TRAINING_DIR}/${LANG_CODE}.${component}
    fi
  done
  # Compose the traineddata file.
  run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
  # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
  if [[ ! -d ${OUTPUT_DIR} ]]; then
      tlog "Creating new directory ${OUTPUT_DIR}"
      mkdir -p ${OUTPUT_DIR}
  fi
  local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata
  if [[ -f ${destfile} ]]; then
      if (( ! ${OVERWRITE} )); then
          err_exit "File ${destfile} exists and no --overwrite specified"
      fi
  fi
  tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
  cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
 }
--- a/training/text2image.cpp
+++ b/training/text2image.cpp
@ -115,7 +115,7 @@ STRING_PARAM_FLAG(writing_mode, "horizontal",
 INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes");
-BOOL_PARAM_FLAG(strip_unrenderable_words, false,
+BOOL_PARAM_FLAG(strip_unrenderable_words, true,
                "Remove unrenderable words from source text");
 // Font name.
@ -618,9 +618,9 @@ int main(int argc, char** argv) {
        }
        pixDestroy(&binary);
      }
-      if (FLAGS_find_fonts && !FLAGS_render_per_font && !font_names.empty()) {
+      if (FLAGS_find_fonts && offset != 0) {
-        // We just want a list of names, so we don't need to render any more
+        // We just want a list of names, or some sample images so we don't need
-        // of the text.
+        // to render more than the first page of the text.
        break;
      }
    }
@ -630,8 +630,7 @@ int main(int argc, char** argv) {
    box_name += ".box";
    render.WriteAllBoxes(box_name);
  } else if (!FLAGS_render_per_font && !font_names.empty()) {
-    string filename = FLAGS_outputbase.c_str();
+    string filename = FLAGS_outputbase + ".fontlist.txt";
    filename += ".fontlist.txt";
    FILE* fp = fopen(filename.c_str(), "wb");
    if (fp == NULL) {
      tprintf("Failed to create output font list %s\n", filename.c_str());
--- a/training/unicharset_training_utils.cpp
+++ b/training/unicharset_training_utils.cpp
@ -0,0 +1,193 @@
 ///////////////////////////////////////////////////////////////////////
 // File:        unicharset_training_utils.cpp
 // Description: Training utilities for UNICHARSET.
 // Author:      Ray Smith
 // Created:     Fri Oct 17 17:09:01 PDT 2014
 //
 // (C) Copyright 2014, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////
 #include "unicharset_training_utils.h"
 #include <stdlib.h>
 #include <string.h>
 #include <string>
 #include "fileio.h"
 #include "genericvector.h"
 #include "icuerrorcode.h"
 #include "normstrngs.h"
 #include "statistc.h"
 #include "strngs.h"
 #include "unicharset.h"
 #include "unicode/uchar.h"    // from libicu
 #include "unicode/uscript.h"  // from libicu
 namespace tesseract {
 // Helper sets the character attribute properties and sets up the script table.
 // Does not set tops and bottoms.
 void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }
    // Convert the unichar to UTF32 representation
    GenericVector<char32> uni_vector;
    tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;
    for (int i = 0; i < uni_vector.size(); ++i) {
      if (u_isalpha(uni_vector[i]))
        unichar_isalpha = true;
      if (u_islower(uni_vector[i]))
        unichar_islower = true;
      if (u_isupper(uni_vector[i]))
        unichar_isupper = true;
      if (u_isdigit(uni_vector[i]))
        unichar_isdigit = true;
      if (u_ispunct(uni_vector[i]))
        unichar_ispunct = true;
    }
    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));
    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      GenericVector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      STRING other_case_uch;
      tesseract::UTF32ToUTF8(other_case, &other_case_uch);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }
    // Set RTL property and obtain mirror unichar ID from ICU.
    GenericVector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    STRING mirror_uch;
    tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else if (report_errors) {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }
    // Record normalized version of this unichar.
    STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
    if (unichar_id != 0 && normed_str.length() > 0) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
  }
  unicharset->post_load_setup();
 }
 // Helper to set the properties for an input unicharset file, writes to the
 // output file. If an appropriate script unicharset can be found in the
 // script_dir directory, then the tops and bottoms are expanded using the
 // script unicharset.
 // If non-empty, xheight data for the fonts are written to the xheights_file.
 void SetPropertiesForInputFile(const string& script_dir,
                               const string& input_unicharset_file,
                               const string& output_unicharset_file,
                               const string& output_xheights_file) {
  UNICHARSET unicharset;
  // Load the input unicharset
  unicharset.load_from_file(input_unicharset_file.c_str());
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());
  // Set unichar properties
  tprintf("Setting unichar properties\n");
  SetupBasicProperties(true, &unicharset);
  string xheights_str;
  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
    // Load the unicharset for the script if available.
    string filename = script_dir + "/" +
        unicharset.get_script_from_script_id(s) + ".unicharset";
    UNICHARSET script_set;
    if (script_set.load_from_file(filename.c_str())) {
      unicharset.SetPropertiesFromOther(script_set);
    }
    // Load the xheights for the script if available.
    filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
        ".xheights";
    string script_heights;
    if (File::ReadFileToString(filename, &script_heights))
      xheights_str += script_heights;
  }
  if (!output_xheights_file.empty())
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
    if (unicharset.PropertiesIncomplete(c)) {
      tprintf("Warning: properties incomplete for index %d = %s\n",
              c, unicharset.id_to_unichar(c));
    }
  }
  // Write the output unicharset
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  unicharset.save_to_file(output_unicharset_file.c_str());
 }
 }  // namespace tesseract
--- a/training/unicharset_training_utils.h
+++ b/training/unicharset_training_utils.h
@ -0,0 +1,50 @@
 ///////////////////////////////////////////////////////////////////////
 // File:        unicharset_training_utils.h
 // Description: Training utilities for UNICHARSET.
 // Author:      Ray Smith
 // Created:     Fri Oct 17 17:14:01 PDT 2014
 //
 // (C) Copyright 2014, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////
 #ifndef TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_
 #define TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_
 #include <string>
 #ifdef USE_STD_NAMESPACE
 using std::string;
 #endif
 class STATS;
 class UNICHARSET;
 namespace tesseract {
 // Helper sets the character attribute properties and sets up the script table.
 // Does not set tops and bottoms.
 // For every unichar in *unicharset this sets the alpha/lower/upper/digit/
 // punctuation flags, script, other case, direction, mirror and normalized
 // form, and then calls post_load_setup() on the unicharset.
 // If report_errors is true, other-case and mirror strings that are not in
 // the unicharset are reported with tprintf.
 void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset);
 // Helper to set the properties for an input unicharset file, writes to the
 // output file. If an appropriate script unicharset can be found in the
 // script_dir directory, then the tops and bottoms are expanded using the
 // script unicharset.
 // script_dir is searched for <script>.unicharset and <script>.xheights files
 // for each script in the loaded unicharset's script table.
 // If output_xheights_file is non-empty, the concatenated contents of the
 // <script>.xheights files that could be read are written to it.
 void SetPropertiesForInputFile(const string& script_dir,
                               const string& input_unicharset_file,
                               const string& output_unicharset_file,
                               const string& output_xheights_file);
 }  // namespace tesseract.
 #endif  // TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_