Major updates to training system as a result of extensive testing on 100 languages

2025-06-07 09:52:40 +08:00 · 2015-05-12 18:04:31 -07:00 · 2015-05-12 18:04:31 -07:00 · 6be25156f7
commit 6be25156f7
parent 21805e63a4
11 changed files with 2103 additions and 731 deletions
--- a/training/language-specific.sh
+++ b/training/language-specific.sh
--- a/training/ligature_table.cpp
+++ b/training/ligature_table.cpp
@ -43,7 +43,7 @@ static string EncodeAsUTF8(const char32 ch32) {
 // from. Note that this range does not contain the custom ligatures that we
 // encode in the private use area.
 const int kMinLigature = 0xfb00;
-const int kMaxLigature = 0xfb4f;
+const int kMaxLigature = 0xfb17;  // Don't put the wide Hebrew letters in.

 /* static */
 SmartPtr<LigatureTable> LigatureTable::instance_;
--- a/training/pango_font_info.cpp
+++ b/training/pango_font_info.cpp
@ -51,6 +51,12 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
 BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
                "Does a one-time deletion of cache files from the "
                "fontconfig_tmpdir before initializing fontconfig.");
+BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true,
+                "Does a one-time reset of the fontconfig config file to point"
+                " to fonts_dir before initializing fontconfig. Set to true"
+                " if fontconfig_refresh_cache is true. Set it to false to use"
+                " multiple instances in separate processes without having to"
+                " rescan the fonts_dir, using a previously setup font cache");

 #ifndef USE_STD_NAMESPACE
 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
@ -67,6 +73,8 @@ namespace tesseract {
 // in pixels.
 const int kDefaultResolution = 300;

+bool PangoFontInfo::fontconfig_initialized_ = false;
+
 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
  Clear();
 }
@ -103,34 +111,35 @@ string PangoFontInfo::DescriptionName() const {

 // Initializes Fontconfig for use by writing a fake fonts.conf file into the
 // FLAGS_fontconfig_tmpdir directory, that points to the supplied
-// FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
-// to point to this fonts.conf file.
-static void InitFontconfig() {
-  static bool init_fontconfig = false;
-  if (init_fontconfig || FLAGS_fonts_dir.empty()) {
-    init_fontconfig = true;
+// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
+// to point to this fonts.conf file. If force_clear, the cache is refreshed
+// even if it has already been initialized.
+void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) {
+  if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
+    fontconfig_initialized_ = true;
    return;
  }
-  if (FLAGS_fontconfig_refresh_cache) {
-    tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str());
+  if (FLAGS_fontconfig_refresh_cache || force_clear) {
    File::DeleteMatchingFiles(File::JoinPath(
-        FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str());
+        FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str());
+  }
+  if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
+      force_clear) {
+    const int MAX_FONTCONF_FILESIZE = 1024;
+    char fonts_conf_template[MAX_FONTCONF_FILESIZE];
+    snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
+             "<?xml version=\"1.0\"?>\n"
+             "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
+             "<fontconfig>\n"
+             "<dir>%s</dir>\n"
+             "<cachedir>%s</cachedir>\n"
+             "<config></config>\n"
+             "</fontconfig>", fonts_dir.c_str(),
+             FLAGS_fontconfig_tmpdir.c_str());
+    string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
+                                            "fonts.conf");
+    File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
  }
-  tprintf("Initializing fontconfig\n");
-  const int MAX_FONTCONF_FILESIZE = 1024;
-  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
-  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
-           "<?xml version=\"1.0\"?>\n"
-           "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
-           "<fontconfig>\n"
-           "<dir>%s</dir>\n"
-           "<cachedir>%s</cachedir>\n"
-           "<config></config>\n"
-           "</fontconfig>", FLAGS_fonts_dir.c_str(),
-           FLAGS_fontconfig_tmpdir.c_str());
-  string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
-                                          "fonts.conf");
-  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
 #ifdef _WIN32
  std::string env("FONTCONFIG_PATH=");
  env.append(FLAGS_fontconfig_tmpdir.c_str());
@ -141,12 +150,18 @@ static void InitFontconfig() {
  // Fix the locale so that the reported font names are consistent.
  setenv("LANG", "en_US.utf8", true);
 #endif  // _WIN32
-  init_fontconfig = true;
+  if (!fontconfig_initialized_ || force_clear) {
+    if (FcInitReinitialize() != FcTrue) {
+      tprintf("FcInitiReinitialize failed!!\n");
+    }
+  }
+  fontconfig_initialized_ = true;
+  FontUtils::ReInit();
 }

 static void ListFontFamilies(PangoFontFamily*** families,
                             int* n_families) {
-  InitFontconfig();
+  PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
  PangoFontMap* font_map = pango_cairo_font_map_get_default();
  DISABLE_HEAP_LEAK_CHECK;
  pango_font_map_list_families(font_map, families, n_families);
@ -220,7 +235,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
 // in the font map. Note that if the font is wholly missing, this could
 // correspond to a completely different font family and face.
 PangoFont* PangoFontInfo::ToPangoFont() const {
-  InitFontconfig();
+  InitFontConfig(false, FLAGS_fonts_dir);
  PangoFontMap* font_map = pango_cairo_font_map_get_default();
  PangoContext* context = pango_context_new();
  pango_cairo_context_set_resolution(context, resolution_);
@ -253,6 +268,28 @@ bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const
  return true;
 }

+// This variant of strncpy permits src and dest to overlap as long as dest
+// does not come after src: bytes are copied front to back. Requires n > 0.
+static char* my_strnmove(char* dest, const char* src, size_t n) {
+  char* ret = dest;
+
+  // Copy characters until n reaches zero or the src byte is a nul.
+  do {
+    *dest = *src;
+    --n;
+    ++dest;
+    ++src;
+  } while (n && src[0]);
+
+  // If we reached a nul byte and there are more 'n' left, zero them out.
+  while (n) {
+    *dest = '\0';
+    --n;
+    ++dest;
+  }
+  return ret;
+}
+
 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
  PangoFont* font = ToPangoFont();
  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
@ -265,23 +302,30 @@ int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
      UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
  const UNICHAR::const_iterator it_end =
      UNICHAR::end(utf8_text->c_str(), utf8_text->length());
-  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
+  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
    // Skip bad utf-8.
-    if (!it.is_legal())
-      continue;  // One suitable error message will still be issued.
-    if (!IsWhitespace(*it) && !pango_is_zero_width(*it) &&
-        pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
+    if (!it.is_legal()) {
+      ++it;  // One suitable error message will still be issued.
+      continue;
+    }
+    int unicode = *it;
+    int utf8_len = it.utf8_len();
+    const char* utf8_char = it.utf8_data();
+    // Move it forward before the data gets modified.
+    ++it;
+    if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
+        pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
      if (TLOG_IS_ON(2)) {
-        char tmp[5];
-        int len = it.get_utf8(tmp);
-        tmp[len] = '\0';
-        tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
+        UNICHAR unichar(unicode);
+        char* str = unichar.utf8_str();
+        tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
+        delete[] str;
      }
      ++num_dropped_chars;
      continue;
    }
-    strncpy(out, it.utf8_data(), it.utf8_len());
-    out += it.utf8_len();
+    my_strnmove(out, utf8_char, utf8_len);
+    out += utf8_len;
  }
  utf8_text->resize(out - utf8_text->c_str());
  return num_dropped_chars;
@ -438,6 +482,7 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,


 // ------------------------ FontUtils ------------------------------------
+vector<string> FontUtils::available_fonts_;  // cache list

 // Returns whether the specified font description is available in the fonts
 // directory.
@ -449,7 +494,8 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
 // from the font_map, and then check what we loaded to see if it has the
 // description we expected. If it is not, then the font is deemed unavailable.
 /* static */
-bool FontUtils::IsAvailableFont(const char* input_query_desc) {
+bool FontUtils::IsAvailableFont(const char* input_query_desc,
+                                string* best_match) {
  string query_desc(input_query_desc);
  if (PANGO_VERSION <= 12005) {
    // Strip commas and any ' Medium' substring in the name.
@ -466,7 +512,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
      query_desc.c_str());
  PangoFont* selected_font = NULL;
  {
-    InitFontconfig();
+    PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir);
    PangoFontMap* font_map = pango_cairo_font_map_get_default();
    PangoContext* context = pango_context_new();
    pango_context_set_font_map(context, font_map);
@ -490,7 +536,16 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) {
  char* selected_desc_str = pango_font_description_to_string(selected_desc);
  tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc.c_str(),
       selected_desc_str);
-
+  if (!equal && best_match != NULL) {
+    *best_match = selected_desc_str;
+    // Clip the ending ' 0' if there is one. It seems that, if there is no
+    // point size on the end of the fontname, then Pango always appends ' 0'.
+    int len = best_match->size();
+    if (len > 2 && best_match->at(len - 1) == '0' &&
+        best_match->at(len - 2) == ' ') {
+      *best_match = best_match->substr(0, len - 2);
+    }
+  }
  g_free(selected_desc_str);
  pango_font_description_free(selected_desc);
  g_object_unref(selected_font);
@ -512,7 +567,6 @@ static bool ShouldIgnoreFontFamilyName(const char* query) {
 // Outputs description names of available fonts.
 /* static */
 const vector<string>& FontUtils::ListAvailableFonts() {
-  static vector<string> available_fonts_;  // cache list
  if (available_fonts_.size()) {
    return available_fonts_;
  }
@ -536,8 +590,9 @@ const vector<string>& FontUtils::ListAvailableFonts() {
  for (int i = 0; i < n_families; ++i) {
    const char* family_name = pango_font_family_get_name(families[i]);
    tlog(2, "Listing family %s\n", family_name);
-    if (ShouldIgnoreFontFamilyName(family_name))
+    if (ShouldIgnoreFontFamilyName(family_name)) {
      continue;
+    }

    int n_faces;
    PangoFontFace** faces = NULL;
@ -733,4 +788,8 @@ bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
  return false;
 }

+// PangoFontInfo is reinitialized, so clear the static list of fonts.
+/* static */
+void FontUtils::ReInit() { available_fonts_.clear(); }
+
 }  // namespace tesseract
--- a/training/pango_font_info.h
+++ b/training/pango_font_info.h
@ -83,6 +83,11 @@ class PangoFontInfo {
  bool GetSpacingProperties(const string& utf8_char,
                            int* x_bearing, int* x_advance) const;

+  // Initializes FontConfig by setting its environment variable and creating
+  // a fonts.conf file that points to the given fonts_dir. Once initialized,
+  // it is not re-initialized unless force_clear is true.
+  static void InitFontConfig(bool force_clear, const string& fonts_dir);
+
  // Accessors
  string DescriptionName() const;
  // Font Family name eg. "Arial"
@ -123,6 +128,10 @@ class PangoFontInfo {
  // Default output resolution to assume for GetSpacingProperties() and any
  // other methods that returns pixel values.
  int resolution_;
+  // Fontconfig operates through an environment variable, so it intrinsically
+  // cannot be thread-friendly, but you can serialize multiple independent
+  // font configurations by calling InitFontConfig(true, path).
+  static bool fontconfig_initialized_;

 private:
  PangoFontInfo(const PangoFontInfo&);
@ -135,7 +144,13 @@ class FontUtils {
 public:
  // Returns true if the font of the given description name is available in the
  // target directory specified by --fonts_dir
-  static bool IsAvailableFont(const char* font_desc);
+  static bool IsAvailableFont(const char* font_desc) {
+    return IsAvailableFont(font_desc, NULL);
+  }
+  // Returns true if the font of the given description name is available in the
+  // target directory specified by --fonts_dir. If false is returned, and
+  // best_match is not NULL, the closest matching font is returned there.
+  static bool IsAvailableFont(const char* font_desc, string* best_match);
  // Outputs description names of available fonts.
  static const vector<string>& ListAvailableFonts();

@ -181,6 +196,12 @@ class FontUtils {
  static int FontScore(const unordered_map<char32, inT64>& ch_map,
                       const string& fontname, int* raw_score,
                       vector<bool>* ch_flags);
+
+  // PangoFontInfo is reinitialized, so clear the static list of fonts.
+  static void ReInit();
+
+ private:
+  static vector<string> available_fonts_;  // cache list
 };
 }  // namespace tesseract

--- a/training/set_unicharset_properties.cpp
+++ b/training/set_unicharset_properties.cpp
@ -7,14 +7,8 @@
 #include <string>

 #include "commandlineflags.h"
-#include "fileio.h"
-#include "genericvector.h"
-#include "icuerrorcode.h"
-#include "normstrngs.h"
-#include "strngs.h"
-#include "unicharset.h"
-#include "unicode/uchar.h"    // from libicu
-#include "unicode/uscript.h"  // from libicu
+#include "tprintf.h"
+#include "unicharset_training_utils.h"

 // The directory that is searched for universal script unicharsets.
 STRING_PARAM_FLAG(script_dir, "",
@ -25,157 +19,6 @@ DECLARE_STRING_PARAM_FLAG(U);
 DECLARE_STRING_PARAM_FLAG(O);
 DECLARE_STRING_PARAM_FLAG(X);

-namespace tesseract {
-
-// Helper sets the character attribute properties and sets up the script table.
-// Does not set tops and bottoms.
-static void SetupBasicProperties(UNICHARSET* unicharset) {
-  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
-    // Convert any custom ligatures.
-    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
-    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
-      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
-        unichar_str = UNICHARSET::kCustomLigatures[i][0];
-        break;
-      }
-    }
-
-    // Convert the unichar to UTF32 representation
-    GenericVector<char32> uni_vector;
-    tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
-
-    // Assume that if the property is true for any character in the string,
-    // then it holds for the whole "character".
-    bool unichar_isalpha = false;
-    bool unichar_islower = false;
-    bool unichar_isupper = false;
-    bool unichar_isdigit = false;
-    bool unichar_ispunct = false;
-
-    for (int i = 0; i < uni_vector.size(); ++i) {
-      if (u_isalpha(uni_vector[i]))
-        unichar_isalpha = true;
-      if (u_islower(uni_vector[i]))
-        unichar_islower = true;
-      if (u_isupper(uni_vector[i]))
-        unichar_isupper = true;
-      if (u_isdigit(uni_vector[i]))
-        unichar_isdigit = true;
-      if (u_ispunct(uni_vector[i]))
-        unichar_ispunct = true;
-    }
-
-    unicharset->set_isalpha(unichar_id, unichar_isalpha);
-    unicharset->set_islower(unichar_id, unichar_islower);
-    unicharset->set_isupper(unichar_id, unichar_isupper);
-    unicharset->set_isdigit(unichar_id, unichar_isdigit);
-    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
-
-    tesseract::IcuErrorCode err;
-    unicharset->set_script(unichar_id, uscript_getName(
-        uscript_getScript(uni_vector[0], err)));
-
-    const int num_code_points = uni_vector.size();
-    // Obtain the lower/upper case if needed and record it in the properties.
-    unicharset->set_other_case(unichar_id, unichar_id);
-    if (unichar_islower || unichar_isupper) {
-      GenericVector<char32> other_case(num_code_points, 0);
-      for (int i = 0; i < num_code_points; ++i) {
-        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
-        // However since they deal with UChars (so need a conversion function
-        // from char32 or UTF8string) and require a meaningful locale string,
-        // for now u_tolower()/u_toupper() are used.
-        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
-          u_tolower(uni_vector[i]);
-      }
-      STRING other_case_uch;
-      tesseract::UTF32ToUTF8(other_case, &other_case_uch);
-      UNICHAR_ID other_case_id =
-          unicharset->unichar_to_id(other_case_uch.c_str());
-      if (other_case_id != INVALID_UNICHAR_ID) {
-        unicharset->set_other_case(unichar_id, other_case_id);
-      } else {
-        tprintf("Other case %s of %s is not in unicharset\n",
-                other_case_uch.c_str(), unichar_str);
-      }
-    }
-
-    // Set RTL property and obtain mirror unichar ID from ICU.
-    GenericVector<char32> mirrors(num_code_points, 0);
-    for (int i = 0; i < num_code_points; ++i) {
-      mirrors[i] = u_charMirror(uni_vector[i]);
-      if (i == 0) {  // set directionality to that of the 1st code point
-        unicharset->set_direction(unichar_id,
-                                  static_cast<UNICHARSET::Direction>(
-                                      u_charDirection(uni_vector[i])));
-      }
-    }
-    STRING mirror_uch;
-    tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
-    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
-    if (mirror_uch_id != INVALID_UNICHAR_ID) {
-      unicharset->set_mirror(unichar_id, mirror_uch_id);
-    } else {
-      tprintf("Mirror %s of %s is not in unicharset\n",
-              mirror_uch.c_str(), unichar_str);
-    }
-
-    // Record normalized version of this unichar.
-    STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
-    if (unichar_id != 0 && normed_str.length() > 0) {
-      unicharset->set_normed(unichar_id, normed_str.c_str());
-    } else {
-      unicharset->set_normed(unichar_id, unichar_str);
-    }
-  }
-  unicharset->post_load_setup();
-}
-
-// Helper to set the properties for an input unicharset file, writes to the
-// output file. If an appropriate script unicharset can be found in the
-// script_dir directory, then the tops and bottoms are expanded using the
-// script unicharset.
-// If non-empty, xheight data for the fonts are written to the xheights_file.
-static void SetPropertiesForInputFile(const string& script_dir,
-                                      const string& input_unicharset_file,
-                                      const string& output_unicharset_file,
-                                      const string& output_xheights_file) {
-  UNICHARSET unicharset;
-
-  // Load the input unicharset
-  unicharset.load_from_file(input_unicharset_file.c_str());
-  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
-          input_unicharset_file.c_str());
-
-  // Set unichar properties
-  tprintf("Setting unichar properties\n");
-  SetupBasicProperties(&unicharset);
-  string xheights_str;
-  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
-    // Load the unicharset for the script if available.
-    string filename = script_dir + "/" +
-        unicharset.get_script_from_script_id(s) + ".unicharset";
-    UNICHARSET script_set;
-    if (script_set.load_from_file(filename.c_str())) {
-      unicharset.SetPropertiesFromOther(script_set);
-    }
-    // Load the xheights for the script if available.
-    filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
-        ".xheights";
-    string script_heights;
-    if (File::ReadFileToString(filename, &script_heights))
-      xheights_str += script_heights;
-  }
-  if (!output_xheights_file.empty())
-    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
-
-  // Write the output unicharset
-  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
-  unicharset.save_to_file(output_unicharset_file.c_str());
-}
-}  // namespace tesseract
-
-
 int main(int argc, char** argv) {
  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);

--- a/training/stringrenderer.cpp
+++ b/training/stringrenderer.cpp
@ -819,6 +819,7 @@ int StringRenderer::RenderToImage(const char* text, int text_length,
 int StringRenderer::RenderAllFontsToImage(double min_coverage,
                                          const char* text, int text_length,
                                          string* font_used, Pix** image) {
+  *image = NULL;
  // Select a suitable font to render the title with.
  const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
  string title_font;
@ -882,10 +883,9 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage,
              all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
    }
  }
-  *image = NULL;
  font_index_ = 0;
  char_map_.clear();
-  return last_offset_;
+  return last_offset_ == 0 ? -1 : last_offset_;
 }

 }  // namespace tesseract
--- a/training/tesstrain.sh
+++ b/training/tesstrain.sh
@ -44,516 +44,7 @@
 # appropriate --fonts_dir path.


-FONTS=(
-    "Arial" \
-    "Times New Roman," \
-)
-FONTS_DIR="/usr/share/fonts/truetype/"
-OUTPUT_DIR="/tmp/tesstrain/tessdata"
-OVERWRITE=0
-RUN_SHAPE_CLUSTERING=0
-EXTRACT_FONT_PROPERTIES=1
-WORKSPACE_DIR="/tmp/tesstrain"
-
-
-# Logging helper functions.
-tlog() {
-    echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE}
-}
-
-err() {
-    echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE}
-    exit 1
-}
-
-# Helper function to run a command and append its output to a log. Aborts early
-# if the program file is not found.
-# Usage: run_cmd CMD ARG1 ARG2...
-run_cmd() {
-    local cmd=$1
-    shift
-    if [[ ! -x ${cmd} ]]; then
-        err "File ${cmd} not found"
-    fi
-    tlog "[$(date)] ${cmd} $@"
-    ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
-    # check completion status
-    if [[ $? -gt 0 ]]; then
-        err "Program $(basename ${cmd}) failed. Abort."
-    fi
-}
-
-# Check if all the given files exist, or exit otherwise.
-# Used to check required input files and produced output files in each phase.
-# Usage: check_file_readable FILE1 FILE2...
-check_file_readable() {
-    for file in $@; do
-        if [[ ! -r ${file} ]]; then
-            err "${file} does not exist or is not readable"
-        fi
-    done
-}
-
-
-# Write a file (with name specified in $2) with records that account for
-# n% (specified in $3) of the total weights of records in the input file
-# (input file name specified in $1). The input file should have one record
-# per line along with its weight separated by \t. The records should be
-# sorted in non-ascending order of frequency.
-# If $4 is non-empty the first record is skipped.
-# USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE [SKIP_FIRST]
-discard_tail() {
-    local infile=$1
-    local outfile=$2
-    local pct=$3
-    local skip_first=$4
-
-    local more_arg="1";
-    if [[ ${skip_first} ]]; then
-        more_arg="2"
-    fi
-    local sum=$(tail -n +${more_arg} ${infile} \
-        | awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}')
-    if [[ ${sum} == "" ]]; then sum=0
-    fi
-    local limit=$((${sum}*${pct}/100))
-    tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"}
-        {if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \
-            >> ${outfile}
-}
-
-
-# Set global path variables that are based on parsed flags.
-set_prog_paths() {
-    if [[ -z ${BINDIR} ]]; then
-        err "Need to specify location of program files"
-    fi
-    CN_TRAINING_EXE=${BINDIR}/cntraining
-    COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
-    MF_TRAINING_EXE=${BINDIR}/mftraining
-    SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
-    SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
-    TESSERACT_EXE=${BINDIR}/tesseract
-    TEXT2IMAGE_EXE=${BINDIR}/text2image
-    UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
-    WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
-}
-
-# Sets the named variable to given value. Aborts if the value is missing or
-# if it looks like a flag.
-# Usage: parse_value VAR_NAME VALUE
-parse_value() {
-    local val="$2"
-    if [[ -z $val ]]; then
-        err "Missing value for variable $1"
-        exit
-    fi
-    if [[ ${val:0:2} == "--" ]]; then
-        err "Invalid value $val passed for variable $1"
-        exit
-    fi
-    eval $1=\"$val\"
-}
-
-# Does simple command-line parsing and initialization.
-parse_flags() {
-    local i=0
-    while test $i -lt ${#ARGV[@]}; do
-        local j=$((i+1))
-        case ${ARGV[$i]} in
-            --)
-                break;;
-            --bin_dir)
-                parse_value "BINDIR" ${ARGV[$j]}
-                i=$j ;;
-            --fontlist)   # Expect a plus-separated list of names
-                if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
-                    err "Invalid value passed to --fontlist"
-                fi
-                local ofs=$IFS
-                IFS='+'
-                FONTS=( ${ARGV[$j]} )
-                IFS=$ofs
-                i=$j ;;
-            --fonts_dir)
-                parse_value "FONTS_DIR" ${ARGV[$j]}
-                i=$j ;;
-            --lang)
-                parse_value "LANG_CODE" ${ARGV[$j]}
-                i=$j ;;
-            --langdata_dir)
-                parse_value "LANGDATA_ROOT" ${ARGV[$j]}
-                i=$j ;;
-            --output_dir)
-                parse_value "OUTPUT_DIR" ${ARGV[$j]}
-                i=$j ;;
-            --overwrite)
-                OVERWRITE=1 ;;
-            --extract_font_properties)
-                EXTRACT_FONT_PROPERTIES=1 ;;
-            --noextract_font_properties)
-                EXTRACT_FONT_PROPERTIES=0 ;;
-            --run_shape_clustering)
-                RUN_SHAPE_CLUSTERING=1 ;;
-            --tessdata_dir)
-                parse_value "TESSDATA_DIR" ${ARGV[$j]}
-                i=$j ;;
-            --training_text)
-                parse_value "TRAINING_TEXT" "${ARGV[$j]}"
-                i=$j ;;
-            --wordlist)
-                parse_value "WORDLIST_FILE" ${ARGV[$j]}
-                i=$j ;;
-            *)
-                err "Unrecognized argument ${ARGV[$i]}" ;;
-        esac
-        i=$((i+1))
-    done
-    if [[ -z ${LANG_CODE} ]]; then
-        err "Need to specify a language --lang"
-    fi
-    if [[ -z ${BINDIR} ]]; then
-        err "Need to specify path to built binaries --bin_dir"
-    fi
-    if [[ -z ${LANGDATA_ROOT} ]]; then
-        err "Need to specify path to language files --langdata_dir"
-    fi
-    if [[ -z ${TESSDATA_DIR} ]]; then
-        if [[ -z ${TESSDATA_PREFIX} ]]; then
-            err "Need to specify a --tessdata_dir or have a "\
-        "TESSDATA_PREFIX variable defined in your environment"
-        else
-            TESSDATA_DIR="${TESSDATA_PREFIX}"
-        fi
-    fi
-
-    set_prog_paths
-
-    # Location where intermediate files will be created.
-    TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
-    # Location of log file for the whole run.
-    LOG_FILE=${TRAINING_DIR}/tesstrain.log
-
-    # Take training text and wordlist from the langdata directory if not
-    # specified on the command line.
-    if [[ -z ${TRAINING_TEXT} ]]; then
-        TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text
-    fi
-    if [[ -z ${WORDLIST_FILE} ]]; then
-        WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean
-    fi
-    WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean
-    NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
-    PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
-    BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
-    UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
-    TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
-}
-
-# Phase I : Generate (I)mages from training text for each font.
-phaseI_generate_image() {
-    tlog "\n=== Phase I: Generating training images ==="
-    if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then
-        err "Could not find training text file ${TRAINING_TEXT}"
-    fi
-    BOX_PADDING="0"
-    CHAR_SPACING="0.0"
-    EXPOSURE="0"
-    LEADING="32"
-    NGRAM_CHAR_SPACING="0.0"
-
-    if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS} ]]; then
-        # Parse .bigram_freqs file and compose a .train_ngrams file with text
-        # for tesseract to recognize during training. Take only the ngrams whose
-        # combined weight accounts for 99% of all the bigrams in the language.
-        TMP_FILE="${TRAINING_DIR}/_tmp"
-        cat ${BIGRAM_FREQS_FILE} > ${TMP_FILE}
-        NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
-            | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
-        cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
-            | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
-            x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
-        check_file_readable ${TRAIN_NGRAMS_FILE}
-    fi
-
-    for font in "${FONTS[@]}"; do
-        tlog "Rendering using ${font}"
-        fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
-        outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
-
-        common_args="--leading=${LEADING} --fonts_dir=${FONTS_DIR} "
-        common_args+=" --box_padding=${BOX_PADDING} --strip_unrenderable_words"
-
-        run_cmd ${TEXT2IMAGE_EXE} ${common_args} \
-            --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE} \
-            --font="${font}" --outputbase=${outbase} --text=${TRAINING_TEXT}
-        check_file_readable ${outbase}.box ${outbase}.tif
-
-        if (( ${EXTRACT_FONT_PROPERTIES} )) &&
-            [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
-            tlog "Rendering ngrams using ${font}"
-            outbase=${TRAINING_DIR}/ngrams/${LANG_CODE}.ngrams.${fontname}.exp${EXPOSURE}
-            run_cmd ${TEXT2IMAGE_EXE} ${common_args} \
-                --char_spacing=${NGRAM_CHAR_SPACING} --exposure=${EXPOSURE} \
-                --font="${font}" --outputbase=${outbase} \
-                --box_padding=${BOX_PADDING} --render_ngrams=1 \
-                --text=${TRAIN_NGRAMS_FILE}
-            check_file_readable ${outbase}.box ${outbase}.tif
-        fi
-    done
-}
-
-
-# Phase UP : Generate (U)nicharset and (P)roperties file.
-phaseUP_generate_unicharset() {
-    tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
-
-    box_files=$(ls ${TRAINING_DIR}/*.box)
-    run_cmd ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
-    outfile=${TRAINING_DIR}/unicharset
-    UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
-    check_file_readable ${outfile}
-    mv ${outfile} ${UNICHARSET_FILE}
-
-    XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
-    check_file_readable ${UNICHARSET_FILE}
-    run_cmd ${SET_UNICHARSET_PROPERTIES_EXE} \
-        -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
-        --script_dir=${LANGDATA_ROOT}
-    check_file_readable ${XHEIGHTS_FILE}
-}
-
-# Phase D : Generate (D)awg files from unicharset file and wordlist files
-phaseD_generate_dawg() {
-    tlog "\n=== Phase D: Generating Dawg files ==="
-    # Output files
-    WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
-    FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
-    PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
-    NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
-    BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
-
-    # Word DAWG
-    local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
-    if [[ -r ${WORDLIST_FILE} ]]; then
-        tlog "Generating word Dawg"
-        check_file_readable ${UNICHARSET_FILE}
-        run_cmd ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
-            ${UNICHARSET_FILE}
-        check_file_readable ${WORD_DAWG}
-
-        FREQ_DAWG_SIZE=100
-        head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
-    fi
-
-    # Freq-word DAWG
-    if [[ -r ${freq_wordlist_file} ]]; then
-        check_file_readable ${UNICHARSET_FILE}
-        tlog "Generating frequent-word Dawg"
-        run_cmd ${WORDLIST2DAWG_EXE}  -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \
-            ${UNICHARSET_FILE}
-        check_file_readable ${FREQ_DAWG}
-    fi
-
-    # Punctuation DAWG
-    local punc_clean="${LANGDATA_ROOT}/common.punc"
-    if [[ -r ${PUNC_FILE} ]]; then
-        local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top
-        head -n 1 ${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
-            > ${top_punc_file}
-        discard_tail ${PUNC_FILE} ${top_punc_file} 99 1
-        punc_clean="${top_punc_file}"
-    fi
-    # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
-    # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
-    # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
-    # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
-    # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
-    local punc_reverse_policy=0;
-    if [[ ${LANG_CODE} == "heb" || ${LANG_CODE} == "ara" ]]; then
-        punc_reverse_policy=2
-    fi
-    if [[ -r ${punc_clean} ]]; then
-        run_cmd ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
-            ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${PUNC_DAWG}
-    fi
-
-    # Numbers DAWG
-    if [[ -r ${NUMBERS_FILE} ]]; then
-        local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top
-        head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
-            > ${top_num_file}
-        discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1
-        run_cmd ${WORDLIST2DAWG_EXE} -r 0 \
-            ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${NUMBER_DAWG}
-    fi
-
-    # Bigram dawg
-    if [[ -r ${WORD_BIGRAMS_FILE} ]]; then
-        run_cmd ${WORDLIST2DAWG_EXE} -r 1 \
-            ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${BIGRAM_DAWG}
-    fi
-}
-
-# Phase E : (E)xtract .tr feature files from .tif/.box files
-phaseE_extract_features() {
-    tlog "\n=== Phase E: Extracting features ==="
-    local box_config="box.train"
-    TRAIN_EXPOSURES='0'
-
-    for exposure in ${TRAIN_EXPOSURES}; do
-        img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
-    done
-
-    # Use any available language-specific configs.
-    local config=""
-    if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
-        config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
-    fi
-
-    OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
-    export TESSDATA_PREFIX=${TESSDATA_DIR}
-    tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
-    for img_file in ${img_files}; do
-        run_cmd ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
-            ${box_config} ${config}
-    done
-    export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
-}
-
-# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
-# phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
-phaseC_cluster_prototypes() {
-    tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
-    local out_normproto=${TRAINING_DIR}/${LANG_CODE}.normproto
-
-    run_cmd ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
-        $(ls ${TRAINING_DIR}/*.tr)
-
-    check_file_readable ${TRAINING_DIR}/normproto
-    mv ${TRAINING_DIR}/normproto ${out_normproto}
-}
-
-# Phase S : (S)hape clustering
-phaseS_cluster_shapes() {
-    if (( ! ${RUN_SHAPE_CLUSTERING} )); then
-        return
-    fi
-    check_file_readable ${LANGDATA_ROOT}/font_properties
-    local font_props=${LANGDATA_ROOT}/font_properties
-    if [[ -r ${font_props} ]]; then
-        font_props="-F ${font_props}"
-    else
-        font_props=""
-    fi
-    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
-     [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
-        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
-    fi
-
-    run_cmd ${SHAPE_TRAINING_EXE} \
-        -D "${TRAINING_DIR}/" \
-        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
-        ${font_props} \
-        $(ls ${TRAINING_DIR}/*.tr)
-    check_file_readable ${TRAINING_DIR}/shapetable \
-        ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
-}
-
-# Phase M : Clustering microfeatures (mfTraining)
-phaseM_cluster_microfeatures() {
-    tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
-
-    font_props=${LANGDATA_ROOT}/font_properties
-    if [[ -r ${font_props} ]]; then
-        font_props="-F ${font_props}"
-    else
-        font_props=""
-    fi
-    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
-       [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
-        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
-    fi
-
-    run_cmd ${MF_TRAINING_EXE} \
-        -D "${TRAINING_DIR}/" \
-        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
-        ${font_props} \
-        $(ls ${TRAINING_DIR}/*.tr)
-    check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
-        ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
-    mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
-    mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
-    mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
-    mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
-}
-
-phaseB_generate_ambiguities() {
-  tlog "\n=== Phase B : ambiguities training ==="
-
-  # Check for manually created ambiguities data.
-  if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
-      tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
-      cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
-          ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
-      # Make it writable, as it may be read-only in the client.
-      chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
-      return
-  else
-      tlog "No unicharambigs file found!"
-  fi
-
-  # TODO: Add support for generating ambiguities automatically.
-}
-
-
-make_traineddata() {
-  tlog "\n=== Making final traineddata file ==="
-  local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
-
-  # Combine available files for this language from the langdata dir.
-  if [[ -r ${lang_prefix}.config ]]; then
-    tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
-    cp ${lang_prefix}.config ${TRAINING_DIR}
-    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
-  fi
-  if [[ -r ${lang_prefix}.cube-unicharset ]]; then
-    tlog "Copying ${lang_prefix}.cube-unicharset to ${TRAINING_DIR}"
-    cp ${lang_prefix}.cube-unicharset ${TRAINING_DIR}
-    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-unicharset
-  fi
-  if [[ -r ${lang_prefix}.cube-word-dawg ]]; then
-    tlog "Copying ${lang_prefix}.cube-word-dawg to ${TRAINING_DIR}"
-    cp ${lang_prefix}.cube-word-dawg ${TRAINING_DIR}
-    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-word-dawg
-  fi
-  if [[ -r ${lang_prefix}.params-model ]]; then
-    tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
-    cp ${lang_prefix}.params-model ${TRAINING_DIR}
-    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
-  fi
-
-  # Compose the traineddata file.
-  run_cmd ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
-
-  # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
-  if [[ ! -d ${OUTPUT_DIR} ]]; then
-      tlog "Creating new directory ${OUTPUT_DIR}"
-      mkdir -p ${OUTPUT_DIR}
-  fi
-  local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
-  if [[ -f ${destfile} ]] && (( ! ${OVERWRITE} )); then
-      err "File ${destfile} exists and no --overwrite specified";
-  fi
-  tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
-  cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
-}
-
+source `dirname $0`/tesstrain_utils.sh

 ARGV=("$@")
 parse_flags
@ -564,14 +55,21 @@ tlog "Cleaning workspace directory ${TRAINING_DIR}..."
 mkdir -p ${TRAINING_DIR}
 rm -fr ${TRAINING_DIR}/*

-phaseI_generate_image
-phaseUP_generate_unicharset
-phaseD_generate_dawg
-phaseE_extract_features
-phaseC_cluster_prototypes
-phaseS_cluster_shapes
-phaseM_cluster_microfeatures
-phaseB_generate_ambiguities
-make_traineddata
+source `dirname $0`/language-specific.sh
+set_lang_specific_parameters ${LANG_CODE}
+
+initialize_fontconfig
+
+phase_I_generate_image 8
+phase_UP_generate_unicharset
+phase_D_generate_dawg
+phase_E_extract_features "box.train" 8
+phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"
+if [[ "${ENABLE_SHAPE_CLUSTERING}" == "y" ]]; then
+    phase_S_cluster_shapes
+fi
+phase_M_cluster_microfeatures
+phase_B_generate_ambiguities
+make__traineddata

 tlog "\nCompleted training for language '${LANG_CODE}'\n"
--- a/training/tesstrain_utils.sh
+++ b/training/tesstrain_utils.sh
@ -0,0 +1,578 @@
+#!/bin/bash
+# (C) Copyright 2014, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script defines functions that are used by tesstrain.sh
+# For a detailed description of the phases, see
+# https://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3
+#
+# USAGE: source tesstrain_utils.sh
+
+# Default settings. parse_flags replaces several of them from the command line.
+# Fonts that phase_I_generate_image renders (replaced by --fontlist).
+FONTS=(
+    "Arial" \
+    "Times New Roman," \
+)
+# Passed to text2image as --fonts_dir (replaced by --fonts_dir).
+FONTS_DIR="/usr/share/fonts/truetype/"
+# Directory that receives the final traineddata (replaced by --output_dir).
+OUTPUT_DIR="/tmp/tesstrain/tessdata"
+# Non-zero lets make__traineddata overwrite an existing file (set by --overwrite).
+OVERWRITE=0
+# Zero makes phase_S_cluster_shapes return without doing any work.
+RUN_SHAPE_CLUSTERING=0
+# Non-zero enables the ngram / font-property steps of phase I.
+EXTRACT_FONT_PROPERTIES=1
+# Parent of TRAINING_DIR, where intermediate files are created.
+WORKSPACE_DIR="/tmp/tesstrain"
+
+# Logging helper functions.
+# Echoes its arguments with backslash escapes interpreted (echo -e) and appends
+# the same text to ${LOG_FILE} via tee.
+# Usage: tlog MESSAGE...
+tlog() {
+    # $* is unquoted: the message is re-split, so runs of whitespace collapse.
+    echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE}
+}
+
+# Echoes the arguments prefixed with "ERROR: ", appends them to ${LOG_FILE},
+# and exits the current shell with status 1.
+# Usage: err_exit MESSAGE...
+err_exit() {
+    echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE}
+    exit 1
+}
+
+# Helper function to run a command and append its output to a log. Aborts early
+# if the program file is not found, and aborts if the program itself exits with
+# a non-zero status.
+# Usage: run_command CMD ARG1 ARG2...
+run_command() {
+    local cmd=$1
+    shift
+    if [[ ! -x ${cmd} ]]; then
+        err_exit "File ${cmd} not found"
+    fi
+    tlog "[$(date)] ${cmd} $@"
+    ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
+    # $? after a pipeline is the status of its last stage (tee), which would
+    # hide a failure of ${cmd}. PIPESTATUS[0] holds the status of ${cmd} and
+    # must be captured before any other command overwrites it.
+    local status=${PIPESTATUS[0]}
+    if [[ ${status} -gt 0 ]]; then
+        err_exit "Program $(basename ${cmd}) failed. Abort."
+    fi
+}
+
+# Check if all the given files exist, or exit otherwise.
+# Used to check required input files and produced output files in each phase.
+# Each argument is treated as exactly one file name.
+# Usage: check_file_readable FILE1 FILE2...
+check_file_readable() {
+    # Keep the loop variable out of the caller's (global) scope.
+    local file
+    # "$@" keeps every argument intact; an unquoted $@ would split names on
+    # whitespace and expand glob characters a second time.
+    for file in "$@"; do
+        if [[ ! -r ${file} ]]; then
+            err_exit "${file} does not exist or is not readable"
+        fi
+    done
+}
+
+# Write a file (with name specified in $2) with records that account for
+# n% (specified in $3) of the total weights of records in the input file
+# (input file name specified in $1). The input file should have one record
+# per line along with its weight separated by \t. The records should be
+# sorted in non-ascending order of frequency.
+# If $4 is non-empty the first record is skipped.
+# Only the record text (first column) is written, and it is appended to the
+# output file, so any content already there is kept.
+# USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE [SKIP_FIRST]
+discard_tail() {
+    local infile=$1
+    local outfile=$2
+    local pct=$3
+    local skip_first=$4
+
+    # 1-based line number that tail starts reading from.
+    local more_arg="1";
+    # NOTE(review): this is a non-empty-string test, so "0" also skips the
+    # first record -- confirm no caller passes 0 to mean "do not skip".
+    if [[ ${skip_first} ]]; then
+        more_arg="2"
+    fi
+    # Sum of the weight column over all records whose text is not a lone space.
+    local sum=$(tail -n +${more_arg} ${infile} \
+        | awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}')
+    if [[ ${sum} == "" ]]; then sum=0
+    fi
+    # Weight budget: pct percent of the total, in shell integer arithmetic.
+    # NOTE(review): assumes awk prints the sum as a plain integer -- TODO confirm.
+    local limit=$((${sum}*${pct}/100))
+    # Print records while budget remains; lone-space records cost nothing.
+    tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"}
+        {if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \
+            >> ${outfile}
+}
+
+# Set global path variables that are based on parsed flags.
+set_prog_paths() {
+    if [[ -z ${BINDIR} ]]; then
+        err_exit "Need to specify location of program files"
+    fi
+    CN_TRAINING_EXE=${BINDIR}/cntraining
+    COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
+    MF_TRAINING_EXE=${BINDIR}/mftraining
+    SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
+    SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
+    TESSERACT_EXE=${BINDIR}/tesseract
+    TEXT2IMAGE_EXE=${BINDIR}/text2image
+    UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
+    WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
+}
+
+# Sets the named variable to given value. Aborts if the value is missing or
+# if it looks like a flag.
+# Usage: parse_value VAR_NAME VALUE
+parse_value() {
+    local val="$2"
+    if [[ -z $val ]]; then
+        err_exit "Missing value for variable $1"
+        exit
+    fi
+    if [[ ${val:0:2} == "--" ]]; then
+        err_exit "Invalid value $val passed for variable $1"
+        exit
+    fi
+    eval $1=\"$val\"
+}
+
+# Does simple command-line parsing and initialization.
+# Reads the global ARGV array, stores flag values in global variables, checks
+# that the required flags were given, and then derives the tool paths, the
+# working directory, the log file and the langdata input file names.
+parse_flags() {
+    local i=0
+    while test $i -lt ${#ARGV[@]}; do
+        # Index of the value that follows a value-taking flag.
+        local j=$((i+1))
+        case ${ARGV[$i]} in
+            --)
+                break;;
+            --bin_dir)
+                # Values are quoted so that one containing spaces reaches
+                # parse_value as a single argument.
+                parse_value "BINDIR" "${ARGV[$j]}"
+                i=$j ;;
+            --fontlist)   # Expect a plus-separated list of names
+                if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
+                    err_exit "Invalid value passed to --fontlist"
+                fi
+                # Split on '+' only, then restore the previous IFS.
+                local ofs=$IFS
+                IFS='+'
+                FONTS=( ${ARGV[$j]} )
+                IFS=$ofs
+                i=$j ;;
+            --fonts_dir)
+                parse_value "FONTS_DIR" "${ARGV[$j]}"
+                i=$j ;;
+            --lang)
+                parse_value "LANG_CODE" "${ARGV[$j]}"
+                i=$j ;;
+            --langdata_dir)
+                parse_value "LANGDATA_ROOT" "${ARGV[$j]}"
+                i=$j ;;
+            --output_dir)
+                parse_value "OUTPUT_DIR" "${ARGV[$j]}"
+                i=$j ;;
+            --overwrite)
+                OVERWRITE=1 ;;
+            --extract_font_properties)
+                EXTRACT_FONT_PROPERTIES=1 ;;
+            --noextract_font_properties)
+                EXTRACT_FONT_PROPERTIES=0 ;;
+            --tessdata_dir)
+                parse_value "TESSDATA_DIR" "${ARGV[$j]}"
+                i=$j ;;
+            --training_text)
+                parse_value "TRAINING_TEXT" "${ARGV[$j]}"
+                i=$j ;;
+            --wordlist)
+                parse_value "WORDLIST_FILE" "${ARGV[$j]}"
+                i=$j ;;
+            *)
+                err_exit "Unrecognized argument ${ARGV[$i]}" ;;
+        esac
+        i=$((i+1))
+    done
+    if [[ -z ${LANG_CODE} ]]; then
+        err_exit "Need to specify a language --lang"
+    fi
+    if [[ -z ${BINDIR} ]]; then
+        err_exit "Need to specify path to built binaries --bin_dir"
+    fi
+    if [[ -z ${LANGDATA_ROOT} ]]; then
+        err_exit "Need to specify path to language files --langdata_dir"
+    fi
+    # Fall back to the TESSDATA_PREFIX environment variable for the tessdata dir.
+    if [[ -z ${TESSDATA_DIR} ]]; then
+        if [[ -z ${TESSDATA_PREFIX} ]]; then
+            err_exit "Need to specify a --tessdata_dir or have a "\
+        "TESSDATA_PREFIX variable defined in your environment"
+        else
+            TESSDATA_DIR="${TESSDATA_PREFIX}"
+        fi
+    fi
+
+    set_prog_paths
+
+    # Location where intermediate files will be created.
+    TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
+    # Location of log file for the whole run.
+    LOG_FILE=${TRAINING_DIR}/tesstrain.log
+
+    # Take training text and wordlist from the langdata directory if not
+    # specified in the command-line.
+    if [[ -z ${TRAINING_TEXT} ]]; then
+        TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text
+    fi
+    if [[ -z ${WORDLIST_FILE} ]]; then
+        WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean
+    fi
+    WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean
+    NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
+    PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
+    BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
+    UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
+    TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
+    # Non-zero lets phase_D_generate_dawg run; zero makes it return early.
+    GENERATE_DAWGS=1
+}
+
+# Function initializes font config with a unique font cache dir.
+# Creates a new temporary directory, exports it as FONT_CONFIG_CACHE, and runs
+# text2image once on a one-word sample with --fontconfig_tmpdir set to it.
+# NOTE(review): presumably this run builds the cache that generate_font_image
+# later reuses with --fontconfig_refresh_config_file=false -- confirm.
+initialize_fontconfig() {
+    export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
+    # The sample file is both the input text and the output base name.
+    local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
+    echo "Text" >${sample_path}
+    run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \
+        --font="Arial" --outputbase=${sample_path} --text=${sample_path} \
+        --fontconfig_tmpdir=${FONT_CONFIG_CACHE}
+}
+
+# Helper function for phase_I_generate_image. Generates the image for a single
+# language/font combination in a way that can be run in parallel.
+# Reads the globals TRAINING_DIR, LANG_CODE, EXPOSURE, CHAR_SPACING, LEADING,
+# FONT_CONFIG_CACHE, FONTS_DIR, VERTICAL_FONTS and TEXT2IMAGE_EXTRA_ARGS.
+# Usage: generate_font_image FONT_NAME
+generate_font_image() {
+    local font="$1"
+    tlog "Rendering using ${font}"
+    # File-name form of the font: spaces become underscores, commas are dropped.
+    local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
+    local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
+
+    # Arguments shared by both text2image invocations below.
+    local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}"
+    common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
+    common_args+=" --fontconfig_refresh_config_file=false --leading=${LEADING}"
+    common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
+    common_args+=" --outputbase=${outbase}"
+
+    # add --writing_mode=vertical-upright to common_args if the font is
+    # specified to be rendered vertically.
+    for vfont in "${VERTICAL_FONTS[@]}"; do
+      if [[ "${font}" == "${vfont}" ]]; then
+        common_args+=" --writing_mode=vertical-upright "
+        break
+      fi
+    done
+
+    # Render the training text; the .box and .tif outputs are required.
+    run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
+        --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
+    check_file_readable ${outbase}.box ${outbase}.tif
+
+    # Optional second pass over the ngrams file; a .fontinfo output is required.
+    if (( ${EXTRACT_FONT_PROPERTIES} )) &&
+        [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
+        tlog "Extracting font properties of ${font}"
+        run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
+            --ligatures=false --text=${TRAIN_NGRAMS_FILE} \
+            --only_extract_font_properties --ptsize=32
+        check_file_readable ${outbase}.fontinfo
+    fi
+}
+
+
+# Phase I : Generate (I)mages from training text for each font.
+# $1 is the number of fonts rendered concurrently; an empty or non-positive
+# value means 1. Sets the globals CHAR_SPACING and EXPOSURE.
+# Usage: phase_I_generate_image [PARALLEL_FACTOR]
+phase_I_generate_image() {
+    local par_factor=$1
+    if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
+        par_factor=1
+    fi
+    tlog "\n=== Phase I: Generating training images ==="
+    if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then
+        err_exit "Could not find training text file ${TRAINING_TEXT}"
+    fi
+    CHAR_SPACING="0.0"
+    EXPOSURE="0"
+
+    if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
+        # Parse .bigram_freqs file and compose a .train_ngrams file with text
+        # for tesseract to recognize during training. Take only the ngrams whose
+        # combined weight accounts for 99% of all the bigrams in the language.
+        # NGRAM_FRAC is that 99% share of the summed weight column.
+        NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
+            | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
+        # Walk the bigrams by descending weight, emitting each one while the
+        # running total stays within NGRAM_FRAC.
+        cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
+            | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
+            x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
+        check_file_readable ${TRAIN_NGRAMS_FILE}
+    fi
+
+    # Start one background job per font, pausing after every par_factor jobs
+    # until the running batch has finished.
+    local counter=0
+    for font in "${FONTS[@]}"; do
+        generate_font_image "${font}" &
+        let counter=counter+1
+        let rem=counter%par_factor
+        if [[ "${rem}" -eq 0 ]]; then
+          wait
+        fi
+    done
+    wait
+    # Check that each process was successful.
+    for font in "${FONTS[@]}"; do
+        local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
+        local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
+        check_file_readable ${outbase}.box ${outbase}.tif
+    done
+}
+
+# Phase UP : Generate (U)nicharset and (P)roperties file.
+# Runs the unicharset extractor over every box file in TRAINING_DIR, renames
+# its output to ${LANG_CODE}.unicharset, then runs set_unicharset_properties
+# with that file as both input and output and expects ${LANG_CODE}.xheights.
+# Sets the globals UNICHARSET_FILE and XHEIGHTS_FILE.
+phase_UP_generate_unicharset() {
+    tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
+
+    # Step 1: extract a unicharset from all box files.
+    local extracted=${TRAINING_DIR}/unicharset
+    local boxes=$(ls ${TRAINING_DIR}/*.box)
+    run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${boxes}
+    check_file_readable ${extracted}
+
+    # Step 2: give the result its language-specific name.
+    UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
+    mv ${extracted} ${UNICHARSET_FILE}
+    check_file_readable ${UNICHARSET_FILE}
+
+    # Step 3: set properties in place and require the x-heights output.
+    XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
+    run_command ${SET_UNICHARSET_PROPERTIES_EXE} \
+        -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
+        --script_dir=${LANGDATA_ROOT}
+    check_file_readable ${XHEIGHTS_FILE}
+}
+
+# Phase D : Generate (D)awg files from unicharset file and wordlist files
+# Builds the word, frequent-word, punctuation, number and bigram DAWGs in
+# TRAINING_DIR; each one is built only when its input list is readable.
+# Returns early when GENERATE_DAWGS is 0.
+phase_D_generate_dawg() {
+    tlog "\n=== Phase D: Generating Dawg files ==="
+
+    # Skip if requested
+    if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
+      # phase_name is not defined in this file; fall back to a fixed label so
+      # the message never reads just "Skipping ".
+      tlog "Skipping ${phase_name:-Phase D}"
+      return
+    fi
+
+    # Output files
+    WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
+    FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
+    PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
+    NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
+    BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
+
+    # Word DAWG
+    local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
+    if [[ -r ${WORDLIST_FILE} ]]; then
+        tlog "Generating word Dawg"
+        check_file_readable ${UNICHARSET_FILE}
+        run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
+            ${UNICHARSET_FILE}
+        check_file_readable ${WORD_DAWG}
+
+        # The frequent-word list is the first FREQ_DAWG_SIZE lines of the wordlist.
+        FREQ_DAWG_SIZE=100
+        head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
+    fi
+
+    # Freq-word DAWG
+    if [[ -r ${freq_wordlist_file} ]]; then
+        check_file_readable ${UNICHARSET_FILE}
+        tlog "Generating frequent-word Dawg"
+        run_command ${WORDLIST2DAWG_EXE}  -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \
+            ${UNICHARSET_FILE}
+        check_file_readable ${FREQ_DAWG}
+    fi
+
+    # Punctuation DAWG
+    # Defaults to the shared common.punc list unless the language has its own
+    # .punc file, in which case the top 99% (by weight) of that file is used.
+    local punc_clean="${LANGDATA_ROOT}/common.punc"
+    if [[ -r ${PUNC_FILE} ]]; then
+        local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top
+        head -n 1 ${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
+            > ${top_punc_file}
+        discard_tail ${PUNC_FILE} ${top_punc_file} 99 1
+        punc_clean="${top_punc_file}"
+    fi
+    # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
+    # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
+    # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
+    # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
+    # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
+    local punc_reverse_policy=0;
+    case ${LANG_CODE} in
+      ara | div| fas | pus | snd | syr | uig | urd | heb | yid )
+        punc_reverse_policy=2 ;;
+      * ) ;;
+    esac
+    if [[ -r ${punc_clean} ]]; then
+        run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
+            ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE}
+        check_file_readable ${PUNC_DAWG}
+    fi
+
+    # Numbers DAWG
+    # Uses the first record plus the top 85% (by weight) of the remaining ones.
+    if [[ -r ${NUMBERS_FILE} ]]; then
+        local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top
+        head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \
+            > ${top_num_file}
+        discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1
+        run_command ${WORDLIST2DAWG_EXE} -r 0 \
+            ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE}
+        check_file_readable ${NUMBER_DAWG}
+    fi
+
+    # Bigram dawg
+    if [[ -r ${WORD_BIGRAMS_FILE} ]]; then
+        run_command ${WORDLIST2DAWG_EXE} -r 1 \
+            ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
+        check_file_readable ${BIGRAM_DAWG}
+    fi
+}
+
+# Phase E : (E)xtract .tr feature files from .tif/.box files
+# $1 is the tesseract config name passed on each invocation; $2 is the number
+# of images processed concurrently (empty or non-positive means 1).
+# Usage: phase_E_extract_features BOX_CONFIG [PARALLEL_FACTOR]
+phase_E_extract_features() {
+    local box_config=$1
+    local par_factor=$2
+    if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
+        par_factor=1
+    fi
+    tlog "\n=== Phase E: Extracting features ==="
+    TRAIN_EXPOSURES='0'
+
+    # Collect the .tif images of every exposure level listed in TRAIN_EXPOSURES.
+    local img_files=""
+    for exposure in ${TRAIN_EXPOSURES}; do
+        img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
+    done
+
+    # Use any available language-specific configs.
+    local config=""
+    if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
+        config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
+    fi
+
+    # Point tesseract at TESSDATA_DIR for these runs; the old value is
+    # restored once all jobs have finished.
+    OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
+    export TESSDATA_PREFIX=${TESSDATA_DIR}
+    tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+    # Start one background job per image, pausing after every par_factor jobs
+    # until the running batch has finished. The output base is the image path
+    # with its extension removed.
+    local counter=0
+    for img_file in ${img_files}; do
+        run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
+            ${box_config} ${config} &
+      let counter=counter+1
+      let rem=counter%par_factor
+      if [[ "${rem}" -eq 0 ]]; then
+        wait
+      fi
+    done
+    wait
+    export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
+    # Check that all the output files were produced.
+    for img_file in ${img_files}; do
+        check_file_readable ${img_file%.*}.tr
+    done
+}
+
+# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
+# Runs cntraining over every .tr file in TRAINING_DIR and moves the resulting
+# normproto file to the path given as $1.
+# Usage: phase_C_cluster_prototypes OUTPUT_NORMPROTO_PATH
+phase_C_cluster_prototypes() {
+    local normproto_dest=$1
+    local normproto_src=${TRAINING_DIR}/normproto
+    tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
+
+    run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
+        $(ls ${TRAINING_DIR}/*.tr)
+
+    # The tool must have produced its output before it is renamed.
+    check_file_readable ${normproto_src}
+    mv ${normproto_src} ${normproto_dest}
+}
+
+# Phase S : (S)hape clustering
+# Runs shapeclustering over every .tr file in TRAINING_DIR and requires the
+# shapetable and ${LANG_CODE}.mfunicharset outputs.
+# Does nothing except log when RUN_SHAPE_CLUSTERING is zero.
+phase_S_cluster_shapes() {
+    if (( ! ${RUN_SHAPE_CLUSTERING} )); then
+        tlog "\n=== Shape Clustering disabled ==="
+        return
+    fi
+    # The font_properties file is mandatory; the script exits if it is missing.
+    check_file_readable ${LANGDATA_ROOT}/font_properties
+    local font_props="-F ${LANGDATA_ROOT}/font_properties"
+    # Pass the x-heights file only when it exists and is not empty.
+    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
+       [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
+        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
+    fi
+
+    # font_props is left unquoted on purpose so it expands into separate
+    # arguments.
+    run_command ${SHAPE_TRAINING_EXE} \
+        -D "${TRAINING_DIR}/" \
+        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
+        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
+        ${font_props} \
+        $(ls ${TRAINING_DIR}/*.tr)
+    check_file_readable ${TRAINING_DIR}/shapetable \
+        ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
+}
+
+# Phase M : Clustering microfeatures (mfTraining)
+# Runs mftraining over every .tr file in TRAINING_DIR, then renames its outputs
+# (inttemp, shapetable, pffmtable) with the ${LANG_CODE}. prefix and replaces
+# ${LANG_CODE}.unicharset with the generated ${LANG_CODE}.mfunicharset.
+phase_M_cluster_microfeatures() {
+    tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
+
+    # The font_properties file is mandatory; the script exits if it is missing.
+    check_file_readable ${LANGDATA_ROOT}/font_properties
+    # Local, as in phase_S_cluster_shapes, so the value does not leak into the
+    # global scope.
+    local font_props="-F ${LANGDATA_ROOT}/font_properties"
+    # Pass the x-heights file only when it exists and is not empty.
+    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
+       [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
+        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
+    fi
+
+    # font_props is left unquoted on purpose so it expands into separate
+    # arguments.
+    run_command ${MF_TRAINING_EXE} \
+        -D "${TRAINING_DIR}/" \
+        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
+        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
+        ${font_props} \
+        $(ls ${TRAINING_DIR}/*.tr)
+    check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
+        ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
+    mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
+    mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
+    mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
+    mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
+}
+
+# Phase B : copies a hand-made ${LANG_CODE}.unicharambigs file from the
+# langdata dir into TRAINING_DIR when one is readable; otherwise only logs
+# that none was found.
+phase_B_generate_ambiguities() {
+  tlog "\n=== Phase B : ambiguities training ==="
+
+  local ambigs_src=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs
+  local ambigs_dest=${TRAINING_DIR}/${LANG_CODE}.unicharambigs
+
+  # Check for manually created ambiguities data.
+  if [[ ! -r ${ambigs_src} ]]; then
+      tlog "No unicharambigs file found!"
+      # TODO: Add support for generating ambiguities automatically.
+      return
+  fi
+
+  tlog "Found file ${ambigs_src}"
+  cp ${ambigs_src} ${ambigs_dest}
+  # Make it writable, as it may be read-only in the client.
+  chmod u+w ${ambigs_dest}
+}
+
+
+make__traineddata() {
+  tlog "\n=== Making final traineddata file ==="
+  local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
+
+  # Combine available files for this language from the langdata dir.
+  if [[ -r ${lang_prefix}.config ]]; then
+    tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
+    cp ${lang_prefix}.config ${TRAINING_DIR}
+    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
+  fi
+  if [[ -r ${lang_prefix}.cube-unicharset ]]; then
+    tlog "Copying ${lang_prefix}.cube-unicharset to ${TRAINING_DIR}"
+    cp ${lang_prefix}.cube-unicharset ${TRAINING_DIR}
+    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-unicharset
+  fi
+  if [[ -r ${lang_prefix}.cube-word-dawg ]]; then
+    tlog "Copying ${lang_prefix}.cube-word-dawg to ${TRAINING_DIR}"
+    cp ${lang_prefix}.cube-word-dawg ${TRAINING_DIR}
+    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-word-dawg
+  fi
+  if [[ -r ${lang_prefix}.params-model ]]; then
+    tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
+    cp ${lang_prefix}.params-model ${TRAINING_DIR}
+    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
+  fi
+
+  # Compose the traineddata file.
+  run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
+
+  # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
+  if [[ ! -d ${OUTPUT_DIR} ]]; then
+      tlog "Creating new directory ${OUTPUT_DIR}"
+      mkdir -p ${OUTPUT_DIR}
+  fi
+  local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
+  if [[ -f ${destfile} ]] && (( ! ${OVERWRITE} )); then
+      err_exit "File ${destfile} exists and no --overwrite specified";
+  fi
+  tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
+  cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
+}
+
--- a/training/text2image.cpp
+++ b/training/text2image.cpp
@ -115,7 +115,7 @@ STRING_PARAM_FLAG(writing_mode, "horizontal",

 INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes");

-BOOL_PARAM_FLAG(strip_unrenderable_words, false,
+BOOL_PARAM_FLAG(strip_unrenderable_words, true,
                "Remove unrenderable words from source text");

 // Font name.
@ -618,9 +618,9 @@ int main(int argc, char** argv) {
        }
        pixDestroy(&binary);
      }
-      if (FLAGS_find_fonts && !FLAGS_render_per_font && !font_names.empty()) {
-        // We just want a list of names, so we don't need to render any more
-        // of the text.
+      if (FLAGS_find_fonts && offset != 0) {
+        // We just want a list of names, or some sample images so we don't need
+        // to render more than the first page of the text.
        break;
      }
    }
@ -630,8 +630,7 @@ int main(int argc, char** argv) {
    box_name += ".box";
    render.WriteAllBoxes(box_name);
  } else if (!FLAGS_render_per_font && !font_names.empty()) {
-    string filename = FLAGS_outputbase.c_str();
-    filename += ".fontlist.txt";
+    string filename = FLAGS_outputbase + ".fontlist.txt";
    FILE* fp = fopen(filename.c_str(), "wb");
    if (fp == NULL) {
      tprintf("Failed to create output font list %s\n", filename.c_str());
--- a/training/unicharset_training_utils.cpp
+++ b/training/unicharset_training_utils.cpp
@ -0,0 +1,193 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unicharset_training_utils.cpp
+// Description: Training utilities for UNICHARSET.
+// Author:      Ray Smith
+// Created:     Fri Oct 17 17:09:01 PDT 2014
+//
+// (C) Copyright 2014, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "unicharset_training_utils.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+
+#include "fileio.h"
+#include "genericvector.h"
+#include "icuerrorcode.h"
+#include "normstrngs.h"
+#include "statistc.h"
+#include "strngs.h"
+#include "unicharset.h"
+#include "unicode/uchar.h"    // from libicu
+#include "unicode/uscript.h"  // from libicu
+
+namespace tesseract {
+
+// Helper sets the character attribute properties and sets up the script table.
+// For every unichar in the unicharset this records the alpha/lower/upper/
+// digit/punctuation flags, the script, the other-case id, the direction, the
+// mirror id and the normalized string, and finally calls post_load_setup().
+// If report_errors is true, a message is printed whenever the other-case or
+// mirror string of a unichar is not itself in the unicharset.
+// Does not set tops and bottoms.
+void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) {
+  for (int id = 0; id < unicharset->size(); ++id) {
+    // If the unichar matches the second entry of a custom ligature pair,
+    // carry on with the first entry of that pair instead.
+    const char* unichar_str = unicharset->id_to_unichar(id);
+    int lig = 0;
+    while (UNICHARSET::kCustomLigatures[lig][0] != NULL &&
+           strcmp(UNICHARSET::kCustomLigatures[lig][1], unichar_str) != 0) {
+      ++lig;
+    }
+    if (UNICHARSET::kCustomLigatures[lig][0] != NULL) {
+      unichar_str = UNICHARSET::kCustomLigatures[lig][0];
+    }
+
+    // Work on the UTF32 code points of the unichar.
+    GenericVector<char32> uni_vector;
+    tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
+    const int num_code_points = uni_vector.size();
+
+    // A property counts for the whole "character" as soon as any one of its
+    // code points has it.
+    bool has_alpha = false;
+    bool has_lower = false;
+    bool has_upper = false;
+    bool has_digit = false;
+    bool has_punct = false;
+    for (int cp = 0; cp < num_code_points; ++cp) {
+      const char32 code_point = uni_vector[cp];
+      has_alpha = has_alpha || u_isalpha(code_point);
+      has_lower = has_lower || u_islower(code_point);
+      has_upper = has_upper || u_isupper(code_point);
+      has_digit = has_digit || u_isdigit(code_point);
+      has_punct = has_punct || u_ispunct(code_point);
+    }
+    unicharset->set_isalpha(id, has_alpha);
+    unicharset->set_islower(id, has_lower);
+    unicharset->set_isupper(id, has_upper);
+    unicharset->set_isdigit(id, has_digit);
+    unicharset->set_ispunctuation(id, has_punct);
+
+    // The script is taken from the first code point only. err stays in scope
+    // for the rest of this iteration.
+    tesseract::IcuErrorCode err;
+    const UScriptCode script_code = uscript_getScript(uni_vector[0], err);
+    unicharset->set_script(id, uscript_getName(script_code));
+
+    // Other case: defaults to the unichar itself, and is only looked up for
+    // unichars that contain a lower or upper case code point.
+    unicharset->set_other_case(id, id);
+    if (has_lower || has_upper) {
+      GenericVector<char32> other_case(num_code_points, 0);
+      for (int cp = 0; cp < num_code_points; ++cp) {
+        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
+        // However since they deal with UChars (so need a conversion function
+        // from char32 or UTF8string) and require a meaningful locale string,
+        // for now u_tolower()/u_toupper() are used.
+        if (has_lower) {
+          other_case[cp] = u_toupper(uni_vector[cp]);
+        } else {
+          other_case[cp] = u_tolower(uni_vector[cp]);
+        }
+      }
+      STRING other_case_utf8;
+      tesseract::UTF32ToUTF8(other_case, &other_case_utf8);
+      const UNICHAR_ID other_case_id =
+          unicharset->unichar_to_id(other_case_utf8.c_str());
+      if (other_case_id == INVALID_UNICHAR_ID) {
+        if (id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
+          tprintf("Other case %s of %s is not in unicharset\n",
+                  other_case_utf8.c_str(), unichar_str);
+        }
+      } else {
+        unicharset->set_other_case(id, other_case_id);
+      }
+    }
+
+    // Mirror every code point via ICU, and take the directionality from the
+    // first code point.
+    GenericVector<char32> mirrors(num_code_points, 0);
+    for (int cp = 0; cp < num_code_points; ++cp) {
+      mirrors[cp] = u_charMirror(uni_vector[cp]);
+    }
+    if (num_code_points > 0) {
+      unicharset->set_direction(
+          id, static_cast<UNICHARSET::Direction>(
+                  u_charDirection(uni_vector[0])));
+    }
+    STRING mirror_utf8;
+    tesseract::UTF32ToUTF8(mirrors, &mirror_utf8);
+    const UNICHAR_ID mirror_id = unicharset->unichar_to_id(mirror_utf8.c_str());
+    if (mirror_id == INVALID_UNICHAR_ID) {
+      if (report_errors) {
+        tprintf("Mirror %s of %s is not in unicharset\n",
+                mirror_utf8.c_str(), unichar_str);
+      }
+    } else {
+      unicharset->set_mirror(id, mirror_id);
+    }
+
+    // Normalized form: id 0 and unichars whose normalization comes back empty
+    // keep the (ligature-converted) unichar string itself.
+    const STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
+    const bool use_normed = id != 0 && normed_str.length() > 0;
+    unicharset->set_normed(id, use_normed ? normed_str.c_str() : unichar_str);
+
+    ASSERT_HOST(unicharset->get_other_case(id) < unicharset->size());
+  }
+  unicharset->post_load_setup();
+}
+
+// Helper to set the properties for an input unicharset file, writes to the
+// output file. If an appropriate script unicharset can be found in the
+// script_dir directory, then the tops and bottoms are expanded using the
+// script unicharset.
+//   script_dir: directory searched for <script>.unicharset and
+//       <script>.xheights, for each script in the unicharset's script table.
+//   input_unicharset_file: the unicharset to load.
+//   output_unicharset_file: where the unicharset is saved once its
+//       properties have been set.
+//   output_xheights_file: if non-empty, receives the concatenation of every
+//       <script>.xheights file that could be read (possibly nothing).
+// If the input unicharset cannot be loaded, an error is printed and the
+// function returns without writing any output.
+void SetPropertiesForInputFile(const string& script_dir,
+                               const string& input_unicharset_file,
+                               const string& output_unicharset_file,
+                               const string& output_xheights_file) {
+  UNICHARSET unicharset;
+
+  // Load the input unicharset. Give up on failure rather than report a
+  // successful load and write output derived from a set that failed to load.
+  if (!unicharset.load_from_file(input_unicharset_file.c_str())) {
+    tprintf("Failed to load unicharset from file %s\n",
+            input_unicharset_file.c_str());
+    return;
+  }
+  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
+          input_unicharset_file.c_str());
+
+  // Set unichar properties
+  tprintf("Setting unichar properties\n");
+  SetupBasicProperties(true, &unicharset);
+  string xheights_str;
+  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
+    // Load the unicharset for the script if available. A missing or
+    // unreadable script unicharset is not an error; the script is skipped.
+    string filename = script_dir + "/" +
+        unicharset.get_script_from_script_id(s) + ".unicharset";
+    UNICHARSET script_set;
+    if (script_set.load_from_file(filename.c_str())) {
+      unicharset.SetPropertiesFromOther(script_set);
+    }
+    // Load the xheights for the script if available, accumulating the
+    // contents of all the files found.
+    filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
+        ".xheights";
+    string script_heights;
+    if (File::ReadFileToString(filename, &script_heights))
+      xheights_str += script_heights;
+  }
+  if (!output_xheights_file.empty())
+    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
+  // Warn about every non-special unichar whose properties are incomplete.
+  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
+    if (unicharset.PropertiesIncomplete(c)) {
+      tprintf("Warning: properties incomplete for index %d = %s\n",
+              c, unicharset.id_to_unichar(c));
+    }
+  }
+
+  // Write the output unicharset
+  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
+  unicharset.save_to_file(output_unicharset_file.c_str());
+}
+
+}  // namespace tesseract
+
--- a/training/unicharset_training_utils.h
+++ b/training/unicharset_training_utils.h
@ -0,0 +1,50 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unicharset_training_utils.h
+// Description: Training utilities for UNICHARSET.
+// Author:      Ray Smith
+// Created:     Fri Oct 17 17:14:01 PDT 2014
+//
+// (C) Copyright 2014, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_
+#define TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_
+
+#include <string>
+
+#ifdef USE_STD_NAMESPACE
+using std::string;
+#endif
+
+class STATS;
+class UNICHARSET;
+
+namespace tesseract {
+
+// Helper sets the character attribute properties and sets up the script table.
+// The unicharset is modified in place: for every unichar the alpha/lower/
+// upper/digit/punctuation flags, script, other case, direction, mirror and
+// normalized string are set, and post_load_setup() is called at the end.
+// If report_errors is true, a message is printed for each unichar whose
+// other-case or mirror string is not in the unicharset.
+// Does not set tops and bottoms.
+void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset);
+
+// Helper to set the properties for an input unicharset file, writes to the
+// output file. If an appropriate script unicharset can be found in the
+// script_dir directory, then the tops and bottoms are expanded using the
+// script unicharset.
+//   script_dir: directory searched for <script>.unicharset and
+//       <script>.xheights, for each script in the unicharset's script table.
+//   input_unicharset_file: the unicharset to load.
+//   output_unicharset_file: where the resulting unicharset is saved.
+//   output_xheights_file: if this name is non-empty, the concatenated
+//       contents of the <script>.xheights files that could be read are
+//       written to it.
+void SetPropertiesForInputFile(const string& script_dir,
+                               const string& input_unicharset_file,
+                               const string& output_unicharset_file,
+                               const string& output_xheights_file);
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_