From 93348a83a324a479978d9dd399b34d15ec6c5d83 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Tue, 18 May 2021 10:47:44 +0200
Subject: [PATCH] Remove scripts for training

They were replaced by Python3 scripts (part of the tesstrain repository).

Signed-off-by: Stefan Weil <sw@weilnetz.de>
---
 Makefile.am                          |    4 -
 src/training/language-specific.sh    | 1199 --------------------------
 src/training/tesstrain.sh            |   98 ---
 src/training/tesstrain_utils.sh      |  632 --------------
 src/training/unicharset/lstmtester.h |    2 +-
 unittest/lstm_test.cc                |    2 +-
 6 files changed, 2 insertions(+), 1935 deletions(-)
 delete mode 100755 src/training/language-specific.sh
 delete mode 100755 src/training/tesstrain.sh
 delete mode 100644 src/training/tesstrain_utils.sh

diff --git a/Makefile.am b/Makefile.am
index 663afb2e..e95f18ea 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -795,10 +795,6 @@ training_CPPFLAGS += $(ICU_UC_CFLAGS) $(ICU_I18N_CFLAGS)
 training_CPPFLAGS += $(pango_CFLAGS)
 training_CPPFLAGS += $(cairo_CFLAGS)
 
-bin_SCRIPTS = src/training/language-specific.sh src/training/tesstrain.sh
-scripts_DATA = src/training/tesstrain_utils.sh
-scriptsdir = $(bindir)
-
 if DISABLED_LEGACY_ENGINE
 training_CPPFLAGS += -DDISABLED_LEGACY_ENGINE
 endif
diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh
deleted file mode 100755
index e5a7f096..00000000
--- a/src/training/language-specific.sh
+++ /dev/null
@@ -1,1199 +0,0 @@
-#!/bin/bash
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Set some language specific variables. Works in conjunction with
-# tesstrain.sh
-#
-
-#=============================================================================
-# Language specific info
-#=============================================================================
-
-# Array of all valid language codes.
-VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
-                      ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
-                      ell eng enm epo est eus fas fil fin fra frk frm gle glg
-                      grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
-                      jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat
-                      lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
-                      pan pol por pus ron rus san sin slk slv snd spa spa_old
-                      sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur
-                      uig ukr urd uzb uzb_cyrl vie yid gle_uncial"
-
-# Codes for which we have webtext but no fonts:
-UNUSABLE_LANGUAGE_CODES=""
-
-FRAKTUR_FONTS=(
-    "CaslonishFraxx Medium" \
-    "Cloister Black, Light" \
-    "Proclamate Light" \
-    "UnifrakturMaguntia" \
-    "Walbaum-Fraktur" \
-)
-
-# List of fonts to train on
-LATIN_FONTS=(
-    "Arial Bold" \
-    "Arial Bold Italic" \
-    "Arial Italic" \
-    "Arial" \
-    "Courier New Bold" \
-    "Courier New Bold Italic" \
-    "Courier New Italic" \
-    "Courier New" \
-    "Times New Roman, Bold" \
-    "Times New Roman, Bold Italic" \
-    "Times New Roman, Italic" \
-    "Times New Roman," \
-    "Georgia Bold" \
-    "Georgia Italic" \
-    "Georgia" \
-    "Georgia Bold Italic" \
-    "Trebuchet MS Bold" \
-    "Trebuchet MS Bold Italic" \
-    "Trebuchet MS Italic" \
-    "Trebuchet MS" \
-    "Verdana Bold" \
-    "Verdana Italic" \
-    "Verdana" \
-    "Verdana Bold Italic" \
-    "Tex Gyre Bonum Bold" \
-    "Tex Gyre Bonum Italic" \
-    "Tex Gyre Bonum Bold Italic" \
-    "Tex Gyre Schola Bold" \
-    "Tex Gyre Schola Italic" \
-    "Tex Gyre Schola Bold Italic" \
-    "Tex Gyre Schola Regular" \
-    "DejaVu Sans Ultra-Light" \
-)
-
-# List of fonts for printed/neo-Latin ('lat' language code, different from Latin script)
-NEOLATIN_FONTS=(
-    "GFS Bodoni" \
-    "GFS Bodoni Bold" \
-    "GFS Bodoni Italic" \
-    "GFS Bodoni Bold Italic" \
-    "GFS Didot" \
-    "GFS Didot Bold" \
-    "GFS Didot Italic" \
-    "GFS Didot Bold Italic" \
-    "Cardo" \
-    "Cardo Bold" \
-    "Cardo Italic" \
-    "Wyld" \
-    "Wyld Italic" \
-    "EB Garamond" \
-    "EB Garamond Italic" \
-    "Junicode" \
-    "Junicode Bold" \
-    "Junicode Italic" \
-    "Junicode Bold Italic" \
-    "IM FELL DW Pica PRO" \
-    "IM FELL English PRO" \
-    "IM FELL Double Pica PRO" \
-    "IM FELL French Canon PRO" \
-    "IM FELL Great Primer PRO" \
-    "IM FELL DW Pica PRO Italic" \
-    "IM FELL English PRO Italic" \
-    "IM FELL Double Pica PRO Italic" \
-    "IM FELL French Canon PRO Italic" \
-    "IM FELL Great Primer PRO Italic" \
-)
-
-IRISH_UNCIAL_FONTS=(
-  "Bunchlo Arsa Dubh GC" \
-  "Bunchlo Arsa GC" \
-  "Bunchlo Arsa GC Bold" \
-  "Bunchlo Dubh GC" \
-  "Bunchlo GC" \
-  "Bunchlo GC Bold" \
-  "Bunchlo Nua GC Bold" \
-  "Bunchló na Nod GC" \
-  "Gadelica" \
-  "Glanchlo Dubh GC" \
-  "Glanchlo GC" \
-  "Glanchlo GC Bold" \
-  "Seanchló Dubh GC" \
-  "Seanchló GC" \
-  "Seanchló GC Bold" \
-  "Seanchló na Nod GC" \
-  "Seanchló Ársa Dubh GC" \
-  "Seanchló Ársa GC" \
-  "Seanchló Ársa GC Bold" \
-  "Tromchlo Beag GC" \
-  "Tromchlo Mor GC" \
-  "Urchlo GC" \
-  "Urchlo GC Bold" \
-)
-
-EARLY_LATIN_FONTS=(
-    "${FRAKTUR_FONTS[@]}" \
-    "${LATIN_FONTS[@]}" \
-    # The Wyld font family renders early modern ligatures encoded in the private
-    # unicode area.
-    "Wyld" \
-    "Wyld Italic" \
-    # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English.
-    "GentiumAlt" \
-)
-
-VIETNAMESE_FONTS=( \
-    "Arial Unicode MS Bold" \
-    "Arial Bold Italic" \
-    "Arial Italic" \
-    "Arial Unicode MS" \
-    "FreeMono Bold" \
-    "Courier New Bold Italic" \
-    "FreeMono Italic" \
-    "FreeMono" \
-    "GentiumAlt Italic" \
-    "GentiumAlt" \
-    "Palatino Linotype Bold" \
-    "Palatino Linotype Bold Italic" \
-    "Palatino Linotype Italic" \
-    "Palatino Linotype" \
-    "Really No 2 LT W2G Light" \
-    "Really No 2 LT W2G Light Italic" \
-    "Really No 2 LT W2G Medium" \
-    "Really No 2 LT W2G Medium Italic" \
-    "Really No 2 LT W2G Semi-Bold" \
-    "Really No 2 LT W2G Semi-Bold Italic" \
-    "Really No 2 LT W2G Ultra-Bold" \
-    "Really No 2 LT W2G Ultra-Bold Italic" \
-    "Times New Roman, Bold" \
-    "Times New Roman, Bold Italic" \
-    "Times New Roman, Italic" \
-    "Times New Roman," \
-    "Verdana Bold" \
-    "Verdana Italic" \
-    "Verdana" \
-    "Verdana Bold Italic" \
-    "VL Gothic" \
-    "VL PGothic" \
-    )
-
-DEVANAGARI_FONTS=( \
-    "FreeSans" \
-    "Chandas" \
-    "Kalimati" \
-    "Uttara" \
-    "Lucida Sans" \
-    "gargi Medium" \
-    "Lohit Devanagari" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "Noto Sans Devanagari Bold" \
-    "Noto Sans Devanagari" \
-    "Samyak Devanagari Medium" \
-    "Sarai" \
-    "Saral LT Bold" \
-    "Saral LT Light" \
-    "Nakula" \
-    "Sahadeva" \
-    "Samanata" \
-    "Santipur OT Medium" \
-    )
-
-KANNADA_FONTS=( \
-    "Kedage Bold" \
-    "Kedage Italic" \
-    "Kedage" \
-    "Kedage Bold Italic" \
-    "Mallige Bold" \
-    "Mallige Italic" \
-    "Mallige" \
-    "Mallige Bold Italic" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "cheluvi Medium" \
-    "Noto Sans Kannada Bold" \
-    "Noto Sans Kannada" \
-    "Lohit Kannada" \
-    "Tunga" \
-    "Tunga Bold" \
-    )
-
-TELUGU_FONTS=( \
-    "Pothana2000" \
-    "Vemana2000" \
-    "Lohit Telugu" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "Dhurjati" \
-    "Gautami Bold" \
-    "Gidugu" \
-    "Gurajada" \
-    "Lakki Reddy" \
-    "Mallanna" \
-    "Mandali" \
-    "NATS" \
-    "NTR" \
-    "Noto Sans Telugu Bold" \
-    "Noto Sans Telugu" \
-    "Peddana" \
-    "Ponnala" \
-    "Ramabhadra" \
-    "Ravi Prakash" \
-    "Sree Krushnadevaraya" \
-    "Suranna" \
-    "Suravaram" \
-    "Tenali Ramakrishna" \
-    "Gautami" \
-    )
-
-TAMIL_FONTS=( \
-    "TAMu_Kadambri" \
-    "TAMu_Kalyani" \
-    "TAMu_Maduram" \
-    "TSCu_Paranar" \
-    "TSCu_Times" \
-    "TSCu_Paranar Bold" \
-    "FreeSans" \
-    "FreeSerif" \
-    "Lohit Tamil" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "Droid Sans Tamil Bold" \
-    "Droid Sans Tamil" \
-    "Karla Tamil Inclined Bold Italic" \
-    "Karla Tamil Inclined Italic" \
-    "Karla Tamil Upright Bold" \
-    "Karla Tamil Upright" \
-    "Noto Sans Tamil Bold" \
-    "Noto Sans Tamil" \
-    "Noto Sans Tamil UI Bold" \
-    "Noto Sans Tamil UI" \
-    "TSCu_Comic Normal" \
-    "Lohit Tamil Classical" \
-    )
-
-THAI_FONTS=( \
-    "FreeSerif" \
-    "FreeSerif Italic" \
-    "Garuda" \
-    "Norasi" \
-    "Lucida Sans Typewriter" \
-    "Lucida Sans" \
-    "Garuda Oblique" \
-    "Norasi Oblique" \
-    "Norasi Italic" \
-    "Garuda Bold" \
-    "Norasi Bold" \
-    "Lucida Sans Typewriter Bold" \
-    "Lucida Sans Semi-Bold" \
-    "Garuda Bold Oblique" \
-    "Norasi Bold Italic" \
-    "Norasi Bold Oblique" \
-    "AnuParp LT Thai" \
-    "Arial Unicode MS Bold" \
-    "Arial Unicode MS" \
-    "Ascender Uni" \
-    "Loma" \
-    "Noto Serif Thai Bold" \
-    "Noto Serif Thai" \
-    "Purisa Light" \
-    "Sirichana LT Bold" \
-    "Sirichana LT" \
-    "Sukothai LT Bold" \
-    "Sukothai LT" \
-    "UtSaHaGumm LT Thai" \
-    "Tahoma" \
-    )
-
-KOREAN_FONTS=( \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Baekmuk Batang Patched" \
-    "Baekmuk Batang" \
-    "Baekmuk Dotum" \
-    "Baekmuk Gulim" \
-    "Baekmuk Headline" \
-    )
-
-CHI_SIM_FONTS=( \
-    "AR PL UKai CN" \
-    "AR PL UMing Patched Light" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "WenQuanYi Zen Hei Medium" \
-    )
-
-CHI_TRA_FONTS=( \
-    "AR PL UKai TW" \
-    "AR PL UMing TW MBE Light" \
-    "AR PL UKai Patched" \
-    "AR PL UMing Patched Light" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "WenQuanYi Zen Hei Medium" \
-    )
-
-JPN_FONTS=( \
-    "TakaoExGothic" \
-    "TakaoExMincho" \
-    "TakaoGothic" \
-    "TakaoMincho" \
-    "TakaoPGothic" \
-    "TakaoPMincho" \
-    "VL Gothic" \
-    "VL PGothic" \
-    "Noto Sans Japanese Bold" \
-    "Noto Sans Japanese Light" \
-    )
-
-RUSSIAN_FONTS=( \
-    "Arial Bold" \
-    "Arial Bold Italic" \
-    "Arial Italic" \
-    "Arial" \
-    "Courier New Bold" \
-    "Courier New Bold Italic" \
-    "Courier New Italic" \
-    "Courier New" \
-    "Times New Roman, Bold" \
-    "Times New Roman, Bold Italic" \
-    "Times New Roman, Italic" \
-    "Times New Roman," \
-    "Georgia Bold" \
-    "Georgia Italic" \
-    "Georgia" \
-    "Georgia Bold Italic" \
-    "Trebuchet MS Bold" \
-    "Trebuchet MS Bold Italic" \
-    "Trebuchet MS Italic" \
-    "Trebuchet MS" \
-    "Verdana Bold" \
-    "Verdana Italic" \
-    "Verdana" \
-    "Verdana Bold Italic" \
-    "DejaVu Serif" \
-    "DejaVu Serif Oblique" \
-    "DejaVu Serif Bold" \
-    "DejaVu Serif Bold Oblique" \
-    "Lucida Bright" \
-    "FreeSerif Bold" \
-    "FreeSerif Bold Italic" \
-    "DejaVu Sans Ultra-Light" \
-    )
-
-GREEK_FONTS=( \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "DejaVu Sans Mono" \
-    "DejaVu Sans Mono Oblique" \
-    "DejaVu Sans Mono Bold" \
-    "DejaVu Sans Mono Bold Oblique" \
-    "DejaVu Serif" \
-    "DejaVu Serif Semi-Condensed" \
-    "DejaVu Serif Oblique" \
-    "DejaVu Serif Bold" \
-    "DejaVu Serif Bold Oblique" \
-    "DejaVu Serif Bold Semi-Condensed" \
-    "FreeSerif Bold" \
-    "FreeSerif Bold Italic" \
-    "FreeSerif Italic" \
-    "FreeSerif" \
-    "GentiumAlt" \
-    "GentiumAlt Italic" \
-    "Linux Biolinum O Bold" \
-    "Linux Biolinum O" \
-    "Linux Libertine O Bold" \
-    "Linux Libertine O" \
-    "Linux Libertine O Bold Italic" \
-    "Linux Libertine O Italic" \
-    "Palatino Linotype Bold" \
-    "Palatino Linotype Bold Italic" \
-    "Palatino Linotype Italic" \
-    "Palatino Linotype" \
-    "UmePlus P Gothic" \
-    "VL PGothic" \
-    )
-
-ANCIENT_GREEK_FONTS=( \
-    "GFS Artemisia" \
-    "GFS Artemisia Bold" \
-    "GFS Artemisia Bold Italic" \
-    "GFS Artemisia Italic" \
-    "GFS Bodoni" \
-    "GFS Bodoni Bold" \
-    "GFS Bodoni Bold Italic" \
-    "GFS Bodoni Italic" \
-    "GFS Didot" \
-    "GFS Didot Bold" \
-    "GFS Didot Bold Italic" \
-    "GFS Didot Italic" \
-    "GFS DidotClassic" \
-    "GFS Neohellenic" \
-    "GFS Neohellenic Bold" \
-    "GFS Neohellenic Bold Italic" \
-    "GFS Neohellenic Italic" \
-    "GFS Philostratos" \
-    "GFS Porson" \
-    "GFS Pyrsos" \
-    "GFS Solomos" \
-    )
-
-ARABIC_FONTS=( \
-    "Arabic Transparent Bold" \
-    "Arabic Transparent" \
-    "Arab" \
-    "Arial Unicode MS Bold" \
-    "Arial Unicode MS" \
-    "ASVCodar LT Bold" \
-    "ASVCodar LT Light" \
-    "Badiya LT Bold" \
-    "Badiya LT" \
-    "Badr LT Bold" \
-    "Badr LT" \
-    "Dimnah" \
-    "Frutiger LT Arabic Bold" \
-    "Frutiger LT Arabic" \
-    "Furat" \
-    "Hassan LT Bold" \
-    "Hassan LT Light" \
-    "Jalal LT Bold" \
-    "Jalal LT Light" \
-    "Midan Bold" \
-    "Midan" \
-    "Mitra LT Bold" \
-    "Mitra LT Light" \
-    "Palatino LT Arabic" \
-    "Palatino Sans Arabic Bold" \
-    "Palatino Sans Arabic" \
-    "Simplified Arabic Bold" \
-    "Simplified Arabic" \
-    "Times New Roman, Bold" \
-    "Times New Roman," \
-    "Traditional Arabic Bold" \
-    "Traditional Arabic" \
-    )
-
-HEBREW_FONTS=( \
-    "Arial Bold" \
-    "Arial Bold Italic" \
-    "Arial Italic" \
-    "Arial" \
-    "Courier New Bold" \
-    "Courier New Bold Italic" \
-    "Courier New Italic" \
-    "Courier New" \
-    "Ergo Hebrew Semi-Bold" \
-    "Ergo Hebrew Semi-Bold Italic" \
-    "Ergo Hebrew" \
-    "Ergo Hebrew Italic" \
-    "Really No 2 LT W2G Light" \
-    "Really No 2 LT W2G Light Italic" \
-    "Really No 2 LT W2G Medium" \
-    "Really No 2 LT W2G Medium Italic" \
-    "Really No 2 LT W2G Semi-Bold" \
-    "Really No 2 LT W2G Semi-Bold Italic" \
-    "Really No 2 LT W2G Ultra-Bold" \
-    "Really No 2 LT W2G Ultra-Bold Italic" \
-    "Times New Roman, Bold" \
-    "Times New Roman, Bold Italic" \
-    "Times New Roman, Italic" \
-    "Times New Roman," \
-    "Lucida Sans" \
-    "Tahoma" \
-    )
-
-BENGALI_FONTS=( \
-    "Bangla Medium" \
-    "Lohit Bengali" \
-    "Mukti Narrow" \
-    "Mukti Narrow Bold" \
-    "Jamrul Medium Semi-Expanded" \
-    "Likhan Medium" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "FreeSans" \
-    "FreeSans Oblique" \
-    "FreeSerif" \
-    "FreeSerif Italic" \
-    "Noto Sans Bengali Bold" \
-    "Noto Sans Bengali" \
-    "Ani" \
-    "Lohit Assamese" \
-    "Lohit Bengali" \
-    "Mitra Mono" \
-    )
-
-KYRGYZ_FONTS=( \
-    "Arial" \
-    "Arial Bold" \
-    "Arial Italic" \
-    "Arial Bold Italic" \
-    "Courier New" \
-    "Courier New Bold" \
-    "Courier New Italic" \
-    "Courier New Bold Italic" \
-    "Times New Roman," \
-    "Times New Roman, Bold" \
-    "Times New Roman, Bold Italic" \
-    "Times New Roman, Italic" \
-    "DejaVu Serif" \
-    "DejaVu Serif Oblique" \
-    "DejaVu Serif Bold" \
-    "DejaVu Serif Bold Oblique" \
-    "Lucida Bright" \
-    "FreeSerif Bold" \
-    "FreeSerif Bold Italic" \
-    )
-
-PERSIAN_FONTS=( \
-    "Amiri Bold Italic" \
-    "Amiri Bold" \
-    "Amiri Italic" \
-    "Amiri" \
-    "Andale Sans Arabic Farsi" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Lateef" \
-    "Lucida Bright" \
-    "Lucida Sans Oblique" \
-    "Lucida Sans Semi-Bold" \
-    "Lucida Sans" \
-    "Lucida Sans Typewriter Bold" \
-    "Lucida Sans Typewriter Oblique" \
-    "Lucida Sans Typewriter" \
-    "Scheherazade" \
-    "Tahoma" \
-    "Times New Roman," \
-    "Times New Roman, Bold" \
-    "Times New Roman, Bold Italic" \
-    "Times New Roman, Italic" \
-    "Yakout Linotype Bold" \
-    "Yakout Linotype" \
-    )
-
-AMHARIC_FONTS=( \
-    "Abyssinica SIL" \
-    "Droid Sans Ethiopic Bold" \
-    "Droid Sans Ethiopic" \
-    "FreeSerif" \
-    "Noto Sans Ethiopic Bold" \
-    "Noto Sans Ethiopic" \
-    )
-
-ARMENIAN_FONTS=( \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "FreeMono" \
-    "FreeMono Italic" \
-    "FreeSans" \
-    "FreeSans Bold" \
-    "FreeSans Oblique" \
-    )
-
-BURMESE_FONTS=( \
-    "Myanmar Sans Pro" \
-    "Noto Sans Myanmar Bold" \
-    "Noto Sans Myanmar" \
-    "Padauk Bold" \
-    "Padauk" \
-    "TharLon" \
-    )
-
-JAVANESE_FONTS=( \
-    "Prada" \
-    )
-
-NORTH_AMERICAN_ABORIGINAL_FONTS=( \
-    "Aboriginal Sans" \
-    "Aboriginal Sans Bold Italic" \
-    "Aboriginal Sans Italic" \
-    "Aboriginal Sans Bold" \
-    "Aboriginal Serif Bold" \
-    "Aboriginal Serif Bold Italic" \
-    "Aboriginal Serif Italic" \
-    "Aboriginal Serif" \
-    )
-
-GEORGIAN_FONTS=( \
-    "Arial Unicode MS Bold" \
-    "Arial Unicode MS" \
-    "BPG Algeti GPL\&GNU" \
-    "BPG Chveulebrivi GPL\&GNU" \
-    "BPG Courier GPL\&GNU" \
-    "BPG Courier S GPL\&GNU" \
-    "BPG DejaVu Sans 2011 GNU-GPL" \
-    "BPG Elite GPL\&GNU" \
-    "BPG Excelsior GPL\&GNU" \
-    "BPG Glaho GPL\&GNU" \
-    "BPG Gorda GPL\&GNU" \
-    "BPG Ingiri GPL\&GNU" \
-    "BPG Mrgvlovani Caps GNU\&GPL" \
-    "BPG Mrgvlovani GPL\&GNU" \
-    "BPG Nateli Caps GPL\&GNU Light" \
-    "BPG Nateli Condenced GPL\&GNU Light" \
-    "BPG Nateli GPL\&GNU Light" \
-    "BPG Nino Medium Cond GPL\&GNU" \
-    "BPG Nino Medium GPL\&GNU Medium" \
-    "BPG Sans GPL\&GNU" \
-    "BPG Sans Medium GPL\&GNU" \
-    "BPG Sans Modern GPL\&GNU" \
-    "BPG Sans Regular GPL\&GNU" \
-    "BPG Serif GPL\&GNU" \
-    "BPG Serif Modern GPL\&GNU" \
-    "FreeMono" \
-    "FreeMono Bold Italic" \
-    "FreeSans" \
-    "FreeSerif" \
-    "FreeSerif Bold" \
-    "FreeSerif Bold Italic" \
-    "FreeSerif Italic" \
-    )
-
-OLD_GEORGIAN_FONTS=( \
-    "Arial Unicode MS Bold" \
-    "Arial Unicode MS" \
-    "BPG Algeti GPL\&GNU" \
-    "BPG Courier S GPL\&GNU" \
-    "BPG DejaVu Sans 2011 GNU-GPL" \
-    "BPG Elite GPL\&GNU" \
-    "BPG Excelsior GPL\&GNU" \
-    "BPG Glaho GPL\&GNU" \
-    "BPG Ingiri GPL\&GNU" \
-    "BPG Mrgvlovani Caps GNU\&GPL" \
-    "BPG Mrgvlovani GPL\&GNU" \
-    "BPG Nateli Caps GPL\&GNU Light" \
-    "BPG Nateli Condenced GPL\&GNU Light" \
-    "BPG Nateli GPL\&GNU Light" \
-    "BPG Nino Medium Cond GPL\&GNU" \
-    "BPG Nino Medium GPL\&GNU Medium" \
-    "BPG Sans GPL\&GNU" \
-    "BPG Sans Medium GPL\&GNU" \
-    "BPG Sans Modern GPL\&GNU" \
-    "BPG Sans Regular GPL\&GNU" \
-    "BPG Serif GPL\&GNU" \
-    "BPG Serif Modern GPL\&GNU" \
-    "FreeSans" \
-    "FreeSerif" \
-    "FreeSerif Bold" \
-    "FreeSerif Bold Italic" \
-    "FreeSerif Italic" \
-    )
-
-KHMER_FONTS=( \
-    "Khmer OS" \
-    "Khmer OS System" \
-    "Khmer OS Battambang" \
-    "Khmer OS Bokor" \
-    "Khmer OS Content" \
-    "Khmer OS Fasthand" \
-    "Khmer OS Freehand" \
-    "Khmer OS Metal Chrieng" \
-    "Khmer OS Muol Light" \
-    "Khmer OS Muol Pali" \
-    "Khmer OS Muol" \
-    "Khmer OS Siemreap" \
-    "Noto Sans Bold" \
-    "Noto Sans" \
-    "Noto Serif Khmer Bold" \
-    "Noto Serif Khmer Light" \
-    )
-
-KURDISH_FONTS=( \
-    "Amiri Bold Italic" \
-    "Amiri Bold" \
-    "Amiri Italic" \
-    "Amiri" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Lateef" \
-    "Lucida Bright" \
-    "Lucida Sans Oblique" \
-    "Lucida Sans Semi-Bold" \
-    "Lucida Sans" \
-    "Lucida Sans Typewriter Bold" \
-    "Lucida Sans Typewriter Oblique" \
-    "Lucida Sans Typewriter" \
-    "Scheherazade" \
-    "Tahoma" \
-    "Times New Roman," \
-    "Times New Roman, Bold" \
-    "Times New Roman, Bold Italic" \
-    "Times New Roman, Italic" \
-    "Unikurd Web" \
-    "Yakout Linotype Bold" \
-    "Yakout Linotype" \
-    )
-
-LAOTHIAN_FONTS=( \
-    "Phetsarath OT" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "Dhyana Bold" \
-    "Dhyana" \
-    "Lao Muang Don" \
-    "Lao Muang Khong" \
-    "Lao Sans Pro" \
-    "Noto Sans Lao Bold" \
-    "Noto Sans Lao" \
-    "Noto Sans Lao UI Bold" \
-    "Noto Sans Lao UI" \
-    "Noto Serif Lao Bold" \
-    "Noto Serif Lao" \
-    "Phetsarath Bold" \
-    "Phetsarath" \
-    "Souliyo Unicode" \
-)
-
-GUJARATI_FONTS=( \
-    "Lohit Gujarati" \
-    "Rekha Medium" \
-    "Samyak Gujarati Medium" \
-    "aakar Medium" \
-    "padmaa Bold" \
-    "padmaa Medium" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "FreeSans" \
-    "Noto Sans Gujarati Bold" \
-    "Noto Sans Gujarati" \
-    "Shruti" \
-    "Shruti Bold" \
-    )
-
-MALAYALAM_FONTS=( \
-    "AnjaliOldLipi" \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "Dyuthi" \
-    "FreeSerif" \
-    "Kalyani" \
-    "Kartika" \
-    "Kartika Bold" \
-    "Lohit Malayalam" \
-    "Meera" \
-    "Noto Sans Malayalam Bold" \
-    "Noto Sans Malayalam" \
-    "Rachana" \
-    "Rachana_w01" \
-    "RaghuMalayalam" \
-    "suruma" \
-    )
-
-ORIYA_FONTS=( \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "ori1Uni Medium" \
-    "Samyak Oriya Medium" \
-    "Lohit Oriya" \
-    )
-
-PUNJABI_FONTS=( \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "Saab" \
-    "Lohit Punjabi" \
-    "Noto Sans Gurmukhi" \
-    "Noto Sans Gurmukhi Bold" \
-    "FreeSans" \
-    "FreeSans Bold" \
-    "FreeSerif" \
-    )
-
-SINHALA_FONTS=( \
-    "Noto Sans Sinhala Bold" \
-    "Noto Sans Sinhala" \
-    "OCRUnicode" \
-    "Yagpo" \
-    "LKLUG" \
-    "FreeSerif" \
-    )
-
-SYRIAC_FONTS=( \
-    "East Syriac Adiabene" \
-    "East Syriac Ctesiphon" \
-    "Estrangelo Antioch" \
-    "Estrangelo Edessa" \
-    "Estrangelo Midyat" \
-    "Estrangelo Nisibin" \
-    "Estrangelo Quenneshrin" \
-    "Estrangelo Talada" \
-    "Estrangelo TurAbdin" \
-    "Serto Batnan Bold" \
-    "Serto Batnan" \
-    "Serto Jerusalem Bold" \
-    "Serto Jerusalem Italic" \
-    "Serto Jerusalem" \
-    "Serto Kharput" \
-    "Serto Malankara" \
-    "Serto Mardin Bold" \
-    "Serto Mardin" \
-    "Serto Urhoy Bold" \
-    "Serto Urhoy" \
-    "FreeSans" \
-    )
-
-THAANA_FONTS=( \
-    "FreeSerif" \
-    )
-
-TIBETAN_FONTS=( \
-    "Arial Unicode MS" \
-    "Arial Unicode MS Bold" \
-    "Ascender Uni" \
-    "DDC Uchen" \
-    "Jomolhari" \
-    "Kailasa" \
-    "Kokonor" \
-    "Tibetan Machine Uni" \
-    "TibetanTsugRing" \
-    "Yagpo" \
-    )
-
-# The following fonts will be rendered vertically in phase I.
-VERTICAL_FONTS=( \
-    "TakaoExGothic" \ # for jpn
-    "TakaoExMincho" \ # for jpn
-    "AR PL UKai Patched" \ # for chi_tra
-    "AR PL UMing Patched Light" \ # for chi_tra
-    "Baekmuk Batang Patched" \ # for kor
-    )
-
-FLAGS_webtext_prefix=${FLAGS_webtext_prefix:-}
-
-# Set language-specific values for several global variables, including
-#   ${TEXT_CORPUS}
-#      holds the text corpus file for the language, used in phase F
-#   ${FONTS[@]}
-#      holds a sequence of applicable fonts for the language, used in
-#      phase F & I. only set if not already set, i.e. from command line
-#   ${TRAINING_DATA_ARGUMENTS}
-#      non-default arguments to the training_data program used in phase T
-#   ${FILTER_ARGUMENTS} -
-#      character-code-specific filtering to distinguish between scripts
-#      (eg. CJK) used by filter_borbidden_characters in phase F
-#   ${WORDLIST2DAWG_ARGUMENTS}
-#      specify fixed length dawg generation for non-space-delimited lang
-# TODO(dsl): We can refactor these into functions that assign FONTS,
-# TEXT_CORPUS, etc. separately.
-set_lang_specific_parameters() {
-  local lang=$1
-  # The default text location is now given directly from the language code.
-  TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt"
-  FILTER_ARGUMENTS=""
-  WORDLIST2DAWG_ARGUMENTS=""
-  # These dawg factors represent the fraction of the corpus not covered by the
-  # dawg, and seem like reasonable defaults, but the optimal value is likely
-  # to be highly corpus-dependent, as well as somewhat language-dependent.
-  # Number dawg factor is the fraction of all numeric strings that are not
-  # covered, which is why it is higher relative to the others.
-  PUNC_DAWG_FACTOR=
-  NUMBER_DAWG_FACTOR=0.125
-  WORD_DAWG_FACTOR=0.05
-  BIGRAM_DAWG_FACTOR=0.015
-  TRAINING_DATA_ARGUMENTS=""
-  FRAGMENTS_DISABLED="y"
-  RUN_SHAPE_CLUSTERING=false
-  AMBIGS_FILTER_DENOMINATOR="100000"
-  LEADING="32"
-  MEAN_COUNT="40"  # Default for latin script.
-  # Language to mix with the language for maximum accuracy. Defaults to eng.
-  # If no language is good, set to the base language.
-  MIX_LANG="eng"
-  EXPOSURES=${EXPOSURES:-}
-  FONTS=${FONTS:-}
-
-  case ${lang} in
-    # Latin languages.
-    enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures"   # Add ligatures when supported
-          test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
-    frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt"
-          # Make long-s substitutions for Middle French text
-          FILTER_ARGUMENTS="--make_early_language_variant=fra"
-          TEXT2IMAGE_EXTRA_ARGS=" --ligatures"   # Add ligatures when supported.
-          test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
-    frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt"
-          test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );;
-    ita_old )
-          TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt"
-          # Make long-s substitutions for Early Italian text
-          FILTER_ARGUMENTS="--make_early_language_variant=ita"
-          TEXT2IMAGE_EXTRA_ARGS=" --ligatures"   # Add ligatures when supported.
-          test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
-    lat )
-          test -z "$EXPOSURES" && EXPOSURES="-3 -2 -1 0 1 2 3"
-          test -z "$FONTS" && FONTS=( "${NEOLATIN_FONTS[@]}" ) ;;
-    spa_old )
-          TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt"
-          # Make long-s substitutions for Early Spanish text
-          FILTER_ARGUMENTS="--make_early_language_variant=spa"
-          TEXT2IMAGE_EXTRA_ARGS=" --ligatures"  # Add ligatures when supported.
-          test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
-    srp_latn )
-          TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;;
-    vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;;
-    # Highly inflective languages get a bigger dawg size.
-    # TODO(rays) Add more here!
-    hun ) WORD_DAWG_SIZE=1000000 ;;
-    pol ) WORD_DAWG_SIZE=1000000 ;;
-
-    # Latin with default treatment.
-    afr ) ;;
-    aze ) ;;
-    bos ) ;;
-    cat ) ;;
-    ceb ) ;;
-    ces ) PUNC_DAWG_FACTOR=0.004 ;;
-    cym ) ;;
-    dan ) ;;
-    deu ) WORD_DAWG_FACTOR=0.125 ;;
-    eng ) WORD_DAWG_FACTOR=0.03 ;;
-    epo ) ;;
-    est ) ;;
-    eus ) ;;
-    fil ) ;;
-    fin ) ;;
-    fra ) WORD_DAWG_FACTOR=0.08 ;;
-    gle ) ;;
-    gle_uncial ) test -z "$FONTS" && FONTS=( "${IRISH_UNCIAL_FONTS[@]}" );;
-    glg ) ;;
-    hat ) ;;
-    hrv ) ;;
-    iast ) ;;
-    ind ) ;;
-    isl ) ;;
-    ita ) ;;
-    jav ) ;;
-    lav ) ;;
-    lit ) ;;
-    mlt ) ;;
-    msa ) ;;
-    nld ) WORD_DAWG_FACTOR=0.02 ;;
-    nor ) ;;
-    por ) ;;
-    ron ) ;;
-    slk ) ;;
-    slv ) ;;
-    spa ) ;;
-    sqi ) ;;
-    swa ) ;;
-    swe ) ;;
-    tgl ) ;;
-    tur ) ;;
-    uzb ) ;;
-    zlm ) ;;
-
-    # Special code for performing language-id that is trained on
-    # EFIGS+Latin+Vietnamese text with regular + fraktur fonts.
-    lat_lid )
-          TEXT_CORPUS=${FLAGS_webtext_prefix}/lat_lid.corpus.txt
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          GENERATE_WORD_BIGRAMS=0
-          # Strip unrenderable words as not all fonts will render the extended
-          # latin symbols found in Vietnamese text.
-          WORD_DAWG_SIZE=1000000
-          test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
-
-    # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic.
-    rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" )
-          MIX_LANG="rus"
-          NUMBER_DAWG_FACTOR=0.05
-          WORD_DAWG_SIZE=1000000 ;;
-    aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl )
-          MIX_LANG="${lang}"
-          test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
-
-    # Special code for performing Cyrillic language-id that is trained on
-    # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
-    # text with the list of Russian fonts.
-    cyr_lid )
-          TEXT_CORPUS=${FLAGS_webtext_prefix}/cyr_lid.corpus.txt
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          GENERATE_WORD_BIGRAMS=0
-          WORD_DAWG_SIZE=1000000
-          test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );;
-
-    # South Asian scripts mostly have a lot of different graphemes, so trim
-    # down the MEAN_COUNT so as not to get a huge amount of text.
-    asm | ben )
-          MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;;
-    bih | hin | mar | nep | san )
-          MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
-    bod ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
-    dzo )
-          WORD_DAWG_FACTOR=0.01
-          test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
-    guj ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
-    kan ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
-          TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
-          test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;;
-    mal ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
-          TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
-          test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
-    ori )
-          WORD_DAWG_FACTOR=0.01
-          test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;;
-    pan ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.01
-          test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
-    sin ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.01
-          test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;;
-    tam ) MEAN_COUNT="30"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
-          TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
-          test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;;
-    tel ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
-          TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
-          test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
-
-    # SouthEast Asian scripts.
-    jav_java ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          test -z "$FONTS" && FONTS=( "${JAVANESE_FONTS[@]}" ) ;;
-    khm ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;;
-    lao ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
-    mya ) MEAN_COUNT="12"
-          WORD_DAWG_FACTOR=0.15
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;;
-    tha ) MEAN_COUNT="30"
-          WORD_DAWG_FACTOR=0.01
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          FILTER_ARGUMENTS="--segmenter_lang=tha"
-          TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
-          AMBIGS_FILTER_DENOMINATOR="1000"
-          LEADING=48
-          test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;;
-
-    # CJK
-    chi_sim )
-          MEAN_COUNT="15"
-          PUNC_DAWG_FACTOR=0.015
-          WORD_DAWG_FACTOR=0.015
-          GENERATE_WORD_BIGRAMS=0
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
-          FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim"
-          test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
-    chi_tra )
-          MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.015
-          GENERATE_WORD_BIGRAMS=0
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
-          FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra"
-          test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
-    jpn ) MEAN_COUNT="15"
-          WORD_DAWG_FACTOR=0.015
-          GENERATE_WORD_BIGRAMS=0
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
-          FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn"
-          test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;;
-    kor ) MEAN_COUNT="20"
-          WORD_DAWG_FACTOR=0.015
-          NUMBER_DAWG_FACTOR=0.05
-          TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
-          TRAINING_DATA_ARGUMENTS+=" --desired_bigrams="
-          GENERATE_WORD_BIGRAMS=0
-          FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor"
-          test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;;
-
-    # Middle-Eastern scripts.
-    ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;;
-    div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;;
-    fas | pus | snd | uig | urd )
-          test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
-    heb | yid )
-          NUMBER_DAWG_FACTOR=0.05
-          WORD_DAWG_FACTOR=0.08
-          test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;;
-    syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
-
-    # Other scripts.
-    amh | tir)
-          test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;;
-    chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \
-                  "Noto Sans Cherokee" \
-                ) ;;
-    ell )
-          NUMBER_DAWG_FACTOR=0.05
-          WORD_DAWG_FACTOR=0.08
-          test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;;
-    grc )
-          test -z "$EXPOSURES" && EXPOSURES="-3 -2 -1 0 1 2 3"
-          test -z "$FONTS" && FONTS=( "${ANCIENT_GREEK_FONTS[@]}" ) ;;
-    hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
-    iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;
-    kat)  test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;;
-    kat_old)
-          TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt"
-          test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;;
-    kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" )
-          TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;;
-    kmr ) test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) ;;
-    kur_ara ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;;
-
-    *) err_exit "Error: ${lang} is not a valid language code"
-  esac
-  if [[ ${FLAGS_mean_count:-} -gt 0 ]]; then
-    TRAINING_DATA_ARGUMENTS+=" --mean_count=${FLAGS_mean_count}"
-  elif [[ ! -z ${MEAN_COUNT:-} ]]; then
-    TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}"
-  fi
-  # Default to Latin fonts if none have been set
-  test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
-
-  # Default to 0 exposure if it hasn't been set
-  test -z "$EXPOSURES" && EXPOSURES=0
-  # Set right-to-left and normalization mode.
-  case "${LANG_CODE}" in
-    ara | div| fas | pus | snd | syr | uig | urd | kur_ara | heb | yid )
-      LANG_IS_RTL="1"
-      NORM_MODE="2" ;;
-    asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
-    dzo | sin | san | bod | ori | khm | mya | tha | lao | jav  | jav_java)
-      LANG_IS_RTL="0"
-      NORM_MODE="2" ;;
-    * )
-      LANG_IS_RTL="0"
-      NORM_MODE="1" ;;
-  esac
-}
-
-#=============================================================================
-# END of Language specific info
-#=============================================================================
diff --git a/src/training/tesstrain.sh b/src/training/tesstrain.sh
deleted file mode 100755
index afc98f6a..00000000
--- a/src/training/tesstrain.sh
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/bin/bash
-# (C) Copyright 2014, Google Inc.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# This script provides an easy way to execute various phases of training
-# Tesseract.  For a detailed description of the phases, see
-# https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html.
-#
-
-display_usage() {
-echo -e 'USAGE: tesstrain.sh
-     --exposures EXPOSURES      # A list of exposure levels to use (e.g. "-1 0 1").
-     --fontlist FONTS           # A list of fontnames to train on.
-     --fonts_dir FONTS_PATH     # Path to font files.
-     --lang LANG_CODE           # ISO 639 code.
-     --langdata_dir DATADIR     # Path to tesseract/training/langdata directory.
-     --linedata_only            # Only generate training data for lstmtraining.
-     --output_dir OUTPUTDIR     # Location of output traineddata file.
-     --overwrite                # Safe to overwrite files in output_dir.
-     --run_shape_clustering     # Run shape clustering (use for Indic langs).
-     --maxpages                 # Specify maximum pages to output (default:0=all)
-     --save_box_tiff            # Save box/tiff pairs along with lstmf files.
-     --xsize                    # Specify width of output image (default:3600)
-
-  OPTIONAL flag for specifying directory with user specified box/tiff pairs.
-  Files should be named similar to ${LANG_CODE}.${fontname}.exp${EXPOSURE}.box/tif
-     --my_boxtiff_dir MY_BOXTIFF_DIR # Location of user specified box/tiff files.
-
-  OPTIONAL flags for input data. If unspecified we will look for them in
-  the langdata_dir directory.
-     --training_text TEXTFILE   # Text to render and use for training.
-     --wordlist WORDFILE        # Word list for the language ordered by
-                                # decreasing frequency.
-  OPTIONAL flag to specify location of existing traineddata files, required
-  during feature extraction. If unspecified will use TESSDATA_PREFIX defined in
-  the current environment.
-     --tessdata_dir TESSDATADIR     # Path to tesseract/tessdata directory.
-  NOTE:
-  The font names specified in --fontlist need to be recognizable by Pango using
-  fontconfig. An easy way to list the canonical names of all fonts available on
-  your system is to run text2image with --list_available_fonts and the
-  appropriate --fonts_dir path.'
-}
-
-source "$(dirname $0)/tesstrain_utils.sh"
-if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
-    display_usage
-    exit 0
-fi
-if [ $# == 0 ]; then
-    display_usage
-    exit 1
-fi
-
-ARGV=("$@")
-parse_flags
-
-mkdir -p ${TRAINING_DIR}
-
-if [[ ${MY_BOXTIFF_DIR} != "" ]]; then
-    tlog "\n=== Copy existing box/tiff pairs from '${MY_BOXTIFF_DIR}'"
-    cp  ${MY_BOXTIFF_DIR}/*.box ${TRAINING_DIR} | true
-    cp  ${MY_BOXTIFF_DIR}/*.tif ${TRAINING_DIR} | true
-    ls -l  ${TRAINING_DIR}
-fi
-
-tlog "\n=== Starting training for language '${LANG_CODE}'"
-
-source "$(dirname $0)/language-specific.sh"
-set_lang_specific_parameters ${LANG_CODE}
-
-initialize_fontconfig
-
-phase_I_generate_image 8
-phase_UP_generate_unicharset
-if $LINEDATA; then
-  phase_E_extract_features "  lstm.train " 8 "lstmf"
-  make__lstmdata
-  tlog "\nCreated starter traineddata for LSTM training of language '${LANG_CODE}'\n"
-  tlog "\nRun 'lstmtraining' command to continue LSTM training for language '${LANG_CODE}'\n"
-else
-  phase_D_generate_dawg
-  phase_E_extract_features "box.train" 8 "tr"
-  phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"
-  phase_S_cluster_shapes
-  phase_M_cluster_microfeatures
-  phase_B_generate_ambiguities
-  make__traineddata
-  tlog "\nCompleted training for language '${LANG_CODE}'\n"
-fi
diff --git a/src/training/tesstrain_utils.sh b/src/training/tesstrain_utils.sh
deleted file mode 100644
index 9e0c9637..00000000
--- a/src/training/tesstrain_utils.sh
+++ /dev/null
@@ -1,632 +0,0 @@
-#!/bin/bash
-# (C) Copyright 2014, Google Inc.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# This script defines functions that are used by tesstrain.sh
-# For a detailed description of the phases, see
-# https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html.
-#
-# USAGE: source tesstrain_utils.sh
-
-if [ -n "$BASH_VERSION" ];then
-  set -u  # comment in case of "unbound variable" error or fix the code
-  set -eo pipefail;
-else
-   echo "Warning: you aren't running script in bash - expect problems..."
- fi
-
-UNAME=$(uname -s | tr 'A-Z' 'a-z')
-
-FONT_CONFIG_CACHE=$(mktemp -d -t font_tmp.XXXXXXXXXX)
-
-if [[ ($UNAME == *darwin*) ]]; then
-    FONTS_DIR="/Library/Fonts/"
-else
-    FONTS_DIR="/usr/share/fonts/"
-fi
-
-DISTORT_IMAGE=false
-EXTRACT_FONT_PROPERTIES=false
-LINEDATA=false
-MAX_PAGES=0
-MY_BOXTIFF_DIR=""
-OUTPUT_DIR="/tmp/tesstrain/tessdata"
-OVERWRITE=false
-RUN_SHAPE_CLUSTERING=false
-SAVE_BOX_TIFF=false
-WORKSPACE_DIR=$(mktemp -d)
-X_SIZE=3600
-PT_SIZE=12
-
-# set TESSDATA_PREFIX as empty, if not defined in environment to avoid an unbound variable
-TESSDATA_PREFIX=${TESSDATA_PREFIX:-}
-
-# Logging helper functions.
-tlog() {
-    if test -z "${LOG_FILE:-}"; then
-        echo -e $*
-    else
-        echo -e $* | tee -a ${LOG_FILE}
-    fi
-}
-
-err_exit() {
-    if test -z "${LOG_FILE:-}"; then
-        echo -e "ERROR: "$*
-    else
-        echo -e "ERROR: "$* | tee -a ${LOG_FILE}
-    fi
-    exit 1
-}
-
-# Helper function to run a command and append its output to a log. Aborts early
-# if the program file is not found.
-# Usage: run_command CMD ARG1 ARG2...
-run_command() {
-    local cmd
-    cmd=$(which $1 || \
-              for d in api training; do
-                  which $d/$1 && break
-              done) || err_exit "'$1' not found"
-    shift
-    tlog "[$(date)] ${cmd} $@"
-    if ! "${cmd}" "$@" 2>&1 | tee -a "${LOG_FILE}"; then
-        err_exit "Program $(basename ${cmd}) failed. Abort. Command line: ${cmd} $@"
-    fi
-}
-
-# Check if all the given files exist, or exit otherwise.
-# Used to check required input files and produced output files in each phase.
-# Usage: check_file_readable FILE1 FILE2...
-check_file_readable() {
-    for file in $@; do
-        if [[ ! -r ${file} ]]; then
-            err_exit "${file} does not exist or is not readable"
-        fi
-    done
-}
-
-# Sets the named variable to given value. Aborts if the value is missing or
-# if it looks like a flag.
-# Usage: parse_value VAR_NAME VALUE
-parse_value() {
-    local val="${2:-}"
-    if [[ -z "$val" ]]; then
-        err_exit "Missing value for variable $1"
-        exit
-    fi
-    if [[ ${val:0:2} == "--" ]]; then
-        err_exit "Invalid value $val passed for variable $1"
-        exit
-    fi
-    eval $1=\"$val\"
-}
-
-# Does simple command-line parsing and initialization.
-parse_flags() {
-    local i=0
-    while test $i -lt ${#ARGV[@]}; do
-        local j=$((i+1))
-        case ${ARGV[$i]} in
-            --)
-                break;;
-            --fontlist)
-                fn=0
-                FONTS=""
-                while test $j -lt ${#ARGV[@]}; do
-                    test -z "${ARGV[$j]}" && break
-                    test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break
-                    FONTS[$fn]="${ARGV[$j]}"
-                    fn=$((fn+1))
-                    j=$((j+1))
-                done
-                i=$((j-1)) ;;
-            --exposures)
-                exp=""
-                while test $j -lt ${#ARGV[@]}; do
-                    test -z "${ARGV[$j]}" && break
-                    test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break
-                    exp="$exp ${ARGV[$j]}"
-                    j=$((j+1))
-                done
-                parse_value "EXPOSURES" "$exp"
-                i=$((j-1)) ;;
-            --fonts_dir)
-                parse_value "FONTS_DIR" ${ARGV[$j]:-}
-                i=$j ;;
-	    --tmp_dir)
-		parse_value "TMP_DIR"   ${ARGV[$j]:-}
-		i=$j ;;
-            --lang)
-                parse_value "LANG_CODE" ${ARGV[$j]:-}
-                i=$j ;;
-            --langdata_dir)
-                parse_value "LANGDATA_ROOT" ${ARGV[$j]:-}
-                i=$j ;;
-            --maxpages)
-                parse_value "MAX_PAGES" ${ARGV[$j]:-}
-                i=$j ;;
-            --ptsize)
-                parse_value "PT_SIZE" ${ARGV[$j]:-}
-                i=$j ;;
-            --my_boxtiff_dir)
-                parse_value "MY_BOXTIFF_DIR" ${ARGV[$j]:-}
-                i=$j ;;
-            --distort_image)
-                DISTORT_IMAGE=true ;;
-            --output_dir)
-                parse_value "OUTPUT_DIR" ${ARGV[$j]:-}
-                i=$j ;;
-            --overwrite)
-                OVERWRITE=true ;;
-            --save_box_tiff)
-                SAVE_BOX_TIFF=true ;;
-            --linedata_only)
-                LINEDATA=true ;;
-            --extract_font_properties)
-                EXTRACT_FONT_PROPERTIES=true ;;
-            --noextract_font_properties)
-                EXTRACT_FONT_PROPERTIES=false ;;
-            --tessdata_dir)
-                parse_value "TESSDATA_DIR" ${ARGV[$j]:-}
-                i=$j ;;
-            --training_text)
-                parse_value "TRAINING_TEXT" "${ARGV[$j]:-}"
-                i=$j ;;
-            --wordlist)
-                parse_value "WORDLIST_FILE" ${ARGV[$j]:-}
-                i=$j ;;
-            --workspace_dir)
-                rmdir "$FONT_CONFIG_CACHE"
-                rmdir "$WORKSPACE_DIR"
-                parse_value "WORKSPACE_DIR" ${ARGV[$j]:-}
-                FONT_CONFIG_CACHE=$WORKSPACE_DIR/fc-cache
-                mkdir -p $FONT_CONFIG_CACHE
-                i=$j ;;
-            --xsize)
-                parse_value "X_SIZE" ${ARGV[$j]:-}
-                i=$j ;;
-            *)
-                err_exit "Unrecognized argument ${ARGV[$i]}" ;;
-        esac
-        i=$((i+1))
-    done
-    if [[ -z ${LANG_CODE:-} ]]; then
-        err_exit "Need to specify a language --lang"
-    fi
-    if [[ -z ${LANGDATA_ROOT:-} ]]; then
-        err_exit "Need to specify path to language files --langdata_dir"
-    fi
-    if [[ -z ${TESSDATA_DIR:-} ]]; then
-        if [[ -z ${TESSDATA_PREFIX} ]]; then
-            err_exit "Need to specify a --tessdata_dir or have a "\
-        "TESSDATA_PREFIX variable defined in your environment"
-        else
-            TESSDATA_DIR="${TESSDATA_PREFIX}"
-        fi
-    fi
-    if [[ ! -d "${OUTPUT_DIR}" ]]; then
-        tlog "Creating new directory ${OUTPUT_DIR}"
-        mkdir -p "${OUTPUT_DIR}"
-    fi
-
-    # Location where intermediate files will be created.
-    TIMESTAMP=$(date +%Y-%m-%d)
-    if [[ -z ${TMP_DIR:-} ]]; then
-        TMP_DIR=$(mktemp -d -t ${LANG_CODE}-${TIMESTAMP}.XXX)
-    else
-        TMP_DIR=$(mktemp -d -p ${TMP_DIR} -t ${LANG_CODE}-${TIMESTAMP}.XXX)
-    fi
-    TRAINING_DIR=${TMP_DIR}
-    # Location of log file for the whole run.
-    LOG_FILE=${TRAINING_DIR}/tesstrain.log
-
-    # Take training text and wordlist from the langdata directory if not
-    # specified in the command-line.
-    TRAINING_TEXT=${TRAINING_TEXT:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text}
-    WORDLIST_FILE=${WORDLIST_FILE:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist}
-
-    WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams
-    NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
-    PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
-    BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
-    UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
-    TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
-    GENERATE_DAWGS=1
-}
-
-# Function initializes font config with a unique font cache dir.
-initialize_fontconfig() {
-    export FONT_CONFIG_CACHE
-    local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
-    echo "Text" >${sample_path}
-    run_command text2image --fonts_dir=${FONTS_DIR} --ptsize ${PT_SIZE} \
-        --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \
-        --fontconfig_tmpdir=${FONT_CONFIG_CACHE}
-}
-
-# Helper function for phaseI_generate_image. Generates the image for a single
-# language/font combination in a way that can be run in parallel.
-generate_font_image() {
-    local font="$1"
-    tlog "Rendering using ${font}"
-    local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
-    local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
-
-    local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}"
-    common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
-    common_args+=" --leading=${LEADING} --xsize=${X_SIZE}"
-    common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
-    common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}"
-    if $DISTORT_IMAGE; then
-        common_args+=" --distort_image --invert=false"
-    fi
-
-    # add --writing_mode=vertical-upright to common_args if the font is
-    # specified to be rendered vertically.
-    for vfont in "${VERTICAL_FONTS[@]}"; do
-      if [[ "${font}" == "${vfont}" ]]; then
-        common_args+=" --writing_mode=vertical-upright "
-        break
-      fi
-    done
-
-    run_command text2image ${common_args} --font="${font}" --ptsize ${PT_SIZE} \
-        --text=${TRAINING_TEXT}  ${TEXT2IMAGE_EXTRA_ARGS:-}
-    check_file_readable ${outbase}.box ${outbase}.tif
-
-    if $EXTRACT_FONT_PROPERTIES &&
-        [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
-        tlog "Extracting font properties of ${font}"
-        run_command text2image ${common_args} --font="${font}" \
-            --ligatures=false --text=${TRAIN_NGRAMS_FILE} \
-            --only_extract_font_properties --ptsize=32
-        check_file_readable ${outbase}.fontinfo
-    fi
-}
-
-# Phase I : Generate (I)mages from training text for each font.
-phase_I_generate_image() {
-    local par_factor=${1:-}
-    if ! [[ "${par_factor}" -gt 0 ]]; then
-        par_factor=1
-    fi
-    tlog "\n=== Phase I: Generating training images ==="
-    if [[ -z ${TRAINING_TEXT:-} ]] || test ! -r "${TRAINING_TEXT}"; then
-        err_exit "Could not find training text file ${TRAINING_TEXT:-}"
-    fi
-    CHAR_SPACING="0.0"
-
-    for EXPOSURE in $EXPOSURES; do
-        if $EXTRACT_FONT_PROPERTIES && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
-            # Parse .bigram_freqs file and compose a .train_ngrams file with text
-            # for tesseract to recognize during training. Take only the ngrams whose
-            # combined weight accounts for 95% of all the bigrams in the language.
-            NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
-                | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
-            sort -rnk2 ${BIGRAM_FREQS_FILE} \
-                | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
-                x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
-            check_file_readable ${TRAIN_NGRAMS_FILE}
-        fi
-
-        local jobs=
-        trap "kill $$" INT
-        for font in "${FONTS[@]}"; do
-            sleep 1
-            test $(jobs -r | wc -l) -ge $par_factor && wait -n
-            generate_font_image "${font}" &
-            jobs="$jobs $!"
-        done
-        wait $jobs
-        # Check that each process was successful.
-        for font in "${FONTS[@]}"; do
-            local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
-            local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
-            check_file_readable ${outbase}.box ${outbase}.tif
-        done
-    done
-    if $SAVE_BOX_TIFF && ( ! $LINEDATA ) ; then
-    tlog "\n=== Saving box/tiff pairs for training data ==="
-        for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do
-            tlog "Moving ${f} to ${OUTPUT_DIR}"
-            cp "${f}" "${OUTPUT_DIR}"
-        done
-        for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do
-            tlog "Moving ${f} to ${OUTPUT_DIR}"
-            cp "${f}" "${OUTPUT_DIR}"
-        done
-    fi
-}
-
-# Phase UP : Generate (U)nicharset and (P)roperties file.
-phase_UP_generate_unicharset() {
-    tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
-
-    local box_files=$(ls ${TRAINING_DIR}/*.box)
-    UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
-    if [[ "${NORM_MODE}" == "2" ]] && [[ "${LANG_IS_RTL}" == "0" ]] ; then
-          run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
-               --norm_mode "${NORM_MODE}" ${TRAINING_TEXT}
-    else
-          run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
-               --norm_mode "${NORM_MODE}" ${box_files}
-    fi
-    check_file_readable ${UNICHARSET_FILE}
-
-    XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
-    run_command set_unicharset_properties \
-        -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
-        --script_dir=${LANGDATA_ROOT}
-    check_file_readable ${XHEIGHTS_FILE}
-}
-
-# Phase D : Generate (D)awg files from unicharset file and wordlist files
-phase_D_generate_dawg() {
-    tlog "\n=== Phase D: Generating Dawg files ==="
-
-    # Skip if requested
-    if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
-      tlog "Skipping ${phase_name}"
-      return
-    fi
-
-    # Output files
-    WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
-    FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
-    PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
-    NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
-    BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
-
-    # Word DAWG
-    local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
-    if [[ -s ${WORDLIST_FILE} ]]; then
-        tlog "Generating word Dawg"
-        check_file_readable ${UNICHARSET_FILE}
-        run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
-            ${UNICHARSET_FILE}
-        check_file_readable ${WORD_DAWG}
-
-        FREQ_DAWG_SIZE=100
-        head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
-    fi
-
-    # Freq-word DAWG
-    if [[ -s ${freq_wordlist_file} ]]; then
-        check_file_readable ${UNICHARSET_FILE}
-        tlog "Generating frequent-word Dawg"
-        run_command wordlist2dawg  -r 1 ${freq_wordlist_file} \
-            ${FREQ_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${FREQ_DAWG}
-    fi
-
-    # Punctuation DAWG
-    # -r arguments to wordlist2dawg denote RTL reverse policy
-    # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h).
-    # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
-    # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
-    # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
-    local punc_reverse_policy=0;
-    if [[ "${LANG_IS_RTL}" == "1" ]]; then
-      punc_reverse_policy=2
-    fi
-    if [[ ! -s ${PUNC_FILE} ]]; then
-        PUNC_FILE="${LANGDATA_ROOT}/common.punc"
-    fi
-    check_file_readable ${PUNC_FILE}
-    run_command wordlist2dawg -r ${punc_reverse_policy} \
-        ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
-    check_file_readable ${PUNC_DAWG}
-
-    # Numbers DAWG
-    if [[ -s ${NUMBERS_FILE} ]]; then
-        run_command wordlist2dawg -r 0 \
-            ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${NUMBER_DAWG}
-    fi
-
-    # Bigram dawg
-    if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
-        run_command wordlist2dawg -r 1 \
-            ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
-        check_file_readable ${BIGRAM_DAWG}
-    fi
-}
-
-# Phase E : (E)xtract .tr feature files from .tif/.box files
-phase_E_extract_features() {
-    local box_config=$1
-    local par_factor=$2
-    local ext=$3
-    if ! [[ "${par_factor}" -gt 0 ]]; then
-        par_factor=1
-    fi
-    tlog "\n=== Phase E: Generating ${ext} files ==="
-
-    local img_files=""
-    for exposure in ${EXPOSURES}; do
-        img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
-    done
-
-    # Use any available language-specific configs.
-    local config=""
-    if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
-        config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
-    fi
-
-    OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
-    export TESSDATA_PREFIX=${TESSDATA_DIR}
-    tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
-    local jobs=
-    trap "kill $$" INT
-    for img_file in ${img_files}; do
-        test $(jobs -r | wc -l) -ge $par_factor && wait -n
-        run_command tesseract ${img_file} ${img_file%.*} \
-            ${box_config} ${config} &
-        jobs="$jobs $!"
-    done
-    wait $jobs
-    export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
-    # Check that all the output files were produced.
-    for img_file in ${img_files}; do
-        check_file_readable "${img_file%.*}.${ext}"
-    done
-}
-
-# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
-# phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
-phase_C_cluster_prototypes() {
-    tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
-    local out_normproto=$1
-
-    run_command cntraining -D "${TRAINING_DIR}/" \
-        $(ls ${TRAINING_DIR}/*.tr)
-
-    check_file_readable ${TRAINING_DIR}/normproto
-    mv ${TRAINING_DIR}/normproto ${out_normproto}
-}
-
-# Phase S : (S)hape clustering
-phase_S_cluster_shapes() {
-    if ! $RUN_SHAPE_CLUSTERING; then
-        tlog "\n=== Shape Clustering disabled ==="
-        return
-    fi
-    check_file_readable ${LANGDATA_ROOT}/font_properties
-    local font_props="-F ${LANGDATA_ROOT}/font_properties"
-    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
-       [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
-        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
-    fi
-
-    run_command shapeclustering \
-        -D "${TRAINING_DIR}/" \
-        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
-        ${font_props} \
-        $(ls ${TRAINING_DIR}/*.tr)
-    check_file_readable ${TRAINING_DIR}/shapetable \
-        ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
-}
-
-# Phase M : Clustering microfeatures (mfTraining)
-phase_M_cluster_microfeatures() {
-    tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
-
-    check_file_readable ${LANGDATA_ROOT}/font_properties
-    font_props="-F ${LANGDATA_ROOT}/font_properties"
-    if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
-       [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
-        font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
-    fi
-
-    run_command mftraining \
-        -D "${TRAINING_DIR}/" \
-        -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-        -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
-        ${font_props} \
-        $(ls ${TRAINING_DIR}/*.tr)
-    check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
-        ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
-    mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
-    mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
-    mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
-    mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
-}
-
-phase_B_generate_ambiguities() {
-  tlog "\n=== Phase B : ambiguities training ==="
-
-  # Check for manually created ambiguities data.
-  if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
-      tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
-      cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
-          ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
-      # Make it writable, as it may be read-only in the client.
-      chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
-      return
-  else
-      tlog "No unicharambigs file found!"
-  fi
-
-  # TODO: Add support for generating ambiguities automatically.
-}
-
-make__lstmdata() {
-  tlog "\n=== Constructing LSTM training data ==="
-  local lang_prefix="${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}"
-  local lang_is_rtl=""
-  if [[ "${LANG_IS_RTL}" == "1" ]]; then
-    lang_is_rtl="--lang_is_rtl"
-  fi
-  local pass_through=""
-  if [[ "${NORM_MODE}" -ge "2" ]]; then
-    pass_through="--pass_through_recoder"
-  fi
-
-  # Build the starter traineddata from the inputs.
-  run_command combine_lang_model \
-    --input_unicharset "${TRAINING_DIR}/${LANG_CODE}.unicharset" \
-    --script_dir "${LANGDATA_ROOT}" \
-    --words "${lang_prefix}.wordlist" \
-    --numbers "${lang_prefix}.numbers" \
-    --puncs "${lang_prefix}.punc" \
-    --output_dir "${OUTPUT_DIR}" --lang "${LANG_CODE}" \
-    "${pass_through}" "${lang_is_rtl}"
-
-  if $SAVE_BOX_TIFF; then
-    tlog "\n=== Saving box/tiff pairs for training data ==="
-  for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do
-    tlog "Moving ${f} to ${OUTPUT_DIR}"
-    mv "${f}" "${OUTPUT_DIR}"
-  done
-  for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do
-    tlog "Moving ${f} to ${OUTPUT_DIR}"
-    mv "${f}" "${OUTPUT_DIR}"
-  done
-  fi
-
-  tlog "\n=== Moving lstmf files for training data ==="
-  for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do
-    tlog "Moving ${f} to ${OUTPUT_DIR}"
-    mv "${f}" "${OUTPUT_DIR}"
-  done
-  local lstm_list="${OUTPUT_DIR}/${LANG_CODE}.training_files.txt"
-  ls -1 "${OUTPUT_DIR}/${LANG_CODE}".*.lstmf > "${lstm_list}"
-}
-
-make__traineddata() {
-  tlog "\n=== Making final traineddata file ==="
-  local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
-
-  # Combine available files for this language from the langdata dir.
-  if [[ -r ${lang_prefix}.config ]]; then
-    tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
-    cp ${lang_prefix}.config ${TRAINING_DIR}
-    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
-  fi
-  if [[ -r ${lang_prefix}.params-model ]]; then
-    tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
-    cp ${lang_prefix}.params-model ${TRAINING_DIR}
-    chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
-  fi
-
-  # Compose the traineddata file.
-  run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
-
-  # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
-  local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
-  if [[ -f ${destfile} ]] && ! $OVERWRITE; then
-      err_exit "File ${destfile} exists and no --overwrite specified";
-  fi
-  tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
-  cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
-}
diff --git a/src/training/unicharset/lstmtester.h b/src/training/unicharset/lstmtester.h
index 678aefbb..b0e3a1cf 100644
--- a/src/training/unicharset/lstmtester.h
+++ b/src/training/unicharset/lstmtester.h
@@ -35,7 +35,7 @@ public:
   // Loads a set of lstmf files that were created using the lstm.train config to
   // tesseract into memory ready for testing. Returns false if nothing was
   // loaded. The arg is a filename of a file that lists the filenames, with one
-  // name per line. Conveniently, tesstrain.sh generates such a file, along
+  // name per line. Conveniently, tesstrain.py generates such a file, along
   // with the files themselves.
   bool LoadAllEvalData(const char *filenames_file);
   // Loads a set of lstmf files that were created using the lstm.train config to
diff --git a/unittest/lstm_test.cc b/unittest/lstm_test.cc
index 45302ee9..4b3d4ac2 100644
--- a/unittest/lstm_test.cc
+++ b/unittest/lstm_test.cc
@@ -15,7 +15,7 @@
 //
 // Use --xsize 800 for text2image to be similar to original training data.
 //
-// src/training/tesstrain.sh --fonts_dir /usr/share/fonts --lang eng \
+// tesstrain.py --fonts_dir /usr/share/fonts --lang eng \
 // --linedata_only   --noextract_font_properties --langdata_dir ../langdata_lstm \
 // --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \
 // --fontlist "Arial" --maxpages 10