From 93348a83a324a479978d9dd399b34d15ec6c5d83 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 18 May 2021 10:47:44 +0200 Subject: [PATCH] Remove scripts for training They were replaced by Python3 scripts (part of the tesstrain repository). Signed-off-by: Stefan Weil --- Makefile.am | 4 - src/training/language-specific.sh | 1199 -------------------------- src/training/tesstrain.sh | 98 --- src/training/tesstrain_utils.sh | 632 -------------- src/training/unicharset/lstmtester.h | 2 +- unittest/lstm_test.cc | 2 +- 6 files changed, 2 insertions(+), 1935 deletions(-) delete mode 100755 src/training/language-specific.sh delete mode 100755 src/training/tesstrain.sh delete mode 100644 src/training/tesstrain_utils.sh diff --git a/Makefile.am b/Makefile.am index 663afb2e..e95f18ea 100644 --- a/Makefile.am +++ b/Makefile.am @@ -795,10 +795,6 @@ training_CPPFLAGS += $(ICU_UC_CFLAGS) $(ICU_I18N_CFLAGS) training_CPPFLAGS += $(pango_CFLAGS) training_CPPFLAGS += $(cairo_CFLAGS) -bin_SCRIPTS = src/training/language-specific.sh src/training/tesstrain.sh -scripts_DATA = src/training/tesstrain_utils.sh -scriptsdir = $(bindir) - if DISABLED_LEGACY_ENGINE training_CPPFLAGS += -DDISABLED_LEGACY_ENGINE endif diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh deleted file mode 100755 index e5a7f096..00000000 --- a/src/training/language-specific.sh +++ /dev/null @@ -1,1199 +0,0 @@ -#!/bin/bash -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Set some language specific variables. Works in conjunction with -# tesstrain.sh -# - -#============================================================================= -# Language specific info -#============================================================================= - -# Array of all valid language codes. -VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat - ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo - ell eng enm epo est eus fas fil fin fra frk frm gle glg - grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old - jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat - lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori - pan pol por pus ron rus san sin slk slv snd spa spa_old - sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur - uig ukr urd uzb uzb_cyrl vie yid gle_uncial" - -# Codes for which we have webtext but no fonts: -UNUSABLE_LANGUAGE_CODES="" - -FRAKTUR_FONTS=( - "CaslonishFraxx Medium" \ - "Cloister Black, Light" \ - "Proclamate Light" \ - "UnifrakturMaguntia" \ - "Walbaum-Fraktur" \ -) - -# List of fonts to train on -LATIN_FONTS=( - "Arial Bold" \ - "Arial Bold Italic" \ - "Arial Italic" \ - "Arial" \ - "Courier New Bold" \ - "Courier New Bold Italic" \ - "Courier New Italic" \ - "Courier New" \ - "Times New Roman, Bold" \ - "Times New Roman, Bold Italic" \ - "Times New Roman, Italic" \ - "Times New Roman," \ - "Georgia Bold" \ - "Georgia Italic" \ - "Georgia" \ - "Georgia Bold Italic" \ - "Trebuchet MS Bold" \ - "Trebuchet MS Bold Italic" \ - "Trebuchet MS Italic" \ - "Trebuchet MS" \ - "Verdana Bold" \ - "Verdana Italic" \ - "Verdana" \ - "Verdana Bold Italic" \ - "Tex Gyre Bonum Bold" \ - "Tex Gyre Bonum Italic" \ - "Tex Gyre Bonum Bold Italic" \ - "Tex Gyre Schola Bold" \ - "Tex Gyre Schola Italic" \ - "Tex Gyre Schola Bold Italic" \ - "Tex Gyre Schola Regular" \ - "DejaVu Sans Ultra-Light" \ -) - -# List of fonts for printed/neo-Latin ('lat' language code, different from Latin script) -NEOLATIN_FONTS=( - "GFS Bodoni" \ - "GFS Bodoni Bold" \ - "GFS Bodoni Italic" \ - "GFS Bodoni Bold Italic" \ - "GFS Didot" \ - "GFS Didot Bold" \ - "GFS Didot Italic" \ - "GFS Didot Bold Italic" \ - "Cardo" \ - "Cardo Bold" \ - "Cardo Italic" \ - "Wyld" \ - "Wyld Italic" \ - "EB Garamond" \ - "EB Garamond Italic" \ - "Junicode" \ - "Junicode Bold" \ - "Junicode Italic" \ - "Junicode Bold Italic" \ - "IM FELL DW Pica PRO" \ - "IM FELL English PRO" \ - "IM FELL Double Pica PRO" \ - "IM FELL French Canon PRO" \ - "IM FELL Great Primer PRO" \ - "IM FELL DW Pica PRO Italic" \ - "IM FELL English PRO Italic" \ - "IM FELL Double Pica PRO Italic" \ - "IM FELL French Canon PRO Italic" \ - "IM FELL Great Primer PRO Italic" \ -) - -IRISH_UNCIAL_FONTS=( - "Bunchlo Arsa Dubh GC" \ - "Bunchlo Arsa GC" \ - "Bunchlo Arsa GC Bold" \ - "Bunchlo Dubh GC" \ - "Bunchlo GC" \ - "Bunchlo GC Bold" \ - "Bunchlo Nua GC Bold" \ - "Bunchló na Nod GC" \ - "Gadelica" \ - "Glanchlo Dubh GC" \ - "Glanchlo GC" \ - "Glanchlo GC Bold" \ - "Seanchló Dubh GC" \ - "Seanchló GC" \ - "Seanchló GC Bold" \ - "Seanchló na Nod GC" \ - "Seanchló Ársa Dubh GC" \ - "Seanchló Ársa GC" \ - "Seanchló Ársa GC Bold" \ - "Tromchlo Beag GC" \ - "Tromchlo Mor GC" \ - "Urchlo GC" \ - "Urchlo GC Bold" \ -) - -EARLY_LATIN_FONTS=( - "${FRAKTUR_FONTS[@]}" \ - "${LATIN_FONTS[@]}" \ - # The Wyld font family renders early modern ligatures encoded in the private - # unicode area. - "Wyld" \ - "Wyld Italic" \ - # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English. - "GentiumAlt" \ -) - -VIETNAMESE_FONTS=( \ - "Arial Unicode MS Bold" \ - "Arial Bold Italic" \ - "Arial Italic" \ - "Arial Unicode MS" \ - "FreeMono Bold" \ - "Courier New Bold Italic" \ - "FreeMono Italic" \ - "FreeMono" \ - "GentiumAlt Italic" \ - "GentiumAlt" \ - "Palatino Linotype Bold" \ - "Palatino Linotype Bold Italic" \ - "Palatino Linotype Italic" \ - "Palatino Linotype" \ - "Really No 2 LT W2G Light" \ - "Really No 2 LT W2G Light Italic" \ - "Really No 2 LT W2G Medium" \ - "Really No 2 LT W2G Medium Italic" \ - "Really No 2 LT W2G Semi-Bold" \ - "Really No 2 LT W2G Semi-Bold Italic" \ - "Really No 2 LT W2G Ultra-Bold" \ - "Really No 2 LT W2G Ultra-Bold Italic" \ - "Times New Roman, Bold" \ - "Times New Roman, Bold Italic" \ - "Times New Roman, Italic" \ - "Times New Roman," \ - "Verdana Bold" \ - "Verdana Italic" \ - "Verdana" \ - "Verdana Bold Italic" \ - "VL Gothic" \ - "VL PGothic" \ - ) - -DEVANAGARI_FONTS=( \ - "FreeSans" \ - "Chandas" \ - "Kalimati" \ - "Uttara" \ - "Lucida Sans" \ - "gargi Medium" \ - "Lohit Devanagari" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "Noto Sans Devanagari Bold" \ - "Noto Sans Devanagari" \ - "Samyak Devanagari Medium" \ - "Sarai" \ - "Saral LT Bold" \ - "Saral LT Light" \ - "Nakula" \ - "Sahadeva" \ - "Samanata" \ - "Santipur OT Medium" \ - ) - -KANNADA_FONTS=( \ - "Kedage Bold" \ - "Kedage Italic" \ - "Kedage" \ - "Kedage Bold Italic" \ - "Mallige Bold" \ - "Mallige Italic" \ - "Mallige" \ - "Mallige Bold Italic" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "cheluvi Medium" \ - "Noto Sans Kannada Bold" \ - "Noto Sans Kannada" \ - "Lohit Kannada" \ - "Tunga" \ - "Tunga Bold" \ - ) - -TELUGU_FONTS=( \ - "Pothana2000" \ - "Vemana2000" \ - "Lohit Telugu" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "Dhurjati" \ - "Gautami Bold" \ - "Gidugu" \ - "Gurajada" \ - "Lakki Reddy" \ - "Mallanna" \ - "Mandali" \ - "NATS" \ - "NTR" \ - "Noto Sans Telugu Bold" \ - "Noto Sans Telugu" \ - "Peddana" \ - "Ponnala" \ - "Ramabhadra" \ - "Ravi Prakash" \ - "Sree Krushnadevaraya" \ - "Suranna" \ - "Suravaram" \ - "Tenali Ramakrishna" \ - "Gautami" \ - ) - -TAMIL_FONTS=( \ - "TAMu_Kadambri" \ - "TAMu_Kalyani" \ - "TAMu_Maduram" \ - "TSCu_Paranar" \ - "TSCu_Times" \ - "TSCu_Paranar Bold" \ - "FreeSans" \ - "FreeSerif" \ - "Lohit Tamil" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "Droid Sans Tamil Bold" \ - "Droid Sans Tamil" \ - "Karla Tamil Inclined Bold Italic" \ - "Karla Tamil Inclined Italic" \ - "Karla Tamil Upright Bold" \ - "Karla Tamil Upright" \ - "Noto Sans Tamil Bold" \ - "Noto Sans Tamil" \ - "Noto Sans Tamil UI Bold" \ - "Noto Sans Tamil UI" \ - "TSCu_Comic Normal" \ - "Lohit Tamil Classical" \ - ) - -THAI_FONTS=( \ - "FreeSerif" \ - "FreeSerif Italic" \ - "Garuda" \ - "Norasi" \ - "Lucida Sans Typewriter" \ - "Lucida Sans" \ - "Garuda Oblique" \ - "Norasi Oblique" \ - "Norasi Italic" \ - "Garuda Bold" \ - "Norasi Bold" \ - "Lucida Sans Typewriter Bold" \ - "Lucida Sans Semi-Bold" \ - "Garuda Bold Oblique" \ - "Norasi Bold Italic" \ - "Norasi Bold Oblique" \ - "AnuParp LT Thai" \ - "Arial Unicode MS Bold" \ - "Arial Unicode MS" \ - "Ascender Uni" \ - "Loma" \ - "Noto Serif Thai Bold" \ - "Noto Serif Thai" \ - "Purisa Light" \ - "Sirichana LT Bold" \ - "Sirichana LT" \ - "Sukothai LT Bold" \ - "Sukothai LT" \ - "UtSaHaGumm LT Thai" \ - "Tahoma" \ - ) - -KOREAN_FONTS=( \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Baekmuk Batang Patched" \ - "Baekmuk Batang" \ - "Baekmuk Dotum" \ - "Baekmuk Gulim" \ - "Baekmuk Headline" \ - ) - -CHI_SIM_FONTS=( \ - "AR PL UKai CN" \ - "AR PL UMing Patched Light" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "WenQuanYi Zen Hei Medium" \ - ) - -CHI_TRA_FONTS=( \ - "AR PL UKai TW" \ - "AR PL UMing TW MBE Light" \ - "AR PL UKai Patched" \ - "AR PL UMing Patched Light" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "WenQuanYi Zen Hei Medium" \ - ) - -JPN_FONTS=( \ - "TakaoExGothic" \ - "TakaoExMincho" \ - "TakaoGothic" \ - "TakaoMincho" \ - "TakaoPGothic" \ - "TakaoPMincho" \ - "VL Gothic" \ - "VL PGothic" \ - "Noto Sans Japanese Bold" \ - "Noto Sans Japanese Light" \ - ) - -RUSSIAN_FONTS=( \ - "Arial Bold" \ - "Arial Bold Italic" \ - "Arial Italic" \ - "Arial" \ - "Courier New Bold" \ - "Courier New Bold Italic" \ - "Courier New Italic" \ - "Courier New" \ - "Times New Roman, Bold" \ - "Times New Roman, Bold Italic" \ - "Times New Roman, Italic" \ - "Times New Roman," \ - "Georgia Bold" \ - "Georgia Italic" \ - "Georgia" \ - "Georgia Bold Italic" \ - "Trebuchet MS Bold" \ - "Trebuchet MS Bold Italic" \ - "Trebuchet MS Italic" \ - "Trebuchet MS" \ - "Verdana Bold" \ - "Verdana Italic" \ - "Verdana" \ - "Verdana Bold Italic" \ - "DejaVu Serif" \ - "DejaVu Serif Oblique" \ - "DejaVu Serif Bold" \ - "DejaVu Serif Bold Oblique" \ - "Lucida Bright" \ - "FreeSerif Bold" \ - "FreeSerif Bold Italic" \ - "DejaVu Sans Ultra-Light" \ - ) - -GREEK_FONTS=( \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "DejaVu Sans Mono" \ - "DejaVu Sans Mono Oblique" \ - "DejaVu Sans Mono Bold" \ - "DejaVu Sans Mono Bold Oblique" \ - "DejaVu Serif" \ - "DejaVu Serif Semi-Condensed" \ - "DejaVu Serif Oblique" \ - "DejaVu Serif Bold" \ - "DejaVu Serif Bold Oblique" \ - "DejaVu Serif Bold Semi-Condensed" \ - "FreeSerif Bold" \ - "FreeSerif Bold Italic" \ - "FreeSerif Italic" \ - "FreeSerif" \ - "GentiumAlt" \ - "GentiumAlt Italic" \ - "Linux Biolinum O Bold" \ - "Linux Biolinum O" \ - "Linux Libertine O Bold" \ - "Linux Libertine O" \ - "Linux Libertine O Bold Italic" \ - "Linux Libertine O Italic" \ - "Palatino Linotype Bold" \ - "Palatino Linotype Bold Italic" \ - "Palatino Linotype Italic" \ - "Palatino Linotype" \ - "UmePlus P Gothic" \ - "VL PGothic" \ - ) - -ANCIENT_GREEK_FONTS=( \ - "GFS Artemisia" \ - "GFS Artemisia Bold" \ - "GFS Artemisia Bold Italic" \ - "GFS Artemisia Italic" \ - "GFS Bodoni" \ - "GFS Bodoni Bold" \ - "GFS Bodoni Bold Italic" \ - "GFS Bodoni Italic" \ - "GFS Didot" \ - "GFS Didot Bold" \ - "GFS Didot Bold Italic" \ - "GFS Didot Italic" \ - "GFS DidotClassic" \ - "GFS Neohellenic" \ - "GFS Neohellenic Bold" \ - "GFS Neohellenic Bold Italic" \ - "GFS Neohellenic Italic" \ - "GFS Philostratos" \ - "GFS Porson" \ - "GFS Pyrsos" \ - "GFS Solomos" \ - ) - -ARABIC_FONTS=( \ - "Arabic Transparent Bold" \ - "Arabic Transparent" \ - "Arab" \ - "Arial Unicode MS Bold" \ - "Arial Unicode MS" \ - "ASVCodar LT Bold" \ - "ASVCodar LT Light" \ - "Badiya LT Bold" \ - "Badiya LT" \ - "Badr LT Bold" \ - "Badr LT" \ - "Dimnah" \ - "Frutiger LT Arabic Bold" \ - "Frutiger LT Arabic" \ - "Furat" \ - "Hassan LT Bold" \ - "Hassan LT Light" \ - "Jalal LT Bold" \ - "Jalal LT Light" \ - "Midan Bold" \ - "Midan" \ - "Mitra LT Bold" \ - "Mitra LT Light" \ - "Palatino LT Arabic" \ - "Palatino Sans Arabic Bold" \ - "Palatino Sans Arabic" \ - "Simplified Arabic Bold" \ - "Simplified Arabic" \ - "Times New Roman, Bold" \ - "Times New Roman," \ - "Traditional Arabic Bold" \ - "Traditional Arabic" \ - ) - -HEBREW_FONTS=( \ - "Arial Bold" \ - "Arial Bold Italic" \ - "Arial Italic" \ - "Arial" \ - "Courier New Bold" \ - "Courier New Bold Italic" \ - "Courier New Italic" \ - "Courier New" \ - "Ergo Hebrew Semi-Bold" \ - "Ergo Hebrew Semi-Bold Italic" \ - "Ergo Hebrew" \ - "Ergo Hebrew Italic" \ - "Really No 2 LT W2G Light" \ - "Really No 2 LT W2G Light Italic" \ - "Really No 2 LT W2G Medium" \ - "Really No 2 LT W2G Medium Italic" \ - "Really No 2 LT W2G Semi-Bold" \ - "Really No 2 LT W2G Semi-Bold Italic" \ - "Really No 2 LT W2G Ultra-Bold" \ - "Really No 2 LT W2G Ultra-Bold Italic" \ - "Times New Roman, Bold" \ - "Times New Roman, Bold Italic" \ - "Times New Roman, Italic" \ - "Times New Roman," \ - "Lucida Sans" \ - "Tahoma" \ - ) - -BENGALI_FONTS=( \ - "Bangla Medium" \ - "Lohit Bengali" \ - "Mukti Narrow" \ - "Mukti Narrow Bold" \ - "Jamrul Medium Semi-Expanded" \ - "Likhan Medium" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "FreeSans" \ - "FreeSans Oblique" \ - "FreeSerif" \ - "FreeSerif Italic" \ - "Noto Sans Bengali Bold" \ - "Noto Sans Bengali" \ - "Ani" \ - "Lohit Assamese" \ - "Lohit Bengali" \ - "Mitra Mono" \ - ) - -KYRGYZ_FONTS=( \ - "Arial" \ - "Arial Bold" \ - "Arial Italic" \ - "Arial Bold Italic" \ - "Courier New" \ - "Courier New Bold" \ - "Courier New Italic" \ - "Courier New Bold Italic" \ - "Times New Roman," \ - "Times New Roman, Bold" \ - "Times New Roman, Bold Italic" \ - "Times New Roman, Italic" \ - "DejaVu Serif" \ - "DejaVu Serif Oblique" \ - "DejaVu Serif Bold" \ - "DejaVu Serif Bold Oblique" \ - "Lucida Bright" \ - "FreeSerif Bold" \ - "FreeSerif Bold Italic" \ - ) - -PERSIAN_FONTS=( \ - "Amiri Bold Italic" \ - "Amiri Bold" \ - "Amiri Italic" \ - "Amiri" \ - "Andale Sans Arabic Farsi" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Lateef" \ - "Lucida Bright" \ - "Lucida Sans Oblique" \ - "Lucida Sans Semi-Bold" \ - "Lucida Sans" \ - "Lucida Sans Typewriter Bold" \ - "Lucida Sans Typewriter Oblique" \ - "Lucida Sans Typewriter" \ - "Scheherazade" \ - "Tahoma" \ - "Times New Roman," \ - "Times New Roman, Bold" \ - "Times New Roman, Bold Italic" \ - "Times New Roman, Italic" \ - "Yakout Linotype Bold" \ - "Yakout Linotype" \ - ) - -AMHARIC_FONTS=( \ - "Abyssinica SIL" \ - "Droid Sans Ethiopic Bold" \ - "Droid Sans Ethiopic" \ - "FreeSerif" \ - "Noto Sans Ethiopic Bold" \ - "Noto Sans Ethiopic" \ - ) - -ARMENIAN_FONTS=( \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "FreeMono" \ - "FreeMono Italic" \ - "FreeSans" \ - "FreeSans Bold" \ - "FreeSans Oblique" \ - ) - -BURMESE_FONTS=( \ - "Myanmar Sans Pro" \ - "Noto Sans Myanmar Bold" \ - "Noto Sans Myanmar" \ - "Padauk Bold" \ - "Padauk" \ - "TharLon" \ - ) - -JAVANESE_FONTS=( \ - "Prada" \ - ) - -NORTH_AMERICAN_ABORIGINAL_FONTS=( \ - "Aboriginal Sans" \ - "Aboriginal Sans Bold Italic" \ - "Aboriginal Sans Italic" \ - "Aboriginal Sans Bold" \ - "Aboriginal Serif Bold" \ - "Aboriginal Serif Bold Italic" \ - "Aboriginal Serif Italic" \ - "Aboriginal Serif" \ - ) - -GEORGIAN_FONTS=( \ - "Arial Unicode MS Bold" \ - "Arial Unicode MS" \ - "BPG Algeti GPL\&GNU" \ - "BPG Chveulebrivi GPL\&GNU" \ - "BPG Courier GPL\&GNU" \ - "BPG Courier S GPL\&GNU" \ - "BPG DejaVu Sans 2011 GNU-GPL" \ - "BPG Elite GPL\&GNU" \ - "BPG Excelsior GPL\&GNU" \ - "BPG Glaho GPL\&GNU" \ - "BPG Gorda GPL\&GNU" \ - "BPG Ingiri GPL\&GNU" \ - "BPG Mrgvlovani Caps GNU\&GPL" \ - "BPG Mrgvlovani GPL\&GNU" \ - "BPG Nateli Caps GPL\&GNU Light" \ - "BPG Nateli Condenced GPL\&GNU Light" \ - "BPG Nateli GPL\&GNU Light" \ - "BPG Nino Medium Cond GPL\&GNU" \ - "BPG Nino Medium GPL\&GNU Medium" \ - "BPG Sans GPL\&GNU" \ - "BPG Sans Medium GPL\&GNU" \ - "BPG Sans Modern GPL\&GNU" \ - "BPG Sans Regular GPL\&GNU" \ - "BPG Serif GPL\&GNU" \ - "BPG Serif Modern GPL\&GNU" \ - "FreeMono" \ - "FreeMono Bold Italic" \ - "FreeSans" \ - "FreeSerif" \ - "FreeSerif Bold" \ - "FreeSerif Bold Italic" \ - "FreeSerif Italic" \ - ) - -OLD_GEORGIAN_FONTS=( \ - "Arial Unicode MS Bold" \ - "Arial Unicode MS" \ - "BPG Algeti GPL\&GNU" \ - "BPG Courier S GPL\&GNU" \ - "BPG DejaVu Sans 2011 GNU-GPL" \ - "BPG Elite GPL\&GNU" \ - "BPG Excelsior GPL\&GNU" \ - "BPG Glaho GPL\&GNU" \ - "BPG Ingiri GPL\&GNU" \ - "BPG Mrgvlovani Caps GNU\&GPL" \ - "BPG Mrgvlovani GPL\&GNU" \ - "BPG Nateli Caps GPL\&GNU Light" \ - "BPG Nateli Condenced GPL\&GNU Light" \ - "BPG Nateli GPL\&GNU Light" \ - "BPG Nino Medium Cond GPL\&GNU" \ - "BPG Nino Medium GPL\&GNU Medium" \ - "BPG Sans GPL\&GNU" \ - "BPG Sans Medium GPL\&GNU" \ - "BPG Sans Modern GPL\&GNU" \ - "BPG Sans Regular GPL\&GNU" \ - "BPG Serif GPL\&GNU" \ - "BPG Serif Modern GPL\&GNU" \ - "FreeSans" \ - "FreeSerif" \ - "FreeSerif Bold" \ - "FreeSerif Bold Italic" \ - "FreeSerif Italic" \ - ) - -KHMER_FONTS=( \ - "Khmer OS" \ - "Khmer OS System" \ - "Khmer OS Battambang" \ - "Khmer OS Bokor" \ - "Khmer OS Content" \ - "Khmer OS Fasthand" \ - "Khmer OS Freehand" \ - "Khmer OS Metal Chrieng" \ - "Khmer OS Muol Light" \ - "Khmer OS Muol Pali" \ - "Khmer OS Muol" \ - "Khmer OS Siemreap" \ - "Noto Sans Bold" \ - "Noto Sans" \ - "Noto Serif Khmer Bold" \ - "Noto Serif Khmer Light" \ - ) - -KURDISH_FONTS=( \ - "Amiri Bold Italic" \ - "Amiri Bold" \ - "Amiri Italic" \ - "Amiri" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Lateef" \ - "Lucida Bright" \ - "Lucida Sans Oblique" \ - "Lucida Sans Semi-Bold" \ - "Lucida Sans" \ - "Lucida Sans Typewriter Bold" \ - "Lucida Sans Typewriter Oblique" \ - "Lucida Sans Typewriter" \ - "Scheherazade" \ - "Tahoma" \ - "Times New Roman," \ - "Times New Roman, Bold" \ - "Times New Roman, Bold Italic" \ - "Times New Roman, Italic" \ - "Unikurd Web" \ - "Yakout Linotype Bold" \ - "Yakout Linotype" \ - ) - -LAOTHIAN_FONTS=( \ - "Phetsarath OT" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "Dhyana Bold" \ - "Dhyana" \ - "Lao Muang Don" \ - "Lao Muang Khong" \ - "Lao Sans Pro" \ - "Noto Sans Lao Bold" \ - "Noto Sans Lao" \ - "Noto Sans Lao UI Bold" \ - "Noto Sans Lao UI" \ - "Noto Serif Lao Bold" \ - "Noto Serif Lao" \ - "Phetsarath Bold" \ - "Phetsarath" \ - "Souliyo Unicode" \ -) - -GUJARATI_FONTS=( \ - "Lohit Gujarati" \ - "Rekha Medium" \ - "Samyak Gujarati Medium" \ - "aakar Medium" \ - "padmaa Bold" \ - "padmaa Medium" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "FreeSans" \ - "Noto Sans Gujarati Bold" \ - "Noto Sans Gujarati" \ - "Shruti" \ - "Shruti Bold" \ - ) - -MALAYALAM_FONTS=( \ - "AnjaliOldLipi" \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "Dyuthi" \ - "FreeSerif" \ - "Kalyani" \ - "Kartika" \ - "Kartika Bold" \ - "Lohit Malayalam" \ - "Meera" \ - "Noto Sans Malayalam Bold" \ - "Noto Sans Malayalam" \ - "Rachana" \ - "Rachana_w01" \ - "RaghuMalayalam" \ - "suruma" \ - ) - -ORIYA_FONTS=( \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "ori1Uni Medium" \ - "Samyak Oriya Medium" \ - "Lohit Oriya" \ - ) - -PUNJABI_FONTS=( \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "Saab" \ - "Lohit Punjabi" \ - "Noto Sans Gurmukhi" \ - "Noto Sans Gurmukhi Bold" \ - "FreeSans" \ - "FreeSans Bold" \ - "FreeSerif" \ - ) - -SINHALA_FONTS=( \ - "Noto Sans Sinhala Bold" \ - "Noto Sans Sinhala" \ - "OCRUnicode" \ - "Yagpo" \ - "LKLUG" \ - "FreeSerif" \ - ) - -SYRIAC_FONTS=( \ - "East Syriac Adiabene" \ - "East Syriac Ctesiphon" \ - "Estrangelo Antioch" \ - "Estrangelo Edessa" \ - "Estrangelo Midyat" \ - "Estrangelo Nisibin" \ - "Estrangelo Quenneshrin" \ - "Estrangelo Talada" \ - "Estrangelo TurAbdin" \ - "Serto Batnan Bold" \ - "Serto Batnan" \ - "Serto Jerusalem Bold" \ - "Serto Jerusalem Italic" \ - "Serto Jerusalem" \ - "Serto Kharput" \ - "Serto Malankara" \ - "Serto Mardin Bold" \ - "Serto Mardin" \ - "Serto Urhoy Bold" \ - "Serto Urhoy" \ - "FreeSans" \ - ) - -THAANA_FONTS=( \ - "FreeSerif" \ - ) - -TIBETAN_FONTS=( \ - "Arial Unicode MS" \ - "Arial Unicode MS Bold" \ - "Ascender Uni" \ - "DDC Uchen" \ - "Jomolhari" \ - "Kailasa" \ - "Kokonor" \ - "Tibetan Machine Uni" \ - "TibetanTsugRing" \ - "Yagpo" \ - ) - -# The following fonts will be rendered vertically in phase I. -VERTICAL_FONTS=( \ - "TakaoExGothic" \ # for jpn - "TakaoExMincho" \ # for jpn - "AR PL UKai Patched" \ # for chi_tra - "AR PL UMing Patched Light" \ # for chi_tra - "Baekmuk Batang Patched" \ # for kor - ) - -FLAGS_webtext_prefix=${FLAGS_webtext_prefix:-} - -# Set language-specific values for several global variables, including -# ${TEXT_CORPUS} -# holds the text corpus file for the language, used in phase F -# ${FONTS[@]} -# holds a sequence of applicable fonts for the language, used in -# phase F & I. only set if not already set, i.e. from command line -# ${TRAINING_DATA_ARGUMENTS} -# non-default arguments to the training_data program used in phase T -# ${FILTER_ARGUMENTS} - -# character-code-specific filtering to distinguish between scripts -# (eg. CJK) used by filter_borbidden_characters in phase F -# ${WORDLIST2DAWG_ARGUMENTS} -# specify fixed length dawg generation for non-space-delimited lang -# TODO(dsl): We can refactor these into functions that assign FONTS, -# TEXT_CORPUS, etc. separately. -set_lang_specific_parameters() { - local lang=$1 - # The default text location is now given directly from the language code. - TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt" - FILTER_ARGUMENTS="" - WORDLIST2DAWG_ARGUMENTS="" - # These dawg factors represent the fraction of the corpus not covered by the - # dawg, and seem like reasonable defaults, but the optimal value is likely - # to be highly corpus-dependent, as well as somewhat language-dependent. - # Number dawg factor is the fraction of all numeric strings that are not - # covered, which is why it is higher relative to the others. - PUNC_DAWG_FACTOR= - NUMBER_DAWG_FACTOR=0.125 - WORD_DAWG_FACTOR=0.05 - BIGRAM_DAWG_FACTOR=0.015 - TRAINING_DATA_ARGUMENTS="" - FRAGMENTS_DISABLED="y" - RUN_SHAPE_CLUSTERING=false - AMBIGS_FILTER_DENOMINATOR="100000" - LEADING="32" - MEAN_COUNT="40" # Default for latin script. - # Language to mix with the language for maximum accuracy. Defaults to eng. - # If no language is good, set to the base language. - MIX_LANG="eng" - EXPOSURES=${EXPOSURES:-} - FONTS=${FONTS:-} - - case ${lang} in - # Latin languages. - enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported - test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; - frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt" - # Make long-s substitutions for Middle French text - FILTER_ARGUMENTS="--make_early_language_variant=fra" - TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; - frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt" - test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );; - ita_old ) - TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt" - # Make long-s substitutions for Early Italian text - FILTER_ARGUMENTS="--make_early_language_variant=ita" - TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; - lat ) - test -z "$EXPOSURES" && EXPOSURES="-3 -2 -1 0 1 2 3" - test -z "$FONTS" && FONTS=( "${NEOLATIN_FONTS[@]}" ) ;; - spa_old ) - TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt" - # Make long-s substitutions for Early Spanish text - FILTER_ARGUMENTS="--make_early_language_variant=spa" - TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; - srp_latn ) - TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;; - vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; - # Highly inflective languages get a bigger dawg size. - # TODO(rays) Add more here! - hun ) WORD_DAWG_SIZE=1000000 ;; - pol ) WORD_DAWG_SIZE=1000000 ;; - - # Latin with default treatment. - afr ) ;; - aze ) ;; - bos ) ;; - cat ) ;; - ceb ) ;; - ces ) PUNC_DAWG_FACTOR=0.004 ;; - cym ) ;; - dan ) ;; - deu ) WORD_DAWG_FACTOR=0.125 ;; - eng ) WORD_DAWG_FACTOR=0.03 ;; - epo ) ;; - est ) ;; - eus ) ;; - fil ) ;; - fin ) ;; - fra ) WORD_DAWG_FACTOR=0.08 ;; - gle ) ;; - gle_uncial ) test -z "$FONTS" && FONTS=( "${IRISH_UNCIAL_FONTS[@]}" );; - glg ) ;; - hat ) ;; - hrv ) ;; - iast ) ;; - ind ) ;; - isl ) ;; - ita ) ;; - jav ) ;; - lav ) ;; - lit ) ;; - mlt ) ;; - msa ) ;; - nld ) WORD_DAWG_FACTOR=0.02 ;; - nor ) ;; - por ) ;; - ron ) ;; - slk ) ;; - slv ) ;; - spa ) ;; - sqi ) ;; - swa ) ;; - swe ) ;; - tgl ) ;; - tur ) ;; - uzb ) ;; - zlm ) ;; - - # Special code for performing language-id that is trained on - # EFIGS+Latin+Vietnamese text with regular + fraktur fonts. - lat_lid ) - TEXT_CORPUS=${FLAGS_webtext_prefix}/lat_lid.corpus.txt - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - GENERATE_WORD_BIGRAMS=0 - # Strip unrenderable words as not all fonts will render the extended - # latin symbols found in Vietnamese text. - WORD_DAWG_SIZE=1000000 - test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; - - # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic. - rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) - MIX_LANG="rus" - NUMBER_DAWG_FACTOR=0.05 - WORD_DAWG_SIZE=1000000 ;; - aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl ) - MIX_LANG="${lang}" - test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; - - # Special code for performing Cyrillic language-id that is trained on - # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian - # text with the list of Russian fonts. - cyr_lid ) - TEXT_CORPUS=${FLAGS_webtext_prefix}/cyr_lid.corpus.txt - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - GENERATE_WORD_BIGRAMS=0 - WORD_DAWG_SIZE=1000000 - test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );; - - # South Asian scripts mostly have a lot of different graphemes, so trim - # down the MEAN_COUNT so as not to get a huge amount of text. - asm | ben ) - MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;; - bih | hin | mar | nep | san ) - MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; - bod ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; - dzo ) - WORD_DAWG_FACTOR=0.01 - test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; - guj ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;; - kan ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" - TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;; - mal ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" - TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; - ori ) - WORD_DAWG_FACTOR=0.01 - test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;; - pan ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.01 - test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;; - sin ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.01 - test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;; - tam ) MEAN_COUNT="30" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" - TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;; - tel ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" - TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;; - - # SouthEast Asian scripts. - jav_java ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - test -z "$FONTS" && FONTS=( "${JAVANESE_FONTS[@]}" ) ;; - khm ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;; - lao ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; - mya ) MEAN_COUNT="12" - WORD_DAWG_FACTOR=0.15 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;; - tha ) MEAN_COUNT="30" - WORD_DAWG_FACTOR=0.01 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FILTER_ARGUMENTS="--segmenter_lang=tha" - TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" - AMBIGS_FILTER_DENOMINATOR="1000" - LEADING=48 - test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;; - - # CJK - chi_sim ) - MEAN_COUNT="15" - PUNC_DAWG_FACTOR=0.015 - WORD_DAWG_FACTOR=0.015 - GENERATE_WORD_BIGRAMS=0 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" - FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim" - test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; - chi_tra ) - MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.015 - GENERATE_WORD_BIGRAMS=0 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" - FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra" - test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; - jpn ) MEAN_COUNT="15" - WORD_DAWG_FACTOR=0.015 - GENERATE_WORD_BIGRAMS=0 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" - FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn" - test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;; - kor ) MEAN_COUNT="20" - WORD_DAWG_FACTOR=0.015 - NUMBER_DAWG_FACTOR=0.05 - TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - TRAINING_DATA_ARGUMENTS+=" --desired_bigrams=" - GENERATE_WORD_BIGRAMS=0 - FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor" - test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;; - - # Middle-Eastern scripts. - ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;; - div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;; - fas | pus | snd | uig | urd ) - test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;; - heb | yid ) - NUMBER_DAWG_FACTOR=0.05 - WORD_DAWG_FACTOR=0.08 - test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;; - syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;; - - # Other scripts. - amh | tir) - test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;; - chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ - "Noto Sans Cherokee" \ - ) ;; - ell ) - NUMBER_DAWG_FACTOR=0.05 - WORD_DAWG_FACTOR=0.08 - test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;; - grc ) - test -z "$EXPOSURES" && EXPOSURES="-3 -2 -1 0 1 2 3" - test -z "$FONTS" && FONTS=( "${ANCIENT_GREEK_FONTS[@]}" ) ;; - hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; - iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; - kat) test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; - kat_old) - TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt" - test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; - kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" ) - TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; - kmr ) test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) ;; - kur_ara ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; - - *) err_exit "Error: ${lang} is not a valid language code" - esac - if [[ ${FLAGS_mean_count:-} -gt 0 ]]; then - TRAINING_DATA_ARGUMENTS+=" --mean_count=${FLAGS_mean_count}" - elif [[ ! -z ${MEAN_COUNT:-} ]]; then - TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}" - fi - # Default to Latin fonts if none have been set - test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) - - # Default to 0 exposure if it hasn't been set - test -z "$EXPOSURES" && EXPOSURES=0 - # Set right-to-left and normalization mode. - case "${LANG_CODE}" in - ara | div| fas | pus | snd | syr | uig | urd | kur_ara | heb | yid ) - LANG_IS_RTL="1" - NORM_MODE="2" ;; - asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \ - dzo | sin | san | bod | ori | khm | mya | tha | lao | jav | jav_java) - LANG_IS_RTL="0" - NORM_MODE="2" ;; - * ) - LANG_IS_RTL="0" - NORM_MODE="1" ;; - esac -} - -#============================================================================= -# END of Language specific info -#============================================================================= diff --git a/src/training/tesstrain.sh b/src/training/tesstrain.sh deleted file mode 100755 index afc98f6a..00000000 --- a/src/training/tesstrain.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -# (C) Copyright 2014, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This script provides an easy way to execute various phases of training -# Tesseract. For a detailed description of the phases, see -# https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html. -# - -display_usage() { -echo -e 'USAGE: tesstrain.sh - --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1"). - --fontlist FONTS # A list of fontnames to train on. - --fonts_dir FONTS_PATH # Path to font files. - --lang LANG_CODE # ISO 639 code. - --langdata_dir DATADIR # Path to tesseract/training/langdata directory. - --linedata_only # Only generate training data for lstmtraining. - --output_dir OUTPUTDIR # Location of output traineddata file. - --overwrite # Safe to overwrite files in output_dir. - --run_shape_clustering # Run shape clustering (use for Indic langs). - --maxpages # Specify maximum pages to output (default:0=all) - --save_box_tiff # Save box/tiff pairs along with lstmf files. - --xsize # Specify width of output image (default:3600) - - OPTIONAL flag for specifying directory with user specified box/tiff pairs. - Files should be named similar to ${LANG_CODE}.${fontname}.exp${EXPOSURE}.box/tif - --my_boxtiff_dir MY_BOXTIFF_DIR # Location of user specified box/tiff files. - - OPTIONAL flags for input data. If unspecified we will look for them in - the langdata_dir directory. - --training_text TEXTFILE # Text to render and use for training. - --wordlist WORDFILE # Word list for the language ordered by - # decreasing frequency. - OPTIONAL flag to specify location of existing traineddata files, required - during feature extraction. If unspecified will use TESSDATA_PREFIX defined in - the current environment. - --tessdata_dir TESSDATADIR # Path to tesseract/tessdata directory. - NOTE: - The font names specified in --fontlist need to be recognizable by Pango using - fontconfig. An easy way to list the canonical names of all fonts available on - your system is to run text2image with --list_available_fonts and the - appropriate --fonts_dir path.' -} - -source "$(dirname $0)/tesstrain_utils.sh" -if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then - display_usage - exit 0 -fi -if [ $# == 0 ]; then - display_usage - exit 1 -fi - -ARGV=("$@") -parse_flags - -mkdir -p ${TRAINING_DIR} - -if [[ ${MY_BOXTIFF_DIR} != "" ]]; then - tlog "\n=== Copy existing box/tiff pairs from '${MY_BOXTIFF_DIR}'" - cp ${MY_BOXTIFF_DIR}/*.box ${TRAINING_DIR} | true - cp ${MY_BOXTIFF_DIR}/*.tif ${TRAINING_DIR} | true - ls -l ${TRAINING_DIR} -fi - -tlog "\n=== Starting training for language '${LANG_CODE}'" - -source "$(dirname $0)/language-specific.sh" -set_lang_specific_parameters ${LANG_CODE} - -initialize_fontconfig - -phase_I_generate_image 8 -phase_UP_generate_unicharset -if $LINEDATA; then - phase_E_extract_features " lstm.train " 8 "lstmf" - make__lstmdata - tlog "\nCreated starter traineddata for LSTM training of language '${LANG_CODE}'\n" - tlog "\nRun 'lstmtraining' command to continue LSTM training for language '${LANG_CODE}'\n" -else - phase_D_generate_dawg - phase_E_extract_features "box.train" 8 "tr" - phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto" - phase_S_cluster_shapes - phase_M_cluster_microfeatures - phase_B_generate_ambiguities - make__traineddata - tlog "\nCompleted training for language '${LANG_CODE}'\n" -fi diff --git a/src/training/tesstrain_utils.sh b/src/training/tesstrain_utils.sh deleted file mode 100644 index 9e0c9637..00000000 --- a/src/training/tesstrain_utils.sh +++ /dev/null @@ -1,632 +0,0 @@ -#!/bin/bash -# (C) Copyright 2014, Google Inc. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This script defines functions that are used by tesstrain.sh -# For a detailed description of the phases, see -# https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html. -# -# USAGE: source tesstrain_utils.sh - -if [ -n "$BASH_VERSION" ];then - set -u # comment in case of "unbound variable" error or fix the code - set -eo pipefail; -else - echo "Warning: you aren't running script in bash - expect problems..." - fi - -UNAME=$(uname -s | tr 'A-Z' 'a-z') - -FONT_CONFIG_CACHE=$(mktemp -d -t font_tmp.XXXXXXXXXX) - -if [[ ($UNAME == *darwin*) ]]; then - FONTS_DIR="/Library/Fonts/" -else - FONTS_DIR="/usr/share/fonts/" -fi - -DISTORT_IMAGE=false -EXTRACT_FONT_PROPERTIES=false -LINEDATA=false -MAX_PAGES=0 -MY_BOXTIFF_DIR="" -OUTPUT_DIR="/tmp/tesstrain/tessdata" -OVERWRITE=false -RUN_SHAPE_CLUSTERING=false -SAVE_BOX_TIFF=false -WORKSPACE_DIR=$(mktemp -d) -X_SIZE=3600 -PT_SIZE=12 - -# set TESSDATA_PREFIX as empty, if not defined in environment to avoid an unbound variable -TESSDATA_PREFIX=${TESSDATA_PREFIX:-} - -# Logging helper functions. -tlog() { - if test -z "${LOG_FILE:-}"; then - echo -e $* - else - echo -e $* | tee -a ${LOG_FILE} - fi -} - -err_exit() { - if test -z "${LOG_FILE:-}"; then - echo -e "ERROR: "$* - else - echo -e "ERROR: "$* | tee -a ${LOG_FILE} - fi - exit 1 -} - -# Helper function to run a command and append its output to a log. Aborts early -# if the program file is not found. -# Usage: run_command CMD ARG1 ARG2... -run_command() { - local cmd - cmd=$(which $1 || \ - for d in api training; do - which $d/$1 && break - done) || err_exit "'$1' not found" - shift - tlog "[$(date)] ${cmd} $@" - if ! "${cmd}" "$@" 2>&1 | tee -a "${LOG_FILE}"; then - err_exit "Program $(basename ${cmd}) failed. Abort. Command line: ${cmd} $@" - fi -} - -# Check if all the given files exist, or exit otherwise. -# Used to check required input files and produced output files in each phase. -# Usage: check_file_readable FILE1 FILE2... -check_file_readable() { - for file in $@; do - if [[ ! -r ${file} ]]; then - err_exit "${file} does not exist or is not readable" - fi - done -} - -# Sets the named variable to given value. Aborts if the value is missing or -# if it looks like a flag. -# Usage: parse_value VAR_NAME VALUE -parse_value() { - local val="${2:-}" - if [[ -z "$val" ]]; then - err_exit "Missing value for variable $1" - exit - fi - if [[ ${val:0:2} == "--" ]]; then - err_exit "Invalid value $val passed for variable $1" - exit - fi - eval $1=\"$val\" -} - -# Does simple command-line parsing and initialization. -parse_flags() { - local i=0 - while test $i -lt ${#ARGV[@]}; do - local j=$((i+1)) - case ${ARGV[$i]} in - --) - break;; - --fontlist) - fn=0 - FONTS="" - while test $j -lt ${#ARGV[@]}; do - test -z "${ARGV[$j]}" && break - test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break - FONTS[$fn]="${ARGV[$j]}" - fn=$((fn+1)) - j=$((j+1)) - done - i=$((j-1)) ;; - --exposures) - exp="" - while test $j -lt ${#ARGV[@]}; do - test -z "${ARGV[$j]}" && break - test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break - exp="$exp ${ARGV[$j]}" - j=$((j+1)) - done - parse_value "EXPOSURES" "$exp" - i=$((j-1)) ;; - --fonts_dir) - parse_value "FONTS_DIR" ${ARGV[$j]:-} - i=$j ;; - --tmp_dir) - parse_value "TMP_DIR" ${ARGV[$j]:-} - i=$j ;; - --lang) - parse_value "LANG_CODE" ${ARGV[$j]:-} - i=$j ;; - --langdata_dir) - parse_value "LANGDATA_ROOT" ${ARGV[$j]:-} - i=$j ;; - --maxpages) - parse_value "MAX_PAGES" ${ARGV[$j]:-} - i=$j ;; - --ptsize) - parse_value "PT_SIZE" ${ARGV[$j]:-} - i=$j ;; - --my_boxtiff_dir) - parse_value "MY_BOXTIFF_DIR" ${ARGV[$j]:-} - i=$j ;; - --distort_image) - DISTORT_IMAGE=true ;; - --output_dir) - parse_value "OUTPUT_DIR" ${ARGV[$j]:-} - i=$j ;; - --overwrite) - OVERWRITE=true ;; - --save_box_tiff) - SAVE_BOX_TIFF=true ;; - --linedata_only) - LINEDATA=true ;; - --extract_font_properties) - EXTRACT_FONT_PROPERTIES=true ;; - --noextract_font_properties) - EXTRACT_FONT_PROPERTIES=false ;; - --tessdata_dir) - parse_value "TESSDATA_DIR" ${ARGV[$j]:-} - i=$j ;; - --training_text) - parse_value "TRAINING_TEXT" "${ARGV[$j]:-}" - i=$j ;; - --wordlist) - parse_value "WORDLIST_FILE" ${ARGV[$j]:-} - i=$j ;; - --workspace_dir) - rmdir "$FONT_CONFIG_CACHE" - rmdir "$WORKSPACE_DIR" - parse_value "WORKSPACE_DIR" ${ARGV[$j]:-} - FONT_CONFIG_CACHE=$WORKSPACE_DIR/fc-cache - mkdir -p $FONT_CONFIG_CACHE - i=$j ;; - --xsize) - parse_value "X_SIZE" ${ARGV[$j]:-} - i=$j ;; - *) - err_exit "Unrecognized argument ${ARGV[$i]}" ;; - esac - i=$((i+1)) - done - if [[ -z ${LANG_CODE:-} ]]; then - err_exit "Need to specify a language --lang" - fi - if [[ -z ${LANGDATA_ROOT:-} ]]; then - err_exit "Need to specify path to language files --langdata_dir" - fi - if [[ -z ${TESSDATA_DIR:-} ]]; then - if [[ -z ${TESSDATA_PREFIX} ]]; then - err_exit "Need to specify a --tessdata_dir or have a "\ - "TESSDATA_PREFIX variable defined in your environment" - else - TESSDATA_DIR="${TESSDATA_PREFIX}" - fi - fi - if [[ ! -d "${OUTPUT_DIR}" ]]; then - tlog "Creating new directory ${OUTPUT_DIR}" - mkdir -p "${OUTPUT_DIR}" - fi - - # Location where intermediate files will be created. - TIMESTAMP=$(date +%Y-%m-%d) - if [[ -z ${TMP_DIR:-} ]]; then - TMP_DIR=$(mktemp -d -t ${LANG_CODE}-${TIMESTAMP}.XXX) - else - TMP_DIR=$(mktemp -d -p ${TMP_DIR} -t ${LANG_CODE}-${TIMESTAMP}.XXX) - fi - TRAINING_DIR=${TMP_DIR} - # Location of log file for the whole run. - LOG_FILE=${TRAINING_DIR}/tesstrain.log - - # Take training text and wordlist from the langdata directory if not - # specified in the command-line. - TRAINING_TEXT=${TRAINING_TEXT:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text} - WORDLIST_FILE=${WORDLIST_FILE:-${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist} - - WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams - NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers - PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc - BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs - UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs - TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams - GENERATE_DAWGS=1 -} - -# Function initializes font config with a unique font cache dir. -initialize_fontconfig() { - export FONT_CONFIG_CACHE - local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt - echo "Text" >${sample_path} - run_command text2image --fonts_dir=${FONTS_DIR} --ptsize ${PT_SIZE} \ - --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \ - --fontconfig_tmpdir=${FONT_CONFIG_CACHE} -} - -# Helper function for phaseI_generate_image. Generates the image for a single -# language/font combination in a way that can be run in parallel. -generate_font_image() { - local font="$1" - tlog "Rendering using ${font}" - local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') - local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} - - local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}" - common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words" - common_args+=" --leading=${LEADING} --xsize=${X_SIZE}" - common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}" - common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}" - if $DISTORT_IMAGE; then - common_args+=" --distort_image --invert=false" - fi - - # add --writing_mode=vertical-upright to common_args if the font is - # specified to be rendered vertically. - for vfont in "${VERTICAL_FONTS[@]}"; do - if [[ "${font}" == "${vfont}" ]]; then - common_args+=" --writing_mode=vertical-upright " - break - fi - done - - run_command text2image ${common_args} --font="${font}" --ptsize ${PT_SIZE} \ - --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS:-} - check_file_readable ${outbase}.box ${outbase}.tif - - if $EXTRACT_FONT_PROPERTIES && - [[ -r ${TRAIN_NGRAMS_FILE} ]]; then - tlog "Extracting font properties of ${font}" - run_command text2image ${common_args} --font="${font}" \ - --ligatures=false --text=${TRAIN_NGRAMS_FILE} \ - --only_extract_font_properties --ptsize=32 - check_file_readable ${outbase}.fontinfo - fi -} - -# Phase I : Generate (I)mages from training text for each font. -phase_I_generate_image() { - local par_factor=${1:-} - if ! [[ "${par_factor}" -gt 0 ]]; then - par_factor=1 - fi - tlog "\n=== Phase I: Generating training images ===" - if [[ -z ${TRAINING_TEXT:-} ]] || test ! -r "${TRAINING_TEXT}"; then - err_exit "Could not find training text file ${TRAINING_TEXT:-}" - fi - CHAR_SPACING="0.0" - - for EXPOSURE in $EXPOSURES; do - if $EXTRACT_FONT_PROPERTIES && [[ -r ${BIGRAM_FREQS_FILE} ]]; then - # Parse .bigram_freqs file and compose a .train_ngrams file with text - # for tesseract to recognize during training. Take only the ngrams whose - # combined weight accounts for 95% of all the bigrams in the language. - NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ - | awk '{s=s+$2}; END {print (s/100)*p}' p=99) - sort -rnk2 ${BIGRAM_FREQS_FILE} \ - | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ - x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} - check_file_readable ${TRAIN_NGRAMS_FILE} - fi - - local jobs= - trap "kill $$" INT - for font in "${FONTS[@]}"; do - sleep 1 - test $(jobs -r | wc -l) -ge $par_factor && wait -n - generate_font_image "${font}" & - jobs="$jobs $!" - done - wait $jobs - # Check that each process was successful. - for font in "${FONTS[@]}"; do - local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') - local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} - check_file_readable ${outbase}.box ${outbase}.tif - done - done - if $SAVE_BOX_TIFF && ( ! $LINEDATA ) ; then - tlog "\n=== Saving box/tiff pairs for training data ===" - for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - cp "${f}" "${OUTPUT_DIR}" - done - for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - cp "${f}" "${OUTPUT_DIR}" - done - fi -} - -# Phase UP : Generate (U)nicharset and (P)roperties file. -phase_UP_generate_unicharset() { - tlog "\n=== Phase UP: Generating unicharset and unichar properties files ===" - - local box_files=$(ls ${TRAINING_DIR}/*.box) - UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset" - if [[ "${NORM_MODE}" == "2" ]] && [[ "${LANG_IS_RTL}" == "0" ]] ; then - run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \ - --norm_mode "${NORM_MODE}" ${TRAINING_TEXT} - else - run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \ - --norm_mode "${NORM_MODE}" ${box_files} - fi - check_file_readable ${UNICHARSET_FILE} - - XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights" - run_command set_unicharset_properties \ - -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \ - --script_dir=${LANGDATA_ROOT} - check_file_readable ${XHEIGHTS_FILE} -} - -# Phase D : Generate (D)awg files from unicharset file and wordlist files -phase_D_generate_dawg() { - tlog "\n=== Phase D: Generating Dawg files ===" - - # Skip if requested - if [[ ${GENERATE_DAWGS} -eq 0 ]]; then - tlog "Skipping ${phase_name}" - return - fi - - # Output files - WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg - FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg - PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg - NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg - BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg - - # Word DAWG - local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq - if [[ -s ${WORDLIST_FILE} ]]; then - tlog "Generating word Dawg" - check_file_readable ${UNICHARSET_FILE} - run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ - ${UNICHARSET_FILE} - check_file_readable ${WORD_DAWG} - - FREQ_DAWG_SIZE=100 - head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file} - fi - - # Freq-word DAWG - if [[ -s ${freq_wordlist_file} ]]; then - check_file_readable ${UNICHARSET_FILE} - tlog "Generating frequent-word Dawg" - run_command wordlist2dawg -r 1 ${freq_wordlist_file} \ - ${FREQ_DAWG} ${UNICHARSET_FILE} - check_file_readable ${FREQ_DAWG} - fi - - # Punctuation DAWG - # -r arguments to wordlist2dawg denote RTL reverse policy - # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h). - # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, - # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, - # 2/RRP_FORCE_REVERSE for the punctuation DAWG. - local punc_reverse_policy=0; - if [[ "${LANG_IS_RTL}" == "1" ]]; then - punc_reverse_policy=2 - fi - if [[ ! -s ${PUNC_FILE} ]]; then - PUNC_FILE="${LANGDATA_ROOT}/common.punc" - fi - check_file_readable ${PUNC_FILE} - run_command wordlist2dawg -r ${punc_reverse_policy} \ - ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE} - check_file_readable ${PUNC_DAWG} - - # Numbers DAWG - if [[ -s ${NUMBERS_FILE} ]]; then - run_command wordlist2dawg -r 0 \ - ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE} - check_file_readable ${NUMBER_DAWG} - fi - - # Bigram dawg - if [[ -s ${WORD_BIGRAMS_FILE} ]]; then - run_command wordlist2dawg -r 1 \ - ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} - check_file_readable ${BIGRAM_DAWG} - fi -} - -# Phase E : (E)xtract .tr feature files from .tif/.box files -phase_E_extract_features() { - local box_config=$1 - local par_factor=$2 - local ext=$3 - if ! [[ "${par_factor}" -gt 0 ]]; then - par_factor=1 - fi - tlog "\n=== Phase E: Generating ${ext} files ===" - - local img_files="" - for exposure in ${EXPOSURES}; do - img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif) - done - - # Use any available language-specific configs. - local config="" - if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then - config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config - fi - - OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX} - export TESSDATA_PREFIX=${TESSDATA_DIR} - tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}" - local jobs= - trap "kill $$" INT - for img_file in ${img_files}; do - test $(jobs -r | wc -l) -ge $par_factor && wait -n - run_command tesseract ${img_file} ${img_file%.*} \ - ${box_config} ${config} & - jobs="$jobs $!" - done - wait $jobs - export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX} - # Check that all the output files were produced. - for img_file in ${img_files}; do - check_file_readable "${img_file%.*}.${ext}" - done -} - -# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining) -# phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto -phase_C_cluster_prototypes() { - tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" - local out_normproto=$1 - - run_command cntraining -D "${TRAINING_DIR}/" \ - $(ls ${TRAINING_DIR}/*.tr) - - check_file_readable ${TRAINING_DIR}/normproto - mv ${TRAINING_DIR}/normproto ${out_normproto} -} - -# Phase S : (S)hape clustering -phase_S_cluster_shapes() { - if ! $RUN_SHAPE_CLUSTERING; then - tlog "\n=== Shape Clustering disabled ===" - return - fi - check_file_readable ${LANGDATA_ROOT}/font_properties - local font_props="-F ${LANGDATA_ROOT}/font_properties" - if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\ - [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then - font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" - fi - - run_command shapeclustering \ - -D "${TRAINING_DIR}/" \ - -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ - -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ - ${font_props} \ - $(ls ${TRAINING_DIR}/*.tr) - check_file_readable ${TRAINING_DIR}/shapetable \ - ${TRAINING_DIR}/${LANG_CODE}.mfunicharset -} - -# Phase M : Clustering microfeatures (mfTraining) -phase_M_cluster_microfeatures() { - tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ===" - - check_file_readable ${LANGDATA_ROOT}/font_properties - font_props="-F ${LANGDATA_ROOT}/font_properties" - if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \ - [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then - font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" - fi - - run_command mftraining \ - -D "${TRAINING_DIR}/" \ - -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ - -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ - ${font_props} \ - $(ls ${TRAINING_DIR}/*.tr) - check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \ - ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset - mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp - mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable - mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable - mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset -} - -phase_B_generate_ambiguities() { - tlog "\n=== Phase B : ambiguities training ===" - - # Check for manually created ambiguities data. - if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then - tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs" - cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \ - ${TRAINING_DIR}/${LANG_CODE}.unicharambigs - # Make it writable, as it may be read-only in the client. - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs - return - else - tlog "No unicharambigs file found!" - fi - - # TODO: Add support for generating ambiguities automatically. -} - -make__lstmdata() { - tlog "\n=== Constructing LSTM training data ===" - local lang_prefix="${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}" - local lang_is_rtl="" - if [[ "${LANG_IS_RTL}" == "1" ]]; then - lang_is_rtl="--lang_is_rtl" - fi - local pass_through="" - if [[ "${NORM_MODE}" -ge "2" ]]; then - pass_through="--pass_through_recoder" - fi - - # Build the starter traineddata from the inputs. - run_command combine_lang_model \ - --input_unicharset "${TRAINING_DIR}/${LANG_CODE}.unicharset" \ - --script_dir "${LANGDATA_ROOT}" \ - --words "${lang_prefix}.wordlist" \ - --numbers "${lang_prefix}.numbers" \ - --puncs "${lang_prefix}.punc" \ - --output_dir "${OUTPUT_DIR}" --lang "${LANG_CODE}" \ - "${pass_through}" "${lang_is_rtl}" - - if $SAVE_BOX_TIFF; then - tlog "\n=== Saving box/tiff pairs for training data ===" - for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - mv "${f}" "${OUTPUT_DIR}" - done - for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - mv "${f}" "${OUTPUT_DIR}" - done - fi - - tlog "\n=== Moving lstmf files for training data ===" - for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - mv "${f}" "${OUTPUT_DIR}" - done - local lstm_list="${OUTPUT_DIR}/${LANG_CODE}.training_files.txt" - ls -1 "${OUTPUT_DIR}/${LANG_CODE}".*.lstmf > "${lstm_list}" -} - -make__traineddata() { - tlog "\n=== Making final traineddata file ===" - local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE} - - # Combine available files for this language from the langdata dir. - if [[ -r ${lang_prefix}.config ]]; then - tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}" - cp ${lang_prefix}.config ${TRAINING_DIR} - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config - fi - if [[ -r ${lang_prefix}.params-model ]]; then - tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}" - cp ${lang_prefix}.params-model ${TRAINING_DIR} - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model - fi - - # Compose the traineddata file. - run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}. - - # Copy it to the output dir, overwriting only if allowed by the cmdline flag. - local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata; - if [[ -f ${destfile} ]] && ! $OVERWRITE; then - err_exit "File ${destfile} exists and no --overwrite specified"; - fi - tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}" - cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile} -} diff --git a/src/training/unicharset/lstmtester.h b/src/training/unicharset/lstmtester.h index 678aefbb..b0e3a1cf 100644 --- a/src/training/unicharset/lstmtester.h +++ b/src/training/unicharset/lstmtester.h @@ -35,7 +35,7 @@ public: // Loads a set of lstmf files that were created using the lstm.train config to // tesseract into memory ready for testing. Returns false if nothing was // loaded. The arg is a filename of a file that lists the filenames, with one - // name per line. Conveniently, tesstrain.sh generates such a file, along + // name per line. Conveniently, tesstrain.py generates such a file, along // with the files themselves. bool LoadAllEvalData(const char *filenames_file); // Loads a set of lstmf files that were created using the lstm.train config to diff --git a/unittest/lstm_test.cc b/unittest/lstm_test.cc index 45302ee9..4b3d4ac2 100644 --- a/unittest/lstm_test.cc +++ b/unittest/lstm_test.cc @@ -15,7 +15,7 @@ // // Use --xsize 800 for text2image to be similar to original training data. // -// src/training/tesstrain.sh --fonts_dir /usr/share/fonts --lang eng \ +// tesstrain.py --fonts_dir /usr/share/fonts --lang eng \ // --linedata_only --noextract_font_properties --langdata_dir ../langdata_lstm \ // --tessdata_dir ../tessdata --output_dir ~/tesseract/test/testdata \ // --fontlist "Arial" --maxpages 10