From 37befdf6c4d53ca37767b95b5efeac3ed01d81f8 Mon Sep 17 00:00:00 2001 From: Shree Date: Fri, 15 Mar 2019 13:32:36 +0000 Subject: [PATCH 1/4] Add option for --distort_image --- src/training/tesstrain_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/training/tesstrain_utils.py b/src/training/tesstrain_utils.py index dcd7295e..54725205 100644 --- a/src/training/tesstrain_utils.py +++ b/src/training/tesstrain_utils.py @@ -172,6 +172,10 @@ parser.add_argument( "--noextract_font_properties", dest="extract_font_properties", action="store_false" ) +parser.add_argument( + "--distort_image", dest="distort_image", help="--distort_image=true." +) + tessdata_group = parser.add_argument_group( "tessdata", "OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.", @@ -310,6 +314,7 @@ def generate_font_image(ctx, font, exposure, char_spacing): f"--exposure={exposure}", f"--outputbase={outbase}", f"--max_pages={ctx.max_pages}", + f"--distort_image={ctx.distort_image}", ] # add --writing_mode=vertical-upright to common_args if the font is From b2ebf0195f3c7e6f3e95b816466e7210baaeb7d0 Mon Sep 17 00:00:00 2001 From: Shree Date: Fri, 15 Mar 2019 14:39:39 +0000 Subject: [PATCH 2/4] Add kmr and kur_ara, remove kur from training scripts --- src/training/language_specific.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/training/language_specific.py b/src/training/language_specific.py index a5cc89a4..8c7d27bf 100644 --- a/src/training/language_specific.py +++ b/src/training/language_specific.py @@ -30,7 +30,7 @@ VALID_LANGUAGE_CODES = ( "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo " "ell eng enm epo est eus fas fil fin fra frk frm gle glg " "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old " - "jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat " + "jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat " "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori " "pan pol por pus ron rus san sin slk slv snd spa spa_old " "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur " @@ -1302,7 +1302,10 @@ def set_lang_specific_parameters(ctx, lang): if not FONTS: FONTS = KYRGYZ_FONTS TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"] - elif lang == "kur": + elif lang == "kmr": + if not FONTS: + FONTS = LATIN_FONTS + elif lang == "kur_ara": if not FONTS: FONTS = KURDISH_FONTS else: From 3eee1d217a3a984ecb817460d34ece1e3a3affcd Mon Sep 17 00:00:00 2001 From: Shree Date: Fri, 15 Mar 2019 15:37:49 +0000 Subject: [PATCH 3/4] Add kmr and kur_ara, remove kur from training scripts --- src/training/language-specific.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh index 10e4677c..46ca36ee 100755 --- a/src/training/language-specific.sh +++ b/src/training/language-specific.sh @@ -22,7 +22,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo ell eng enm epo est eus fas fil fin fra frk frm gle glg grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old - jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat + jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori pan pol por pus ron rus san sin slk slv snd spa spa_old sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur @@ -1164,7 +1164,8 @@ set_lang_specific_parameters() { test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" ) TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; - kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; + kmr ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; + kur_ara ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; *) err_exit "Error: ${lang} is not a valid language code" esac From d47b0d588a571d66efaa0246454b6e66a5c32a92 Mon Sep 17 00:00:00 2001 From: Shree Date: Fri, 15 Mar 2019 15:47:56 +0000 Subject: [PATCH 4/4] Use LATIN_FONTS for kmr --- src/training/language-specific.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh index 46ca36ee..ef72af5e 100755 --- a/src/training/language-specific.sh +++ b/src/training/language-specific.sh @@ -1164,7 +1164,7 @@ set_lang_specific_parameters() { test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" ) TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; - kmr ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; + kmr ) test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) ;; kur_ara ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; *) err_exit "Error: ${lang} is not a valid language code"