diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh index 10e4677c..ef72af5e 100755 --- a/src/training/language-specific.sh +++ b/src/training/language-specific.sh @@ -22,7 +22,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo ell eng enm epo est eus fas fil fin fra frk frm gle glg grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old - jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat + jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori pan pol por pus ron rus san sin slk slv snd spa spa_old sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur @@ -1164,7 +1164,8 @@ set_lang_specific_parameters() { test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" ) TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; - kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; + kmr ) test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) ;; + kur_ara ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; *) err_exit "Error: ${lang} is not a valid language code" esac diff --git a/src/training/language_specific.py b/src/training/language_specific.py index a5cc89a4..8c7d27bf 100644 --- a/src/training/language_specific.py +++ b/src/training/language_specific.py @@ -30,7 +30,7 @@ VALID_LANGUAGE_CODES = ( "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo " "ell eng enm epo est eus fas fil fin fra frk frm gle glg " "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old " - "jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat " + "jav jav_java jpn kan kat kat_old kaz khm kir kmr kor kur_ara lao lat " "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori " "pan pol por pus ron rus san sin slk slv snd spa spa_old " "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur " @@ -1302,7 
+1302,10 @@ def set_lang_specific_parameters(ctx, lang): if not FONTS: FONTS = KYRGYZ_FONTS TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"] - elif lang == "kur": + elif lang == "kmr": + if not FONTS: + FONTS = LATIN_FONTS + elif lang == "kur_ara": if not FONTS: FONTS = KURDISH_FONTS else: diff --git a/src/training/tesstrain_utils.py b/src/training/tesstrain_utils.py index dcd7295e..54725205 100644 --- a/src/training/tesstrain_utils.py +++ b/src/training/tesstrain_utils.py @@ -172,6 +172,10 @@ parser.add_argument( "--noextract_font_properties", dest="extract_font_properties", action="store_false" ) +parser.add_argument( + "--distort_image", dest="distort_image", default="false", help="--distort_image=true." +) + tessdata_group = parser.add_argument_group( "tessdata", "OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.", @@ -310,6 +314,7 @@ def generate_font_image(ctx, font, exposure, char_spacing): f"--exposure={exposure}", f"--outputbase={outbase}", f"--max_pages={ctx.max_pages}", + f"--distort_image={ctx.distort_image}", ] # add --writing_mode=vertical-upright to common_args if the font is