Javanese script training

This commit is contained in:
Shree Devi Kumar 2018-08-16 12:15:10 +00:00
parent e1c387c9b3
commit b34cf9d424
6 changed files with 31 additions and 14 deletions

View File

@ -22,7 +22,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
ell eng enm epo est eus fas fil fin fra frk frm gle glg ell eng enm epo est eus fas fil fin fra frk frm gle glg
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
jav jpn kan kat kat_old kaz khm kir kor kur lao lat jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
pan pol por pus ron rus san sin slk slv snd spa spa_old pan pol por pus ron rus san sin slk slv snd spa spa_old
sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur
@ -604,6 +604,10 @@ BURMESE_FONTS=( \
"TharLon" \ "TharLon" \
) )
JAVANESE_FONTS=( \
"Prada" \
)
NORTH_AMERICAN_ABORIGINAL_FONTS=( \ NORTH_AMERICAN_ABORIGINAL_FONTS=( \
"Aboriginal Sans" \ "Aboriginal Sans" \
"Aboriginal Sans Bold Italic" \ "Aboriginal Sans Bold Italic" \
@ -1065,6 +1069,10 @@ set_lang_specific_parameters() {
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;; test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
# SouthEast Asian scripts. # SouthEast Asian scripts.
jav_java ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
test -z "$FONTS" && FONTS=( "${JAVANESE_FONTS[@]}" ) ;;
khm ) MEAN_COUNT="15" khm ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15 WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
@ -1172,7 +1180,7 @@ set_lang_specific_parameters() {
LANG_IS_RTL="1" LANG_IS_RTL="1"
NORM_MODE="2" ;; NORM_MODE="2" ;;
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \ asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav ) dzo | sin | san | bod | ori | khm | mya | tha | lao | jav | jav_java)
LANG_IS_RTL="0" LANG_IS_RTL="0"
NORM_MODE="2" ;; NORM_MODE="2" ;;
* ) * )

View File

@ -61,7 +61,7 @@ initialize_fontconfig
phase_I_generate_image 8 phase_I_generate_image 8
phase_UP_generate_unicharset phase_UP_generate_unicharset
if ((LINEDATA)); then if ((LINEDATA)); then
phase_E_extract_features "lstm.train" 8 "lstmf" phase_E_extract_features " --psm 6 lstm.train " 8 "lstmf"
make__lstmdata make__lstmdata
tlog "\nCreated starter traineddata for language '${LANG_CODE}'\n" tlog "\nCreated starter traineddata for language '${LANG_CODE}'\n"
tlog "\nRun lstmtraining to do the LSTM training for language '${LANG_CODE}'\n" tlog "\nRun lstmtraining to do the LSTM training for language '${LANG_CODE}'\n"

View File

@ -23,6 +23,7 @@ else
FONTS_DIR="/usr/share/fonts/" FONTS_DIR="/usr/share/fonts/"
FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX) FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
fi fi
MAX_PAGES=0
OUTPUT_DIR="/tmp/tesstrain/tessdata" OUTPUT_DIR="/tmp/tesstrain/tessdata"
OVERWRITE=0 OVERWRITE=0
LINEDATA=0 LINEDATA=0
@ -130,6 +131,9 @@ parse_flags() {
--langdata_dir) --langdata_dir)
parse_value "LANGDATA_ROOT" ${ARGV[$j]} parse_value "LANGDATA_ROOT" ${ARGV[$j]}
i=$j ;; i=$j ;;
--maxpages)
parse_value "MAX_PAGES" ${ARGV[$j]}
i=$j ;;
--output_dir) --output_dir)
parse_value "OUTPUT_DIR" ${ARGV[$j]} parse_value "OUTPUT_DIR" ${ARGV[$j]}
i=$j ;; i=$j ;;
@ -221,7 +225,7 @@ generate_font_image() {
common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words" common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
common_args+=" --leading=${LEADING}" common_args+=" --leading=${LEADING}"
common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}" common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
common_args+=" --outputbase=${outbase} --max_pages=0" common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}"
# add --writing_mode=vertical-upright to common_args if the font is # add --writing_mode=vertical-upright to common_args if the font is
# specified to be rendered vertically. # specified to be rendered vertically.
@ -246,7 +250,6 @@ generate_font_image() {
fi fi
} }
# Phase I : Generate (I)mages from training text for each font. # Phase I : Generate (I)mages from training text for each font.
phase_I_generate_image() { phase_I_generate_image() {
local par_factor=$1 local par_factor=$1

View File

@ -65,7 +65,7 @@ Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
return CharClass::kConsonant; return CharClass::kConsonant;
// Sinhala doesn't have Nukta or Avagraha. // Sinhala doesn't have Nukta or Avagraha.
if (off == 0x3c) return CharClass::kNukta; if (off == 0x3c) return CharClass::kNukta;
if (off == 0x3d) return CharClass::kVowel; if (off == 0x3d) return CharClass::kVowel; // avagraha
if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra; if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece; if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
if (off == 0x4d) return CharClass::kVirama; if (off == 0x4d) return CharClass::kVirama;

View File

@ -26,12 +26,13 @@ namespace tesseract {
// Taken from unicode standard: // Taken from unicode standard:
// http://www.unicode.org/charts/PDF/UA980.pdf // http://www.unicode.org/charts/PDF/UA980.pdf
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf // http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
// The Consonant class here includes independent vowels.
// The order of components in an orthographic syllable as expressed in BNF is: // The order of components in an orthographic syllable as expressed in BNF is:
// {C F} C {{R}Y} {V{A}} {Z} // {C F} C {{R}Y} {V{A}} {Z}
// Translated to the codes used by the CharClass enum: // Translated to the codes used by the CharClass enum:
// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D] // [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
// Also the Consonant class here includes independent vowels, as they are // Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
// treated the same anyway. // Validation rules copied from validate_indic.cpp and modified for Javanese.
// Indic - for reference // Indic - for reference
// + vowel Grapheme: V[D](v)* // + vowel Grapheme: V[D](v)*
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* // + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
@ -63,7 +64,6 @@ bool ValidateJavanese::ConsumeGraphemeIfValid() {
} }
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
// Offset from the start of the relevant unicode code block aka code page. // Offset from the start of the relevant unicode code block aka code page.
@ -74,6 +74,8 @@ Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
if (off <= 0x39) return CharClass::kMatra;
if (off <= 0x3a) return CharClass::kMatraPiece; // A9BA TALING
if (off <= 0x3d) return CharClass::kMatra; if (off <= 0x3d) return CharClass::kMatra;
if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
@ -229,6 +231,11 @@ bool ValidateJavanese::ConsumeConsonantTailIfValid() {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
} }
// Tarung is also used for the long versions of the u and o vowels and vocalic r.
// Taling + Tarung is a valid combination, e.g. ꦏ + ◌ꦺ + ◌ꦴ
while (codes_[codes_used_].first == CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
while (codes_[codes_used_].first == CharClass::kVowelModifier) { while (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true; if (UseMultiCode(1)) return true;
} }
@ -260,4 +267,3 @@ bool ValidateJavanese::ConsumeVowelIfValid() {
} }
} // namespace tesseract } // namespace tesseract