mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-12 15:39:04 +08:00
Javanese script training
This commit is contained in:
parent
e1c387c9b3
commit
b34cf9d424
@ -22,7 +22,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
|
||||
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
|
||||
ell eng enm epo est eus fas fil fin fra frk frm gle glg
|
||||
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
|
||||
jav jpn kan kat kat_old kaz khm kir kor kur lao lat
|
||||
jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat
|
||||
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
|
||||
pan pol por pus ron rus san sin slk slv snd spa spa_old
|
||||
sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur
|
||||
@ -603,6 +603,10 @@ BURMESE_FONTS=( \
|
||||
"Padauk" \
|
||||
"TharLon" \
|
||||
)
|
||||
|
||||
JAVANESE_FONTS=( \
|
||||
"Prada" \
|
||||
)
|
||||
|
||||
NORTH_AMERICAN_ABORIGINAL_FONTS=( \
|
||||
"Aboriginal Sans" \
|
||||
@ -1065,6 +1069,10 @@ set_lang_specific_parameters() {
|
||||
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
|
||||
|
||||
# Southeast Asian scripts.
|
||||
jav_java ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
test -z "$FONTS" && FONTS=( "${JAVANESE_FONTS[@]}" ) ;;
|
||||
khm ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
@ -1172,7 +1180,7 @@ set_lang_specific_parameters() {
|
||||
LANG_IS_RTL="1"
|
||||
NORM_MODE="2" ;;
|
||||
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
|
||||
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
|
||||
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav | jav_java)
|
||||
LANG_IS_RTL="0"
|
||||
NORM_MODE="2" ;;
|
||||
* )
|
||||
|
@ -61,7 +61,7 @@ initialize_fontconfig
|
||||
phase_I_generate_image 8
|
||||
phase_UP_generate_unicharset
|
||||
if ((LINEDATA)); then
|
||||
phase_E_extract_features "lstm.train" 8 "lstmf"
|
||||
phase_E_extract_features " --psm 6 lstm.train " 8 "lstmf"
|
||||
make__lstmdata
|
||||
tlog "\nCreated starter traineddata for language '${LANG_CODE}'\n"
|
||||
tlog "\nRun lstmtraining to do the LSTM training for language '${LANG_CODE}'\n"
|
||||
|
@ -23,6 +23,7 @@ else
|
||||
FONTS_DIR="/usr/share/fonts/"
|
||||
FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
|
||||
fi
|
||||
MAX_PAGES=0
|
||||
OUTPUT_DIR="/tmp/tesstrain/tessdata"
|
||||
OVERWRITE=0
|
||||
LINEDATA=0
|
||||
@ -130,6 +131,9 @@ parse_flags() {
|
||||
--langdata_dir)
|
||||
parse_value "LANGDATA_ROOT" ${ARGV[$j]}
|
||||
i=$j ;;
|
||||
--maxpages)
|
||||
parse_value "MAX_PAGES" ${ARGV[$j]}
|
||||
i=$j ;;
|
||||
--output_dir)
|
||||
parse_value "OUTPUT_DIR" ${ARGV[$j]}
|
||||
i=$j ;;
|
||||
@ -221,7 +225,7 @@ generate_font_image() {
|
||||
common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
|
||||
common_args+=" --leading=${LEADING}"
|
||||
common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
|
||||
common_args+=" --outputbase=${outbase} --max_pages=0"
|
||||
common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}"
|
||||
|
||||
# add --writing_mode=vertical-upright to common_args if the font is
|
||||
# specified to be rendered vertically.
|
||||
@ -233,7 +237,7 @@ generate_font_image() {
|
||||
done
|
||||
|
||||
run_command text2image ${common_args} --font="${font}" \
|
||||
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
|
||||
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
|
||||
check_file_readable ${outbase}.box ${outbase}.tif
|
||||
|
||||
if ((EXTRACT_FONT_PROPERTIES)) &&
|
||||
@ -246,7 +250,6 @@ generate_font_image() {
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
# Phase I : Generate (I)mages from training text for each font.
|
||||
phase_I_generate_image() {
|
||||
local par_factor=$1
|
||||
|
@ -65,7 +65,7 @@ Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
|
||||
return CharClass::kConsonant;
|
||||
// Sinhala doesn't have Nukta or Avagraha.
|
||||
if (off == 0x3c) return CharClass::kNukta;
|
||||
if (off == 0x3d) return CharClass::kVowel;
|
||||
if (off == 0x3d) return CharClass::kVowel; // avagraha
|
||||
if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
|
||||
if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
|
||||
if (off == 0x4d) return CharClass::kVirama;
|
||||
|
@ -26,12 +26,13 @@ namespace tesseract {
|
||||
// Taken from unicode standard:
|
||||
// http://www.unicode.org/charts/PDF/UA980.pdf
|
||||
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
|
||||
// The Consonant class here includes independent vowels.
|
||||
// The order of components in an orthographic syllable as expressed in BNF is:
|
||||
// {C F} C {{R}Y} {V{A}} {Z}
|
||||
// Translated to the codes used by the CharClass enum:
|
||||
// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
|
||||
// Also the Consonant class here includes independent vowels, as they are
|
||||
// treated the same anyway.
|
||||
// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
|
||||
// Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
|
||||
// Validation rules copied from validate_indic.cpp and modified for Javanese.
|
||||
// Indic - for reference
|
||||
// + vowel Grapheme: V[D](v)*
|
||||
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
|
||||
@ -63,7 +64,6 @@ bool ValidateJavanese::ConsumeGraphemeIfValid() {
|
||||
}
|
||||
|
||||
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
|
||||
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
|
||||
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
|
||||
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
|
||||
// Offset from the start of the relevant unicode code block aka code page.
|
||||
@ -74,6 +74,8 @@ Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
|
||||
if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
|
||||
if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
|
||||
if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
|
||||
if (off <= 0x39) return CharClass::kMatra;
|
||||
if (off <= 0x3a) return CharClass::kMatraPiece; // A9BA TALING
|
||||
if (off <= 0x3d) return CharClass::kMatra;
|
||||
if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
|
||||
if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
|
||||
@ -229,6 +231,11 @@ bool ValidateJavanese::ConsumeConsonantTailIfValid() {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
}
|
||||
// Tarung is also used for the long versions of the u and o vowels and for vocalic r.
|
||||
// Taling + Tarung is valid, e.g. ꦏ + ◌ꦺ + ◌ꦴ
|
||||
while (codes_[codes_used_].first == CharClass::kMatraPiece) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
@ -259,5 +266,4 @@ bool ValidateJavanese::ConsumeVowelIfValid() {
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
} // namespace tesseract
|
@ -60,4 +60,4 @@ class ValidateJavanese : public Validator {
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
|
||||
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
|
Loading…
Reference in New Issue
Block a user