mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-05 02:47:00 +08:00
Javanese script training
This commit is contained in:
parent
e1c387c9b3
commit
b34cf9d424
@ -22,7 +22,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
|
|||||||
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
|
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
|
||||||
ell eng enm epo est eus fas fil fin fra frk frm gle glg
|
ell eng enm epo est eus fas fil fin fra frk frm gle glg
|
||||||
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
|
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
|
||||||
jav jpn kan kat kat_old kaz khm kir kor kur lao lat
|
jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat
|
||||||
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
|
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
|
||||||
pan pol por pus ron rus san sin slk slv snd spa spa_old
|
pan pol por pus ron rus san sin slk slv snd spa spa_old
|
||||||
sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur
|
sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur
|
||||||
@ -604,6 +604,10 @@ BURMESE_FONTS=( \
|
|||||||
"TharLon" \
|
"TharLon" \
|
||||||
)
|
)
|
||||||
|
|
||||||
|
JAVANESE_FONTS=( \
|
||||||
|
"Prada" \
|
||||||
|
)
|
||||||
|
|
||||||
NORTH_AMERICAN_ABORIGINAL_FONTS=( \
|
NORTH_AMERICAN_ABORIGINAL_FONTS=( \
|
||||||
"Aboriginal Sans" \
|
"Aboriginal Sans" \
|
||||||
"Aboriginal Sans Bold Italic" \
|
"Aboriginal Sans Bold Italic" \
|
||||||
@ -1065,6 +1069,10 @@ set_lang_specific_parameters() {
|
|||||||
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
|
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
|
||||||
|
|
||||||
# SouthEast Asian scripts.
|
# SouthEast Asian scripts.
|
||||||
|
jav_java ) MEAN_COUNT="15"
|
||||||
|
WORD_DAWG_FACTOR=0.15
|
||||||
|
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||||
|
test -z "$FONTS" && FONTS=( "${JAVANESE_FONTS[@]}" ) ;;
|
||||||
khm ) MEAN_COUNT="15"
|
khm ) MEAN_COUNT="15"
|
||||||
WORD_DAWG_FACTOR=0.15
|
WORD_DAWG_FACTOR=0.15
|
||||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||||
@ -1172,7 +1180,7 @@ set_lang_specific_parameters() {
|
|||||||
LANG_IS_RTL="1"
|
LANG_IS_RTL="1"
|
||||||
NORM_MODE="2" ;;
|
NORM_MODE="2" ;;
|
||||||
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
|
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
|
||||||
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
|
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav | jav_java)
|
||||||
LANG_IS_RTL="0"
|
LANG_IS_RTL="0"
|
||||||
NORM_MODE="2" ;;
|
NORM_MODE="2" ;;
|
||||||
* )
|
* )
|
||||||
|
@ -61,7 +61,7 @@ initialize_fontconfig
|
|||||||
phase_I_generate_image 8
|
phase_I_generate_image 8
|
||||||
phase_UP_generate_unicharset
|
phase_UP_generate_unicharset
|
||||||
if ((LINEDATA)); then
|
if ((LINEDATA)); then
|
||||||
phase_E_extract_features "lstm.train" 8 "lstmf"
|
phase_E_extract_features " --psm 6 lstm.train " 8 "lstmf"
|
||||||
make__lstmdata
|
make__lstmdata
|
||||||
tlog "\nCreated starter traineddata for language '${LANG_CODE}'\n"
|
tlog "\nCreated starter traineddata for language '${LANG_CODE}'\n"
|
||||||
tlog "\nRun lstmtraining to do the LSTM training for language '${LANG_CODE}'\n"
|
tlog "\nRun lstmtraining to do the LSTM training for language '${LANG_CODE}'\n"
|
||||||
|
@ -23,6 +23,7 @@ else
|
|||||||
FONTS_DIR="/usr/share/fonts/"
|
FONTS_DIR="/usr/share/fonts/"
|
||||||
FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
|
FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
|
||||||
fi
|
fi
|
||||||
|
MAX_PAGES=0
|
||||||
OUTPUT_DIR="/tmp/tesstrain/tessdata"
|
OUTPUT_DIR="/tmp/tesstrain/tessdata"
|
||||||
OVERWRITE=0
|
OVERWRITE=0
|
||||||
LINEDATA=0
|
LINEDATA=0
|
||||||
@ -130,6 +131,9 @@ parse_flags() {
|
|||||||
--langdata_dir)
|
--langdata_dir)
|
||||||
parse_value "LANGDATA_ROOT" ${ARGV[$j]}
|
parse_value "LANGDATA_ROOT" ${ARGV[$j]}
|
||||||
i=$j ;;
|
i=$j ;;
|
||||||
|
--maxpages)
|
||||||
|
parse_value "MAX_PAGES" ${ARGV[$j]}
|
||||||
|
i=$j ;;
|
||||||
--output_dir)
|
--output_dir)
|
||||||
parse_value "OUTPUT_DIR" ${ARGV[$j]}
|
parse_value "OUTPUT_DIR" ${ARGV[$j]}
|
||||||
i=$j ;;
|
i=$j ;;
|
||||||
@ -221,7 +225,7 @@ generate_font_image() {
|
|||||||
common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
|
common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words"
|
||||||
common_args+=" --leading=${LEADING}"
|
common_args+=" --leading=${LEADING}"
|
||||||
common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
|
common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
|
||||||
common_args+=" --outputbase=${outbase} --max_pages=0"
|
common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}"
|
||||||
|
|
||||||
# add --writing_mode=vertical-upright to common_args if the font is
|
# add --writing_mode=vertical-upright to common_args if the font is
|
||||||
# specified to be rendered vertically.
|
# specified to be rendered vertically.
|
||||||
@ -246,7 +250,6 @@ generate_font_image() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# Phase I : Generate (I)mages from training text for each font.
|
# Phase I : Generate (I)mages from training text for each font.
|
||||||
phase_I_generate_image() {
|
phase_I_generate_image() {
|
||||||
local par_factor=$1
|
local par_factor=$1
|
||||||
|
@ -65,7 +65,7 @@ Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
|
|||||||
return CharClass::kConsonant;
|
return CharClass::kConsonant;
|
||||||
// Sinhala doesn't have Nukta or Avagraha.
|
// Sinhala doesn't have Nukta or Avagraha.
|
||||||
if (off == 0x3c) return CharClass::kNukta;
|
if (off == 0x3c) return CharClass::kNukta;
|
||||||
if (off == 0x3d) return CharClass::kVowel;
|
if (off == 0x3d) return CharClass::kVowel; // avagraha
|
||||||
if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
|
if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
|
||||||
if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
|
if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
|
||||||
if (off == 0x4d) return CharClass::kVirama;
|
if (off == 0x4d) return CharClass::kVirama;
|
||||||
|
@ -26,12 +26,13 @@ namespace tesseract {
|
|||||||
// Taken from unicode standard:
|
// Taken from unicode standard:
|
||||||
// http://www.unicode.org/charts/PDF/UA980.pdf
|
// http://www.unicode.org/charts/PDF/UA980.pdf
|
||||||
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
|
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
|
||||||
|
// The Consonant class here includes independent vowels.
|
||||||
// The order of components in an orthographic syllable as expressed in BNF is:
|
// The order of components in an orthographic syllable as expressed in BNF is:
|
||||||
// {C F} C {{R}Y} {V{A}} {Z}
|
// {C F} C {{R}Y} {V{A}} {Z}
|
||||||
// Translated to the codes used by the CharClass enum:
|
// Translated to the codes used by the CharClass enum:
|
||||||
// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
|
// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
|
||||||
// Also the Consonant class here includes independent vowels, as they are
|
// Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
|
||||||
// treated the same anyway.
|
// Validation rules copied from validate_indic.cpp and modified for Javanese.
|
||||||
// Indic - for reference
|
// Indic - for reference
|
||||||
// + vowel Grapheme: V[D](v)*
|
// + vowel Grapheme: V[D](v)*
|
||||||
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
|
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
|
||||||
@ -63,7 +64,6 @@ bool ValidateJavanese::ConsumeGraphemeIfValid() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
|
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
|
||||||
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
|
|
||||||
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
|
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
|
||||||
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
|
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
|
||||||
// Offset from the start of the relevant unicode code block aka code page.
|
// Offset from the start of the relevant unicode code block aka code page.
|
||||||
@ -74,6 +74,8 @@ Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
|
|||||||
if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
|
if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
|
||||||
if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
|
if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
|
||||||
if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
|
if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
|
||||||
|
if (off <= 0x39) return CharClass::kMatra;
|
||||||
|
if (off <= 0x3a) return CharClass::kMatraPiece; // A9BA TALING
|
||||||
if (off <= 0x3d) return CharClass::kMatra;
|
if (off <= 0x3d) return CharClass::kMatra;
|
||||||
if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
|
if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
|
||||||
if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
|
if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
|
||||||
@ -229,6 +231,11 @@ bool ValidateJavanese::ConsumeConsonantTailIfValid() {
|
|||||||
if (UseMultiCode(1)) return true;
|
if (UseMultiCode(1)) return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Tarung is also used for the long versions of the u and o vowels and vocalic r.
|
||||||
|
// Taling + Tarung is valid, e.g. ꦏ + ◌ꦺ + ◌ꦴ
|
||||||
|
while (codes_[codes_used_].first == CharClass::kMatraPiece) {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
}
|
||||||
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
||||||
if (UseMultiCode(1)) return true;
|
if (UseMultiCode(1)) return true;
|
||||||
}
|
}
|
||||||
@ -260,4 +267,3 @@ bool ValidateJavanese::ConsumeVowelIfValid() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
} // namespace tesseract
|
} // namespace tesseract
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user