Merge pull request #1823 from Shreeshrii/javanese

Add support for Javanese script - aksara Jawa
This commit is contained in:
Egor Pugin 2018-08-04 16:08:23 +03:00 committed by GitHub
commit 3b723ba102
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 350 additions and 8 deletions

View File

@ -186,9 +186,9 @@ set(unicharset_training_src
unicharset_training_utils.h unicharset_training_utils.h
validate_grapheme.h validate_indic.h validate_khmer.h validate_grapheme.h validate_indic.h validate_khmer.h
validate_myanmar.h validator.h validate_javanese.h validate_myanmar.h validator.h
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
validate_myanmar.cpp validator.cpp validate_javanese.cpp validate_myanmar.cpp validator.cpp
) )
add_library (unicharset_training ${unicharset_training_src}) add_library (unicharset_training ${unicharset_training_src})

View File

@ -45,6 +45,7 @@ noinst_HEADERS = \
util.h \ util.h \
validate_grapheme.h \ validate_grapheme.h \
validate_indic.h \ validate_indic.h \
validate_javanese.h \
validate_khmer.h \ validate_khmer.h \
validate_myanmar.h \ validate_myanmar.h \
validator.h validator.h
@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \
unicharset_training_utils.cpp \ unicharset_training_utils.cpp \
validate_grapheme.cpp \ validate_grapheme.cpp \
validate_indic.cpp \ validate_indic.cpp \
validate_javanese.cpp \
validate_khmer.cpp \ validate_khmer.cpp \
validate_myanmar.cpp \ validate_myanmar.cpp \
validator.cpp validator.cpp

View File

@ -21,7 +21,7 @@
VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
ell eng enm epo est eus fas fil fin fra frk frm gle glg ell eng enm epo est eus fas fil fin fra frk frm gle glg
grc guj hat heb hin hrv hun hye iku ind isl ita ita_old grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
jav jpn kan kat kat_old kaz khm kir kor kur lao lat jav jpn kan kat kat_old kaz khm kir kor kur lao lat
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
pan pol por pus ron rus san sin slk slv snd spa spa_old pan pol por pus ron rus san sin slk slv snd spa spa_old
@ -961,6 +961,7 @@ set_lang_specific_parameters() {
glg ) ;; glg ) ;;
hat ) ;; hat ) ;;
hrv ) ;; hrv ) ;;
iast ) ;;
ind ) ;; ind ) ;;
isl ) ;; isl ) ;;
ita ) ;; ita ) ;;
@ -1171,7 +1172,7 @@ set_lang_specific_parameters() {
LANG_IS_RTL="1" LANG_IS_RTL="1"
NORM_MODE="2" ;; NORM_MODE="2" ;;
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \ asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
dzo | sin | san | bod | ori | khm | mya | tha | lao ) dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
LANG_IS_RTL="0" LANG_IS_RTL="0"
NORM_MODE="2" ;; NORM_MODE="2" ;;
* ) * )

View File

@ -0,0 +1,263 @@
/**********************************************************************
* File: validate_javanese.cpp
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#include "validate_javanese.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Consumes one Javanese grapheme starting at codes_[codes_used_], choosing the
// handler from the class of that first code. Returns false only when the first
// code cannot start a grapheme or the consonant handlers reject the sequence.
// Syllable structure is taken from the unicode standard:
// http://www.unicode.org/charts/PDF/UA980.pdf
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
// Order of components in an orthographic syllable, expressed in BNF:
// {C F} C {{R}Y} {V{A}} {Z}
// The same pattern written with the codes of the CharClass enum:
// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
// Independent vowels are classed as consonants here, since they are treated
// the same way.
// For comparison, the Indic patterns are:
// + vowel Grapheme: V[D](v)*
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
bool ValidateJavanese::ConsumeGraphemeIfValid() {
  const auto lead_class = codes_[codes_used_].first;

  if (lead_class == CharClass::kConsonant) {
    // Head (consonants joined by viramas) followed by the optional tail.
    return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
  }

  if (lead_class == CharClass::kVowel || lead_class == CharClass::kVedicMark) {
    return ConsumeVowelIfValid();
  }

  if (lead_class == CharClass::kZeroWidthJoiner ||
      lead_class == CharClass::kZeroWidthNonJoiner) {
    // A joiner outside an aksara is skipped without being copied anywhere.
    if (report_errors_)
      tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
    ++codes_used_;
    return true;
  }

  if (lead_class == CharClass::kOther) {
    // Anything from outside the script is passed through as its own unit.
    UseMultiCode(1);
    return true;
  }

  // Every remaining class is not allowed at the start of a grapheme.
  if (report_errors_) {
    tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
            lead_class, codes_[codes_used_].second);
  }
  return false;
}
// Maps a unicode to the CharClass used by the grapheme consumers. Vedic
// accents and the two zero-width joiners are recognised first; everything else
// is classified by its offset from the base of the script's code block
// (script_, which is 0xa980 for ViramaScript::kJavanese).
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;

  // Position of ch inside the code block (aka code page) of this script.
  const int offset = ch - static_cast<char32>(script_);

  // Codes that belong to a different code block are simply "other".
  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;

  // Walk the block from the top down; each test relies on the ones before it.
  if (offset > 0x40) return CharClass::kOther;
  if (offset == 0x40) return CharClass::kVirama;      // A9C0 PANGKON
  if (offset >= 0x3e) return CharClass::kNukta;       // A9BE-A9BF PENGKAL-CAKRA medial consonants
  if (offset >= 0x35) return CharClass::kMatra;       // 0x35-0x3d
  if (offset == 0x34) return CharClass::kMatraPiece;  // A9B4 TARUNG two part vowels
  if (offset == 0x33) return CharClass::kNukta;       // A9B3 CECAK TELU
  if (offset >= 0x4) return CharClass::kConsonant;    // includes independent vowels
  return CharClass::kVowelModifier;                   // 0x0-0x3
}
// Helper consumes/copies a virama and any associated post-virama joiners.
// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
// no joiner at all) must be followed by a consonant.
// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
// consonant, space, or character from a different script. We clean up the
// representation to make it consistent by adding a ZWNJ if missing from a
// non-linking virama. Returns false with an invalid sequence.
//
// Both callers in this file only call this when codes_[codes_used_] has class
// kVirama.
// joiner: the pre-virama joiner already picked up by the caller, or a pair
//   whose class is CharClass::kOther when there was none.
// post_matra: ConsumeConsonantTailIfValid passes true, and
//   ConsumeConsonantHeadIfValid passes false. When true, a ZWJ after the
//   virama is rejected and the virama is handled as explicit even if a
//   consonant follows.
bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
int num_codes = codes_.size();
if (joiner.first == CharClass::kOther) {
// No pre-virama joiner. Take the virama code itself.
// NOTE(review): presumably CodeOnlyToOutput appends the current code to
// output_ and advances codes_used_ - confirm against Validator.
CodeOnlyToOutput();
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthJoiner) {
// Post-matra viramas must be explicit, so no joiners allowed here.
if (post_matra) {
if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
return false;
}
// Defer the decision when the code after the ZWJ is a ZWNJ, Pengkal or
// Cakra, unless the code two positions back is itself a Cakra.
// NOTE(review): codes_used_ - 2 is presumably the code just before the
// virama, which relies on codes_used_ >= 2 here - confirm.
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_ - 2].second != kCakra &&
(codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
codes_[codes_used_ + 1].second == kPengkal ||
codes_[codes_used_ + 1].second == kCakra)) {
// This combination will be picked up later.
// The assert requires CodeOnlyToOutput to return false for the ZWJ.
ASSERT_HOST(!CodeOnlyToOutput());
} else {
// Half-form with optional Nukta.
// len counts the not-yet-emitted codes in output_ plus one more code.
int len = output_.size() + 1 - output_used_;
if (UseMultiCode(len)) return true;
}
// A ZWNJ at this point is only accepted when the first not-yet-emitted
// output code is a Cakra; otherwise the sequence is reported as invalid.
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (output_used_ == output_.size() ||
output_[output_used_] != kCakra) {
if (report_errors_) {
tprintf("Virama ZWJ ZWNJ : base=0x%x!\n",
static_cast<int>(script_));
}
return false;
}
}
} else if (codes_used_ == num_codes ||
codes_[codes_used_].first != CharClass::kConsonant ||
post_matra) {
// No ZWJ follows. The virama is explicit because the input is exhausted,
// the next code is not a consonant, or we are after a matra.
if (codes_used_ == num_codes ||
codes_[codes_used_].second != kZeroWidthNonJoiner) {
// It is valid to have an unterminated virama at the end of a word, but
// for consistency, we will always add ZWNJ if not present.
output_.push_back(kZeroWidthNonJoiner);
} else {
// The ZWNJ is already in the input, so take it from there.
CodeOnlyToOutput();
}
// Explicit virama [H z]
MultiCodePart(2);
}
// Otherwise the virama is directly followed by a consonant and nothing more
// is done for it here.
} else {
// Pre-virama joiner [{Z|z} H] requests specific conjunct.
// A true result from UseMultiCode(2) is treated as "no second consonant".
if (UseMultiCode(2)) {
if (report_errors_)
tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
return false;
}
// A joiner on both sides of the virama is rejected.
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (report_errors_) {
tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
codes_[codes_used_].second);
}
return false;
}
}
// It is good so far as it goes.
return true;
}
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
// ConsumeGraphemeIfValid calls this when codes_[codes_used_] has class
// kConsonant. Each pass of the loop handles one consonant together with its
// optional nukta, optional pre-virama joiner and optional virama. Returns
// false only when ConsumeViramaIfValid rejects the sequence.
bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
const int num_codes = codes_.size();
// Consonant aksara
do {
// Take the consonant at codes_[codes_used_].
CodeOnlyToOutput();
// Special case of medial consonants [H Z Pengkal/Cakra].
// index is where the virama must sit for the last three output codes to be
// virama, ZWJ, Pengkal/Cakra. The output_used_ <= index test also rejects a
// negative index and codes that were already emitted.
int index = output_.size() - 3;
if (output_used_ <= index &&
(output_.back() == kPengkal || output_.back() == kCakra) &&
IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
MultiCodePart(3);
}
bool have_nukta = false;
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kNukta) {
have_nukta = true;
CodeOnlyToOutput();
}
// Test for subscript conjunct.
// have_nukta contributes 0 or 1, so index points at the code just before the
// consonant (and its nukta, if any).
index = output_.size() - 2 - have_nukta;
if (output_used_ <= index && IsSubscriptScript() &&
IsVirama(output_[index])) {
// Output previous virama, consonant + optional nukta.
MultiCodePart(2 + have_nukta);
}
// A class of kOther in joiner means "no pre-virama joiner seen".
IndicPair joiner(CharClass::kOther, 0);
// NOTE(review): the ZWNJ alternative is only taken when script_ is
// kMalayalam. If this validator is only ever built with kJavanese it can
// never fire - presumably inherited from the Indic validator; confirm.
if (codes_used_ < num_codes &&
(codes_[codes_used_].second == kZeroWidthJoiner ||
(codes_[codes_used_].second == kZeroWidthNonJoiner &&
script_ == ViramaScript::kMalayalam))) {
joiner = codes_[codes_used_];
// A joiner that is the last code of the input is skipped, not copied.
if (++codes_used_ == num_codes) {
if (report_errors_) {
tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
joiner.second);
}
return true;
}
// The joiner is kept only when a virama follows it; otherwise it is
// dropped and joiner is reset to "none".
if (codes_[codes_used_].first == CharClass::kVirama) {
output_.push_back(joiner.second);
} else {
if (report_errors_) {
tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
output_.back(), joiner.second, codes_[codes_used_].second);
}
joiner = std::make_pair(CharClass::kOther, 0);
}
}
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kVirama) {
if (!ConsumeViramaIfValid(joiner, false)) return false;
} else {
break; // No virama, so the run of consonants is over.
}
// Keep going only while the virama is followed by another consonant.
} while (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kConsonant);
// Hand over whatever is still pending in output_.
if (output_used_ < output_.size()) MultiCodePart(1);
return true;
}
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ValidateJavanese::ConsumeConsonantTailIfValid() {
if (codes_used_ == codes_.size()) return true;
// No virama: Finish the grapheme.
// Are multiple matras allowed?
if (codes_[codes_used_].first == CharClass::kMatra) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
}
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
while (codes_[codes_used_].first == CharClass::kVedicMark) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVirama) {
if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
return false;
}
}
// What we have consumed so far is a valid consonant cluster.
if (output_used_ < output_.size()) MultiCodePart(1);
return true;
}
// Helper that consumes/copies the leading vowel (or vedic mark), then any run
// of vowel modifiers, then any run of vedic marks. Always returns true.
bool ValidateJavanese::ConsumeVowelIfValid() {
  // Once UseMultiCode returns true nothing further is examined or consumed.
  bool finished = UseMultiCode(1);

  while (!finished &&
         codes_[codes_used_].first == CharClass::kVowelModifier) {
    finished = UseMultiCode(1);
  }

  while (!finished && codes_[codes_used_].first == CharClass::kVedicMark) {
    finished = UseMultiCode(1);
  }

  // Whatever was consumed so far is a valid vowel cluster.
  return true;
}
} // namespace tesseract

View File

@ -0,0 +1,63 @@
/**********************************************************************
* File: validate_javanese.h
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
#include "validator.h"
namespace tesseract {
// Validator specialisation that validates and segments text written in the
// Javanese script (aksara Jawa).
class ValidateJavanese : public Validator {
 public:
  // Forwards the script and error-reporting flag to the Validator base.
  ValidateJavanese(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateJavanese() = default;

 protected:
  // Returns whether codes matches the pattern for a Javanese Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;

  // Returns the CharClass corresponding to the given Unicode ch.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Special unicodes used only for Javanese processing: the two medial
  // consonant signs.
  static const char32 kPengkal = 0xa9be;  // Javanese Ya
  static const char32 kCakra = 0xa9bf;    // Javanese Ra

  // Helper consumes/copies a virama and any associated post-virama joiners.
  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);

  // Helper consumes/copies a series of consonants separated by viramas while
  // valid, but not any vowel or other modifiers.
  bool ConsumeConsonantHeadIfValid();

  // Helper consumes/copies a tail part of a consonant, comprising optional
  // matra/piece, vowel modifier, vedic mark, terminating virama.
  bool ConsumeConsonantTailIfValid();

  // Helper consumes/copies a vowel and optional modifiers.
  bool ConsumeVowelIfValid();
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_

View File

@ -10,6 +10,7 @@
#include "unicode/uscript.h" // From libicu #include "unicode/uscript.h" // From libicu
#include "validate_grapheme.h" #include "validate_grapheme.h"
#include "validate_indic.h" #include "validate_indic.h"
#include "validate_javanese.h"
#include "validate_khmer.h" #include "validate_khmer.h"
#include "validate_myanmar.h" #include "validate_myanmar.h"
@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
case ViramaScript::kNonVirama: case ViramaScript::kNonVirama:
return std::unique_ptr<Validator>( return std::unique_ptr<Validator>(
new ValidateGrapheme(script, report_errors)); new ValidateGrapheme(script, report_errors));
case ViramaScript::kJavanese:
return std::unique_ptr<Validator>(
new ValidateJavanese(script, report_errors));
case ViramaScript::kMyanmar: case ViramaScript::kMyanmar:
return std::unique_ptr<Validator>( return std::unique_ptr<Validator>(
new ValidateMyanmar(script, report_errors)); new ValidateMyanmar(script, report_errors));
@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript(
const std::vector<char32>& utf32) { const std::vector<char32>& utf32) {
std::unordered_map<int, int> histogram; std::unordered_map<int, int> histogram;
for (char32 ch : utf32) { for (char32 ch : utf32) {
// Determine the codepage base. For the Indic scripts, and Khmer, it is // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
// sufficient to divide by kIndicCodePageSize but Myanmar is all over the // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
// unicode code space, so use its script id. // unicode code space, so use its script id.
int base = ch / kIndicCodePageSize; int base = ch / kIndicCodePageSize;
IcuErrorCode err; IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err); UScriptCode script_code = uscript_getScript(ch, err);
if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode && if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
script_code != USCRIPT_COMMON) || script_code != USCRIPT_COMMON) ||
script_code == USCRIPT_MYANMAR) { script_code == USCRIPT_MYANMAR) {
if (script_code == USCRIPT_MYANMAR) if (script_code == USCRIPT_MYANMAR)
@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript(
char32 codebase = static_cast<char32>(base * kIndicCodePageSize); char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
// Check for validity. // Check for validity.
if (codebase == static_cast<char32>(ViramaScript::kMyanmar) || if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
codebase == static_cast<char32>(ViramaScript::kJavanese) ||
codebase == static_cast<char32>(ViramaScript::kKhmer) || codebase == static_cast<char32>(ViramaScript::kKhmer) ||
(static_cast<char32>(ViramaScript::kDevanagari) <= codebase && (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
codebase <= static_cast<char32>(ViramaScript::kSinhala))) { codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript(
bool Validator::IsVirama(char32 unicode) { bool Validator::IsVirama(char32 unicode) {
return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode && return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
(unicode & 0x7f) == 0x4d) || (unicode & 0x7f) == 0x4d) ||
unicode == kSinhalaVirama || unicode == kMyanmarVirama || unicode == kSinhalaVirama ||
unicode == kJavaneseVirama ||
unicode == kMyanmarVirama ||
unicode == kKhmerVirama; unicode == kKhmerVirama;
} }
@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) {
bool Validator::IsSubscriptScript() const { bool Validator::IsSubscriptScript() const {
return script_ == ViramaScript::kTelugu || return script_ == ViramaScript::kTelugu ||
script_ == ViramaScript::kKannada || script_ == ViramaScript::kKannada ||
script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer; script_ == ViramaScript::kJavanese ||
script_ == ViramaScript::kMyanmar ||
script_ == ViramaScript::kKhmer;
} }
void Validator::ComputeClassCodes(const std::vector<char32>& text) { void Validator::ComputeClassCodes(const std::vector<char32>& text) {

View File

@ -64,6 +64,7 @@ enum class ViramaScript : char32 {
kSinhala = 0xd80, kSinhala = 0xd80,
kMyanmar = 0x1000, kMyanmar = 0x1000,
kKhmer = 0x1780, kKhmer = 0x1780,
kJavanese = 0xa980,
}; };
// Base class offers a validation API and protected methods to allow subclasses // Base class offers a validation API and protected methods to allow subclasses
@ -221,6 +222,9 @@ class Validator {
static const char32 kSinhalaVirama = 0xdca; static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039; static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2; static const char32 kKhmerVirama = 0x17d2;
// Javanese Script - aksarajawa
static const char32 kJavaneseVirama = 0xa9c0;
static const char32 kMaxJavaneseUnicode = 0xa9df;
// Script we are operating on. // Script we are operating on.
ViramaScript script_; ViramaScript script_;