diff --git a/src/training/CMakeLists.txt b/src/training/CMakeLists.txt index 8de67772..cb6cf1a0 100644 --- a/src/training/CMakeLists.txt +++ b/src/training/CMakeLists.txt @@ -186,9 +186,9 @@ set(unicharset_training_src unicharset_training_utils.h validate_grapheme.h validate_indic.h validate_khmer.h - validate_myanmar.h validator.h + validate_javanese.h validate_myanmar.h validator.h validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp - validate_myanmar.cpp validator.cpp + validate_javanese.cpp validate_myanmar.cpp validator.cpp ) add_library (unicharset_training ${unicharset_training_src}) diff --git a/src/training/Makefile.am b/src/training/Makefile.am index fd38ffbe..c7b01d73 100644 --- a/src/training/Makefile.am +++ b/src/training/Makefile.am @@ -45,6 +45,7 @@ noinst_HEADERS = \ util.h \ validate_grapheme.h \ validate_indic.h \ + validate_javanese.h \ validate_khmer.h \ validate_myanmar.h \ validator.h @@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \ unicharset_training_utils.cpp \ validate_grapheme.cpp \ validate_indic.cpp \ + validate_javanese.cpp \ validate_khmer.cpp \ validate_myanmar.cpp \ validator.cpp diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh index 0f8fa6ed..b6d834bb 100755 --- a/src/training/language-specific.sh +++ b/src/training/language-specific.sh @@ -21,7 +21,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo ell eng enm epo est eus fas fil fin fra frk frm gle glg - grc guj hat heb hin hrv hun hye iku ind isl ita ita_old + grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old jav jpn kan kat kat_old kaz khm kir kor kur lao lat lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori pan pol por pus ron rus san sin slk slv snd spa spa_old @@ -961,6 +961,7 @@ set_lang_specific_parameters() { glg ) ;; hat ) ;; hrv ) ;; + iast ) ;; ind ) ;; isl ) ;; ita ) ;; @@ -1171,7 +1172,7 @@ set_lang_specific_parameters() { LANG_IS_RTL="1" NORM_MODE="2" ;; asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \ - dzo | sin | san | bod | ori | khm | mya | tha | lao ) + dzo | sin | san | bod | ori | khm | mya | tha | lao | jav ) LANG_IS_RTL="0" NORM_MODE="2" ;; * ) diff --git a/src/training/validate_javanese.cpp b/src/training/validate_javanese.cpp new file mode 100644 index 00000000..8ee6ef96 --- /dev/null +++ b/src/training/validate_javanese.cpp @@ -0,0 +1,263 @@ +/********************************************************************** + * File: validate_javanese.cpp + * Description: Text validator for Javanese Script - aksara jawa. + * Author: Shree Devi Kumar + * Created: August 03, 2018 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#include "validate_javanese.h" +#include "errcode.h" +#include "tprintf.h" + +namespace tesseract { + +// Returns whether codes matches the pattern for a Javanese Grapheme. +// Taken from unicode standard: +// http://www.unicode.org/charts/PDF/UA980.pdf +// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf +// The order of components in an orthographic syllable as expressed in BNF is: +// {C F} C {{R}Y} {V{A}} {Z} +// Translated to the codes used by the CharClass enum: +// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D] +// Also the Consonant class here includes independent vowels, as they are +// treated the same anyway. +// Indic - for reference +// + vowel Grapheme: V[D](v)* +// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* + +bool ValidateJavanese::ConsumeGraphemeIfValid() { + switch (codes_[codes_used_].first) { + case CharClass::kConsonant: + return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid(); + case CharClass::kVowel: + case CharClass::kVedicMark: + return ConsumeVowelIfValid(); + case CharClass::kZeroWidthJoiner: + case CharClass::kZeroWidthNonJoiner: + // Apart from within an aksara, joiners are silently dropped. + if (report_errors_) + tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second); + ++codes_used_; + return true; + case CharClass::kOther: + UseMultiCode(1); + return true; + default: + if (report_errors_) { + tprintf("Invalid start of grapheme sequence:%c=0x%x\n", + codes_[codes_used_].first, codes_[codes_used_].second); + } + return false; + } +} + +Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { + if (IsVedicAccent(ch)) return CharClass::kVedicMark; + if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; + if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; + // Offset from the start of the relevant unicode code block aka code page. + int off = ch - static_cast(script_); + // Anything in another code block is other. + if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; + if (off < 0x4) return CharClass::kVowelModifier; + if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels + if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU + if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels + if (off <= 0x3d) return CharClass::kMatra; + if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants + if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON + return CharClass::kOther; +} + +// Helper consumes/copies a virama and any associated post-virama joiners. +// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or +// no joiner at all) must be followed by a consonant. +// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non +// consonant, space, or character from a different script. We clean up the +// representation to make it consistent by adding a ZWNJ if missing from a +// non-linking virama. Returns false with an invalid sequence. +bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { + int num_codes = codes_.size(); + if (joiner.first == CharClass::kOther) { + CodeOnlyToOutput(); + if (codes_used_ < num_codes && + codes_[codes_used_].second == kZeroWidthJoiner) { + // Post-matra viramas must be explicit, so no joiners allowed here. + if (post_matra) { + if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n"); + return false; + } + if (codes_used_ + 1 < num_codes && + codes_[codes_used_ - 2].second != kCakra && + (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner || + codes_[codes_used_ + 1].second == kPengkal || + codes_[codes_used_ + 1].second == kCakra)) { + // This combination will be picked up later. + ASSERT_HOST(!CodeOnlyToOutput()); + } else { + // Half-form with optional Nukta. + int len = output_.size() + 1 - output_used_; + if (UseMultiCode(len)) return true; + } + if (codes_used_ < num_codes && + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (output_used_ == output_.size() || + output_[output_used_] != kCakra) { + if (report_errors_) { + tprintf("Virama ZWJ ZWNJ : base=0x%x!\n", + static_cast(script_)); + } + return false; + } + } + } else if (codes_used_ == num_codes || + codes_[codes_used_].first != CharClass::kConsonant || + post_matra) { + if (codes_used_ == num_codes || + codes_[codes_used_].second != kZeroWidthNonJoiner) { + // It is valid to have an unterminated virama at the end of a word, but + // for consistency, we will always add ZWNJ if not present. + output_.push_back(kZeroWidthNonJoiner); + } else { + CodeOnlyToOutput(); + } + // Explicit virama [H z] + MultiCodePart(2); + } + } else { + // Pre-virama joiner [{Z|z} H] requests specific conjunct. + if (UseMultiCode(2)) { + if (report_errors_) + tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n"); + return false; + } + if (codes_[codes_used_].second == kZeroWidthJoiner || + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (report_errors_) { + tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(), + codes_[codes_used_].second); + } + return false; + } + } + // It is good so far as it goes. + return true; +} + +// Helper consumes/copies a series of consonants separated by viramas while +// valid, but not any vowel or other modifiers. +bool ValidateJavanese::ConsumeConsonantHeadIfValid() { + const int num_codes = codes_.size(); + // Consonant aksara + do { + CodeOnlyToOutput(); + // Special case of medial consonants [H Z Pengkal/Cakra]. + int index = output_.size() - 3; + if (output_used_ <= index && + (output_.back() == kPengkal || output_.back() == kCakra) && + IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) { + MultiCodePart(3); + } + bool have_nukta = false; + if (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kNukta) { + have_nukta = true; + CodeOnlyToOutput(); + } + // Test for subscript conjunct. + index = output_.size() - 2 - have_nukta; + if (output_used_ <= index && IsSubscriptScript() && + IsVirama(output_[index])) { + // Output previous virama, consonant + optional nukta. + MultiCodePart(2 + have_nukta); + } + IndicPair joiner(CharClass::kOther, 0); + if (codes_used_ < num_codes && + (codes_[codes_used_].second == kZeroWidthJoiner || + (codes_[codes_used_].second == kZeroWidthNonJoiner && + script_ == ViramaScript::kMalayalam))) { + joiner = codes_[codes_used_]; + if (++codes_used_ == num_codes) { + if (report_errors_) { + tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), + joiner.second); + } + return true; + } + if (codes_[codes_used_].first == CharClass::kVirama) { + output_.push_back(joiner.second); + } else { + if (report_errors_) { + tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", + output_.back(), joiner.second, codes_[codes_used_].second); + } + joiner = std::make_pair(CharClass::kOther, 0); + } + } + if (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(joiner, false)) return false; + } else { + break; // No virama, so the run of consonants is over. + } + } while (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kConsonant); + if (output_used_ < output_.size()) MultiCodePart(1); + return true; +} + +// Helper consumes/copies a tail part of a consonant, comprising optional +// matra/piece, vowel modifier, vedic mark, terminating virama. +bool ValidateJavanese::ConsumeConsonantTailIfValid() { + if (codes_used_ == codes_.size()) return true; + // No virama: Finish the grapheme. + // Are multiple matras allowed? + if (codes_[codes_used_].first == CharClass::kMatra) { + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].first == CharClass::kMatraPiece) { + if (UseMultiCode(1)) return true; + } + } + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) return true; + } + if (codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) { + return false; + } + } + // What we have consumed so far is a valid consonant cluster. + if (output_used_ < output_.size()) MultiCodePart(1); + + return true; +} + +// Helper consumes/copies a vowel and optional modifiers. +bool ValidateJavanese::ConsumeVowelIfValid() { + if (UseMultiCode(1)) return true; + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) return true; + } + // What we have consumed so far is a valid vowel cluster. + return true; +} + +} // namespace tesseract + diff --git a/src/training/validate_javanese.h b/src/training/validate_javanese.h new file mode 100644 index 00000000..2d22c64d --- /dev/null +++ b/src/training/validate_javanese.h @@ -0,0 +1,63 @@ +/********************************************************************** + * File: validate_javanese.h + * Description: Text validator for Javanese Script - aksara jawa. + * Author: Shree Devi Kumar + * Created: August 03, 2018 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ +#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ + +#include "validator.h" + + +namespace tesseract { + +// Subclass of Validator that validates and segments Javanese scripts + +class ValidateJavanese : public Validator { + public: + ValidateJavanese(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateJavanese() {} + + protected: + // Returns whether codes matches the pattern for an Javanese Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + Validator::CharClass UnicodeToCharClass(char32 ch) const override; + + private: + // Helper consumes/copies a virama and any associated post-virama joiners. + bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra); + // Helper consumes/copies a series of consonants separated by viramas while + // valid, but not any vowel or other modifiers. + bool ConsumeConsonantHeadIfValid(); + // Helper consumes/copies a tail part of a consonant, comprising optional + // matra/piece, vowel modifier, vedic mark, terminating virama. + bool ConsumeConsonantTailIfValid(); + // Helper consumes/copies a vowel and optional modifiers. + bool ConsumeVowelIfValid(); + + // Some special unicodes used only for Javanese processing. + static const char32 kPengkal = 0xa9be; // Javanese Ya + static const char32 kCakra = 0xa9bf; // Javanese Ra +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ diff --git a/src/training/validator.cpp b/src/training/validator.cpp index d764c3da..ea5978c7 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -10,6 +10,7 @@ #include "unicode/uscript.h" // From libicu #include "validate_grapheme.h" #include "validate_indic.h" +#include "validate_javanese.h" #include "validate_khmer.h" #include "validate_myanmar.h" @@ -68,6 +69,9 @@ std::unique_ptr Validator::ScriptValidator(ViramaScript script, case ViramaScript::kNonVirama: return std::unique_ptr( new ValidateGrapheme(script, report_errors)); + case ViramaScript::kJavanese: + return std::unique_ptr( + new ValidateJavanese(script, report_errors)); case ViramaScript::kMyanmar: return std::unique_ptr( new ValidateMyanmar(script, report_errors)); @@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript( const std::vector& utf32) { std::unordered_map histogram; for (char32 ch : utf32) { - // Determine the codepage base. For the Indic scripts, and Khmer, it is + // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is // sufficient to divide by kIndicCodePageSize but Myanmar is all over the // unicode code space, so use its script id. int base = ch / kIndicCodePageSize; IcuErrorCode err; UScriptCode script_code = uscript_getScript(ch, err); - if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode && + if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) || script_code == USCRIPT_MYANMAR) { if (script_code == USCRIPT_MYANMAR) @@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript( char32 codebase = static_cast(base * kIndicCodePageSize); // Check for validity. if (codebase == static_cast(ViramaScript::kMyanmar) || + codebase == static_cast(ViramaScript::kJavanese) || codebase == static_cast(ViramaScript::kKhmer) || (static_cast(ViramaScript::kDevanagari) <= codebase && codebase <= static_cast(ViramaScript::kSinhala))) { @@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript( bool Validator::IsVirama(char32 unicode) { return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode && (unicode & 0x7f) == 0x4d) || - unicode == kSinhalaVirama || unicode == kMyanmarVirama || + unicode == kSinhalaVirama || + unicode == kJavaneseVirama || + unicode == kMyanmarVirama || unicode == kKhmerVirama; } @@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) { bool Validator::IsSubscriptScript() const { return script_ == ViramaScript::kTelugu || script_ == ViramaScript::kKannada || - script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer; + script_ == ViramaScript::kJavanese || + script_ == ViramaScript::kMyanmar || + script_ == ViramaScript::kKhmer; } void Validator::ComputeClassCodes(const std::vector& text) { diff --git a/src/training/validator.h b/src/training/validator.h index 890cfac5..741e76e2 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -64,6 +64,7 @@ enum class ViramaScript : char32 { kSinhala = 0xd80, kMyanmar = 0x1000, kKhmer = 0x1780, + kJavanese = 0xa980, }; // Base class offers a validation API and protected methods to allow subclasses @@ -221,6 +222,9 @@ class Validator { static const char32 kSinhalaVirama = 0xdca; static const char32 kMyanmarVirama = 0x1039; static const char32 kKhmerVirama = 0x17d2; + // Javanese Script - aksarajawa + static const char32 kJavaneseVirama = 0xa9c0; + static const char32 kMaxJavaneseUnicode = 0xa9df; // Script we are operating on. ViramaScript script_;