diff --git a/src/training/CMakeLists.txt b/src/training/CMakeLists.txt index 8de67772..cb6cf1a0 100644 --- a/src/training/CMakeLists.txt +++ b/src/training/CMakeLists.txt @@ -186,9 +186,9 @@ set(unicharset_training_src unicharset_training_utils.h validate_grapheme.h validate_indic.h validate_khmer.h - validate_myanmar.h validator.h + validate_javanese.h validate_myanmar.h validator.h validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp - validate_myanmar.cpp validator.cpp + validate_javanese.cpp validate_myanmar.cpp validator.cpp ) add_library (unicharset_training ${unicharset_training_src}) diff --git a/src/training/Makefile.am b/src/training/Makefile.am index fd38ffbe..3a8e6c1a 100644 --- a/src/training/Makefile.am +++ b/src/training/Makefile.am @@ -45,6 +45,7 @@ noinst_HEADERS = \ util.h \ validate_grapheme.h \ validate_indic.h \ + validate_javanese.h \ validate_khmer.h \ validate_myanmar.h \ validator.h @@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \ unicharset_training_utils.cpp \ validate_grapheme.cpp \ validate_indic.cpp \ + validate_javanese.cpp \ validate_khmer.cpp \ validate_myanmar.cpp \ validator.cpp diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh index 0f8fa6ed..b6d834bb 100755 --- a/src/training/language-specific.sh +++ b/src/training/language-specific.sh @@ -21,7 +21,7 @@ VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo ell eng enm epo est eus fas fil fin fra frk frm gle glg - grc guj hat heb hin hrv hun hye iku ind isl ita ita_old + grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old jav jpn kan kat kat_old kaz khm kir kor kur lao lat lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori pan pol por pus ron rus san sin slk slv snd spa spa_old @@ -961,6 +961,7 @@ set_lang_specific_parameters() { glg ) ;; hat ) ;; hrv ) ;; + iast ) ;; ind ) ;; isl ) ;; ita ) ;; @@ -1171,7 +1172,7 @@ 
set_lang_specific_parameters() { LANG_IS_RTL="1" NORM_MODE="2" ;; asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \ - dzo | sin | san | bod | ori | khm | mya | tha | lao ) + dzo | sin | san | bod | ori | khm | mya | tha | lao | jav ) LANG_IS_RTL="0" NORM_MODE="2" ;; * ) diff --git a/src/training/validate_javanese.cpp b/src/training/validate_javanese.cpp new file mode 100644 index 00000000..38119917 --- /dev/null +++ b/src/training/validate_javanese.cpp @@ -0,0 +1,116 @@ +/********************************************************************** + * File: validate_javanese.cpp + * Description: Text validator for Javanese Script - aksara jawa. + * Author: Shree Devi Kumar + * Created: August 03, 2018 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + + #include "validate_javanese.h" +#include "errcode.h" +#include "tprintf.h" + +namespace tesseract { + +// Returns whether codes matches the pattern for a Javanese Grapheme. +// Taken from unicode standard: +// http://www.unicode.org/charts/PDF/UA980.pdf +// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf +// Also the Consonant class here includes independent vowels, as they are +// treated the same anyway. 
+ +bool ValidateJavanese::ConsumeGraphemeIfValid() { + int num_codes = codes_.size(); + if (codes_used_ == num_codes) return false; + if (codes_[codes_used_].first == CharClass::kOther) { + UseMultiCode(1); + return true; + } + if (codes_[codes_used_].first != CharClass::kConsonant) { + if (report_errors_) { + tprintf("Invalid start of Javanese syllable:0x%x\n", + codes_[codes_used_].second); + } + return false; + } + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].first == CharClass::kNukta) { + if (UseMultiCode(1)) return true; + } + while (codes_used_ + 1 < num_codes && + codes_[codes_used_].first == CharClass::kVirama && + codes_[codes_used_ + 1].first == CharClass::kConsonant) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + if (codes_[codes_used_].first == CharClass::kRobat) { + if (UseMultiCode(1)) return true; + } + } + int num_matra_parts = 0; + if (codes_[codes_used_].second == kZeroWidthJoiner || + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (CodeOnlyToOutput()) { + if (report_errors_) { + tprintf("Unterminated joiner: 0x%x\n", output_.back()); + } + return false; + } + ++num_matra_parts; + } + // Not quite as shown by the BNF, the matra piece is allowed as a matra on its + // own or as an addition to other matras. 
+ if (codes_[codes_used_].first == CharClass::kMatra) { + ++num_matra_parts; + if (UseMultiCode(num_matra_parts)) return true; + } else if (num_matra_parts) { + if (report_errors_) { + tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", + output_.back(), codes_[codes_used_].second); + } + return false; + } + if (codes_[codes_used_].first == CharClass::kMatraPiece && + codes_[codes_used_ - 1].first != CharClass::kMatraPiece) { + if (UseMultiCode(1)) return true; + } + if (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + } + if (codes_used_ + 1 < num_codes && + codes_[codes_used_].first == CharClass::kVirama && + codes_[codes_used_ + 1].first == CharClass::kConsonant) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + } + return true; +} + +Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const { + if (IsVedicAccent(ch)) return CharClass::kVedicMark; + if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; + if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; + // Offset from the start of the relevant unicode code block aka code page. + int off = ch - static_cast<char32>(script_); + // Anything in another code block is other. 
+ if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; + if (off < 0x4) return CharClass::kVowelModifier; + if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels + if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU + if (off == 0x34) return CharClass::kVowelModifier; // A9B4 TARUNG + if (off <= 0x3d) return CharClass::kMatra; + if (off <= 0x3f) return CharClass::kVowelModifier; // A9BE-A9BF PENGKAL-CAKRA + if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON + return CharClass::kOther; +} + +} // namespace tesseract diff --git a/src/training/validate_javanese.h b/src/training/validate_javanese.h new file mode 100644 index 00000000..adc8256b --- /dev/null +++ b/src/training/validate_javanese.h @@ -0,0 +1,45 @@ +/********************************************************************** + * File: validate_javanese.h + * Description: Text validator for Javanese Script - aksara jawa. + * Author: Shree Devi Kumar + * Created: August 03, 2018 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ +#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments Javanese. 
+class ValidateJavanese : public Validator { + public: + ValidateJavanese(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateJavanese() {} + + protected: + // Returns whether codes matches the pattern for a Javanese Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + CharClass UnicodeToCharClass(char32 ch) const override; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_ diff --git a/src/training/validator.cpp b/src/training/validator.cpp index d764c3da..ea5978c7 100644 --- a/src/training/validator.cpp +++ b/src/training/validator.cpp @@ -10,6 +10,7 @@ #include "unicode/uscript.h" // From libicu #include "validate_grapheme.h" #include "validate_indic.h" +#include "validate_javanese.h" #include "validate_khmer.h" #include "validate_myanmar.h" @@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script, case ViramaScript::kNonVirama: return std::unique_ptr<Validator>( new ValidateGrapheme(script, report_errors)); + case ViramaScript::kJavanese: + return std::unique_ptr<Validator>( + new ValidateJavanese(script, report_errors)); case ViramaScript::kMyanmar: return std::unique_ptr<Validator>( new ValidateMyanmar(script, report_errors)); @@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript( const std::vector<char32>& utf32) { std::unordered_map<int, int> histogram; for (char32 ch : utf32) { - // Determine the codepage base. For the Indic scripts, and Khmer, it is + // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is // sufficient to divide by kIndicCodePageSize but Myanmar is all over the // unicode code space, so use its script id. 
 int base = ch / kIndicCodePageSize; IcuErrorCode err; UScriptCode script_code = uscript_getScript(ch, err); - if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode && + if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode && script_code != USCRIPT_COMMON) || script_code == USCRIPT_MYANMAR) { if (script_code == USCRIPT_MYANMAR) @@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript( char32 codebase = static_cast<char32>(base * kIndicCodePageSize); // Check for validity. if (codebase == static_cast<char32>(ViramaScript::kMyanmar) || + codebase == static_cast<char32>(ViramaScript::kJavanese) || codebase == static_cast<char32>(ViramaScript::kKhmer) || (static_cast<char32>(ViramaScript::kDevanagari) <= codebase && codebase <= static_cast<char32>(ViramaScript::kSinhala))) { @@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript( bool Validator::IsVirama(char32 unicode) { return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode && (unicode & 0x7f) == 0x4d) || - unicode == kSinhalaVirama || unicode == kMyanmarVirama || + unicode == kSinhalaVirama || + unicode == kJavaneseVirama || + unicode == kMyanmarVirama || unicode == kKhmerVirama; } @@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) { bool Validator::IsSubscriptScript() const { return script_ == ViramaScript::kTelugu || script_ == ViramaScript::kKannada || - script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer; + script_ == ViramaScript::kJavanese || + script_ == ViramaScript::kMyanmar || + script_ == ViramaScript::kKhmer; } void Validator::ComputeClassCodes(const std::vector<char32>& text) { diff --git a/src/training/validator.h b/src/training/validator.h index 890cfac5..741e76e2 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -64,6 +64,7 @@ enum class ViramaScript : char32 { kSinhala = 0xd80, kMyanmar = 0x1000, kKhmer = 0x1780, + kJavanese = 0xa980, }; // Base class offers a validation API and protected methods to allow subclasses @@ -221,6 +222,9 @@ class Validator 
{ static const char32 kSinhalaVirama = 0xdca; static const char32 kMyanmarVirama = 0x1039; static const char32 kKhmerVirama = 0x17d2; + // Javanese Script - aksarajawa + static const char32 kJavaneseVirama = 0xa9c0; + static const char32 kMaxJavaneseUnicode = 0xa9df; // Script we are operating on. ViramaScript script_;