Initial commit to add Aksara Jawa - Javanese script

This commit is contained in:
Shree Devi Kumar 2018-08-03 13:59:27 +00:00
parent e9b4e21e6f
commit 0eb7be1cd1
7 changed files with 185 additions and 8 deletions

View File

@ -186,9 +186,9 @@ set(unicharset_training_src
unicharset_training_utils.h unicharset_training_utils.h
validate_grapheme.h validate_indic.h validate_khmer.h validate_grapheme.h validate_indic.h validate_khmer.h
validate_myanmar.h validator.h validate_javanese.h validate_myanmar.h validator.h
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
validate_myanmar.cpp validator.cpp validate_javanese.cpp validate_myanmar.cpp validator.cpp
) )
add_library (unicharset_training ${unicharset_training_src}) add_library (unicharset_training ${unicharset_training_src})

View File

@ -45,6 +45,7 @@ noinst_HEADERS = \
util.h \ util.h \
validate_grapheme.h \ validate_grapheme.h \
validate_indic.h \ validate_indic.h \
validate_javanese.h \
validate_khmer.h \ validate_khmer.h \
validate_myanmar.h \ validate_myanmar.h \
validator.h validator.h
@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \
unicharset_training_utils.cpp \ unicharset_training_utils.cpp \
validate_grapheme.cpp \ validate_grapheme.cpp \
validate_indic.cpp \ validate_indic.cpp \
validate_javanese.cpp \
validate_khmer.cpp \ validate_khmer.cpp \
validate_myanmar.cpp \ validate_myanmar.cpp \
validator.cpp validator.cpp

View File

@ -21,7 +21,7 @@
VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
ell eng enm epo est eus fas fil fin fra frk frm gle glg ell eng enm epo est eus fas fil fin fra frk frm gle glg
grc guj hat heb hin hrv hun hye iku ind isl ita ita_old grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
jav jpn kan kat kat_old kaz khm kir kor kur lao lat jav jpn kan kat kat_old kaz khm kir kor kur lao lat
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
pan pol por pus ron rus san sin slk slv snd spa spa_old pan pol por pus ron rus san sin slk slv snd spa spa_old
@ -961,6 +961,7 @@ set_lang_specific_parameters() {
glg ) ;; glg ) ;;
hat ) ;; hat ) ;;
hrv ) ;; hrv ) ;;
iast ) ;;
ind ) ;; ind ) ;;
isl ) ;; isl ) ;;
ita ) ;; ita ) ;;
@ -1171,7 +1172,7 @@ set_lang_specific_parameters() {
LANG_IS_RTL="1" LANG_IS_RTL="1"
NORM_MODE="2" ;; NORM_MODE="2" ;;
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \ asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
dzo | sin | san | bod | ori | khm | mya | tha | lao ) dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
LANG_IS_RTL="0" LANG_IS_RTL="0"
NORM_MODE="2" ;; NORM_MODE="2" ;;
* ) * )

View File

@ -0,0 +1,116 @@
/**********************************************************************
* File: validate_javanese.cpp
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#include "validate_javanese.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Consumes one Javanese grapheme starting at codes_[codes_used_] and returns
// whether the codes matched the pattern for a Javanese grapheme.
// Character classes are taken from the unicode standard:
// http://www.unicode.org/charts/PDF/UA980.pdf
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
//
// Accepted sequence, in order (everything after the base is optional):
//   base consonant, nukta, any number of (virama + consonant) pairs each
//   optionally followed by a robat, a ZWJ/ZWNJ that must be followed by a
//   matra, a matra, a matra piece, a vowel modifier, and finally one more
//   (virama + consonant) pair.
// A single code of class kOther is accepted as a grapheme on its own.
//
// Returns false when there is no input left, when the grapheme does not start
// with a consonant (or kOther), or when a joiner is not followed by a matra.
// Diagnostics are printed with tprintf only if report_errors_ is set.
// Returns true in every other case.
//
// NOTE(review): UseMultiCode() and CodeOnlyToOutput() are defined outside this
// file. From the way they are used here they appear to consume codes, advance
// codes_used_ and return true once the input is exhausted - confirm against
// validator.h before relying on that.
bool ValidateJavanese::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
// Nothing left to consume.
if (codes_used_ == num_codes) return false;
// A code that is not part of the script forms a grapheme by itself.
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
// Every other grapheme has to start with a consonant/independent vowel.
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Javanese syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
// Consume the base consonant. A true result ends the grapheme successfully
// here (presumably because the input is exhausted - see NOTE above).
if (UseMultiCode(1)) return true;
// Optional nukta directly after the base.
if ( codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
// Zero or more (virama + consonant) pairs. The bound check in the loop
// condition guarantees that a code follows the virama.
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
// Consumes the virama as a side effect of the assertion argument; the
// assertion fires if that call reports true.
ASSERT_HOST(!CodeOnlyToOutput());
// Emit the virama + consonant pair as one part.
if (UseMultiCode(2)) return true;
// NOTE(review): UnicodeToCharClass in this file never returns kRobat, so
// this branch looks unreachable if codes_ is classified by it - confirm.
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
// Counts the codes that will be emitted together with the matra:
// an optional joiner plus the matra itself.
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
// A true result here is reported as a joiner with nothing after it.
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
// A joiner was consumed above but no matra follows it.
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
// At most one matra piece, and only if the previous code was not one.
// NOTE(review): UnicodeToCharClass in this file never returns kMatraPiece, so
// this branch looks unreachable if codes_ is classified by it - confirm.
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
// Optional single vowel modifier.
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
// Optional trailing (virama + consonant) pair, handled like the pairs in the
// loop above but at most once.
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}
// Returns the CharClass of the Unicode code point ch for Javanese validation.
// Vedic accents and the zero-width (non-)joiners are recognised first. Any
// other code point is classified by its offset from the start of the relevant
// unicode code block (script_); everything outside that block is kOther.
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
  // Code points that are classified independently of the code block.
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;

  // Position of ch relative to the first code point of the code block.
  const int block_offset = ch - static_cast<char32>(script_);
  const bool inside_block =
      0 <= block_offset && block_offset < kIndicCodePageSize;
  // Anything in another code block is other.
  if (!inside_block) return CharClass::kOther;

  // Walk the block from low to high offsets; offsets above 0x40 stay kOther.
  CharClass char_class = CharClass::kOther;
  if (block_offset <= 0x03) {
    char_class = CharClass::kVowelModifier;
  } else if (block_offset <= 0x32) {
    char_class = CharClass::kConsonant;  // includes independent vowels
  } else if (block_offset == 0x33) {
    char_class = CharClass::kNukta;  // A9B3 CECAK TELU
  } else if (block_offset == 0x34) {
    char_class = CharClass::kVowelModifier;  // A9B4 TARUNG
  } else if (block_offset <= 0x3d) {
    char_class = CharClass::kMatra;
  } else if (block_offset <= 0x3f) {
    char_class = CharClass::kVowelModifier;  // A9BE-A9BF PENGKAL-CAKRA
  } else if (block_offset == 0x40) {
    char_class = CharClass::kVirama;  // A9C0 PANGKON
  }
  return char_class;
}
} // namespace tesseract

View File

@ -0,0 +1,45 @@
/**********************************************************************
* File: validate_javanese.h
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
#include "validator.h"
namespace tesseract {
// Validator specialisation that validates and segments text written in the
// Javanese script (aksara jawa).
class ValidateJavanese : public Validator {
 public:
  // Forwards the script and the error-reporting flag to the Validator base.
  ValidateJavanese(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateJavanese() = default;

 protected:
  // Returns whether codes matches the pattern for a Javanese Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;

  // Returns the CharClass corresponding to the given Unicode ch.
  CharClass UnicodeToCharClass(char32 ch) const override;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_

View File

@ -10,6 +10,7 @@
#include "unicode/uscript.h" // From libicu #include "unicode/uscript.h" // From libicu
#include "validate_grapheme.h" #include "validate_grapheme.h"
#include "validate_indic.h" #include "validate_indic.h"
#include "validate_javanese.h"
#include "validate_khmer.h" #include "validate_khmer.h"
#include "validate_myanmar.h" #include "validate_myanmar.h"
@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
case ViramaScript::kNonVirama: case ViramaScript::kNonVirama:
return std::unique_ptr<Validator>( return std::unique_ptr<Validator>(
new ValidateGrapheme(script, report_errors)); new ValidateGrapheme(script, report_errors));
case ViramaScript::kJavanese:
return std::unique_ptr<Validator>(
new ValidateJavanese(script, report_errors));
case ViramaScript::kMyanmar: case ViramaScript::kMyanmar:
return std::unique_ptr<Validator>( return std::unique_ptr<Validator>(
new ValidateMyanmar(script, report_errors)); new ValidateMyanmar(script, report_errors));
@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript(
const std::vector<char32>& utf32) { const std::vector<char32>& utf32) {
std::unordered_map<int, int> histogram; std::unordered_map<int, int> histogram;
for (char32 ch : utf32) { for (char32 ch : utf32) {
// Determine the codepage base. For the Indic scripts, and Khmer, it is // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
// sufficient to divide by kIndicCodePageSize but Myanmar is all over the // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
// unicode code space, so use its script id. // unicode code space, so use its script id.
int base = ch / kIndicCodePageSize; int base = ch / kIndicCodePageSize;
IcuErrorCode err; IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err); UScriptCode script_code = uscript_getScript(ch, err);
if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode && if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
script_code != USCRIPT_COMMON) || script_code != USCRIPT_COMMON) ||
script_code == USCRIPT_MYANMAR) { script_code == USCRIPT_MYANMAR) {
if (script_code == USCRIPT_MYANMAR) if (script_code == USCRIPT_MYANMAR)
@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript(
char32 codebase = static_cast<char32>(base * kIndicCodePageSize); char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
// Check for validity. // Check for validity.
if (codebase == static_cast<char32>(ViramaScript::kMyanmar) || if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
codebase == static_cast<char32>(ViramaScript::kJavanese) ||
codebase == static_cast<char32>(ViramaScript::kKhmer) || codebase == static_cast<char32>(ViramaScript::kKhmer) ||
(static_cast<char32>(ViramaScript::kDevanagari) <= codebase && (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
codebase <= static_cast<char32>(ViramaScript::kSinhala))) { codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript(
bool Validator::IsVirama(char32 unicode) { bool Validator::IsVirama(char32 unicode) {
return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode && return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
(unicode & 0x7f) == 0x4d) || (unicode & 0x7f) == 0x4d) ||
unicode == kSinhalaVirama || unicode == kMyanmarVirama || unicode == kSinhalaVirama ||
unicode == kJavaneseVirama ||
unicode == kMyanmarVirama ||
unicode == kKhmerVirama; unicode == kKhmerVirama;
} }
@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) {
bool Validator::IsSubscriptScript() const { bool Validator::IsSubscriptScript() const {
return script_ == ViramaScript::kTelugu || return script_ == ViramaScript::kTelugu ||
script_ == ViramaScript::kKannada || script_ == ViramaScript::kKannada ||
script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer; script_ == ViramaScript::kJavanese ||
script_ == ViramaScript::kMyanmar ||
script_ == ViramaScript::kKhmer;
} }
void Validator::ComputeClassCodes(const std::vector<char32>& text) { void Validator::ComputeClassCodes(const std::vector<char32>& text) {

View File

@ -64,6 +64,7 @@ enum class ViramaScript : char32 {
kSinhala = 0xd80, kSinhala = 0xd80,
kMyanmar = 0x1000, kMyanmar = 0x1000,
kKhmer = 0x1780, kKhmer = 0x1780,
kJavanese = 0xa980,
}; };
// Base class offers a validation API and protected methods to allow subclasses // Base class offers a validation API and protected methods to allow subclasses
@ -221,6 +222,9 @@ class Validator {
static const char32 kSinhalaVirama = 0xdca; static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039; static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2; static const char32 kKhmerVirama = 0x17d2;
// Javanese Script - aksarajawa
static const char32 kJavaneseVirama = 0xa9c0;
static const char32 kMaxJavaneseUnicode = 0xa9df;
// Script we are operating on. // Script we are operating on.
ViramaScript script_; ViramaScript script_;