Initial commit to add Aksara Jawa - Javanese script

This commit is contained in:
Shree Devi Kumar 2018-08-03 13:59:27 +00:00
parent e9b4e21e6f
commit 0eb7be1cd1
7 changed files with 185 additions and 8 deletions

View File

@ -186,9 +186,9 @@ set(unicharset_training_src
unicharset_training_utils.h
validate_grapheme.h validate_indic.h validate_khmer.h
validate_myanmar.h validator.h
validate_javanese.h validate_myanmar.h validator.h
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
validate_myanmar.cpp validator.cpp
validate_javanese.cpp validate_myanmar.cpp validator.cpp
)
add_library (unicharset_training ${unicharset_training_src})

View File

@ -45,6 +45,7 @@ noinst_HEADERS = \
util.h \
validate_grapheme.h \
validate_indic.h \
validate_javanese.h \
validate_khmer.h \
validate_myanmar.h \
validator.h
@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \
unicharset_training_utils.cpp \
validate_grapheme.cpp \
validate_indic.cpp \
validate_javanese.cpp \
validate_khmer.cpp \
validate_myanmar.cpp \
validator.cpp

View File

@ -21,7 +21,7 @@
VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
ell eng enm epo est eus fas fil fin fra frk frm gle glg
grc guj hat heb hin hrv hun hye iku ind isl ita ita_old
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
jav jpn kan kat kat_old kaz khm kir kor kur lao lat
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
pan pol por pus ron rus san sin slk slv snd spa spa_old
@ -961,6 +961,7 @@ set_lang_specific_parameters() {
glg ) ;;
hat ) ;;
hrv ) ;;
iast ) ;;
ind ) ;;
isl ) ;;
ita ) ;;
@ -1171,7 +1172,7 @@ set_lang_specific_parameters() {
LANG_IS_RTL="1"
NORM_MODE="2" ;;
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
dzo | sin | san | bod | ori | khm | mya | tha | lao )
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
LANG_IS_RTL="0"
NORM_MODE="2" ;;
* )

View File

@ -0,0 +1,116 @@
/**********************************************************************
* File: validate_javanese.cpp
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#include "validate_javanese.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for a Javanese Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/charts/PDF/UA980.pdf
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
bool ValidateJavanese::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Javanese syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if ( codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}
// Maps a Unicode code point to the CharClass used by the grapheme matcher.
// Vedic accents and the zero-width (non-)joiners are recognised first; all
// other codes are classified by their offset from the start of the script's
// code block, and anything outside that block is kOther.
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;

  // Position of ch relative to the first code of the script's code block.
  const int offset = ch - static_cast<char32>(script_);
  const bool in_block = offset >= 0 && offset < kIndicCodePageSize;
  if (!in_block) return CharClass::kOther;

  // Single-code classes.
  if (offset == 0x40) return CharClass::kVirama;  // A9C0 PANGKON
  if (offset == 0x33) return CharClass::kNukta;   // A9B3 CECAK TELU

  // Vowel modifiers: the first four codes of the block, A9B4 TARUNG and
  // A9BE-A9BF PENGKAL-CAKRA.
  const bool is_vowel_modifier =
      offset < 0x4 || offset == 0x34 || offset == 0x3e || offset == 0x3f;
  if (is_vowel_modifier) return CharClass::kVowelModifier;

  // With the cases above removed, the remaining offsets up to 0x32 are
  // consonants (including independent vowels) and those up to 0x3d are matras.
  if (offset <= 0x32) return CharClass::kConsonant;
  if (offset <= 0x3d) return CharClass::kMatra;
  return CharClass::kOther;
}
} // namespace tesseract

View File

@ -0,0 +1,45 @@
/**********************************************************************
* File: validate_javanese.h
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
#include "validator.h"
namespace tesseract {
// Validator specialization that validates and segments text written in the
// Javanese script (aksara jawa).
class ValidateJavanese : public Validator {
 public:
  // Forwards the script and the error-reporting flag to the Validator base.
  ValidateJavanese(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateJavanese() = default;

 protected:
  // Classifies the Unicode code point ch as a CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;

  // Checks whether the codes at codes_[codes_used_...] match the pattern for a
  // Javanese grapheme. On success the grapheme is consumed and copied to
  // parts_ and output_, and true is returned; otherwise codes_used_ is not
  // incremented.
  bool ConsumeGraphemeIfValid() override;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_

View File

@ -10,6 +10,7 @@
#include "unicode/uscript.h" // From libicu
#include "validate_grapheme.h"
#include "validate_indic.h"
#include "validate_javanese.h"
#include "validate_khmer.h"
#include "validate_myanmar.h"
@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
case ViramaScript::kNonVirama:
return std::unique_ptr<Validator>(
new ValidateGrapheme(script, report_errors));
case ViramaScript::kJavanese:
return std::unique_ptr<Validator>(
new ValidateJavanese(script, report_errors));
case ViramaScript::kMyanmar:
return std::unique_ptr<Validator>(
new ValidateMyanmar(script, report_errors));
@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript(
const std::vector<char32>& utf32) {
std::unordered_map<int, int> histogram;
for (char32 ch : utf32) {
// Determine the codepage base. For the Indic scripts, and Khmer, it is
// Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
// sufficient to divide by kIndicCodePageSize but Myanmar is all over the
// unicode code space, so use its script id.
int base = ch / kIndicCodePageSize;
IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err);
if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
script_code != USCRIPT_COMMON) ||
script_code == USCRIPT_MYANMAR) {
if (script_code == USCRIPT_MYANMAR)
@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript(
char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
// Check for validity.
if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
codebase == static_cast<char32>(ViramaScript::kJavanese) ||
codebase == static_cast<char32>(ViramaScript::kKhmer) ||
(static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript(
bool Validator::IsVirama(char32 unicode) {
return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
(unicode & 0x7f) == 0x4d) ||
unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
unicode == kSinhalaVirama ||
unicode == kJavaneseVirama ||
unicode == kMyanmarVirama ||
unicode == kKhmerVirama;
}
@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) {
bool Validator::IsSubscriptScript() const {
return script_ == ViramaScript::kTelugu ||
script_ == ViramaScript::kKannada ||
script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer;
script_ == ViramaScript::kJavanese ||
script_ == ViramaScript::kMyanmar ||
script_ == ViramaScript::kKhmer;
}
void Validator::ComputeClassCodes(const std::vector<char32>& text) {

View File

@ -64,6 +64,7 @@ enum class ViramaScript : char32 {
kSinhala = 0xd80,
kMyanmar = 0x1000,
kKhmer = 0x1780,
kJavanese = 0xa980,
};
// Base class offers a validation API and protected methods to allow subclasses
@ -221,6 +222,9 @@ class Validator {
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Javanese Script - aksarajawa
static const char32 kJavaneseVirama = 0xa9c0;
static const char32 kMaxJavaneseUnicode = 0xa9df;
// Script we are operating on.
ViramaScript script_;