mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-11 23:19:04 +08:00
Merge pull request #1823 from Shreeshrii/javanese
Add support for Javanese script - aksara Jawa
This commit is contained in:
commit
3b723ba102
@ -186,9 +186,9 @@ set(unicharset_training_src
|
|||||||
unicharset_training_utils.h
|
unicharset_training_utils.h
|
||||||
|
|
||||||
validate_grapheme.h validate_indic.h validate_khmer.h
|
validate_grapheme.h validate_indic.h validate_khmer.h
|
||||||
validate_myanmar.h validator.h
|
validate_javanese.h validate_myanmar.h validator.h
|
||||||
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
|
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
|
||||||
validate_myanmar.cpp validator.cpp
|
validate_javanese.cpp validate_myanmar.cpp validator.cpp
|
||||||
|
|
||||||
)
|
)
|
||||||
add_library (unicharset_training ${unicharset_training_src})
|
add_library (unicharset_training ${unicharset_training_src})
|
||||||
|
@ -45,6 +45,7 @@ noinst_HEADERS = \
|
|||||||
util.h \
|
util.h \
|
||||||
validate_grapheme.h \
|
validate_grapheme.h \
|
||||||
validate_indic.h \
|
validate_indic.h \
|
||||||
|
validate_javanese.h \
|
||||||
validate_khmer.h \
|
validate_khmer.h \
|
||||||
validate_myanmar.h \
|
validate_myanmar.h \
|
||||||
validator.h
|
validator.h
|
||||||
@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \
|
|||||||
unicharset_training_utils.cpp \
|
unicharset_training_utils.cpp \
|
||||||
validate_grapheme.cpp \
|
validate_grapheme.cpp \
|
||||||
validate_indic.cpp \
|
validate_indic.cpp \
|
||||||
|
validate_javanese.cpp \
|
||||||
validate_khmer.cpp \
|
validate_khmer.cpp \
|
||||||
validate_myanmar.cpp \
|
validate_myanmar.cpp \
|
||||||
validator.cpp
|
validator.cpp
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
|
VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
|
||||||
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
|
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
|
||||||
ell eng enm epo est eus fas fil fin fra frk frm gle glg
|
ell eng enm epo est eus fas fil fin fra frk frm gle glg
|
||||||
grc guj hat heb hin hrv hun hye iku ind isl ita ita_old
|
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
|
||||||
jav jpn kan kat kat_old kaz khm kir kor kur lao lat
|
jav jpn kan kat kat_old kaz khm kir kor kur lao lat
|
||||||
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
|
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
|
||||||
pan pol por pus ron rus san sin slk slv snd spa spa_old
|
pan pol por pus ron rus san sin slk slv snd spa spa_old
|
||||||
@ -961,6 +961,7 @@ set_lang_specific_parameters() {
|
|||||||
glg ) ;;
|
glg ) ;;
|
||||||
hat ) ;;
|
hat ) ;;
|
||||||
hrv ) ;;
|
hrv ) ;;
|
||||||
|
iast ) ;;
|
||||||
ind ) ;;
|
ind ) ;;
|
||||||
isl ) ;;
|
isl ) ;;
|
||||||
ita ) ;;
|
ita ) ;;
|
||||||
@ -1171,7 +1172,7 @@ set_lang_specific_parameters() {
|
|||||||
LANG_IS_RTL="1"
|
LANG_IS_RTL="1"
|
||||||
NORM_MODE="2" ;;
|
NORM_MODE="2" ;;
|
||||||
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
|
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
|
||||||
dzo | sin | san | bod | ori | khm | mya | tha | lao )
|
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
|
||||||
LANG_IS_RTL="0"
|
LANG_IS_RTL="0"
|
||||||
NORM_MODE="2" ;;
|
NORM_MODE="2" ;;
|
||||||
* )
|
* )
|
||||||
|
263
src/training/validate_javanese.cpp
Normal file
263
src/training/validate_javanese.cpp
Normal file
@ -0,0 +1,263 @@
|
|||||||
|
/**********************************************************************
|
||||||
|
* File: validate_javanese.cpp
|
||||||
|
* Description: Text validator for Javanese Script - aksara jawa.
|
||||||
|
* Author: Shree Devi Kumar
|
||||||
|
* Created: August 03, 2018
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
#include "validate_javanese.h"
|
||||||
|
#include "errcode.h"
|
||||||
|
#include "tprintf.h"
|
||||||
|
|
||||||
|
namespace tesseract {
|
||||||
|
|
||||||
|
// Returns whether codes matches the pattern for a Javanese Grapheme.
|
||||||
|
// Taken from unicode standard:
|
||||||
|
// http://www.unicode.org/charts/PDF/UA980.pdf
|
||||||
|
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
|
||||||
|
// The order of components in an orthographic syllable as expressed in BNF is:
|
||||||
|
// {C F} C {{R}Y} {V{A}} {Z}
|
||||||
|
// Translated to the codes used by the CharClass enum:
|
||||||
|
// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
|
||||||
|
// Also the Consonant class here includes independent vowels, as they are
|
||||||
|
// treated the same anyway.
|
||||||
|
// Indic - for reference
|
||||||
|
// + vowel Grapheme: V[D](v)*
|
||||||
|
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
|
||||||
|
|
||||||
|
bool ValidateJavanese::ConsumeGraphemeIfValid() {
|
||||||
|
switch (codes_[codes_used_].first) {
|
||||||
|
case CharClass::kConsonant:
|
||||||
|
return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
|
||||||
|
case CharClass::kVowel:
|
||||||
|
case CharClass::kVedicMark:
|
||||||
|
return ConsumeVowelIfValid();
|
||||||
|
case CharClass::kZeroWidthJoiner:
|
||||||
|
case CharClass::kZeroWidthNonJoiner:
|
||||||
|
// Apart from within an aksara, joiners are silently dropped.
|
||||||
|
if (report_errors_)
|
||||||
|
tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
|
||||||
|
++codes_used_;
|
||||||
|
return true;
|
||||||
|
case CharClass::kOther:
|
||||||
|
UseMultiCode(1);
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
if (report_errors_) {
|
||||||
|
tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
|
||||||
|
codes_[codes_used_].first, codes_[codes_used_].second);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Maps a unicode to the CharClass used by the Javanese grapheme parser.
// Vedic accents and the two zero-width joiners are recognised first; every
// other unicode is classified by its offset from the start of the code block
// given by script_, and anything outside that block is kOther.
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) {
    return CharClass::kVedicMark;
  }
  if (ch == kZeroWidthNonJoiner) {
    return CharClass::kZeroWidthNonJoiner;
  }
  if (ch == kZeroWidthJoiner) {
    return CharClass::kZeroWidthJoiner;
  }

  // Position of ch relative to the first unicode of the script's code block.
  const int offset = ch - static_cast<char32>(script_);
  const bool in_block = 0 <= offset && offset < kIndicCodePageSize;
  if (!in_block) {
    return CharClass::kOther;
  }

  // Consecutive offset ranges in ascending order. Each entry covers the
  // offsets after the previous entry's last_offset up to its own, inclusive.
  struct OffsetRange {
    int last_offset;
    CharClass char_class;
  };
  static const OffsetRange kRanges[] = {
      {0x03, CharClass::kVowelModifier},
      {0x32, CharClass::kConsonant},   // includes independent vowels
      {0x33, CharClass::kNukta},       // A9B3 CECAK TELU
      {0x34, CharClass::kMatraPiece},  // A9B4 TARUNG two part vowels
      {0x3d, CharClass::kMatra},
      {0x3f, CharClass::kNukta},       // A9BE-A9BF PENGKAL-CAKRA medials
      {0x40, CharClass::kVirama},      // A9C0 PANGKON
  };
  for (const OffsetRange& range : kRanges) {
    if (offset <= range.last_offset) {
      return range.char_class;
    }
  }
  // The remainder of the block has no special role in the grapheme grammar.
  return CharClass::kOther;
}
|
||||||
|
|
||||||
|
// Helper consumes/copies a virama and any associated post-virama joiners.
|
||||||
|
// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
|
||||||
|
// no joiner at all) must be followed by a consonant.
|
||||||
|
// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
|
||||||
|
// consonant, space, or character from a different script. We clean up the
|
||||||
|
// representation to make it consistent by adding a ZWNJ if missing from a
|
||||||
|
// non-linking virama. Returns false with an invalid sequence.
|
||||||
|
bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
|
||||||
|
int num_codes = codes_.size();
|
||||||
|
if (joiner.first == CharClass::kOther) {
|
||||||
|
CodeOnlyToOutput();
|
||||||
|
if (codes_used_ < num_codes &&
|
||||||
|
codes_[codes_used_].second == kZeroWidthJoiner) {
|
||||||
|
// Post-matra viramas must be explicit, so no joiners allowed here.
|
||||||
|
if (post_matra) {
|
||||||
|
if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (codes_used_ + 1 < num_codes &&
|
||||||
|
codes_[codes_used_ - 2].second != kCakra &&
|
||||||
|
(codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
|
||||||
|
codes_[codes_used_ + 1].second == kPengkal ||
|
||||||
|
codes_[codes_used_ + 1].second == kCakra)) {
|
||||||
|
// This combination will be picked up later.
|
||||||
|
ASSERT_HOST(!CodeOnlyToOutput());
|
||||||
|
} else {
|
||||||
|
// Half-form with optional Nukta.
|
||||||
|
int len = output_.size() + 1 - output_used_;
|
||||||
|
if (UseMultiCode(len)) return true;
|
||||||
|
}
|
||||||
|
if (codes_used_ < num_codes &&
|
||||||
|
codes_[codes_used_].second == kZeroWidthNonJoiner) {
|
||||||
|
if (output_used_ == output_.size() ||
|
||||||
|
output_[output_used_] != kCakra) {
|
||||||
|
if (report_errors_) {
|
||||||
|
tprintf("Virama ZWJ ZWNJ : base=0x%x!\n",
|
||||||
|
static_cast<int>(script_));
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (codes_used_ == num_codes ||
|
||||||
|
codes_[codes_used_].first != CharClass::kConsonant ||
|
||||||
|
post_matra) {
|
||||||
|
if (codes_used_ == num_codes ||
|
||||||
|
codes_[codes_used_].second != kZeroWidthNonJoiner) {
|
||||||
|
// It is valid to have an unterminated virama at the end of a word, but
|
||||||
|
// for consistency, we will always add ZWNJ if not present.
|
||||||
|
output_.push_back(kZeroWidthNonJoiner);
|
||||||
|
} else {
|
||||||
|
CodeOnlyToOutput();
|
||||||
|
}
|
||||||
|
// Explicit virama [H z]
|
||||||
|
MultiCodePart(2);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Pre-virama joiner [{Z|z} H] requests specific conjunct.
|
||||||
|
if (UseMultiCode(2)) {
|
||||||
|
if (report_errors_)
|
||||||
|
tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (codes_[codes_used_].second == kZeroWidthJoiner ||
|
||||||
|
codes_[codes_used_].second == kZeroWidthNonJoiner) {
|
||||||
|
if (report_errors_) {
|
||||||
|
tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
|
||||||
|
codes_[codes_used_].second);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// It is good so far as it goes.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper consumes/copies a series of consonants separated by viramas while
|
||||||
|
// valid, but not any vowel or other modifiers.
|
||||||
|
bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
|
||||||
|
const int num_codes = codes_.size();
|
||||||
|
// Consonant aksara
|
||||||
|
do {
|
||||||
|
CodeOnlyToOutput();
|
||||||
|
// Special case of medial consonants [H Z Pengkal/Cakra].
|
||||||
|
int index = output_.size() - 3;
|
||||||
|
if (output_used_ <= index &&
|
||||||
|
(output_.back() == kPengkal || output_.back() == kCakra) &&
|
||||||
|
IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
|
||||||
|
MultiCodePart(3);
|
||||||
|
}
|
||||||
|
bool have_nukta = false;
|
||||||
|
if (codes_used_ < num_codes &&
|
||||||
|
codes_[codes_used_].first == CharClass::kNukta) {
|
||||||
|
have_nukta = true;
|
||||||
|
CodeOnlyToOutput();
|
||||||
|
}
|
||||||
|
// Test for subscript conjunct.
|
||||||
|
index = output_.size() - 2 - have_nukta;
|
||||||
|
if (output_used_ <= index && IsSubscriptScript() &&
|
||||||
|
IsVirama(output_[index])) {
|
||||||
|
// Output previous virama, consonant + optional nukta.
|
||||||
|
MultiCodePart(2 + have_nukta);
|
||||||
|
}
|
||||||
|
IndicPair joiner(CharClass::kOther, 0);
|
||||||
|
if (codes_used_ < num_codes &&
|
||||||
|
(codes_[codes_used_].second == kZeroWidthJoiner ||
|
||||||
|
(codes_[codes_used_].second == kZeroWidthNonJoiner &&
|
||||||
|
script_ == ViramaScript::kMalayalam))) {
|
||||||
|
joiner = codes_[codes_used_];
|
||||||
|
if (++codes_used_ == num_codes) {
|
||||||
|
if (report_errors_) {
|
||||||
|
tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
|
||||||
|
joiner.second);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (codes_[codes_used_].first == CharClass::kVirama) {
|
||||||
|
output_.push_back(joiner.second);
|
||||||
|
} else {
|
||||||
|
if (report_errors_) {
|
||||||
|
tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
|
||||||
|
output_.back(), joiner.second, codes_[codes_used_].second);
|
||||||
|
}
|
||||||
|
joiner = std::make_pair(CharClass::kOther, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (codes_used_ < num_codes &&
|
||||||
|
codes_[codes_used_].first == CharClass::kVirama) {
|
||||||
|
if (!ConsumeViramaIfValid(joiner, false)) return false;
|
||||||
|
} else {
|
||||||
|
break; // No virama, so the run of consonants is over.
|
||||||
|
}
|
||||||
|
} while (codes_used_ < num_codes &&
|
||||||
|
codes_[codes_used_].first == CharClass::kConsonant);
|
||||||
|
if (output_used_ < output_.size()) MultiCodePart(1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper consumes/copies a tail part of a consonant, comprising optional
|
||||||
|
// matra/piece, vowel modifier, vedic mark, terminating virama.
|
||||||
|
bool ValidateJavanese::ConsumeConsonantTailIfValid() {
|
||||||
|
if (codes_used_ == codes_.size()) return true;
|
||||||
|
// No virama: Finish the grapheme.
|
||||||
|
// Are multiple matras allowed?
|
||||||
|
if (codes_[codes_used_].first == CharClass::kMatra) {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
if (codes_[codes_used_].first == CharClass::kMatraPiece) {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
}
|
||||||
|
while (codes_[codes_used_].first == CharClass::kVedicMark) {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
}
|
||||||
|
if (codes_[codes_used_].first == CharClass::kVirama) {
|
||||||
|
if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// What we have consumed so far is a valid consonant cluster.
|
||||||
|
if (output_used_ < output_.size()) MultiCodePart(1);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper consumes/copies a vowel and optional modifiers.
|
||||||
|
bool ValidateJavanese::ConsumeVowelIfValid() {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
}
|
||||||
|
while (codes_[codes_used_].first == CharClass::kVedicMark) {
|
||||||
|
if (UseMultiCode(1)) return true;
|
||||||
|
}
|
||||||
|
// What we have consumed so far is a valid vowel cluster.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace tesseract
|
||||||
|
|
63
src/training/validate_javanese.h
Normal file
63
src/training/validate_javanese.h
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
/**********************************************************************
|
||||||
|
* File: validate_javanese.h
|
||||||
|
* Description: Text validator for Javanese Script - aksara jawa.
|
||||||
|
* Author: Shree Devi Kumar
|
||||||
|
* Created: August 03, 2018
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
|
||||||
|
#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
|
||||||
|
|
||||||
|
#include "validator.h"
|
||||||
|
|
||||||
|
|
||||||
|
namespace tesseract {
|
||||||
|
|
||||||
|
// Subclass of Validator that validates and segments Javanese scripts
|
||||||
|
|
||||||
|
class ValidateJavanese : public Validator {
|
||||||
|
public:
|
||||||
|
ValidateJavanese(ViramaScript script, bool report_errors)
|
||||||
|
: Validator(script, report_errors) {}
|
||||||
|
~ValidateJavanese() {}
|
||||||
|
|
||||||
|
protected:
|
||||||
|
// Returns whether codes matches the pattern for an Javanese Grapheme.
|
||||||
|
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
|
||||||
|
// parts_ and output_. Returns true if a valid Grapheme was consumed,
|
||||||
|
// otherwise does not increment codes_used_.
|
||||||
|
bool ConsumeGraphemeIfValid() override;
|
||||||
|
// Returns the CharClass corresponding to the given Unicode ch.
|
||||||
|
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Helper consumes/copies a virama and any associated post-virama joiners.
|
||||||
|
bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
|
||||||
|
// Helper consumes/copies a series of consonants separated by viramas while
|
||||||
|
// valid, but not any vowel or other modifiers.
|
||||||
|
bool ConsumeConsonantHeadIfValid();
|
||||||
|
// Helper consumes/copies a tail part of a consonant, comprising optional
|
||||||
|
// matra/piece, vowel modifier, vedic mark, terminating virama.
|
||||||
|
bool ConsumeConsonantTailIfValid();
|
||||||
|
// Helper consumes/copies a vowel and optional modifiers.
|
||||||
|
bool ConsumeVowelIfValid();
|
||||||
|
|
||||||
|
// Some special unicodes used only for Javanese processing.
|
||||||
|
static const char32 kPengkal = 0xa9be; // Javanese Ya
|
||||||
|
static const char32 kCakra = 0xa9bf; // Javanese Ra
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace tesseract
|
||||||
|
|
||||||
|
#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
|
@ -10,6 +10,7 @@
|
|||||||
#include "unicode/uscript.h" // From libicu
|
#include "unicode/uscript.h" // From libicu
|
||||||
#include "validate_grapheme.h"
|
#include "validate_grapheme.h"
|
||||||
#include "validate_indic.h"
|
#include "validate_indic.h"
|
||||||
|
#include "validate_javanese.h"
|
||||||
#include "validate_khmer.h"
|
#include "validate_khmer.h"
|
||||||
#include "validate_myanmar.h"
|
#include "validate_myanmar.h"
|
||||||
|
|
||||||
@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
|
|||||||
case ViramaScript::kNonVirama:
|
case ViramaScript::kNonVirama:
|
||||||
return std::unique_ptr<Validator>(
|
return std::unique_ptr<Validator>(
|
||||||
new ValidateGrapheme(script, report_errors));
|
new ValidateGrapheme(script, report_errors));
|
||||||
|
case ViramaScript::kJavanese:
|
||||||
|
return std::unique_ptr<Validator>(
|
||||||
|
new ValidateJavanese(script, report_errors));
|
||||||
case ViramaScript::kMyanmar:
|
case ViramaScript::kMyanmar:
|
||||||
return std::unique_ptr<Validator>(
|
return std::unique_ptr<Validator>(
|
||||||
new ValidateMyanmar(script, report_errors));
|
new ValidateMyanmar(script, report_errors));
|
||||||
@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript(
|
|||||||
const std::vector<char32>& utf32) {
|
const std::vector<char32>& utf32) {
|
||||||
std::unordered_map<int, int> histogram;
|
std::unordered_map<int, int> histogram;
|
||||||
for (char32 ch : utf32) {
|
for (char32 ch : utf32) {
|
||||||
// Determine the codepage base. For the Indic scripts, and Khmer, it is
|
// Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
|
||||||
// sufficient to divide by kIndicCodePageSize but Myanmar is all over the
|
// sufficient to divide by kIndicCodePageSize but Myanmar is all over the
|
||||||
// unicode code space, so use its script id.
|
// unicode code space, so use its script id.
|
||||||
int base = ch / kIndicCodePageSize;
|
int base = ch / kIndicCodePageSize;
|
||||||
IcuErrorCode err;
|
IcuErrorCode err;
|
||||||
UScriptCode script_code = uscript_getScript(ch, err);
|
UScriptCode script_code = uscript_getScript(ch, err);
|
||||||
if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
|
if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
|
||||||
script_code != USCRIPT_COMMON) ||
|
script_code != USCRIPT_COMMON) ||
|
||||||
script_code == USCRIPT_MYANMAR) {
|
script_code == USCRIPT_MYANMAR) {
|
||||||
if (script_code == USCRIPT_MYANMAR)
|
if (script_code == USCRIPT_MYANMAR)
|
||||||
@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript(
|
|||||||
char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
|
char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
|
||||||
// Check for validity.
|
// Check for validity.
|
||||||
if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
|
if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
|
||||||
|
codebase == static_cast<char32>(ViramaScript::kJavanese) ||
|
||||||
codebase == static_cast<char32>(ViramaScript::kKhmer) ||
|
codebase == static_cast<char32>(ViramaScript::kKhmer) ||
|
||||||
(static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
|
(static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
|
||||||
codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
|
codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
|
||||||
@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript(
|
|||||||
bool Validator::IsVirama(char32 unicode) {
|
bool Validator::IsVirama(char32 unicode) {
|
||||||
return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
|
return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
|
||||||
(unicode & 0x7f) == 0x4d) ||
|
(unicode & 0x7f) == 0x4d) ||
|
||||||
unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
|
unicode == kSinhalaVirama ||
|
||||||
|
unicode == kJavaneseVirama ||
|
||||||
|
unicode == kMyanmarVirama ||
|
||||||
unicode == kKhmerVirama;
|
unicode == kKhmerVirama;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) {
|
|||||||
bool Validator::IsSubscriptScript() const {
|
bool Validator::IsSubscriptScript() const {
|
||||||
return script_ == ViramaScript::kTelugu ||
|
return script_ == ViramaScript::kTelugu ||
|
||||||
script_ == ViramaScript::kKannada ||
|
script_ == ViramaScript::kKannada ||
|
||||||
script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer;
|
script_ == ViramaScript::kJavanese ||
|
||||||
|
script_ == ViramaScript::kMyanmar ||
|
||||||
|
script_ == ViramaScript::kKhmer;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Validator::ComputeClassCodes(const std::vector<char32>& text) {
|
void Validator::ComputeClassCodes(const std::vector<char32>& text) {
|
||||||
|
@ -64,6 +64,7 @@ enum class ViramaScript : char32 {
|
|||||||
kSinhala = 0xd80,
|
kSinhala = 0xd80,
|
||||||
kMyanmar = 0x1000,
|
kMyanmar = 0x1000,
|
||||||
kKhmer = 0x1780,
|
kKhmer = 0x1780,
|
||||||
|
kJavanese = 0xa980,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Base class offers a validation API and protected methods to allow subclasses
|
// Base class offers a validation API and protected methods to allow subclasses
|
||||||
@ -221,6 +222,9 @@ class Validator {
|
|||||||
static const char32 kSinhalaVirama = 0xdca;
|
static const char32 kSinhalaVirama = 0xdca;
|
||||||
static const char32 kMyanmarVirama = 0x1039;
|
static const char32 kMyanmarVirama = 0x1039;
|
||||||
static const char32 kKhmerVirama = 0x17d2;
|
static const char32 kKhmerVirama = 0x17d2;
|
||||||
|
// Javanese Script - aksarajawa
|
||||||
|
static const char32 kJavaneseVirama = 0xa9c0;
|
||||||
|
static const char32 kMaxJavaneseUnicode = 0xa9df;
|
||||||
|
|
||||||
// Script we are operating on.
|
// Script we are operating on.
|
||||||
ViramaScript script_;
|
ViramaScript script_;
|
||||||
|
Loading…
Reference in New Issue
Block a user