mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-11 15:09:03 +08:00
161 lines
6.3 KiB
C++
161 lines
6.3 KiB
C++
|
#include "validate_myanmar.h"
|
||
|
#include "errcode.h"
|
||
|
#include "icuerrorcode.h"
|
||
|
#include "tprintf.h"
|
||
|
#include "unicode/uchar.h" // From libicu
|
||
|
#include "unicode/uscript.h" // From libicu
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
// Returns whether codes matches the pattern for a Myanmar Grapheme.
|
||
|
// Taken directly from the unicode table 16-3.
|
||
|
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
|
||
|
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
|
||
|
int num_codes = codes_.size();
|
||
|
if (codes_used_ == num_codes) return true;
|
||
|
// Other.
|
||
|
if (IsMyanmarOther(codes_[codes_used_].second)) {
|
||
|
UseMultiCode(1);
|
||
|
return true;
|
||
|
}
|
||
|
// Kinzi.
|
||
|
if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
|
||
|
codes_[codes_used_ + 1].second == kMyanmarAsat &&
|
||
|
codes_[codes_used_ + 2].second == kMyanmarVirama) {
|
||
|
ASSERT_HOST(!CodeOnlyToOutput());
|
||
|
ASSERT_HOST(!CodeOnlyToOutput());
|
||
|
if (UseMultiCode(3)) return true;
|
||
|
}
|
||
|
// Base consonant/vowel. NOTE that since everything in Myanmar appears to be
|
||
|
// optional, except the base, this is the only place where invalid input can
|
||
|
// be detected and false returned.
|
||
|
if (IsMyanmarLetter(codes_[codes_used_].second)) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
} else {
|
||
|
if (report_errors_) {
|
||
|
tprintf("Invalid start of Myanmar syllable:0x%x\n",
|
||
|
codes_[codes_used_].second);
|
||
|
}
|
||
|
return false; // One of these is required.
|
||
|
}
|
||
|
if (ConsumeSubscriptIfPresent()) return true;
|
||
|
ConsumeOptionalSignsIfPresent();
|
||
|
// What we have consumed so far is a valid syllable.
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
|
||
|
// is little correspondence between the content of table 16-3 and the char
|
||
|
// classes of the Indic languages. (Experts may disagree and improve!)
|
||
|
// In unicode table 16-3 there is basically a long list of optional characters,
|
||
|
// which can be coded quite easily.
|
||
|
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
|
||
|
// The table also allows sequences that still result in dotted circles!!
|
||
|
// So with a lot of guesswork the rest have been added in a reasonable place.
|
||
|
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
|
||
|
if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
|
||
|
return CharClass::kOther;
|
||
|
}
|
||
|
|
||
|
// Helper consumes/copies a virama and any subscript consonant.
|
||
|
// Returns true if the end of input is reached.
|
||
|
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
|
||
|
// Subscript consonant. It appears there can be only one.
|
||
|
int num_codes = codes_.size();
|
||
|
if (codes_used_ + 1 < num_codes &&
|
||
|
codes_[codes_used_].second == kMyanmarVirama) {
|
||
|
if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
|
||
|
ASSERT_HOST(!CodeOnlyToOutput());
|
||
|
if (UseMultiCode(2)) return true;
|
||
|
}
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Helper consumes/copies a series of optional signs.
|
||
|
// Returns true if the end of input is reached.
|
||
|
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
|
||
|
// The following characters are allowed, all optional, and in sequence.
|
||
|
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
|
||
|
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
|
||
|
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
|
||
|
0x1081, 0x1031});
|
||
|
for (char32 ch : kMedials) {
|
||
|
if (codes_[codes_used_].second == ch) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
if (ch == kMyanmarMedialYa &&
|
||
|
codes_[codes_used_].second == kMyanmarAsat) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Vowel sign i, ii, ai.
|
||
|
char32 ch = codes_[codes_used_].second;
|
||
|
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
}
|
||
|
// Vowel sign u, uu, and extensions.
|
||
|
ch = codes_[codes_used_].second;
|
||
|
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
|
||
|
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
|
||
|
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
|
||
|
ch == 0x109c || ch == 0x109d) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
}
|
||
|
// Tall aa, aa with optional asat.
|
||
|
if (codes_[codes_used_].second == 0x102b ||
|
||
|
codes_[codes_used_].second == 0x102c) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
if (codes_[codes_used_].second == kMyanmarAsat) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
}
|
||
|
}
|
||
|
// The following characters are allowed, all optional, and in sequence.
|
||
|
const std::vector<char32> kSigns({0x1036, 0x1037});
|
||
|
for (char32 ch : kSigns) {
|
||
|
if (codes_[codes_used_].second == ch) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
}
|
||
|
}
|
||
|
// Tone mark extensions.
|
||
|
ch = codes_[codes_used_].second;
|
||
|
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
|
||
|
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
|
||
|
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
|
||
|
(0xaa7b <= ch && ch <= 0xaa7d)) {
|
||
|
if (UseMultiCode(1)) return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Returns true if the unicode is a Myanmar "letter" including consonants
|
||
|
// and independent vowels. Although table 16-3 distinguishes between some
|
||
|
// base consonants and vowels, the extensions make no such distinction, so we
|
||
|
// put them all into a single bucket.
|
||
|
/* static */
|
||
|
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
|
||
|
return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
|
||
|
(0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
|
||
|
ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
|
||
|
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
|
||
|
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
|
||
|
(0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
|
||
|
ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
|
||
|
}
|
||
|
|
||
|
// Returns true if ch is a Myanmar digit or other symbol that does not take
|
||
|
// part in being a syllable.
|
||
|
/* static */
|
||
|
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
|
||
|
IcuErrorCode err;
|
||
|
UScriptCode script_code = uscript_getScript(ch, err);
|
||
|
if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
|
||
|
ch != Validator::kZeroWidthNonJoiner)
|
||
|
return true;
|
||
|
return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
|
||
|
(0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
|
||
|
(0xaa74 <= ch && ch <= 0xaa79);
|
||
|
}
|
||
|
|
||
|
} // namespace tesseract
|