mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-11 13:47:48 +08:00
0e1a1fc3cf
This also fixes a regression in validate_grapheme_test introduced
by commit 32e9d7c8f5
.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
282 lines
11 KiB
C++
282 lines
11 KiB
C++
#include "validate_indic.h"
|
|
#include "errcode.h"
|
|
#include "tprintf.h"
|
|
|
|
namespace tesseract {
|
|
|
|
// Returns whether codes matches the pattern for an Indic Grapheme.
|
|
// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
|
|
// has a BNF for valid syllables (Graphemes) which is modified slightly
|
|
// for Unicode. Notably U+200C and U+200D are used before/after the
|
|
// virama/virama to express explicit or soft viramas.
|
|
// Also the unicode v.9 Malayalam entry states that CZHC can be used in several
|
|
// Indic languages to request traditional ligatures, and CzHC is Malayalam-
|
|
// specific for requesting open conjuncts.
|
|
//
|
|
// + vowel Grapheme: V[D](v)*
|
|
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
|
|
bool ValidateIndic::ConsumeGraphemeIfValid() {
|
|
switch (codes_[codes_used_].first) {
|
|
case CharClass::kConsonant:
|
|
return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
|
|
case CharClass::kVowel:
|
|
case CharClass::kVedicMark:
|
|
return ConsumeVowelIfValid();
|
|
case CharClass::kZeroWidthJoiner:
|
|
case CharClass::kZeroWidthNonJoiner:
|
|
// Apart from within an aksara, joiners are silently dropped.
|
|
if (report_errors_)
|
|
tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
|
|
++codes_used_;
|
|
return true;
|
|
case CharClass::kOther:
|
|
UseMultiCode(1);
|
|
return true;
|
|
default:
|
|
if (report_errors_) {
|
|
tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
|
|
codes_[codes_used_].first, codes_[codes_used_].second);
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Classifies a unicode code point for the validator. Vedic accents and the
// zero-width joiners are recognized directly; everything else is classified
// by its position inside the code block that starts at script_.
Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) {
    return CharClass::kVedicMark;
  }
  if (ch == kZeroWidthNonJoiner) {
    return CharClass::kZeroWidthNonJoiner;
  }
  if (ch == kZeroWidthJoiner) {
    return CharClass::kZeroWidthJoiner;
  }

  // Position of ch relative to the start of the script's unicode code block
  // (aka code page).
  const int block_start = static_cast<char32>(script_);
  const int pos = ch - block_start;
  // True when pos lies in the inclusive range [lo, hi].
  const auto within = [pos](int lo, int hi) { return lo <= pos && pos <= hi; };

  // Anything outside the script's code block is other.
  if (pos < 0 || pos >= kIndicCodePageSize) {
    return CharClass::kOther;
  }
  // Tamil exception: the aytham character is treated as a letter.
  if (script_ == ViramaScript::kTamil && pos == 0x03) {
    return CharClass::kVowel;
  }
  if (pos < 0x4) {
    return CharClass::kVowelModifier;
  }

  if (script_ == ViramaScript::kSinhala) {
    // Sinhala has its own layout.
    if (pos <= 0x19) {
      return CharClass::kVowel;
    }
    if (pos <= 0x49) {
      return CharClass::kConsonant;
    }
    if (pos == 0x4a) {
      return CharClass::kVirama;
    }
    if (pos <= 0x5f) {
      return CharClass::kMatra;
    }
  } else {
    // Layout shared by the other scripts. Unlike Sinhala, these have a
    // Nukta and an Avagraha.
    if (pos <= 0x14 || pos == 0x50) {
      return CharClass::kVowel;
    }
    if (pos <= 0x3b || within(0x58, 0x5f)) {
      return CharClass::kConsonant;
    }
    if (pos == 0x3c) {
      return CharClass::kNukta;
    }
    if (pos == 0x3d) {
      return CharClass::kVowel;  // avagraha
    }
    if (pos <= 0x4c || within(0x51, 0x54)) {
      return CharClass::kMatra;
    }
    if (within(0x55, 0x57)) {
      return CharClass::kMatraPiece;
    }
    if (pos == 0x4d) {
      return CharClass::kVirama;
    }
  }

  if (pos == 0x60 || pos == 0x61) {
    return CharClass::kVowel;
  }
  if (pos == 0x62 || pos == 0x63) {
    return CharClass::kMatra;
  }

  // Danda and digits up to 6f are OK as other.
  // 70-7f are script-specific.
  // 0BF0-0BF2 are the Tamil numbers 10, 100 and 1000, and 0BF3-0BFA are
  // other Tamil symbols; all of them are treated as other.
  if (script_ == ViramaScript::kTamil && within(0x70, 0x7A)) {
    return CharClass::kOther;
  }
  if (script_ == ViramaScript::kBengali && (pos == 0x70 || pos == 0x71)) {
    return CharClass::kConsonant;
  }
  if (script_ == ViramaScript::kGurmukhi && (pos == 0x72 || pos == 0x73)) {
    return CharClass::kConsonant;
  }
  if (script_ == ViramaScript::kSinhala && pos == 0x70) {
    return CharClass::kConsonant;
  }
  if (script_ == ViramaScript::kDevanagari && pos == 0x70) {
    return CharClass::kOther;
  }
  if (within(0x70, 0x73)) {
    return CharClass::kVowelModifier;
  }
  // Non Indic, Digits, Measures, danda, etc.
  return CharClass::kOther;
}
|
|
|
|
// Helper consumes/copies a virama and any associated post-virama joiners.
|
|
// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
|
|
// no joiner at all) must be followed by a consonant.
|
|
// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
|
|
// consonant, space, or character from a different script. We clean up the
|
|
// representation to make it consistent by adding a ZWNJ if missing from a
|
|
// non-linking virama. Returns false with an invalid sequence.
|
|
bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
|
|
const unsigned num_codes = codes_.size();
|
|
if (joiner.first == CharClass::kOther) {
|
|
CodeOnlyToOutput();
|
|
if (codes_used_ < num_codes &&
|
|
codes_[codes_used_].second == kZeroWidthJoiner) {
|
|
// Post-matra viramas must be explicit, so no joiners allowed here.
|
|
if (post_matra) {
|
|
if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
|
|
return false;
|
|
}
|
|
if (codes_used_ + 1 < num_codes &&
|
|
codes_[codes_used_ - 2].second != kRayana &&
|
|
(codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
|
|
codes_[codes_used_ + 1].second == kYayana ||
|
|
codes_[codes_used_ + 1].second == kRayana)) {
|
|
// This combination will be picked up later.
|
|
ASSERT_HOST(!CodeOnlyToOutput());
|
|
} else {
|
|
// Half-form with optional Nukta.
|
|
unsigned len = output_.size() + 1 - output_used_;
|
|
if (UseMultiCode(len)) return true;
|
|
}
|
|
if (codes_used_ < num_codes &&
|
|
codes_[codes_used_].second == kZeroWidthNonJoiner) {
|
|
if (output_used_ == output_.size() ||
|
|
output_[output_used_] != kRayana) {
|
|
if (report_errors_) {
|
|
tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
|
|
static_cast<int>(script_));
|
|
}
|
|
return false;
|
|
}
|
|
// Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
|
|
if (UseMultiCode(4)) return true;
|
|
}
|
|
} else if (codes_used_ == num_codes ||
|
|
codes_[codes_used_].first != CharClass::kConsonant ||
|
|
post_matra) {
|
|
if (codes_used_ == num_codes ||
|
|
codes_[codes_used_].second != kZeroWidthNonJoiner) {
|
|
// It is valid to have an unterminated virama at the end of a word, but
|
|
// for consistency, we will always add ZWNJ if not present.
|
|
output_.push_back(kZeroWidthNonJoiner);
|
|
} else {
|
|
CodeOnlyToOutput();
|
|
}
|
|
// Explicit virama [H z]
|
|
MultiCodePart(2);
|
|
}
|
|
} else {
|
|
// Pre-virama joiner [{Z|z} H] requests specific conjunct.
|
|
if (UseMultiCode(2)) {
|
|
if (report_errors_)
|
|
tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
|
|
return false;
|
|
}
|
|
if (codes_[codes_used_].second == kZeroWidthJoiner ||
|
|
codes_[codes_used_].second == kZeroWidthNonJoiner) {
|
|
if (report_errors_) {
|
|
tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
|
|
codes_[codes_used_].second);
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
// It is good so far as it goes.
|
|
return true;
|
|
}
|
|
|
|
// Helper consumes/copies a series of consonants separated by viramas while
|
|
// valid, but not any vowel or other modifiers.
|
|
bool ValidateIndic::ConsumeConsonantHeadIfValid() {
|
|
const unsigned num_codes = codes_.size();
|
|
// Consonant aksara
|
|
do {
|
|
CodeOnlyToOutput();
|
|
// Special Sinhala case of [H Z Yayana/Rayana].
|
|
int index = output_.size() - 3;
|
|
if (output_used_ + 3 <= output_.size() &&
|
|
(output_.back() == kYayana || output_.back() == kRayana) &&
|
|
IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
|
|
MultiCodePart(3);
|
|
}
|
|
bool have_nukta = false;
|
|
if (codes_used_ < num_codes &&
|
|
codes_[codes_used_].first == CharClass::kNukta) {
|
|
have_nukta = true;
|
|
CodeOnlyToOutput();
|
|
}
|
|
// Test for subscript conjunct.
|
|
index = output_.size() - 2 - have_nukta;
|
|
if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
|
|
IsVirama(output_[index])) {
|
|
// Output previous virama, consonant + optional nukta.
|
|
MultiCodePart(2 + have_nukta);
|
|
}
|
|
IndicPair joiner(CharClass::kOther, 0);
|
|
if (codes_used_ < num_codes &&
|
|
(codes_[codes_used_].second == kZeroWidthJoiner ||
|
|
(codes_[codes_used_].second == kZeroWidthNonJoiner &&
|
|
script_ == ViramaScript::kMalayalam))) {
|
|
joiner = codes_[codes_used_];
|
|
if (++codes_used_ == num_codes) {
|
|
if (report_errors_) {
|
|
tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
|
|
joiner.second);
|
|
}
|
|
return true;
|
|
}
|
|
if (codes_[codes_used_].first == CharClass::kVirama) {
|
|
output_.push_back(joiner.second);
|
|
} else {
|
|
if (report_errors_) {
|
|
tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
|
|
output_.back(), joiner.second, codes_[codes_used_].second);
|
|
}
|
|
joiner = std::make_pair(CharClass::kOther, 0);
|
|
}
|
|
}
|
|
if (codes_used_ < num_codes &&
|
|
codes_[codes_used_].first == CharClass::kVirama) {
|
|
if (!ConsumeViramaIfValid(joiner, false)) return false;
|
|
} else {
|
|
break; // No virama, so the run of consonants is over.
|
|
}
|
|
} while (codes_used_ < num_codes &&
|
|
codes_[codes_used_].first == CharClass::kConsonant);
|
|
if (output_used_ < output_.size()) MultiCodePart(1);
|
|
return true;
|
|
}
|
|
|
|
// Helper consumes/copies a tail part of a consonant, comprising optional
|
|
// matra/piece, vowel modifier, vedic mark, terminating virama.
|
|
bool ValidateIndic::ConsumeConsonantTailIfValid() {
|
|
if (codes_used_ == codes_.size()) return true;
|
|
// No virama: Finish the grapheme.
|
|
// Are multiple matras allowed?
|
|
if (codes_[codes_used_].first == CharClass::kMatra) {
|
|
if (UseMultiCode(1)) return true;
|
|
if (codes_[codes_used_].first == CharClass::kMatraPiece) {
|
|
if (UseMultiCode(1)) return true;
|
|
}
|
|
}
|
|
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
|
if (UseMultiCode(1)) return true;
|
|
// Only Malayalam allows only repeated 0xd02.
|
|
if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
|
|
}
|
|
while (codes_[codes_used_].first == CharClass::kVedicMark) {
|
|
if (UseMultiCode(1)) return true;
|
|
}
|
|
if (codes_[codes_used_].first == CharClass::kVirama) {
|
|
if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
|
|
return false;
|
|
}
|
|
}
|
|
// What we have consumed so far is a valid consonant cluster.
|
|
if (output_used_ < output_.size()) MultiCodePart(1);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Helper consumes/copies a vowel and optional modifiers.
|
|
bool ValidateIndic::ConsumeVowelIfValid() {
|
|
if (UseMultiCode(1)) return true;
|
|
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
|
if (UseMultiCode(1)) return true;
|
|
// Only Malayalam allows repeated modifiers?
|
|
if (script_ != ViramaScript::kMalayalam) break;
|
|
}
|
|
while (codes_[codes_used_].first == CharClass::kVedicMark) {
|
|
if (UseMultiCode(1)) return true;
|
|
}
|
|
// What we have consumed so far is a valid vowel cluster.
|
|
return true;
|
|
}
|
|
|
|
} // namespace tesseract
|