mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-12 07:29:07 +08:00
48 lines
1.8 KiB
C++
48 lines
1.8 KiB
C++
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
|
|
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
|
|
|
|
#include "validator.h"
|
|
|
|
namespace tesseract {
|
|
|
|
// Subclass of Validator that validates and segments Myanmar.
|
|
class ValidateMyanmar : public Validator {
|
|
public:
|
|
ValidateMyanmar(ViramaScript script, bool report_errors)
|
|
: Validator(script, report_errors) {}
|
|
~ValidateMyanmar() {}
|
|
|
|
protected:
|
|
// Returns whether codes matches the pattern for a Myanmar Grapheme.
|
|
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
|
|
// parts_ and output_. Returns true if a valid Grapheme was consumed,
|
|
// otherwise does not increment codes_used_.
|
|
bool ConsumeGraphemeIfValid() override;
|
|
// Returns the CharClass corresponding to the given Unicode ch.
|
|
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
|
|
|
|
private:
|
|
// Helper consumes/copies a virama and any subscript consonant.
|
|
// Returns true if the end of input is reached.
|
|
bool ConsumeSubscriptIfPresent();
|
|
// Helper consumes/copies a series of optional signs.
|
|
// Returns true if the end of input is reached.
|
|
bool ConsumeOptionalSignsIfPresent();
|
|
// Returns true if the unicode is a Myanmar "letter" including consonants
|
|
// and independent vowels. Although table 16-3 distinguishes between some
|
|
// base consonants and vowels, the extensions make no such distinction, so we
|
|
// put them all into a single bucket.
|
|
static bool IsMyanmarLetter(char32 ch);
|
|
// Returns true if ch is a Myanmar digit or other symbol that does not take
|
|
// part in being a syllable.
|
|
static bool IsMyanmarOther(char32 ch);
|
|
|
|
// Some special unicodes used only for Myanmar processing.
|
|
static const char32 kMyanmarAsat = 0x103a;
|
|
static const char32 kMyanmarMedialYa = 0x103b;
|
|
};
|
|
|
|
} // namespace tesseract
|
|
|
|
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
|