tesseract/training/validate_myanmar.cpp

#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h"    // From libicu
#include "unicode/uscript.h"  // From libicu

namespace tesseract {

// Consumes the next Myanmar grapheme from codes_, starting at codes_used_.
// The accepted structure is taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
// Returns true when the input is already exhausted or the grapheme was
// accepted, and false only when no valid base letter is found.
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
  const int total = codes_.size();
  // Nothing left to look at.
  if (codes_used_ == total) return true;
  // Other: digits, symbols and codes outside the script stand on their own.
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi: 0x1004 followed by asat and virama, ahead of the base.
  const bool starts_with_kinzi =
      codes_used_ + 2 < total && codes_[codes_used_].second == 0x1004 &&
      codes_[codes_used_ + 1].second == kMyanmarAsat &&
      codes_[codes_used_ + 2].second == kMyanmarVirama;
  if (starts_with_kinzi) {
    // NOTE(review): CodeOnlyToOutput() is invoked here for its effect as well
    // as its result; presumably each call hands one code to the output before
    // UseMultiCode(3) groups all three, so the repeated line is deliberate.
    // Confirm against Validator before "de-duplicating".
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) return true;
  }
  // Base consonant/vowel. Everything else in a Myanmar syllable appears to be
  // optional, so this is the only place where invalid input can be detected
  // and false returned.
  const auto base = codes_[codes_used_].second;
  if (!IsMyanmarLetter(base)) {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n", base);
    }
    return false;  // A base letter is required.
  }
  if (UseMultiCode(1)) return true;
  // Optional tail: a subscript consonant, then any optional signs. The signs
  // are only tried if the subscript step did not hit the end of input.
  if (!ConsumeSubscriptIfPresent()) {
    ConsumeOptionalSignsIfPresent();
  }
  // What we have consumed so far is a valid syllable.
  return true;
}

// Maps a unicode to a validator character class: anything accepted by
// IsMyanmarLetter is reported as kConsonant, everything else as kOther.
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
}

// Helper consumes/copies a virama together with the subscript consonant that
// follows it, if that pair is next in the input. Nothing is consumed otherwise.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  // Subscript consonant. It appears there can be only one.
  const int total = codes_.size();
  // Need room for both the virama and the letter after it.
  if (!(codes_used_ + 1 < total)) return false;
  if (codes_[codes_used_].second != kMyanmarVirama) return false;
  if (!IsMyanmarLetter(codes_[codes_used_ + 1].second)) return false;
  // The bounds check above means the virama cannot be the last code.
  ASSERT_HOST(!CodeOnlyToOutput());
  return UseMultiCode(2);
}

// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
  // The following characters are allowed, all optional, and in sequence.
  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
  const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
                                      0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
                                      0x1081, 0x1031});
  for (char32 ch : kMedials) {
    if (codes_[codes_used_].second == ch) {
      if (UseMultiCode(1)) return true;
      if (ch == kMyanmarMedialYa &&
          codes_[codes_used_].second == kMyanmarAsat) {
        if (UseMultiCode(1)) return true;
      }
    }
  }
  // Vowel sign i, ii, ai.
  char32 ch = codes_[codes_used_].second;
  if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
    if (UseMultiCode(1)) return true;
  }
  // Vowel sign u, uu, and extensions.
  ch = codes_[codes_used_].second;
  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
      ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
      (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
      ch == 0x109c || ch == 0x109d) {
    if (UseMultiCode(1)) return true;
  }
  // Tall aa, aa with optional asat.
  if (codes_[codes_used_].second == 0x102b ||
      codes_[codes_used_].second == 0x102c) {
    if (UseMultiCode(1)) return true;
    if (codes_[codes_used_].second == kMyanmarAsat) {
      if (UseMultiCode(1)) return true;
    }
  }
  // The following characters are allowed, all optional, and in sequence.
  const std::vector<char32> kSigns({0x1036, 0x1037});
  for (char32 ch : kSigns) {
    if (codes_[codes_used_].second == ch) {
      if (UseMultiCode(1)) return true;
    }
  }
  // Tone mark extensions.
  ch = codes_[codes_used_].second;
  if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
      (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
      ch == 0x108f || ch == 0x109a || ch == 0x109b ||
      (0xaa7b <= ch && ch <= 0xaa7d)) {
    if (UseMultiCode(1)) return true;
  }
  return false;
}

// Returns true if the unicode is a Myanmar "letter", meaning a consonant or
// an independent vowel. Table 16-3 separates some base consonants from
// vowels, but the extensions make no such distinction, so they all go into a
// single bucket here.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  // Inclusive code point ranges; a single code has first == last.
  struct LetterRange {
    char32 first;
    char32 last;
  };
  static const LetterRange kLetterRanges[] = {
      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055}, {0x105a, 0x105d},
      {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff}, {0xaa60, 0xaa73},
      {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7f},
  };
  for (const LetterRange& range : kLetterRanges) {
    if (range.first <= ch && ch <= range.last) return true;
  }
  return false;
}

// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable. Any code that ICU does not assign to the Myanmar
// script also counts as "other", except for the zero-width joiner and
// non-joiner.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode icu_status;
  const bool in_myanmar_script =
      uscript_getScript(ch, icu_status) == USCRIPT_MYANMAR;
  const bool is_joiner = ch == Validator::kZeroWidthJoiner ||
                         ch == Validator::kZeroWidthNonJoiner;
  if (!in_myanmar_script && !is_joiner) return true;
  // Remaining candidates: the explicit digit/symbol ranges.
  if (0x1040 <= ch && ch <= 0x1049) return true;
  if (0x1090 <= ch && ch <= 0x1099) return true;
  if (0x109c <= ch && ch <= 0x109d) return true;
  if (0xa9f0 <= ch && ch <= 0xa9f9) return true;
  return 0xaa74 <= ch && ch <= 0xaa79;
}

}  // namespace tesseract
Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`#include "validate_myanmar.h"`
			`#include "errcode.h"`
			`#include "icuerrorcode.h"`
			`#include "tprintf.h"`
			`#include "unicode/uchar.h" // From libicu`
			`#include "unicode/uscript.h" // From libicu`

			`namespace tesseract {`

			`// Returns whether codes matches the pattern for a Myanmar Grapheme.`
			`// Taken directly from the unicode table 16-3.`
			`// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf`
			`bool ValidateMyanmar::ConsumeGraphemeIfValid() {`
			`int num_codes = codes_.size();`
			`if (codes_used_ == num_codes) return true;`
			`// Other.`
			`if (IsMyanmarOther(codes_[codes_used_].second)) {`
			`UseMultiCode(1);`
			`return true;`
			`}`
			`// Kinzi.`
			`if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&`
			`codes_[codes_used_ + 1].second == kMyanmarAsat &&`
			`codes_[codes_used_ + 2].second == kMyanmarVirama) {`
			`ASSERT_HOST(!CodeOnlyToOutput());`
			`ASSERT_HOST(!CodeOnlyToOutput());`
			`if (UseMultiCode(3)) return true;`
			`}`
			`// Base consonant/vowel. NOTE that since everything in Myanmar appears to be`
			`// optional, except the base, this is the only place where invalid input can`
			`// be detected and false returned.`
			`if (IsMyanmarLetter(codes_[codes_used_].second)) {`
			`if (UseMultiCode(1)) return true;`
			`} else {`
			`if (report_errors_) {`
			`tprintf("Invalid start of Myanmar syllable:0x%x\n",`
			`codes_[codes_used_].second);`
			`}`
			`return false; // One of these is required.`
			`}`
			`if (ConsumeSubscriptIfPresent()) return true;`
			`ConsumeOptionalSignsIfPresent();`
			`// What we have consumed so far is a valid syllable.`
			`return true;`
			`}`

			`// TODO(rays) Doesn't use intermediate coding like the other scripts, as there`
			`// is little correspondence between the content of table 16-3 and the char`
			`// classes of the Indic languages. (Experts may disagree and improve!)`
			`// In unicode table 16-3 there is basically a long list of optional characters,`
			`// which can be coded quite easily.`
			`// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!`
			`// The table also allows sequences that still result in dotted circles!!`
			`// So with a lot of guesswork the rest have been added in a reasonable place.`
			`Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {`
			`if (IsMyanmarLetter(ch)) return CharClass::kConsonant;`
			`return CharClass::kOther;`
			`}`

			`// Helper consumes/copies a virama and any subscript consonant.`
			`// Returns true if the end of input is reached.`
			`bool ValidateMyanmar::ConsumeSubscriptIfPresent() {`
			`// Subscript consonant. It appears there can be only one.`
			`int num_codes = codes_.size();`
			`if (codes_used_ + 1 < num_codes &&`
			`codes_[codes_used_].second == kMyanmarVirama) {`
			`if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {`
			`ASSERT_HOST(!CodeOnlyToOutput());`
			`if (UseMultiCode(2)) return true;`
			`}`
			`}`
			`return false;`
			`}`

			`// Helper consumes/copies a series of optional signs.`
			`// Returns true if the end of input is reached.`
			`bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {`
			`// The following characters are allowed, all optional, and in sequence.`
			`// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.`
			`const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,`
			`0x103d, 0x103e, 0x105e, 0x105f, 0x1060,`
			`0x1081, 0x1031});`
			`for (char32 ch : kMedials) {`
			`if (codes_[codes_used_].second == ch) {`
			`if (UseMultiCode(1)) return true;`
			`if (ch == kMyanmarMedialYa &&`
			`codes_[codes_used_].second == kMyanmarAsat) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`}`
			`}`
			`// Vowel sign i, ii, ai.`
			`char32 ch = codes_[codes_used_].second;`
			`if (ch == 0x102d \|\| ch == 0x102e \|\| ch == 0x1032) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`// Vowel sign u, uu, and extensions.`
			`ch = codes_[codes_used_].second;`
			`if (ch == 0x102f \|\| ch == 0x1030 \|\| (0x1056 <= ch && ch <= 0x1059) \|\|`
			`ch == 0x1062 \|\| ch == 0x1067 \|\| ch == 0x1068 \|\|`
			`(0x1071 <= ch && ch <= 0x1074) \|\| (0x1083 <= ch && ch <= 0x1086) \|\|`
			`ch == 0x109c \|\| ch == 0x109d) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`// Tall aa, aa with optional asat.`
			`if (codes_[codes_used_].second == 0x102b \|\|`
			`codes_[codes_used_].second == 0x102c) {`
			`if (UseMultiCode(1)) return true;`
			`if (codes_[codes_used_].second == kMyanmarAsat) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`}`
			`// The following characters are allowed, all optional, and in sequence.`
			`const std::vector<char32> kSigns({0x1036, 0x1037});`
			`for (char32 ch : kSigns) {`
			`if (codes_[codes_used_].second == ch) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`}`
			`// Tone mark extensions.`
			`ch = codes_[codes_used_].second;`
			`if (ch == 0x1038 \|\| ch == kMyanmarAsat \|\| ch == 0x1063 \|\| ch == 0x1064 \|\|`
			`(0x1069 <= ch && ch <= 0x106d) \|\| (0x1087 <= ch && ch <= 0x108d) \|\|`
			`ch == 0x108f \|\| ch == 0x109a \|\| ch == 0x109b \|\|`
			`(0xaa7b <= ch && ch <= 0xaa7d)) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`return false;`
			`}`

			`// Returns true if the unicode is a Myanmar "letter" including consonants`
			`// and independent vowels. Although table 16-3 distinguishes between some`
			`// base consonants and vowels, the extensions make no such distinction, so we`
			`// put them all into a single bucket.`
			`/* static */`
			`bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {`
			`return (0x1000 <= ch && ch <= 0x102a) \|\| ch == 0x103f \|\|`
			`(0x1050 <= ch && ch <= 0x1055) \|\| (0x105a <= ch && ch <= 0x105d) \|\|`
			`ch == 0x1061 \|\| ch == 0x1065 \|\| ch == 0x1066 \|\|`
			`(0x106e <= ch && ch <= 0x1070) \|\| (0x1075 <= ch && ch <= 0x1080) \|\|`
			`ch == 0x108e \|\| (0xa9e0 <= ch && ch <= 0xa9ef) \|\|`
			`(0xa9fa <= ch && ch <= 0xa9ff) \|\| (0xaa60 <= ch && ch <= 0xaa73) \|\|`
			`ch == 0xaa7a \|\| ch == 0xaa7e \|\| ch == 0xaa7f;`
			`}`

			`// Returns true if ch is a Myanmar digit or other symbol that does not take`
			`// part in being a syllable.`
			`/* static */`
			`bool ValidateMyanmar::IsMyanmarOther(char32 ch) {`
			`IcuErrorCode err;`
			`UScriptCode script_code = uscript_getScript(ch, err);`
			`if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&`
			`ch != Validator::kZeroWidthNonJoiner)`
			`return true;`
			`return (0x1040 <= ch && ch <= 0x1049) \|\| (0x1090 <= ch && ch <= 0x1099) \|\|`
			`(0x109c <= ch && ch <= 0x109d) \|\| (0xa9f0 <= ch && ch <= 0xa9f9) \|\|`
			`(0xaa74 <= ch && ch <= 0xaa79);`
			`}`

			`} // namespace tesseract`