tesseract/training/validate_indic.cpp

#include "validate_indic.h"
#include "errcode.h"
#include "tprintf.h"

namespace tesseract {

// Returns whether codes matches the pattern for an Indic Grapheme.
// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
// has a BNF for valid syllables (Graphemes) which is modified slightly
// for Unicode.  Notably U+200C and U+200D are used before/after the
// virama/virama to express explicit or soft viramas.
// Also the unicode v.9 Malayalam entry states that CZHC can be used in several
// Indic languages to request traditional ligatures, and CzHC is Malayalam-
// specific for requesting open conjuncts.
//
//  + vowel Grapheme:  V[D](v)*
//  + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
bool ValidateIndic::ConsumeGraphemeIfValid() {
  switch (codes_[codes_used_].first) {
    case CharClass::kConsonant:
      return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
    case CharClass::kVowel:
      return ConsumeVowelIfValid();
    case CharClass::kZeroWidthJoiner:
    case CharClass::kZeroWidthNonJoiner:
      // Apart from within an aksara, joiners are silently dropped.
      if (report_errors_)
        tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
      ++codes_used_;
      return true;
    case CharClass::kOther:
      UseMultiCode(1);
      return true;
    default:
      if (report_errors_) {
        tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
                codes_[codes_used_].first, codes_[codes_used_].second);
      }
      return false;
  }
}

Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
  // Offset from the start of the relevant unicode code block aka code page.
  int base = static_cast<char32>(script_);
  int off = ch - base;
  // Anything in another code block is other.
  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
  // Exception for Tamil. The aytham character is considered a letter.
  if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;
  if (off < 0x4) return CharClass::kVowelModifier;
  if (script_ == ViramaScript::kSinhala) {
    // Sinhala is an exception.
    if (off <= 0x19) return CharClass::kVowel;
    if (off <= 0x49) return CharClass::kConsonant;
    if (off == 0x4a) return CharClass::kVirama;
    if (off <= 0x5f) return CharClass::kMatra;
  } else {
    if (off <= 0x14 || off == 0x50) return CharClass::kVowel;
    if (off <= 0x3b || (0x58 <= off && off <= 0x5f))
      return CharClass::kConsonant;
    // Sinhala doesn't have Nukta or Avagraha.
    if (off == 0x3c) return CharClass::kNukta;
    if (off == 0x3d) return CharClass::kVowel;
    if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
    if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
    if (off == 0x4d) return CharClass::kVirama;
  }
  if (off == 0x60 || off == 0x61) return CharClass::kVowel;
  if (off == 0x62 || off == 0x63) return CharClass::kMatra;
  // Danda and digits up to 6f are OK as other.
  // 70-7f are script-specific.
  if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
    return CharClass::kConsonant;
  if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
    return CharClass::kConsonant;
  if (script_ == ViramaScript::kSinhala && off == 0x70)
    return CharClass::kConsonant;
  if (script_ == ViramaScript::kDevanagari && off == 0x70)
    return CharClass::kOther;
  if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;
  // Non Indic, Digits, Measures, danda, etc.
  return CharClass::kOther;
}

// Helper consumes/copies a virama and any associated post-virama joiners.
// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
// no joiner at all) must be followed by a consonant.
// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
// consonant, space, or character from a different script. We clean up the
// representation to make it consistent by adding a ZWNJ if missing from a
// non-linking virama. Returns false with an invalid sequence.
bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
  int num_codes = codes_.size();
  if (joiner.first == CharClass::kOther) {
    CodeOnlyToOutput();
    if (codes_used_ < num_codes &&
        codes_[codes_used_].second == kZeroWidthJoiner) {
      // Post-matra viramas must be explicit, so no joiners allowed here.
      if (post_matra) {
        if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
        return false;
      }
      if (codes_used_ + 1 < num_codes &&
          codes_[codes_used_ - 2].second != kRayana &&
          (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
           codes_[codes_used_ + 1].second == kYayana ||
           codes_[codes_used_ + 1].second == kRayana)) {
        // This combination will be picked up later.
        ASSERT_HOST(!CodeOnlyToOutput());
      } else {
        // Half-form with optional Nukta.
        int len = output_.size() + 1 - output_used_;
        if (UseMultiCode(len)) return true;
      }
      if (codes_used_ < num_codes &&
          codes_[codes_used_].second == kZeroWidthNonJoiner) {
        if (output_used_ == output_.size() ||
            output_[output_used_] != kRayana) {
          if (report_errors_) {
            tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
                    static_cast<int>(script_));
          }
          return false;
        }
        // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
        if (UseMultiCode(4)) return true;
      }
    } else if (codes_used_ == num_codes ||
               codes_[codes_used_].first != CharClass::kConsonant ||
               post_matra) {
      if (codes_used_ == num_codes ||
          codes_[codes_used_].second != kZeroWidthNonJoiner) {
        // It is valid to have an unterminated virama at the end of a word, but
        // for consistency, we will always add ZWNJ if not present.
        output_.push_back(kZeroWidthNonJoiner);
      } else {
        CodeOnlyToOutput();
      }
      // Explicit virama [H z]
      MultiCodePart(2);
    }
  } else {
    // Pre-virama joiner [{Z|z} H] requests specific conjunct.
    if (UseMultiCode(2)) {
      if (report_errors_)
        tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
      return false;
    }
    if (codes_[codes_used_].second == kZeroWidthJoiner ||
        codes_[codes_used_].second == kZeroWidthNonJoiner) {
      if (report_errors_) {
        tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
                codes_[codes_used_].second);
      }
      return false;
    }
  }
  // It is good so far as it goes.
  return true;
}

// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool ValidateIndic::ConsumeConsonantHeadIfValid() {
  const int num_codes = codes_.size();
  // Consonant aksara
  do {
    CodeOnlyToOutput();
    // Special Sinhala case of [H Z Yayana/Rayana].
    int index = output_.size() - 3;
    if (output_used_ <= index &&
        (output_.back() == kYayana || output_.back() == kRayana) &&
        IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
      MultiCodePart(3);
    }
    bool have_nukta = false;
    if (codes_used_ < num_codes &&
        codes_[codes_used_].first == CharClass::kNukta) {
      have_nukta = true;
      CodeOnlyToOutput();
    }
    // Test for subscript conjunct.
    index = output_.size() - 2 - have_nukta;
    if (output_used_ <= index && IsSubscriptScript() &&
        IsVirama(output_[index])) {
      // Output previous virama, consonant + optional nukta.
      MultiCodePart(2 + have_nukta);
    }
    IndicPair joiner(CharClass::kOther, 0);
    if (codes_used_ < num_codes &&
        (codes_[codes_used_].second == kZeroWidthJoiner ||
         (codes_[codes_used_].second == kZeroWidthNonJoiner &&
          script_ == ViramaScript::kMalayalam))) {
      joiner = codes_[codes_used_];
      if (++codes_used_ == num_codes) {
        if (report_errors_) {
          tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
                  joiner.second);
        }
        return true;
      }
      if (codes_[codes_used_].first == CharClass::kVirama) {
        output_.push_back(joiner.second);
      } else {
        if (report_errors_) {
          tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
                  output_.back(), joiner.second, codes_[codes_used_].second);
        }
        joiner = std::make_pair(CharClass::kOther, 0);
      }
    }
    if (codes_used_ < num_codes &&
        codes_[codes_used_].first == CharClass::kVirama) {
      if (!ConsumeViramaIfValid(joiner, false)) return false;
    } else {
      break;  // No virama, so the run of consonants is over.
    }
  } while (codes_used_ < num_codes &&
           codes_[codes_used_].first == CharClass::kConsonant);
  if (output_used_ < output_.size()) MultiCodePart(1);
  return true;
}

// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ValidateIndic::ConsumeConsonantTailIfValid() {
  if (codes_used_ == codes_.size()) return true;
  // No virama: Finish the grapheme.
  // Are multiple matras allowed?
  if (codes_[codes_used_].first == CharClass::kMatra) {
    if (UseMultiCode(1)) return true;
    if (codes_[codes_used_].first == CharClass::kMatraPiece) {
      if (UseMultiCode(1)) return true;
    }
  }
  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
    if (UseMultiCode(1)) return true;
    // Only Malayalam allows only repeated 0xd02.
    if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
  }
  while (codes_[codes_used_].first == CharClass::kVedicMark) {
    if (UseMultiCode(1)) return true;
  }
  if (codes_[codes_used_].first == CharClass::kVirama) {
    if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
      return false;
    }
  }
  // What we have consumed so far is a valid consonant cluster.
  if (output_used_ < output_.size()) MultiCodePart(1);

  return true;
}

// Helper consumes/copies a vowel and optional modifiers.
bool ValidateIndic::ConsumeVowelIfValid() {
  if (UseMultiCode(1)) return true;
  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
    if (UseMultiCode(1)) return true;
    // Only Malayalam allows repeated modifiers?
    if (script_ != ViramaScript::kMalayalam) break;
  }
  while (codes_[codes_used_].first == CharClass::kVedicMark) {
    if (UseMultiCode(1)) return true;
  }
  // What we have consumed so far is a valid vowel cluster.
  return true;
}

}  // namespace tesseract
Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`#include "validate_indic.h"`
			`#include "errcode.h"`
			`#include "tprintf.h"`

			`namespace tesseract {`

			`// Returns whether codes matches the pattern for an Indic Grapheme.`
			`// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf`
			`// has a BNF for valid syllables (Graphemes) which is modified slightly`
			`// for Unicode. Notably U+200C and U+200D are used before/after the`
			`// virama/virama to express explicit or soft viramas.`
			`// Also the unicode v.9 Malayalam entry states that CZHC can be used in several`
			`// Indic languages to request traditional ligatures, and CzHC is Malayalam-`
			`// specific for requesting open conjuncts.`
			`//`
			`// + vowel Grapheme: V[D](v)*`
			`// + consonant Grapheme: (C[N](H\|HZ\|Hz\|ZH)?)C[N](H\|Hz)?[M[P]][D](v)`
			`bool ValidateIndic::ConsumeGraphemeIfValid() {`
			`switch (codes_[codes_used_].first) {`
			`case CharClass::kConsonant:`
			`return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();`
			`case CharClass::kVowel:`
			`return ConsumeVowelIfValid();`
			`case CharClass::kZeroWidthJoiner:`
			`case CharClass::kZeroWidthNonJoiner:`
			`// Apart from within an aksara, joiners are silently dropped.`
			`if (report_errors_)`
			`tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);`
			`++codes_used_;`
			`return true;`
			`case CharClass::kOther:`
			`UseMultiCode(1);`
			`return true;`
			`default:`
			`if (report_errors_) {`
			`tprintf("Invalid start of grapheme sequence:%c=0x%x\n",`
			`codes_[codes_used_].first, codes_[codes_used_].second);`
			`}`
			`return false;`
			`}`
			`}`

			`Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {`
			`if (IsVedicAccent(ch)) return CharClass::kVedicMark;`
			`if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;`
			`if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;`
			`// Offset from the start of the relevant unicode code block aka code page.`
			`int base = static_cast<char32>(script_);`
			`int off = ch - base;`
			`// Anything in another code block is other.`
			`if (off < 0 \|\| off >= kIndicCodePageSize) return CharClass::kOther;`
			`// Exception for Tamil. The aytham character is considered a letter.`
			`if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;`
			`if (off < 0x4) return CharClass::kVowelModifier;`
			`if (script_ == ViramaScript::kSinhala) {`
			`// Sinhala is an exception.`
			`if (off <= 0x19) return CharClass::kVowel;`
			`if (off <= 0x49) return CharClass::kConsonant;`
			`if (off == 0x4a) return CharClass::kVirama;`
			`if (off <= 0x5f) return CharClass::kMatra;`
			`} else {`
			`if (off <= 0x14 \|\| off == 0x50) return CharClass::kVowel;`
			`if (off <= 0x3b \|\| (0x58 <= off && off <= 0x5f))`
			`return CharClass::kConsonant;`
			`// Sinhala doesn't have Nukta or Avagraha.`
			`if (off == 0x3c) return CharClass::kNukta;`
			`if (off == 0x3d) return CharClass::kVowel;`
			`if (off <= 0x4c \|\| (0x51 <= off && off <= 0x54)) return CharClass::kMatra;`
			`if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;`
			`if (off == 0x4d) return CharClass::kVirama;`
			`}`
			`if (off == 0x60 \|\| off == 0x61) return CharClass::kVowel;`
			`if (off == 0x62 \|\| off == 0x63) return CharClass::kMatra;`
			`// Danda and digits up to 6f are OK as other.`
			`// 70-7f are script-specific.`
			`if (script_ == ViramaScript::kBengali && (off == 0x70 \|\| off == 0x71))`
			`return CharClass::kConsonant;`
			`if (script_ == ViramaScript::kGurmukhi && (off == 0x72 \|\| off == 0x73))`
			`return CharClass::kConsonant;`
			`if (script_ == ViramaScript::kSinhala && off == 0x70)`
			`return CharClass::kConsonant;`
			`if (script_ == ViramaScript::kDevanagari && off == 0x70)`
			`return CharClass::kOther;`
			`if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;`
			`// Non Indic, Digits, Measures, danda, etc.`
			`return CharClass::kOther;`
			`}`

			`// Helper consumes/copies a virama and any associated post-virama joiners.`
			`// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or`
			`// no joiner at all) must be followed by a consonant.`
			`// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non`
			`// consonant, space, or character from a different script. We clean up the`
			`// representation to make it consistent by adding a ZWNJ if missing from a`
			`// non-linking virama. Returns false with an invalid sequence.`
			`bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {`
			`int num_codes = codes_.size();`
			`if (joiner.first == CharClass::kOther) {`
			`CodeOnlyToOutput();`
			`if (codes_used_ < num_codes &&`
			`codes_[codes_used_].second == kZeroWidthJoiner) {`
			`// Post-matra viramas must be explicit, so no joiners allowed here.`
			`if (post_matra) {`
			`if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");`
			`return false;`
			`}`
			`if (codes_used_ + 1 < num_codes &&`
			`codes_[codes_used_ - 2].second != kRayana &&`
			`(codes_[codes_used_ + 1].second == kZeroWidthNonJoiner \|\|`
			`codes_[codes_used_ + 1].second == kYayana \|\|`
			`codes_[codes_used_ + 1].second == kRayana)) {`
			`// This combination will be picked up later.`
			`ASSERT_HOST(!CodeOnlyToOutput());`
			`} else {`
			`// Half-form with optional Nukta.`
			`int len = output_.size() + 1 - output_used_;`
			`if (UseMultiCode(len)) return true;`
			`}`
			`if (codes_used_ < num_codes &&`
			`codes_[codes_used_].second == kZeroWidthNonJoiner) {`
			`if (output_used_ == output_.size() \|\|`
			`output_[output_used_] != kRayana) {`
			`if (report_errors_) {`
			`tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",`
			`static_cast<int>(script_));`
			`}`
			`return false;`
			`}`
			`// Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]`
			`if (UseMultiCode(4)) return true;`
			`}`
			`} else if (codes_used_ == num_codes \|\|`
			`codes_[codes_used_].first != CharClass::kConsonant \|\|`
			`post_matra) {`
			`if (codes_used_ == num_codes \|\|`
			`codes_[codes_used_].second != kZeroWidthNonJoiner) {`
			`// It is valid to have an unterminated virama at the end of a word, but`
			`// for consistency, we will always add ZWNJ if not present.`
			`output_.push_back(kZeroWidthNonJoiner);`
			`} else {`
			`CodeOnlyToOutput();`
			`}`
			`// Explicit virama [H z]`
			`MultiCodePart(2);`
			`}`
			`} else {`
			`// Pre-virama joiner [{Z\|z} H] requests specific conjunct.`
			`if (UseMultiCode(2)) {`
			`if (report_errors_)`
			`tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");`
			`return false;`
			`}`
			`if (codes_[codes_used_].second == kZeroWidthJoiner \|\|`
			`codes_[codes_used_].second == kZeroWidthNonJoiner) {`
			`if (report_errors_) {`
			`tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),`
			`codes_[codes_used_].second);`
			`}`
			`return false;`
			`}`
			`}`
			`// It is good so far as it goes.`
			`return true;`
			`}`

			`// Helper consumes/copies a series of consonants separated by viramas while`
			`// valid, but not any vowel or other modifiers.`
			`bool ValidateIndic::ConsumeConsonantHeadIfValid() {`
			`const int num_codes = codes_.size();`
			`// Consonant aksara`
			`do {`
			`CodeOnlyToOutput();`
			`// Special Sinhala case of [H Z Yayana/Rayana].`
			`int index = output_.size() - 3;`
			`if (output_used_ <= index &&`
			`(output_.back() == kYayana \|\| output_.back() == kRayana) &&`
			`IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {`
			`MultiCodePart(3);`
			`}`
			`bool have_nukta = false;`
			`if (codes_used_ < num_codes &&`
			`codes_[codes_used_].first == CharClass::kNukta) {`
			`have_nukta = true;`
			`CodeOnlyToOutput();`
			`}`
			`// Test for subscript conjunct.`
			`index = output_.size() - 2 - have_nukta;`
			`if (output_used_ <= index && IsSubscriptScript() &&`
			`IsVirama(output_[index])) {`
			`// Output previous virama, consonant + optional nukta.`
			`MultiCodePart(2 + have_nukta);`
			`}`
			`IndicPair joiner(CharClass::kOther, 0);`
			`if (codes_used_ < num_codes &&`
			`(codes_[codes_used_].second == kZeroWidthJoiner \|\|`
			`(codes_[codes_used_].second == kZeroWidthNonJoiner &&`
			`script_ == ViramaScript::kMalayalam))) {`
			`joiner = codes_[codes_used_];`
			`if (++codes_used_ == num_codes) {`
			`if (report_errors_) {`
			`tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),`
			`joiner.second);`
			`}`
			`return true;`
			`}`
			`if (codes_[codes_used_].first == CharClass::kVirama) {`
			`output_.push_back(joiner.second);`
			`} else {`
			`if (report_errors_) {`
			`tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",`
			`output_.back(), joiner.second, codes_[codes_used_].second);`
			`}`
			`joiner = std::make_pair(CharClass::kOther, 0);`
			`}`
			`}`
			`if (codes_used_ < num_codes &&`
			`codes_[codes_used_].first == CharClass::kVirama) {`
			`if (!ConsumeViramaIfValid(joiner, false)) return false;`
			`} else {`
			`break; // No virama, so the run of consonants is over.`
			`}`
			`} while (codes_used_ < num_codes &&`
			`codes_[codes_used_].first == CharClass::kConsonant);`
			`if (output_used_ < output_.size()) MultiCodePart(1);`
			`return true;`
			`}`

			`// Helper consumes/copies a tail part of a consonant, comprising optional`
			`// matra/piece, vowel modifier, vedic mark, terminating virama.`
			`bool ValidateIndic::ConsumeConsonantTailIfValid() {`
			`if (codes_used_ == codes_.size()) return true;`
			`// No virama: Finish the grapheme.`
			`// Are multiple matras allowed?`
			`if (codes_[codes_used_].first == CharClass::kMatra) {`
			`if (UseMultiCode(1)) return true;`
			`if (codes_[codes_used_].first == CharClass::kMatraPiece) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`}`
			`while (codes_[codes_used_].first == CharClass::kVowelModifier) {`
			`if (UseMultiCode(1)) return true;`
			`// Only Malayalam allows only repeated 0xd02.`
			`if (script_ != ViramaScript::kMalayalam \|\| output_.back() != 0xd02) break;`
			`}`
			`while (codes_[codes_used_].first == CharClass::kVedicMark) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`if (codes_[codes_used_].first == CharClass::kVirama) {`
			`if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {`
			`return false;`
			`}`
			`}`
			`// What we have consumed so far is a valid consonant cluster.`
			`if (output_used_ < output_.size()) MultiCodePart(1);`

			`return true;`
			`}`

			`// Helper consumes/copies a vowel and optional modifiers.`
			`bool ValidateIndic::ConsumeVowelIfValid() {`
			`if (UseMultiCode(1)) return true;`
			`while (codes_[codes_used_].first == CharClass::kVowelModifier) {`
			`if (UseMultiCode(1)) return true;`
			`// Only Malayalam allows repeated modifiers?`
			`if (script_ != ViramaScript::kMalayalam) break;`
			`}`
			`while (codes_[codes_used_].first == CharClass::kVedicMark) {`
			`if (UseMultiCode(1)) return true;`
			`}`
			`// What we have consumed so far is a valid vowel cluster.`
			`return true;`
			`}`

			`} // namespace tesseract`