Merge pull request #1823 from Shreeshrii/javanese

Add support for Javanese script - aksara Jawa
2024-12-11 23:19:04 +08:00 · 2018-08-04 16:08:23 +03:00 · 2018-08-04 16:08:23 +03:00 · 3b723ba102
commit 3b723ba102
parent e9b4e21e6f 7957288fd5
7 changed files with 350 additions and 8 deletions
--- a/src/training/CMakeLists.txt
+++ b/src/training/CMakeLists.txt
@ -186,9 +186,9 @@ set(unicharset_training_src
    unicharset_training_utils.h
    validate_grapheme.h validate_indic.h validate_khmer.h
-    validate_myanmar.h validator.h
+    validate_javanese.h validate_myanmar.h validator.h
    validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
-    validate_myanmar.cpp validator.cpp
+    validate_javanese.cpp validate_myanmar.cpp validator.cpp
 )
 add_library                 (unicharset_training ${unicharset_training_src})
--- a/src/training/Makefile.am
+++ b/src/training/Makefile.am
@ -45,6 +45,7 @@ noinst_HEADERS = \
    util.h \
    validate_grapheme.h \
    validate_indic.h \
    validate_javanese.h \
    validate_khmer.h \
    validate_myanmar.h \
    validator.h
@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \
    unicharset_training_utils.cpp \
    validate_grapheme.cpp \
    validate_indic.cpp \
    validate_javanese.cpp \
    validate_khmer.cpp \
    validate_myanmar.cpp \
    validator.cpp
--- a/src/training/language-specific.sh
+++ b/src/training/language-specific.sh
@ -21,7 +21,7 @@
 VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
                      ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
                      ell eng enm epo est eus fas fil fin fra frk frm gle glg
-                      grc guj hat heb hin hrv hun hye iku ind isl ita ita_old
+                      grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
                      jav jpn kan kat kat_old kaz khm kir kor kur lao lat
                      lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
                      pan pol por pus ron rus san sin slk slv snd spa spa_old
@ -961,6 +961,7 @@ set_lang_specific_parameters() {
    glg ) ;;
    hat ) ;;
    hrv ) ;;
    iast ) ;;
    ind ) ;;
    isl ) ;;
    ita ) ;;
@ -1171,7 +1172,7 @@ set_lang_specific_parameters() {
      LANG_IS_RTL="1"
      NORM_MODE="2" ;;
    asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
-    dzo | sin | san | bod | ori | khm | mya | tha | lao )
+    dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
      LANG_IS_RTL="0"
      NORM_MODE="2" ;;
    * )
--- a/src/training/validate_javanese.cpp
+++ b/src/training/validate_javanese.cpp
@ -0,0 +1,263 @@
 /**********************************************************************
 * File:        validate_javanese.cpp
 * Description: Text validator for Javanese Script - aksara jawa.
 * Author:      Shree Devi Kumar
 * Created:     August 03, 2018
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
 #include "validate_javanese.h"
 #include "errcode.h"
 #include "tprintf.h"
 namespace tesseract {
 // Returns whether codes matches the pattern for a Javanese Grapheme.
 // Taken from unicode standard:
 // http://www.unicode.org/charts/PDF/UA980.pdf
 // http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
 // The order of components in an orthographic syllable as expressed in BNF is:
 // {C F} C {{R}Y} {V{A}} {Z}
 // Translated to the codes used by the CharClass enum:
 // [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
 // Also the Consonant class here includes independent vowels, as they are
 // treated the same anyway.
 // Indic - for reference
 //  + vowel Grapheme:  V[D](v)*
 //  + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
 bool ValidateJavanese::ConsumeGraphemeIfValid() {
  switch (codes_[codes_used_].first) {
    case CharClass::kConsonant:
      return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
    case CharClass::kVowel:
    case CharClass::kVedicMark:
      return ConsumeVowelIfValid();
    case CharClass::kZeroWidthJoiner:
    case CharClass::kZeroWidthNonJoiner:
      // Apart from within an aksara, joiners are silently dropped.
      if (report_errors_)
        tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
      ++codes_used_;
      return true;
    case CharClass::kOther:
      UseMultiCode(1);
      return true;
    default:
      if (report_errors_) {
        tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
                codes_[codes_used_].first, codes_[codes_used_].second);
      }
      return false;
  }
 }
 // Maps a single Unicode code point to the CharClass used by the grapheme
 // matcher. Vedic accents and the two zero-width joiners are recognised
 // first; everything else is classified by its offset from the start of the
 // script's code block (script_).
 Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) {
    return CharClass::kVedicMark;
  }
  if (ch == kZeroWidthNonJoiner) {
    return CharClass::kZeroWidthNonJoiner;
  }
  if (ch == kZeroWidthJoiner) {
    return CharClass::kZeroWidthJoiner;
  }

  // Position of ch relative to the first code point of the code block.
  const int offset = ch - static_cast<char32>(script_);
  const bool in_block = 0 <= offset && offset < kIndicCodePageSize;
  if (!in_block) {
    // Code points from any other block are not interpreted.
    return CharClass::kOther;
  }

  // Ranges are tested in ascending order, so each test only needs an upper
  // bound (or an exact value).
  if (offset <= 0x03) return CharClass::kVowelModifier;
  if (offset <= 0x32) return CharClass::kConsonant;   // incl. independent vowels
  if (offset == 0x33) return CharClass::kNukta;       // A9B3 CECAK TELU
  if (offset == 0x34) return CharClass::kMatraPiece;  // A9B4 TARUNG two part vowels
  if (offset <= 0x3d) return CharClass::kMatra;
  if (offset <= 0x3f) return CharClass::kNukta;       // A9BE-A9BF PENGKAL-CAKRA medial consonants
  if (offset == 0x40) return CharClass::kVirama;      // A9C0 PANGKON
  return CharClass::kOther;
 }
 // Helper consumes/copies a virama and any associated post-virama joiners.
 // A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
 // no joiner at all) must be followed by a consonant.
 // A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
 // consonant, space, or character from a different script. We clean up the
 // representation to make it consistent by adding a ZWNJ if missing from a
 // non-linking virama. Returns false with an invalid sequence.
 //
 // Parameters:
 //   joiner     - the joiner seen immediately before the virama. Callers in
 //                this file pass (CharClass::kOther, 0) to mean "no joiner".
 //   post_matra - true when called from the consonant tail (after optional
 //                matra/modifiers), false when called from the consonant head.
 // Precondition (established by both callers): codes_[codes_used_] is a virama.
 // NOTE(review): CodeOnlyToOutput(), UseMultiCode() and MultiCodePart() are
 // defined in the Validator base class, not in this file. The comments below
 // assume they append to output_/parts_, advance codes_used_, and (for the
 // first two) return true when the end of codes_ is reached - confirm against
 // validator.h.
 bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
  int num_codes = codes_.size();
  if (joiner.first == CharClass::kOther) {
    // No pre-virama joiner: take the virama itself first.
    CodeOnlyToOutput();
    if (codes_used_ < num_codes &&
        codes_[codes_used_].second == kZeroWidthJoiner) {
      // Post-matra viramas must be explicit, so no joiners allowed here.
      if (post_matra) {
        if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
        return false;
      }
      // Look one code past the ZWJ. codes_[codes_used_ - 2] is presumably the
      // code that preceded the virama (assumes the virama was not the first
      // code - TODO confirm that no caller can reach here with codes_used_ < 2).
      if (codes_used_ + 1 < num_codes &&
          codes_[codes_used_ - 2].second != kCakra &&
          (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
           codes_[codes_used_ + 1].second == kPengkal ||
           codes_[codes_used_ + 1].second == kCakra)) {
        // This combination will be picked up later.
        // The call inside ASSERT_HOST has a side effect (it consumes the ZWJ);
        // NOTE(review): this relies on ASSERT_HOST always evaluating its
        // argument - confirm.
        ASSERT_HOST(!CodeOnlyToOutput());
      } else {
        // Half-form with optional Nukta.
        // Length covers every output code not yet assigned to a part, plus the
        // ZWJ that UseMultiCode is about to add.
        int len = output_.size() + 1 - output_used_;
        if (UseMultiCode(len)) return true;
      }
      // A ZWNJ directly after the ZWJ is only accepted when the pending
      // (not yet emitted) output starts with kCakra.
      if (codes_used_ < num_codes &&
          codes_[codes_used_].second == kZeroWidthNonJoiner) {
        if (output_used_ == output_.size() ||
            output_[output_used_] != kCakra) {
          if (report_errors_) {
            tprintf("Virama ZWJ ZWNJ : base=0x%x!\n",
                    static_cast<int>(script_));
          }
          return false;
        }
      }
    } else if (codes_used_ == num_codes ||
               codes_[codes_used_].first != CharClass::kConsonant ||
               post_matra) {
      // The virama is not followed by a consonant (or is post-matra), so it is
      // treated as an explicit, non-linking virama.
      if (codes_used_ == num_codes ||
          codes_[codes_used_].second != kZeroWidthNonJoiner) {
        // It is valid to have an unterminated virama at the end of a word, but
        // for consistency, we will always add ZWNJ if not present.
        output_.push_back(kZeroWidthNonJoiner);
      } else {
        // The ZWNJ is already in the input: copy it through.
        CodeOnlyToOutput();
      }
      // Explicit virama [H z]
      MultiCodePart(2);
    }
    // Otherwise (virama directly followed by a consonant, not post-matra)
    // nothing more is consumed here; the virama stays pending in output_.
  } else {
    // Pre-virama joiner [{Z|z} H] requests specific conjunct.
    if (UseMultiCode(2)) {
      if (report_errors_)
        tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
      return false;
    }
    // A joiner on both sides of the virama is rejected.
    if (codes_[codes_used_].second == kZeroWidthJoiner ||
        codes_[codes_used_].second == kZeroWidthNonJoiner) {
      if (report_errors_) {
        tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
                codes_[codes_used_].second);
      }
      return false;
    }
  }
  // It is good so far as it goes.
  return true;
 }
 // Helper consumes/copies a series of consonants separated by viramas while
 // valid, but not any vowel or other modifiers.
 //
 // Each loop iteration handles one consonant, an optional nukta, an optional
 // joiner and an optional virama; the loop continues only while a virama was
 // consumed and the next code is another consonant.
 // Returns false only when ConsumeViramaIfValid() rejects the sequence.
 // NOTE(review): CodeOnlyToOutput() and MultiCodePart() come from the
 // Validator base class and are not shown in this file; the comments below
 // assume CodeOnlyToOutput() appends the current code to output_ and advances
 // codes_used_, and MultiCodePart(n) emits the last n output codes as a part -
 // confirm against validator.h.
 bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
  const int num_codes = codes_.size();
  // Consonant aksara
  do {
    // Take the consonant at the current position.
    CodeOnlyToOutput();
    // Special case of medial consonants [H Z Pengkal/Cakra].
    // index points at where the virama would be if the last three pending
    // output codes are virama, ZWJ, Pengkal/Cakra.
    int index = output_.size() - 3;
    if (output_used_ <= index &&
        (output_.back() == kPengkal || output_.back() == kCakra) &&
        IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
      MultiCodePart(3);
    }
    bool have_nukta = false;
    if (codes_used_ < num_codes &&
        codes_[codes_used_].first == CharClass::kNukta) {
      have_nukta = true;
      CodeOnlyToOutput();
    }
    // Test for subscript conjunct.
    // have_nukta (0 or 1) shifts the index so it addresses the code just
    // before the consonant, whether or not a nukta was appended after it.
    index = output_.size() - 2 - have_nukta;
    if (output_used_ <= index && IsSubscriptScript() &&
        IsVirama(output_[index])) {
      // Output previous virama, consonant + optional nukta.
      MultiCodePart(2 + have_nukta);
    }
    // (kOther, 0) is the "no joiner" marker understood by
    // ConsumeViramaIfValid().
    IndicPair joiner(CharClass::kOther, 0);
    // NOTE(review): the kMalayalam test looks inherited from the Indic
    // validator; ScriptValidator only constructs this class for kJavanese, in
    // which case only the ZWJ alternative can match - confirm before removing.
    if (codes_used_ < num_codes &&
        (codes_[codes_used_].second == kZeroWidthJoiner ||
         (codes_[codes_used_].second == kZeroWidthNonJoiner &&
          script_ == ViramaScript::kMalayalam))) {
      joiner = codes_[codes_used_];
      // Step over the joiner without copying it yet; it is only kept when a
      // virama follows.
      if (++codes_used_ == num_codes) {
        if (report_errors_) {
          tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
                  joiner.second);
        }
        return true;
      }
      if (codes_[codes_used_].first == CharClass::kVirama) {
        output_.push_back(joiner.second);
      } else {
        if (report_errors_) {
          tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
                  output_.back(), joiner.second, codes_[codes_used_].second);
        }
        // Forget the dropped joiner so the virama logic sees "no joiner".
        joiner = std::make_pair(CharClass::kOther, 0);
      }
    }
    if (codes_used_ < num_codes &&
        codes_[codes_used_].first == CharClass::kVirama) {
      if (!ConsumeViramaIfValid(joiner, false)) return false;
    } else {
      break;  // No virama, so the run of consonants is over.
    }
  } while (codes_used_ < num_codes &&
           codes_[codes_used_].first == CharClass::kConsonant);
  // Emit whatever is still pending in output_ as the final part of the head.
  if (output_used_ < output_.size()) MultiCodePart(1);
  return true;
 }
 // Helper consumes/copies a tail part of a consonant, comprising optional
 // matra/piece, vowel modifier, vedic mark, terminating virama.
 bool ValidateJavanese::ConsumeConsonantTailIfValid() {
  if (codes_used_ == codes_.size()) return true;
  // No virama: Finish the grapheme.
  // Are multiple matras allowed?
  if (codes_[codes_used_].first == CharClass::kMatra) {
    if (UseMultiCode(1)) return true;
    if (codes_[codes_used_].first == CharClass::kMatraPiece) {
      if (UseMultiCode(1)) return true;
    }
  }
  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
    if (UseMultiCode(1)) return true;
  }
  while (codes_[codes_used_].first == CharClass::kVedicMark) {
    if (UseMultiCode(1)) return true;
  }
  if (codes_[codes_used_].first == CharClass::kVirama) {
    if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
      return false;
    }
  }
  // What we have consumed so far is a valid consonant cluster.
  if (output_used_ < output_.size()) MultiCodePart(1);
  return true;
 }
 // Helper consumes/copies a vowel and optional modifiers.
 bool ValidateJavanese::ConsumeVowelIfValid() {
  if (UseMultiCode(1)) return true;
  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
    if (UseMultiCode(1)) return true;
  }
  while (codes_[codes_used_].first == CharClass::kVedicMark) {
    if (UseMultiCode(1)) return true;
  }
  // What we have consumed so far is a valid vowel cluster.
  return true;
 }
 }  // namespace tesseract
--- a/src/training/validate_javanese.h
+++ b/src/training/validate_javanese.h
@ -0,0 +1,63 @@
 /**********************************************************************
 * File:        validate_javanese.h
 * Description: Text validator for Javanese Script - aksara jawa.
 * Author:      Shree Devi Kumar
 * Created:     August 03, 2018
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
 #ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
 #define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
 #include "validator.h"
 namespace tesseract {
 // Subclass of Validator that validates and segments text in the Javanese
 // script (aksara Jawa). Validator::ScriptValidator() creates it for
 // ViramaScript::kJavanese.
 class ValidateJavanese : public Validator {
 public:
  // script: the virama script being validated; forwarded to Validator.
  // report_errors: forwarded to Validator; when set, the implementation
  // prints diagnostics via tprintf for dropped or invalid sequences.
  ValidateJavanese(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateJavanese() {}
 protected:
  // Returns whether codes matches the pattern for a Javanese Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Returns the CharClass corresponding to the given Unicode ch.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
 private:
  // Helper consumes/copies a virama and any associated post-virama joiners.
  // joiner is the joiner seen before the virama, or (CharClass::kOther, 0)
  // for none; post_matra is true when the virama follows the consonant tail.
  // Returns false on an invalid sequence.
  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
  // Helper consumes/copies a series of consonants separated by viramas while
  // valid, but not any vowel or other modifiers.
  bool ConsumeConsonantHeadIfValid();
  // Helper consumes/copies a tail part of a consonant, comprising optional
  // matra/piece, vowel modifier, vedic mark, terminating virama.
  bool ConsumeConsonantTailIfValid();
  // Helper consumes/copies a vowel and optional modifiers.
  bool ConsumeVowelIfValid();
  // Some special unicodes used only for Javanese processing.
  // U+A9BE PENGKAL and U+A9BF CAKRA are the medial consonant signs.
  static const char32 kPengkal = 0xa9be;  // Javanese Ya
  static const char32 kCakra = 0xa9bf;  // Javanese Ra
 };
 }  // namespace tesseract
 #endif  // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
--- a/src/training/validator.cpp
+++ b/src/training/validator.cpp
@ -10,6 +10,7 @@
 #include "unicode/uscript.h"  // From libicu
 #include "validate_grapheme.h"
 #include "validate_indic.h"
 #include "validate_javanese.h"
 #include "validate_khmer.h"
 #include "validate_myanmar.h"
@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
    case ViramaScript::kNonVirama:
      return std::unique_ptr<Validator>(
          new ValidateGrapheme(script, report_errors));
    case ViramaScript::kJavanese:
      return std::unique_ptr<Validator>(
          new ValidateJavanese(script, report_errors));
    case ViramaScript::kMyanmar:
      return std::unique_ptr<Validator>(
          new ValidateMyanmar(script, report_errors));
@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript(
    const std::vector<char32>& utf32) {
  std::unordered_map<int, int> histogram;
  for (char32 ch : utf32) {
-    // Determine the codepage base. For the Indic scripts, and Khmer, it is
+    // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
    // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
    // unicode code space, so use its script id.
    int base = ch / kIndicCodePageSize;
    IcuErrorCode err;
    UScriptCode script_code = uscript_getScript(ch, err);
-    if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
+    if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
         script_code != USCRIPT_COMMON) ||
        script_code == USCRIPT_MYANMAR) {
      if (script_code == USCRIPT_MYANMAR)
@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript(
    char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
    // Check for validity.
    if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
        codebase == static_cast<char32>(ViramaScript::kJavanese) ||
        codebase == static_cast<char32>(ViramaScript::kKhmer) ||
        (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
         codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript(
 bool Validator::IsVirama(char32 unicode) {
  return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
          (unicode & 0x7f) == 0x4d) ||
-         unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
+         unicode == kSinhalaVirama || 
         unicode == kJavaneseVirama ||
         unicode == kMyanmarVirama ||
         unicode == kKhmerVirama;
 }
@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) {
 bool Validator::IsSubscriptScript() const {
  return script_ == ViramaScript::kTelugu ||
         script_ == ViramaScript::kKannada ||
-         script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer;
+         script_ == ViramaScript::kJavanese || 
         script_ == ViramaScript::kMyanmar || 
         script_ == ViramaScript::kKhmer;
 }
 void Validator::ComputeClassCodes(const std::vector<char32>& text) {
--- a/src/training/validator.h
+++ b/src/training/validator.h
@ -64,6 +64,7 @@ enum class ViramaScript : char32 {
  kSinhala = 0xd80,
  kMyanmar = 0x1000,
  kKhmer = 0x1780,
  kJavanese = 0xa980,
 };
 // Base class offers a validation API and protected methods to allow subclasses
@ -221,6 +222,9 @@ class Validator {
  static const char32 kSinhalaVirama = 0xdca;
  static const char32 kMyanmarVirama = 0x1039;
  static const char32 kKhmerVirama = 0x17d2;
  // Javanese Script - aksara Jawa
  static const char32 kJavaneseVirama = 0xa9c0;
  static const char32 kMaxJavaneseUnicode = 0xa9df;
  // Script we are operating on.
  ViramaScript script_;