From 0eb7be1cd1707931abd77903793bf966a6640d58 Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Fri, 3 Aug 2018 13:59:27 +0000
Subject: [PATCH 1/3] Initial commit to add Aksara Jawa - Javanese script

---
 src/training/CMakeLists.txt        |   4 +-
 src/training/Makefile.am           |   2 +
 src/training/language-specific.sh  |   5 +-
 src/training/validate_javanese.cpp | 116 +++++++++++++++++++++++++++++
 src/training/validate_javanese.h   |  45 +++++++++++
 src/training/validator.cpp         |  17 ++++-
 src/training/validator.h           |   4 +
 7 files changed, 185 insertions(+), 8 deletions(-)
 create mode 100644 src/training/validate_javanese.cpp
 create mode 100644 src/training/validate_javanese.h

diff --git a/src/training/CMakeLists.txt b/src/training/CMakeLists.txt
index 8de67772..cb6cf1a0 100644
--- a/src/training/CMakeLists.txt
+++ b/src/training/CMakeLists.txt
@@ -186,9 +186,9 @@ set(unicharset_training_src
     unicharset_training_utils.h
 
     validate_grapheme.h validate_indic.h validate_khmer.h
-    validate_myanmar.h validator.h
+    validate_javanese.h validate_myanmar.h validator.h
     validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
-    validate_myanmar.cpp validator.cpp
+    validate_javanese.cpp validate_myanmar.cpp validator.cpp
 
 )
 add_library                 (unicharset_training ${unicharset_training_src})
diff --git a/src/training/Makefile.am b/src/training/Makefile.am
index fd38ffbe..3a8e6c1a 100644
--- a/src/training/Makefile.am
+++ b/src/training/Makefile.am
@@ -45,6 +45,7 @@ noinst_HEADERS = \
     util.h \
     validate_grapheme.h \
     validate_indic.h \
+    validate_javanese.h \
     validate_khmer.h \
     validate_myanmar.h \
     validator.h
@@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \
     unicharset_training_utils.cpp \
     validate_grapheme.cpp \
     validate_indic.cpp \
+    validate_javanese.h \
     validate_khmer.cpp \
     validate_myanmar.cpp \
     validator.cpp
diff --git a/src/training/language-specific.sh b/src/training/language-specific.sh
index 0f8fa6ed..b6d834bb 100755
--- a/src/training/language-specific.sh
+++ b/src/training/language-specific.sh
@@ -21,7 +21,7 @@
 VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
                       ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
                       ell eng enm epo est eus fas fil fin fra frk frm gle glg
-                      grc guj hat heb hin hrv hun hye iku ind isl ita ita_old
+                      grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
                       jav jpn kan kat kat_old kaz khm kir kor kur lao lat
                       lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
                       pan pol por pus ron rus san sin slk slv snd spa spa_old
@@ -961,6 +961,7 @@ set_lang_specific_parameters() {
     glg ) ;;
     hat ) ;;
     hrv ) ;;
+    iast ) ;;
     ind ) ;;
     isl ) ;;
     ita ) ;;
@@ -1171,7 +1172,7 @@ set_lang_specific_parameters() {
       LANG_IS_RTL="1"
       NORM_MODE="2" ;;
     asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
-    dzo | sin | san | bod | ori | khm | mya | tha | lao )
+    dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
       LANG_IS_RTL="0"
       NORM_MODE="2" ;;
     * )
diff --git a/src/training/validate_javanese.cpp b/src/training/validate_javanese.cpp
new file mode 100644
index 00000000..38119917
--- /dev/null
+++ b/src/training/validate_javanese.cpp
@@ -0,0 +1,116 @@
+/**********************************************************************
+    * File:        validate_javanese.cpp
+ * Description: Text validator for Javanese Script - aksara jawa.
+ * Author:      Shree Devi Kumar
+ * Created:     August 03, 2018
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+ 
+ #include "validate_javanese.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Returns whether codes matches the pattern for a Javanese Grapheme.
+// Taken from unicode standard:
+// http://www.unicode.org/charts/PDF/UA980.pdf
+// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
+// Also the Consonant class here includes independent vowels, as they are
+// treated the same anyway.
+
+bool ValidateJavanese::ConsumeGraphemeIfValid() {
+  int num_codes = codes_.size();
+  if (codes_used_ == num_codes) return false;
+  if (codes_[codes_used_].first == CharClass::kOther) {
+    UseMultiCode(1);
+    return true;
+  }
+  if (codes_[codes_used_].first != CharClass::kConsonant) {
+    if (report_errors_) {
+      tprintf("Invalid start of Javanese syllable:0x%x\n",
+              codes_[codes_used_].second);
+    }
+    return false;
+  }
+  if (UseMultiCode(1)) return true;
+  if (      codes_[codes_used_].first == CharClass::kNukta) {
+    if (UseMultiCode(1)) return true;
+  }
+  while (codes_used_ + 1 < num_codes &&
+         codes_[codes_used_].first == CharClass::kVirama &&
+         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) return true;
+    if (codes_[codes_used_].first == CharClass::kRobat) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  int num_matra_parts = 0;
+  if (codes_[codes_used_].second == kZeroWidthJoiner ||
+      codes_[codes_used_].second == kZeroWidthNonJoiner) {
+    if (CodeOnlyToOutput()) {
+      if (report_errors_) {
+        tprintf("Unterminated joiner: 0x%x\n", output_.back());
+      }
+      return false;
+    }
+    ++num_matra_parts;
+  }
+  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
+  // own or as an addition to other matras.
+  if (codes_[codes_used_].first == CharClass::kMatra) {
+    ++num_matra_parts;
+    if (UseMultiCode(num_matra_parts)) return true;
+  } else if (num_matra_parts) {
+    if (report_errors_) {
+      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
+              output_.back(), codes_[codes_used_].second);
+    }
+    return false;
+  }
+  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
+      codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
+    if (UseMultiCode(1)) return true;
+  }
+  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    if (UseMultiCode(1)) return true;
+  }
+  if (codes_used_ + 1 < num_codes &&
+      codes_[codes_used_].first == CharClass::kVirama &&
+      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) return true;
+  }
+  return true;
+}
+
+Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
+  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
+  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
+  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
+  // Offset of ch from the base of this script's Unicode block (code page).
+  int off = ch - static_cast<char32>(script_);
+  // Anything outside this script's code page is classified as kOther.
+  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
+  if (off < 0x4) return CharClass::kVowelModifier;
+  if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
+  if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
+  if (off == 0x34) return CharClass::kVowelModifier; // A9B4 TARUNG
+  if (off <= 0x3d) return CharClass::kMatra;
+  if (off <= 0x3f) return CharClass::kVowelModifier; // A9BE-A9BF PENGKAL-CAKRA
+  if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
+  return CharClass::kOther;
+}
+
+}  // namespace tesseract
diff --git a/src/training/validate_javanese.h b/src/training/validate_javanese.h
new file mode 100644
index 00000000..adc8256b
--- /dev/null
+++ b/src/training/validate_javanese.h
@@ -0,0 +1,45 @@
+/**********************************************************************
+    * File:        validate_javanese.h
+ * Description: Text validator for Javanese Script - aksara jawa.
+ * Author:      Shree Devi Kumar
+ * Created:     August 03, 2018
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+ 
+#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
+#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Subclass of Validator that validates and segments Javanese.
+class ValidateJavanese : public Validator {
+ public:
+  ValidateJavanese(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateJavanese() {}
+
+ protected:
+  // Returns whether codes matches the pattern for a Javanese Grapheme.
+  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
+  // parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  bool ConsumeGraphemeIfValid() override;
+  // Returns the CharClass corresponding to the given Unicode ch.
+  CharClass UnicodeToCharClass(char32 ch) const override;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
diff --git a/src/training/validator.cpp b/src/training/validator.cpp
index d764c3da..ea5978c7 100644
--- a/src/training/validator.cpp
+++ b/src/training/validator.cpp
@@ -10,6 +10,7 @@
 #include "unicode/uscript.h"  // From libicu
 #include "validate_grapheme.h"
 #include "validate_indic.h"
+#include "validate_javanese.h"
 #include "validate_khmer.h"
 #include "validate_myanmar.h"
 
@@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
     case ViramaScript::kNonVirama:
       return std::unique_ptr<Validator>(
           new ValidateGrapheme(script, report_errors));
+    case ViramaScript::kJavanese:
+      return std::unique_ptr<Validator>(
+          new ValidateJavanese(script, report_errors));
     case ViramaScript::kMyanmar:
       return std::unique_ptr<Validator>(
           new ValidateMyanmar(script, report_errors));
@@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript(
     const std::vector<char32>& utf32) {
   std::unordered_map<int, int> histogram;
   for (char32 ch : utf32) {
-    // Determine the codepage base. For the Indic scripts, and Khmer, it is
+    // Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
     // sufficient to divide by kIndicCodePageSize but Myanmar is all over the
     // unicode code space, so use its script id.
     int base = ch / kIndicCodePageSize;
     IcuErrorCode err;
     UScriptCode script_code = uscript_getScript(ch, err);
-    if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
+    if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
          script_code != USCRIPT_COMMON) ||
         script_code == USCRIPT_MYANMAR) {
       if (script_code == USCRIPT_MYANMAR)
@@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript(
     char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
     // Check for validity.
     if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
+        codebase == static_cast<char32>(ViramaScript::kJavanese) ||
         codebase == static_cast<char32>(ViramaScript::kKhmer) ||
         (static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
          codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
@@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript(
 bool Validator::IsVirama(char32 unicode) {
   return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
           (unicode & 0x7f) == 0x4d) ||
-         unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
+         unicode == kSinhalaVirama || 
+         unicode == kJavaneseVirama ||
+         unicode == kMyanmarVirama ||
          unicode == kKhmerVirama;
 }
 
@@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) {
 bool Validator::IsSubscriptScript() const {
   return script_ == ViramaScript::kTelugu ||
          script_ == ViramaScript::kKannada ||
-         script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer;
+         script_ == ViramaScript::kJavanese || 
+         script_ == ViramaScript::kMyanmar || 
+         script_ == ViramaScript::kKhmer;
 }
 
 void Validator::ComputeClassCodes(const std::vector<char32>& text) {
diff --git a/src/training/validator.h b/src/training/validator.h
index 890cfac5..741e76e2 100644
--- a/src/training/validator.h
+++ b/src/training/validator.h
@@ -64,6 +64,7 @@ enum class ViramaScript : char32 {
   kSinhala = 0xd80,
   kMyanmar = 0x1000,
   kKhmer = 0x1780,
+  kJavanese = 0xa980,
 };
 
 // Base class offers a validation API and protected methods to allow subclasses
@@ -221,6 +222,9 @@ class Validator {
   static const char32 kSinhalaVirama = 0xdca;
   static const char32 kMyanmarVirama = 0x1039;
   static const char32 kKhmerVirama = 0x17d2;
+  // Javanese Script - aksarajawa
+  static const char32 kJavaneseVirama = 0xa9c0;
+  static const char32 kMaxJavaneseUnicode = 0xa9df;
 
   // Script we are operating on.
   ViramaScript script_;

From f93f9e8a0951a3c3502c780b035b66bb967eef12 Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Fri, 3 Aug 2018 14:33:24 +0000
Subject: [PATCH 2/3] fix typo re Javanese

---
 src/training/Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/training/Makefile.am b/src/training/Makefile.am
index 3a8e6c1a..c7b01d73 100644
--- a/src/training/Makefile.am
+++ b/src/training/Makefile.am
@@ -77,7 +77,7 @@ libtesseract_training_la_SOURCES = \
     unicharset_training_utils.cpp \
     validate_grapheme.cpp \
     validate_indic.cpp \
-    validate_javanese.h \
+    validate_javanese.cpp \
     validate_khmer.cpp \
     validate_myanmar.cpp \
     validator.cpp

From 7957288fd5502551b6c7f073c5f4ecd1f0b11dd8 Mon Sep 17 00:00:00 2001
From: Shree Devi Kumar <shreeshrii@gmail.com>
Date: Sat, 4 Aug 2018 09:43:53 +0000
Subject: [PATCH 3/3] Change Javanese validation to be similar to Indic

---
 src/training/validate_javanese.cpp | 273 ++++++++++++++++++++++-------
 src/training/validate_javanese.h   |  24 ++-
 2 files changed, 231 insertions(+), 66 deletions(-)

diff --git a/src/training/validate_javanese.cpp b/src/training/validate_javanese.cpp
index 38119917..8ee6ef96 100644
--- a/src/training/validate_javanese.cpp
+++ b/src/training/validate_javanese.cpp
@@ -1,5 +1,5 @@
 /**********************************************************************
-    * File:        validate_javanese.cpp
+ * File:        validate_javanese.cpp
  * Description: Text validator for Javanese Script - aksara jawa.
  * Author:      Shree Devi Kumar
  * Created:     August 03, 2018
@@ -16,7 +16,7 @@
  *
  **********************************************************************/
  
- #include "validate_javanese.h"
+#include "validate_javanese.h"
 #include "errcode.h"
 #include "tprintf.h"
 
@@ -26,73 +26,40 @@ namespace tesseract {
 // Taken from unicode standard:
 // http://www.unicode.org/charts/PDF/UA980.pdf
 // http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
+// The order of components in an orthographic syllable as expressed in BNF is:
+// {C F} C {{R}Y} {V{A}} {Z}
+// Translated to the codes used by the CharClass enum:
+// [(V|C[N])(H)] (V|C[N]) [[R]Y] [M[D]] [D]
 // Also the Consonant class here includes independent vowels, as they are
 // treated the same anyway.
+// Indic - for reference
+//  + vowel Grapheme:  V[D](v)*
+//  + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
 
 bool ValidateJavanese::ConsumeGraphemeIfValid() {
-  int num_codes = codes_.size();
-  if (codes_used_ == num_codes) return false;
-  if (codes_[codes_used_].first == CharClass::kOther) {
-    UseMultiCode(1);
-    return true;
-  }
-  if (codes_[codes_used_].first != CharClass::kConsonant) {
-    if (report_errors_) {
-      tprintf("Invalid start of Javanese syllable:0x%x\n",
-              codes_[codes_used_].second);
-    }
-    return false;
-  }
-  if (UseMultiCode(1)) return true;
-  if (      codes_[codes_used_].first == CharClass::kNukta) {
-    if (UseMultiCode(1)) return true;
-  }
-  while (codes_used_ + 1 < num_codes &&
-         codes_[codes_used_].first == CharClass::kVirama &&
-         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
-    ASSERT_HOST(!CodeOnlyToOutput());
-    if (UseMultiCode(2)) return true;
-    if (codes_[codes_used_].first == CharClass::kRobat) {
-      if (UseMultiCode(1)) return true;
-    }
-  }
-  int num_matra_parts = 0;
-  if (codes_[codes_used_].second == kZeroWidthJoiner ||
-      codes_[codes_used_].second == kZeroWidthNonJoiner) {
-    if (CodeOnlyToOutput()) {
+  switch (codes_[codes_used_].first) {
+    case CharClass::kConsonant:
+      return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
+    case CharClass::kVowel:
+    case CharClass::kVedicMark:
+      return ConsumeVowelIfValid();
+    case CharClass::kZeroWidthJoiner:
+    case CharClass::kZeroWidthNonJoiner:
+      // Apart from within an aksara, joiners are silently dropped.
+      if (report_errors_)
+        tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
+      ++codes_used_;
+      return true;
+    case CharClass::kOther:
+      UseMultiCode(1);
+      return true;
+    default:
       if (report_errors_) {
-        tprintf("Unterminated joiner: 0x%x\n", output_.back());
+        tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
+                codes_[codes_used_].first, codes_[codes_used_].second);
       }
       return false;
-    }
-    ++num_matra_parts;
   }
-  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
-  // own or as an addition to other matras.
-  if (codes_[codes_used_].first == CharClass::kMatra) {
-    ++num_matra_parts;
-    if (UseMultiCode(num_matra_parts)) return true;
-  } else if (num_matra_parts) {
-    if (report_errors_) {
-      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
-              output_.back(), codes_[codes_used_].second);
-    }
-    return false;
-  }
-  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
-      codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
-    if (UseMultiCode(1)) return true;
-  }
-  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
-    if (UseMultiCode(1)) return true;
-  }
-  if (codes_used_ + 1 < num_codes &&
-      codes_[codes_used_].first == CharClass::kVirama &&
-      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
-    ASSERT_HOST(!CodeOnlyToOutput());
-    if (UseMultiCode(2)) return true;
-  }
-  return true;
 }
 
 Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
@@ -106,11 +73,191 @@ Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
   if (off < 0x4) return CharClass::kVowelModifier;
   if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
   if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
-  if (off == 0x34) return CharClass::kVowelModifier; // A9B4 TARUNG
+  if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
   if (off <= 0x3d) return CharClass::kMatra;
-  if (off <= 0x3f) return CharClass::kVowelModifier; // A9BE-A9BF PENGKAL-CAKRA
+  if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
   if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
   return CharClass::kOther;
 }
 
+// Helper consumes/copies a virama and any associated post-virama joiners.
+// joiner is the pre-virama joiner the caller has already copied to output_,
+// or (kOther, 0) if there was none. post_matra is true when the virama comes
+// after the matra/modifier tail; a ZWJ after such a virama is an error.
+// With no pre-virama joiner and no ZWJ following, a virama at end of input,
+// before a non-consonant, or post_matra is made explicit: a ZWNJ is appended
+// if not already present. Returns false with an invalid sequence.
+bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
+  int num_codes = codes_.size();
+  if (joiner.first == CharClass::kOther) {
+    CodeOnlyToOutput();
+    if (codes_used_ < num_codes &&
+        codes_[codes_used_].second == kZeroWidthJoiner) {
+      // Post-matra viramas must be explicit, so no joiners allowed here.
+      if (post_matra) {
+        if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
+        return false;
+      }
+      if (codes_used_ + 1 < num_codes &&
+          codes_[codes_used_ - 2].second != kCakra &&
+          (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
+           codes_[codes_used_ + 1].second == kPengkal ||
+           codes_[codes_used_ + 1].second == kCakra)) {
+        // This combination will be picked up later.
+        ASSERT_HOST(!CodeOnlyToOutput());
+      } else {
+        // Half-form with optional Nukta.
+        int len = output_.size() + 1 - output_used_;
+        if (UseMultiCode(len)) return true;
+      }
+      if (codes_used_ < num_codes &&
+          codes_[codes_used_].second == kZeroWidthNonJoiner) {
+        if (output_used_ == output_.size() ||
+            output_[output_used_] != kCakra) {
+          if (report_errors_) {
+            tprintf("Virama ZWJ ZWNJ : base=0x%x!\n",
+                    static_cast<int>(script_));
+          }
+          return false;
+        }
+      }
+    } else if (codes_used_ == num_codes ||
+               codes_[codes_used_].first != CharClass::kConsonant ||
+               post_matra) {
+      if (codes_used_ == num_codes ||
+          codes_[codes_used_].second != kZeroWidthNonJoiner) {
+        // It is valid to have an unterminated virama at the end of a word, but
+        // for consistency, we will always add ZWNJ if not present.
+        output_.push_back(kZeroWidthNonJoiner);
+      } else {
+        CodeOnlyToOutput();
+      }
+      // Explicit virama [H z]
+      MultiCodePart(2);
+    }
+  } else {
+    // Pre-virama joiner [{Z|z} H] requests specific conjunct.
+    if (UseMultiCode(2)) {
+      if (report_errors_)
+        tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
+      return false;
+    }
+    if (codes_[codes_used_].second == kZeroWidthJoiner ||
+        codes_[codes_used_].second == kZeroWidthNonJoiner) {
+      if (report_errors_) {
+        tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
+                codes_[codes_used_].second);
+      }
+      return false;
+    }
+  }
+  // No invalid sequence was detected on any of the paths above.
+  return true;
+}
+
+// Helper consumes/copies a series of consonants separated by viramas while
+// valid, but not any vowel or other modifiers.
+bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
+  const int num_codes = codes_.size();
+  // One iteration per consonant; continues while a virama links to another.
+  do {
+    CodeOnlyToOutput();
+    // Medial [H Z Pengkal/Cakra]. NOTE(review): both are kNukta; reachable?
+    int index = output_.size() - 3;
+    if (output_used_ <= index &&
+        (output_.back() == kPengkal || output_.back() == kCakra) &&
+        IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
+      MultiCodePart(3);
+    }
+    bool have_nukta = false;
+    if (codes_used_ < num_codes &&
+        codes_[codes_used_].first == CharClass::kNukta) {
+      have_nukta = true;
+      CodeOnlyToOutput();
+    }
+    // Test for subscript conjunct.
+    index = output_.size() - 2 - have_nukta;
+    if (output_used_ <= index && IsSubscriptScript() &&
+        IsVirama(output_[index])) {
+      // Output previous virama, consonant + optional nukta.
+      MultiCodePart(2 + have_nukta);
+    }
+    IndicPair joiner(CharClass::kOther, 0);
+    if (codes_used_ < num_codes &&
+        (codes_[codes_used_].second == kZeroWidthJoiner ||
+         (codes_[codes_used_].second == kZeroWidthNonJoiner &&
+          script_ == ViramaScript::kMalayalam))) {
+      joiner = codes_[codes_used_];
+      if (++codes_used_ == num_codes) {
+        if (report_errors_) {
+          tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
+                  joiner.second);
+        }
+        return true;
+      }
+      if (codes_[codes_used_].first == CharClass::kVirama) {
+        output_.push_back(joiner.second);
+      } else {
+        if (report_errors_) {
+          tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
+                  output_.back(), joiner.second, codes_[codes_used_].second);
+        }
+        joiner = std::make_pair(CharClass::kOther, 0);
+      }
+    }
+    if (codes_used_ < num_codes &&
+        codes_[codes_used_].first == CharClass::kVirama) {
+      if (!ConsumeViramaIfValid(joiner, false)) return false;
+    } else {
+      break;  // No virama, so the run of consonants is over.
+    }
+  } while (codes_used_ < num_codes &&
+           codes_[codes_used_].first == CharClass::kConsonant);
+  if (output_used_ < output_.size()) MultiCodePart(1);
+  return true;
+}
+
+// Helper consumes/copies a tail part of a consonant, comprising optional
+// matra/piece, vowel modifier, vedic mark, terminating virama.
+bool ValidateJavanese::ConsumeConsonantTailIfValid() {
+  if (codes_used_ == codes_.size()) return true;
+  // No virama: Finish the grapheme.
+  // Accepts one matra plus one optional piece. TODO: allow multiple matras?
+  if (codes_[codes_used_].first == CharClass::kMatra) {
+    if (UseMultiCode(1)) return true;
+    if (codes_[codes_used_].first == CharClass::kMatraPiece) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    if (UseMultiCode(1)) return true;
+  }
+  while (codes_[codes_used_].first == CharClass::kVedicMark) {
+    if (UseMultiCode(1)) return true;
+  }
+  if (codes_[codes_used_].first == CharClass::kVirama) {
+    if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
+      return false;
+    }
+  }
+  // What we have consumed so far is a valid consonant cluster.
+  if (output_used_ < output_.size()) MultiCodePart(1);
+
+  return true;
+}
+
+// Helper consumes/copies a vowel or vedic mark and any following modifiers.
+bool ValidateJavanese::ConsumeVowelIfValid() {
+  if (UseMultiCode(1)) return true;
+  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    if (UseMultiCode(1)) return true;
+  }
+  while (codes_[codes_used_].first == CharClass::kVedicMark) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Everything consumed up to here forms a valid vowel cluster.
+  return true;
+}
+
 }  // namespace tesseract
+
diff --git a/src/training/validate_javanese.h b/src/training/validate_javanese.h
index adc8256b..2d22c64d 100644
--- a/src/training/validate_javanese.h
+++ b/src/training/validate_javanese.h
@@ -1,5 +1,5 @@
 /**********************************************************************
-    * File:        validate_javanese.h
+ * File:        validate_javanese.h
  * Description: Text validator for Javanese Script - aksara jawa.
  * Author:      Shree Devi Kumar
  * Created:     August 03, 2018
@@ -21,9 +21,11 @@
 
 #include "validator.h"
 
+
 namespace tesseract {
 
-// Subclass of Validator that validates and segments Javanese.
+// Subclass of Validator that validates and segments Javanese script.
+
 class ValidateJavanese : public Validator {
  public:
   ValidateJavanese(ViramaScript script, bool report_errors)
@@ -37,7 +39,23 @@ class ValidateJavanese : public Validator {
   // otherwise does not increment codes_used_.
   bool ConsumeGraphemeIfValid() override;
   // Returns the CharClass corresponding to the given Unicode ch.
-  CharClass UnicodeToCharClass(char32 ch) const override;
+  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
+
+ private:
+  // Helper consumes/copies a virama and any associated post-virama joiners.
+  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
+  // Helper consumes/copies a series of consonants separated by viramas while
+  // valid, but not any vowel or other modifiers.
+  bool ConsumeConsonantHeadIfValid();
+  // Helper consumes/copies a tail part of a consonant, comprising optional
+  // matra/piece, vowel modifier, vedic mark, terminating virama.
+  bool ConsumeConsonantTailIfValid();
+  // Helper consumes/copies a vowel and optional modifiers.
+  bool ConsumeVowelIfValid();
+
+  // Some special unicodes used only for Javanese processing.
+  static const char32 kPengkal = 0xa9be;  // Javanese Ya
+  static const char32 kCakra = 0xa9bf;  // Javanese Ra
 };
 
 }  // namespace tesseract