tesseract/unittest/validate_indic_test.cc

// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include_gunit.h"
#include "normstrngs.h"
#include "normstrngs_test.h"

namespace tesseract {
namespace {

// Though the Unicode example for Telugu in section 12.7:
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to
// suppress a conjunct that would otherwise occur.  If a consonant is followed
// by a virama and then by a non-Indic character, OpenType will presume that
// the user simply meant to suppress the inherent vowel of the consonant
// and render it as the consonant with an explicit virama, the same as if
// a ZWNJ had followed. Since this is confusing to an OCR engine, the
// normalizer always puts a terminating ZWNJ on the end if not present,
// and accepts the string as valid.
TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {
  // The input ends in a bare virama; the expected output carries a trailing
  // ZWNJ after it.
  const std::string bare_virama = "\u0c15\u0c4d";      // KA - virama
  const std::string with_zwnj = "\u0c15\u0c4d\u200c";  // KA - virama - ZWNJ
  ExpectGraphemeModeResults(bare_virama, UnicodeNormMode::kNFC, 3, 2, 1,
                            with_zwnj);
  // Feeding the already-normalized string back in must give the same result.
  ExpectGraphemeModeResults(with_zwnj, UnicodeNormMode::kNFC, 3, 2, 1,
                            with_zwnj);
}

// Only one dependent vowel is allowed.
TEST(ValidateIndicTest, OnlyOneDependentVowel) {
  // A consonant carrying two dependent vowel signs must be rejected.
  const std::string two_vowels = "\u0d15\u0d3e\u0d42";  // KA AA UU
  std::string normalized;
  const bool accepted = NormalizeUTF8String(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize,
      two_vowels.c_str(), &normalized);
  // On failure, dump the offending input to make the report readable.
  EXPECT_FALSE(accepted) << PrintString32WithUnicodes(two_vowels);
}

//  [c26][c4d][c01]
//     A consonant (DA) followed by the virama followed by a bindu
//     Syllable modifiers [c01][c02][c03] all modify the pronunciation of
//     the vowel in a syllable, as does the virama [c4d].  You can only
//     have one of these on a syllable.
//
//  References:
//    http://www.omniglot.com/writing/telugu.htm
TEST(ValidateIndicTest, OnlyOneVowelModifier) {
  // Three cases are checked in turn: a virama followed by a candrabindu
  // (accepted, with a ZWNJ inserted after the virama), a visarga followed by a
  // candrabindu (rejected), and a doubled anusvara (accepted unchanged).
  // Every check streams the input string so a failure report identifies which
  // of the three cases broke.
  std::string str = "\u0c26\u0c4d\u0c01";  // DA virama candrabindu
  std::string result;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &result))
      << PrintString32WithUnicodes(str);
  // It made 1 grapheme of 4 chars, by terminating the explicit virama.
  EXPECT_EQ(result, std::string("\u0c26\u0c4d\u200c\u0c01"));

  str = "\u0995\u0983\u0981";  // KA visarga candrabindu
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize, str.c_str(),
                                   &result))
      << PrintString32WithUnicodes(str);

  // Exception: Malayalam allows multiple anusvara.
  str = "\u0d15\u0d02\u0d02";  // KA Anusvara Anusvara
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &result))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(result, str);
}

//  [c28][c02][c3f]
//    A consonant (NA) followed by the Anusvara/sunna and another matra (I).
// The anusvara [c02] is a pronunciation directive
//    for a whole syllable and only appears at the end of the syllable
//  References:
//    + Unicode v9, 12.1 "Modifier Mark Rules R10,"
//       and the Microsoft page
//       http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
TEST(ValidateIndicTest, VowelModifierMustBeLast) {
  // Both orderings below are normalized with the same settings.
  const auto normalize = [](const std::string& text, std::string* out) {
    return NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                               GraphemeNorm::kNormalize, text.c_str(), out);
  };
  std::string normalized;

  // Modifier placed before the matra: must be rejected.
  const std::string modifier_first = "\u0c28\u0c02\u0c3f";  // NA Sunna I
  EXPECT_FALSE(normalize(modifier_first, &normalized))
      << PrintString32WithUnicodes(modifier_first);

  // Swap c02/c3f and all is ok; the output equals the input.
  const std::string modifier_last = "\u0c28\u0c3f\u0c02";  // NA I Sunna
  EXPECT_TRUE(normalize(modifier_last, &normalized))
      << PrintString32WithUnicodes(modifier_last);
  EXPECT_EQ(normalized, modifier_last);
}

//  [c05][c47]
//    A Vowel (A) followed by a combining vowel/matra (EE).
//    In Telugu, matras are only put on consonants, not independent
//    vowels.
//  References:
//  + Unicode v9, 12.1:
//     Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
//  + http://varamozhi.sourceforge.net/iscii91.pdf
TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {
  // Both inputs below are normalized with the same settings.
  const auto normalize = [](const std::string& text, std::string* out) {
    return NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                               GraphemeNorm::kNormalize, text.c_str(), out);
  };
  std::string normalized;

  // A matra attached to an independent vowel is refused.
  const std::string vowel_with_matra = "\u0c05\u0c47";  // A EE
  EXPECT_FALSE(normalize(vowel_with_matra, &normalized))
      << PrintString32WithUnicodes(vowel_with_matra);

  // A matra attached to a consonant is accepted and returned unchanged.
  const std::string consonant_with_matra = "\u0c1e\u0c3e";  // NYA AA
  EXPECT_TRUE(normalize(consonant_with_matra, &normalized))
      << PrintString32WithUnicodes(consonant_with_matra);
  EXPECT_EQ(normalized, consonant_with_matra);
}

// Sub-graphemes are allowed if GraphemeNorm is turned off.
TEST(ValidateIndicTest, SubGraphemes) {
  const std::string lone_matra = "\u0d3e";  // AA
  std::string normalized;
  // Normalizes the lone matra under the requested grapheme mode; everything
  // else about the call is held constant.
  const auto normalize_as = [&lone_matra, &normalized](GraphemeNorm mode) {
    return NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, mode,
                               lone_matra.c_str(), &normalized);
  };

  // With grapheme normalization on, a dependent vowel by itself is rejected.
  EXPECT_FALSE(normalize_as(GraphemeNorm::kNormalize))
      << PrintString32WithUnicodes(lone_matra);
  // With it off, the same string is accepted and comes back unchanged.
  EXPECT_TRUE(normalize_as(GraphemeNorm::kNone))
      << PrintString32WithUnicodes(lone_matra);
  EXPECT_EQ(normalized, lone_matra);
}

TEST(ValidateIndicTest, Nukta) {
  // Checks the glyph-split segmentation of a consonant + nukta + virama +
  // consonant sequence, then checks that the swapped (virama before nukta)
  // ordering normalizes to the same string.
  std::string str = "\u0c95\u0cbc\u0ccd\u0cb9";  // KA Nukta Virama HA
  std::vector<std::string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: glyphs[2] below is only a valid access when the
  // segmentation really produced three entries.
  ASSERT_EQ(glyphs.size(), 3u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9"));
  // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.
  std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9";  // KA Virama Nukta HA
  ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str);
}

// Sinhala has some of its own specific rules. See www.macciato.com/sinhala
TEST(ValidateIndicTest, SinhalaRakaransaya) {
  // Checks that consonant + virama + ZWJ + Rayanna normalizes to itself,
  // splits into two glyphs, and may still take a trailing dependent vowel.
  std::string str = "\u0d9a\u0dca\u200d\u0dbb";  // KA Virama ZWJ Rayanna
  std::string dest;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  std::vector<std::string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: glyphs[1] below must not be read from a shorter vector.
  ASSERT_EQ(glyphs.size(), 2u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb"));
  // Can be followed by a dependent vowel.
  str += "\u0dd9";  // E
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
}

TEST(ValidateIndicTest, SinhalaYansaya) {
  // Checks that consonant + virama + ZWJ + Yayanna normalizes to itself, with
  // and without a trailing dependent vowel, and that the vowel-extended form
  // splits into three glyphs.
  std::string str = "\u0d9a\u0dca\u200d\u0dba";  // KA Virama ZWJ Yayanna
  std::string dest;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  // Can be followed by a dependent vowel.
  str += "\u0ddd";  // OO
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  std::vector<std::string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: glyphs[1] below must not be read from a shorter vector.
  ASSERT_EQ(glyphs.size(), 3u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba"));
}

TEST(ValidateIndicTest, SinhalaRepaya) {
  // Segments the same string in two modes and checks the second entry of each
  // result: kCombined keeps Rayanna-Virama-ZWJ-MA together, kGlyphSplit
  // separates the trailing MA.
  std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8";  // KA Rayanna Virama ZWJ MA
  std::vector<std::string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
      str.c_str(), &glyphs));
  // Fatal on mismatch: glyphs[1] below must not be read from a shorter vector.
  ASSERT_EQ(glyphs.size(), 2u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8"));
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Same guard for the glyph-split result.
  ASSERT_EQ(glyphs.size(), 3u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
}

TEST(ValidateIndicTest, SinhalaSpecials) {
  // Sinhala has some exceptions from the usual rules.
  // Two strings are glyph-split and every resulting entry is compared against
  // the expected piece.
  std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";
  std::vector<std::string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: the indexed checks below read glyphs[0..4].
  ASSERT_EQ(glyphs.size(), 5u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[0], std::string("\u0dc0"));
  EXPECT_EQ(glyphs[1], std::string("\u0d9c"));
  EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb"));
  EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d"));
  EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d"));
  str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: the indexed checks below read glyphs[0..3].
  ASSERT_EQ(glyphs.size(), 4u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[0], std::string("\u0dc3"));
  EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));
  EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d"));
  EXPECT_EQ(glyphs[3], std::string("\u0dcf"));
}

}  // namespace
}  // namespace tesseract
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`// (C) Copyright 2017, Google Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

			`#include "include_gunit.h"`
			`#include "normstrngs.h"`
			`#include "normstrngs_test.h"`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00
			`namespace tesseract {`
			`namespace {`

			`// Though the unicode example for Telugu in section 12.7:`
			`// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf`
			`// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to`
			`// suppress a conjugate that would otherwise occur. If a consonant is followed`
			`// by a virama and then by a non-Indic character, OpenType will presume that`
			`// the user simply meant to suppress the inherent vowel of the consonant`
			`// and render it as the consonant with an explicit virama, the same as if`
			`// a ZWNJ had followed. Since this is confusing to an OCR engine, the`
			`// normalizer always puts a termninating ZWNJ on the end if not present,`
			`// and accepts the string as valid.`
			`TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0c15\u0c4d"; // KA - virama`
			`std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);`
			`// Same result if we started with the normalized string.`
			`ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1,`
			`target_str);`
			`}`

			`// Only one dependent vowel is allowed.`
			`TEST(ValidateIndicTest, OnlyOneDependentVowel) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU`
			`std::string dest;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(),`
			`&dest))`
			`<< PrintString32WithUnicodes(str);`
			`}`

			`// [c26][c4d][c01]`
			`// A consonant (DA) followed by the virama followed by a bindu`
			`// Syllable modifiers [c01][c02][c03] all modify the pronunciation of`
			`// the vowel in a syllable, as does the virama [c04]. You can only`
			`// have one of these on a syllable.`
			`//`
			`// References:`
			`// http://www.omniglot.com/writing/telugu.htm`
			`TEST(ValidateIndicTest, OnlyOneVowelModifier) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu`
			`std::string result;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(),`
			`&result));`
			`// It made 1 grapheme of 4 chars, by terminating the explicit virama.`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result);`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00
			`str = "\u0995\u0983\u0981"; // KA visarga candrabindu`
			`EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(),`
			`&result));`

			`// Exception: Malayalam allows multiple anusvara.`
			`str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara`
			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(),`
			`&result));`
			`EXPECT_EQ(str, result);`
			`}`

			`// [c28][c02][c3f]`
			`// A consonant (NA) followed by the Anusvara/sunna and another matra (I).`
			`// The anusvara [c02] is a pronunciation directive`
			`// for a whole syllable and only appears at the end of the syllable`
			`// References:`
			`// + Unicode v9, 12.1 "Modifier Mark Rules R10,"`
			`// and the Microsoft page`
			`// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx`
			`TEST(ValidateIndicTest, VowelModifierMustBeLast) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I`
			`std::string dest;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(),`
			`&dest))`
			`<< PrintString32WithUnicodes(str);`
			`// Swap c02/c3f and all is ok.`
			`str = "\u0c28\u0c3f\u0c02"; // NA I Sunna`
			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(), &dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_EQ(dest, str);`
			`}`

			`// [c05][c47]`
			`// A Vowel (A) followed by a combining vowel/matra (EE).`
			`// In Telugu, matras are only put on consonants, not independent`
			`// vowels.`
			`// References:`
			`// + Unicode v9, 12.1:`
			`// Principles of the Devanagari Script: Dependent Vowel Signs (Matras).`
			`// + http://varamozhi.sourceforge.net/iscii91.pdf`
			`TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0c05\u0c47"; // A EE`
			`std::string dest;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(),`
			`&dest))`
			`<< PrintString32WithUnicodes(str);`
			`str = "\u0c1e\u0c3e"; // NYA AA`
			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(), &dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_EQ(dest, str);`
			`}`

			`// Sub-graphemes are allowed if GraphemeNorm is turned off.`
			`TEST(ValidateIndicTest, SubGraphemes) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0d3e"; // AA`
			`std::string dest;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(),`
			`&dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNone, str.c_str(), &dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_EQ(dest, str);`
			`}`

			`TEST(ValidateIndicTest, Nukta) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA`
			`std::vector<std::string> glyphs;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(`
			`UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,`
			`true, str.c_str(), &glyphs));`
			`EXPECT_EQ(glyphs.size(), 3);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9"));`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`// Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str);`
			`}`

			`// Sinhala has some of its own specific rules. See www.macciato.com/sinhala`
			`TEST(ValidateIndicTest, SinhalaRakaransaya) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna`
			`std::string dest;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(), &dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_EQ(dest, str);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::vector<std::string> glyphs;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(`
			`UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,`
			`true, str.c_str(), &glyphs));`
			`EXPECT_EQ(glyphs.size(), 2);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb"));`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`// Can be followed by a dependent vowel.`
			`str += "\u0dd9"; // E`
			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(), &dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_EQ(dest, str);`
			`}`

			`TEST(ValidateIndicTest, SinhalaYansaya) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna`
			`std::string dest;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(), &dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_EQ(dest, str);`
			`// Can be followed by a dependent vowel.`
			`str += "\u0ddd"; // OO`
			`EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`GraphemeNorm::kNormalize, str.c_str(), &dest))`
			`<< PrintString32WithUnicodes(str);`
			`EXPECT_EQ(dest, str);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::vector<std::string> glyphs;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(`
			`UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,`
			`true, str.c_str(), &glyphs));`
			`EXPECT_EQ(glyphs.size(), 3);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba"));`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`}`

			`TEST(ValidateIndicTest, SinhalaRepaya) {`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA`
			`std::vector<std::string> glyphs;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(`
			`UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,`
			`str.c_str(), &glyphs));`
			`EXPECT_EQ(glyphs.size(), 2);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8"));`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(`
			`UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,`
			`true, str.c_str(), &glyphs));`
			`EXPECT_EQ(glyphs.size(), 3);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`}`

			`TEST(ValidateIndicTest, SinhalaSpecials) {`
			`// Sinhala has some exceptions from the usual rules.`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";`
			`std::vector<std::string> glyphs;`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(`
			`UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,`
			`true, str.c_str(), &glyphs));`
			`EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(glyphs[0], std::string("\u0dc0"));`
			`EXPECT_EQ(glyphs[1], std::string("\u0d9c"));`
			`EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb"));`
			`EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d"));`
			`EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d"));`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";`
			`EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(`
			`UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,`
			`true, str.c_str(), &glyphs));`
			`EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs);`
unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-01-22 22:17:54 +08:00			`EXPECT_EQ(glyphs[0], std::string("\u0dc3"));`
			`EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d"));`
			`EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d"));`
			`EXPECT_EQ(glyphs[3], std::string("\u0dcf"));`
Add more unittests from Google They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-08-24 21:07:48 +08:00			`}`

			`} // namespace`
			`} // namespace tesseract`