#include "tesseract/training/normstrngs.h"
#include "tesseract/unittest/normstrngs_test.h"

namespace tesseract {
namespace {

// Though the unicode example for Telugu in section 12.7:
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to
// suppress a conjugate that would otherwise occur. If a consonant is followed
// by a virama and then by a non-Indic character, OpenType will presume that
// the user simply meant to suppress the inherent vowel of the consonant
// and render it as the consonant with an explicit virama, the same as if
// a ZWNJ had followed. Since this is confusing to an OCR engine, the
// normalizer always puts a terminating ZWNJ on the end if not present,
// and accepts the string as valid.
TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {
  string str = "\u0c15\u0c4d";               // KA - virama
  string target_str = "\u0c15\u0c4d\u200c";  // KA - virama - ZWNJ
  ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str);
  // Same result if we started with the normalized string.
  ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1,
                            target_str);
}

// Only one dependent vowel is allowed.
TEST(ValidateIndicTest, OnlyOneDependentVowel) {
  string str = "\u0d15\u0d3e\u0d42";  // KA AA UU
  string dest;
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize, str.c_str(),
                                   &dest))
      << PrintString32WithUnicodes(str);
}

//  [c26][c4d][c01]
//     A consonant (DA) followed by the virama followed by a bindu
//     Syllable modifiers [c01][c02][c03] all modify the pronunciation of
//     the vowel in a syllable, as does the virama [c4d].  You can only
//     have one of these on a syllable.
//
//  References:
//    http://www.omniglot.com/writing/telugu.htm
TEST(ValidateIndicTest, OnlyOneVowelModifier) {
  string str = "\u0c26\u0c4d\u0c01";  // DA virama candrabindu
  string result;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &result));
  // It made 1 grapheme of 4 chars, by terminating the explicit virama.
  EXPECT_EQ(string("\u0c26\u0c4d\u200c\u0c01"), result);

  str = "\u0995\u0983\u0981";  // KA visarga candrabindu
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize, str.c_str(),
                                   &result));

  // Exception: Malayalam allows multiple anusvara.
  str = "\u0d15\u0d02\u0d02";  // KA Anusvara Anusvara
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &result));
  EXPECT_EQ(str, result);
}

//  [c28][c02][c3f]
//    A consonant (NA) followed by the Anusvara/sunna and another matra (I).
//    The anusvara [c02] is a pronunciation directive
//    for a whole syllable and only appears at the end of the syllable
//  References:
//    + Unicode v9, 12.1 "Modifier Mark Rules R10,"
//      and the Microsoft page
//      http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
TEST(ValidateIndicTest, VowelModifierMustBeLast) {
  string str = "\u0c28\u0c02\u0c3f";  // NA Sunna I
  string dest;
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize, str.c_str(),
                                   &dest))
      << PrintString32WithUnicodes(str);
  // Swap c02/c3f and all is ok.
  str = "\u0c28\u0c3f\u0c02";  // NA I Sunna
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
}

//  [c05][c47]
//    A Vowel (A) followed by a combining vowel/matra (EE).
//    In Telugu, matras are only put on consonants, not independent
//    vowels.
//  References:
//    + Unicode v9, 12.1:
//        Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
//    + http://varamozhi.sourceforge.net/iscii91.pdf
TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {
  string str = "\u0c05\u0c47";  // A EE
  string dest;
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize, str.c_str(),
                                   &dest))
      << PrintString32WithUnicodes(str);
  str = "\u0c1e\u0c3e";  // NYA AA
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
}

// Sub-graphemes are allowed if GraphemeNorm is turned off.
TEST(ValidateIndicTest, SubGraphemes) {
  string str = "\u0d3e";  // AA
  string dest;
  // A lone dependent vowel is rejected when grapheme normalization is on...
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize, str.c_str(),
                                   &dest))
      << PrintString32WithUnicodes(str);
  // ...and passes through unchanged when it is off.
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNone, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
}

// KA Nukta Virama HA must split into 3 glyphs, the last being Virama+HA.
TEST(ValidateIndicTest, Nukta) {
  string str = "\u0c95\u0cbc\u0ccd\u0cb9";  // KA Nukta Virama HA
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // ASSERT so that a wrong glyph count cannot lead to out-of-range indexing.
  ASSERT_EQ(glyphs.size(), 3);
  EXPECT_EQ(glyphs[2], string("\u0ccd\u0cb9"));
  // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.
  string str2 = "\u0c95\u0ccd\u0cbc\u0cb9";  // KA Virama Nukta HA
  ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str);
}

// Sinhala has some of its own specific rules. See www.macciato.com/sinhala
TEST(ValidateIndicTest, SinhalaRakaransaya) {
  string str = "\u0d9a\u0dca\u200d\u0dbb";  // KA Virama ZWJ Rayanna
  string dest;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // ASSERT so that a wrong glyph count cannot lead to out-of-range indexing.
  ASSERT_EQ(glyphs.size(), 2);
  EXPECT_EQ(glyphs[1], string("\u0dca\u200d\u0dbb"));
  // Can be followed by a dependent vowel.
  str += "\u0dd9";  // E
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
}

// KA Virama ZWJ Yayanna is valid, and the Virama+ZWJ+Yayanna part is kept
// together as a single glyph when splitting.
TEST(ValidateIndicTest, SinhalaYansaya) {
  string str = "\u0d9a\u0dca\u200d\u0dba";  // KA Virama ZWJ Yayanna
  string dest;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  // Can be followed by a dependent vowel.
  str += "\u0ddd";  // OO
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(),
                                  &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // ASSERT so that a wrong glyph count cannot lead to out-of-range indexing.
  ASSERT_EQ(glyphs.size(), 3);
  EXPECT_EQ(glyphs[1], string("\u0dca\u200d\u0dba"));
}

// KA Rayanna Virama ZWJ MA: in combined mode everything after KA is one
// unit; in glyph-split mode Rayanna+Virama+ZWJ is its own glyph.
TEST(ValidateIndicTest, SinhalaRepaya) {
  string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8";  // KA Rayanna Virama ZWJ MA
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
      str.c_str(), &glyphs));
  // ASSERT so that a wrong glyph count cannot lead to out-of-range indexing.
  ASSERT_EQ(glyphs.size(), 2);
  EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d\u0db8"));
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  ASSERT_EQ(glyphs.size(), 3);
  EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d"));
}

TEST(ValidateIndicTest, SinhalaSpecials) {
  // Sinhala has some exceptions from the usual rules.
  string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // ASSERT so that a wrong glyph count cannot lead to out-of-range indexing.
  ASSERT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[0], string("\u0dc0"));
  EXPECT_EQ(glyphs[1], string("\u0d9c"));
  EXPECT_EQ(glyphs[2], string("\u0dca\u200d\u0dbb"));
  EXPECT_EQ(glyphs[3], string("\u0dca\u200d"));
  EXPECT_EQ(glyphs[4], string("\u0dbb\u0dca\u200d"));
  str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  ASSERT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[0], string("\u0dc3"));
  EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d"));
  EXPECT_EQ(glyphs[2], string("\u0dbb\u0dca\u200d"));
  EXPECT_EQ(glyphs[3], string("\u0dcf"));
}

}  // namespace
}  // namespace tesseract