mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
2b7df59187
They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de>
223 lines
9.6 KiB
C++
223 lines
9.6 KiB
C++
#include "tesseract/training/normstrngs.h"
|
|
|
|
#include "tesseract/unittest/normstrngs_test.h"
|
|
|
|
namespace tesseract {
|
|
namespace {
|
|
|
|
// Though the unicode example for Telugu in section 12.7:
|
|
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
|
|
// shows using ZWNJ to force an explicit virama, in practice a ZWNJ is used to
|
|
// suppress a conjunct that would otherwise occur. If a consonant is followed
|
|
// by a virama and then by a non-Indic character, OpenType will presume that
|
|
// the user simply meant to suppress the inherent vowel of the consonant
|
|
// and render it as the consonant with an explicit virama, the same as if
|
|
// a ZWNJ had followed. Since this is confusing to an OCR engine, the
|
|
// normalizer always puts a terminating ZWNJ on the end if not present,
|
|
// and accepts the string as valid.
|
|
TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) {
  // The input ends on a bare virama; the expected output has a ZWNJ appended.
  string bare_virama = "\u0c15\u0c4d";      // KA - virama
  string with_zwnj = "\u0c15\u0c4d\u200c";  // KA - virama - ZWNJ
  ExpectGraphemeModeResults(bare_virama, UnicodeNormMode::kNFC, 3, 2, 1,
                            with_zwnj);
  // Feeding the already-normalized string back in must give the same result.
  ExpectGraphemeModeResults(with_zwnj, UnicodeNormMode::kNFC, 3, 2, 1,
                            with_zwnj);
}
|
|
|
|
// Only one dependent vowel is allowed.
|
|
TEST(ValidateIndicTest, OnlyOneDependentVowel) {
  // A consonant carrying two dependent vowels is expected to be rejected.
  string two_vowels = "\u0d15\u0d3e\u0d42";  // KA AA UU
  string normalized;
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize,
                                   two_vowels.c_str(), &normalized))
      << PrintString32WithUnicodes(two_vowels);
}
|
|
|
|
// [c26][c4d][c01]
|
|
// A consonant (DA) followed by the virama followed by a bindu
|
|
// Syllable modifiers [c01][c02][c03] all modify the pronunciation of
|
|
// the vowel in a syllable, as does the virama [c04]. You can only
|
|
// have one of these on a syllable.
|
|
//
|
|
// References:
|
|
// http://www.omniglot.com/writing/telugu.htm
|
|
TEST(ValidateIndicTest, OnlyOneVowelModifier) {
|
|
string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu
|
|
string result;
|
|
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
GraphemeNorm::kNormalize, str.c_str(),
|
|
&result));
|
|
// It made 1 grapheme of 4 chars, by terminating the explicit virama.
|
|
EXPECT_EQ(string("\u0c26\u0c4d\u200c\u0c01"), result);
|
|
|
|
str = "\u0995\u0983\u0981"; // KA visarga candrabindu
|
|
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
GraphemeNorm::kNormalize, str.c_str(),
|
|
&result));
|
|
|
|
// Exception: Malayalam allows multiple anusvara.
|
|
str = "\u0d15\u0d02\u0d02"; // KA Anusvara Anusvara
|
|
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
GraphemeNorm::kNormalize, str.c_str(),
|
|
&result));
|
|
EXPECT_EQ(str, result);
|
|
}
|
|
|
|
// [c28][c02][c3f]
|
|
// A consonant (NA) followed by the Anusvara/sunna and another matra (I).
|
|
// The anusvara [c02] is a pronunciation directive
|
|
// for a whole syllable and only appears at the end of the syllable
|
|
// References:
|
|
// + Unicode v9, 12.1 "Modifier Mark Rules R10,"
|
|
// and the Microsoft page
|
|
// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
|
|
TEST(ValidateIndicTest, VowelModifierMustBeLast) {
  string normalized;

  // Modifier placed before the matra: expected to fail normalization.
  string modifier_first = "\u0c28\u0c02\u0c3f";  // NA Sunna I
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize,
                                   modifier_first.c_str(), &normalized))
      << PrintString32WithUnicodes(modifier_first);

  // Swap c02/c3f and all is ok; the string comes back unchanged.
  string modifier_last = "\u0c28\u0c3f\u0c02";  // NA I Sunna
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize,
                                  modifier_last.c_str(), &normalized))
      << PrintString32WithUnicodes(modifier_last);
  EXPECT_EQ(normalized, modifier_last);
}
|
|
|
|
// [c05][c47]
|
|
// A Vowel (A) followed by a combining vowel/matra (EE).
|
|
// In Telugu, matras are only put on consonants, not independent
|
|
// vowels.
|
|
// References:
|
|
// + Unicode v9, 12.1:
|
|
// Principles of the Devanagari Script: Dependent Vowel Signs (Matras).
|
|
// + http://varamozhi.sourceforge.net/iscii91.pdf
|
|
TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) {
  string normalized;

  // A matra attached to an independent vowel is expected to be rejected.
  string vowel_with_matra = "\u0c05\u0c47";  // A EE
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize,
                                   vowel_with_matra.c_str(), &normalized))
      << PrintString32WithUnicodes(vowel_with_matra);

  // The same kind of mark on a consonant is accepted and left unchanged.
  string consonant_with_matra = "\u0c1e\u0c3e";  // NYA AA
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize,
                                  consonant_with_matra.c_str(), &normalized))
      << PrintString32WithUnicodes(consonant_with_matra);
  EXPECT_EQ(normalized, consonant_with_matra);
}
|
|
|
|
// Sub-graphemes are allowed if GraphemeNorm is turned off.
|
|
TEST(ValidateIndicTest, SubGraphemes) {
  string lone_matra = "\u0d3e";  // AA
  string normalized;

  // With grapheme normalization on, a matra by itself is rejected.
  EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                   GraphemeNorm::kNormalize,
                                   lone_matra.c_str(), &normalized))
      << PrintString32WithUnicodes(lone_matra);

  // With grapheme normalization off, it passes through unchanged.
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNone, lone_matra.c_str(),
                                  &normalized))
      << PrintString32WithUnicodes(lone_matra);
  EXPECT_EQ(normalized, lone_matra);
}
|
|
|
|
// Checks glyph splitting of a consonant + nukta + virama + consonant string,
// and that a swapped nukta/virama order normalizes to the same string.
TEST(ValidateIndicTest, Nukta) {
  string str = "\u0c95\u0cbc\u0ccd\u0cb9";  // KA Nukta Virama HA
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: the indexed check below would otherwise read past the
  // end of the vector.
  ASSERT_EQ(glyphs.size(), 3u);
  EXPECT_EQ(glyphs[2], string("\u0ccd\u0cb9"));
  // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it.
  string str2 = "\u0c95\u0ccd\u0cbc\u0cb9";  // KA Virama Nukta HA
  ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str);
}
|
|
|
|
// Sinhala has some of its own specific rules. See www.macciato.com/sinhala
|
|
// Checks that KA + virama + ZWJ + Rayanna normalizes unchanged, splits into
// two glyphs, and may be followed by a dependent vowel.
TEST(ValidateIndicTest, SinhalaRakaransaya) {
  string str = "\u0d9a\u0dca\u200d\u0dbb";  // KA Virama ZWJ Rayanna
  string dest;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch so that glyphs[1] is never read out of bounds.
  ASSERT_EQ(glyphs.size(), 2u);
  EXPECT_EQ(glyphs[1], string("\u0dca\u200d\u0dbb"));
  // Can be followed by a dependent vowel.
  str += "\u0dd9";  // E
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
}
|
|
|
|
// Checks that KA + virama + ZWJ + Yayanna normalizes unchanged, with and
// without a trailing dependent vowel, and how the longer form splits.
TEST(ValidateIndicTest, SinhalaYansaya) {
  string str = "\u0d9a\u0dca\u200d\u0dba";  // KA Virama ZWJ Yayanna
  string dest;
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  // Can be followed by a dependent vowel.
  str += "\u0ddd";  // OO
  EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                  GraphemeNorm::kNormalize, str.c_str(), &dest))
      << PrintString32WithUnicodes(str);
  EXPECT_EQ(dest, str);
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch so that glyphs[1] is never read out of bounds.
  ASSERT_EQ(glyphs.size(), 3u);
  EXPECT_EQ(glyphs[1], string("\u0dca\u200d\u0dba"));
}
|
|
|
|
// Segments KA + Rayanna + virama + ZWJ + MA in both kCombined and kGlyphSplit
// modes and checks the second element produced by each.
TEST(ValidateIndicTest, SinhalaRepaya) {
  string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8";  // KA Rayanna Virama ZWJ MA
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
      str.c_str(), &glyphs));
  // Fatal on mismatch so that glyphs[1] is never read out of bounds.
  ASSERT_EQ(glyphs.size(), 2u);
  EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d\u0db8"));
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Same guard for the glyph-split result.
  ASSERT_EQ(glyphs.size(), 3u);
  EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d"));
}
|
|
|
|
// Checks the exact glyph-split segmentation of two Sinhala strings that
// contain repeated virama + ZWJ sequences.
TEST(ValidateIndicTest, SinhalaSpecials) {
  // Sinhala has some exceptions from the usual rules.
  string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d";
  std::vector<string> glyphs;
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: the per-index checks below assume exactly 5 entries.
  ASSERT_EQ(glyphs.size(), 5u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[0], string("\u0dc0"));
  EXPECT_EQ(glyphs[1], string("\u0d9c"));
  EXPECT_EQ(glyphs[2], string("\u0dca\u200d\u0dbb"));
  EXPECT_EQ(glyphs[3], string("\u0dca\u200d"));
  EXPECT_EQ(glyphs[4], string("\u0dbb\u0dca\u200d"));
  str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf";
  EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
      UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
      true, str.c_str(), &glyphs));
  // Fatal on mismatch: the per-index checks below assume exactly 4 entries.
  ASSERT_EQ(glyphs.size(), 4u) << PrintStringVectorWithUnicodes(glyphs);
  EXPECT_EQ(glyphs[0], string("\u0dc3"));
  EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d"));
  EXPECT_EQ(glyphs[2], string("\u0dbb\u0dca\u200d"));
  EXPECT_EQ(glyphs[3], string("\u0dcf"));
}
|
|
|
|
} // namespace
|
|
} // namespace tesseract
|