mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 22:43:45 +08:00
2b7df59187
They were provided by Jeff Breidenbach <jbreiden@google.com>. Signed-off-by: Stefan Weil <sw@weilnetz.de>
46 lines
1.9 KiB
C++
46 lines
1.9 KiB
C++
#include "tesseract/training/normstrngs.h"
|
|
|
|
#include "tesseract/unittest/normstrngs_test.h"
|
|
|
|
namespace tesseract {
|
|
namespace {
|
|
|
|
// Test some random Myanmar words.
|
|
TEST(ValidateMyanmarTest, GoodMyanmarWords) {
|
|
string str = "လျှာကသိသည် "; // No viramas in this one.
|
|
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 11, 5, str);
|
|
str = "တုန္လႈပ္မႈ ";
|
|
ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 11, 9, 4, str);
|
|
}
|
|
|
|
// Test some random Myanmar words with dotted circles.
|
|
TEST(ValidateMyanmarTest, BadMyanmarWords) {
|
|
string str = "က်န္းမာေရး";
|
|
std::vector<string> glyphs;
|
|
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
|
|
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
|
|
str.c_str(), &glyphs));
|
|
string result;
|
|
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
GraphemeNorm::kNormalize, str.c_str(),
|
|
&result));
|
|
// It works if the grapheme normalization is turned off.
|
|
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
GraphemeNorm::kNone, str.c_str(), &result));
|
|
EXPECT_EQ(str, result);
|
|
str = "ခုႏွစ္";
|
|
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
|
|
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
|
|
true, str.c_str(), &glyphs));
|
|
EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
GraphemeNorm::kNormalize, str.c_str(),
|
|
&result));
|
|
// It works if the grapheme normalization is turned off.
|
|
EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
GraphemeNorm::kNone, str.c_str(), &result));
|
|
EXPECT_EQ(str, result);
|
|
}
|
|
|
|
} // namespace
|
|
} // namespace tesseract
|