2014-01-10 02:01:34 +08:00
|
|
|
/**********************************************************************
 * File:        normstrngs.cpp
 * Description: Utilities to normalize and manipulate UTF-32 and
 *              UTF-8 strings.
 * Author:      Ranjith Unnikrishnan
 * Created:     Thu July 4 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "normstrngs.h"
|
|
|
|
|
2017-05-12 04:46:46 +08:00
|
|
|
#include <assert.h>
|
2017-07-15 01:05:05 +08:00
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <vector>
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "icuerrorcode.h"
|
|
|
|
#include "unichar.h"
|
|
|
|
#include "unicode/normalizer2.h" // From libicu
|
2014-01-10 02:01:34 +08:00
|
|
|
#include "unicode/translit.h" // From libicu
|
2017-07-15 00:30:14 +08:00
|
|
|
#include "unicode/uchar.h" // From libicu
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "unicode/unorm2.h" // From libicu
|
2017-07-15 00:30:14 +08:00
|
|
|
#include "unicode/uscript.h" // From libicu
|
2013-09-23 23:26:50 +08:00
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
|
|
|
// Returns true if ch is one of the hyphen-like punctuation codepoints
// enumerated below (ASCII hyphen-minus plus a fixed set of Unicode dashes
// and minus signs); returns false for every other value.
bool is_hyphen_punc(const char32 ch) {
  switch (ch) {
    case '-':     // hyphen-minus
    case 0x2010:  // hyphen
    case 0x2011:  // non-breaking hyphen
    case 0x2012:  // figure dash
    case 0x2013:  // en dash
    case 0x2014:  // em dash
    case 0x2015:  // horizontal bar
    case 0x207b:  // superscript minus
    case 0x208b:  // subscript minus
    case 0x2212:  // minus sign
    case 0xfe58:  // small em dash
    case 0xfe63:  // small hyphen-minus
    case 0xff0d:  // fullwidth hyphen-minus
      return true;
    default:
      return false;
  }
}
|
|
|
|
|
|
|
|
// Returns true if ch is one of the single-quote-like codepoints enumerated
// below; returns false for every other value.
bool is_single_quote(const char32 ch) {
  switch (ch) {
    case '\'':    // apostrophe
    case '`':     // grave accent
    case 0x2018:  // left single quotation mark (English, others)
    case 0x2019:  // right single quotation mark (Danish, Finnish, Swedish,
                  // Norw.)
    // We may have to introduce a comma set with 0x201a
    case 0x201B:  // single high-reversed-9 quotation mark (PropList.txt)
    case 0x2032:  // prime
    case 0x300C:  // left corner bracket (East Asian languages)
    case 0xFF07:  // fullwidth apostrophe
      return true;
    default:
      return false;
  }
}
|
|
|
|
|
|
|
|
// Returns true if ch is one of the double-quote-like codepoints enumerated
// below; returns false for every other value.
bool is_double_quote(const char32 ch) {
  switch (ch) {
    case '"':     // quotation mark
    case 0x201C:  // left double quotation mark (English, others)
    case 0x201D:  // right double quotation mark (Danish, Finnish, Swedish,
                  // Norw.)
    case 0x201F:  // double high-reversed-9 quotation mark (PropList.txt)
    case 0x2033:  // double prime
    case 0x301D:  // reversed double prime quotation mark (East Asian langs,
                  // horiz.)
    case 0x301E:  // close double prime (East Asian languages written
                  // horizontally)
    case 0xFF02:  // fullwidth quotation mark
      return true;
    default:
      return false;
  }
}
|
|
|
|
|
2017-07-15 01:05:05 +08:00
|
|
|
// Helper runs a standard unicode normalization, optional OCR normalization,
|
|
|
|
// and leaves the result as char32 for subsequent processing.
|
|
|
|
static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
|
|
|
|
const char* str8,
|
|
|
|
std::vector<char32>* normed32) {
|
|
|
|
// Convert to ICU string for unicode normalization.
|
|
|
|
icu::UnicodeString uch_str(str8, "UTF-8");
|
2013-09-23 23:26:50 +08:00
|
|
|
IcuErrorCode error_code;
|
2017-07-15 01:05:05 +08:00
|
|
|
// Convert the enum to the new weird icu representation.
|
|
|
|
const char* norm_type =
|
|
|
|
u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC
|
|
|
|
? "nfkc"
|
|
|
|
: "nfc";
|
|
|
|
UNormalization2Mode compose =
|
|
|
|
u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
|
|
|
|
? UNORM2_COMPOSE
|
|
|
|
: UNORM2_DECOMPOSE;
|
|
|
|
// Pointer to singleton does not require deletion.
|
|
|
|
const icu::Normalizer2* normalizer =
|
|
|
|
icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
|
2013-09-23 23:26:50 +08:00
|
|
|
error_code.assertSuccess();
|
|
|
|
error_code.reset();
|
2017-07-15 01:05:05 +08:00
|
|
|
icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
|
2013-09-23 23:26:50 +08:00
|
|
|
error_code.assertSuccess();
|
2017-07-15 01:05:05 +08:00
|
|
|
// Convert to char32 for output. OCR normalization if required.
|
|
|
|
normed32->reserve(norm_str.length()); // An approximation.
|
|
|
|
for (int offset = 0; offset < norm_str.length();
|
|
|
|
offset = norm_str.moveIndex32(offset, 1)) {
|
|
|
|
char32 ch = norm_str.char32At(offset);
|
|
|
|
// Skip all ZWS, RTL and LTR marks.
|
|
|
|
if (Validator::IsZeroWidthMark(ch)) continue;
|
|
|
|
if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch);
|
|
|
|
normed32->push_back(ch);
|
|
|
|
}
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
|
2017-07-15 01:05:05 +08:00
|
|
|
// Helper removes joiners from strings that contain no letters.
|
|
|
|
static void StripJoiners(std::vector<char32>* str32) {
|
|
|
|
for (char32 ch : *str32) {
|
|
|
|
if (u_isalpha(ch)) return;
|
|
|
|
}
|
|
|
|
int len = 0;
|
|
|
|
for (char32 ch : *str32) {
|
|
|
|
if (ch != Validator::kZeroWidthJoiner &&
|
|
|
|
ch != Validator::kZeroWidthNonJoiner) {
|
|
|
|
(*str32)[len++] = ch;
|
2013-09-23 23:26:50 +08:00
|
|
|
}
|
|
|
|
}
|
2017-07-15 01:05:05 +08:00
|
|
|
str32->resize(len);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Normalizes a UTF8 string according to the given modes. Returns true on
|
|
|
|
// success. If false is returned, some failure or invalidity was present, and
|
|
|
|
// the result string is produced on a "best effort" basis.
|
|
|
|
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
|
|
|
|
GraphemeNorm grapheme_normalize, const char* str8,
|
|
|
|
string* normalized) {
|
|
|
|
std::vector<char32> normed32;
|
|
|
|
NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
|
|
|
|
if (grapheme_normalize == GraphemeNorm::kNormalize) {
|
|
|
|
StripJoiners(&normed32);
|
|
|
|
std::vector<std::vector<char32>> graphemes;
|
|
|
|
bool success = Validator::ValidateCleanAndSegment(
|
|
|
|
GraphemeNormMode::kSingleString, false, normed32, &graphemes);
|
|
|
|
if (graphemes.empty() || graphemes[0].empty()) {
|
|
|
|
success = false;
|
|
|
|
} else if (normalized != nullptr) {
|
|
|
|
*normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
|
|
|
|
}
|
|
|
|
return success;
|
|
|
|
}
|
|
|
|
if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Normalizes a UTF8 string according to the given modes and splits into
|
|
|
|
// graphemes according to g_mode. Returns true on success. If false is returned,
|
|
|
|
// some failure or invalidity was present, and the result string is produced on
|
|
|
|
// a "best effort" basis.
|
|
|
|
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
|
|
|
|
GraphemeNormMode g_mode, bool report_errors,
|
|
|
|
const char* str8,
|
|
|
|
std::vector<string>* graphemes) {
|
|
|
|
std::vector<char32> normed32;
|
|
|
|
NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
|
|
|
|
StripJoiners(&normed32);
|
|
|
|
std::vector<std::vector<char32>> graphemes32;
|
|
|
|
bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
|
|
|
|
normed32, &graphemes32);
|
|
|
|
if (g_mode != GraphemeNormMode::kSingleString && success) {
|
|
|
|
// If we modified the string to clean it up, the segmentation may not be
|
|
|
|
// correct, so check for changes and do it again.
|
|
|
|
std::vector<char32> cleaned32;
|
|
|
|
for (const auto& g : graphemes32) {
|
|
|
|
cleaned32.insert(cleaned32.end(), g.begin(), g.end());
|
|
|
|
}
|
|
|
|
if (cleaned32 != normed32) {
|
|
|
|
graphemes32.clear();
|
|
|
|
success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
|
|
|
|
cleaned32, &graphemes32);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
graphemes->clear();
|
|
|
|
graphemes->reserve(graphemes32.size());
|
|
|
|
for (const auto& grapheme : graphemes32) {
|
|
|
|
graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme));
|
|
|
|
}
|
|
|
|
return success;
|
2013-09-23 23:26:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Apply just the OCR-specific normalizations and return the normalized char.
|
|
|
|
char32 OCRNormalize(char32 ch) {
|
|
|
|
if (is_hyphen_punc(ch))
|
|
|
|
return '-';
|
|
|
|
else if (is_single_quote(ch))
|
|
|
|
return '\'';
|
|
|
|
else if (is_double_quote(ch))
|
|
|
|
return '"';
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if ch1 and ch2 map to the same codepoint under OCRNormalize().
bool IsOCREquivalent(char32 ch1, char32 ch2) {
  const char32 canonical1 = OCRNormalize(ch1);
  const char32 canonical2 = OCRNormalize(ch2);
  return canonical1 == canonical2;
}
|
|
|
|
|
2014-01-10 02:01:34 +08:00
|
|
|
// Returns true if ch lies in [0, 0xD800) or [0xE000, 0x10FFFF], i.e. it is
// within the Unicode range and is not a surrogate.
bool IsValidCodepoint(const char32 ch) {
  // The unsigned comparison also rejects negative values in the low range.
  if (static_cast<uint32_t>(ch) < 0xD800) return true;
  return ch >= 0xE000 && ch <= 0x10FFFF;
}
|
|
|
|
|
|
|
|
bool IsWhitespace(const char32 ch) {
|
2017-07-15 00:30:14 +08:00
|
|
|
ASSERT_HOST_MSG(IsValidCodepoint(ch), "Invalid Unicode codepoint: 0x%x\n",
|
|
|
|
ch);
|
2014-01-10 02:01:34 +08:00
|
|
|
return u_isUWhiteSpace(static_cast<UChar32>(ch));
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsUTF8Whitespace(const char* text) {
|
|
|
|
return SpanUTF8Whitespace(text) == strlen(text);
|
|
|
|
}
|
|
|
|
|
2017-07-15 00:30:14 +08:00
|
|
|
unsigned int SpanUTF8Whitespace(const char* text) {
|
2014-01-10 02:01:34 +08:00
|
|
|
int n_white = 0;
|
|
|
|
for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
|
2017-07-15 00:30:14 +08:00
|
|
|
it != UNICHAR::end(text, strlen(text)); ++it) {
|
2014-01-10 02:01:34 +08:00
|
|
|
if (!IsWhitespace(*it)) break;
|
|
|
|
n_white += it.utf8_len();
|
|
|
|
}
|
|
|
|
return n_white;
|
|
|
|
}
|
|
|
|
|
2017-07-15 00:30:14 +08:00
|
|
|
unsigned int SpanUTF8NotWhitespace(const char* text) {
|
2014-01-10 02:01:34 +08:00
|
|
|
int n_notwhite = 0;
|
|
|
|
for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
|
2017-07-15 00:30:14 +08:00
|
|
|
it != UNICHAR::end(text, strlen(text)); ++it) {
|
2014-01-10 02:01:34 +08:00
|
|
|
if (IsWhitespace(*it)) break;
|
|
|
|
n_notwhite += it.utf8_len();
|
|
|
|
}
|
|
|
|
return n_notwhite;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsInterchangeValid(const char32 ch) {
|
|
|
|
return IsValidCodepoint(ch) &&
|
2017-07-15 00:30:14 +08:00
|
|
|
!(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters.
|
|
|
|
!(ch >= 0xFFFE && ch <= 0xFFFF) && !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
|
|
|
|
!(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
|
|
|
|
!(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
|
|
|
|
!(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
|
|
|
|
!(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
|
|
|
|
!(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
|
|
|
|
!(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
|
|
|
|
!(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
|
|
|
|
!(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
|
|
|
|
!(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
|
|
|
|
!(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
|
|
|
|
!(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
|
|
|
|
!(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
|
|
|
|
!(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
|
|
|
|
!(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
|
|
|
|
!(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
|
|
|
|
(!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' ||
|
|
|
|
ch == '\f' || ch == '\t' || ch == '\r');
|
2014-01-10 02:01:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bool IsInterchangeValid7BitAscii(const char32 ch) {
|
2017-07-15 00:30:14 +08:00
|
|
|
return IsValidCodepoint(ch) && ch <= 128 &&
|
|
|
|
(!u_isISOControl(static_cast<UChar32>(ch)) || ch == '\n' ||
|
|
|
|
ch == '\f' || ch == '\t' || ch == '\r');
|
2014-01-10 02:01:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Converts a fullwidth codepoint to its halfwidth counterpart using ICU's
// "Fullwidth-Halfwidth" transliterator. Codepoints outside the
// U+FF00..U+FFEF block are returned unchanged, except U+3000, which is also
// transliterated. The fullwidth white parentheses U+FF5F/U+FF60 are mapped
// explicitly to U+2985/U+2986.
char32 FullwidthToHalfwidth(const char32 ch) {
  // Return unchanged if not in the fullwidth-halfwidth Unicode block
  // (U+3000 is the one codepoint outside the block that is still converted).
  const bool in_width_block =
      ch >= 0xFF00 && ch <= 0xFFEF && IsValidCodepoint(ch);
  if (!in_width_block && ch != 0x3000) return ch;

  // Special case for fullwidth left and right "white parentheses".
  switch (ch) {
    case 0xFF5F:
      return 0x2985;
    case 0xFF60:
      return 0x2986;
    default:
      break;
  }

  // Construct a full-to-half width transliterator. createInstance() hands
  // ownership to the caller, hence the delete below.
  IcuErrorCode status;
  icu::UnicodeString converted(static_cast<UChar32>(ch));
  const icu::Transliterator* to_halfwidth = icu::Transliterator::createInstance(
      "Fullwidth-Halfwidth", UTRANS_FORWARD, status);
  status.assertSuccess();
  status.reset();

  to_halfwidth->transliterate(converted);
  delete to_halfwidth;
  ASSERT_HOST(converted.length() != 0);
  // Only the first UTF-16 unit of the transliterated text is returned.
  return converted[0];
}
|
2013-09-23 23:26:50 +08:00
|
|
|
|
|
|
|
} // namespace tesseract
|