mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-12 07:29:07 +08:00
Merge pull request #2523 from stweil/unilib
unittest: Add unilib.h and other code from Tensorflow and use it for more tests
This commit is contained in:
commit
653faa3a64
@ -132,7 +132,6 @@ check_PROGRAMS += matrix_test
|
||||
check_PROGRAMS += nthitem_test
|
||||
check_PROGRAMS += osd_test
|
||||
# check_PROGRAMS += pagesegmode_test
|
||||
# check_PROGRAMS += pango_font_info_test
|
||||
check_PROGRAMS += paragraphs_test
|
||||
check_PROGRAMS += params_model_test
|
||||
check_PROGRAMS += progress_test
|
||||
@ -159,6 +158,7 @@ check_PROGRAMS += lstm_squashed_test
|
||||
check_PROGRAMS += lstm_test
|
||||
check_PROGRAMS += lstmtrainer_test
|
||||
check_PROGRAMS += normstrngs_test
|
||||
check_PROGRAMS += pango_font_info_test
|
||||
check_PROGRAMS += unichar_test
|
||||
check_PROGRAMS += unicharcompress_test
|
||||
check_PROGRAMS += unicharset_test
|
||||
@ -273,13 +273,22 @@ matrix_test_SOURCES = matrix_test.cc
|
||||
matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
|
||||
normstrngs_test_SOURCES = normstrngs_test.cc
|
||||
normstrngs_test_SOURCES += third_party/utf/rune.c util/utf8/unilib.cc
|
||||
normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)
|
||||
|
||||
nthitem_test_SOURCES = nthitem_test.cc
|
||||
nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
|
||||
#pango_font_info_test_SOURCES = pango_font_info_test.cc
|
||||
#pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
osd_test_SOURCES = osd_test.cc
|
||||
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
|
||||
|
||||
pango_font_info_test_SOURCES = pango_font_info_test.cc
|
||||
pango_font_info_test_SOURCES += third_party/utf/rune.c
|
||||
pango_font_info_test_SOURCES += util/utf8/unicodetext.cc util/utf8/unilib.cc
|
||||
pango_font_info_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(LEPTONICA_LIBS)
|
||||
pango_font_info_test_LDADD += $(ICU_I18N_LIBS) -lfontconfig
|
||||
pango_font_info_test_LDADD += -lpangocairo-1.0 -lpangoft2-1.0
|
||||
pango_font_info_test_LDADD += $(cairo_LIBS) $(pango_LIBS)
|
||||
|
||||
paragraphs_test_SOURCES = paragraphs_test.cc
|
||||
paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
|
||||
@ -287,9 +296,6 @@ paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
|
||||
params_model_test_SOURCES = params_model_test.cc
|
||||
params_model_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
|
||||
osd_test_SOURCES = osd_test.cc
|
||||
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
|
||||
|
||||
progress_test_SOURCES = progress_test.cc
|
||||
progress_test_LDFLAGS = $(OPENCL_LDFLAGS) $(LEPTONICA_LIBS)
|
||||
progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
|
||||
|
@ -319,7 +319,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
|
||||
const char* langs[] = {"eng", "chi_tra", "jpn", "vie"};
|
||||
std::unique_ptr<tesseract::TessBaseAPI> api;
|
||||
CycleTimer timer;
|
||||
for (int i = 0; i < ARRAYSIZE(langs); ++i) {
|
||||
for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
|
||||
api.reset(new tesseract::TessBaseAPI);
|
||||
timer.Restart();
|
||||
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
|
||||
@ -333,7 +333,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
|
||||
vars_vec.push_back(STRING("tessedit_init_config_only"));
|
||||
vars_values.push_back(STRING("1"));
|
||||
LOG(INFO) << "Switching to config only initialization:";
|
||||
for (int i = 0; i < ARRAYSIZE(langs); ++i) {
|
||||
for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
|
||||
api.reset(new tesseract::TessBaseAPI);
|
||||
timer.Restart();
|
||||
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
|
||||
|
@ -34,7 +34,7 @@ class HeapTest : public testing::Test {
|
||||
virtual ~HeapTest();
|
||||
// Pushes the test data onto both the heap and the KDVector.
|
||||
void PushTestData(GenericHeap<IntKDPair>* heap, KDVector* v) {
|
||||
for (int i = 0; i < ARRAYSIZE(test_data); ++i) {
|
||||
for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
|
||||
IntKDPair pair(test_data[i], i);
|
||||
heap->Push(&pair);
|
||||
v->push_back(pair);
|
||||
@ -137,7 +137,7 @@ TEST_F(HeapTest, RevalueTest) {
|
||||
GenericHeap<PtrPair> heap;
|
||||
GenericVector<PtrPair> v;
|
||||
// Push the test data onto both the heap and the vector.
|
||||
for (int i = 0; i < ARRAYSIZE(test_data); ++i) {
|
||||
for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
|
||||
PtrPair h_pair;
|
||||
h_pair.key = test_data[i];
|
||||
PtrPair v_pair;
|
||||
|
@ -15,16 +15,14 @@
|
||||
#include "normstrngs_test.h"
|
||||
#include "strngs.h"
|
||||
#include "unichar.h"
|
||||
#if defined(HAS_UNILIB_H)
|
||||
#include "unilib.h"
|
||||
#endif
|
||||
#include "util/utf8/unilib.h"
|
||||
|
||||
#include "include_gunit.h"
|
||||
|
||||
namespace tesseract {
|
||||
namespace {
|
||||
|
||||
#if defined(HAS_UNILIB_H)
|
||||
#if defined(MISSING_CODE)
|
||||
static std::string EncodeAsUTF8(const char32 ch32) {
|
||||
UNICHAR uni_ch(ch32);
|
||||
return std::string(uni_ch.utf8(), uni_ch.utf8_len());
|
||||
@ -363,7 +361,6 @@ TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
|
||||
EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));
|
||||
}
|
||||
|
||||
#if defined(HAS_UNILIB_H)
|
||||
// Test that the method clones the util/utf8/public/unilib definition of
|
||||
// interchange validity.
|
||||
TEST(NormstrngsTest, IsInterchangeValid) {
|
||||
@ -374,12 +371,11 @@ TEST(NormstrngsTest, IsInterchangeValid) {
|
||||
EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_UNILIB_H)
|
||||
// Test that the method clones the util/utf8/public/unilib definition of
|
||||
// 7-bit ASCII interchange validity.
|
||||
TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
|
||||
#if defined(MISSING_CODE)
|
||||
const int32_t kMinUnicodeValue = 33;
|
||||
const int32_t kMaxUnicodeValue = 0x10FFFF;
|
||||
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
|
||||
@ -388,8 +384,11 @@ TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
|
||||
EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str),
|
||||
IsInterchangeValid7BitAscii(ch));
|
||||
}
|
||||
}
|
||||
#else
|
||||
// Skipped because of missing UniLib::IsInterchangeValid7BitAscii.
|
||||
GTEST_SKIP();
|
||||
#endif
|
||||
}
|
||||
|
||||
// Test that the method clones the util/utf8/public/unilib definition of
|
||||
// fullwidth-halfwidth .
|
||||
@ -401,7 +400,8 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) {
|
||||
// U+FFE6 -> U+20A9 (won sign)
|
||||
EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));
|
||||
|
||||
#if defined(HAS_UNILIB_H)
|
||||
#if defined(MISSING_CODE)
|
||||
// Skipped because of missing UniLib::FullwidthToHalfwidth.
|
||||
const int32_t kMinUnicodeValue = 33;
|
||||
const int32_t kMaxUnicodeValue = 0x10FFFF;
|
||||
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
|
||||
|
@ -1,12 +1,24 @@
|
||||
// (C) Copyright 2017, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "tesseract/training/pango_font_info.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "pango/pango.h"
|
||||
#include "tesseract/training/commandlineflags.h"
|
||||
#include "tesseract/training/fileio.h"
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <pango/pango.h>
|
||||
#include "include_gunit.h"
|
||||
#include "commandlineflags.h"
|
||||
#include "fileio.h"
|
||||
#include "pango_font_info.h"
|
||||
#include "absl/strings/str_cat.h" // for absl::StrCat
|
||||
#include "gmock/gmock-matchers.h" // for EXPECT_THAT
|
||||
#include "util/utf8/unicodetext.h" // for UnicodeText
|
||||
|
||||
DECLARE_STRING_PARAM_FLAG(fonts_dir);
|
||||
DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
|
||||
@ -19,19 +31,19 @@ using tesseract::FontUtils;
|
||||
using tesseract::PangoFontInfo;
|
||||
|
||||
// Fonts in testdata directory
|
||||
const char* kExpectedFontNames[] = {"Arab",
|
||||
"Arial Bold Italic",
|
||||
"DejaVu Sans Ultra-Light",
|
||||
"Lohit Hindi",
|
||||
const char* kExpectedFontNames[] = {
|
||||
"Arab",
|
||||
"Arial Bold Italic",
|
||||
"DejaVu Sans Ultra-Light",
|
||||
"Lohit Hindi",
|
||||
#if PANGO_VERSION <= 12005
|
||||
"Times New Roman",
|
||||
"Times New Roman",
|
||||
#else
|
||||
"Times New Roman,", // Pango v1.36.2
|
||||
// requires a trailing
|
||||
// ','
|
||||
"Times New Roman,", // Pango v1.36.2 requires a trailing ','
|
||||
#endif
|
||||
"UnBatang",
|
||||
"Verdana"};
|
||||
"UnBatang",
|
||||
"Verdana"
|
||||
};
|
||||
|
||||
// Sample text used in tests.
|
||||
const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
|
||||
@ -41,23 +53,27 @@ const char kKorText[] = "이는 것으로";
|
||||
// Hindi words containing illegal vowel sequences.
|
||||
const char* kBadlyFormedHinWords[] = {
|
||||
#if PANGO_VERSION <= 12005
|
||||
"उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
|
||||
"उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
|
||||
#endif
|
||||
// Pango v1.36.2 will render the above words even though they are invalid.
|
||||
"प्रंात", nullptr};
|
||||
// Pango v1.36.2 will render the above words even though they are invalid.
|
||||
"प्रंात", nullptr
|
||||
};
|
||||
|
||||
class PangoFontInfoTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {
|
||||
std::locale::global(std::locale(""));
|
||||
static std::locale system_locale("");
|
||||
std::locale::global(system_locale);
|
||||
}
|
||||
|
||||
// Creates a fake fonts.conf file that points to the testdata fonts for
|
||||
// fontconfig to initialize with.
|
||||
static void SetUpTestCase() {
|
||||
FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
|
||||
FLAGS_fonts_dir = TESTING_DIR;
|
||||
FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
|
||||
#ifdef GOOGLE_TESSERACT
|
||||
FLAGS_use_only_legacy_fonts = false;
|
||||
#endif
|
||||
}
|
||||
|
||||
PangoFontInfo font_info_;
|
||||
@ -120,7 +136,7 @@ TEST_F(PangoFontInfoTest, CanRenderLigature) {
|
||||
font_info_.ParseFontDescriptionName("Arab 12");
|
||||
const char kArabicLigature[] = "لا";
|
||||
EXPECT_TRUE(
|
||||
font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
|
||||
font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
|
||||
|
||||
printf("Next word\n");
|
||||
EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
|
||||
@ -143,17 +159,17 @@ TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
|
||||
TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
|
||||
font_info_.ParseFontDescriptionName("Verdana 12");
|
||||
// Verdana cannot render the "ff" ligature
|
||||
string word = "office";
|
||||
std::string word = "office";
|
||||
EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
|
||||
EXPECT_EQ("oice", word);
|
||||
|
||||
// Don't drop non-letter characters like word joiners.
|
||||
const char* kJoiners[] = {
|
||||
"\u2060", // U+2060 (WJ)
|
||||
"\u200C", // U+200C (ZWJ)
|
||||
"\u200D" // U+200D (ZWNJ)
|
||||
"\u2060", // U+2060 (WJ)
|
||||
"\u200C", // U+200C (ZWNJ)
|
||||
"\u200D" // U+200D (ZWJ)
|
||||
};
|
||||
for (int i = 0; i < ARRAYSIZE(kJoiners); ++i) {
|
||||
for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) {
|
||||
word = kJoiners[i];
|
||||
EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
|
||||
EXPECT_STREQ(kJoiners[i], word.c_str());
|
||||
@ -167,17 +183,21 @@ class FontUtilsTest : public ::testing::Test {
|
||||
// Creates a fake fonts.conf file that points to the testdata fonts for
|
||||
// fontconfig to initialize with.
|
||||
static void SetUpTestCase() {
|
||||
FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
|
||||
FLAGS_fonts_dir = TESTING_DIR;
|
||||
FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
|
||||
}
|
||||
|
||||
void CountUnicodeChars(const char* utf8_text,
|
||||
std::unordered_map<char32, inT64>* ch_map) {
|
||||
std::unordered_map<char32, int64_t>* ch_map) {
|
||||
ch_map->clear();
|
||||
UnicodeText ut;
|
||||
ut.PointToUTF8(utf8_text, strlen(utf8_text));
|
||||
for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
|
||||
#if 0
|
||||
if (UnicodeProps::IsWhitespace(*it)) continue;
|
||||
#else
|
||||
if (std::isspace(*it)) continue;
|
||||
#endif
|
||||
++(*ch_map)[*it];
|
||||
}
|
||||
}
|
||||
@ -206,21 +226,21 @@ TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
|
||||
}
|
||||
|
||||
TEST_F(FontUtilsTest, DoesListAvailableFonts) {
|
||||
const std::vector<string>& fonts = FontUtils::ListAvailableFonts();
|
||||
const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts();
|
||||
EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
|
||||
for (int i = 0; i < fonts.size(); ++i) {
|
||||
for (auto& font : fonts) {
|
||||
PangoFontInfo font_info;
|
||||
EXPECT_TRUE(font_info.ParseFontDescriptionName(fonts[i]));
|
||||
EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(FontUtilsTest, DoesFindBestFonts) {
|
||||
string fonts_list;
|
||||
std::unordered_map<char32, inT64> ch_map;
|
||||
std::string fonts_list;
|
||||
std::unordered_map<char32, int64_t> ch_map;
|
||||
CountUnicodeChars(kEngText, &ch_map);
|
||||
EXPECT_EQ(26, ch_map.size()); // 26 letters
|
||||
std::vector<std::pair<const char*, std::vector<bool> > > font_flags;
|
||||
string best_list = FontUtils::BestFonts(ch_map, &font_flags);
|
||||
std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
|
||||
EXPECT_TRUE(best_list.size());
|
||||
// All fonts except Lohit Hindi should render English text.
|
||||
EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size());
|
||||
@ -238,8 +258,8 @@ TEST_F(FontUtilsTest, DoesSelectFont) {
|
||||
const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
|
||||
for (int i = 0; kLangText[i] != nullptr; ++i) {
|
||||
SCOPED_TRACE(kLangNames[i]);
|
||||
std::vector<string> graphemes;
|
||||
string selected_font;
|
||||
std::vector<std::string> graphemes;
|
||||
std::string selected_font;
|
||||
EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]),
|
||||
&selected_font, &graphemes));
|
||||
EXPECT_TRUE(selected_font.size());
|
||||
@ -249,17 +269,17 @@ TEST_F(FontUtilsTest, DoesSelectFont) {
|
||||
|
||||
TEST_F(FontUtilsTest, DoesFailToSelectFont) {
|
||||
const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
|
||||
std::vector<string> graphemes;
|
||||
string selected_font;
|
||||
std::vector<std::string> graphemes;
|
||||
std::string selected_font;
|
||||
EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText),
|
||||
&selected_font, &graphemes));
|
||||
}
|
||||
|
||||
TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
|
||||
const int32 kHindiChar = 0x0905;
|
||||
const int32 kArabicChar = 0x0623;
|
||||
const int32 kMongolianChar = 0x180E; // Mongolian vowel separator
|
||||
const int32 kOghamChar = 0x1680; // Ogham space mark
|
||||
const int32_t kHindiChar = 0x0905;
|
||||
const int32_t kArabicChar = 0x0623;
|
||||
const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator
|
||||
const int32_t kOghamChar = 0x1680; // Ogham space mark
|
||||
std::vector<bool> unicode_mask;
|
||||
FontUtils::GetAllRenderableCharacters(&unicode_mask);
|
||||
EXPECT_TRUE(unicode_mask['A']);
|
||||
@ -267,10 +287,12 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
|
||||
EXPECT_TRUE(unicode_mask[kHindiChar]);
|
||||
EXPECT_TRUE(unicode_mask[kArabicChar]);
|
||||
EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian.
|
||||
#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
|
||||
EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham.
|
||||
#endif
|
||||
unicode_mask.clear();
|
||||
|
||||
std::vector<string> selected_fonts;
|
||||
std::vector<std::string> selected_fonts;
|
||||
selected_fonts.push_back("Lohit Hindi");
|
||||
FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
|
||||
EXPECT_TRUE(unicode_mask['1']);
|
||||
@ -279,14 +301,18 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
|
||||
EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic,
|
||||
EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian,
|
||||
EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham.
|
||||
unicode_mask.clear();
|
||||
|
||||
// Check that none of the included fonts cover the Mongolian or Ogham space
|
||||
// characters.
|
||||
for (int f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
|
||||
for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
|
||||
SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f]));
|
||||
FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);
|
||||
#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
|
||||
EXPECT_FALSE(unicode_mask[kOghamChar]);
|
||||
#endif
|
||||
EXPECT_FALSE(unicode_mask[kMongolianChar]);
|
||||
unicode_mask.clear();
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
61
unittest/syntaxnet/base.h
Normal file
61
unittest/syntaxnet/base.h
Normal file
@ -0,0 +1,61 @@
|
||||
/* Copyright 2016 Google Inc. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef SYNTAXNET_BASE_H_
|
||||
#define SYNTAXNET_BASE_H_
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "google/protobuf/util/message_differencer.h"
|
||||
|
||||
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/lib/strings/strcat.h"
|
||||
#include "tensorflow/core/lib/strings/stringprintf.h"
|
||||
#include "tensorflow/core/platform/default/integral_types.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/protobuf.h"
|
||||
|
||||
|
||||
|
||||
using tensorflow::int8;
|
||||
using tensorflow::int16;
|
||||
using tensorflow::int32;
|
||||
using tensorflow::int64;
|
||||
using tensorflow::uint8;
|
||||
using tensorflow::uint16;
|
||||
using tensorflow::uint64;
|
||||
using tensorflow::uint32;
|
||||
using tensorflow::protobuf::TextFormat;
|
||||
using tensorflow::mutex_lock;
|
||||
using tensorflow::mutex;
|
||||
using std::map;
|
||||
using std::pair;
|
||||
using std::vector;
|
||||
using std::unordered_map;
|
||||
using std::unordered_set;
|
||||
typedef signed int char32;
|
||||
|
||||
using tensorflow::StringPiece;
|
||||
using std::string;
|
||||
|
||||
|
||||
// namespace syntaxnet
|
||||
|
||||
#endif // SYNTAXNET_BASE_H_
|
357
unittest/third_party/utf/rune.c
vendored
Normal file
357
unittest/third_party/utf/rune.c
vendored
Normal file
@ -0,0 +1,357 @@
|
||||
/*
|
||||
* The authors of this software are Rob Pike and Ken Thompson.
|
||||
* Copyright (c) 2002 by Lucent Technologies.
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose without fee is hereby granted, provided that this entire notice
|
||||
* is included in all copies of any software which is or includes a copy
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include "third_party/utf/utf.h"
|
||||
#include "third_party/utf/utfdef.h"
|
||||
|
||||
enum
|
||||
{
|
||||
Bit1 = 7,
|
||||
Bitx = 6,
|
||||
Bit2 = 5,
|
||||
Bit3 = 4,
|
||||
Bit4 = 3,
|
||||
Bit5 = 2,
|
||||
|
||||
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
||||
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
||||
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
||||
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
||||
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
||||
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
||||
|
||||
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
||||
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
||||
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
||||
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
||||
/* 0001 1111 1111 1111 1111 1111 */
|
||||
|
||||
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
||||
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
||||
|
||||
Bad = Runeerror,
|
||||
};
|
||||
|
||||
/*
|
||||
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
|
||||
* This is a slower but "safe" version of the old chartorune
|
||||
* that works on strings that are not necessarily null-terminated.
|
||||
*
|
||||
* If you know for sure that your string is null-terminated,
|
||||
* chartorune will be a bit faster.
|
||||
*
|
||||
* It is guaranteed not to attempt to access "length"
|
||||
* past the incoming pointer. This is to avoid
|
||||
* possible access violations. If the string appears to be
|
||||
* well-formed but incomplete (i.e., to get the whole Rune
|
||||
* we'd need to read past str+length) then we'll set the Rune
|
||||
* to Bad and return 0.
|
||||
*
|
||||
* Note that if we have decoding problems for other
|
||||
* reasons, we return 1 instead of 0.
|
||||
*/
|
||||
int
|
||||
charntorune(Rune *rune, const char *str, int length)
|
||||
{
|
||||
int c, c1, c2, c3;
|
||||
long l;
|
||||
|
||||
/* When we're not allowed to read anything */
|
||||
if(length <= 0) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* one character sequence (7-bit value)
|
||||
* 00000-0007F => T1
|
||||
*/
|
||||
c = *(uchar*)str;
|
||||
if(c < Tx) {
|
||||
*rune = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// If we can't read more than one character we must stop
|
||||
if(length <= 1) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* two character sequence (11-bit value)
|
||||
* 0080-07FF => T2 Tx
|
||||
*/
|
||||
c1 = *(uchar*)(str+1) ^ Tx;
|
||||
if(c1 & Testx)
|
||||
goto bad;
|
||||
if(c < T3) {
|
||||
if(c < T2)
|
||||
goto bad;
|
||||
l = ((c << Bitx) | c1) & Rune2;
|
||||
if(l <= Rune1)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 2;
|
||||
}
|
||||
|
||||
// If we can't read more than two characters we must stop
|
||||
if(length <= 2) {
|
||||
goto badlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* three character sequence (16-bit value)
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
c2 = *(uchar*)(str+2) ^ Tx;
|
||||
if(c2 & Testx)
|
||||
goto bad;
|
||||
if(c < T4) {
|
||||
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||
if(l <= Rune2)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (length <= 3)
|
||||
goto badlen;
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
c3 = *(uchar*)(str+3) ^ Tx;
|
||||
if (c3 & Testx)
|
||||
goto bad;
|
||||
if (c < T5) {
|
||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||
if (l <= Rune3)
|
||||
goto bad;
|
||||
if (l > Runemax)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 4;
|
||||
}
|
||||
|
||||
// Support for 5-byte or longer UTF-8 would go here, but
|
||||
// since we don't have that, we'll just fall through to bad.
|
||||
|
||||
/*
|
||||
* bad decoding
|
||||
*/
|
||||
bad:
|
||||
*rune = Bad;
|
||||
return 1;
|
||||
badlen:
|
||||
*rune = Bad;
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This is the older "unsafe" version, which works fine on
|
||||
* null-terminated strings.
|
||||
*/
|
||||
int
|
||||
chartorune(Rune *rune, const char *str)
|
||||
{
|
||||
int c, c1, c2, c3;
|
||||
long l;
|
||||
|
||||
/*
|
||||
* one character sequence
|
||||
* 00000-0007F => T1
|
||||
*/
|
||||
c = *(uchar*)str;
|
||||
if(c < Tx) {
|
||||
*rune = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* two character sequence
|
||||
* 0080-07FF => T2 Tx
|
||||
*/
|
||||
c1 = *(uchar*)(str+1) ^ Tx;
|
||||
if(c1 & Testx)
|
||||
goto bad;
|
||||
if(c < T3) {
|
||||
if(c < T2)
|
||||
goto bad;
|
||||
l = ((c << Bitx) | c1) & Rune2;
|
||||
if(l <= Rune1)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* three character sequence
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
c2 = *(uchar*)(str+2) ^ Tx;
|
||||
if(c2 & Testx)
|
||||
goto bad;
|
||||
if(c < T4) {
|
||||
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
||||
if(l <= Rune2)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
c3 = *(uchar*)(str+3) ^ Tx;
|
||||
if (c3 & Testx)
|
||||
goto bad;
|
||||
if (c < T5) {
|
||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
||||
if (l <= Rune3)
|
||||
goto bad;
|
||||
if (l > Runemax)
|
||||
goto bad;
|
||||
*rune = l;
|
||||
return 4;
|
||||
}
|
||||
|
||||
/*
|
||||
* Support for 5-byte or longer UTF-8 would go here, but
|
||||
* since we don't have that, we'll just fall through to bad.
|
||||
*/
|
||||
|
||||
/*
|
||||
* bad decoding
|
||||
*/
|
||||
bad:
|
||||
*rune = Bad;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int
|
||||
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
|
||||
*consumed = charntorune(rune, str, length);
|
||||
return *rune != Runeerror || *consumed == 3;
|
||||
}
|
||||
|
||||
int
|
||||
runetochar(char *str, const Rune *rune)
|
||||
{
|
||||
/* Runes are signed, so convert to unsigned for range check. */
|
||||
unsigned long c;
|
||||
|
||||
/*
|
||||
* one character sequence
|
||||
* 00000-0007F => 00-7F
|
||||
*/
|
||||
c = *rune;
|
||||
if(c <= Rune1) {
|
||||
str[0] = c;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* two character sequence
|
||||
* 0080-07FF => T2 Tx
|
||||
*/
|
||||
if(c <= Rune2) {
|
||||
str[0] = T2 | (c >> 1*Bitx);
|
||||
str[1] = Tx | (c & Maskx);
|
||||
return 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the Rune is out of range, convert it to the error rune.
|
||||
* Do this test here because the error rune encodes to three bytes.
|
||||
* Doing it earlier would duplicate work, since an out of range
|
||||
* Rune wouldn't have fit in one or two bytes.
|
||||
*/
|
||||
if (c > Runemax)
|
||||
c = Runeerror;
|
||||
|
||||
/*
|
||||
* three character sequence
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
if (c <= Rune3) {
|
||||
str[0] = T3 | (c >> 2*Bitx);
|
||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[2] = Tx | (c & Maskx);
|
||||
return 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
str[0] = T4 | (c >> 3*Bitx);
|
||||
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[3] = Tx | (c & Maskx);
|
||||
return 4;
|
||||
}
|
||||
|
||||
int
|
||||
runelen(Rune rune)
|
||||
{
|
||||
char str[10];
|
||||
|
||||
return runetochar(str, &rune);
|
||||
}
|
||||
|
||||
int
|
||||
runenlen(const Rune *r, int nrune)
|
||||
{
|
||||
int nb;
|
||||
ulong c; /* Rune is signed, so use unsigned for range check. */
|
||||
|
||||
nb = 0;
|
||||
while(nrune--) {
|
||||
c = *r++;
|
||||
if (c <= Rune1)
|
||||
nb++;
|
||||
else if (c <= Rune2)
|
||||
nb += 2;
|
||||
else if (c <= Rune3)
|
||||
nb += 3;
|
||||
else if (c <= Runemax)
|
||||
nb += 4;
|
||||
else
|
||||
nb += 3; /* Runeerror = 0xFFFD, see runetochar */
|
||||
}
|
||||
return nb;
|
||||
}
|
||||
|
||||
int
|
||||
fullrune(const char *str, int n)
|
||||
{
|
||||
if (n > 0) {
|
||||
int c = *(uchar*)str;
|
||||
if (c < Tx)
|
||||
return 1;
|
||||
if (n > 1) {
|
||||
if (c < T3)
|
||||
return 1;
|
||||
if (n > 2) {
|
||||
if (c < T4 || n > 3)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
246
unittest/third_party/utf/utf.h
vendored
Normal file
246
unittest/third_party/utf/utf.h
vendored
Normal file
@ -0,0 +1,246 @@
|
||||
/*
|
||||
* The authors of this software are Rob Pike and Ken Thompson.
|
||||
* Copyright (c) 2002 by Lucent Technologies.
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose without fee is hereby granted, provided that this entire notice
|
||||
* is included in all copies of any software which is or includes a copy
|
||||
* or modification of this software and in all copies of the supporting
|
||||
* documentation for such software.
|
||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
#ifndef _UTFH_
|
||||
#define _UTFH_ 1
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
|
||||
|
||||
enum
|
||||
{
|
||||
UTFmax = 4, /* maximum bytes per rune */
|
||||
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
|
||||
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
||||
Runeerror = 0xFFFD, /* decoding error in UTF */
|
||||
Runemax = 0x10FFFF, /* maximum rune value */
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
* rune routines
|
||||
*/
|
||||
|
||||
/*
|
||||
* These routines were written by Rob Pike and Ken Thompson
|
||||
* and first appeared in Plan 9.
|
||||
* SEE ALSO
|
||||
* utf (7)
|
||||
* tcs (1)
|
||||
*/
|
||||
|
||||
// runetochar copies (encodes) one rune, pointed to by r, to at most
|
||||
// UTFmax bytes starting at s and returns the number of bytes generated.
|
||||
|
||||
int runetochar(char* s, const Rune* r);
|
||||
|
||||
|
||||
// chartorune copies (decodes) at most UTFmax bytes starting at s to
|
||||
// one rune, pointed to by r, and returns the number of bytes consumed.
|
||||
// If the input is not exactly in UTF format, chartorune will set *r
|
||||
// to Runeerror and return 1.
|
||||
//
|
||||
// Note: There is no special case for a "null-terminated" string. A
|
||||
// string whose first byte has the value 0 is the UTF8 encoding of the
|
||||
// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal
|
||||
// anywhere else in a UTF sequence.
|
||||
|
||||
int chartorune(Rune* r, const char* s);
|
||||
|
||||
|
||||
// charntorune is like chartorune, except that it will access at most
|
||||
// n bytes of s. If the UTF sequence is incomplete within n bytes,
|
||||
// charntorune will set *r to Runeerror and return 0. If it is complete
|
||||
// but not in UTF format, it will set *r to Runeerror and return 1.
|
||||
//
|
||||
// Added 2004-09-24 by Wei-Hwa Huang
|
||||
|
||||
int charntorune(Rune* r, const char* s, int n);
|
||||
|
||||
// isvalidcharntorune(str, n, r, consumed)
|
||||
// is a convenience function that calls "*consumed = charntorune(r, str, n)"
|
||||
// and returns an int (logically boolean) indicating whether the first
|
||||
// n bytes of str was a valid and complete UTF sequence.
|
||||
|
||||
int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
|
||||
|
||||
// runelen returns the number of bytes required to convert r into UTF.
|
||||
|
||||
int runelen(Rune r);
|
||||
|
||||
|
||||
// runenlen returns the number of bytes required to convert the n
|
||||
// runes pointed to by r into UTF.
|
||||
|
||||
int runenlen(const Rune* r, int n);
|
||||
|
||||
|
||||
// fullrune returns 1 if the string s of length n is long enough to be
|
||||
// decoded by chartorune, and 0 otherwise. This does not guarantee
|
||||
// that the string contains a legal UTF encoding. This routine is used
|
||||
// by programs that obtain input one byte at a time and need to know
|
||||
// when a full rune has arrived.
|
||||
|
||||
int fullrune(const char* s, int n);
|
||||
|
||||
// The following routines are analogous to the corresponding string
|
||||
// routines with "utf" substituted for "str", and "rune" substituted
|
||||
// for "chr".
|
||||
|
||||
// utflen returns the number of runes that are represented by the UTF
|
||||
// string s. (cf. strlen)
|
||||
|
||||
int utflen(const char* s);
|
||||
|
||||
|
||||
// utfnlen returns the number of complete runes that are represented
|
||||
// by the first n bytes of the UTF string s. If the last few bytes of
|
||||
// the string contain an incompletely coded rune, utfnlen will not
|
||||
// count them; in this way, it differs from utflen, which includes
|
||||
// every byte of the string. (cf. strnlen)
|
||||
|
||||
int utfnlen(const char* s, long n);
|
||||
|
||||
|
||||
// utfrune returns a pointer to the first occurrence of rune r in the
|
||||
// UTF string s, or 0 if r does not occur in the string. The NULL
|
||||
// byte terminating a string is considered to be part of the string s.
|
||||
// (cf. strchr)
|
||||
|
||||
const char* utfrune(const char* s, Rune r);
|
||||
|
||||
|
||||
// utfrrune returns a pointer to the last occurrence of rune r in the
|
||||
// UTF string s, or 0 if r does not occur in the string. The NULL
|
||||
// byte terminating a string is considered to be part of the string s.
|
||||
// (cf. strrchr)
|
||||
|
||||
const char* utfrrune(const char* s, Rune r);
|
||||
|
||||
|
||||
// utfutf returns a pointer to the first occurrence of the UTF string
|
||||
// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
|
||||
// null string, utfutf returns s1. (cf. strstr)
|
||||
|
||||
const char* utfutf(const char* s1, const char* s2);
|
||||
|
||||
|
||||
// utfecpy copies UTF sequences until a null sequence has been copied,
|
||||
// but writes no sequences beyond es1. If any sequences are copied,
|
||||
// s1 is terminated by a null sequence, and a pointer to that sequence
|
||||
// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
|
||||
|
||||
char* utfecpy(char *s1, char *es1, const char *s2);
|
||||
|
||||
|
||||
|
||||
// These functions are rune-string analogues of the corresponding
|
||||
// functions in strcat (3).
|
||||
//
|
||||
// These routines first appeared in Plan 9.
|
||||
// SEE ALSO
|
||||
// memmove (3)
|
||||
// rune (3)
|
||||
// strcat (2)
|
||||
//
|
||||
// BUGS: The outcome of overlapping moves varies among implementations.
|
||||
|
||||
Rune* runestrcat(Rune* s1, const Rune* s2);
|
||||
Rune* runestrncat(Rune* s1, const Rune* s2, long n);
|
||||
|
||||
const Rune* runestrchr(const Rune* s, Rune c);
|
||||
|
||||
int runestrcmp(const Rune* s1, const Rune* s2);
|
||||
int runestrncmp(const Rune* s1, const Rune* s2, long n);
|
||||
|
||||
Rune* runestrcpy(Rune* s1, const Rune* s2);
|
||||
Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
|
||||
Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
|
||||
|
||||
Rune* runestrdup(const Rune* s);
|
||||
|
||||
const Rune* runestrrchr(const Rune* s, Rune c);
|
||||
long runestrlen(const Rune* s);
|
||||
const Rune* runestrstr(const Rune* s1, const Rune* s2);
|
||||
|
||||
|
||||
|
||||
// The following routines test types and modify cases for Unicode
|
||||
// characters. Unicode defines some characters as letters and
|
||||
// specifies three cases: upper, lower, and title. Mappings among the
|
||||
// cases are also defined, although they are not exhaustive: some
|
||||
// upper case letters have no lower case mapping, and so on. Unicode
|
||||
// also defines several character properties, a subset of which are
|
||||
// checked by these routines. These routines are based on Unicode
|
||||
// version 3.0.0.
|
||||
//
|
||||
// NOTE: The routines are implemented in C, so the boolean functions
|
||||
// (e.g., isupperrune) return 0 for false and 1 for true.
|
||||
//
|
||||
//
|
||||
// toupperrune, tolowerrune, and totitlerune are the Unicode case
|
||||
// mappings. These routines return the character unchanged if it has
|
||||
// no defined mapping.
|
||||
|
||||
Rune toupperrune(Rune r);
|
||||
Rune tolowerrune(Rune r);
|
||||
Rune totitlerune(Rune r);
|
||||
|
||||
|
||||
// isupperrune tests for upper case characters, including Unicode
|
||||
// upper case letters and targets of the toupper mapping. islowerrune
|
||||
// and istitlerune are defined analogously.
|
||||
|
||||
int isupperrune(Rune r);
|
||||
int islowerrune(Rune r);
|
||||
int istitlerune(Rune r);
|
||||
|
||||
|
||||
// isalpharune tests for Unicode letters; this includes ideographs in
|
||||
// addition to alphabetic characters.
|
||||
|
||||
int isalpharune(Rune r);
|
||||
|
||||
|
||||
// isdigitrune tests for digits. Non-digit numbers, such as Roman
|
||||
// numerals, are not included.
|
||||
|
||||
int isdigitrune(Rune r);
|
||||
|
||||
|
||||
// isideographicrune tests for ideographic characters and numbers, as
|
||||
// defined by the Unicode standard.
|
||||
|
||||
int isideographicrune(Rune r);
|
||||
|
||||
|
||||
// isspacerune tests for whitespace characters, including "C" locale
|
||||
// whitespace, Unicode defined whitespace, and the "zero-width
|
||||
// non-break space" character.
|
||||
|
||||
int isspacerune(Rune r);
|
||||
|
||||
|
||||
// (The comments in this file were copied from the manpage files rune.3,
|
||||
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
|
||||
// to conform to Google style. /JRM 11/11/05)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
14
unittest/third_party/utf/utfdef.h
vendored
Normal file
14
unittest/third_party/utf/utfdef.h
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
#define uchar _utfuchar
|
||||
#define ushort _utfushort
|
||||
#define uint _utfuint
|
||||
#define ulong _utfulong
|
||||
#define vlong _utfvlong
|
||||
#define uvlong _utfuvlong
|
||||
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
typedef unsigned int uint;
|
||||
typedef unsigned long ulong;
|
||||
|
||||
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
|
||||
#define nil ((void*)0)
|
507
unittest/util/utf8/unicodetext.cc
Normal file
507
unittest/util/utf8/unicodetext.cc
Normal file
@ -0,0 +1,507 @@
|
||||
/**
|
||||
* Copyright 2010 Google Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "util/utf8/unicodetext.h"
|
||||
|
||||
#include <string.h> // for memcpy, NULL, memcmp, etc
|
||||
#include <algorithm> // for max
|
||||
|
||||
//#include "base/logging.h" // for operator<<, CHECK, etc
|
||||
//#include "base/stringprintf.h" // for StringPrintf, StringAppendF
|
||||
//#include "strings/stringpiece.h" // for StringPiece, etc
|
||||
|
||||
#include "third_party/utf/utf.h" // for isvalidcharntorune, etc
|
||||
#include "util/utf8/unilib.h" // for IsInterchangeValid, etc
|
||||
#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
|
||||
|
||||
// Counts the code points encoded in the UTF-8 byte range [start, end).
// Every byte that is not a trail byte (bit pattern 10xxxxxx) begins a new
// code point, so the result is the number of non-trail bytes in the range.
static int CodepointDistance(const char* start, const char* end) {
  int codepoints = 0;
  const char* cursor = start;
  while (cursor < end) {
    const unsigned char byte = static_cast<unsigned char>(*cursor);
    const bool is_trail_byte = (byte & 0xC0) == 0x80;
    if (!is_trail_byte) {
      ++codepoints;
    }
    ++cursor;
  }
  return codepoints;
}
|
||||
|
||||
static int CodepointCount(const char* utf8, int len) {
|
||||
return CodepointDistance(utf8, utf8 + len);
|
||||
}
|
||||
|
||||
// Returns the number of code points between two iterators, computed by
// counting the non-trail bytes in the byte range [first.it_, last.it_).
UnicodeText::const_iterator::difference_type
distance(const UnicodeText::const_iterator& first,
         const UnicodeText::const_iterator& last) {
  const char* const from = first.it_;
  const char* const to = last.it_;
  return CodepointDistance(from, to);
}
|
||||
|
||||
// ---------- Utility ----------
|
||||
|
||||
static int ConvertToInterchangeValid(char* start, int len) {
|
||||
// This routine is called only when we've discovered that a UTF-8 buffer
|
||||
// that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
|
||||
// was not interchange valid. This indicates a bug in the caller, and
|
||||
// a LOG(WARNING) is done in that case.
|
||||
// This is similar to CoerceToInterchangeValid, but it replaces each
|
||||
// structurally valid byte with a space, and each non-interchange
|
||||
// character with a space, even when that character requires more
|
||||
// than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
|
||||
// structurally valid UTF8, but U+FDD0 is not an interchange-valid
|
||||
// code point. The result should contain one space, not three.
|
||||
//
|
||||
// Since the conversion never needs to write more data than it
|
||||
// reads, it is safe to change the buffer in place. It returns the
|
||||
// number of bytes written.
|
||||
char* const in = start;
|
||||
char* out = start;
|
||||
char* const end = start + len;
|
||||
while (start < end) {
|
||||
int good = UniLib::SpanInterchangeValid(start, end - start);
|
||||
if (good > 0) {
|
||||
if (out != start) {
|
||||
memmove(out, start, good);
|
||||
}
|
||||
out += good;
|
||||
start += good;
|
||||
if (start == end) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Is the current string invalid UTF8 or just non-interchange UTF8?
|
||||
char32 rune;
|
||||
int n;
|
||||
if (isvalidcharntorune(start, end - start, &rune, &n)) {
|
||||
// structurally valid UTF8, but not interchange valid
|
||||
start += n; // Skip over the whole character.
|
||||
} else { // bad UTF8
|
||||
start += 1; // Skip over just one byte
|
||||
}
|
||||
*out++ = ' ';
|
||||
}
|
||||
return out - in;
|
||||
}
|
||||
|
||||
|
||||
// *************** Data representation **********
|
||||
|
||||
// Note: the copy constructor is undefined.
|
||||
|
||||
// After reserve(), resize(), or clear(), we're an owner, not an alias.
|
||||
|
||||
void UnicodeText::Repr::reserve(int new_capacity) {
|
||||
// If there's already enough capacity, and we're an owner, do nothing.
|
||||
if (capacity_ >= new_capacity && ours_) return;
|
||||
|
||||
// Otherwise, allocate a new buffer.
|
||||
capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
|
||||
char* new_data = new char[capacity_];
|
||||
|
||||
// If there is an old buffer, copy it into the new buffer.
|
||||
if (data_) {
|
||||
memcpy(new_data, data_, size_);
|
||||
if (ours_) delete[] data_; // If we owned the old buffer, free it.
|
||||
}
|
||||
data_ = new_data;
|
||||
ours_ = true; // We own the new buffer.
|
||||
// size_ is unchanged.
|
||||
}
|
||||
|
||||
void UnicodeText::Repr::resize(int new_size) {
|
||||
if (new_size == 0) {
|
||||
clear();
|
||||
} else {
|
||||
if (!ours_ || new_size > capacity_) reserve(new_size);
|
||||
// Clear the memory in the expanded part.
|
||||
if (size_ < new_size) memset(data_ + size_, 0, new_size - size_);
|
||||
size_ = new_size;
|
||||
ours_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
// This implementation of clear() deallocates the buffer if we're an owner.
|
||||
// That's not strictly necessary; we could just set size_ to 0.
|
||||
void UnicodeText::Repr::clear() {
|
||||
if (ours_) delete[] data_;
|
||||
data_ = nullptr;
|
||||
size_ = capacity_ = 0;
|
||||
ours_ = true;
|
||||
}
|
||||
|
||||
// Replaces the contents with a private copy of the size bytes at data.
// After the call this Repr is an owner.
void UnicodeText::Repr::Copy(const char* data, int size) {
  resize(size);
  // resize(0) goes through clear(), which leaves data_ == nullptr (and the
  // source of an empty text may be null as well). Passing a null pointer to
  // memcpy is undefined behaviour even for a zero length, so only copy when
  // there is something to copy.
  if (size > 0) {
    memcpy(data_, data, size);
  }
}
|
||||
|
||||
void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
|
||||
if (data == data_) return; // We already own this memory. (Weird case.)
|
||||
if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
|
||||
data_ = data;
|
||||
size_ = size;
|
||||
capacity_ = capacity;
|
||||
ours_ = true;
|
||||
}
|
||||
|
||||
void UnicodeText::Repr::PointTo(const char* data, int size) {
|
||||
if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it.
|
||||
data_ = const_cast<char*>(data);
|
||||
size_ = size;
|
||||
capacity_ = size;
|
||||
ours_ = false;
|
||||
}
|
||||
|
||||
void UnicodeText::Repr::append(const char* bytes, int byte_length) {
|
||||
reserve(size_ + byte_length);
|
||||
memcpy(data_ + size_, bytes, byte_length);
|
||||
size_ += byte_length;
|
||||
}
|
||||
|
||||
// Formats the address of this Repr, its buffer pointer, size, capacity and
// whether it is an owner or an alias, for debugging output.
string UnicodeText::Repr::DebugString() const {
  const char* const ownership = ours_ ? "Owned" : "Alias";
  return tensorflow::strings::Printf(
      "{Repr %p data=%p size=%d capacity=%d %s}",
      this, data_, size_, capacity_, ownership);
}
|
||||
|
||||
|
||||
|
||||
// *************** UnicodeText ******************
|
||||
|
||||
// ----- Constructors -----
|
||||
|
||||
// Default constructor
|
||||
UnicodeText::UnicodeText() {
|
||||
}
|
||||
|
||||
// Copy constructor
|
||||
UnicodeText::UnicodeText(const UnicodeText& src) {
|
||||
Copy(src);
|
||||
}
|
||||
|
||||
// Substring constructor
|
||||
UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
|
||||
const UnicodeText::const_iterator& last) {
|
||||
CHECK(first <= last) << " Incompatible iterators";
|
||||
repr_.append(first.it_, last.it_ - first.it_);
|
||||
}
|
||||
|
||||
// Returns the UTF-8 bytes in [first, last) as a string.
// Fails a CHECK when first is after last.
string UnicodeText::UTF8Substring(const const_iterator& first,
                                  const const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  const char* const to = last.it_;
  return string(from, to - from);
}
|
||||
|
||||
|
||||
// ----- Copy -----
|
||||
|
||||
// Copy assignment. Self-assignment is a no-op; otherwise the data of src is
// copied via Copy().
UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
  if (this == &src) {
    return *this;
  }
  Copy(src);
  return *this;
}
|
||||
|
||||
// Copies the bytes of src into this text's representation and returns *this.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const Repr& source = src.repr_;
  repr_.Copy(source.data_, source.size_);
  return *this;
}
|
||||
|
||||
// Copies byte_length bytes of UTF-8 from buffer into this text.
// If the buffer is not interchange-valid, a warning is logged and the copy
// (never the caller's buffer) is repaired in place, which may shorten it.
UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
  repr_.Copy(buffer, byte_length);
  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
    return *this;
  }
  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  return *this;
}
|
||||
|
||||
// Copies byte_length bytes from buffer into this text without performing the
// interchange-validity check that CopyUTF8() does.
UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
                                         int byte_length) {
  Repr& target = repr_;
  target.Copy(buffer, byte_length);
  return *this;
}
|
||||
|
||||
// ----- TakeOwnershipOf -----
|
||||
|
||||
// Adopts buffer (byte_length bytes used, byte_capacity allocated) as this
// text's storage. If the contents are not interchange-valid, a warning is
// logged and the adopted buffer is repaired in place, which may shorten it.
UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
                                              int byte_length,
                                              int byte_capacity) {
  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
    return *this;
  }
  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  return *this;
}
|
||||
|
||||
// Adopts buffer as this text's storage without performing the
// interchange-validity check that TakeOwnershipOfUTF8() does.
UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
                                                    int byte_length,
                                                    int byte_capacity) {
  Repr& target = repr_;
  target.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  return *this;
}
|
||||
|
||||
// ----- PointTo -----
|
||||
|
||||
// Makes this text an alias of the byte_length bytes at buffer when they are
// interchange-valid. Otherwise a warning is logged and the text instead
// becomes an owner of a repaired copy; the caller's buffer is not modified.
UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    repr_.Copy(buffer, byte_length);
    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    return *this;
  }
  repr_.PointTo(buffer, byte_length);
  return *this;
}
|
||||
|
||||
// Makes this text an alias of the byte_length bytes at buffer without
// performing the interchange-validity check that PointToUTF8() does.
UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
                                            int byte_length) {
  Repr& target = repr_;
  target.PointTo(buffer, byte_length);
  return *this;
}
|
||||
|
||||
// Makes this text an alias of src's bytes; nothing is copied.
UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
  const Repr& source = src.repr_;
  repr_.PointTo(source.data_, source.size_);
  return *this;
}
|
||||
|
||||
// Makes this text an alias of the bytes in [first, last); nothing is copied.
// Fails a CHECK when first is after last.
UnicodeText& UnicodeText::PointTo(const const_iterator &first,
                                  const const_iterator &last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const range_begin = first.utf8_data();
  const char* const range_end = last.utf8_data();
  repr_.PointTo(range_begin, range_end - range_begin);
  return *this;
}
|
||||
|
||||
// ----- Append -----
|
||||
|
||||
// Appends all of u's bytes to the end of this text and returns *this.
UnicodeText& UnicodeText::append(const UnicodeText& u) {
  const Repr& source = u.repr_;
  repr_.append(source.data_, source.size_);
  return *this;
}
|
||||
|
||||
// Appends the UTF-8 bytes in [first, last) to the end of this text.
// Fails a CHECK when first is after last.
UnicodeText& UnicodeText::append(const const_iterator& first,
                                 const const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  const char* const to = last.it_;
  repr_.append(from, to - from);
  return *this;
}
|
||||
|
||||
// Appends len bytes of UTF-8 to this text; no validity check is done here.
UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
  Repr& target = repr_;
  target.append(utf8, len);
  return *this;
}
|
||||
|
||||
// ----- substring searching -----
|
||||
|
||||
// Searches for look starting at start_pos. CHECK-fails unless start_pos lies
// within this text's byte range (the end position is allowed); the search
// itself is delegated to UnsafeFind().
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
                                              const_iterator start_pos) const {
  CHECK_GE(start_pos.utf8_data(), utf8_data());
  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
  const_iterator match = UnsafeFind(look, start_pos);
  return match;
}
|
||||
|
||||
// Searches for look from the beginning of this text.
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
  const const_iterator search_start = begin();
  return UnsafeFind(look, search_start);
}
|
||||
|
||||
// Returns an iterator to the first occurrence of look at or after start_pos,
// or end() when there is none. start_pos is not validated against this text
// beyond a guard for positions past the end (callers wanting full checks use
// find()).
//
// Because UTF-8 is self-synchronizing, searching for a sequence of code
// points is equivalent to a plain byte-substring search, so std::search over
// the raw bytes is sufficient. This replaces the previous stub, which aborted
// unconditionally with LOG(FATAL) "Not implemented".
UnicodeText::const_iterator UnicodeText::UnsafeFind(
    const UnicodeText& look, const_iterator start_pos) const {
  const char* const haystack_end = utf8_data() + utf8_length();
  const char* const search_from = start_pos.utf8_data();
  // A start position past the end can never match (and would make the range
  // handed to std::search invalid).
  if (search_from > haystack_end) return end();

  const char* const needle_begin = look.utf8_data();
  const char* const needle_end = needle_begin + look.utf8_length();
  const char* const hit =
      std::search(search_from, haystack_end, needle_begin, needle_end);
  if (hit == haystack_end) return end();
  return const_iterator(hit);
}
|
||||
|
||||
bool UnicodeText::HasReplacementChar() const {
|
||||
// Equivalent to:
|
||||
// UnicodeText replacement_char;
|
||||
// replacement_char.push_back(0xFFFD);
|
||||
// return find(replacement_char) != end();
|
||||
StringPiece searching(utf8_data(), utf8_length());
|
||||
StringPiece looking_for("\xEF\xBF\xBD", 3);
|
||||
LOG(FATAL) << "Not implemented";
|
||||
//return searching.find(looking_for) != StringPiece::npos;
|
||||
return false;
|
||||
}
|
||||
|
||||
// ----- other methods -----
|
||||
|
||||
// Clear operator
|
||||
void UnicodeText::clear() {
|
||||
repr_.clear();
|
||||
}
|
||||
|
||||
// Destructor
|
||||
UnicodeText::~UnicodeText() {}
|
||||
|
||||
|
||||
// Appends the code point c, encoded as UTF-8.
// If c is not a valid code point, or its encoding is not interchange-valid,
// a warning is logged and a single space is appended instead.
void UnicodeText::push_back(char32 c) {
  if (!UniLib::IsValidCodepoint(c)) {
    LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
    repr_.append(" ", 1);
    return;
  }

  char buf[UTFmax];
  const int len = runetochar(buf, &c);
  if (!UniLib::IsInterchangeValid(buf, len)) {
    LOG(WARNING) << "Unicode value 0x" << std::hex << c
                 << " is not valid for interchange";
    repr_.append(" ", 1);
    return;
  }

  repr_.append(buf, len);
}
|
||||
|
||||
int UnicodeText::size() const {
|
||||
return CodepointCount(repr_.data_, repr_.size_);
|
||||
}
|
||||
|
||||
// Two texts are equal when their UTF-8 byte sequences are identical.
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
  if (&lhs == &rhs) return true;
  const int byte_count = lhs.repr_.size_;
  if (byte_count != rhs.repr_.size_) return false;
  // An empty text can have data_ == nullptr (see Repr::clear()). Passing a
  // null pointer to memcmp is undefined behaviour even for a zero length, so
  // two empty texts are declared equal without calling it.
  if (byte_count == 0) return true;
  return memcmp(lhs.repr_.data_, rhs.repr_.data_, byte_count) == 0;
}
|
||||
|
||||
// Formats this object's address, its code point count and the debug string
// of its representation, for debugging output.
string UnicodeText::DebugString() const {
  const string repr_description = repr_.DebugString();
  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
                                     this,
                                     size(),
                                     repr_description.c_str());
}
|
||||
|
||||
|
||||
// ******************* UnicodeText::const_iterator *********************
|
||||
|
||||
// The implementation of const_iterator would be nicer if it
|
||||
// inherited from boost::iterator_facade
|
||||
// (http://boost.org/libs/iterator/doc/iterator_facade.html).
|
||||
|
||||
// Default-constructed iterators hold a null position.
UnicodeText::const_iterator::const_iterator()
    : it_(nullptr) {
}
|
||||
|
||||
// Copy constructor: the new iterator refers to the same byte position.
UnicodeText::const_iterator::const_iterator(const const_iterator& other)
    : it_(other.it_) {}
|
||||
|
||||
// Copy assignment: takes over other's byte position.
UnicodeText::const_iterator&
UnicodeText::const_iterator::operator=(const const_iterator& other) {
  // No self-assignment guard is needed: storing it_ onto itself changes
  // nothing.
  it_ = other.it_;
  return *this;
}
|
||||
|
||||
// Returns an iterator positioned at the first byte of the UTF-8 data.
UnicodeText::const_iterator UnicodeText::begin() const {
  const char* const first_byte = repr_.data_;
  return const_iterator(first_byte);
}
|
||||
|
||||
// Returns an iterator positioned one past the last byte of the UTF-8 data.
UnicodeText::const_iterator UnicodeText::end() const {
  const char* const past_last_byte = repr_.data_ + repr_.size_;
  return const_iterator(past_last_byte);
}
|
||||
|
||||
// Orders iterators by the address of the byte they refer to.
bool operator<(const UnicodeText::const_iterator& lhs,
               const UnicodeText::const_iterator& rhs) {
  const char* const left = lhs.it_;
  const char* const right = rhs.it_;
  return left < right;
}
|
||||
|
||||
// Decodes and returns the code point at the current position.
// The decoding is done by hand, without validation: the sequence length is
// taken from the lead byte and the payload bits of each byte are combined.
// (chartorune would also work but performs error checking on every call.)
char32 UnicodeText::const_iterator::operator*() const {
  const unsigned char* const bytes =
      reinterpret_cast<const unsigned char*>(it_);
  const unsigned char lead = bytes[0];

  // One byte: 0xxxxxxx.
  if (lead < 0x80) {
    return lead;
  }
  // Two bytes: 110xxxxx 10xxxxxx.
  if (lead < 0xE0) {
    return ((lead & 0x1F) << 6) | (bytes[1] & 0x3F);
  }
  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
  if (lead < 0xF0) {
    return ((lead & 0x0F) << 12) | ((bytes[1] & 0x3F) << 6) |
           (bytes[2] & 0x3F);
  }
  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  return ((lead & 0x07) << 18) | ((bytes[1] & 0x3F) << 12) |
         ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
}
|
||||
|
||||
// Pre-increment: advances by the byte length that UniLib::OneCharLen reports
// for the character at the current position.
UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
  const int char_len = UniLib::OneCharLen(it_);
  it_ += char_len;
  return *this;
}
|
||||
|
||||
// Pre-decrement: steps back at least one byte, and keeps stepping back while
// the byte reached is a UTF-8 trail byte, so the iterator ends on the start
// of the previous character.
UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
  do {
    --it_;
  } while (UniLib::IsTrailByte(*it_));
  return *this;
}
|
||||
|
||||
// Copies the UTF-8 bytes of the current character into utf8_output and
// returns how many bytes (1 to 4) were written. The length is derived from
// the lead byte; no terminator is written.
int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
  const int lead = it_[0] & 0xff;

  utf8_output[0] = it_[0];
  if (lead < 0x80) return 1;

  utf8_output[1] = it_[1];
  if (lead < 0xE0) return 2;

  utf8_output[2] = it_[2];
  if (lead < 0xF0) return 3;

  utf8_output[3] = it_[3];
  return 4;
}
|
||||
|
||||
// Returns the UTF-8 bytes of the current character as a string.
string UnicodeText::const_iterator::get_utf8_string() const {
  const char* const bytes = utf8_data();
  const int byte_count = utf8_length();
  return string(bytes, byte_count);
}
|
||||
|
||||
// Returns the number of bytes (1 to 4) in the UTF-8 encoding of the current
// character, determined from its lead byte alone.
int UnicodeText::const_iterator::utf8_length() const {
  const int lead = it_[0] & 0xff;
  if (lead < 0x80) return 1;
  if (lead < 0xE0) return 2;
  return (lead < 0xF0) ? 3 : 4;
}
|
||||
|
||||
// Builds an iterator for the byte position p inside this text.
// CHECK-fails unless p is non-null, lies within [utf8_data(),
// utf8_data() + utf8_length()], and is either the end position or not a
// UTF-8 trail byte (i.e. it is on a character boundary).
UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
  CHECK(p != nullptr);

  const char* const start = utf8_data();
  const char* const end = start + utf8_length();

  CHECK(p >= start);
  CHECK(p <= end);
  CHECK(p == end || !UniLib::IsTrailByte(*p));

  return const_iterator(p);
}
|
||||
|
||||
// Formats the byte position this iterator refers to, for debugging output.
string UnicodeText::const_iterator::DebugString() const {
  const char* const position = it_;
  return tensorflow::strings::Printf("{iter %p}", position);
}
|
||||
|
||||
|
||||
// *************************** Utilities *************************
|
||||
|
||||
// Returns the code points of t as upper-case hexadecimal numbers, each one
// followed by a single space (so a non-empty result ends with a space).
string CodepointString(const UnicodeText& t) {
  string s;
  UnicodeText::const_iterator end = t.end();
  for (UnicodeText::const_iterator it = t.begin(); it != end; ++it) {
    tensorflow::strings::Appendf(&s, "%X ", *it);
  }
  return s;
}
|
477
unittest/util/utf8/unicodetext.h
Normal file
477
unittest/util/utf8/unicodetext.h
Normal file
@ -0,0 +1,477 @@
|
||||
/**
|
||||
* Copyright 2010 Google Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
|
||||
#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
|
||||
|
||||
#include <stddef.h> // for NULL, ptrdiff_t
|
||||
#include <iterator> // for bidirectional_iterator_tag, etc
|
||||
#include <string> // for string
|
||||
#include <utility> // for pair
|
||||
|
||||
#include "syntaxnet/base.h"
|
||||
|
||||
// ***************************** UnicodeText **************************
|
||||
//
|
||||
// A UnicodeText object is a container for a sequence of Unicode
|
||||
// codepoint values. It has default and copy constructors and an assignment operator.
|
||||
// Data can be appended to it from another UnicodeText, from
|
||||
// iterators, or from a single codepoint.
|
||||
//
|
||||
// The internal representation of the text is UTF-8. Since UTF-8 is a
|
||||
// variable-width format, UnicodeText does not provide random access
|
||||
// to the text, and changes to the text are permitted only at the end.
|
||||
//
|
||||
// The UnicodeText class defines a const_iterator. The dereferencing
|
||||
// operator (*) returns a codepoint (char32). The iterator is a
|
||||
// bidirectional, read-only iterator. It becomes invalid if the text
|
||||
// is changed.
|
||||
//
|
||||
// There are methods for appending and retrieving UTF-8 data directly.
|
||||
// The 'utf8_data' method returns a const char* that contains the
|
||||
// UTF-8-encoded version of the text; 'utf8_length' returns the number
|
||||
// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to
|
||||
// 4 bytes of UTF-8 data in a char array and returns the number of
|
||||
// bytes that it stored.
|
||||
//
|
||||
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
|
||||
// 0x10FFFF], but UnicodeText has the additional restriction that it
|
||||
// can contain only those characters that are valid for interchange on
|
||||
// the Web. This excludes all of the control codes except for carriage
|
||||
// return, line feed, and horizontal tab. It also excludes
|
||||
// non-characters, but codepoints that are in the Private Use regions
|
||||
// are allowed, as are codepoints that are unassigned. (See the
|
||||
// Unicode reference for details.) The function UniLib::IsInterchangeValid
|
||||
// can be used as a test for this property.
|
||||
//
|
||||
// UnicodeTexts are safe. Every method that constructs or modifies a
|
||||
// UnicodeText tests for interchange-validity, and will substitute a
|
||||
// space for the invalid data. Such cases are reported via
|
||||
// LOG(WARNING).
|
||||
//
|
||||
// MEMORY MANAGEMENT: copy, take ownership, or point to
|
||||
//
|
||||
// A UnicodeText is either an "owner", meaning that it owns the memory
|
||||
// for the data buffer and will free it when the UnicodeText is
|
||||
// destroyed, or it is an "alias", meaning that it does not.
|
||||
//
|
||||
// There are three methods for storing UTF-8 data in a UnicodeText:
|
||||
//
|
||||
// CopyUTF8(buffer, len) copies buffer.
|
||||
//
|
||||
// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
|
||||
//
|
||||
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
|
||||
//
|
||||
// All three methods perform a validity check on the buffer. There are
|
||||
// private, "unsafe" versions of these functions that bypass the
|
||||
// validity check. They are used internally and by friend-functions
|
||||
// that are handling UTF-8 data that has already been validated.
|
||||
//
|
||||
// The purpose of an alias is to avoid making an unnecessary copy of a
|
||||
// UTF-8 buffer while still providing access to the Unicode values
|
||||
// within that text through iterators or the fast scanners that are
|
||||
// based on UTF-8 state tables. The lifetime of an alias must not
|
||||
// exceed the lifetime of the buffer from which it was constructed.
|
||||
//
|
||||
// The semantics of an alias might be described as "copy on write or
|
||||
// repair." The source data is never modified. If push_back() or
|
||||
// append() is called on an alias, a copy of the data will be created,
|
||||
// and the UnicodeText will become an owner. If clear() is called on
|
||||
// an alias, it becomes an (empty) owner.
|
||||
//
|
||||
// The copy constructor and the assignment operator produce an owner.
|
||||
// That is, after direct initialization ("UnicodeText x(y);") or copy
|
||||
// initialization ("UnicodeText x = y;") x will be an owner, even if y
|
||||
// was an alias. The assignment operator ("x = y;") also produces an
|
||||
// owner unless x and y are the same object and y is an alias.
|
||||
//
|
||||
// Aliases should be used with care. If the source from which an alias
|
||||
// was created is freed, or if the contents are changed, while the
|
||||
// alias is still in use, fatal errors could result. But it can be
|
||||
// quite useful to have a UnicodeText "window" through which to see a
|
||||
// UTF-8 buffer without having to pay the price of making a copy.
|
||||
//
|
||||
// UTILITIES
|
||||
//
|
||||
// The interfaces in util/utf8/public/textutils.h provide higher-level
|
||||
// utilities for dealing with UnicodeTexts, including routines for
|
||||
// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
|
||||
// strings, creating strings from UnicodeTexts, normalizing text for
|
||||
// efficient matching or display, and others.
|
||||
|
||||
class UnicodeText {
|
||||
public:
|
||||
class const_iterator;
|
||||
|
||||
typedef char32 value_type;
|
||||
|
||||
// Constructors. These always produce owners.
|
||||
UnicodeText(); // Create an empty text.
|
||||
UnicodeText(const UnicodeText& src); // copy constructor
|
||||
// Construct a substring (copies the data).
|
||||
UnicodeText(const const_iterator& first, const const_iterator& last);
|
||||
|
||||
// Assignment operator. This copies the data and produces an owner
|
||||
// unless this == &src, e.g., "x = x;", which is a no-op.
|
||||
UnicodeText& operator=(const UnicodeText& src);
|
||||
|
||||
// x.Copy(y) copies the data from y into x.
|
||||
UnicodeText& Copy(const UnicodeText& src);
|
||||
inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
|
||||
|
||||
// x.PointTo(y) changes x so that it points to y's data.
|
||||
// It does not copy y or take ownership of y's data.
|
||||
UnicodeText& PointTo(const UnicodeText& src);
|
||||
UnicodeText& PointTo(const const_iterator& first,
|
||||
const const_iterator& last);
|
||||
|
||||
~UnicodeText();
|
||||
|
||||
void clear(); // Clear text.
|
||||
bool empty() const { return repr_.size_ == 0; } // Test if text is empty.
|
||||
|
||||
// Add a codepoint to the end of the text.
|
||||
// If the codepoint is not interchange-valid, add a space instead
|
||||
// and log a warning.
|
||||
void push_back(char32 codepoint);
|
||||
|
||||
// Generic appending operation.
|
||||
// iterator_traits<ForwardIterator>::value_type must be implicitly
|
||||
// convertible to char32. Typical uses of this method might include:
|
||||
// char32 chars[] = {0x1, 0x2, ...};
|
||||
// vector<char32> more_chars = ...;
|
||||
// utext.append(chars, chars+arraysize(chars));
|
||||
// utext.append(more_chars.begin(), more_chars.end());
|
||||
template<typename ForwardIterator>
|
||||
UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
|
||||
while (first != last) { push_back(*first++); }
|
||||
return *this;
|
||||
}
|
||||
|
||||
// A specialization of the generic append() method.
|
||||
UnicodeText& append(const const_iterator& first, const const_iterator& last);
|
||||
|
||||
// An optimization of append(source.begin(), source.end()).
|
||||
UnicodeText& append(const UnicodeText& source);
|
||||
|
||||
int size() const; // the number of Unicode characters (codepoints)
|
||||
|
||||
friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
|
||||
friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
|
||||
|
||||
class const_iterator {
|
||||
typedef const_iterator CI;
|
||||
public:
|
||||
typedef std::bidirectional_iterator_tag iterator_category;
|
||||
typedef char32 value_type;
|
||||
typedef ptrdiff_t difference_type;
|
||||
typedef void pointer; // (Not needed.)
|
||||
typedef const char32 reference; // (Needed for const_reverse_iterator)
|
||||
|
||||
// Iterators are default-constructible.
|
||||
const_iterator();
|
||||
|
||||
// It's safe to make multiple passes over a UnicodeText.
|
||||
const_iterator(const const_iterator& other);
|
||||
const_iterator& operator=(const const_iterator& other);
|
||||
|
||||
char32 operator*() const; // Dereference
|
||||
|
||||
const_iterator& operator++(); // Advance (++iter)
|
||||
const_iterator operator++(int) { // (iter++)
|
||||
const_iterator result(*this);
|
||||
++*this;
|
||||
return result;
|
||||
}
|
||||
|
||||
const_iterator& operator--(); // Retreat (--iter)
|
||||
const_iterator operator--(int) { // (iter--)
|
||||
const_iterator result(*this);
|
||||
--*this;
|
||||
return result;
|
||||
}
|
||||
|
||||
// We love relational operators.
|
||||
friend bool operator==(const CI& lhs, const CI& rhs) {
|
||||
return lhs.it_ == rhs.it_; }
|
||||
friend bool operator!=(const CI& lhs, const CI& rhs) {
|
||||
return !(lhs == rhs); }
|
||||
friend bool operator<(const CI& lhs, const CI& rhs);
|
||||
friend bool operator>(const CI& lhs, const CI& rhs) {
|
||||
return rhs < lhs; }
|
||||
friend bool operator<=(const CI& lhs, const CI& rhs) {
|
||||
return !(rhs < lhs); }
|
||||
friend bool operator>=(const CI& lhs, const CI& rhs) {
|
||||
return !(lhs < rhs); }
|
||||
|
||||
friend difference_type distance(const CI& first, const CI& last);
|
||||
|
||||
// UTF-8-specific methods
|
||||
// Store the UTF-8 encoding of the current codepoint into buf,
|
||||
// which must be at least 4 bytes long. Return the number of
|
||||
// bytes written.
|
||||
int get_utf8(char* buf) const;
|
||||
// Return the UTF-8 character that the iterator points to.
|
||||
string get_utf8_string() const;
|
||||
// Return the byte length of the UTF-8 character the iterator points to.
|
||||
int utf8_length() const;
|
||||
// Return the iterator's pointer into the UTF-8 data.
|
||||
const char* utf8_data() const { return it_; }
|
||||
|
||||
string DebugString() const;
|
||||
|
||||
private:
|
||||
friend class UnicodeText;
|
||||
friend class UnicodeTextUtils;
|
||||
friend class UTF8StateTableProperty;
|
||||
explicit const_iterator(const char* it) : it_(it) {}
|
||||
|
||||
const char* it_;
|
||||
};
|
||||
|
||||
const_iterator begin() const;
|
||||
const_iterator end() const;
|
||||
|
||||
class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
|
||||
public:
|
||||
explicit const_reverse_iterator(const_iterator it) :
|
||||
std::reverse_iterator<const_iterator>(it) {}
|
||||
const char* utf8_data() const {
|
||||
const_iterator tmp_it = base();
|
||||
return (--tmp_it).utf8_data();
|
||||
}
|
||||
int get_utf8(char* buf) const {
|
||||
const_iterator tmp_it = base();
|
||||
return (--tmp_it).get_utf8(buf);
|
||||
}
|
||||
string get_utf8_string() const {
|
||||
const_iterator tmp_it = base();
|
||||
return (--tmp_it).get_utf8_string();
|
||||
}
|
||||
int utf8_length() const {
|
||||
const_iterator tmp_it = base();
|
||||
return (--tmp_it).utf8_length();
|
||||
}
|
||||
};
|
||||
const_reverse_iterator rbegin() const {
|
||||
return const_reverse_iterator(end());
|
||||
}
|
||||
const_reverse_iterator rend() const {
|
||||
return const_reverse_iterator(begin());
|
||||
}
|
||||
|
||||
// Substring searching. Returns the beginning of the first
|
||||
// occurrence of "look", or end() if not found.
|
||||
const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
|
||||
// Equivalent to find(look, begin())
|
||||
const_iterator find(const UnicodeText& look) const;
|
||||
|
||||
// Returns whether this contains the character U+FFFD. This can
|
||||
// occur, for example, if the input to Encodings::Decode() had byte
|
||||
// sequences that were invalid in the source encoding.
|
||||
bool HasReplacementChar() const;
|
||||
|
||||
// UTF-8-specific methods
|
||||
//
|
||||
// Return the data, length, and capacity of UTF-8-encoded version of
|
||||
// the text. Length and capacity are measured in bytes.
|
||||
const char* utf8_data() const { return repr_.data_; }
|
||||
int utf8_length() const { return repr_.size_; }
|
||||
int utf8_capacity() const { return repr_.capacity_; }
|
||||
|
||||
// Return the UTF-8 data as a string.
|
||||
static string UTF8Substring(const const_iterator& first,
|
||||
const const_iterator& last);
|
||||
|
||||
// There are three methods for initializing a UnicodeText from UTF-8
|
||||
// data. They vary in details of memory management. In all cases,
|
||||
// the data is tested for interchange-validity. If it is not
|
||||
// interchange-valid, a LOG(WARNING) is issued, and each
|
||||
// structurally invalid byte and each interchange-invalid codepoint
|
||||
// is replaced with a space.
|
||||
|
||||
// x.CopyUTF8(buf, len) copies buf into x.
|
||||
UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
|
||||
|
||||
// x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
|
||||
// buf. buf is not copied.
|
||||
UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
|
||||
int byte_length,
|
||||
int byte_capacity);
|
||||
|
||||
// x.PointToUTF8(buf,len) changes x so that it points to buf
|
||||
// ("becomes an alias"). It does not take ownership or copy buf.
|
||||
// If the buffer is not valid, this has the same effect as
|
||||
// CopyUTF8(utf8_buffer, byte_length).
|
||||
UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
|
||||
|
||||
// Occasionally it is necessary to use functions that operate on the
|
||||
// pointer returned by utf8_data(). MakeIterator(p) provides a way
|
||||
// to get back to the UnicodeText level. It uses CHECK to ensure
|
||||
// that p is a pointer within this object's UTF-8 data, and that it
|
||||
// points to the beginning of a character.
|
||||
const_iterator MakeIterator(const char* p) const;
|
||||
|
||||
string DebugString() const;
|
||||
|
||||
private:
|
||||
friend class const_iterator;
|
||||
friend class UnicodeTextUtils;
|
||||
|
||||
class Repr { // A byte-string.
|
||||
public:
|
||||
char* data_;
|
||||
int size_;
|
||||
int capacity_;
|
||||
bool ours_; // Do we own data_?
|
||||
|
||||
Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
|
||||
~Repr() { if (ours_) delete[] data_; }
|
||||
|
||||
void clear();
|
||||
void reserve(int capacity);
|
||||
void resize(int size);
|
||||
|
||||
void append(const char* bytes, int byte_length);
|
||||
void Copy(const char* data, int size);
|
||||
void TakeOwnershipOf(char* data, int size, int capacity);
|
||||
void PointTo(const char* data, int size);
|
||||
|
||||
string DebugString() const;
|
||||
|
||||
private:
|
||||
Repr& operator=(const Repr&);
|
||||
Repr(const Repr& other);
|
||||
};
|
||||
|
||||
Repr repr_;
|
||||
|
||||
// UTF-8-specific private methods.
|
||||
// These routines do not perform a validity check when compiled
|
||||
// in opt mode.
|
||||
// It is an error to call these methods with UTF-8 data that
|
||||
// is not interchange-valid.
|
||||
//
|
||||
UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
|
||||
UnicodeText& UnsafeTakeOwnershipOfUTF8(
|
||||
char* utf8_buffer, int byte_length, int byte_capacity);
|
||||
UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
|
||||
UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
|
||||
const_iterator UnsafeFind(const UnicodeText& look,
|
||||
const_iterator start_pos) const;
|
||||
};
|
||||
|
||||
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
|
||||
|
||||
// Inequality is defined purely as the negation of operator==.
inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
  const bool same_text = (lhs == rhs);
  return !same_text;
}
|
||||
|
||||
// UnicodeTextRange is a pair of iterators, useful for specifying text
|
||||
// segments. If the iterators are ==, the segment is empty.
|
||||
typedef pair<UnicodeText::const_iterator,
|
||||
UnicodeText::const_iterator> UnicodeTextRange;
|
||||
|
||||
inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
|
||||
return r.first == r.second;
|
||||
}
|
||||
|
||||
|
||||
// *************************** Utilities *************************
|
||||
|
||||
// A factory function for creating a UnicodeText from a buffer of
|
||||
// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
|
||||
// is an "owner.")
|
||||
//
|
||||
// Each byte that is structurally invalid will be replaced with a
|
||||
// space. Each codepoint that is interchange-invalid will also be
|
||||
// replaced with a space, even if the codepoint was represented with a
|
||||
// multibyte sequence in the UTF-8 data.
|
||||
//
|
||||
inline UnicodeText MakeUnicodeTextAcceptingOwnership(
|
||||
char* utf8_buffer, int byte_length, int byte_capacity) {
|
||||
return UnicodeText().TakeOwnershipOfUTF8(
|
||||
utf8_buffer, byte_length, byte_capacity);
|
||||
}
|
||||
|
||||
// A factory function for creating a UnicodeText from a buffer of
|
||||
// UTF-8 data. The new UnicodeText does not take ownership of the
|
||||
// buffer. (It is an "alias.")
|
||||
//
|
||||
inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
|
||||
const char* utf8_buffer, int byte_length) {
|
||||
return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
|
||||
}
|
||||
|
||||
// Create a UnicodeText from a UTF-8 string or buffer.
|
||||
//
|
||||
// If do_copy is true, then a copy of the string is made. The copy is
|
||||
// owned by the resulting UnicodeText object and will be freed when
|
||||
// the object is destroyed. This UnicodeText object is referred to
|
||||
// as an "owner."
|
||||
//
|
||||
// If do_copy is false, then no copy is made. The resulting
|
||||
// UnicodeText object does NOT take ownership of the string; in this
|
||||
// case, the lifetime of the UnicodeText object must not exceed the
|
||||
// lifetime of the string. This UnicodeText object is referred to as
|
||||
// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
|
||||
//
|
||||
// If the input string does not contain valid UTF-8, then a copy is
|
||||
// made (as if do_copy were true) and coerced to valid UTF-8 by
|
||||
// replacing each invalid byte with a space.
|
||||
//
|
||||
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
                                     bool do_copy) {
  // A single named local with a single return statement is kept on
  // purpose. NOTE(review): whether an alias survives the return by value
  // presumably depends on the copy being elided -- confirm.
  UnicodeText result;
  if (!do_copy) {
    // Alias the caller's buffer; no copy, no ownership.
    result.PointToUTF8(utf8_buf, len);
  } else {
    // Own a private copy of the bytes.
    result.CopyUTF8(utf8_buf, len);
  }
  return result;
}
|
||||
|
||||
// String overload: forwards the string's bytes and length to the
// (buffer, length, do_copy) overload.
inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
  const char* const bytes = utf_string.data();
  const int byte_count = static_cast<int>(utf_string.size());
  return UTF8ToUnicodeText(bytes, byte_count, do_copy);
}
|
||||
|
||||
// Buffer overload without do_copy: always requests a copy.
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
  const bool copy_data = true;
  return UTF8ToUnicodeText(utf8_buf, len, copy_data);
}
|
||||
// String overload without do_copy: always requests a copy.
inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
  const bool copy_data = true;
  return UTF8ToUnicodeText(utf8_string, copy_data);
}
|
||||
|
||||
// Return a string containing the UTF-8 encoded version of all the
|
||||
// Unicode characters in t.
|
||||
inline string UnicodeTextToUTF8(const UnicodeText& t) {
|
||||
return string(t.utf8_data(), t.utf8_length());
|
||||
}
|
||||
|
||||
// Helper for arraysize(): a function template that takes a reference to
// an array of Count elements and returns a reference to a char array of
// the same extent. It is declared but never defined, because only its
// return type is inspected (inside sizeof).
template <typename Element, size_t Count>
char (&ArraySizeHelper(Element (&array)[Count]))[Count];

// Number of elements in a built-in array, as a compile-time constant.
#define arraysize(arr) (sizeof(ArraySizeHelper(arr)))
|
||||
|
||||
// For debugging. Return a string of integers, written in uppercase
|
||||
// hex (%X), corresponding to the codepoints within the text. Each
|
||||
// integer is followed by a space. E.g., "61 62 6A 3005 ".
|
||||
string CodepointString(const UnicodeText& t);
|
||||
|
||||
#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_
|
58
unittest/util/utf8/unilib.cc
Normal file
58
unittest/util/utf8/unilib.cc
Normal file
@ -0,0 +1,58 @@
|
||||
/**
|
||||
* Copyright 2010 Google Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Author: sligocki@google.com (Shawn Ligocki)
|
||||
|
||||
#include "util/utf8/unilib.h"
|
||||
|
||||
#include "syntaxnet/base.h"
|
||||
#include "third_party/utf/utf.h"
|
||||
|
||||
namespace UniLib {
|
||||
|
||||
// Codepoints not allowed for interchange are:
|
||||
// C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020),
|
||||
// Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A),
|
||||
// Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D)
|
||||
// C1 controls: U+007F to U+009F
|
||||
// Surrogates: U+D800 to U+DFFF
|
||||
// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
|
||||
bool IsInterchangeValid(char32 c) {
|
||||
return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
|
||||
(c >= 0x7F && c <= 0x9F) ||
|
||||
(c >= 0xD800 && c <= 0xDFFF) ||
|
||||
(c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE);
|
||||
}
|
||||
|
||||
int SpanInterchangeValid(const char* begin, int byte_length) {
|
||||
char32 rune;
|
||||
const char* p = begin;
|
||||
const char* end = begin + byte_length;
|
||||
while (p < end) {
|
||||
int bytes_consumed = charntorune(&rune, p, end - p);
|
||||
// We want to accept Runeerror == U+FFFD as a valid char, but it is used
|
||||
// by chartorune to indicate error. Luckily, the real codepoint is size 3
|
||||
// while errors return bytes_consumed <= 1.
|
||||
if ((rune == Runeerror && bytes_consumed <= 1) ||
|
||||
!IsInterchangeValid(rune)) {
|
||||
break; // Found
|
||||
}
|
||||
p += bytes_consumed;
|
||||
}
|
||||
return p - begin;
|
||||
}
|
||||
|
||||
} // namespace UniLib
|
63
unittest/util/utf8/unilib.h
Normal file
63
unittest/util/utf8/unilib.h
Normal file
@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Copyright 2010 Google Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Routines to do manipulation of Unicode characters or text
|
||||
//
|
||||
// The StructurallyValid routines accept buffers of arbitrary bytes.
|
||||
// For CoerceToStructurallyValid(), the input buffer and output buffers may
|
||||
// point to exactly the same memory.
|
||||
//
|
||||
// In all other cases, the UTF-8 string must be structurally valid and
|
||||
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
|
||||
// Debug builds take a fatal error for invalid UTF-8 input.
|
||||
// The input and output buffers may not overlap at all.
|
||||
//
|
||||
// The char32 routines are here only for convenience; they convert to UTF-8
|
||||
// internally and use the UTF-8 routines.
|
||||
|
||||
#ifndef UTIL_UTF8_UNILIB_H__
|
||||
#define UTIL_UTF8_UNILIB_H__
|
||||
|
||||
#include <string>
|
||||
#include "syntaxnet/base.h"
|
||||
|
||||
// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,
|
||||
// but they are defined in unilib_utf8_utils.h.
|
||||
//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
|
||||
|
||||
namespace UniLib {
|
||||
|
||||
// Returns the length in bytes of the prefix of src that is all
|
||||
// interchange valid UTF-8
|
||||
int SpanInterchangeValid(const char* src, int byte_length);
|
||||
inline int SpanInterchangeValid(const std::string& src) {
|
||||
return SpanInterchangeValid(src.data(), src.size());
|
||||
}
|
||||
|
||||
// Returns true if the source is all interchange valid UTF-8
|
||||
// "Interchange valid" is stronger than structurally valid --
|
||||
// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
|
||||
bool IsInterchangeValid(char32 codepoint);
|
||||
inline bool IsInterchangeValid(const char* src, int byte_length) {
|
||||
return (byte_length == SpanInterchangeValid(src, byte_length));
|
||||
}
|
||||
inline bool IsInterchangeValid(const std::string& src) {
|
||||
return IsInterchangeValid(src.data(), src.size());
|
||||
}
|
||||
|
||||
} // namespace UniLib
|
||||
|
||||
#endif  // UTIL_UTF8_UNILIB_H__
|
66
unittest/util/utf8/unilib_utf8_utils.h
Normal file
66
unittest/util/utf8/unilib_utf8_utils.h
Normal file
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Copyright 2010 Google Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
|
||||
#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
|
||||
|
||||
// These definitions are self-contained and have no dependencies.
|
||||
// They are also exported from unilib.h for legacy reasons.
|
||||
|
||||
#include "syntaxnet/base.h"
|
||||
#include "third_party/utf/utf.h"
|
||||
|
||||
namespace UniLib {
|
||||
|
||||
// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF]
|
||||
// (i.e., is not a surrogate codepoint). See also
|
||||
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
|
||||
inline bool IsValidCodepoint(char32 c) {
|
||||
return (static_cast<uint32>(c) < 0xD800)
|
||||
|| (c >= 0xE000 && c <= 0x10FFFF);
|
||||
}
|
||||
|
||||
// Returns true if 'str' is the start of a structurally valid UTF-8
|
||||
// sequence and is not a surrogate codepoint. Returns false if str.empty()
|
||||
// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
|
||||
// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]).
|
||||
inline bool IsUTF8ValidCodepoint(StringPiece str) {
|
||||
char32 c;
|
||||
int consumed;
|
||||
// It's OK if str.length() > consumed.
|
||||
return !str.empty()
|
||||
&& isvalidcharntorune(str.data(), str.size(), &c, &consumed)
|
||||
&& IsValidCodepoint(c);
|
||||
}
|
||||
|
||||
// Byte length of the UTF-8 encoded code point that starts at src,
// determined from that first byte alone. src must point into a
// well-formed UTF-8 string; for anything else the result is undefined.
inline int OneCharLen(const char* src) {
  // Indexed by the top four bits of the lead byte:
  // 0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
  static const char kLengthByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                               1, 1, 1, 1, 2, 2, 3, 4};
  const unsigned char lead = static_cast<unsigned char>(*src);
  return kLengthByHighNibble[lead >> 4];
}
|
||||
|
||||
// True when x is a UTF-8 continuation ("trail") byte, i.e. has the bit
// pattern 10xx xxxx.
inline bool IsTrailByte(char x) {
  // Trail bytes occupy [0x80, 0xBF]. Read as a signed 8-bit value that is
  // exactly the set of values below -0x40, so one signed comparison
  // stands in for the mask test (x & 0xC0) == 0x80.
  const signed char as_signed = static_cast<signed char>(x);
  const int first_non_trail = -0x40;
  return as_signed < first_non_trail;
}
|
||||
|
||||
} // namespace UniLib
|
||||
|
||||
#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
|
Loading…
Reference in New Issue
Block a user