Merge pull request #2523 from stweil/unilib

unittest: Add unilib.h and other code from Tensorflow and use it for more tests
This commit is contained in:
zdenop 2019-06-28 12:27:15 +02:00 committed by GitHub
commit 653faa3a64
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 1947 additions and 66 deletions

View File

@ -132,7 +132,6 @@ check_PROGRAMS += matrix_test
check_PROGRAMS += nthitem_test
check_PROGRAMS += osd_test
# check_PROGRAMS += pagesegmode_test
# check_PROGRAMS += pango_font_info_test
check_PROGRAMS += paragraphs_test
check_PROGRAMS += params_model_test
check_PROGRAMS += progress_test
@ -159,6 +158,7 @@ check_PROGRAMS += lstm_squashed_test
check_PROGRAMS += lstm_test
check_PROGRAMS += lstmtrainer_test
check_PROGRAMS += normstrngs_test
check_PROGRAMS += pango_font_info_test
check_PROGRAMS += unichar_test
check_PROGRAMS += unicharcompress_test
check_PROGRAMS += unicharset_test
@ -273,13 +273,22 @@ matrix_test_SOURCES = matrix_test.cc
matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
normstrngs_test_SOURCES = normstrngs_test.cc
normstrngs_test_SOURCES += third_party/utf/rune.c util/utf8/unilib.cc
normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)
nthitem_test_SOURCES = nthitem_test.cc
nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
#pango_font_info_test_SOURCES = pango_font_info_test.cc
#pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
osd_test_SOURCES = osd_test.cc
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
pango_font_info_test_SOURCES = pango_font_info_test.cc
pango_font_info_test_SOURCES += third_party/utf/rune.c
pango_font_info_test_SOURCES += util/utf8/unicodetext.cc util/utf8/unilib.cc
pango_font_info_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(LEPTONICA_LIBS)
pango_font_info_test_LDADD += $(ICU_I18N_LIBS) -lfontconfig
pango_font_info_test_LDADD += -lpangocairo-1.0 -lpangoft2-1.0
pango_font_info_test_LDADD += $(cairo_LIBS) $(pango_LIBS)
paragraphs_test_SOURCES = paragraphs_test.cc
paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
@ -287,9 +296,6 @@ paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
params_model_test_SOURCES = params_model_test.cc
params_model_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
osd_test_SOURCES = osd_test.cc
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
progress_test_SOURCES = progress_test.cc
progress_test_LDFLAGS = $(OPENCL_LDFLAGS) $(LEPTONICA_LIBS)
progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)

View File

@ -319,7 +319,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
const char* langs[] = {"eng", "chi_tra", "jpn", "vie"};
std::unique_ptr<tesseract::TessBaseAPI> api;
CycleTimer timer;
for (int i = 0; i < ARRAYSIZE(langs); ++i) {
for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
api.reset(new tesseract::TessBaseAPI);
timer.Restart();
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
@ -333,7 +333,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
vars_vec.push_back(STRING("tessedit_init_config_only"));
vars_values.push_back(STRING("1"));
LOG(INFO) << "Switching to config only initialization:";
for (int i = 0; i < ARRAYSIZE(langs); ++i) {
for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
api.reset(new tesseract::TessBaseAPI);
timer.Restart();
EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],

View File

@ -34,7 +34,7 @@ class HeapTest : public testing::Test {
virtual ~HeapTest();
// Pushes the test data onto both the heap and the KDVector.
void PushTestData(GenericHeap<IntKDPair>* heap, KDVector* v) {
for (int i = 0; i < ARRAYSIZE(test_data); ++i) {
for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
IntKDPair pair(test_data[i], i);
heap->Push(&pair);
v->push_back(pair);
@ -137,7 +137,7 @@ TEST_F(HeapTest, RevalueTest) {
GenericHeap<PtrPair> heap;
GenericVector<PtrPair> v;
// Push the test data onto both the heap and the vector.
for (int i = 0; i < ARRAYSIZE(test_data); ++i) {
for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
PtrPair h_pair;
h_pair.key = test_data[i];
PtrPair v_pair;

View File

@ -15,16 +15,14 @@
#include "normstrngs_test.h"
#include "strngs.h"
#include "unichar.h"
#if defined(HAS_UNILIB_H)
#include "unilib.h"
#endif
#include "util/utf8/unilib.h"
#include "include_gunit.h"
namespace tesseract {
namespace {
#if defined(HAS_UNILIB_H)
#if defined(MISSING_CODE)
static std::string EncodeAsUTF8(const char32 ch32) {
UNICHAR uni_ch(ch32);
return std::string(uni_ch.utf8(), uni_ch.utf8_len());
@ -363,7 +361,6 @@ TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));
}
#if defined(HAS_UNILIB_H)
// Test that the method clones the util/utf8/public/unilib definition of
// interchange validity.
TEST(NormstrngsTest, IsInterchangeValid) {
@ -374,12 +371,11 @@ TEST(NormstrngsTest, IsInterchangeValid) {
EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch));
}
}
#endif
#if defined(HAS_UNILIB_H)
// Test that the method clones the util/utf8/public/unilib definition of
// 7-bit ASCII interchange validity.
TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
#if defined(MISSING_CODE)
const int32_t kMinUnicodeValue = 33;
const int32_t kMaxUnicodeValue = 0x10FFFF;
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
@ -388,8 +384,11 @@ TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str),
IsInterchangeValid7BitAscii(ch));
}
}
#else
// Skipped because of missing UniLib::IsInterchangeValid7BitAscii.
GTEST_SKIP();
#endif
}
// Test that the method clones the util/utf8/public/unilib definition of
// fullwidth-halfwidth .
@ -401,7 +400,8 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) {
// U+FFE6 -> U+20A9 (won sign)
EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));
#if defined(HAS_UNILIB_H)
#if defined(MISSING_CODE)
// Skipped because of missing UniLib::FullwidthToHalfwidth.
const int32_t kMinUnicodeValue = 33;
const int32_t kMaxUnicodeValue = 0x10FFFF;
for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {

View File

@ -1,12 +1,24 @@
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "tesseract/training/pango_font_info.h"
#include <stdio.h>
#include <string.h>
#include "pango/pango.h"
#include "tesseract/training/commandlineflags.h"
#include "tesseract/training/fileio.h"
#include <cstdio>
#include <string>
#include <pango/pango.h>
#include "include_gunit.h"
#include "commandlineflags.h"
#include "fileio.h"
#include "pango_font_info.h"
#include "absl/strings/str_cat.h" // for absl::StrCat
#include "gmock/gmock-matchers.h" // for EXPECT_THAT
#include "util/utf8/unicodetext.h" // for UnicodeText
DECLARE_STRING_PARAM_FLAG(fonts_dir);
DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
@ -19,19 +31,19 @@ using tesseract::FontUtils;
using tesseract::PangoFontInfo;
// Fonts in testdata directory
const char* kExpectedFontNames[] = {"Arab",
"Arial Bold Italic",
"DejaVu Sans Ultra-Light",
"Lohit Hindi",
const char* kExpectedFontNames[] = {
"Arab",
"Arial Bold Italic",
"DejaVu Sans Ultra-Light",
"Lohit Hindi",
#if PANGO_VERSION <= 12005
"Times New Roman",
"Times New Roman",
#else
"Times New Roman,", // Pango v1.36.2
// requires a trailing
// ','
"Times New Roman,", // Pango v1.36.2 requires a trailing ','
#endif
"UnBatang",
"Verdana"};
"UnBatang",
"Verdana"
};
// Sample text used in tests.
const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
@ -41,23 +53,27 @@ const char kKorText[] = "이는 것으로";
// Hindi words containing illegal vowel sequences.
const char* kBadlyFormedHinWords[] = {
#if PANGO_VERSION <= 12005
"उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
"उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
#endif
// Pango v1.36.2 will render the above words even though they are invalid.
"प्रंात", nullptr};
// Pango v1.36.2 will render the above words even though they are invalid.
"प्रंात", nullptr
};
class PangoFontInfoTest : public ::testing::Test {
protected:
void SetUp() override {
std::locale::global(std::locale(""));
static std::locale system_locale("");
std::locale::global(system_locale);
}
// Creates a fake fonts.conf file that points to the testdata fonts for
// fontconfig to initialize with.
static void SetUpTestCase() {
FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
FLAGS_fonts_dir = TESTING_DIR;
FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
#ifdef GOOGLE_TESSERACT
FLAGS_use_only_legacy_fonts = false;
#endif
}
PangoFontInfo font_info_;
@ -120,7 +136,7 @@ TEST_F(PangoFontInfoTest, CanRenderLigature) {
font_info_.ParseFontDescriptionName("Arab 12");
const char kArabicLigature[] = "لا";
EXPECT_TRUE(
font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
printf("Next word\n");
EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
@ -143,17 +159,17 @@ TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
font_info_.ParseFontDescriptionName("Verdana 12");
// Verdana cannot render the "ff" ligature
string word = "office";
std::string word = "office";
EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
EXPECT_EQ("oice", word);
// Don't drop non-letter characters like word joiners.
const char* kJoiners[] = {
"\u2060", // U+2060 WORD JOINER (WJ)
"\u200C", // U+200C ZERO WIDTH NON-JOINER (ZWNJ)
"\u200D" // U+200D ZERO WIDTH JOINER (ZWJ)
"\u2060", // U+2060 WORD JOINER (WJ)
"\u200C", // U+200C ZERO WIDTH NON-JOINER (ZWNJ)
"\u200D" // U+200D ZERO WIDTH JOINER (ZWJ)
};
for (int i = 0; i < ARRAYSIZE(kJoiners); ++i) {
for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) {
word = kJoiners[i];
EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
EXPECT_STREQ(kJoiners[i], word.c_str());
@ -167,17 +183,21 @@ class FontUtilsTest : public ::testing::Test {
// Creates a fake fonts.conf file that points to the testdata fonts for
// fontconfig to initialize with.
static void SetUpTestCase() {
FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
FLAGS_fonts_dir = TESTING_DIR;
FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
}
void CountUnicodeChars(const char* utf8_text,
std::unordered_map<char32, inT64>* ch_map) {
std::unordered_map<char32, int64_t>* ch_map) {
ch_map->clear();
UnicodeText ut;
ut.PointToUTF8(utf8_text, strlen(utf8_text));
for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
#if 0
if (UnicodeProps::IsWhitespace(*it)) continue;
#else
if (std::isspace(*it)) continue;
#endif
++(*ch_map)[*it];
}
}
@ -206,21 +226,21 @@ TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
}
TEST_F(FontUtilsTest, DoesListAvailableFonts) {
const std::vector<string>& fonts = FontUtils::ListAvailableFonts();
const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts();
EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
for (int i = 0; i < fonts.size(); ++i) {
for (auto& font : fonts) {
PangoFontInfo font_info;
EXPECT_TRUE(font_info.ParseFontDescriptionName(fonts[i]));
EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
}
}
TEST_F(FontUtilsTest, DoesFindBestFonts) {
string fonts_list;
std::unordered_map<char32, inT64> ch_map;
std::string fonts_list;
std::unordered_map<char32, int64_t> ch_map;
CountUnicodeChars(kEngText, &ch_map);
EXPECT_EQ(26, ch_map.size()); // 26 letters
std::vector<std::pair<const char*, std::vector<bool> > > font_flags;
string best_list = FontUtils::BestFonts(ch_map, &font_flags);
std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
EXPECT_TRUE(best_list.size());
// All fonts except Lohit Hindi should render English text.
EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size());
@ -238,8 +258,8 @@ TEST_F(FontUtilsTest, DoesSelectFont) {
const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
for (int i = 0; kLangText[i] != nullptr; ++i) {
SCOPED_TRACE(kLangNames[i]);
std::vector<string> graphemes;
string selected_font;
std::vector<std::string> graphemes;
std::string selected_font;
EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]),
&selected_font, &graphemes));
EXPECT_TRUE(selected_font.size());
@ -249,17 +269,17 @@ TEST_F(FontUtilsTest, DoesSelectFont) {
TEST_F(FontUtilsTest, DoesFailToSelectFont) {
const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
std::vector<string> graphemes;
string selected_font;
std::vector<std::string> graphemes;
std::string selected_font;
EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText),
&selected_font, &graphemes));
}
TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
const int32 kHindiChar = 0x0905;
const int32 kArabicChar = 0x0623;
const int32 kMongolianChar = 0x180E; // Mongolian vowel separator
const int32 kOghamChar = 0x1680; // Ogham space mark
const int32_t kHindiChar = 0x0905;
const int32_t kArabicChar = 0x0623;
const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator
const int32_t kOghamChar = 0x1680; // Ogham space mark
std::vector<bool> unicode_mask;
FontUtils::GetAllRenderableCharacters(&unicode_mask);
EXPECT_TRUE(unicode_mask['A']);
@ -267,10 +287,12 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
EXPECT_TRUE(unicode_mask[kHindiChar]);
EXPECT_TRUE(unicode_mask[kArabicChar]);
EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian.
#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham.
#endif
unicode_mask.clear();
std::vector<string> selected_fonts;
std::vector<std::string> selected_fonts;
selected_fonts.push_back("Lohit Hindi");
FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
EXPECT_TRUE(unicode_mask['1']);
@ -279,14 +301,18 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic,
EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian,
EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham.
unicode_mask.clear();
// Check that none of the included fonts cover the Mongolian or Ogham space
// characters.
for (int f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f]));
FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);
#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
EXPECT_FALSE(unicode_mask[kOghamChar]);
#endif
EXPECT_FALSE(unicode_mask[kMongolianChar]);
unicode_mask.clear();
}
}
} // namespace

61
unittest/syntaxnet/base.h Normal file
View File

@ -0,0 +1,61 @@
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef SYNTAXNET_BASE_H_
#define SYNTAXNET_BASE_H_
#include <functional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "google/protobuf/util/message_differencer.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/default/integral_types.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
// Global-namespace aliases. Because these using-declarations sit at file
// scope in a header, every translation unit that includes this file sees
// the short names without qualification.

// Fixed-width integer aliases taken from TensorFlow.
using tensorflow::int8;
using tensorflow::int16;
using tensorflow::int32;
using tensorflow::int64;
using tensorflow::uint8;
using tensorflow::uint16;
using tensorflow::uint64;
using tensorflow::uint32;
// Protobuf text-format helper and TensorFlow locking primitives.
using tensorflow::protobuf::TextFormat;
using tensorflow::mutex_lock;
using tensorflow::mutex;
// Standard containers.
// NOTE(review): std::map and std::pair are named here, but <map> and
// <utility> are not among this header's direct #includes -- presumably
// they arrive transitively through the headers above; confirm.
using std::map;
using std::pair;
using std::vector;
using std::unordered_map;
using std::unordered_set;
// Signed integer alias; presumably holds a Unicode code point -- TODO confirm
// against the users of char32.
typedef signed int char32;
using tensorflow::StringPiece;
using std::string;
// namespace syntaxnet
#endif // SYNTAXNET_BASE_H_

357
unittest/third_party/utf/rune.c vendored Normal file
View File

@ -0,0 +1,357 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
/*
 * Bit-layout constants shared by the UTF-8 routines in this file.
 *
 * BitN   payload bits carried by the lead byte of an N-byte sequence
 * Bitx   payload bits carried by each continuation byte
 * TN     high-bit marker of the lead byte of an N-byte sequence
 * Tx     high-bit marker of a continuation byte
 * RuneN  largest value that fits in an N-byte sequence
 * Maskx  mask selecting the payload bits of a continuation byte
 * Testx  mask selecting the two marker bits of a continuation byte
 * Bad    value stored in the output rune when decoding fails
 */
enum
{
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror,
};
/*
* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
* This is a slower but "safe" version of the old chartorune
* that works on strings that are not necessarily null-terminated.
*
* If you know for sure that your string is null-terminated,
* chartorune will be a bit faster.
*
* It is guaranteed not to attempt to access "length"
* past the incoming pointer. This is to avoid
* possible access violations. If the string appears to be
* well-formed but incomplete (i.e., to get the whole Rune
* we'd need to read past str+length) then we'll set the Rune
* to Bad and return 0.
*
* Note that if we have decoding problems for other
* reasons, we return 1 instead of 0.
*/
/*
 * Decodes one rune from the first `length` bytes at `str` into `*rune`.
 *
 * Returns the number of bytes consumed (1..4) on success.
 * Returns 1 with *rune = Bad when the bytes are not a valid encoding.
 * Returns 0 with *rune = Bad when more bytes than `length` allows would
 * have to be read to finish decoding.
 */
int
charntorune(Rune *rune, const char *str, int length)
{
int c, c1, c2, c3;
long l;
/* When we're not allowed to read anything */
if(length <= 0) {
goto badlen;
}
/*
* one character sequence (7-bit value)
* 00000-0007F => T1
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
return 1;
}
// If we can't read more than one character we must stop
/*
* This length check runs before the lead byte itself is validated, so
* with length == 1 every non-ASCII first byte yields 0, including bytes
* that could never start a sequence.
*/
if(length <= 1) {
goto badlen;
}
/*
* two character sequence (11-bit value)
* 0080-07FF => T2 Tx
*/
/*
* XOR with Tx clears the 10xxxxxx marker of a continuation byte, leaving
* only its six payload bits; any bit left under Testx means the byte was
* not a continuation byte.
*/
c1 = *(uchar*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
/* c below T2 is a continuation byte in lead position */
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
/* reject overlong forms: the value would have fit in one byte */
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
// If we can't read more than two characters we must stop
if(length <= 2) {
goto badlen;
}
/*
* three character sequence (16-bit value)
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(uchar*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
/* reject overlong forms: the value would have fit in two bytes */
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
/* If we can't read more than three characters we must stop */
if (length <= 3)
goto badlen;
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
/* reject overlong forms: the value would have fit in three bytes */
if (l <= Rune3)
goto bad;
/* 21 bits can express values above the largest rune; reject those */
if (l > Runemax)
goto bad;
*rune = l;
return 4;
}
// Support for 5-byte or longer UTF-8 would go here, but
// since we don't have that, we'll just fall through to bad.
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
/* ran out of permitted input before the sequence was complete */
badlen:
*rune = Bad;
return 0;
}
/*
* This is the older "unsafe" version, which works fine on
* null-terminated strings.
*/
/*
 * Decodes one rune from `str` into `*rune` without any length limit.
 *
 * Returns the number of bytes consumed (1..4) on success, or 1 with
 * *rune = Bad when the bytes are not a valid encoding.
 *
 * Up to three bytes after str[0] may be read. Each of those bytes must
 * pass the continuation-byte test before the next one is read; a zero
 * byte fails that test (0 ^ Tx still has a bit under Testx), so on a
 * NUL-terminated string decoding stops at the terminator.
 */
int
chartorune(Rune *rune, const char *str)
{
int c, c1, c2, c3;
long l;
/*
* one character sequence
* 00000-0007F => T1
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
/*
* XOR with Tx clears the 10xxxxxx marker of a continuation byte; any bit
* left under Testx means the byte was not a continuation byte.
*/
c1 = *(uchar*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
/* c below T2 is a continuation byte in lead position */
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
/* reject overlong forms: the value would have fit in one byte */
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(uchar*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
/* reject overlong forms: the value would have fit in two bytes */
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(uchar*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
/* reject overlong forms: the value would have fit in three bytes */
if (l <= Rune3)
goto bad;
/* 21 bits can express values above the largest rune; reject those */
if (l > Runemax)
goto bad;
*rune = l;
return 4;
}
/*
* Support for 5-byte or longer UTF-8 would go here, but
* since we don't have that, we'll just fall through to bad.
*/
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
}
/*
 * Decodes one rune from at most `length` bytes of `str` via charntorune,
 * storing the rune in *rune and the byte count in *consumed.
 * Returns nonzero when the input held a valid, complete sequence.
 */
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
  *consumed = charntorune(rune, str, length);

  if (*rune != Runeerror)
    return 1;

  /*
   * Runeerror is also the result of decoding a genuine three-byte
   * U+FFFD; charntorune reports failures with a count of 0 or 1, so a
   * count of 3 identifies the genuine character.
   */
  return *consumed == 3;
}
/*
 * Encodes *rune as UTF-8 into str and returns the number of bytes
 * written (1..4). Values above Runemax, which includes negative Runes
 * once converted to unsigned, are written as Runeerror (three bytes).
 * No terminator is appended.
 */
int
runetochar(char *str, const Rune *rune)
{
  /* Runes are signed; an unsigned copy makes one range check cover both ends. */
  unsigned long cp = *rune;

  /* 00000-0007F => 00-7F */
  if (cp <= Rune1) {
    str[0] = cp;
    return 1;
  }

  /* 0080-07FF => T2 Tx */
  if (cp <= Rune2) {
    str[0] = T2 | (cp >> 1*Bitx);
    str[1] = Tx | (cp & Maskx);
    return 2;
  }

  /*
   * Substitute the error rune for out-of-range input. The check sits
   * here rather than at the top because the error rune needs three
   * bytes, and nothing out of range could have taken the branches above.
   */
  if (cp > Runemax)
    cp = Runeerror;

  /* 10000-10FFFF => T4 Tx Tx Tx */
  if (cp > Rune3) {
    str[0] = T4 | (cp >> 3*Bitx);
    str[1] = Tx | ((cp >> 2*Bitx) & Maskx);
    str[2] = Tx | ((cp >> 1*Bitx) & Maskx);
    str[3] = Tx | (cp & Maskx);
    return 4;
  }

  /* 0800-FFFF => T3 Tx Tx */
  str[0] = T3 | (cp >> 2*Bitx);
  str[1] = Tx | ((cp >> 1*Bitx) & Maskx);
  str[2] = Tx | (cp & Maskx);
  return 3;
}
/*
 * Returns the number of bytes runetochar produces for `rune`.
 */
int
runelen(Rune rune)
{
  char scratch[10];  /* throwaway output; only the length is wanted */

  return runetochar(scratch, &rune);
}
/*
 * Returns the total number of UTF-8 bytes needed to encode the first
 * `nrune` runes at `r`, using the same size classes as runetochar.
 * A zero or negative `nrune` yields 0 and reads nothing from `r`.
 */
int
runenlen(const Rune *r, int nrune)
{
  int nb = 0;
  int i;

  /*
   * Bounded by an explicit comparison: a plain `while(nrune--)` keeps
   * running for a negative count and walks past the caller's buffer.
   */
  for (i = 0; i < nrune; i++) {
    ulong c = r[i];  /* Rune is signed, so use unsigned for range check. */

    if (c <= Rune1)
      nb++;
    else if (c <= Rune2)
      nb += 2;
    else if (c <= Rune3)
      nb += 3;
    else if (c <= Runemax)
      nb += 4;
    else
      nb += 3;  /* Runeerror = 0xFFFD, see runetochar */
  }
  return nb;
}
/*
 * Returns 1 when the first `n` bytes at `str` are enough for chartorune
 * to decode a rune, judged from the first byte alone, and 0 otherwise.
 * The bytes are not checked for being a legal encoding.
 */
int
fullrune(const char *str, int n)
{
  int lead;

  if (n <= 0)
    return 0;

  lead = *(uchar*)str;
  if (lead < Tx)
    return 1;  /* single-byte rune */

  if (n == 1)
    return 0;
  if (lead < T3)
    return 1;  /* two bytes are enough */

  if (n == 2)
    return 0;

  /* three bytes are enough below T4; otherwise a fourth is required */
  return (lead < T4 || n > 3) ? 1 : 0;
}

246
unittest/third_party/utf/utf.h vendored Normal file
View File

@ -0,0 +1,246 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#ifndef _UTFH_
#define _UTFH_ 1
#include <stdint.h>
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
enum
{
UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0x10FFFF, /* maximum rune value */
};
#ifdef __cplusplus
extern "C" {
#endif
/*
* rune routines
*/
/*
* These routines were written by Rob Pike and Ken Thompson
* and first appeared in Plan 9.
* SEE ALSO
* utf (7)
* tcs (1)
*/
// runetochar copies (encodes) one rune, pointed to by r, to at most
// UTFmax bytes starting at s and returns the number of bytes generated.
int runetochar(char* s, const Rune* r);
// chartorune copies (decodes) at most UTFmax bytes starting at s to
// one rune, pointed to by r, and returns the number of bytes consumed.
// If the input is not exactly in UTF format, chartorune will set *r
// to Runeerror and return 1.
//
// Note: There is no special case for a "null-terminated" string. A
// string whose first byte has the value 0 is the UTF8 encoding of the
// Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
// anywhere else in a UTF sequence.
int chartorune(Rune* r, const char* s);
// charntorune is like chartorune, except that it will access at most
// n bytes of s. If the UTF sequence is incomplete within n bytes,
// charntorune will set *r to Runeerror and return 0. If it is complete
// but not in UTF format, it will set *r to Runeerror and return 1.
//
// Added 2004-09-24 by Wei-Hwa Huang
int charntorune(Rune* r, const char* s, int n);
// isvalidcharntorune(str, n, r, consumed)
// is a convenience function that calls "*consumed = charntorune(r, str, n)"
// and returns an int (logically boolean) indicating whether the first
// n bytes of str was a valid and complete UTF sequence.
int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
// runelen returns the number of bytes required to convert r into UTF.
int runelen(Rune r);
// runenlen returns the number of bytes required to convert the n
// runes pointed to by r into UTF.
int runenlen(const Rune* r, int n);
// fullrune returns 1 if the string s of length n is long enough to be
// decoded by chartorune, and 0 otherwise. This does not guarantee
// that the string contains a legal UTF encoding. This routine is used
// by programs that obtain input one byte at a time and need to know
// when a full rune has arrived.
int fullrune(const char* s, int n);
// The following routines are analogous to the corresponding string
// routines with "utf" substituted for "str", and "rune" substituted
// for "chr".
// utflen returns the number of runes that are represented by the UTF
// string s. (cf. strlen)
int utflen(const char* s);
// utfnlen returns the number of complete runes that are represented
// by the first n bytes of the UTF string s. If the last few bytes of
// the string contain an incompletely coded rune, utfnlen will not
// count them; in this way, it differs from utflen, which includes
// every byte of the string. (cf. strnlen)
int utfnlen(const char* s, long n);
// utfrune returns a pointer to the first occurrence of rune r in the
// UTF string s, or 0 if r does not occur in the string. The NUL
// byte terminating a string is considered to be part of the string s.
// (cf. strchr)
const char* utfrune(const char* s, Rune r);
// utfrrune returns a pointer to the last occurrence of rune r in the
// UTF string s, or 0 if r does not occur in the string. The NUL
// byte terminating a string is considered to be part of the string s.
// (cf. strrchr)
const char* utfrrune(const char* s, Rune r);
// utfutf returns a pointer to the first occurrence of the UTF string
// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
// null string, utfutf returns s1. (cf. strstr)
const char* utfutf(const char* s1, const char* s2);
// utfecpy copies UTF sequences until a null sequence has been copied,
// but writes no sequences beyond es1. If any sequences are copied,
// s1 is terminated by a null sequence, and a pointer to that sequence
// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
char* utfecpy(char *s1, char *es1, const char *s2);
// These functions are rune-string analogues of the corresponding
// functions in strcat (3).
//
// These routines first appeared in Plan 9.
// SEE ALSO
// memmove (3)
// rune (3)
// strcat (2)
//
// BUGS: The outcome of overlapping moves varies among implementations.
Rune* runestrcat(Rune* s1, const Rune* s2);
Rune* runestrncat(Rune* s1, const Rune* s2, long n);
const Rune* runestrchr(const Rune* s, Rune c);
int runestrcmp(const Rune* s1, const Rune* s2);
int runestrncmp(const Rune* s1, const Rune* s2, long n);
Rune* runestrcpy(Rune* s1, const Rune* s2);
Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
Rune* runestrdup(const Rune* s);
const Rune* runestrrchr(const Rune* s, Rune c);
long runestrlen(const Rune* s);
const Rune* runestrstr(const Rune* s1, const Rune* s2);
// The following routines test types and modify cases for Unicode
// characters. Unicode defines some characters as letters and
// specifies three cases: upper, lower, and title. Mappings among the
// cases are also defined, although they are not exhaustive: some
// upper case letters have no lower case mapping, and so on. Unicode
// also defines several character properties, a subset of which are
// checked by these routines. These routines are based on Unicode
// version 3.0.0.
//
// NOTE: The routines are implemented in C, so the boolean functions
// (e.g., isupperrune) return 0 for false and 1 for true.
//
//
// toupperrune, tolowerrune, and totitlerune are the Unicode case
// mappings. These routines return the character unchanged if it has
// no defined mapping.
Rune toupperrune(Rune r);
Rune tolowerrune(Rune r);
Rune totitlerune(Rune r);
// isupperrune tests for upper case characters, including Unicode
// upper case letters and targets of the toupper mapping. islowerrune
// and istitlerune are defined analogously.
int isupperrune(Rune r);
int islowerrune(Rune r);
int istitlerune(Rune r);
// isalpharune tests for Unicode letters; this includes ideographs in
// addition to alphabetic characters.
int isalpharune(Rune r);
// isdigitrune tests for digits. Non-digit numbers, such as Roman
// numerals, are not included.
int isdigitrune(Rune r);
// isideographicrune tests for ideographic characters and numbers, as
// defined by the Unicode standard.
int isideographicrune(Rune r);
// isspacerune tests for whitespace characters, including "C" locale
// whitespace, Unicode defined whitespace, and the "zero-width
// non-break space" character.
int isspacerune(Rune r);
// (The comments in this file were copied from the manpage files rune.3,
// isalpharune.3, and runestrcat.3. Some formatting changes were also made
// to conform to Google style. /JRM 11/11/05)
#ifdef __cplusplus
}
#endif
#endif

14
unittest/third_party/utf/utfdef.h vendored Normal file
View File

@ -0,0 +1,14 @@
/*
 * Short type names and helper macros used by the UTF sources.
 *
 * NOTE(review): this header has no include guard, so including it twice
 * repeats the typedefs below -- confirm every user includes it once.
 */

/*
 * Remap the short names to _utf-prefixed identifiers before they are
 * declared -- presumably so the typedefs below cannot collide with
 * same-named typedefs from system headers (TODO confirm).
 * vlong and uvlong are remapped too, but this header declares no type
 * for either of them.
 */
#define uchar _utfuchar
#define ushort _utfushort
#define uint _utfuint
#define ulong _utfulong
#define vlong _utfvlong
#define uvlong _utfuvlong
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
/* Element count of an array; x must be an actual array, not a pointer. */
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
/* Null pointer spelled as a void pointer. */
#define nil ((void*)0)

View File

@ -0,0 +1,507 @@
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "util/utf8/unicodetext.h"
#include <string.h> // for memcpy, NULL, memcmp, etc
#include <algorithm> // for max
//#include "base/logging.h" // for operator<<, CHECK, etc
//#include "base/stringprintf.h" // for StringPrintf, StringAppendF
//#include "strings/stringpiece.h" // for StringPiece, etc
#include "third_party/utf/utf.h" // for isvalidcharntorune, etc
#include "util/utf8/unilib.h" // for IsInterchangeValid, etc
#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
// Returns how many bytes in [start, end) are not UTF-8 trail bytes. For
// valid UTF-8 that is the number of codepoints in the range.
static int CodepointDistance(const char* start, const char* end) {
  int count = 0;
  const char* cursor = start;
  while (cursor < end) {
    // Trail bytes are 0x80..0xBF, i.e. -0x80..-0x41 when read as signed
    // char; any byte >= -0x40 therefore begins a codepoint.
    const signed char byte = *reinterpret_cast<const signed char*>(cursor);
    if (byte >= -0x40) {
      ++count;
    }
    ++cursor;
  }
  return count;
}
// Returns the number of non-trail bytes (codepoints, for valid UTF-8) in the
// len bytes starting at utf8.
static int CodepointCount(const char* utf8, int len) {
return CodepointDistance(utf8, utf8 + len);
}
// Returns the number of codepoints between two iterators by counting the
// non-trail bytes in [first.it_, last.it_). The loop in CodepointDistance
// does not run when first is at or after last, so the result is then 0.
UnicodeText::const_iterator::difference_type
distance(const UnicodeText::const_iterator& first,
const UnicodeText::const_iterator& last) {
return CodepointDistance(first.it_, last.it_);
}
// ---------- Utility ----------
// Rewrites the len bytes at start in place so that only interchange-valid
// UTF-8 remains, and returns the number of bytes kept (never more than len).
static int ConvertToInterchangeValid(char* start, int len) {
// This routine is called only when we've discovered that a UTF-8 buffer
// that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
// was not interchange valid. This indicates a bug in the caller, and
// a LOG(WARNING) is done in that case.
// This is similar to CoerceToInterchangeValid, but it replaces each
// structurally invalid byte with a space, and each non-interchange
// character with a space, even when that character requires more
// than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
// structurally valid UTF8, but U+FDD0 is not an interchange-valid
// code point. The result should contain one space, not three.
//
// Since the conversion never needs to write more data than it
// reads, it is safe to change the buffer in place. It returns the
// number of bytes written.
char* const in = start;
char* out = start;
char* const end = start + len;
while (start < end) {
// Length of the leading run that is already interchange-valid.
int good = UniLib::SpanInterchangeValid(start, end - start);
if (good > 0) {
// The run only needs moving once an earlier replacement has shrunk the
// output; source and destination may overlap, hence memmove.
if (out != start) {
memmove(out, start, good);
}
out += good;
start += good;
if (start == end) {
break;
}
}
// Is the current string invalid UTF8 or just non-interchange UTF8?
char32 rune;
int n;
if (isvalidcharntorune(start, end - start, &rune, &n)) {
// structurally valid UTF8, but not interchange valid
start += n; // Skip over the whole character.
} else { // bad UTF8
start += 1; // Skip over just one byte
}
// Whatever was skipped is represented by a single space.
*out++ = ' ';
}
return out - in;
}
// *************** Data representation **********
// Note: the copy constructor is undefined.
// After reserve(), resize(), or clear(), we're an owner, not an alias.
// Ensures this Repr owns a buffer of at least new_capacity bytes. An alias
// is always converted into an owner, even if its capacity was sufficient.
// The stored bytes and size_ are preserved.
void UnicodeText::Repr::reserve(int new_capacity) {
  // An owner that already has enough room needs no work.
  const bool already_fits = ours_ && capacity_ >= new_capacity;
  if (already_fits) return;

  // Grow by roughly 1.5x (+20) unless the request is larger still.
  const int grown = (3 * capacity_) / 2 + 20;
  capacity_ = std::max(new_capacity, grown);
  char* const fresh = new char[capacity_];

  if (data_ != nullptr) {
    // Carry the existing bytes over; release the old buffer only if owned.
    memcpy(fresh, data_, size_);
    if (ours_) {
      delete[] data_;
    }
  }
  data_ = fresh;
  ours_ = true;
}
// Sets the size to new_size bytes and makes this Repr an owner. Bytes added
// by growing are zero-filled; resizing to 0 is the same as clear().
void UnicodeText::Repr::resize(int new_size) {
  if (new_size == 0) {
    clear();
    return;
  }
  const bool needs_own_buffer = !ours_ || new_size > capacity_;
  if (needs_own_buffer) {
    reserve(new_size);
  }
  if (new_size > size_) {
    // Zero the newly exposed tail.
    memset(data_ + size_, 0, new_size - size_);
  }
  size_ = new_size;
  ours_ = true;
}
// Resets to an empty owner. An owned buffer is freed here; that is not
// strictly required (setting size_ to 0 would do), but it returns the
// memory right away. An aliased buffer is simply dropped, not freed.
void UnicodeText::Repr::clear() {
  if (ours_) {
    delete[] data_;
  }
  data_ = nullptr;
  capacity_ = 0;
  size_ = 0;
  ours_ = true;
}
// Replaces the contents with a private copy of the first size bytes of
// data. Afterwards this Repr is an owner.
void UnicodeText::Repr::Copy(const char* data, int size) {
  resize(size);
  // resize(0) leaves data_ null, and an empty source text also has null
  // data; the C library copy routines have undefined behavior on null
  // pointers even for a zero length, so skip the call when there is nothing
  // to copy. memmove (rather than memcpy) keeps the copy well defined when
  // data is this Repr's own buffer, e.g. x.Copy(x).
  if (size > 0) {
    memmove(data_, data, size);
  }
}
// Adopts data (size bytes used, capacity bytes allocated) as the owned
// buffer, releasing any buffer owned before. If data is already the current
// buffer nothing changes, including size_ and capacity_.
void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
  if (data_ == data) {
    return;  // We already hold this memory. (Weird case.)
  }
  const bool owns_old_buffer = ours_ && data_ != nullptr;
  if (owns_old_buffer) {
    delete[] data_;
  }
  ours_ = true;
  capacity_ = capacity;
  size_ = size;
  data_ = data;
}
// Turns this Repr into an alias of the size bytes at data. Any buffer that
// was owned before is freed; the aliased bytes are neither copied nor owned.
void UnicodeText::Repr::PointTo(const char* data, int size) {
  const bool owns_old_buffer = ours_ && data_ != nullptr;
  if (owns_old_buffer) {
    delete[] data_;
  }
  ours_ = false;
  capacity_ = size;
  size_ = size;
  // The alias is stored through a non-const pointer only because data_ is
  // shared with the owning case.
  data_ = const_cast<char*>(data);
}
void UnicodeText::Repr::append(const char* bytes, int byte_length) {
reserve(size_ + byte_length);
memcpy(data_ + size_, bytes, byte_length);
size_ += byte_length;
}
// Formats this Repr's address, buffer pointer, size, capacity and ownership
// state for debugging.
string UnicodeText::Repr::DebugString() const {
  const char* const ownership = ours_ ? "Owned" : "Alias";
  return tensorflow::strings::Printf(
      "{Repr %p data=%p size=%d capacity=%d %s}",
      this, data_, size_, capacity_, ownership);
}
// *************** UnicodeText ******************
// ----- Constructors -----
// Default constructor
// Creates an empty text; repr_ is default-constructed, which yields an
// empty owner with no buffer.
UnicodeText::UnicodeText() {
}
// Copy constructor
// Copies src's bytes into a buffer owned by the new object, so the result
// is an owner even when src is an alias.
UnicodeText::UnicodeText(const UnicodeText& src) {
Copy(src);
}
// Substring constructor
// Builds an owner holding a copy of the UTF-8 bytes in [first, last).
// CHECK-fails if first is after last.
UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
                         const UnicodeText::const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  const auto num_bytes = last.it_ - from;
  repr_.append(from, num_bytes);
}
// Returns the raw UTF-8 bytes in [first, last) as a string. CHECK-fails if
// first is after last.
string UnicodeText::UTF8Substring(const const_iterator& first,
                                  const const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  const auto num_bytes = last.it_ - from;
  return string(from, num_bytes);
}
// ----- Copy -----
// Assigns a copy of src's data, making this object an owner.
// Self-assignment is a no-op and leaves ownership as it was.
UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
  if (this == &src) {
    return *this;
  }
  return Copy(src);
}
// Replaces this text with a copy of src's UTF-8 bytes and returns *this.
// The bytes are copied as-is, without an interchange-validity check.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
repr_.Copy(src.repr_.data_, src.repr_.size_);
return *this;
}
// Copies byte_length bytes of UTF-8 from buffer into this text. If the
// input is not interchange-valid, a warning is logged and the private copy
// is repaired in place; the caller's buffer is never modified.
UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
  repr_.Copy(buffer, byte_length);
  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
    return *this;
  }
  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  // The repair can only shrink the data, so record the new length.
  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  return *this;
}
// Like CopyUTF8, but performs no interchange-validity check: the bytes are
// copied verbatim.
UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
int byte_length) {
repr_.Copy(buffer, byte_length);
return *this;
}
// ----- TakeOwnershipOf -----
// Adopts buffer (byte_length bytes used, byte_capacity allocated) as this
// text's storage. If the contents are not interchange-valid, a warning is
// logged and the adopted buffer is repaired in place.
UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
                                              int byte_length,
                                              int byte_capacity) {
  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  if (UniLib::IsInterchangeValid(buffer, byte_length)) {
    return *this;
  }
  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  return *this;
}
// Like TakeOwnershipOfUTF8, but performs no interchange-validity check on
// the adopted buffer.
UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
int byte_length,
int byte_capacity) {
repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
return *this;
}
// ----- PointTo -----
// Makes this text an alias of buffer when the bytes are interchange-valid.
// Otherwise a warning is logged and the text becomes an owner of a repaired
// copy, leaving the caller's buffer untouched.
UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    repr_.Copy(buffer, byte_length);
    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    return *this;
  }
  repr_.PointTo(buffer, byte_length);
  return *this;
}
// Like PointToUTF8, but performs no interchange-validity check: this text
// always becomes an alias of buffer.
UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
int byte_length) {
repr_.PointTo(buffer, byte_length);
return *this;
}
// Makes this text an alias of src's bytes; nothing is copied.
// NOTE(review): Repr::PointTo frees an owned buffer before storing the new
// pointer, so x.PointTo(x) on an owner looks like it would leave x pointing
// at freed memory -- confirm that callers never do this.
UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
repr_.PointTo(src.repr_.data_, src.repr_.size_);
return *this;
}
// Makes this text an alias of the bytes in [first, last); nothing is
// copied. CHECK-fails if first is after last.
UnicodeText& UnicodeText::PointTo(const const_iterator &first,
                                  const const_iterator &last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.utf8_data();
  const char* const to = last.utf8_data();
  repr_.PointTo(from, to - from);
  return *this;
}
// ----- Append -----
// Appends all of u's UTF-8 bytes to this text and returns *this. The bytes
// are not re-validated.
UnicodeText& UnicodeText::append(const UnicodeText& u) {
repr_.append(u.repr_.data_, u.repr_.size_);
return *this;
}
// Appends the UTF-8 bytes in [first, last) to this text and returns *this.
// CHECK-fails if first is after last.
UnicodeText& UnicodeText::append(const const_iterator& first,
                                 const const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  const auto num_bytes = last.it_ - from;
  repr_.append(from, num_bytes);
  return *this;
}
// Appends len raw bytes from utf8 without any validity check and returns
// *this.
UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
repr_.append(utf8, len);
return *this;
}
// ----- substring searching -----
// Searches for look starting at start_pos. CHECK-fails unless start_pos
// lies within [utf8_data(), utf8_data() + utf8_length()], then delegates to
// UnsafeFind.
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
const_iterator start_pos) const {
CHECK_GE(start_pos.utf8_data(), utf8_data());
CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
return UnsafeFind(look, start_pos);
}
// Searches for look from the beginning of the text. begin() is always a
// valid start position, so the range checks are skipped.
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
return UnsafeFind(look, begin());
}
// Returns an iterator to the first occurrence of look at or after
// start_pos, or end() if there is none. start_pos is not range-checked
// (see find() for the checked variant). An empty look matches at start_pos.
UnicodeText::const_iterator UnicodeText::UnsafeFind(
    const UnicodeText& look, const_iterator start_pos) const {
  // Due to the magic of the UTF8 encoding, searching for a sequence of
  // letters is equivalent to substring search: a lead byte can never be
  // mistaken for a trail byte, so a byte-level match that starts at the
  // beginning of the needle also starts on a character boundary.
  const char* const haystack_end = utf8_data() + utf8_length();
  const char* const needle_begin = look.utf8_data();
  const char* const needle_end = needle_begin + look.utf8_length();
  const char* const match = std::search(start_pos.utf8_data(), haystack_end,
                                        needle_begin, needle_end);
  if (match == haystack_end) return end();
  return const_iterator(match);
}
bool UnicodeText::HasReplacementChar() const {
// Equivalent to:
// UnicodeText replacement_char;
// replacement_char.push_back(0xFFFD);
// return find(replacement_char) != end();
StringPiece searching(utf8_data(), utf8_length());
StringPiece looking_for("\xEF\xBF\xBD", 3);
LOG(FATAL) << "Not implemented";
//return searching.find(looking_for) != StringPiece::npos;
return false;
}
// ----- other methods -----
// Clear operator
// Empties the text. Repr::clear() frees an owned buffer and leaves an
// empty owner, so an alias becomes an (empty) owner as well.
void UnicodeText::clear() {
repr_.clear();
}
// Destructor
// Nothing to do here: repr_'s own destructor frees the buffer if it is
// owned.
UnicodeText::~UnicodeText() {}
// Appends codepoint c, encoded as UTF-8. If c is not a valid codepoint, or
// is valid but not allowed for interchange, a warning is logged and a single
// space is appended instead.
void UnicodeText::push_back(char32 c) {
  if (!UniLib::IsValidCodepoint(c)) {
    LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
    repr_.append(" ", 1);
    return;
  }

  char encoded[UTFmax];
  const int encoded_len = runetochar(encoded, &c);
  if (!UniLib::IsInterchangeValid(encoded, encoded_len)) {
    LOG(WARNING) << "Unicode value 0x" << std::hex << c
                 << " is not valid for interchange";
    repr_.append(" ", 1);
    return;
  }

  repr_.append(encoded, encoded_len);
}
// Returns the number of codepoints (not bytes). This scans every byte of
// the stored UTF-8, so the cost grows with the text length.
int UnicodeText::size() const {
return CodepointCount(repr_.data_, repr_.size_);
}
// Two texts are equal when their UTF-8 byte sequences are identical.
// Ownership (owner vs. alias) and capacity are not compared.
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
  if (&lhs == &rhs) return true;
  const int num_bytes = lhs.repr_.size_;
  if (num_bytes != rhs.repr_.size_) return false;
  // Empty texts may hold null data pointers, and memcmp has undefined
  // behavior on null pointers even for a zero length, so two empty texts
  // are declared equal without calling it.
  if (num_bytes == 0) return true;
  return memcmp(lhs.repr_.data_, rhs.repr_.data_, num_bytes) == 0;
}
// Formats this object's address, its codepoint count and the debug string
// of its Repr.
string UnicodeText::DebugString() const {
  const int num_chars = size();
  const string repr_text = repr_.DebugString();
  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
                                     this, num_chars, repr_text.c_str());
}
// ******************* UnicodeText::const_iterator *********************
// The implementation of const_iterator would be nicer if it
// inherited from boost::iterator_facade
// (http://boost.org/libs/iterator/doc/iterator_facade.html).
// A default-constructed iterator holds a null byte pointer.
UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
// Copying an iterator copies only its byte pointer; both iterators then
// refer to the same position in the same buffer.
UnicodeText::const_iterator::const_iterator(const const_iterator& other)
: it_(other.it_) {
}
// Assignment copies the byte pointer; self-assignment is skipped.
UnicodeText::const_iterator&
UnicodeText::const_iterator::operator=(const const_iterator& other) {
if (&other != this)
it_ = other.it_;
return *this;
}
// Iterator at the first byte of the stored UTF-8 (a null pointer when there
// is no buffer).
UnicodeText::const_iterator UnicodeText::begin() const {
return const_iterator(repr_.data_);
}
// Iterator one past the last byte of the stored UTF-8.
UnicodeText::const_iterator UnicodeText::end() const {
return const_iterator(repr_.data_ + repr_.size_);
}
// Orders iterators by their byte pointers. This is presumably only
// meaningful for iterators into the same text -- confirm at call sites.
bool operator<(const UnicodeText::const_iterator& lhs,
const UnicodeText::const_iterator& rhs) {
return lhs.it_ < rhs.it_;
}
// Decodes and returns the codepoint at the iterator's position.
char32 UnicodeText::const_iterator::operator*() const {
  // (chartorune would also work, but it performs error checking that is
  // redundant here because the data is guaranteed to be valid UTF-8, and
  // this routine is expected to be called very often. So the decoding is
  // done by hand for speed.)
  //
  // The lead byte determines the sequence length; each trail byte is read
  // only once the lead byte has shown that it exists.
  const unsigned char lead = it_[0];
  if (lead < 0x80) {
    return lead;  // 1 byte: 0xxxxxxx
  }

  const unsigned char trail1 = it_[1];
  if (lead < 0xE0) {
    // 2 bytes: 110xxxxx 10xxxxxx
    return ((lead & 0x1F) << 6) | (trail1 & 0x3F);
  }

  const unsigned char trail2 = it_[2];
  if (lead < 0xF0) {
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    return ((lead & 0x0F) << 12) | ((trail1 & 0x3F) << 6) | (trail2 & 0x3F);
  }

  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  const unsigned char trail3 = it_[3];
  return ((lead & 0x07) << 18) | ((trail1 & 0x3F) << 12) |
         ((trail2 & 0x3F) << 6) | (trail3 & 0x3F);
}
// Advances to the next codepoint by skipping the number of bytes that
// UniLib::OneCharLen reports for the current position.
UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
it_ += UniLib::OneCharLen(it_);
return *this;
}
// Retreats to the previous codepoint: steps back at least one byte and keeps
// going while the byte reached is a UTF-8 trail byte.
UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
  do {
    --it_;
  } while (UniLib::IsTrailByte(*it_));
  return *this;
}
// Copies the UTF-8 encoding of the current codepoint into utf8_output and
// returns its length (1 to 4 bytes). No terminating NUL is written.
int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
  // The lead byte alone determines how long the sequence is.
  const int lead = it_[0] & 0xff;
  int num_bytes = 4;
  if (lead < 0x80) {
    num_bytes = 1;
  } else if (lead < 0xE0) {
    num_bytes = 2;
  } else if (lead < 0xF0) {
    num_bytes = 3;
  }
  for (int i = 0; i < num_bytes; ++i) {
    utf8_output[i] = it_[i];
  }
  return num_bytes;
}
// Returns the UTF-8 bytes of the current codepoint as a string of
// utf8_length() bytes.
string UnicodeText::const_iterator::get_utf8_string() const {
return string(utf8_data(), utf8_length());
}
// Returns the byte length (1 to 4) of the UTF-8 sequence at the iterator's
// position, derived from the lead byte alone.
int UnicodeText::const_iterator::utf8_length() const {
  const int lead = it_[0] & 0xff;
  if (lead < 0x80) return 1;
  if (lead < 0xE0) return 2;
  if (lead < 0xF0) return 3;
  return 4;
}
// Converts a raw pointer into this text's UTF-8 data back into an iterator.
// CHECK-fails if p is null, lies outside [utf8_data(), utf8_data() +
// utf8_length()], or points at a trail byte (the middle of a character).
UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
CHECK(p != nullptr);
const char* start = utf8_data();
int len = utf8_length();
const char* end = start + len;
CHECK(p >= start);
CHECK(p <= end);
// p == end is allowed (the end iterator) and must not be dereferenced.
CHECK(p == end || !UniLib::IsTrailByte(*p));
return const_iterator(p);
}
// Formats the iterator's byte pointer for debugging.
string UnicodeText::const_iterator::DebugString() const {
return tensorflow::strings::Printf("{iter %p}", it_);
}
// *************************** Utilities *************************
// For debugging: returns the codepoints of t in uppercase hex, each one
// followed by a single space.
string CodepointString(const UnicodeText& t) {
  string hex_codes;
  const UnicodeText::const_iterator stop = t.end();
  for (UnicodeText::const_iterator cursor = t.begin(); cursor != stop;
       ++cursor) {
    tensorflow::strings::Appendf(&hex_codes, "%X ", *cursor);
  }
  return hex_codes;
}

View File

@ -0,0 +1,477 @@
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
#include <stddef.h> // for NULL, ptrdiff_t
#include <iterator> // for bidirectional_iterator_tag, etc
#include <string> // for string
#include <utility> // for pair
#include "syntaxnet/base.h"
// ***************************** UnicodeText **************************
//
// A UnicodeText object is a container for a sequence of Unicode
// codepoint values. It has default and copy constructors and an
// assignment operator.
// Data can be appended to it from another UnicodeText, from
// iterators, or from a single codepoint.
//
// The internal representation of the text is UTF-8. Since UTF-8 is a
// variable-width format, UnicodeText does not provide random access
// to the text, and changes to the text are permitted only at the end.
//
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (char32). The iterator is a
// bidirectional, read-only iterator. It becomes invalid if the text
// is changed.
//
// There are methods for appending and retrieving UTF-8 data directly.
// The 'utf8_data' method returns a const char* that contains the
// UTF-8-encoded version of the text; 'utf8_length' returns the number
// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to
// 4 bytes of UTF-8 data in a char array and returns the number of
// bytes that it stored.
//
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
// 0x10FFFF], but UnicodeText has the additional restriction that it
// can contain only those characters that are valid for interchange on
// the Web. This excludes all of the control codes except for carriage
// return, line feed, and horizontal tab. It also excludes
// non-characters, but codepoints that are in the Private Use regions
// are allowed, as are codepoints that are unassigned. (See the
// Unicode reference for details.) The function UniLib::IsInterchangeValid
// can be used as a test for this property.
//
// UnicodeTexts are safe. Every method that constructs or modifies a
// UnicodeText tests for interchange-validity, and will substitute a
// space for the invalid data. Such cases are reported via
// LOG(WARNING).
//
// MEMORY MANAGEMENT: copy, take ownership, or point to
//
// A UnicodeText is either an "owner", meaning that it owns the memory
// for the data buffer and will free it when the UnicodeText is
// destroyed, or it is an "alias", meaning that it does not.
//
// There are three methods for storing UTF-8 data in a UnicodeText:
//
// CopyUTF8(buffer, len) copies buffer.
//
// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
//
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
//
// All three methods perform a validity check on the buffer. There are
// private, "unsafe" versions of these functions that bypass the
// validity check. They are used internally and by friend-functions
// that are handling UTF-8 data that has already been validated.
//
// The purpose of an alias is to avoid making an unnecessary copy of a
// UTF-8 buffer while still providing access to the Unicode values
// within that text through iterators or the fast scanners that are
// based on UTF-8 state tables. The lifetime of an alias must not
// exceed the lifetime of the buffer from which it was constructed.
//
// The semantics of an alias might be described as "copy on write or
// repair." The source data is never modified. If push_back() or
// append() is called on an alias, a copy of the data will be created,
// and the UnicodeText will become an owner. If clear() is called on
// an alias, it becomes an (empty) owner.
//
// The copy constructor and the assignment operator produce an owner.
// That is, after direct initialization ("UnicodeText x(y);") or copy
// initialization ("UnicodeText x = y;") x will be an owner, even if y
// was an alias. The assignment operator ("x = y;") also produces an
// owner unless x and y are the same object and y is an alias.
//
// Aliases should be used with care. If the source from which an alias
// was created is freed, or if the contents are changed, while the
// alias is still in use, fatal errors could result. But it can be
// quite useful to have a UnicodeText "window" through which to see a
// UTF-8 buffer without having to pay the price of making a copy.
//
// UTILITIES
//
// The interfaces in util/utf8/public/textutils.h provide higher-level
// utilities for dealing with UnicodeTexts, including routines for
// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
// strings, creating strings from UnicodeTexts, normalizing text for
// efficient matching or display, and others.
// Container for a sequence of Unicode codepoints stored as UTF-8. An
// instance is either an owner of its byte buffer or an alias of someone
// else's (tracked by Repr::ours_).
class UnicodeText {
public:
class const_iterator;
typedef char32 value_type;
// Constructors. These always produce owners.
UnicodeText(); // Create an empty text.
UnicodeText(const UnicodeText& src); // copy constructor
// Construct a substring (copies the data).
UnicodeText(const const_iterator& first, const const_iterator& last);
// Assignment operator. This copies the data and produces an owner
// unless this == &src, e.g., "x = x;", which is a no-op.
UnicodeText& operator=(const UnicodeText& src);
// x.Copy(y) copies the data from y into x.
UnicodeText& Copy(const UnicodeText& src);
inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
// x.PointTo(y) changes x so that it points to y's data.
// It does not copy y or take ownership of y's data.
UnicodeText& PointTo(const UnicodeText& src);
UnicodeText& PointTo(const const_iterator& first,
const const_iterator& last);
~UnicodeText();
void clear(); // Clear text.
bool empty() const { return repr_.size_ == 0; } // Test if text is empty.
// Add a codepoint to the end of the text.
// If the codepoint is not interchange-valid, add a space instead
// and log a warning.
void push_back(char32 codepoint);
// Generic appending operation.
// iterator_traits<ForwardIterator>::value_type must be implicitly
// convertible to char32. Typical uses of this method might include:
// char32 chars[] = {0x1, 0x2, ...};
// vector<char32> more_chars = ...;
// utext.append(chars, chars+arraysize(chars));
// utext.append(more_chars.begin(), more_chars.end());
template<typename ForwardIterator>
UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
while (first != last) { push_back(*first++); }
return *this;
}
// A specialization of the generic append() method.
UnicodeText& append(const const_iterator& first, const const_iterator& last);
// An optimization of append(source.begin(), source.end()).
UnicodeText& append(const UnicodeText& source);
int size() const; // the number of Unicode characters (codepoints)
friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
// Bidirectional, read-only iterator over the codepoints of a UnicodeText.
// It is a thin wrapper around a pointer into the UTF-8 bytes.
class const_iterator {
typedef const_iterator CI;
public:
typedef std::bidirectional_iterator_tag iterator_category;
typedef char32 value_type;
typedef ptrdiff_t difference_type;
typedef void pointer; // (Not needed.)
typedef const char32 reference; // (Needed for const_reverse_iterator)
// Iterators are default-constructible.
const_iterator();
// It's safe to make multiple passes over a UnicodeText.
const_iterator(const const_iterator& other);
const_iterator& operator=(const const_iterator& other);
char32 operator*() const; // Dereference
const_iterator& operator++(); // Advance (++iter)
const_iterator operator++(int) { // (iter++)
const_iterator result(*this);
++*this;
return result;
}
const_iterator& operator--(); // Retreat (--iter)
const_iterator operator--(int) { // (iter--)
const_iterator result(*this);
--*this;
return result;
}
// We love relational operators.
// All of them compare the underlying byte pointers; everything except ==
// is expressed in terms of operator<.
friend bool operator==(const CI& lhs, const CI& rhs) {
return lhs.it_ == rhs.it_; }
friend bool operator!=(const CI& lhs, const CI& rhs) {
return !(lhs == rhs); }
friend bool operator<(const CI& lhs, const CI& rhs);
friend bool operator>(const CI& lhs, const CI& rhs) {
return rhs < lhs; }
friend bool operator<=(const CI& lhs, const CI& rhs) {
return !(rhs < lhs); }
friend bool operator>=(const CI& lhs, const CI& rhs) {
return !(lhs < rhs); }
// Number of codepoints between first and last.
friend difference_type distance(const CI& first, const CI& last);
// UTF-8-specific methods
// Store the UTF-8 encoding of the current codepoint into buf,
// which must be at least 4 bytes long. Return the number of
// bytes written.
int get_utf8(char* buf) const;
// Return the UTF-8 character that the iterator points to.
string get_utf8_string() const;
// Return the byte length of the UTF-8 character the iterator points to.
int utf8_length() const;
// Return the iterator's pointer into the UTF-8 data.
const char* utf8_data() const { return it_; }
string DebugString() const;
private:
friend class UnicodeText;
friend class UnicodeTextUtils;
friend class UTF8StateTableProperty;
// Only UnicodeText and the friends above may build an iterator from a raw
// byte pointer.
explicit const_iterator(const char* it) : it_(it) {}
const char* it_;
};
const_iterator begin() const;
const_iterator end() const;
// Reverse iterator. base() refers to the position one past the current
// codepoint, so each UTF-8 accessor below steps a copy of base() back by
// one codepoint before delegating to const_iterator.
class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
public:
explicit const_reverse_iterator(const_iterator it) :
std::reverse_iterator<const_iterator>(it) {}
const char* utf8_data() const {
const_iterator tmp_it = base();
return (--tmp_it).utf8_data();
}
int get_utf8(char* buf) const {
const_iterator tmp_it = base();
return (--tmp_it).get_utf8(buf);
}
string get_utf8_string() const {
const_iterator tmp_it = base();
return (--tmp_it).get_utf8_string();
}
int utf8_length() const {
const_iterator tmp_it = base();
return (--tmp_it).utf8_length();
}
};
const_reverse_iterator rbegin() const {
return const_reverse_iterator(end());
}
const_reverse_iterator rend() const {
return const_reverse_iterator(begin());
}
// Substring searching. Returns the beginning of the first
// occurrence of "look", or end() if not found.
const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
// Equivalent to find(look, begin())
const_iterator find(const UnicodeText& look) const;
// Returns whether this contains the character U+FFFD. This can
// occur, for example, if the input to Encodings::Decode() had byte
// sequences that were invalid in the source encoding.
bool HasReplacementChar() const;
// UTF-8-specific methods
//
// Return the data, length, and capacity of UTF-8-encoded version of
// the text. Length and capacity are measured in bytes.
const char* utf8_data() const { return repr_.data_; }
int utf8_length() const { return repr_.size_; }
int utf8_capacity() const { return repr_.capacity_; }
// Return the UTF-8 data as a string.
static string UTF8Substring(const const_iterator& first,
const const_iterator& last);
// There are three methods for initializing a UnicodeText from UTF-8
// data. They vary in details of memory management. In all cases,
// the data is tested for interchange-validity. If it is not
// interchange-valid, a LOG(WARNING) is issued, and each
// structurally invalid byte and each interchange-invalid codepoint
// is replaced with a space.
// x.CopyUTF8(buf, len) copies buf into x.
UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
// x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
// buf. buf is not copied.
// NOTE(review): Repr releases owned buffers with delete[], so buf
// presumably has to come from new char[] -- confirm with callers.
UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
int byte_length,
int byte_capacity);
// x.PointToUTF8(buf,len) changes x so that it points to buf
// ("becomes an alias"). It does not take ownership or copy buf.
// If the buffer is not valid, this has the same effect as
// CopyUTF8(utf8_buffer, byte_length).
UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
// Occasionally it is necessary to use functions that operate on the
// pointer returned by utf8_data(). MakeIterator(p) provides a way
// to get back to the UnicodeText level. It uses CHECK to ensure
// that p is a pointer within this object's UTF-8 data, and that it
// points to the beginning of a character.
const_iterator MakeIterator(const char* p) const;
string DebugString() const;
private:
friend class const_iterator;
friend class UnicodeTextUtils;
class Repr { // A byte-string.
public:
char* data_; // Start of the UTF-8 bytes; null when there is no buffer.
int size_; // Number of bytes in use.
int capacity_; // Number of bytes available at data_.
bool ours_; // Do we own data_?
Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
// Only an owned buffer is freed; an aliased buffer is left alone.
~Repr() { if (ours_) delete[] data_; }
void clear();
void reserve(int capacity);
void resize(int size);
void append(const char* bytes, int byte_length);
void Copy(const char* data, int size);
void TakeOwnershipOf(char* data, int size, int capacity);
void PointTo(const char* data, int size);
string DebugString() const;
private:
// Declared private to make Repr non-copyable.
Repr& operator=(const Repr&);
Repr(const Repr& other);
};
Repr repr_;
// UTF-8-specific private methods.
// These routines do not perform a validity check when compiled
// in opt mode.
// It is an error to call these methods with UTF-8 data that
// is not interchange-valid.
//
UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
UnicodeText& UnsafeTakeOwnershipOfUTF8(
char* utf8_buffer, int byte_length, int byte_capacity);
UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
const_iterator UnsafeFind(const UnicodeText& look,
const_iterator start_pos) const;
};
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
// Inequality is defined as the negation of operator==.
inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
return !(lhs == rhs);
}
// UnicodeTextRange is a pair of iterators, useful for specifying text
// segments. If the iterators are ==, the segment is empty.
// NOTE(review): 'pair' is used unqualified; presumably a using-declaration
// for std::pair arrives via "syntaxnet/base.h" -- confirm.
typedef pair<UnicodeText::const_iterator,
UnicodeText::const_iterator> UnicodeTextRange;
// Returns true if the range's two iterators are equal, i.e. the range
// covers no text.
inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
return r.first == r.second;
}
// *************************** Utilities *************************
// A factory function for creating a UnicodeText from a buffer of
// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
// is an "owner.")
//
// Each byte that is structurally invalid will be replaced with a
// space. Each codepoint that is interchange-invalid will also be
// replaced with a space, even if the codepoint was represented with a
// multibyte sequence in the UTF-8 data.
//
// NOTE(review): TakeOwnershipOfUTF8 returns a reference to the temporary,
// so the by-value return goes through the copy constructor. The caller
// appears to receive an owner holding a copy, while the temporary frees
// utf8_buffer when it is destroyed -- confirm this is intended.
inline UnicodeText MakeUnicodeTextAcceptingOwnership(
char* utf8_buffer, int byte_length, int byte_capacity) {
return UnicodeText().TakeOwnershipOfUTF8(
utf8_buffer, byte_length, byte_capacity);
}
// A factory function for creating a UnicodeText from a buffer of
// UTF-8 data. The new UnicodeText does not take ownership of the
// buffer. (It is an "alias.")
//
// NOTE(review): PointToUTF8 returns a reference to the temporary, so the
// by-value return goes through the copy constructor, which always copies
// the bytes. The returned object therefore looks like an owner holding a
// copy rather than an alias -- confirm against the intended contract.
inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
const char* utf8_buffer, int byte_length) {
return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
}
// Create a UnicodeText from a UTF-8 string or buffer.
//
// If do_copy is true, then a copy of the string is made. The copy is
// owned by the resulting UnicodeText object and will be freed when
// the object is destroyed. This UnicodeText object is referred to
// as an "owner."
//
// If do_copy is false, then no copy is made. The resulting
// UnicodeText object does NOT take ownership of the string; in this
// case, the lifetime of the UnicodeText object must not exceed the
// lifetime of the string. This Unicodetext object is referred to as
// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
//
// If the input string does not contain valid UTF-8, then a copy is
// made (as if do_copy were true) and coerced to valid UTF-8 by
// replacing each invalid byte with a space.
//
// Builds a UnicodeText from len bytes at utf8_buf: CopyUTF8 when do_copy is
// true, PointToUTF8 otherwise. Both validate the input and repair a private
// copy if it is not interchange-valid.
// NOTE(review): UnicodeText declares no move constructor, so if the return
// of the named local is not elided the copy constructor runs and the
// caller gets an owner even for do_copy == false -- confirm whether callers
// depend on receiving an alias.
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
bool do_copy) {
UnicodeText t;
if (do_copy) {
t.CopyUTF8(utf8_buf, len);
} else {
t.PointToUTF8(utf8_buf, len);
}
return t;
}
// Overload taking the UTF-8 bytes from a string; forwards its data() and
// size() to the buffer overload. The size is passed as an int.
inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
}
// Convenience overload: same as the three-argument form with do_copy set
// to true.
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
return UTF8ToUnicodeText(utf8_buf, len, true);
}
// Convenience overload: same as the (string, bool) form with do_copy set
// to true.
inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
return UTF8ToUnicodeText(utf8_string, true);
}
// Return a string containing the UTF-8 encoded version of all the
// Unicode characters in t. The string is built from t.utf8_data() and
// t.utf8_length(), so it is a copy of t's bytes.
inline string UnicodeTextToUTF8(const UnicodeText& t) {
return string(t.utf8_data(), t.utf8_length());
}
// arraysize(a) yields the number of elements in the array a as a
// compile-time constant. ArraySizeHelper is declared but never defined: it
// is only named inside sizeof, where its return type -- a reference to
// char[Count] -- has size Count. Because the parameter is a reference to an
// array, passing a pointer fails to compile.
template <typename Element, size_t Count>
auto ArraySizeHelper(Element (&array)[Count]) -> char (&)[Count];
#define arraysize(array) (sizeof(ArraySizeHelper(array)))
// For debugging. Return a string of integers, written in uppercase
// hex (%X), corresponding to the codepoints within the text. Each
// integer is followed by a space. E.g., "61 62 6A 3005 ".
string CodepointString(const UnicodeText& t);
#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_

View File

@ -0,0 +1,58 @@
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
#include "util/utf8/unilib.h"
#include "syntaxnet/base.h"
#include "third_party/utf/utf.h"
namespace UniLib {
// Returns false for codepoints that are not allowed for interchange:
//   C0 (ASCII) controls: U+0000 to U+001F, except Horizontal Tab (HT, U+0009),
//     Line-Feed (LF, U+000A), Form Feed (FF, U+000C) and
//     Carriage-Return (CR, U+000D)
//   DEL and the C1 controls: U+007F to U+009F
//   Surrogates: U+D800 to U+DFFF
//   Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
// Every other value yields true; no upper bound (U+10FFFF) is checked here.
bool IsInterchangeValid(char32 c) {
  // C0 controls other than HT, LF, FF and CR.
  if ((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F)) {
    return false;
  }
  // DEL and C1 controls.
  if (c >= 0x7F && c <= 0x9F) {
    return false;
  }
  // Surrogates.
  if (c >= 0xD800 && c <= 0xDFFF) {
    return false;
  }
  // Non-character block U+FDD0..U+FDEF.
  if (c >= 0xFDD0 && c <= 0xFDEF) {
    return false;
  }
  // Low 16 bits are 0xFFFE or 0xFFFF: the last two codepoints of a plane.
  if ((c & 0xFFFE) == 0xFFFE) {
    return false;
  }
  return true;
}
int SpanInterchangeValid(const char* begin, int byte_length) {
char32 rune;
const char* p = begin;
const char* end = begin + byte_length;
while (p < end) {
int bytes_consumed = charntorune(&rune, p, end - p);
// We want to accept Runeerror == U+FFFD as a valid char, but it is used
// by chartorune to indicate error. Luckily, the real codepoint is size 3
// while errors return bytes_consumed <= 1.
if ((rune == Runeerror && bytes_consumed <= 1) ||
!IsInterchangeValid(rune)) {
break; // Found
}
p += bytes_consumed;
}
return p - begin;
}
} // namespace UniLib

View File

@ -0,0 +1,63 @@
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Routines to do manipulation of Unicode characters or text
//
// The StructurallyValid routines accept buffers of arbitrary bytes.
// For CoerceToStructurallyValid(), the input buffer and output buffers may
// point to exactly the same memory.
//
// In all other cases, the UTF-8 string must be structurally valid and
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
// Debug builds take a fatal error for invalid UTF-8 input.
// The input and output buffers may not overlap at all.
//
// The char32 routines are here only for convenience; they convert to UTF-8
// internally and use the UTF-8 routines.
#ifndef UTIL_UTF8_UNILIB_H__
#define UTIL_UTF8_UNILIB_H__
#include <string>
#include "syntaxnet/base.h"
// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,
// but they are defined in unilib_utf8_utils.h.
//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
namespace UniLib {
// Returns the length in bytes of the longest prefix of the byte_length
// bytes at src that is all interchange valid UTF-8.
int SpanInterchangeValid(const char* src, int byte_length);
// std::string convenience form: scans the whole of src.
inline int SpanInterchangeValid(const std::string& src) {
  const char* const bytes = src.data();
  return SpanInterchangeValid(bytes, static_cast<int>(src.size()));
}
// Return true if the codepoint (this overload) or all of the UTF-8 source
// (the buffer and string overloads that follow) is interchange valid.
// "Interchange valid" is stronger than structurally valid --
// no C0 or C1 control codes (other than CR LF HT FF), no surrogates and
// no non-characters.
bool IsInterchangeValid(char32 codepoint);
// True when the interchange-valid prefix of the buffer spans all
// byte_length bytes.
inline bool IsInterchangeValid(const char* src, int byte_length) {
  const int valid_prefix = SpanInterchangeValid(src, byte_length);
  return valid_prefix == byte_length;
}
// std::string convenience form: checks the whole of src.
inline bool IsInterchangeValid(const std::string& src) {
  const char* const bytes = src.data();
  return IsInterchangeValid(bytes, static_cast<int>(src.size()));
}
} // namespace UniLib
#endif // UTIL_UTF8_UNILIB_H__

View File

@ -0,0 +1,66 @@
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
// These definitions are self-contained and have no dependencies.
// They are also exported from unilib.h for legacy reasons.
#include "syntaxnet/base.h"
#include "third_party/utf/utf.h"
namespace UniLib {
// True when 'c' lies in [0, 0xD800) or [0xE000, 0x10FFFF], i.e. it is in
// the Unicode range and is not a surrogate codepoint. See also
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
inline bool IsValidCodepoint(char32 c) {
  // The unsigned comparison covers the range below the surrogates in one test.
  if (static_cast<uint32>(c) < 0xD800) {
    return true;
  }
  // Otherwise only the range above the surrogates remains acceptable.
  return !(c < 0xE000 || c > 0x10FFFF);
}
// Returns true if 'str' begins with a UTF-8 sequence that
// isvalidcharntorune() accepts and whose codepoint passes IsValidCodepoint()
// (so it is not a surrogate). Returns false if str.empty(). Per the decoder's
// contract it should also return false if
// str.length() < UniLib::OneCharLen(str[0]).
// Only the first sequence is examined; bytes after it are ignored.
inline bool IsUTF8ValidCodepoint(StringPiece str) {
  if (str.empty()) {
    return false;
  }
  char32 decoded;
  int bytes_used;  // Filled in by the decoder; deliberately not checked, so
                   // str may be longer than the first sequence.
  if (!isvalidcharntorune(str.data(), str.size(), &decoded, &bytes_used)) {
    return false;
  }
  return IsValidCodepoint(decoded);
}
// Length in bytes of the UTF-8 sequence that starts at src, decided from
// the first byte alone. src must point into well-formed UTF-8; for any
// other input the result is not meaningful.
inline int OneCharLen(const char* src) {
  const unsigned int lead = static_cast<unsigned char>(*src);
  if (lead < 0xC0) {
    return 1;  // ASCII (trail bytes also land here)
  }
  if (lead < 0xE0) {
    return 2;  // 110x xxxx
  }
  if (lead < 0xF0) {
    return 3;  // 1110 xxxx
  }
  return 4;  // 1111 xxxx
}
// True when x is a UTF-8 continuation byte (bit pattern 10xx xxxx).
inline bool IsTrailByte(char x) {
  // Equivalent to (x & 0xC0) == 0x80. Read as a signed 8-bit value, the
  // trail bytes 0x80..0xBF are the values -128..-65, so one comparison
  // against -0x40 is enough.
  const signed char as_signed = static_cast<signed char>(x);
  return as_signed < -0x40;
}
} // namespace UniLib
#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_