Merge pull request #2523 from stweil/unilib

unittest: Add unilib.h and other code from Tensorflow and use it for more tests
2024-12-12 07:29:07 +08:00 · 2019-06-28 12:27:15 +02:00 · 2019-06-28 12:27:15 +02:00 · 653faa3a64
commit 653faa3a64
parent 7093727da1 655ba7af10
14 changed files with 1947 additions and 66 deletions
--- a/unittest/Makefile.am
+++ b/unittest/Makefile.am
@ -132,7 +132,6 @@ check_PROGRAMS += matrix_test
 check_PROGRAMS += nthitem_test
 check_PROGRAMS += osd_test
 # check_PROGRAMS += pagesegmode_test
-# check_PROGRAMS += pango_font_info_test
 check_PROGRAMS += paragraphs_test
 check_PROGRAMS += params_model_test
 check_PROGRAMS += progress_test
@ -159,6 +158,7 @@ check_PROGRAMS += lstm_squashed_test
 check_PROGRAMS += lstm_test
 check_PROGRAMS += lstmtrainer_test
 check_PROGRAMS += normstrngs_test
+check_PROGRAMS += pango_font_info_test
 check_PROGRAMS += unichar_test
 check_PROGRAMS += unicharcompress_test
 check_PROGRAMS += unicharset_test
@ -273,13 +273,22 @@ matrix_test_SOURCES = matrix_test.cc
 matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

 normstrngs_test_SOURCES = normstrngs_test.cc
+normstrngs_test_SOURCES += third_party/utf/rune.c util/utf8/unilib.cc
 normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)

 nthitem_test_SOURCES = nthitem_test.cc
 nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

-#pango_font_info_test_SOURCES = pango_font_info_test.cc
-#pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
+osd_test_SOURCES = osd_test.cc
+osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
+
+pango_font_info_test_SOURCES = pango_font_info_test.cc
+pango_font_info_test_SOURCES += third_party/utf/rune.c
+pango_font_info_test_SOURCES += util/utf8/unicodetext.cc util/utf8/unilib.cc
+pango_font_info_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(LEPTONICA_LIBS)
+pango_font_info_test_LDADD += $(ICU_I18N_LIBS) -lfontconfig
+pango_font_info_test_LDADD += -lpangocairo-1.0 -lpangoft2-1.0
+pango_font_info_test_LDADD += $(cairo_LIBS) $(pango_LIBS)

 paragraphs_test_SOURCES = paragraphs_test.cc
 paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
@ -287,9 +296,6 @@ paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
 params_model_test_SOURCES = params_model_test.cc
 params_model_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

-osd_test_SOURCES = osd_test.cc
-osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
-
 progress_test_SOURCES = progress_test.cc
 progress_test_LDFLAGS = $(OPENCL_LDFLAGS) $(LEPTONICA_LIBS)
 progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
--- a/unittest/baseapi_test.cc
+++ b/unittest/baseapi_test.cc
@ -319,7 +319,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
  const char* langs[] = {"eng", "chi_tra", "jpn", "vie"};
  std::unique_ptr<tesseract::TessBaseAPI> api;
  CycleTimer timer;
-  for (int i = 0; i < ARRAYSIZE(langs); ++i) {
+  for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
    api.reset(new tesseract::TessBaseAPI);
    timer.Restart();
    EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
@ -333,7 +333,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) {
  vars_vec.push_back(STRING("tessedit_init_config_only"));
  vars_values.push_back(STRING("1"));
  LOG(INFO) << "Switching to config only initialization:";
-  for (int i = 0; i < ARRAYSIZE(langs); ++i) {
+  for (size_t i = 0; i < ARRAYSIZE(langs); ++i) {
    api.reset(new tesseract::TessBaseAPI);
    timer.Restart();
    EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i],
--- a/unittest/heap_test.cc
+++ b/unittest/heap_test.cc
@ -34,7 +34,7 @@ class HeapTest : public testing::Test {
  virtual ~HeapTest();
  // Pushes the test data onto both the heap and the KDVector.
  void PushTestData(GenericHeap<IntKDPair>* heap, KDVector* v) {
-    for (int i = 0; i < ARRAYSIZE(test_data); ++i) {
+    for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
      IntKDPair pair(test_data[i], i);
      heap->Push(&pair);
      v->push_back(pair);
@ -137,7 +137,7 @@ TEST_F(HeapTest, RevalueTest) {
  GenericHeap<PtrPair> heap;
  GenericVector<PtrPair> v;
  // Push the test data onto both the heap and the vector.
-  for (int i = 0; i < ARRAYSIZE(test_data); ++i) {
+  for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) {
    PtrPair h_pair;
    h_pair.key = test_data[i];
    PtrPair v_pair;
--- a/unittest/normstrngs_test.cc
+++ b/unittest/normstrngs_test.cc
@ -15,16 +15,14 @@
 #include "normstrngs_test.h"
 #include "strngs.h"
 #include "unichar.h"
-#if defined(HAS_UNILIB_H)
-#include "unilib.h"
-#endif
+#include "util/utf8/unilib.h"

 #include "include_gunit.h"

 namespace tesseract {
 namespace {

-#if defined(HAS_UNILIB_H)
+#if defined(MISSING_CODE)
 static std::string EncodeAsUTF8(const char32 ch32) {
  UNICHAR uni_ch(ch32);
  return std::string(uni_ch.utf8(), uni_ch.utf8_len());
@ -363,7 +361,6 @@ TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
  EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText));
 }

-#if defined(HAS_UNILIB_H)
 // Test that the method clones the util/utf8/public/unilib definition of
 // interchange validity.
 TEST(NormstrngsTest, IsInterchangeValid) {
@ -374,12 +371,11 @@ TEST(NormstrngsTest, IsInterchangeValid) {
    EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch));
  }
 }
-#endif

-#if defined(HAS_UNILIB_H)
 // Test that the method clones the util/utf8/public/unilib definition of
 // 7-bit ASCII interchange validity.
 TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
+#if defined(MISSING_CODE)
  const int32_t kMinUnicodeValue = 33;
  const int32_t kMaxUnicodeValue = 0x10FFFF;
  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
@ -388,8 +384,11 @@ TEST(NormstrngsTest, IsInterchangeValid7BitAscii) {
    EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str),
              IsInterchangeValid7BitAscii(ch));
  }
-}
+#else
+  // Skipped because of missing UniLib::IsInterchangeValid7BitAscii.
+  GTEST_SKIP();
 #endif
+}

 // Test that the method clones the util/utf8/public/unilib definition of
 // fullwidth-halfwidth .
@ -401,7 +400,8 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) {
  // U+FFE6 -> U+20A9 (won sign)
  EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));

-#if defined(HAS_UNILIB_H)
+#if defined(MISSING_CODE)
+  // Skipped because of missing UniLib::FullwidthToHalfwidth.
  const int32_t kMinUnicodeValue = 33;
  const int32_t kMaxUnicodeValue = 0x10FFFF;
  for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
--- a/unittest/pango_font_info_test.cc
+++ b/unittest/pango_font_info_test.cc
@ -1,12 +1,24 @@
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.

-#include "tesseract/training/pango_font_info.h"
-
-#include <stdio.h>
-#include <string.h>
-
-#include "pango/pango.h"
-#include "tesseract/training/commandlineflags.h"
-#include "tesseract/training/fileio.h"
+#include <cstdio>
+#include <string>
+#include <pango/pango.h>
+#include "include_gunit.h"
+#include "commandlineflags.h"
+#include "fileio.h"
+#include "pango_font_info.h"
+#include "absl/strings/str_cat.h"       // for absl::StrCat
+#include "gmock/gmock-matchers.h"       // for EXPECT_THAT
+#include "util/utf8/unicodetext.h"      // for UnicodeText

 DECLARE_STRING_PARAM_FLAG(fonts_dir);
 DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
@ -19,19 +31,19 @@ using tesseract::FontUtils;
 using tesseract::PangoFontInfo;

 // Fonts in testdata directory
-const char* kExpectedFontNames[] = {"Arab",
-                                    "Arial Bold Italic",
-                                    "DejaVu Sans Ultra-Light",
-                                    "Lohit Hindi",
+const char* kExpectedFontNames[] = {
+  "Arab",
+  "Arial Bold Italic",
+  "DejaVu Sans Ultra-Light",
+  "Lohit Hindi",
 #if PANGO_VERSION <= 12005
-                                    "Times New Roman",
+  "Times New Roman",
 #else
-                                    "Times New Roman,",  // Pango v1.36.2
-                                                         // requires a trailing
-                                                         // ','
+  "Times New Roman,",  // Pango v1.36.2 requires a trailing ','
 #endif
-                                    "UnBatang",
-                                    "Verdana"};
+  "UnBatang",
+  "Verdana"
+};

 // Sample text used in tests.
 const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
@ -41,23 +53,27 @@ const char kKorText[] = "이는 것으로";
 // Hindi words containing illegal vowel sequences.
 const char* kBadlyFormedHinWords[] = {
 #if PANGO_VERSION <= 12005
-    "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
+  "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
 #endif
-    // Pango v1.36.2 will render the above words even though they are invalid.
-    "प्रंात", nullptr};
+  // Pango v1.36.2 will render the above words even though they are invalid.
+  "प्रंात", nullptr
+};

 class PangoFontInfoTest : public ::testing::Test {
 protected:
  void SetUp() override {
-    std::locale::global(std::locale(""));
+    static std::locale system_locale("");
+    std::locale::global(system_locale);
  }

  // Creates a fake fonts.conf file that points to the testdata fonts for
  // fontconfig to initialize with.
  static void SetUpTestCase() {
-    FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
+    FLAGS_fonts_dir = TESTING_DIR;
    FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
+#ifdef GOOGLE_TESSERACT
    FLAGS_use_only_legacy_fonts = false;
+#endif
  }

  PangoFontInfo font_info_;
@ -120,7 +136,7 @@ TEST_F(PangoFontInfoTest, CanRenderLigature) {
  font_info_.ParseFontDescriptionName("Arab 12");
  const char kArabicLigature[] = "لا";
  EXPECT_TRUE(
-      font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
+    font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));

  printf("Next word\n");
  EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
@ -143,17 +159,17 @@ TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
 TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
  font_info_.ParseFontDescriptionName("Verdana 12");
  // Verdana cannot render the "ff" ligature
-  string word = "oﬀice";
+  std::string word = "oﬀice";
  EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
  EXPECT_EQ("oice", word);

  // Don't drop non-letter characters like word joiners.
  const char* kJoiners[] = {
-      "\u2060",  // U+2060 (WJ)
-      "\u200C",  // U+200C (ZWJ)
-      "\u200D"   // U+200D (ZWNJ)
+    "\u2060",  // U+2060 (WJ)
+    "\u200C",  // U+200C (ZWNJ)
+    "\u200D"   // U+200D (ZWJ)
  };
-  for (int i = 0; i < ARRAYSIZE(kJoiners); ++i) {
+  for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) {
    word = kJoiners[i];
    EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
    EXPECT_STREQ(kJoiners[i], word.c_str());
@ -167,17 +183,21 @@ class FontUtilsTest : public ::testing::Test {
  // Creates a fake fonts.conf file that points to the testdata fonts for
  // fontconfig to initialize with.
  static void SetUpTestCase() {
-    FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata");
+    FLAGS_fonts_dir = TESTING_DIR;
    FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
  }

  void CountUnicodeChars(const char* utf8_text,
-                         std::unordered_map<char32, inT64>* ch_map) {
+                         std::unordered_map<char32, int64_t>* ch_map) {
    ch_map->clear();
    UnicodeText ut;
    ut.PointToUTF8(utf8_text, strlen(utf8_text));
    for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
+#if 0
      if (UnicodeProps::IsWhitespace(*it)) continue;
+#else
+      if (std::isspace(*it)) continue;
+#endif
      ++(*ch_map)[*it];
    }
  }
@ -206,21 +226,21 @@ TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
 }

 TEST_F(FontUtilsTest, DoesListAvailableFonts) {
-  const std::vector<string>& fonts = FontUtils::ListAvailableFonts();
+  const std::vector<std::string>& fonts = FontUtils::ListAvailableFonts();
  EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
-  for (int i = 0; i < fonts.size(); ++i) {
+  for (auto& font : fonts) {
    PangoFontInfo font_info;
-    EXPECT_TRUE(font_info.ParseFontDescriptionName(fonts[i]));
+    EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
  }
 }

 TEST_F(FontUtilsTest, DoesFindBestFonts) {
-  string fonts_list;
-  std::unordered_map<char32, inT64> ch_map;
+  std::string fonts_list;
+  std::unordered_map<char32, int64_t> ch_map;
  CountUnicodeChars(kEngText, &ch_map);
  EXPECT_EQ(26, ch_map.size());  // 26 letters
  std::vector<std::pair<const char*, std::vector<bool> > > font_flags;
-  string best_list = FontUtils::BestFonts(ch_map, &font_flags);
+  std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
  EXPECT_TRUE(best_list.size());
  // All fonts except Lohit Hindi should render English text.
  EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size());
@ -238,8 +258,8 @@ TEST_F(FontUtilsTest, DoesSelectFont) {
  const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
  for (int i = 0; kLangText[i] != nullptr; ++i) {
    SCOPED_TRACE(kLangNames[i]);
-    std::vector<string> graphemes;
-    string selected_font;
+    std::vector<std::string> graphemes;
+    std::string selected_font;
    EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]),
                                      &selected_font, &graphemes));
    EXPECT_TRUE(selected_font.size());
@ -249,17 +269,17 @@ TEST_F(FontUtilsTest, DoesSelectFont) {

 TEST_F(FontUtilsTest, DoesFailToSelectFont) {
  const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
-  std::vector<string> graphemes;
-  string selected_font;
+  std::vector<std::string> graphemes;
+  std::string selected_font;
  EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText),
                                     &selected_font, &graphemes));
 }

 TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
-  const int32 kHindiChar = 0x0905;
-  const int32 kArabicChar = 0x0623;
-  const int32 kMongolianChar = 0x180E;  // Mongolian vowel separator
-  const int32 kOghamChar = 0x1680;      // Ogham space mark
+  const int32_t kHindiChar = 0x0905;
+  const int32_t kArabicChar = 0x0623;
+  const int32_t kMongolianChar = 0x180E;  // Mongolian vowel separator
+  const int32_t kOghamChar = 0x1680;      // Ogham space mark
  std::vector<bool> unicode_mask;
  FontUtils::GetAllRenderableCharacters(&unicode_mask);
  EXPECT_TRUE(unicode_mask['A']);
@ -267,10 +287,12 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
  EXPECT_TRUE(unicode_mask[kHindiChar]);
  EXPECT_TRUE(unicode_mask[kArabicChar]);
  EXPECT_FALSE(unicode_mask[kMongolianChar]);  // no font for mongolian.
+#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
  EXPECT_FALSE(unicode_mask[kOghamChar]);      // no font for ogham.
+#endif
  unicode_mask.clear();

-  std::vector<string> selected_fonts;
+  std::vector<std::string> selected_fonts;
  selected_fonts.push_back("Lohit Hindi");
  FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
  EXPECT_TRUE(unicode_mask['1']);
@ -279,14 +301,18 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
  EXPECT_FALSE(unicode_mask[kArabicChar]);     // or Arabic,
  EXPECT_FALSE(unicode_mask[kMongolianChar]);  // or Mongolian,
  EXPECT_FALSE(unicode_mask[kOghamChar]);      // or Ogham.
+  unicode_mask.clear();

  // Check that none of the included fonts cover the Mongolian or Ogham space
  // characters.
-  for (int f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
+  for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) {
    SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f]));
    FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask);
+#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
    EXPECT_FALSE(unicode_mask[kOghamChar]);
+#endif
    EXPECT_FALSE(unicode_mask[kMongolianChar]);
+    unicode_mask.clear();
  }
 }
 }  // namespace
--- a/unittest/syntaxnet/base.h
+++ b/unittest/syntaxnet/base.h
@ -0,0 +1,61 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef SYNTAXNET_BASE_H_
+#define SYNTAXNET_BASE_H_
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "google/protobuf/util/message_differencer.h"
+
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/default/integral_types.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+
+
+using tensorflow::int8;
+using tensorflow::int16;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::uint8;
+using tensorflow::uint16;
+using tensorflow::uint64;
+using tensorflow::uint32;
+using tensorflow::protobuf::TextFormat;
+using tensorflow::mutex_lock;
+using tensorflow::mutex;
+using std::map;
+using std::pair;
+using std::vector;
+using std::unordered_map;
+using std::unordered_set;
+typedef signed int char32;
+
+using tensorflow::StringPiece;
+using std::string;
+
+
+// Note: no syntaxnet namespace is opened here; the names above are global.
+
+#endif  // SYNTAXNET_BASE_H_
--- a/unittest/third_party/utf/rune.c
+++ b/unittest/third_party/utf/rune.c
@ -0,0 +1,357 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+#include <stdarg.h>
+#include <string.h>
+#include "third_party/utf/utf.h"
+#include "third_party/utf/utfdef.h"
+
+enum
+{
+	Bit1	= 7,
+	Bitx	= 6,
+	Bit2	= 5,
+	Bit3	= 4,
+	Bit4	= 3,
+	Bit5	= 2, 
+
+	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
+	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
+	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
+	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
+	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
+
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,
+                                        /* 0001 1111 1111 1111 1111 1111 */
+
+	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
+	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
+
+	Bad	= Runeerror,
+};
+
+/*
+ * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
+ * This is a slower but "safe" version of the old chartorune 
+ * that works on strings that are not necessarily null-terminated.
+ * 
+ * If you know for sure that your string is null-terminated,
+ * chartorune will be a bit faster.
+ *
+ * It is guaranteed not to read more than "length" bytes
+ * starting at the incoming pointer.  This is to avoid
+ * possible access violations.  If the string appears to be
+ * well-formed but incomplete (i.e., to get the whole Rune
+ * we'd need to read past str+length) then we'll set the Rune
+ * to Bad and return 0.
+ *
+ * Note that if we have decoding problems for other
+ * reasons, we return 1 instead of 0.
+ */
+int
+charntorune(Rune *rune, const char *str, int length)
+{
+	int c, c1, c2, c3;
+	long l;
+
+	/* When we're not allowed to read anything */
+	if(length <= 0) {
+		goto badlen;
+	}
+
+	/*
+	 * one character sequence (7-bit value)
+	 *	00000-0007F => T1
+	 */
+	c = *(uchar*)str;
+	if(c < Tx) {
+		*rune = c;
+		return 1;
+	}
+
+	// If we can't read more than one character we must stop
+	if(length <= 1) {
+		goto badlen;
+	}
+
+	/*
+	 * two character sequence (11-bit value)
+	 *	0080-07FF => T2 Tx
+	 */
+	c1 = *(uchar*)(str+1) ^ Tx;
+	if(c1 & Testx)
+		goto bad;
+	if(c < T3) {
+		if(c < T2)
+			goto bad;
+		l = ((c << Bitx) | c1) & Rune2;
+		if(l <= Rune1)
+			goto bad;
+		*rune = l;
+		return 2;
+	}
+
+	// If we can't read more than two characters we must stop
+	if(length <= 2) {
+		goto badlen;
+	}
+
+	/*
+	 * three character sequence (16-bit value)
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	c2 = *(uchar*)(str+2) ^ Tx;
+	if(c2 & Testx)
+		goto bad;
+	if(c < T4) {
+		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+		if(l <= Rune2)
+			goto bad;
+		*rune = l;
+		return 3;
+	}
+
+	if (length <= 3)
+		goto badlen;
+
+	/*
+	 * four character sequence (21-bit value)
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(uchar*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3)
+			goto bad;
+		if (l > Runemax)
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	// Support for 5-byte or longer UTF-8 would go here, but
+	// since we don't have that, we'll just fall through to bad.
+
+	/*
+	 * bad decoding
+	 */
+bad:
+	*rune = Bad;
+	return 1;
+badlen:
+	*rune = Bad;
+	return 0;
+
+}
+
+
+/*
+ * This is the older "unsafe" version, which works fine on 
+ * null-terminated strings.
+ */
+int
+chartorune(Rune *rune, const char *str)
+{
+	int c, c1, c2, c3;
+	long l;
+
+	/*
+	 * one character sequence
+	 *	00000-0007F => T1
+	 */
+	c = *(uchar*)str;
+	if(c < Tx) {
+		*rune = c;
+		return 1;
+	}
+
+	/*
+	 * two character sequence
+	 *	0080-07FF => T2 Tx
+	 */
+	c1 = *(uchar*)(str+1) ^ Tx;
+	if(c1 & Testx)
+		goto bad;
+	if(c < T3) {
+		if(c < T2)
+			goto bad;
+		l = ((c << Bitx) | c1) & Rune2;
+		if(l <= Rune1)
+			goto bad;
+		*rune = l;
+		return 2;
+	}
+
+	/*
+	 * three character sequence
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	c2 = *(uchar*)(str+2) ^ Tx;
+	if(c2 & Testx)
+		goto bad;
+	if(c < T4) {
+		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
+		if(l <= Rune2)
+			goto bad;
+		*rune = l;
+		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *	10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	c3 = *(uchar*)(str+3) ^ Tx;
+	if (c3 & Testx)
+		goto bad;
+	if (c < T5) {
+		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+		if (l <= Rune3)
+			goto bad;
+		if (l > Runemax)
+			goto bad;
+		*rune = l;
+		return 4;
+	}
+
+	/*
+	 * Support for 5-byte or longer UTF-8 would go here, but
+	 * since we don't have that, we'll just fall through to bad.
+	 */
+
+	/*
+	 * bad decoding
+	 */
+bad:
+	*rune = Bad;
+	return 1;
+}
+
+int
+isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
+	*consumed = charntorune(rune, str, length);
+	return *rune != Runeerror || *consumed == 3;
+}
+    
+int
+runetochar(char *str, const Rune *rune)
+{
+	/* Runes are signed, so convert to unsigned for range check. */
+	unsigned long c;
+
+	/*
+	 * one character sequence
+	 *	00000-0007F => 00-7F
+	 */
+	c = *rune;
+	if(c <= Rune1) {
+		str[0] = c;
+		return 1;
+	}
+
+	/*
+	 * two character sequence
+	 *	0080-07FF => T2 Tx
+	 */
+	if(c <= Rune2) {
+		str[0] = T2 | (c >> 1*Bitx);
+		str[1] = Tx | (c & Maskx);
+		return 2;
+	}
+
+	/*
+	 * If the Rune is out of range, convert it to the error rune.
+	 * Do this test here because the error rune encodes to three bytes.
+	 * Doing it earlier would duplicate work, since an out of range
+	 * Rune wouldn't have fit in one or two bytes.
+	 */
+	if (c > Runemax)
+		c = Runeerror;
+
+	/*
+	 * three character sequence
+	 *	0800-FFFF => T3 Tx Tx
+	 */
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/*
+	 * four character sequence (21-bit value)
+	 *     10000-1FFFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 | (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx | (c & Maskx);
+	return 4;
+}
+
+int
+runelen(Rune rune)
+{
+	char str[10];
+
+	return runetochar(str, &rune);
+}
+
+int
+runenlen(const Rune *r, int nrune)
+{
+	int nb;
+	ulong c;	/* Rune is signed, so use unsigned for range check. */
+
+	nb = 0;
+	while(nrune--) {
+		c = *r++;
+		if (c <= Rune1)
+			nb++;
+		else if (c <= Rune2)
+			nb += 2;
+		else if (c <= Rune3)
+			nb += 3;
+		else if (c <= Runemax)
+			nb += 4;
+		else
+			nb += 3;	/* Runeerror = 0xFFFD, see runetochar */
+	}
+	return nb;
+}
+
+int
+fullrune(const char *str, int n)
+{
+	if (n > 0) {
+		int c = *(uchar*)str;
+		if (c < Tx)
+			return 1;
+		if (n > 1) {
+			if (c < T3)
+				return 1;
+			if (n > 2) {
+				if (c < T4 || n > 3)
+					return 1;
+			}
+		}
+	}
+	return 0;
+}
--- a/unittest/third_party/utf/utf.h
+++ b/unittest/third_party/utf/utf.h
@ -0,0 +1,246 @@
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+#ifndef _UTFH_
+#define _UTFH_ 1
+
+#include <stdint.h>
+
+typedef signed int Rune;	/* Code-point values in Unicode 4.0 are 21 bits wide.*/
+
+enum
+{
+  UTFmax	= 4,		/* maximum bytes per rune */
+  Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
+  Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
+  Runeerror	= 0xFFFD,	/* decoding error in UTF */
+  Runemax	= 0x10FFFF,	/* maximum rune value */
+};
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * rune routines
+ */
+
+/*
+ * These routines were written by Rob Pike and Ken Thompson
+ * and first appeared in Plan 9.
+ * SEE ALSO
+ * utf (7)
+ * tcs (1)
+*/
+
+// runetochar copies (encodes) one rune, pointed to by r, to at most
+// UTFmax bytes starting at s and returns the number of bytes generated.
+
+int runetochar(char* s, const Rune* r);
+
+
+// chartorune copies (decodes) at most UTFmax bytes starting at s to
+// one rune, pointed to by r, and returns the number of bytes consumed.
+// If the input is not exactly in UTF format, chartorune will set *r
+// to Runeerror and return 1.
+//
+// Note: There is no special case for a "null-terminated" string. A
+// string whose first byte has the value 0 is the UTF8 encoding of the
+// Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
+// anywhere else in a UTF sequence.
+
+int chartorune(Rune* r, const char* s);
+
+
+// charntorune is like chartorune, except that it will access at most
+// n bytes of s.  If the UTF sequence is incomplete within n bytes,
+// charntorune will set *r to Runeerror and return 0. If it is complete
+// but not in UTF format, it will set *r to Runeerror and return 1.
+// 
+// Added 2004-09-24 by Wei-Hwa Huang
+
+int charntorune(Rune* r, const char* s, int n);
+
+// isvalidcharntorune(str, n, r, consumed)
+// is a convenience function that calls "*consumed = charntorune(r, str, n)"
+// and returns an int (logically boolean) indicating whether the first
+// n bytes of str begin with a valid and complete UTF sequence.
+
+int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed);
+
+// runelen returns the number of bytes required to convert r into UTF.
+
+int runelen(Rune r);
+
+
+// runenlen returns the number of bytes required to convert the n
+// runes pointed to by r into UTF.
+
+int runenlen(const Rune* r, int n);
+
+
+// fullrune returns 1 if the string s of length n is long enough to be
+// decoded by chartorune, and 0 otherwise. This does not guarantee
+// that the string contains a legal UTF encoding. This routine is used
+// by programs that obtain input one byte at a time and need to know
+// when a full rune has arrived.
+
+int fullrune(const char* s, int n);
+
+// The following routines are analogous to the corresponding string
+// routines with "utf" substituted for "str", and "rune" substituted
+// for "chr".
+
+// utflen returns the number of runes that are represented by the UTF
+// string s. (cf. strlen)
+
+int utflen(const char* s);
+
+
+// utfnlen returns the number of complete runes that are represented
+// by the first n bytes of the UTF string s. If the last few bytes of
+// the string contain an incompletely coded rune, utfnlen will not
+// count them; in this way, it differs from utflen, which includes
+// every byte of the string. (cf. strnlen)
+
+int utfnlen(const char* s, long n);
+
+
+// utfrune returns a pointer to the first occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string.  The NUL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strchr)
+
+const char* utfrune(const char* s, Rune r);
+
+
+// utfrrune returns a pointer to the last occurrence of rune r in the
+// UTF string s, or 0 if r does not occur in the string.  The NUL
+// byte terminating a string is considered to be part of the string s.
+// (cf. strrchr)
+
+const char* utfrrune(const char* s, Rune r);
+
+
+// utfutf returns a pointer to the first occurrence of the UTF string
+// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
+// null string, utfutf returns s1. (cf. strstr)
+
+const char* utfutf(const char* s1, const char* s2);
+
+
+// utfecpy copies UTF sequences until a null sequence has been copied,
+// but writes no sequences beyond es1.  If any sequences are copied,
+// s1 is terminated by a null sequence, and a pointer to that sequence
+// is returned.  Otherwise, the original s1 is returned. (cf. strecpy)
+
+char* utfecpy(char *s1, char *es1, const char *s2);
+
+
+
+// These functions are rune-string analogues of the corresponding
+// functions in strcat (3).
+// 
+// These routines first appeared in Plan 9.
+// SEE ALSO
+// memmove (3)
+// rune (3)
+// strcat (2)
+//
+// BUGS: The outcome of overlapping moves varies among implementations.
+
+Rune* runestrcat(Rune* s1, const Rune* s2);
+Rune* runestrncat(Rune* s1, const Rune* s2, long n);
+
+const Rune* runestrchr(const Rune* s, Rune c);
+
+int runestrcmp(const Rune* s1, const Rune* s2);
+int runestrncmp(const Rune* s1, const Rune* s2, long n);
+
+Rune* runestrcpy(Rune* s1, const Rune* s2);
+Rune* runestrncpy(Rune* s1, const Rune* s2, long n);
+Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2);
+
+Rune* runestrdup(const Rune* s);
+
+const Rune* runestrrchr(const Rune* s, Rune c);
+long runestrlen(const Rune* s);
+const Rune* runestrstr(const Rune* s1, const Rune* s2);
+
+
+
+// The following routines test types and modify cases for Unicode
+// characters.  Unicode defines some characters as letters and
+// specifies three cases: upper, lower, and title.  Mappings among the
+// cases are also defined, although they are not exhaustive: some
+// upper case letters have no lower case mapping, and so on.  Unicode
+// also defines several character properties, a subset of which are
+// checked by these routines.  These routines are based on Unicode
+// version 3.0.0.
+//
+// NOTE: The routines are implemented in C, so the boolean functions
+// (e.g., isupperrune) return 0 for false and 1 for true.
+//
+//
+// toupperrune, tolowerrune, and totitlerune are the Unicode case
+// mappings. These routines return the character unchanged if it has
+// no defined mapping.
+
+Rune toupperrune(Rune r);
+Rune tolowerrune(Rune r);
+Rune totitlerune(Rune r);
+
+
+// isupperrune tests for upper case characters, including Unicode
+// upper case letters and targets of the toupper mapping. islowerrune
+// and istitlerune are defined analogously. 
+ 
+int isupperrune(Rune r);
+int islowerrune(Rune r);
+int istitlerune(Rune r);
+
+
+// isalpharune tests for Unicode letters; this includes ideographs in
+// addition to alphabetic characters.
+
+int isalpharune(Rune r);
+
+
+// isdigitrune tests for digits. Non-digit numbers, such as Roman
+// numerals, are not included.
+
+int isdigitrune(Rune r);
+
+
+// isideographicrune tests for ideographic characters and numbers, as
+// defined by the Unicode standard.
+
+int isideographicrune(Rune r);
+
+
+// isspacerune tests for whitespace characters, including "C" locale
+// whitespace, Unicode defined whitespace, and the "zero-width
+// non-break space" character.
+
+int isspacerune(Rune r);
+
+
+// (The comments in this file were copied from the manpage files rune.3,
+// isalpharune.3, and runestrcat.3. Some formatting changes were also made
+// to conform to Google style. /JRM 11/11/05)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif
--- a/unittest/third_party/utf/utfdef.h
+++ b/unittest/third_party/utf/utfdef.h
@ -0,0 +1,14 @@
+// Rename the short type names to _utf-prefixed identifiers before the
+// typedefs below are seen, so the typedefs really introduce _utfuchar,
+// _utfushort, and so on (presumably to avoid clashing with identically
+// named typedefs from system headers -- TODO confirm).
+#define uchar _utfuchar
+#define ushort _utfushort
+#define uint _utfuint
+#define ulong _utfulong
+#define vlong _utfvlong
+#define uvlong _utfuvlong
+
+// Shorthand names for the unsigned integer types.
+// NOTE(review): vlong and uvlong are renamed above, but this header does
+// not typedef them.
+typedef unsigned char		uchar;
+typedef unsigned short		ushort;
+typedef unsigned int		uint;
+typedef unsigned long		ulong;
+
+// nelem(x): number of elements in the array x.
+#define nelem(x) (sizeof(x)/sizeof((x)[0]))
+// nil: a null pointer, spelled as a void pointer.
+#define nil ((void*)0)
--- a/unittest/util/utf8/unicodetext.cc
+++ b/unittest/util/utf8/unicodetext.cc
@ -0,0 +1,507 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/utf8/unicodetext.h"
+
+#include <string.h>                     // for memcpy, NULL, memcmp, etc
+#include <algorithm>                    // for max
+
+//#include "base/logging.h"               // for operator<<, CHECK, etc
+//#include "base/stringprintf.h"          // for StringPrintf, StringAppendF
+//#include "strings/stringpiece.h"        // for StringPiece, etc
+
+#include "third_party/utf/utf.h"        // for isvalidcharntorune, etc
+#include "util/utf8/unilib.h"    // for IsInterchangeValid, etc
+#include "util/utf8/unilib_utf8_utils.h"    // for OneCharLen
+
+// Counts the code points encoded in the UTF-8 byte range [start, end).
+// Every byte that is not a trail (continuation) byte starts a code point,
+// so the result is the number of such bytes.
+static int CodepointDistance(const char* start, const char* end) {
+  int lead_bytes = 0;
+  const char* cursor = start;
+  while (cursor < end) {
+    // Read as signed, trail bytes 0x80..0xBF are -128..-65, i.e. < -0x40.
+    const signed char value = *reinterpret_cast<const signed char*>(cursor);
+    if (value >= -0x40) {
+      ++lead_bytes;
+    }
+    ++cursor;
+  }
+  return lead_bytes;
+}
+
+// Counts the code points in the first len bytes of the UTF-8 buffer utf8.
+static int CodepointCount(const char* utf8, int len) {
+  const char* const limit = utf8 + len;
+  return CodepointDistance(utf8, limit);
+}
+
+// Number of code points between two iterators, counted over the bytes
+// from first's position up to (not including) last's position.
+UnicodeText::const_iterator::difference_type
+distance(const UnicodeText::const_iterator& first,
+         const UnicodeText::const_iterator& last) {
+  const char* const from = first.it_;
+  const char* const to = last.it_;
+  return CodepointDistance(from, to);
+}
+
+// ---------- Utility ----------
+
+static int ConvertToInterchangeValid(char* start, int len) {
+  // Repairs, in place, a UTF-8 buffer that CopyUTF8, TakeOwnershipOfUTF8,
+  // or PointToUTF8 found not to be interchange valid. Reaching this code
+  // indicates a bug in the caller, which those methods report with
+  // LOG(WARNING).
+  //
+  // Similar to CoerceToInterchangeValid, except that each structurally
+  // invalid byte becomes one space and each structurally valid but
+  // non-interchange character also becomes ONE space, however many bytes
+  // it occupied. E.g., "\xEF\xB7\x90" (U+FDD0) is structurally valid
+  // UTF8 but not an interchange-valid code point, so it yields one space,
+  // not three.
+  //
+  // The output is never longer than the input, so rewriting the buffer
+  // in place is safe. Returns the number of bytes written.
+  char* const buffer_begin = start;
+  char* const buffer_end = start + len;
+  char* write_pos = start;
+  while (start < buffer_end) {
+    // Keep the longest interchange-valid run, sliding it down over any gap
+    // opened by earlier multi-byte replacements.
+    const int valid_bytes =
+        UniLib::SpanInterchangeValid(start, buffer_end - start);
+    if (valid_bytes > 0) {
+      if (write_pos != start) {
+        memmove(write_pos, start, valid_bytes);
+      }
+      write_pos += valid_bytes;
+      start += valid_bytes;
+      if (start == buffer_end) {
+        break;
+      }
+    }
+    // The run stopped early: decide whether the offender is a whole
+    // (non-interchange) character or a single malformed byte.
+    char32 rune;
+    int rune_bytes;
+    const bool well_formed =
+        isvalidcharntorune(start, buffer_end - start, &rune, &rune_bytes);
+    start += well_formed ? rune_bytes : 1;
+    *write_pos++ = ' ';
+  }
+  return write_pos - buffer_begin;
+}
+
+
+// *************** Data representation **********
+
+// Note: the copy constructor is undefined.
+
+// After reserve(), resize(), or clear(), we're an owner, not an alias.
+
+// Ensures this Repr owns a buffer of at least new_capacity bytes.
+// The first size_ bytes are preserved; size_ itself is not changed.
+void UnicodeText::Repr::reserve(int new_capacity) {
+  // An owner with enough room needs nothing. An alias always falls through
+  // so that it ends up with a buffer of its own.
+  const bool already_fits = ours_ && capacity_ >= new_capacity;
+  if (already_fits) return;
+
+  // Grow to the request, or to 1.5x the old capacity plus 20 bytes,
+  // whichever is larger.
+  capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
+  char* const fresh = new char[capacity_];
+
+  if (data_ != nullptr) {
+    memcpy(fresh, data_, size_);
+    if (ours_) {
+      delete[] data_;  // Only an owner may release the old buffer.
+    }
+  }
+  data_ = fresh;
+  ours_ = true;
+}
+
+// Sets the size to new_size bytes and makes this Repr an owner.
+// Bytes added by growing are zero-filled; resizing to 0 is clear().
+void UnicodeText::Repr::resize(int new_size) {
+  if (new_size == 0) {
+    clear();
+    return;
+  }
+  // An alias, or an owner that is too small, needs a (new) private buffer.
+  const bool need_private_buffer = !ours_ || new_size > capacity_;
+  if (need_private_buffer) reserve(new_size);
+  if (new_size > size_) {
+    memset(data_ + size_, 0, new_size - size_);
+  }
+  size_ = new_size;
+  ours_ = true;
+}
+
+// clear() releases the buffer when we own it and leaves an empty owner.
+// Freeing is not strictly required; setting size_ to 0 would also work.
+void UnicodeText::Repr::clear() {
+  if (ours_) {
+    delete[] data_;
+  }
+  data_ = nullptr;
+  size_ = 0;
+  capacity_ = 0;
+  ours_ = true;
+}
+
+// Replaces the contents with a private copy of the size bytes at data.
+// Afterwards this Repr is an owner holding exactly size bytes.
+void UnicodeText::Repr::Copy(const char* data, int size) {
+  resize(size);
+  // resize(0) leaves data_ null, and handing a null pointer to memcpy is
+  // undefined even for a zero count, so an empty copy moves no bytes.
+  // memmove (not memcpy) keeps the copy defined when data overlaps our own
+  // buffer and resize() left that buffer in place, e.g. copying a text
+  // onto itself.
+  if (size > 0) {
+    memmove(data_, data, size);
+  }
+}
+
+// Adopts the caller's buffer: this Repr becomes its owner, with the given
+// size and capacity. Any buffer previously owned is released first.
+void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
+  // Being handed the buffer we already hold is treated as a no-op.
+  if (data_ == data) return;
+  const bool must_free_old = ours_ && data_;
+  if (must_free_old) {
+    delete[] data_;
+  }
+  data_ = data;
+  size_ = size;
+  capacity_ = capacity;
+  ours_ = true;
+}
+
+// Makes this Repr an alias for the caller's size bytes at data. Nothing is
+// copied, and any buffer previously owned is released first.
+void UnicodeText::Repr::PointTo(const char* data, int size) {
+  const bool must_free_old = ours_ && data_;
+  if (must_free_old) {
+    delete[] data_;
+  }
+  // The const is cast away only to store the pointer; ours_ == false
+  // records that the bytes belong to someone else.
+  data_ = const_cast<char*>(data);
+  capacity_ = size;
+  size_ = size;
+  ours_ = false;
+}
+
+// Appends byte_length bytes starting at bytes. Afterwards this Repr is an
+// owner, because reserve() gives an alias a buffer of its own.
+//
+// bytes may point into this Repr's own data (e.g. text.append(text)).
+// reserve() can free that buffer when it grows, so the source is remembered
+// as an offset and re-derived from the (possibly new) buffer afterwards.
+void UnicodeText::Repr::append(const char* bytes, int byte_length) {
+  const bool from_own_buffer = ours_ && data_ != nullptr &&
+                               bytes >= data_ && bytes < data_ + size_;
+  const ptrdiff_t offset = from_own_buffer ? bytes - data_ : 0;
+  reserve(size_ + byte_length);
+  if (from_own_buffer) {
+    bytes = data_ + offset;
+  }
+  // Skip the copy for an empty append: bytes may be null (an empty text has
+  // no buffer), and null arguments are undefined for the mem* functions
+  // even with a zero count. memmove tolerates source/destination overlap.
+  if (byte_length > 0) {
+    memmove(data_ + size_, bytes, byte_length);
+  }
+  size_ += byte_length;
+}
+
+// Describes this Repr (addresses, size, capacity, ownership) for debugging.
+string UnicodeText::Repr::DebugString() const {
+  const char* const ownership = ours_ ? "Owned" : "Alias";
+  return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}",
+                      this, data_, size_, capacity_, ownership);
+}
+
+
+
+// *************** UnicodeText ******************
+
+// ----- Constructors -----
+
+// Default constructor: an empty text (repr_ starts out as an empty owner).
+UnicodeText::UnicodeText() : repr_() {}
+
+// Copy constructor: duplicates src's bytes, so the new text is an owner
+// even when src is an alias.
+UnicodeText::UnicodeText(const UnicodeText& src) {
+  const Repr& from = src.repr_;
+  repr_.Copy(from.data_, from.size_);
+}
+
+// Substring constructor: copies the bytes in [first, last).
+// CHECK-fails if first is after last.
+UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
+                         const UnicodeText::const_iterator& last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.it_;
+  repr_.append(from, last.it_ - from);
+}
+
+// Returns the UTF-8 bytes in [first, last) as a string.
+// CHECK-fails if first is after last.
+string UnicodeText::UTF8Substring(const const_iterator& first,
+                                  const const_iterator& last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.it_;
+  return string(from, last.it_ - from);
+}
+
+
+// ----- Copy -----
+
+// Assignment copies src's bytes; assigning a text to itself does nothing.
+UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
+  if (this == &src) {
+    return *this;
+  }
+  return Copy(src);
+}
+
+// Replaces this text with a private copy of src's bytes.
+UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
+  const Repr& from = src.repr_;
+  repr_.Copy(from.data_, from.size_);
+  return *this;
+}
+
+// Copies byte_length bytes of UTF-8 from buffer. If the data is not
+// interchange-valid, a warning is logged and the copy is repaired in place.
+UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
+  repr_.Copy(buffer, byte_length);
+  const bool valid = UniLib::IsInterchangeValid(buffer, byte_length);
+  if (valid) {
+    return *this;
+  }
+  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+  // The repair never lengthens the data; adopt the new length.
+  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+  return *this;
+}
+
+// Copies byte_length bytes from buffer without any interchange-validity
+// check. Returns *this.
+UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
+                                           int byte_length) {
+  repr_.Copy(buffer, byte_length);
+  return *this;
+}
+
+// ----- TakeOwnershipOf  -----
+
+// Takes ownership of buffer (byte_length bytes used, byte_capacity
+// allocated). If the data is not interchange-valid, a warning is logged
+// and the buffer is repaired in place.
+UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
+                                              int byte_length,
+                                              int byte_capacity) {
+  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
+  const bool valid = UniLib::IsInterchangeValid(buffer, byte_length);
+  if (valid) {
+    return *this;
+  }
+  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+  return *this;
+}
+
+// Takes ownership of buffer without any interchange-validity check.
+// Returns *this.
+UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
+                                                    int byte_length,
+                                                    int byte_capacity) {
+  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
+  return *this;
+}
+
+// ----- PointTo -----
+
+// Aliases buffer when its contents are interchange-valid. Otherwise a
+// warning is logged and a repaired private copy is stored instead, so the
+// caller's bytes are never modified.
+UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
+  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
+    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+    repr_.Copy(buffer, byte_length);
+    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+    return *this;
+  }
+  repr_.PointTo(buffer, byte_length);
+  return *this;
+}
+
+// Aliases buffer without any interchange-validity check. Returns *this.
+UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
+                                          int byte_length) {
+  repr_.PointTo(buffer, byte_length);
+  return *this;
+}
+
+// Makes this text an alias for src's bytes; nothing is copied.
+// Returns *this.
+UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
+  repr_.PointTo(src.repr_.data_, src.repr_.size_);
+  return *this;
+}
+
+// Makes this text an alias for the bytes in [first, last).
+// CHECK-fails if first is after last.
+UnicodeText& UnicodeText::PointTo(const const_iterator &first,
+                                  const const_iterator &last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.utf8_data();
+  repr_.PointTo(from, last.utf8_data() - from);
+  return *this;
+}
+
+// ----- Append -----
+
+// Appends all of u's bytes to this text.
+UnicodeText& UnicodeText::append(const UnicodeText& u) {
+  const Repr& tail = u.repr_;
+  repr_.append(tail.data_, tail.size_);
+  return *this;
+}
+
+// Appends the bytes in [first, last) to this text.
+// CHECK-fails if first is after last.
+UnicodeText& UnicodeText::append(const const_iterator& first,
+                                 const const_iterator& last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.it_;
+  repr_.append(from, last.it_ - from);
+  return *this;
+}
+
+// Appends len bytes from utf8 without any interchange-validity check.
+// Returns *this.
+UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
+  repr_.append(utf8, len);
+  return *this;
+}
+
+// ----- substring searching -----
+
+// Searches for look starting at start_pos, after CHECKing that start_pos
+// lies within this text's bytes (the end position is allowed).
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
+                                              const_iterator start_pos) const {
+  CHECK_GE(start_pos.utf8_data(), utf8_data());
+  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
+  const const_iterator match = UnsafeFind(look, start_pos);
+  return match;
+}
+
+// Searches for look from the beginning of this text.
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
+  return UnsafeFind(look, begin());
+}
+
+// Returns an iterator to the first occurrence of look at or after
+// start_pos, or end() if there is none. An empty look matches at
+// start_pos. Nothing is CHECKed here; a start_pos that lies outside this
+// text's bytes simply yields end().
+UnicodeText::const_iterator UnicodeText::UnsafeFind(
+    const UnicodeText& look, const_iterator start_pos) const {
+  // Due to the magic of the UTF8 encoding, searching for a sequence of
+  // letters is equivalent to substring search.
+  const char* const text_begin = utf8_data();
+  const char* const text_end = text_begin + utf8_length();
+  const char* const from = start_pos.utf8_data();
+  if (from < text_begin || from > text_end) {
+    return end();
+  }
+  const char* const needle = look.utf8_data();
+  const char* const needle_end = needle + look.utf8_length();
+  // std::search returns text_end when there is no match, and
+  // const_iterator(text_end) is exactly end().
+  return const_iterator(std::search(from, text_end, needle, needle_end));
+}
+
+// Returns true if this text contains U+FFFD (the replacement character).
+bool UnicodeText::HasReplacementChar() const {
+  // Equivalent to:
+  //   UnicodeText replacement_char;
+  //   replacement_char.push_back(0xFFFD);
+  //   return find(replacement_char) != end();
+  // but searches for the three UTF-8 bytes of U+FFFD directly.
+  static const char kReplacementUtf8[] = "\xEF\xBF\xBD";
+  const char* const text_begin = utf8_data();
+  const char* const text_end = text_begin + utf8_length();
+  return std::search(text_begin, text_end,
+                     kReplacementUtf8, kReplacementUtf8 + 3) != text_end;
+}
+
+// ----- other methods -----
+
+// Empties the text by clearing the underlying Repr, which leaves it an
+// empty owner.
+void UnicodeText::clear() {
+  repr_.clear();
+}
+
+// Destructor: nothing to do here; repr_'s own destructor releases the
+// buffer when it is owned.
+UnicodeText::~UnicodeText() = default;
+
+
+// Appends the code point c. A value that is not a valid code point, or
+// whose encoding is not interchange-valid, is replaced by a single space
+// and reported with LOG(WARNING).
+void UnicodeText::push_back(char32 c) {
+  if (!UniLib::IsValidCodepoint(c)) {
+    LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
+    repr_.append(" ", 1);
+    return;
+  }
+  char encoded[UTFmax];
+  const int encoded_len = runetochar(encoded, &c);
+  if (!UniLib::IsInterchangeValid(encoded, encoded_len)) {
+    LOG(WARNING) << "Unicode value 0x" << std::hex << c
+                << " is not valid for interchange";
+    repr_.append(" ", 1);
+    return;
+  }
+  repr_.append(encoded, encoded_len);
+}
+
+// Number of code points in the text. This walks every byte, so the cost
+// grows with the byte length.
+int UnicodeText::size() const {
+  const char* const bytes = repr_.data_;
+  return CodepointCount(bytes, repr_.size_);
+}
+
+// Two texts are equal when their UTF-8 bytes are identical.
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
+  if (&lhs == &rhs) return true;
+  if (lhs.repr_.size_ != rhs.repr_.size_) return false;
+  // Empty texts have a null data_, and passing null to memcmp is undefined
+  // even for a zero length, so settle the empty case here.
+  if (lhs.repr_.size_ == 0) return true;
+  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
+}
+
+// Describes this text (address, code point count, Repr) for debugging.
+string UnicodeText::DebugString() const {
+  const string repr_description = repr_.DebugString();
+  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
+                      this, size(), repr_description.c_str());
+}
+
+
+// ******************* UnicodeText::const_iterator *********************
+
+// The implementation of const_iterator would be nicer if it
+// inherited from boost::iterator_facade
+// (http://boost.org/libs/iterator/doc/iterator_facade.html).
+
+// A default-constructed iterator holds a null pointer.
+UnicodeText::const_iterator::const_iterator() : it_(nullptr) {}
+
+// Copying an iterator copies its byte pointer; both then refer to the
+// same position.
+UnicodeText::const_iterator::const_iterator(const const_iterator& other)
+    : it_(other.it_) {
+}
+
+// Assignment copies the byte pointer; self-assignment is a no-op.
+UnicodeText::const_iterator&
+UnicodeText::const_iterator::operator=(const const_iterator& other) {
+  if (this != &other) {
+    it_ = other.it_;
+  }
+  return *this;
+}
+
+// Iterator at the first byte of the text's data.
+UnicodeText::const_iterator UnicodeText::begin() const {
+  return const_iterator(repr_.data_);
+}
+
+// Iterator one byte past the last byte of the text's data.
+UnicodeText::const_iterator UnicodeText::end() const {
+  return const_iterator(repr_.data_ + repr_.size_);
+}
+
+// Orders iterators by the address of the byte each one points at.
+bool operator<(const UnicodeText::const_iterator& lhs,
+               const UnicodeText::const_iterator& rhs) {
+  return lhs.it_ < rhs.it_;
+}
+
+// Decodes the code point at the current position.
+char32 UnicodeText::const_iterator::operator*() const {
+  // chartorune would work here, but it does error checking that is not
+  // needed (the data is guaranteed to be valid UTF-8), and this routine is
+  // expected to be called very often, so the decoding is done by hand.
+  const unsigned char* const bytes =
+      reinterpret_cast<const unsigned char*>(it_);
+  const unsigned char lead = bytes[0];
+
+  // One byte: 0xxxxxxx.
+  if (lead < 0x80) {
+    return lead;
+  }
+  // Two bytes: 110xxxxx 10xxxxxx.
+  if (lead < 0xE0) {
+    return ((lead & 0x1F) << 6) | (bytes[1] & 0x3F);
+  }
+  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+  if (lead < 0xF0) {
+    return ((lead & 0x0F) << 12) | ((bytes[1] & 0x3F) << 6) |
+           (bytes[2] & 0x3F);
+  }
+  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+  return ((lead & 0x07) << 18) | ((bytes[1] & 0x3F) << 12) |
+         ((bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F);
+}
+
+// Pre-increment: steps forward by the byte count UniLib::OneCharLen
+// reports for the current position (presumably the length of the
+// character starting there -- confirm against unilib_utf8_utils.h).
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
+  it_ += UniLib::OneCharLen(it_);
+  return *this;
+}
+
+// Pre-decrement: steps back one byte at a time until the byte under the
+// iterator is no longer a trail byte.
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
+  do {
+    --it_;
+  } while (UniLib::IsTrailByte(*it_));
+  return *this;
+}
+
+// Copies the bytes of the current character into utf8_output and returns
+// how many were written (1 to 4, chosen from the lead byte).
+int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
+  const int byte_count = utf8_length();
+  for (int i = 0; i < byte_count; ++i) {
+    utf8_output[i] = it_[i];
+  }
+  return byte_count;
+}
+
+// Returns the bytes of the current character as a string.
+string UnicodeText::const_iterator::get_utf8_string() const {
+  return string(utf8_data(), utf8_length());
+}
+
+// Byte length of the current character, determined from its lead byte
+// alone: below 0x80 -> 1, below 0xE0 -> 2, below 0xF0 -> 3, otherwise 4.
+int UnicodeText::const_iterator::utf8_length() const {
+  const int lead = it_[0] & 0xff;
+  if (lead < 0x80) return 1;
+  if (lead < 0xE0) return 2;
+  return lead < 0xF0 ? 3 : 4;
+}
+
+// Wraps a raw pointer into this text's UTF-8 data as an iterator.
+// CHECK-fails unless p is non-null, lies within [data, data + length],
+// and is either the end position or not a trail byte.
+UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
+  CHECK(p != nullptr);
+  const char* const start = utf8_data();
+  const char* const end = start + utf8_length();
+  CHECK(p >= start);
+  CHECK(p <= end);
+  CHECK(p == end || !UniLib::IsTrailByte(*p));
+  return const_iterator(p);
+}
+
+// Formats the iterator's byte pointer for debugging.
+string UnicodeText::const_iterator::DebugString() const {
+  return tensorflow::strings::Printf("{iter %p}", it_);
+}
+
+
+// *************************** Utilities *************************
+
+// Returns the code points of t in uppercase hex, each followed by a space.
+string CodepointString(const UnicodeText& t) {
+  string hex_codes;
+  const UnicodeText::const_iterator stop = t.end();
+  for (UnicodeText::const_iterator it = t.begin(); it != stop; ++it) {
+    tensorflow::strings::Appendf(&hex_codes, "%X ", *it);
+  }
+  return hex_codes;
+}
--- a/unittest/util/utf8/unicodetext.h
+++ b/unittest/util/utf8/unicodetext.h
@ -0,0 +1,477 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
+#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
+
+#include <stddef.h>                     // for NULL, ptrdiff_t
+#include <iterator>                     // for bidirectional_iterator_tag, etc
+#include <string>                       // for string
+#include <utility>                      // for pair
+
+#include "syntaxnet/base.h"
+
+// ***************************** UnicodeText **************************
+//
+// A UnicodeText object is a container for a sequence of Unicode
+// codepoint values. It has default, copy, and assignment constructors.
+// Data can be appended to it from another UnicodeText, from
+// iterators, or from a single codepoint.
+//
+// The internal representation of the text is UTF-8. Since UTF-8 is a
+// variable-width format, UnicodeText does not provide random access
+// to the text, and changes to the text are permitted only at the end.
+//
+// The UnicodeText class defines a const_iterator. The dereferencing
+// operator (*) returns a codepoint (char32). The iterator is a
+// bidirectional, read-only iterator. It becomes invalid if the text
+// is changed.
+//
+// There are methods for appending and retrieving UTF-8 data directly.
+// The 'utf8_data' method returns a const char* that contains the
+// UTF-8-encoded version of the text; 'utf8_length' returns the number
+// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up
+// to 4 bytes of UTF-8 data in a char array and returns the number of
+// bytes that it stored.
+//
+// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
+// 0x10FFFF], but UnicodeText has the additional restriction that it
+// can contain only those characters that are valid for interchange on
+// the Web. This excludes all of the control codes except for carriage
+// return, line feed, form feed, and horizontal tab.  It also excludes
+// non-characters, but codepoints that are in the Private Use regions
+// are allowed, as are codepoints that are unassigned. (See the
+// Unicode reference for details.) The function UniLib::IsInterchangeValid
+// can be used as a test for this property.
+//
+// UnicodeTexts are safe. Every method that constructs or modifies a
+// UnicodeText tests for interchange-validity, and will substitute a
+// space for the invalid data. Such cases are reported via
+// LOG(WARNING).
+//
+// MEMORY MANAGEMENT: copy, take ownership, or point to
+//
+// A UnicodeText is either an "owner", meaning that it owns the memory
+// for the data buffer and will free it when the UnicodeText is
+// destroyed, or it is an "alias", meaning that it does not.
+//
+// There are three methods for storing UTF-8 data in a UnicodeText:
+//
+// CopyUTF8(buffer, len) copies buffer.
+//
+// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
+//
+// PointToUTF8(buffer, size) creates an alias pointing to buffer.
+//
+// All three methods perform a validity check on the buffer. There are
+// private, "unsafe" versions of these functions that bypass the
+// validity check. They are used internally and by friend-functions
+// that are handling UTF-8 data that has already been validated.
+//
+// The purpose of an alias is to avoid making an unnecessary copy of a
+// UTF-8 buffer while still providing access to the Unicode values
+// within that text through iterators or the fast scanners that are
+// based on UTF-8 state tables. The lifetime of an alias must not
+// exceed the lifetime of the buffer from which it was constructed.
+//
+// The semantics of an alias might be described as "copy on write or
+// repair." The source data is never modified. If push_back() or
+// append() is called on an alias, a copy of the data will be created,
+// and the UnicodeText will become an owner. If clear() is called on
+// an alias, it becomes an (empty) owner.
+//
+// The copy constructor and the assignment operator produce an owner.
+// That is, after direct initialization ("UnicodeText x(y);") or copy
+// initialization ("UnicodeText x = y;") x will be an owner, even if y
+// was an alias. The assignment operator ("x = y;") also produces an
+// owner unless x and y are the same object and y is an alias.
+//
+// Aliases should be used with care. If the source from which an alias
+// was created is freed, or if the contents are changed, while the
+// alias is still in use, fatal errors could result. But it can be
+// quite useful to have a UnicodeText "window" through which to see a
+// UTF-8 buffer without having to pay the price of making a copy.
+//
+// UTILITIES
+//
+// The interfaces in util/utf8/public/textutils.h provide higher-level
+// utilities for dealing with UnicodeTexts, including routines for
+// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
+// strings, creating strings from UnicodeTexts, normalizing text for
+// efficient matching or display, and others.
+
+// Container for a sequence of Unicode code points stored as UTF-8 bytes.
+// The bytes live in a Repr, which either owns its buffer or aliases a
+// buffer belonging to the caller.
+class UnicodeText {
+ public:
+  class const_iterator;
+
+  typedef char32 value_type;
+
+  // Constructors. These always produce owners.
+  UnicodeText();  // Create an empty text.
+  UnicodeText(const UnicodeText& src);  // copy constructor
+  // Construct a substring (copies the data).
+  UnicodeText(const const_iterator& first, const const_iterator& last);
+
+  // Assignment operator. This copies the data and produces an owner
+  // unless this == &src, e.g., "x = x;", which is a no-op.
+  UnicodeText& operator=(const UnicodeText& src);
+
+  // x.Copy(y) copies the data from y into x.
+  UnicodeText& Copy(const UnicodeText& src);
+  inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
+
+  // x.PointTo(y) changes x so that it points to y's data.
+  // It does not copy y or take ownership of y's data.
+  UnicodeText& PointTo(const UnicodeText& src);
+  UnicodeText& PointTo(const const_iterator& first,
+                       const const_iterator& last);
+
+  ~UnicodeText();
+
+  void clear();  // Clear text.
+  bool empty() const { return repr_.size_ == 0; }  // Test if text is empty.
+
+  // Add a codepoint to the end of the text.
+  // If the codepoint is not interchange-valid, add a space instead
+  // and log a warning.
+  void push_back(char32 codepoint);
+
+  // Generic appending operation.
+  // iterator_traits<ForwardIterator>::value_type must be implicitly
+  // convertible to char32. Typical uses of this method might include:
+  //     char32 chars[] = {0x1, 0x2, ...};
+  //     vector<char32> more_chars = ...;
+  //     utext.append(chars, chars+arraysize(chars));
+  //     utext.append(more_chars.begin(), more_chars.end());
+  template<typename ForwardIterator>
+  UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
+    while (first != last) { push_back(*first++); }
+    return *this;
+  }
+
+  // A specialization of the generic append() method.
+  UnicodeText& append(const const_iterator& first, const const_iterator& last);
+
+  // An optimization of append(source.begin(), source.end()).
+  UnicodeText& append(const UnicodeText& source);
+
+  int size() const;  // the number of Unicode characters (codepoints)
+
+  friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
+  friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
+
+  // Read-only bidirectional iterator over the code points of a text.
+  // It is a thin wrapper around a pointer (it_) into the UTF-8 bytes.
+  class const_iterator {
+    typedef const_iterator CI;
+   public:
+    typedef std::bidirectional_iterator_tag iterator_category;
+    typedef char32 value_type;
+    typedef ptrdiff_t difference_type;
+    typedef void pointer;  // (Not needed.)
+    typedef const char32 reference;  // (Needed for const_reverse_iterator)
+
+    // Iterators are default-constructible.
+    const_iterator();
+
+    // It's safe to make multiple passes over a UnicodeText.
+    const_iterator(const const_iterator& other);
+    const_iterator& operator=(const const_iterator& other);
+
+    char32 operator*() const;  // Dereference
+
+    const_iterator& operator++();  // Advance (++iter)
+    const_iterator operator++(int) {  // (iter++)
+      const_iterator result(*this);
+      ++*this;
+      return result;
+    }
+
+    const_iterator& operator--();  // Retreat (--iter)
+    const_iterator operator--(int) {  // (iter--)
+      const_iterator result(*this);
+      --*this;
+      return result;
+    }
+
+    // We love relational operators.
+    // All of them compare the underlying byte pointers; >, <= and >= are
+    // expressed in terms of operator<.
+    friend bool operator==(const CI& lhs, const CI& rhs) {
+      return lhs.it_ == rhs.it_; }
+    friend bool operator!=(const CI& lhs, const CI& rhs) {
+      return !(lhs == rhs); }
+    friend bool operator<(const CI& lhs, const CI& rhs);
+    friend bool operator>(const CI& lhs, const CI& rhs) {
+      return rhs < lhs; }
+    friend bool operator<=(const CI& lhs, const CI& rhs) {
+      return !(rhs < lhs); }
+    friend bool operator>=(const CI& lhs, const CI& rhs) {
+      return !(lhs < rhs); }
+
+    // Number of code points between first and last.
+    friend difference_type distance(const CI& first, const CI& last);
+
+    // UTF-8-specific methods
+    // Store the UTF-8 encoding of the current codepoint into buf,
+    // which must be at least 4 bytes long. Return the number of
+    // bytes written.
+    int get_utf8(char* buf) const;
+    // Return the UTF-8 character that the iterator points to.
+    string get_utf8_string() const;
+    // Return the byte length of the UTF-8 character the iterator points to.
+    int utf8_length() const;
+    // Return the iterator's pointer into the UTF-8 data.
+    const char* utf8_data() const { return it_; }
+
+    string DebugString() const;
+
+   private:
+    friend class UnicodeText;
+    friend class UnicodeTextUtils;
+    friend class UTF8StateTableProperty;
+    // Only friends may build an iterator from a raw byte pointer.
+    explicit const_iterator(const char* it) : it_(it) {}
+
+    const char* it_;
+  };
+
+  const_iterator begin() const;
+  const_iterator end() const;
+
+  // Reverse iterator. Each UTF-8 accessor steps a copy of base() back by
+  // one character and forwards to the matching const_iterator method.
+  class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
+   public:
+    explicit const_reverse_iterator(const_iterator it) :
+        std::reverse_iterator<const_iterator>(it) {}
+    const char* utf8_data() const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).utf8_data();
+    }
+    int get_utf8(char* buf) const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).get_utf8(buf);
+    }
+    string get_utf8_string() const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).get_utf8_string();
+    }
+    int utf8_length() const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).utf8_length();
+    }
+  };
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  // Substring searching.  Returns the beginning of the first
+  // occurrence of "look", or end() if not found.
+  const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
+  // Equivalent to find(look, begin())
+  const_iterator find(const UnicodeText& look) const;
+
+  // Returns whether this contains the character U+FFFD.  This can
+  // occur, for example, if the input to Encodings::Decode() had byte
+  // sequences that were invalid in the source encoding.
+  bool HasReplacementChar() const;
+
+  // UTF-8-specific methods
+  //
+  // Return the data, length, and capacity of UTF-8-encoded version of
+  // the text. Length and capacity are measured in bytes.
+  const char* utf8_data() const { return repr_.data_; }
+  int utf8_length() const { return repr_.size_; }
+  int utf8_capacity() const { return repr_.capacity_; }
+
+  // Return the UTF-8 data as a string.
+  static string UTF8Substring(const const_iterator& first,
+                              const const_iterator& last);
+
+  // There are three methods for initializing a UnicodeText from UTF-8
+  // data. They vary in details of memory management. In all cases,
+  // the data is tested for interchange-validity. If it is not
+  // interchange-valid, a LOG(WARNING) is issued, and each
+  // structurally invalid byte and each interchange-invalid codepoint
+  // is replaced with a space.
+
+  // x.CopyUTF8(buf, len) copies buf into x.
+  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
+
+  // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
+  // buf. buf is not copied.
+  UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
+                                   int byte_length,
+                                   int byte_capacity);
+
+  // x.PointToUTF8(buf,len) changes x so that it points to buf
+  // ("becomes an alias"). It does not take ownership or copy buf.
+  // If the buffer is not valid, this has the same effect as
+  // CopyUTF8(utf8_buffer, byte_length).
+  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
+
+  // Occasionally it is necessary to use functions that operate on the
+  // pointer returned by utf8_data(). MakeIterator(p) provides a way
+  // to get back to the UnicodeText level. It uses CHECK to ensure
+  // that p is a pointer within this object's UTF-8 data, and that it
+  // points to the beginning of a character.
+  const_iterator MakeIterator(const char* p) const;
+
+  string DebugString() const;
+
+ private:
+  friend class const_iterator;
+  friend class UnicodeTextUtils;
+
+  class Repr {  // A byte-string.
+   public:
+    char* data_;     // Start of the bytes; null when empty.
+    int size_;       // Bytes in use.
+    int capacity_;   // Bytes available at data_.
+    bool ours_;  // Do we own data_?
+
+    // Starts out as an empty owner.
+    Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
+    // Frees the buffer only when it is owned; an aliased buffer is left
+    // to its real owner.
+    ~Repr() { if (ours_) delete[] data_; }
+
+    void clear();
+    void reserve(int capacity);
+    void resize(int size);
+
+    void append(const char* bytes, int byte_length);
+    void Copy(const char* data, int size);
+    void TakeOwnershipOf(char* data, int size, int capacity);
+    void PointTo(const char* data, int size);
+
+    string DebugString() const;
+
+   private:
+    // Declared private so that a Repr cannot be copied or assigned from
+    // outside the class.
+    Repr& operator=(const Repr&);
+    Repr(const Repr& other);
+  };
+
+  Repr repr_;
+
+  // UTF-8-specific private methods.
+  // These routines do not perform a validity check when compiled
+  // in opt mode.
+  // It is an error to call these methods with UTF-8 data that
+  // is not interchange-valid.
+  //
+  UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
+  UnicodeText& UnsafeTakeOwnershipOfUTF8(
+      char* utf8_buffer, int byte_length, int byte_capacity);
+  UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
+  UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
+  const_iterator UnsafeFind(const UnicodeText& look,
+                            const_iterator start_pos) const;
+};
+
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
+
+// Inequality is defined as the negation of operator==.
+inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
+  const bool same = (lhs == rhs);
+  return !same;
+}
+
+// UnicodeTextRange is a pair of iterators, useful for specifying text
+// segments. If the iterators are ==, the segment is empty.
+typedef pair<UnicodeText::const_iterator,
+             UnicodeText::const_iterator> UnicodeTextRange;
+
+// A range is empty when both of its iterators are at the same position.
+inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
+  const UnicodeText::const_iterator& range_begin = r.first;
+  const UnicodeText::const_iterator& range_end = r.second;
+  return range_begin == range_end;
+}
+
+
+// *************************** Utilities *************************
+
+// A factory function for creating a UnicodeText from a buffer of
+// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
+// is an "owner.")
+//
+// Each byte that is structurally invalid will be replaced with a
+// space. Each codepoint that is interchange-invalid will also be
+// replaced with a space, even if the codepoint was represented with a
+// multibyte sequence in the UTF-8 data.
+//
+// NOTE(review): TakeOwnershipOfUTF8 returns a reference to the temporary,
+// so the returned object is copy-constructed from it (a second buffer),
+// and the temporary then releases utf8_buffer when it is destroyed.
+inline UnicodeText MakeUnicodeTextAcceptingOwnership(
+    char* utf8_buffer, int byte_length, int byte_capacity) {
+  return UnicodeText().TakeOwnershipOfUTF8(
+      utf8_buffer, byte_length, byte_capacity);
+}
+
+// A factory function for creating a UnicodeText from a buffer of
+// UTF-8 data. The new UnicodeText does not take ownership of the
+// buffer. (It is an "alias.")
+//
+// NOTE(review): PointToUTF8 returns a reference to the temporary, so the
+// returned object is copy-constructed from it, and the copy constructor
+// copies the bytes. The result therefore looks like an owner rather than
+// an alias -- confirm whether callers rely on alias behavior.
+inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
+    const char* utf8_buffer, int byte_length) {
+  return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
+}
+
+// Create a UnicodeText from a UTF-8 string or buffer.
+//
+// If do_copy is true, then a copy of the string is made. The copy is
+// owned by the resulting UnicodeText object and will be freed when
+// the object is destroyed. This UnicodeText object is referred to
+// as an "owner."
+//
+// If do_copy is false, then no copy is made. The resulting
+// UnicodeText object does NOT take ownership of the string; in this
+// case, the lifetime of the UnicodeText object must not exceed the
+// lifetime of the string. This Unicodetext object is referred to as
+// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
+//
+// If the input string does not contain valid UTF-8, then a copy is
+// made (as if do_copy were true) and coerced to valid UTF-8 by
+// replacing each invalid byte with a space.
+//
+// Builds a UnicodeText from len bytes at utf8_buf, copying them when
+// do_copy is true and pointing at them otherwise.
+inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
+                                     bool do_copy) {
+  UnicodeText t;
+  if (!do_copy) {
+    t.PointToUTF8(utf8_buf, len);
+  } else {
+    t.CopyUTF8(utf8_buf, len);
+  }
+  return t;
+}
+
+// String overload: forwards the string's bytes and length.
+inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
+  return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
+}
+
+// Buffer overload that always copies (do_copy = true).
+inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
+  return UTF8ToUnicodeText(utf8_buf, len, true);
+}
+// String overload that always copies (do_copy = true).
+inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
+  return UTF8ToUnicodeText(utf8_string, true);
+}
+
+// Return a string containing the UTF-8 encoded version of all the
+// Unicode characters in t.
+// Copies t's UTF-8 bytes into a new string.
+inline string UnicodeTextToUTF8(const UnicodeText& t) {
+  const char* const bytes = t.utf8_data();
+  return string(bytes, t.utf8_length());
+}
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
+
+// For debugging.  Return a string of integers, written in uppercase
+// hex (%X), corresponding to the codepoints within the text. Each
+// integer is followed by a space. E.g., "61 62 6A 3005 ".
+string CodepointString(const UnicodeText& t);
+
+#endif  // UTIL_UTF8_PUBLIC_UNICODETEXT_H_
--- a/unittest/util/utf8/unilib.cc
+++ b/unittest/util/utf8/unilib.cc
@ -0,0 +1,58 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+
+#include "util/utf8/unilib.h"
+
+#include "syntaxnet/base.h"
+#include "third_party/utf/utf.h"
+
+namespace UniLib {
+
+// Reports whether codepoint c is allowed for interchange. Rejected are:
+//   C0 (ASCII) controls: U+0000 to U+001F, except Horizontal Tab
+//       (HT, U+0009), Line-Feed (LF, U+000A), Form Feed (FF, U+000C)
+//       and Carriage-Return (CR, U+000D)
+//   Delete (DEL, U+007F) and C1 controls: U+0080 to U+009F
+//   Surrogates: U+D800 to U+DFFF
+//   Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
+// Everything else is accepted.
+bool IsInterchangeValid(char32 c) {
+  // C0 controls, leaving gaps for HT (0x09), LF (0x0A), FF (0x0C), CR (0x0D).
+  if ((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F)) {
+    return false;
+  }
+  // DEL followed by the C1 control block.
+  if (c >= 0x7F && c <= 0x9F) {
+    return false;
+  }
+  // Surrogate range.
+  if (c >= 0xD800 && c <= 0xDFFF) {
+    return false;
+  }
+  // Non-character block U+FDD0..U+FDEF.
+  if (c >= 0xFDD0 && c <= 0xFDEF) {
+    return false;
+  }
+  // Low 16 bits equal to FFFE or FFFF: the last two codepoints of a plane.
+  if ((c & 0xFFFE) == 0xFFFE) {
+    return false;
+  }
+  return true;
+}
+
+// Scans [begin, begin + byte_length) one rune at a time and returns the
+// number of leading bytes that decode to interchange-valid codepoints. The
+// scan stops at the first decoding error or rejected codepoint.
+int SpanInterchangeValid(const char* begin, int byte_length) {
+  const char* cursor = begin;
+  const char* const limit = begin + byte_length;
+  while (cursor < limit) {
+    char32 decoded;
+    const int step = charntorune(&decoded, cursor, limit - cursor);
+    // Runeerror (U+FFFD) is itself an acceptable codepoint, but charntorune
+    // also reports it on failure. The two cases are told apart by length:
+    // a real U+FFFD occupies 3 bytes, while a failure consumes at most 1.
+    const bool decode_failed = (decoded == Runeerror && step <= 1);
+    if (decode_failed || !IsInterchangeValid(decoded)) {
+      return cursor - begin;  // Offset of the first offending byte.
+    }
+    cursor += step;
+  }
+  return cursor - begin;
+}
+
+}  // namespace UniLib
--- a/unittest/util/utf8/unilib.h
+++ b/unittest/util/utf8/unilib.h
@ -0,0 +1,63 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Routines to do manipulation of Unicode characters or text
+//
+// The StructurallyValid routines accept buffers of arbitrary bytes.
+// For CoerceToStructurallyValid(), the input buffer and output buffers may
+// point to exactly the same memory.
+//
+// In all other cases, the UTF-8 string must be structurally valid and
+// have all codepoints in the range  U+0000 to U+D7FF or U+E000 to U+10FFFF.
+// Debug builds take a fatal error for invalid UTF-8 input.
+// The input and output buffers may not overlap at all.
+//
+// The char32 routines are here only for convenience; they convert to UTF-8
+// internally and use the UTF-8 routines.
+
+#ifndef UTIL_UTF8_UNILIB_H__
+#define UTIL_UTF8_UNILIB_H__
+
+#include <string>
+#include "syntaxnet/base.h"
+
+// OneCharLen, IsValidCodepoint, and IsTrailByte are defined in
+// unilib_utf8_utils.h. The re-export from this header is currently disabled:
+//#include "util/utf8/public/unilib_utf8_utils.h"  // IWYU pragma: export
+
+namespace UniLib {
+
+// Returns the length in bytes of the prefix of src that is all
+// interchange valid UTF-8. A return value equal to byte_length means
+// the whole buffer is interchange valid.
+//   src:         start of the buffer to examine
+//   byte_length: number of bytes in the buffer
+int SpanInterchangeValid(const char* src, int byte_length);
+// std::string convenience overload: examines the bytes held by src.
+inline int SpanInterchangeValid(const std::string& src) {
+  const int byte_length = static_cast<int>(src.size());
+  return SpanInterchangeValid(src.data(), byte_length);
+}
+
+// Returns true if the source is all interchange valid UTF-8.
+// "Interchange valid" is stronger than structurally valid --
+// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
+// This overload checks a single codepoint.
+bool IsInterchangeValid(char32 codepoint);
+// Buffer overload: true when the interchange-valid prefix reported by
+// SpanInterchangeValid covers all byte_length bytes of src.
+inline bool IsInterchangeValid(const char* src, int byte_length) {
+  const int valid_prefix = SpanInterchangeValid(src, byte_length);
+  return valid_prefix == byte_length;
+}
+// std::string convenience overload: applies the buffer overload to the
+// bytes held by src.
+inline bool IsInterchangeValid(const std::string& src) {
+  const int byte_length = static_cast<int>(src.size());
+  return IsInterchangeValid(src.data(), byte_length);
+}
+
+}  // namespace UniLib
+
+#endif  // UTIL_UTF8_UNILIB_H__
--- a/unittest/util/utf8/unilib_utf8_utils.h
+++ b/unittest/util/utf8/unilib_utf8_utils.h
@ -0,0 +1,66 @@
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
+#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
+
+// These definitions are self-contained and have no dependencies.
+// They are also exported from unilib.h for legacy reasons.
+
+#include "syntaxnet/base.h"
+#include "third_party/utf/utf.h"
+
+namespace UniLib {
+
+// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF]
+// (i.e., is not a surrogate codepoint). See also
+// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
+inline bool IsValidCodepoint(char32 c) {
+  // The lower range is tested on the value converted to uint32.
+  const bool below_surrogates = static_cast<uint32>(c) < 0xD800;
+  const bool above_surrogates = (c >= 0xE000 && c <= 0x10FFFF);
+  return below_surrogates || above_surrogates;
+}
+
+// Returns true if 'str' is the start of a structurally valid UTF-8
+// sequence and is not a surrogate codepoint. Returns false if str.empty()
+// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
+// will access n bytes of str (1-4), where n is UniLib::OneCharLen(str[0]).
+inline bool IsUTF8ValidCodepoint(StringPiece str) {
+  if (str.empty()) {
+    return false;
+  }
+  char32 codepoint;
+  int bytes_used;
+  // str may be longer than the bytes used by its first code point; that is
+  // fine, and bytes_used is not inspected further.
+  if (!isvalidcharntorune(str.data(), str.size(), &codepoint, &bytes_used)) {
+    return false;
+  }
+  return IsValidCodepoint(codepoint);
+}
+
+// Returns the length (number of bytes) of the Unicode code point
+// starting at src, based on inspecting just that one byte. This
+// requires that src point to a well-formed UTF-8 string; the result
+// is undefined otherwise.
+inline int OneCharLen(const char* src) {
+  // Only the top four bits of the first byte decide the length.
+  const unsigned high_nibble = static_cast<unsigned char>(*src) >> 4;
+  if (high_nibble < 0xC) {
+    return 1;  // First byte 0x00-0xBF.
+  }
+  if (high_nibble < 0xE) {
+    return 2;  // First byte 0xC0-0xDF.
+  }
+  return high_nibble == 0xE ? 3 : 4;  // 0xE0-0xEF : 0xF0-0xFF.
+}
+
+// Returns true if this byte is a trailing UTF-8 byte (10xx xxxx)
+inline bool IsTrailByte(char x) {
+  // Equivalent to (x & 0xC0) == 0x80. Trail bytes are exactly [0x80, 0xBF],
+  // which as a signed char is [-0x80, -0x41], so one comparison suffices.
+  const signed char as_signed = static_cast<signed char>(x);
+  return as_signed < -0x40;
+}
+
+}  // namespace UniLib
+
+#endif  // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_