From 07021942467150448c7fb9168fdb4ddf38ad4850 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 24 Jun 2019 12:33:26 +0200 Subject: [PATCH 1/5] Add code from tensorflow/models The new code was copied from the latest code on GitHub (https://github.com/tensorflow/models/tree/master/research/syntaxnet). It is required for pango_font_info_test and other unit tests. Signed-off-by: Stefan Weil --- unittest/syntaxnet/base.h | 61 +++ unittest/third_party/utf/rune.c | 357 +++++++++++++++++ unittest/third_party/utf/utf.h | 246 ++++++++++++ unittest/third_party/utf/utfdef.h | 14 + unittest/util/utf8/unicodetext.cc | 507 +++++++++++++++++++++++++ unittest/util/utf8/unicodetext.h | 477 +++++++++++++++++++++++ unittest/util/utf8/unilib.cc | 58 +++ unittest/util/utf8/unilib.h | 63 +++ unittest/util/utf8/unilib_utf8_utils.h | 66 ++++ 9 files changed, 1849 insertions(+) create mode 100644 unittest/syntaxnet/base.h create mode 100644 unittest/third_party/utf/rune.c create mode 100644 unittest/third_party/utf/utf.h create mode 100644 unittest/third_party/utf/utfdef.h create mode 100644 unittest/util/utf8/unicodetext.cc create mode 100644 unittest/util/utf8/unicodetext.h create mode 100644 unittest/util/utf8/unilib.cc create mode 100644 unittest/util/utf8/unilib.h create mode 100644 unittest/util/utf8/unilib_utf8_utils.h diff --git a/unittest/syntaxnet/base.h b/unittest/syntaxnet/base.h new file mode 100644 index 000000000..5dabbbdaf --- /dev/null +++ b/unittest/syntaxnet/base.h @@ -0,0 +1,61 @@ +/* Copyright 2016 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef SYNTAXNET_BASE_H_ +#define SYNTAXNET_BASE_H_ + +#include +#include +#include +#include +#include + +#include "google/protobuf/util/message_differencer.h" + + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/default/integral_types.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/protobuf.h" + + + +using tensorflow::int8; +using tensorflow::int16; +using tensorflow::int32; +using tensorflow::int64; +using tensorflow::uint8; +using tensorflow::uint16; +using tensorflow::uint64; +using tensorflow::uint32; +using tensorflow::protobuf::TextFormat; +using tensorflow::mutex_lock; +using tensorflow::mutex; +using std::map; +using std::pair; +using std::vector; +using std::unordered_map; +using std::unordered_set; +typedef signed int char32; + +using tensorflow::StringPiece; +using std::string; + + + // namespace syntaxnet + +#endif // SYNTAXNET_BASE_H_ diff --git a/unittest/third_party/utf/rune.c b/unittest/third_party/utf/rune.c new file mode 100644 index 000000000..3d8605704 --- /dev/null +++ b/unittest/third_party/utf/rune.c @@ -0,0 +1,357 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. + * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. 
IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#include +#include +#include "third_party/utf/utf.h" +#include "third_party/utf/utfdef.h" + +enum +{ + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, + /* 0001 1111 1111 1111 1111 1111 */ + + Maskx = (1< T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + // If we can't read more than one character we must stop + if(length <= 1) { + goto badlen; + } + + /* + * two character sequence (11-bit value) + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + // If we can't read more than two characters we must stop + if(length <= 2) { + goto badlen; + } + + /* + * three character sequence (16-bit value) + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + if (length <= 3) + goto badlen; + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << 
Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } + + // Support for 5-byte or longer UTF-8 would go here, but + // since we don't have that, we'll just fall through to bad. + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +badlen: + *rune = Bad; + return 0; + +} + + +/* + * This is the older "unsafe" version, which works fine on + * null-terminated strings. + */ +int +chartorune(Rune *rune, const char *str) +{ + int c, c1, c2, c3; + long l; + + /* + * one character sequence + * 00000-0007F => T1 + */ + c = *(uchar*)str; + if(c < Tx) { + *rune = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + c1 = *(uchar*)(str+1) ^ Tx; + if(c1 & Testx) + goto bad; + if(c < T3) { + if(c < T2) + goto bad; + l = ((c << Bitx) | c1) & Rune2; + if(l <= Rune1) + goto bad; + *rune = l; + return 2; + } + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + c2 = *(uchar*)(str+2) ^ Tx; + if(c2 & Testx) + goto bad; + if(c < T4) { + l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; + if(l <= Rune2) + goto bad; + *rune = l; + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + c3 = *(uchar*)(str+3) ^ Tx; + if (c3 & Testx) + goto bad; + if (c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if (l <= Rune3) + goto bad; + if (l > Runemax) + goto bad; + *rune = l; + return 4; + } + + /* + * Support for 5-byte or longer UTF-8 would go here, but + * since we don't have that, we'll just fall through to bad. + */ + + /* + * bad decoding + */ +bad: + *rune = Bad; + return 1; +} + +int +isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { + *consumed = charntorune(rune, str, length); + return *rune != Runeerror || *consumed == 3; +} + +int +runetochar(char *str, const Rune *rune) +{ + /* Runes are signed, so convert to unsigned for range check. 
*/ + unsigned long c; + + /* + * one character sequence + * 00000-0007F => 00-7F + */ + c = *rune; + if(c <= Rune1) { + str[0] = c; + return 1; + } + + /* + * two character sequence + * 0080-07FF => T2 Tx + */ + if(c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); + return 2; + } + + /* + * If the Rune is out of range, convert it to the error rune. + * Do this test here because the error rune encodes to three bytes. + * Doing it earlier would duplicate work, since an out of range + * Rune wouldn't have fit in one or two bytes. + */ + if (c > Runemax) + c = Runeerror; + + /* + * three character sequence + * 0800-FFFF => T3 Tx Tx + */ + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence (21-bit value) + * 10000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; +} + +int +runelen(Rune rune) +{ + char str[10]; + + return runetochar(str, &rune); +} + +int +runenlen(const Rune *r, int nrune) +{ + int nb; + ulong c; /* Rune is signed, so use unsigned for range check. */ + + nb = 0; + while(nrune--) { + c = *r++; + if (c <= Rune1) + nb++; + else if (c <= Rune2) + nb += 2; + else if (c <= Rune3) + nb += 3; + else if (c <= Runemax) + nb += 4; + else + nb += 3; /* Runeerror = 0xFFFD, see runetochar */ + } + return nb; +} + +int +fullrune(const char *str, int n) +{ + if (n > 0) { + int c = *(uchar*)str; + if (c < Tx) + return 1; + if (n > 1) { + if (c < T3) + return 1; + if (n > 2) { + if (c < T4 || n > 3) + return 1; + } + } + } + return 0; +} diff --git a/unittest/third_party/utf/utf.h b/unittest/third_party/utf/utf.h new file mode 100644 index 000000000..06982e58f --- /dev/null +++ b/unittest/third_party/utf/utf.h @@ -0,0 +1,246 @@ +/* + * The authors of this software are Rob Pike and Ken Thompson. 
+ * Copyright (c) 2002 by Lucent Technologies. + * Permission to use, copy, modify, and distribute this software for any + * purpose without fee is hereby granted, provided that this entire notice + * is included in all copies of any software which is or includes a copy + * or modification of this software and in all copies of the supporting + * documentation for such software. + * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY + * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. + */ +#ifndef _UTFH_ +#define _UTFH_ 1 + +#include + +typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* maximum rune value */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * rune routines + */ + +/* + * These routines were written by Rob Pike and Ken Thompson + * and first appeared in Plan 9. + * SEE ALSO + * utf (7) + * tcs (1) +*/ + +// runetochar copies (encodes) one rune, pointed to by r, to at most +// UTFmax bytes starting at s and returns the number of bytes generated. + +int runetochar(char* s, const Rune* r); + + +// chartorune copies (decodes) at most UTFmax bytes starting at s to +// one rune, pointed to by r, and returns the number of bytes consumed. +// If the input is not exactly in UTF format, chartorune will set *r +// to Runeerror and return 1. +// +// Note: There is no special case for a "null-terminated" string. A +// string whose first byte has the value 0 is the UTF8 encoding of the +// Unicode value 0 (i.e., ASCII NULL). 
A byte value of 0 is illegal +// anywhere else in a UTF sequence. + +int chartorune(Rune* r, const char* s); + + +// charntorune is like chartorune, except that it will access at most +// n bytes of s. If the UTF sequence is incomplete within n bytes, +// charntorune will set *r to Runeerror and return 0. If it is complete +// but not in UTF format, it will set *r to Runeerror and return 1. +// +// Added 2004-09-24 by Wei-Hwa Huang + +int charntorune(Rune* r, const char* s, int n); + +// isvalidcharntorune(str, n, r, consumed) +// is a convenience function that calls "*consumed = charntorune(r, str, n)" +// and returns an int (logically boolean) indicating whether the first +// n bytes of str was a valid and complete UTF sequence. + +int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed); + +// runelen returns the number of bytes required to convert r into UTF. + +int runelen(Rune r); + + +// runenlen returns the number of bytes required to convert the n +// runes pointed to by r into UTF. + +int runenlen(const Rune* r, int n); + + +// fullrune returns 1 if the string s of length n is long enough to be +// decoded by chartorune, and 0 otherwise. This does not guarantee +// that the string contains a legal UTF encoding. This routine is used +// by programs that obtain input one byte at a time and need to know +// when a full rune has arrived. + +int fullrune(const char* s, int n); + +// The following routines are analogous to the corresponding string +// routines with "utf" substituted for "str", and "rune" substituted +// for "chr". + +// utflen returns the number of runes that are represented by the UTF +// string s. (cf. strlen) + +int utflen(const char* s); + + +// utfnlen returns the number of complete runes that are represented +// by the first n bytes of the UTF string s. 
If the last few bytes of +// the string contain an incompletely coded rune, utfnlen will not +// count them; in this way, it differs from utflen, which includes +// every byte of the string. (cf. strnlen) + +int utfnlen(const char* s, long n); + + +// utfrune returns a pointer to the first occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strchr) + +const char* utfrune(const char* s, Rune r); + + +// utfrrune returns a pointer to the last occurrence of rune r in the +// UTF string s, or 0 if r does not occur in the string. The NULL +// byte terminating a string is considered to be part of the string s. +// (cf. strrchr) + +const char* utfrrune(const char* s, Rune r); + + +// utfutf returns a pointer to the first occurrence of the UTF string +// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the +// null string, utfutf returns s1. (cf. strstr) + +const char* utfutf(const char* s1, const char* s2); + + +// utfecpy copies UTF sequences until a null sequence has been copied, +// but writes no sequences beyond es1. If any sequences are copied, +// s1 is terminated by a null sequence, and a pointer to that sequence +// is returned. Otherwise, the original s1 is returned. (cf. strecpy) + +char* utfecpy(char *s1, char *es1, const char *s2); + + + +// These functions are rune-string analogues of the corresponding +// functions in strcat (3). +// +// These routines first appeared in Plan 9. +// SEE ALSO +// memmove (3) +// rune (3) +// strcat (2) +// +// BUGS: The outcome of overlapping moves varies among implementations. 
+ +Rune* runestrcat(Rune* s1, const Rune* s2); +Rune* runestrncat(Rune* s1, const Rune* s2, long n); + +const Rune* runestrchr(const Rune* s, Rune c); + +int runestrcmp(const Rune* s1, const Rune* s2); +int runestrncmp(const Rune* s1, const Rune* s2, long n); + +Rune* runestrcpy(Rune* s1, const Rune* s2); +Rune* runestrncpy(Rune* s1, const Rune* s2, long n); +Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); + +Rune* runestrdup(const Rune* s); + +const Rune* runestrrchr(const Rune* s, Rune c); +long runestrlen(const Rune* s); +const Rune* runestrstr(const Rune* s1, const Rune* s2); + + + +// The following routines test types and modify cases for Unicode +// characters. Unicode defines some characters as letters and +// specifies three cases: upper, lower, and title. Mappings among the +// cases are also defined, although they are not exhaustive: some +// upper case letters have no lower case mapping, and so on. Unicode +// also defines several character properties, a subset of which are +// checked by these routines. These routines are based on Unicode +// version 3.0.0. +// +// NOTE: The routines are implemented in C, so the boolean functions +// (e.g., isupperrune) return 0 for false and 1 for true. +// +// +// toupperrune, tolowerrune, and totitlerune are the Unicode case +// mappings. These routines return the character unchanged if it has +// no defined mapping. + +Rune toupperrune(Rune r); +Rune tolowerrune(Rune r); +Rune totitlerune(Rune r); + + +// isupperrune tests for upper case characters, including Unicode +// upper case letters and targets of the toupper mapping. islowerrune +// and istitlerune are defined analogously. + +int isupperrune(Rune r); +int islowerrune(Rune r); +int istitlerune(Rune r); + + +// isalpharune tests for Unicode letters; this includes ideographs in +// addition to alphabetic characters. + +int isalpharune(Rune r); + + +// isdigitrune tests for digits. Non-digit numbers, such as Roman +// numerals, are not included. 
+ +int isdigitrune(Rune r); + + +// isideographicrune tests for ideographic characters and numbers, as +// defined by the Unicode standard. + +int isideographicrune(Rune r); + + +// isspacerune tests for whitespace characters, including "C" locale +// whitespace, Unicode defined whitespace, and the "zero-width +// non-break space" character. + +int isspacerune(Rune r); + + +// (The comments in this file were copied from the manpage files rune.3, +// isalpharune.3, and runestrcat.3. Some formatting changes were also made +// to conform to Google style. /JRM 11/11/05) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/unittest/third_party/utf/utfdef.h b/unittest/third_party/utf/utfdef.h new file mode 100644 index 000000000..4b58ae87e --- /dev/null +++ b/unittest/third_party/utf/utfdef.h @@ -0,0 +1,14 @@ +#define uchar _utfuchar +#define ushort _utfushort +#define uint _utfuint +#define ulong _utfulong +#define vlong _utfvlong +#define uvlong _utfuvlong + +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; + +#define nelem(x) (sizeof(x)/sizeof((x)[0])) +#define nil ((void*)0) diff --git a/unittest/util/utf8/unicodetext.cc b/unittest/util/utf8/unicodetext.cc new file mode 100644 index 000000000..99cb02d5a --- /dev/null +++ b/unittest/util/utf8/unicodetext.cc @@ -0,0 +1,507 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "util/utf8/unicodetext.h" + +#include // for memcpy, NULL, memcmp, etc +#include // for max + +//#include "base/logging.h" // for operator<<, CHECK, etc +//#include "base/stringprintf.h" // for StringPrintf, StringAppendF +//#include "strings/stringpiece.h" // for StringPiece, etc + +#include "third_party/utf/utf.h" // for isvalidcharntorune, etc +#include "util/utf8/unilib.h" // for IsInterchangeValid, etc +#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen + +static int CodepointDistance(const char* start, const char* end) { + int n = 0; + // Increment n on every non-trail-byte. + for (const char* p = start; p < end; ++p) { + n += (*reinterpret_cast(p) >= -0x40); + } + return n; +} + +static int CodepointCount(const char* utf8, int len) { + return CodepointDistance(utf8, utf8 + len); +} + +UnicodeText::const_iterator::difference_type +distance(const UnicodeText::const_iterator& first, + const UnicodeText::const_iterator& last) { + return CodepointDistance(first.it_, last.it_); +} + +// ---------- Utility ---------- + +static int ConvertToInterchangeValid(char* start, int len) { + // This routine is called only when we've discovered that a UTF-8 buffer + // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 + // was not interchange valid. This indicates a bug in the caller, and + // a LOG(WARNING) is done in that case. + // This is similar to CoerceToInterchangeValid, but it replaces each + // structurally valid byte with a space, and each non-interchange + // character with a space, even when that character requires more + // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is + // structurally valid UTF8, but U+FDD0 is not an interchange-valid + // code point. The result should contain one space, not three. + // + // Since the conversion never needs to write more data than it + // reads, it is safe to change the buffer in place. It returns the + // number of bytes written. 
+ char* const in = start; + char* out = start; + char* const end = start + len; + while (start < end) { + int good = UniLib::SpanInterchangeValid(start, end - start); + if (good > 0) { + if (out != start) { + memmove(out, start, good); + } + out += good; + start += good; + if (start == end) { + break; + } + } + // Is the current string invalid UTF8 or just non-interchange UTF8? + char32 rune; + int n; + if (isvalidcharntorune(start, end - start, &rune, &n)) { + // structurally valid UTF8, but not interchange valid + start += n; // Skip over the whole character. + } else { // bad UTF8 + start += 1; // Skip over just one byte + } + *out++ = ' '; + } + return out - in; +} + + +// *************** Data representation ********** + +// Note: the copy constructor is undefined. + +// After reserve(), resize(), or clear(), we're an owner, not an alias. + +void UnicodeText::Repr::reserve(int new_capacity) { + // If there's already enough capacity, and we're an owner, do nothing. + if (capacity_ >= new_capacity && ours_) return; + + // Otherwise, allocate a new buffer. + capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20); + char* new_data = new char[capacity_]; + + // If there is an old buffer, copy it into the new buffer. + if (data_) { + memcpy(new_data, data_, size_); + if (ours_) delete[] data_; // If we owned the old buffer, free it. + } + data_ = new_data; + ours_ = true; // We own the new buffer. + // size_ is unchanged. +} + +void UnicodeText::Repr::resize(int new_size) { + if (new_size == 0) { + clear(); + } else { + if (!ours_ || new_size > capacity_) reserve(new_size); + // Clear the memory in the expanded part. + if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); + size_ = new_size; + ours_ = true; + } +} + +// This implementation of clear() deallocates the buffer if we're an owner. +// That's not strictly necessary; we could just set size_ to 0. 
+void UnicodeText::Repr::clear() { + if (ours_) delete[] data_; + data_ = nullptr; + size_ = capacity_ = 0; + ours_ = true; +} + +void UnicodeText::Repr::Copy(const char* data, int size) { + resize(size); + memcpy(data_, data, size); +} + +void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { + if (data == data_) return; // We already own this memory. (Weird case.) + if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. + data_ = data; + size_ = size; + capacity_ = capacity; + ours_ = true; +} + +void UnicodeText::Repr::PointTo(const char* data, int size) { + if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. + data_ = const_cast(data); + size_ = size; + capacity_ = size; + ours_ = false; +} + +void UnicodeText::Repr::append(const char* bytes, int byte_length) { + reserve(size_ + byte_length); + memcpy(data_ + size_, bytes, byte_length); + size_ += byte_length; +} + +string UnicodeText::Repr::DebugString() const { + return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", + this, + data_, size_, capacity_, + ours_ ? 
"Owned" : "Alias"); +} + + + +// *************** UnicodeText ****************** + +// ----- Constructors ----- + +// Default constructor +UnicodeText::UnicodeText() { +} + +// Copy constructor +UnicodeText::UnicodeText(const UnicodeText& src) { + Copy(src); +} + +// Substring constructor +UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, + const UnicodeText::const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.append(first.it_, last.it_ - first.it_); +} + +string UnicodeText::UTF8Substring(const const_iterator& first, + const const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + return string(first.it_, last.it_ - first.it_); +} + + +// ----- Copy ----- + +UnicodeText& UnicodeText::operator=(const UnicodeText& src) { + if (this != &src) { + Copy(src); + } + return *this; +} + +UnicodeText& UnicodeText::Copy(const UnicodeText& src) { + repr_.Copy(src.repr_.data_, src.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { + repr_.Copy(buffer, byte_length); + if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, + int byte_length) { + repr_.Copy(buffer, byte_length); + return *this; +} + +// ----- TakeOwnershipOf ----- + +UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, + int byte_length, + int byte_capacity) { + repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); + if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, + int byte_length, + int byte_capacity) { + 
repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); + return *this; +} + +// ----- PointTo ----- + +UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { + if (UniLib:: IsInterchangeValid(buffer, byte_length)) { + repr_.PointTo(buffer, byte_length); + } else { + LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; + repr_.Copy(buffer, byte_length); + repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); + } + return *this; +} + +UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, + int byte_length) { + repr_.PointTo(buffer, byte_length); + return *this; +} + +UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { + repr_.PointTo(src.repr_.data_, src.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::PointTo(const const_iterator &first, + const const_iterator &last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); + return *this; +} + +// ----- Append ----- + +UnicodeText& UnicodeText::append(const UnicodeText& u) { + repr_.append(u.repr_.data_, u.repr_.size_); + return *this; +} + +UnicodeText& UnicodeText::append(const const_iterator& first, + const const_iterator& last) { + CHECK(first <= last) << " Incompatible iterators"; + repr_.append(first.it_, last.it_ - first.it_); + return *this; +} + +UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { + repr_.append(utf8, len); + return *this; +} + +// ----- substring searching ----- + +UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, + const_iterator start_pos) const { + CHECK_GE(start_pos.utf8_data(), utf8_data()); + CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length()); + return UnsafeFind(look, start_pos); +} + +UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { + return UnsafeFind(look, begin()); +} + +UnicodeText::const_iterator UnicodeText::UnsafeFind( + const UnicodeText& look, 
const_iterator start_pos) const { + // Due to the magic of the UTF8 encoding, searching for a sequence of + // letters is equivalent to substring search. + StringPiece searching(utf8_data(), utf8_length()); + StringPiece look_piece(look.utf8_data(), look.utf8_length()); + LOG(FATAL) << "Not implemented"; + //StringPiece::size_type found = + // searching.find(look_piece, start_pos.utf8_data() - utf8_data()); + StringPiece::size_type found = StringPiece::npos; + if (found == StringPiece::npos) return end(); + return const_iterator(utf8_data() + found); +} + +bool UnicodeText::HasReplacementChar() const { + // Equivalent to: + // UnicodeText replacement_char; + // replacement_char.push_back(0xFFFD); + // return find(replacement_char) != end(); + StringPiece searching(utf8_data(), utf8_length()); + StringPiece looking_for("\xEF\xBF\xBD", 3); + LOG(FATAL) << "Not implemented"; + //return searching.find(looking_for) != StringPiece::npos; + return false; +} + +// ----- other methods ----- + +// Clear operator +void UnicodeText::clear() { + repr_.clear(); +} + +// Destructor +UnicodeText::~UnicodeText() {} + + +void UnicodeText::push_back(char32 c) { + if (UniLib::IsValidCodepoint(c)) { + char buf[UTFmax]; + int len = runetochar(buf, &c); + if (UniLib::IsInterchangeValid(buf, len)) { + repr_.append(buf, len); + } else { + LOG(WARNING) << "Unicode value 0x" << std::hex << c + << " is not valid for interchange"; + repr_.append(" ", 1); + } + } else { + LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c; + repr_.append(" ", 1); + } +} + +int UnicodeText::size() const { + return CodepointCount(repr_.data_, repr_.size_); +} + +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { + if (&lhs == &rhs) return true; + if (lhs.repr_.size_ != rhs.repr_.size_) return false; + return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; +} + +string UnicodeText::DebugString() const { + return tensorflow::strings::Printf("{UnicodeText %p chars=%d 
repr=%s}", + this, + size(), + repr_.DebugString().c_str()); +} + + +// ******************* UnicodeText::const_iterator ********************* + +// The implementation of const_iterator would be nicer if it +// inherited from boost::iterator_facade +// (http://boost.org/libs/iterator/doc/iterator_facade.html). + +UnicodeText::const_iterator::const_iterator() : it_(nullptr) {} + +UnicodeText::const_iterator::const_iterator(const const_iterator& other) + : it_(other.it_) { +} + +UnicodeText::const_iterator& +UnicodeText::const_iterator::operator=(const const_iterator& other) { + if (&other != this) + it_ = other.it_; + return *this; +} + +UnicodeText::const_iterator UnicodeText::begin() const { + return const_iterator(repr_.data_); +} + +UnicodeText::const_iterator UnicodeText::end() const { + return const_iterator(repr_.data_ + repr_.size_); +} + +bool operator<(const UnicodeText::const_iterator& lhs, + const UnicodeText::const_iterator& rhs) { + return lhs.it_ < rhs.it_; +} + +char32 UnicodeText::const_iterator::operator*() const { + // (We could call chartorune here, but that does some + // error-checking, and we're guaranteed that our data is valid + // UTF-8. Also, we expect this routine to be called very often. So + // for speed, we do the calculation ourselves.) 
+ + // Convert from UTF-8 + int byte1 = it_[0]; + if (byte1 < 0x80) + return byte1; + + int byte2 = it_[1]; + if (byte1 < 0xE0) + return ((byte1 & 0x1F) << 6) + | (byte2 & 0x3F); + + int byte3 = it_[2]; + if (byte1 < 0xF0) + return ((byte1 & 0x0F) << 12) + | ((byte2 & 0x3F) << 6) + | (byte3 & 0x3F); + + int byte4 = it_[3]; + return ((byte1 & 0x07) << 18) + | ((byte2 & 0x3F) << 12) + | ((byte3 & 0x3F) << 6) + | (byte4 & 0x3F); +} + +UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { + it_ += UniLib::OneCharLen(it_); + return *this; +} + +UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { + while (UniLib::IsTrailByte(*--it_)); + return *this; +} + +int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { + utf8_output[0] = it_[0]; if (it_[0] < 0x80) return 1; + utf8_output[1] = it_[1]; if (it_[0] < 0xE0) return 2; + utf8_output[2] = it_[2]; if (it_[0] < 0xF0) return 3; + utf8_output[3] = it_[3]; + return 4; +} + +string UnicodeText::const_iterator::get_utf8_string() const { + return string(utf8_data(), utf8_length()); +} + +int UnicodeText::const_iterator::utf8_length() const { + if (it_[0] < 0x80) { + return 1; + } else if (it_[0] < 0xE0) { + return 2; + } else if (it_[0] < 0xF0) { + return 3; + } else { + return 4; + } +} + +UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { + CHECK(p != nullptr); + const char* start = utf8_data(); + int len = utf8_length(); + const char* end = start + len; + CHECK(p >= start); + CHECK(p <= end); + CHECK(p == end || !UniLib::IsTrailByte(*p)); + return const_iterator(p); +} + +string UnicodeText::const_iterator::DebugString() const { + return tensorflow::strings::Printf("{iter %p}", it_); +} + + +// *************************** Utilities ************************* + +string CodepointString(const UnicodeText& t) { + string s; + UnicodeText::const_iterator it = t.begin(), end = t.end(); + while (it != end) tensorflow::strings::Appendf(&s, "%X ", *it++); + 
return s; +} diff --git a/unittest/util/utf8/unicodetext.h b/unittest/util/utf8/unicodetext.h new file mode 100644 index 000000000..4e25d3eef --- /dev/null +++ b/unittest/util/utf8/unicodetext.h @@ -0,0 +1,477 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_ +#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_ + +#include // for NULL, ptrdiff_t +#include // for bidirectional_iterator_tag, etc +#include // for string +#include // for pair + +#include "syntaxnet/base.h" + +// ***************************** UnicodeText ************************** +// +// A UnicodeText object is a container for a sequence of Unicode +// codepoint values. It has default, copy, and assignment constructors. +// Data can be appended to it from another UnicodeText, from +// iterators, or from a single codepoint. +// +// The internal representation of the text is UTF-8. Since UTF-8 is a +// variable-width format, UnicodeText does not provide random access +// to the text, and changes to the text are permitted only at the end. +// +// The UnicodeText class defines a const_iterator. The dereferencing +// operator (*) returns a codepoint (char32). The iterator is a +// bidirectional, read-only iterator. It becomes invalid if the text +// is changed. +// +// There are methods for appending and retrieving UTF-8 data directly. 
+// The 'utf8_data' method returns a const char* that contains the +// UTF-8-encoded version of the text; 'utf8_length' returns the number +// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to +// 4 bytes of UTF-8 data in a char array and returns the number of +// bytes that it stored. +// +// Codepoints are integers in the range [0, 0xD7FF] or [0xE000, +// 0x10FFFF], but UnicodeText has the additional restriction that it +// can contain only those characters that are valid for interchange on +// the Web. This excludes all of the control codes except for carriage +// return, line feed, form feed, and horizontal tab. It also excludes +// non-characters, but codepoints that are in the Private Use regions +// are allowed, as are codepoints that are unassigned. (See the +// Unicode reference for details.) The function UniLib::IsInterchangeValid +// can be used as a test for this property. +// +// UnicodeTexts are safe. Every method that constructs or modifies a +// UnicodeText tests for interchange-validity, and will substitute a +// space for the invalid data. Such cases are reported via +// LOG(WARNING). +// +// MEMORY MANAGEMENT: copy, take ownership, or point to +// +// A UnicodeText is either an "owner", meaning that it owns the memory +// for the data buffer and will free it when the UnicodeText is +// destroyed, or it is an "alias", meaning that it does not. +// +// There are three methods for storing UTF-8 data in a UnicodeText: +// +// CopyUTF8(buffer, len) copies buffer. +// +// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. +// +// PointToUTF8(buffer, size) creates an alias pointing to buffer. +// +// All three methods perform a validity check on the buffer. There are +// private, "unsafe" versions of these functions that bypass the +// validity check. They are used internally and by friend-functions +// that are handling UTF-8 data that has already been validated. 
+// +// The purpose of an alias is to avoid making an unnecessary copy of a +// UTF-8 buffer while still providing access to the Unicode values +// within that text through iterators or the fast scanners that are +// based on UTF-8 state tables. The lifetime of an alias must not +// exceed the lifetime of the buffer from which it was constructed. +// +// The semantics of an alias might be described as "copy on write or +// repair." The source data is never modified. If push_back() or +// append() is called on an alias, a copy of the data will be created, +// and the UnicodeText will become an owner. If clear() is called on +// an alias, it becomes an (empty) owner. +// +// The copy constructor and the assignment operator produce an owner. +// That is, after direct initialization ("UnicodeText x(y);") or copy +// initialization ("UnicodeText x = y;") x will be an owner, even if y +// was an alias. The assignment operator ("x = y;") also produces an +// owner unless x and y are the same object and y is an alias. +// +// Aliases should be used with care. If the source from which an alias +// was created is freed, or if the contents are changed, while the +// alias is still in use, fatal errors could result. But it can be +// quite useful to have a UnicodeText "window" through which to see a +// UTF-8 buffer without having to pay the price of making a copy. +// +// UTILITIES +// +// The interfaces in util/utf8/public/textutils.h provide higher-level +// utilities for dealing with UnicodeTexts, including routines for +// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or +// strings, creating strings from UnicodeTexts, normalizing text for +// efficient matching or display, and others. + +class UnicodeText { + public: + class const_iterator; + + typedef char32 value_type; + + // Constructors. These always produce owners. + UnicodeText(); // Create an empty text. 
+ UnicodeText(const UnicodeText& src); // copy constructor + // Construct a substring (copies the data). + UnicodeText(const const_iterator& first, const const_iterator& last); + + // Assignment operator. This copies the data and produces an owner + // unless this == &src, e.g., "x = x;", which is a no-op. + UnicodeText& operator=(const UnicodeText& src); + + // x.Copy(y) copies the data from y into x. + UnicodeText& Copy(const UnicodeText& src); + inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } + + // x.PointTo(y) changes x so that it points to y's data. + // It does not copy y or take ownership of y's data. + UnicodeText& PointTo(const UnicodeText& src); + UnicodeText& PointTo(const const_iterator& first, + const const_iterator& last); + + ~UnicodeText(); + + void clear(); // Clear text. + bool empty() const { return repr_.size_ == 0; } // Test if text is empty. + + // Add a codepoint to the end of the text. + // If the codepoint is not interchange-valid, add a space instead + // and log a warning. + void push_back(char32 codepoint); + + // Generic appending operation. + // iterator_traits::value_type must be implicitly + // convertible to char32. Typical uses of this method might include: + // char32 chars[] = {0x1, 0x2, ...}; + // vector more_chars = ...; + // utext.append(chars, chars+arraysize(chars)); + // utext.append(more_chars.begin(), more_chars.end()); + template + UnicodeText& append(ForwardIterator first, const ForwardIterator last) { + while (first != last) { push_back(*first++); } + return *this; + } + + // A specialization of the generic append() method. + UnicodeText& append(const const_iterator& first, const const_iterator& last); + + // An optimization of append(source.begin(), source.end()). 
+ UnicodeText& append(const UnicodeText& source); + + int size() const; // the number of Unicode characters (codepoints) + + friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); + + class const_iterator { + typedef const_iterator CI; + public: + typedef std::bidirectional_iterator_tag iterator_category; + typedef char32 value_type; + typedef ptrdiff_t difference_type; + typedef void pointer; // (Not needed.) + typedef const char32 reference; // (Needed for const_reverse_iterator) + + // Iterators are default-constructible. + const_iterator(); + + // It's safe to make multiple passes over a UnicodeText. + const_iterator(const const_iterator& other); + const_iterator& operator=(const const_iterator& other); + + char32 operator*() const; // Dereference + + const_iterator& operator++(); // Advance (++iter) + const_iterator operator++(int) { // (iter++) + const_iterator result(*this); + ++*this; + return result; + } + + const_iterator& operator--(); // Retreat (--iter) + const_iterator operator--(int) { // (iter--) + const_iterator result(*this); + --*this; + return result; + } + + // We love relational operators. + friend bool operator==(const CI& lhs, const CI& rhs) { + return lhs.it_ == rhs.it_; } + friend bool operator!=(const CI& lhs, const CI& rhs) { + return !(lhs == rhs); } + friend bool operator<(const CI& lhs, const CI& rhs); + friend bool operator>(const CI& lhs, const CI& rhs) { + return rhs < lhs; } + friend bool operator<=(const CI& lhs, const CI& rhs) { + return !(rhs < lhs); } + friend bool operator>=(const CI& lhs, const CI& rhs) { + return !(lhs < rhs); } + + friend difference_type distance(const CI& first, const CI& last); + + // UTF-8-specific methods + // Store the UTF-8 encoding of the current codepoint into buf, + // which must be at least 4 bytes long. Return the number of + // bytes written. 
+ int get_utf8(char* buf) const; + // Return the UTF-8 character that the iterator points to. + string get_utf8_string() const; + // Return the byte length of the UTF-8 character the iterator points to. + int utf8_length() const; + // Return the iterator's pointer into the UTF-8 data. + const char* utf8_data() const { return it_; } + + string DebugString() const; + + private: + friend class UnicodeText; + friend class UnicodeTextUtils; + friend class UTF8StateTableProperty; + explicit const_iterator(const char* it) : it_(it) {} + + const char* it_; + }; + + const_iterator begin() const; + const_iterator end() const; + + class const_reverse_iterator : public std::reverse_iterator { + public: + explicit const_reverse_iterator(const_iterator it) : + std::reverse_iterator(it) {} + const char* utf8_data() const { + const_iterator tmp_it = base(); + return (--tmp_it).utf8_data(); + } + int get_utf8(char* buf) const { + const_iterator tmp_it = base(); + return (--tmp_it).get_utf8(buf); + } + string get_utf8_string() const { + const_iterator tmp_it = base(); + return (--tmp_it).get_utf8_string(); + } + int utf8_length() const { + const_iterator tmp_it = base(); + return (--tmp_it).utf8_length(); + } + }; + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + // Substring searching. Returns the beginning of the first + // occurrence of "look", or end() if not found. + const_iterator find(const UnicodeText& look, const_iterator start_pos) const; + // Equivalent to find(look, begin()) + const_iterator find(const UnicodeText& look) const; + + // Returns whether this contains the character U+FFFD. This can + // occur, for example, if the input to Encodings::Decode() had byte + // sequences that were invalid in the source encoding. 
+ bool HasReplacementChar() const; + + // UTF-8-specific methods + // + // Return the data, length, and capacity of UTF-8-encoded version of + // the text. Length and capacity are measured in bytes. + const char* utf8_data() const { return repr_.data_; } + int utf8_length() const { return repr_.size_; } + int utf8_capacity() const { return repr_.capacity_; } + + // Return the UTF-8 data as a string. + static string UTF8Substring(const const_iterator& first, + const const_iterator& last); + + // There are three methods for initializing a UnicodeText from UTF-8 + // data. They vary in details of memory management. In all cases, + // the data is tested for interchange-validity. If it is not + // interchange-valid, a LOG(WARNING) is issued, and each + // structurally invalid byte and each interchange-invalid codepoint + // is replaced with a space. + + // x.CopyUTF8(buf, len) copies buf into x. + UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); + + // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of + // buf. buf is not copied. + UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, + int byte_length, + int byte_capacity); + + // x.PointToUTF8(buf,len) changes x so that it points to buf + // ("becomes an alias"). It does not take ownership or copy buf. + // If the buffer is not valid, this has the same effect as + // CopyUTF8(utf8_buffer, byte_length). + UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); + + // Occasionally it is necessary to use functions that operate on the + // pointer returned by utf8_data(). MakeIterator(p) provides a way + // to get back to the UnicodeText level. It uses CHECK to ensure + // that p is a pointer within this object's UTF-8 data, and that it + // points to the beginning of a character. + const_iterator MakeIterator(const char* p) const; + + string DebugString() const; + + private: + friend class const_iterator; + friend class UnicodeTextUtils; + + class Repr { // A byte-string. 
+ public: + char* data_; + int size_; + int capacity_; + bool ours_; // Do we own data_? + + Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {} + ~Repr() { if (ours_) delete[] data_; } + + void clear(); + void reserve(int capacity); + void resize(int size); + + void append(const char* bytes, int byte_length); + void Copy(const char* data, int size); + void TakeOwnershipOf(char* data, int size, int capacity); + void PointTo(const char* data, int size); + + string DebugString() const; + + private: + Repr& operator=(const Repr&); + Repr(const Repr& other); + }; + + Repr repr_; + + // UTF-8-specific private methods. + // These routines do not perform a validity check when compiled + // in opt mode. + // It is an error to call these methods with UTF-8 data that + // is not interchange-valid. + // + UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeTakeOwnershipOfUTF8( + char* utf8_buffer, int byte_length, int byte_capacity); + UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); + const_iterator UnsafeFind(const UnicodeText& look, + const_iterator start_pos) const; +}; + +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + +inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { + return !(lhs == rhs); +} + +// UnicodeTextRange is a pair of iterators, useful for specifying text +// segments. If the iterators are ==, the segment is empty. +typedef pair UnicodeTextRange; + +inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { + return r.first == r.second; +} + + +// *************************** Utilities ************************* + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It +// is an "owner.") +// +// Each byte that is structurally invalid will be replaced with a +// space. 
Each codepoint that is interchange-invalid will also be +// replaced with a space, even if the codepoint was represented with a +// multibyte sequence in the UTF-8 data. +// +inline UnicodeText MakeUnicodeTextAcceptingOwnership( + char* utf8_buffer, int byte_length, int byte_capacity) { + return UnicodeText().TakeOwnershipOfUTF8( + utf8_buffer, byte_length, byte_capacity); +} + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText does not take ownership of the +// buffer. (It is an "alias.") +// +inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( + const char* utf8_buffer, int byte_length) { + return UnicodeText().PointToUTF8(utf8_buffer, byte_length); +} + +// Create a UnicodeText from a UTF-8 string or buffer. +// +// If do_copy is true, then a copy of the string is made. The copy is +// owned by the resulting UnicodeText object and will be freed when +// the object is destroyed. This UnicodeText object is referred to +// as an "owner." +// +// If do_copy is false, then no copy is made. The resulting +// UnicodeText object does NOT take ownership of the string; in this +// case, the lifetime of the UnicodeText object must not exceed the +// lifetime of the string. This UnicodeText object is referred to as +// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. +// +// If the input string does not contain valid UTF-8, then a copy is +// made (as if do_copy were true) and coerced to valid UTF-8 by +// replacing each invalid byte with a space. 
+// +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, + bool do_copy) { + UnicodeText t; + if (do_copy) { + t.CopyUTF8(utf8_buf, len); + } else { + t.PointToUTF8(utf8_buf, len); + } + return t; +} + +inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { + return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); +} + +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { + return UTF8ToUnicodeText(utf8_buf, len, true); +} +inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { + return UTF8ToUnicodeText(utf8_string, true); +} + +// Return a string containing the UTF-8 encoded version of all the +// Unicode characters in t. +inline string UnicodeTextToUTF8(const UnicodeText& t) { + return string(t.utf8_data(), t.utf8_length()); +} + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// For debugging. Return a string of integers, written in uppercase +// hex (%X), corresponding to the codepoints within the text. Each +// integer is followed by a space. E.g., "61 62 6A 3005 ". +string CodepointString(const UnicodeText& t); + +#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_ diff --git a/unittest/util/utf8/unilib.cc b/unittest/util/utf8/unilib.cc new file mode 100644 index 000000000..c00759ae3 --- /dev/null +++ b/unittest/util/utf8/unilib.cc @@ -0,0 +1,58 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Author: sligocki@google.com (Shawn Ligocki) + +#include "util/utf8/unilib.h" + +#include "syntaxnet/base.h" +#include "third_party/utf/utf.h" + +namespace UniLib { + +// Codepoints not allowed for interchange are: +// C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020), +// Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A), +// Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D) +// C1 controls: U+007F to U+009F +// Surrogates: U+D800 to U+DFFF +// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx +bool IsInterchangeValid(char32 c) { + return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) || + (c >= 0x7F && c <= 0x9F) || + (c >= 0xD800 && c <= 0xDFFF) || + (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE); +} + +int SpanInterchangeValid(const char* begin, int byte_length) { + char32 rune; + const char* p = begin; + const char* end = begin + byte_length; + while (p < end) { + int bytes_consumed = charntorune(&rune, p, end - p); + // We want to accept Runeerror == U+FFFD as a valid char, but it is used + // by chartorune to indicate error. Luckily, the real codepoint is size 3 + // while errors return bytes_consumed <= 1. 
+ if ((rune == Runeerror && bytes_consumed <= 1) || + !IsInterchangeValid(rune)) { + break; // Found + } + p += bytes_consumed; + } + return p - begin; +} + +} // namespace UniLib diff --git a/unittest/util/utf8/unilib.h b/unittest/util/utf8/unilib.h new file mode 100644 index 000000000..e99895a2a --- /dev/null +++ b/unittest/util/utf8/unilib.h @@ -0,0 +1,63 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Routines to do manipulation of Unicode characters or text +// +// The StructurallyValid routines accept buffers of arbitrary bytes. +// For CoerceToStructurallyValid(), the input buffer and output buffers may +// point to exactly the same memory. +// +// In all other cases, the UTF-8 string must be structurally valid and +// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. +// Debug builds take a fatal error for invalid UTF-8 input. +// The input and output buffers may not overlap at all. +// +// The char32 routines are here only for convenience; they convert to UTF-8 +// internally and use the UTF-8 routines. + +#ifndef UTIL_UTF8_UNILIB_H__ +#define UTIL_UTF8_UNILIB_H__ + +#include +#include "syntaxnet/base.h" + +// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here, +// but they are defined in unilib_utf8_utils.h. 
+//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export + +namespace UniLib { + +// Returns the length in bytes of the prefix of src that is all +// interchange valid UTF-8 +int SpanInterchangeValid(const char* src, int byte_length); +inline int SpanInterchangeValid(const std::string& src) { + return SpanInterchangeValid(src.data(), src.size()); +} + +// Returns true if the source is all interchange valid UTF-8 +// "Interchange valid" is stronger than structurally valid -- +// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. +bool IsInterchangeValid(char32 codepoint); +inline bool IsInterchangeValid(const char* src, int byte_length) { + return (byte_length == SpanInterchangeValid(src, byte_length)); +} +inline bool IsInterchangeValid(const std::string& src) { + return IsInterchangeValid(src.data(), src.size()); +} + +} // namespace UniLib + +#endif // UTIL_UTF8_UNILIB_H__ diff --git a/unittest/util/utf8/unilib_utf8_utils.h b/unittest/util/utf8/unilib_utf8_utils.h new file mode 100644 index 000000000..a9c101661 --- /dev/null +++ b/unittest/util/utf8/unilib_utf8_utils.h @@ -0,0 +1,66 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ +#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ + +// These definitions are self-contained and have no dependencies. +// They are also exported from unilib.h for legacy reasons. 
+ +#include "syntaxnet/base.h" +#include "third_party/utf/utf.h" + +namespace UniLib { + +// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF] +// (i.e., is not a surrogate codepoint). See also +// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h. +inline bool IsValidCodepoint(char32 c) { + return (static_cast(c) < 0xD800) + || (c >= 0xE000 && c <= 0x10FFFF); +} + +// Returns true if 'str' is the start of a structurally valid UTF-8 +// sequence and is not a surrogate codepoint. Returns false if str.empty() +// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function +// will access n bytes of str (1-4), where n is UniLib::OneCharLen(str[0]). +inline bool IsUTF8ValidCodepoint(StringPiece str) { + char32 c; + int consumed; + // It's OK if str.length() > consumed. + return !str.empty() + && isvalidcharntorune(str.data(), str.size(), &c, &consumed) + && IsValidCodepoint(c); +} + +// Returns the length (number of bytes) of the Unicode code point +// starting at src, based on inspecting just that one byte. This +// requires that src point to a well-formed UTF-8 string; the result +// is undefined otherwise. +inline int OneCharLen(const char* src) { + return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; +} + +// Returns true if this byte is a trailing UTF-8 byte (10xx xxxx) +inline bool IsTrailByte(char x) { + // return (x & 0xC0) == 0x80; + // Since trail bytes are always in [0x80, 0xBF], we can optimize: + return static_cast(x) < -0x40; +} + +} // namespace UniLib + +#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_ From aa54bf0f8bb923d17c0f4ecf631d378f1f7b196a Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 24 Jun 2019 12:37:13 +0200 Subject: [PATCH 2/5] Fix code from tensorflow/models/research/syntaxnet/util/utf8 See https://github.com/tensorflow/models/issues/7090. 
Signed-off-by: Stefan Weil --- unittest/util/utf8/unicodetext.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/unittest/util/utf8/unicodetext.cc b/unittest/util/utf8/unicodetext.cc index 99cb02d5a..1a884dd18 100644 --- a/unittest/util/utf8/unicodetext.cc +++ b/unittest/util/utf8/unicodetext.cc @@ -425,22 +425,22 @@ char32 UnicodeText::const_iterator::operator*() const { // for speed, we do the calculation ourselves.) // Convert from UTF-8 - int byte1 = it_[0]; + unsigned char byte1 = it_[0]; if (byte1 < 0x80) return byte1; - int byte2 = it_[1]; + unsigned char byte2 = it_[1]; if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F); - int byte3 = it_[2]; + unsigned char byte3 = it_[2]; if (byte1 < 0xF0) return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F); - int byte4 = it_[3]; + unsigned char byte4 = it_[3]; return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) @@ -458,9 +458,9 @@ UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { } int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { - utf8_output[0] = it_[0]; if (it_[0] < 0x80) return 1; - utf8_output[1] = it_[1]; if (it_[0] < 0xE0) return 2; - utf8_output[2] = it_[2]; if (it_[0] < 0xF0) return 3; + utf8_output[0] = it_[0]; if ((it_[0] & 0xff) < 0x80) return 1; + utf8_output[1] = it_[1]; if ((it_[0] & 0xff) < 0xE0) return 2; + utf8_output[2] = it_[2]; if ((it_[0] & 0xff) < 0xF0) return 3; utf8_output[3] = it_[3]; return 4; } @@ -470,11 +470,11 @@ string UnicodeText::const_iterator::get_utf8_string() const { } int UnicodeText::const_iterator::utf8_length() const { - if (it_[0] < 0x80) { + if ((it_[0] & 0xff) < 0x80) { return 1; - } else if (it_[0] < 0xE0) { + } else if ((it_[0] & 0xff) < 0xE0) { return 2; - } else if (it_[0] < 0xF0) { + } else if ((it_[0] & 0xff) < 0xF0) { return 3; } else { return 4; From 04d85b4c0f2c8911546348dbe1a9baede52132b4 Mon Sep 17 00:00:00 2001 From: Stefan 
Weil Date: Mon, 24 Jun 2019 12:51:05 +0200 Subject: [PATCH 3/5] Add more test code for normstrngs_test unilib.h is now available, so more code can be enabled. Signed-off-by: Stefan Weil --- unittest/Makefile.am | 1 + unittest/normstrngs_test.cc | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/unittest/Makefile.am b/unittest/Makefile.am index c4dda1afc..f33352852 100644 --- a/unittest/Makefile.am +++ b/unittest/Makefile.am @@ -273,6 +273,7 @@ matrix_test_SOURCES = matrix_test.cc matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) normstrngs_test_SOURCES = normstrngs_test.cc +normstrngs_test_SOURCES += third_party/utf/rune.c util/utf8/unilib.cc normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS) nthitem_test_SOURCES = nthitem_test.cc diff --git a/unittest/normstrngs_test.cc b/unittest/normstrngs_test.cc index 390cb8658..716981569 100644 --- a/unittest/normstrngs_test.cc +++ b/unittest/normstrngs_test.cc @@ -15,16 +15,14 @@ #include "normstrngs_test.h" #include "strngs.h" #include "unichar.h" -#if defined(HAS_UNILIB_H) -#include "unilib.h" -#endif +#include "util/utf8/unilib.h" #include "include_gunit.h" namespace tesseract { namespace { -#if defined(HAS_UNILIB_H) +#if defined(MISSING_CODE) static std::string EncodeAsUTF8(const char32 ch32) { UNICHAR uni_ch(ch32); return std::string(uni_ch.utf8(), uni_ch.utf8_len()); @@ -363,7 +361,6 @@ TEST(NormstrngsTest, SpanUTF8NotWhitespace) { EXPECT_EQ(12, SpanUTF8NotWhitespace(kMixedText)); } -#if defined(HAS_UNILIB_H) // Test that the method clones the util/utf8/public/unilib definition of // interchange validity. TEST(NormstrngsTest, IsInterchangeValid) { @@ -374,12 +371,11 @@ TEST(NormstrngsTest, IsInterchangeValid) { EXPECT_EQ(UniLib::IsInterchangeValid(ch), IsInterchangeValid(ch)); } } -#endif -#if defined(HAS_UNILIB_H) // Test that the method clones the util/utf8/public/unilib definition of // 7-bit ASCII interchange validity. 
TEST(NormstrngsTest, IsInterchangeValid7BitAscii) { +#if defined(MISSING_CODE) const int32_t kMinUnicodeValue = 33; const int32_t kMaxUnicodeValue = 0x10FFFF; for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { @@ -388,8 +384,11 @@ TEST(NormstrngsTest, IsInterchangeValid7BitAscii) { EXPECT_EQ(UniLib::IsInterchangeValid7BitAscii(str), IsInterchangeValid7BitAscii(ch)); } -} +#else + // Skipped because of missing UniLib::IsInterchangeValid7BitAscii. + GTEST_SKIP(); #endif +} // Test that the method clones the util/utf8/public/unilib definition of // fullwidth-halfwidth . @@ -401,7 +400,8 @@ TEST(NormstrngsTest, FullwidthToHalfwidth) { // U+FFE6 -> U+20A9 (won sign) EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6)); -#if defined(HAS_UNILIB_H) +#if defined(MISSING_CODE) + // Skipped because of missing UniLib::FullwidthToHalfwidth. const int32_t kMinUnicodeValue = 33; const int32_t kMaxUnicodeValue = 0x10FFFF; for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) { From 40c1cf671f9fe52a818f1bbcdadcde7845b3da49 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Thu, 30 Aug 2018 11:28:04 +0200 Subject: [PATCH 4/5] unittest: Fix and enable pango_font_info_test Signed-off-by: Stefan Weil --- unittest/Makefile.am | 17 +++-- unittest/pango_font_info_test.cc | 120 +++++++++++++++++++------------ 2 files changed, 84 insertions(+), 53 deletions(-) diff --git a/unittest/Makefile.am b/unittest/Makefile.am index f33352852..04d0d34e7 100644 --- a/unittest/Makefile.am +++ b/unittest/Makefile.am @@ -132,7 +132,6 @@ check_PROGRAMS += matrix_test check_PROGRAMS += nthitem_test check_PROGRAMS += osd_test # check_PROGRAMS += pagesegmode_test -# check_PROGRAMS += pango_font_info_test check_PROGRAMS += paragraphs_test check_PROGRAMS += params_model_test check_PROGRAMS += progress_test @@ -159,6 +158,7 @@ check_PROGRAMS += lstm_squashed_test check_PROGRAMS += lstm_test check_PROGRAMS += lstmtrainer_test check_PROGRAMS += normstrngs_test +check_PROGRAMS += 
pango_font_info_test check_PROGRAMS += unichar_test check_PROGRAMS += unicharcompress_test check_PROGRAMS += unicharset_test @@ -279,8 +279,16 @@ normstrngs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(ICU_I18N nthitem_test_SOURCES = nthitem_test.cc nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) -#pango_font_info_test_SOURCES = pango_font_info_test.cc -#pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) +osd_test_SOURCES = osd_test.cc +osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS) + +pango_font_info_test_SOURCES = pango_font_info_test.cc +pango_font_info_test_SOURCES += third_party/utf/rune.c +pango_font_info_test_SOURCES += util/utf8/unicodetext.cc util/utf8/unilib.cc +pango_font_info_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(LEPTONICA_LIBS) +pango_font_info_test_LDADD += $(ICU_I18N_LIBS) -lfontconfig +pango_font_info_test_LDADD += -lpangocairo-1.0 -lpangoft2-1.0 +pango_font_info_test_LDADD += $(cairo_LIBS) $(pango_LIBS) paragraphs_test_SOURCES = paragraphs_test.cc paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS) @@ -288,9 +296,6 @@ paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS) params_model_test_SOURCES = params_model_test.cc params_model_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) -osd_test_SOURCES = osd_test.cc -osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS) - progress_test_SOURCES = progress_test.cc progress_test_LDFLAGS = $(OPENCL_LDFLAGS) $(LEPTONICA_LIBS) progress_test_LDADD = $(GTEST_LIBS) $(GMOCK_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS) diff --git a/unittest/pango_font_info_test.cc b/unittest/pango_font_info_test.cc index fde4f34f6..de6290340 100644 --- a/unittest/pango_font_info_test.cc +++ b/unittest/pango_font_info_test.cc @@ -1,12 +1,24 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. -#include "tesseract/training/pango_font_info.h" - -#include -#include - -#include "pango/pango.h" -#include "tesseract/training/commandlineflags.h" -#include "tesseract/training/fileio.h" +#include +#include +#include +#include "include_gunit.h" +#include "commandlineflags.h" +#include "fileio.h" +#include "pango_font_info.h" +#include "absl/strings/str_cat.h" // for absl::StrCat +#include "gmock/gmock-matchers.h" // for EXPECT_THAT +#include "util/utf8/unicodetext.h" // for UnicodeText DECLARE_STRING_PARAM_FLAG(fonts_dir); DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir); @@ -19,19 +31,19 @@ using tesseract::FontUtils; using tesseract::PangoFontInfo; // Fonts in testdata directory -const char* kExpectedFontNames[] = {"Arab", - "Arial Bold Italic", - "DejaVu Sans Ultra-Light", - "Lohit Hindi", +const char* kExpectedFontNames[] = { + "Arab", + "Arial Bold Italic", + "DejaVu Sans Ultra-Light", + "Lohit Hindi", #if PANGO_VERSION <= 12005 - "Times New Roman", + "Times New Roman", #else - "Times New Roman,", // Pango v1.36.2 - // requires a trailing - // ',' + "Times New Roman,", // Pango v1.36.2 requires a trailing ',' #endif - "UnBatang", - "Verdana"}; + "UnBatang", + "Verdana" +}; // Sample text used in tests. const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع"; @@ -41,23 +53,27 @@ const char kKorText[] = "이는 것으로"; // Hindi words containing illegal vowel sequences. 
const char* kBadlyFormedHinWords[] = { #if PANGO_VERSION <= 12005 - "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस", + "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस", #endif - // Pango v1.36.2 will render the above words even though they are invalid. - "प्रंात", nullptr}; + // Pango v1.36.2 will render the above words even though they are invalid. + "प्रंात", nullptr +}; class PangoFontInfoTest : public ::testing::Test { protected: void SetUp() override { - std::locale::global(std::locale("")); + static std::locale system_locale(""); + std::locale::global(system_locale); } // Creates a fake fonts.conf file that points to the testdata fonts for // fontconfig to initialize with. static void SetUpTestCase() { - FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata"); + FLAGS_fonts_dir = TESTING_DIR; FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; +#ifdef GOOGLE_TESSERACT FLAGS_use_only_legacy_fonts = false; +#endif } PangoFontInfo font_info_; @@ -120,7 +136,7 @@ TEST_F(PangoFontInfoTest, CanRenderLigature) { font_info_.ParseFontDescriptionName("Arab 12"); const char kArabicLigature[] = "لا"; EXPECT_TRUE( - font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature))); + font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature))); printf("Next word\n"); EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText))); @@ -143,17 +159,17 @@ TEST_F(PangoFontInfoTest, CannotRenderInvalidString) { TEST_F(PangoFontInfoTest, CanDropUncoveredChars) { font_info_.ParseFontDescriptionName("Verdana 12"); // Verdana cannot render the "ff" ligature - string word = "office"; + std::string word = "office"; EXPECT_EQ(1, font_info_.DropUncoveredChars(&word)); EXPECT_EQ("oice", word); // Don't drop non-letter characters like word joiners. 
const char* kJoiners[] = { - "\u2060", // U+2060 (WJ) - "\u200C", // U+200C (ZWJ) - "\u200D" // U+200D (ZWNJ) + "\u2060", // U+2060 (WJ) + "\u200C", // U+200C (ZWNJ) + "\u200D" // U+200D (ZWJ) }; - for (int i = 0; i < ARRAYSIZE(kJoiners); ++i) { + for (size_t i = 0; i < ARRAYSIZE(kJoiners); ++i) { word = kJoiners[i]; EXPECT_EQ(0, font_info_.DropUncoveredChars(&word)); EXPECT_STREQ(kJoiners[i], word.c_str()); @@ -167,17 +183,21 @@ class FontUtilsTest : public ::testing::Test { // Creates a fake fonts.conf file that points to the testdata fonts for // fontconfig to initialize with. static void SetUpTestCase() { - FLAGS_fonts_dir = File::JoinPath(FLAGS_test_srcdir, "testdata"); + FLAGS_fonts_dir = TESTING_DIR; FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir; } void CountUnicodeChars(const char* utf8_text, - std::unordered_map* ch_map) { + std::unordered_map* ch_map) { ch_map->clear(); UnicodeText ut; ut.PointToUTF8(utf8_text, strlen(utf8_text)); for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) { +#if 0 if (UnicodeProps::IsWhitespace(*it)) continue; +#else + if (std::isspace(*it)) continue; +#endif ++(*ch_map)[*it]; } } @@ -206,21 +226,21 @@ TEST_F(FontUtilsTest, DoesDetectMissingFonts) { } TEST_F(FontUtilsTest, DoesListAvailableFonts) { - const std::vector& fonts = FontUtils::ListAvailableFonts(); + const std::vector& fonts = FontUtils::ListAvailableFonts(); EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames)); - for (int i = 0; i < fonts.size(); ++i) { + for (auto& font : fonts) { PangoFontInfo font_info; - EXPECT_TRUE(font_info.ParseFontDescriptionName(fonts[i])); + EXPECT_TRUE(font_info.ParseFontDescriptionName(font)); } } TEST_F(FontUtilsTest, DoesFindBestFonts) { - string fonts_list; - std::unordered_map ch_map; + std::string fonts_list; + std::unordered_map ch_map; CountUnicodeChars(kEngText, &ch_map); EXPECT_EQ(26, ch_map.size()); // 26 letters std::vector > > font_flags; - string best_list = FontUtils::BestFonts(ch_map, 
&font_flags); + std::string best_list = FontUtils::BestFonts(ch_map, &font_flags); EXPECT_TRUE(best_list.size()); // All fonts except Lohit Hindi should render English text. EXPECT_EQ(ARRAYSIZE(kExpectedFontNames) - 1, font_flags.size()); @@ -238,8 +258,8 @@ TEST_F(FontUtilsTest, DoesSelectFont) { const char* kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr}; for (int i = 0; kLangText[i] != nullptr; ++i) { SCOPED_TRACE(kLangNames[i]); - std::vector graphemes; - string selected_font; + std::vector graphemes; + std::string selected_font; EXPECT_TRUE(FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), &selected_font, &graphemes)); EXPECT_TRUE(selected_font.size()); @@ -249,17 +269,17 @@ TEST_F(FontUtilsTest, DoesSelectFont) { TEST_F(FontUtilsTest, DoesFailToSelectFont) { const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع"; - std::vector graphemes; - string selected_font; + std::vector graphemes; + std::string selected_font; EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), &selected_font, &graphemes)); } TEST_F(FontUtilsTest, GetAllRenderableCharacters) { - const int32 kHindiChar = 0x0905; - const int32 kArabicChar = 0x0623; - const int32 kMongolianChar = 0x180E; // Mongolian vowel separator - const int32 kOghamChar = 0x1680; // Ogham space mark + const int32_t kHindiChar = 0x0905; + const int32_t kArabicChar = 0x0623; + const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator + const int32_t kOghamChar = 0x1680; // Ogham space mark std::vector unicode_mask; FontUtils::GetAllRenderableCharacters(&unicode_mask); EXPECT_TRUE(unicode_mask['A']); @@ -267,10 +287,12 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) { EXPECT_TRUE(unicode_mask[kHindiChar]); EXPECT_TRUE(unicode_mask[kArabicChar]); EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian. 
+#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham EXPECT_FALSE(unicode_mask[kOghamChar]); // no font for ogham. +#endif unicode_mask.clear(); - std::vector selected_fonts; + std::vector selected_fonts; selected_fonts.push_back("Lohit Hindi"); FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask); EXPECT_TRUE(unicode_mask['1']); @@ -279,14 +301,18 @@ TEST_F(FontUtilsTest, GetAllRenderableCharacters) { EXPECT_FALSE(unicode_mask[kArabicChar]); // or Arabic, EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian, EXPECT_FALSE(unicode_mask[kOghamChar]); // or Ogham. + unicode_mask.clear(); // Check that none of the included fonts cover the Mongolian or Ogham space // characters. - for (int f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) { + for (size_t f = 0; f < ARRAYSIZE(kExpectedFontNames); ++f) { SCOPED_TRACE(absl::StrCat("Testing ", kExpectedFontNames[f])); FontUtils::GetAllRenderableCharacters(kExpectedFontNames[f], &unicode_mask); +#if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham EXPECT_FALSE(unicode_mask[kOghamChar]); +#endif EXPECT_FALSE(unicode_mask[kMongolianChar]); + unicode_mask.clear(); } } } // namespace From 655ba7af1088070550794f73703fd0a034bcb311 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 28 Jun 2019 08:11:42 +0200 Subject: [PATCH 5/5] unittest: Fix compiler warnings (signed/unsigned) Signed-off-by: Stefan Weil --- unittest/baseapi_test.cc | 4 ++-- unittest/heap_test.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unittest/baseapi_test.cc b/unittest/baseapi_test.cc index f36583096..813c3a6c5 100644 --- a/unittest/baseapi_test.cc +++ b/unittest/baseapi_test.cc @@ -319,7 +319,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) { const char* langs[] = {"eng", "chi_tra", "jpn", "vie"}; std::unique_ptr api; CycleTimer timer; - for (int i = 0; i < ARRAYSIZE(langs); ++i) { + for (size_t i = 0; i < ARRAYSIZE(langs); ++i) { api.reset(new tesseract::TessBaseAPI); 
timer.Restart(); EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], @@ -333,7 +333,7 @@ TEST_F(TesseractTest, InitConfigOnlyTest) { vars_vec.push_back(STRING("tessedit_init_config_only")); vars_values.push_back(STRING("1")); LOG(INFO) << "Switching to config only initialization:"; - for (int i = 0; i < ARRAYSIZE(langs); ++i) { + for (size_t i = 0; i < ARRAYSIZE(langs); ++i) { api.reset(new tesseract::TessBaseAPI); timer.Restart(); EXPECT_EQ(0, api->Init(TessdataPath().c_str(), langs[i], diff --git a/unittest/heap_test.cc b/unittest/heap_test.cc index 4daae7660..2547614ac 100644 --- a/unittest/heap_test.cc +++ b/unittest/heap_test.cc @@ -34,7 +34,7 @@ class HeapTest : public testing::Test { virtual ~HeapTest(); // Pushes the test data onto both the heap and the KDVector. void PushTestData(GenericHeap* heap, KDVector* v) { - for (int i = 0; i < ARRAYSIZE(test_data); ++i) { + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { IntKDPair pair(test_data[i], i); heap->Push(&pair); v->push_back(pair); @@ -137,7 +137,7 @@ TEST_F(HeapTest, RevalueTest) { GenericHeap heap; GenericVector v; // Push the test data onto both the heap and the vector. - for (int i = 0; i < ARRAYSIZE(test_data); ++i) { + for (size_t i = 0; i < ARRAYSIZE(test_data); ++i) { PtrPair h_pair; h_pair.key = test_data[i]; PtrPair v_pair;