unittest: Enable more code for tatweel_test without requiring TensorFlow

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2021-08-08 12:10:20 +02:00
parent c1180a8bc0
commit 63c12a9ee5
6 changed files with 34 additions and 13 deletions

View File

@ -1156,9 +1156,9 @@ unittest_CPPFLAGS += $(pangocairo_CFLAGS)
endif # ENABLE_TRAINING
unittest_CPPFLAGS += -I$(top_srcdir)/src/viewer
unittest_CPPFLAGS += -I$(top_srcdir)/src/wordrec
unittest_CPPFLAGS += -I$(top_srcdir)/unittest
if TENSORFLOW
unittest_CPPFLAGS += -DINCLUDE_TENSORFLOW
unittest_CPPFLAGS += -I$(top_srcdir)/unittest
unittest_CPPFLAGS += -I/usr/include/tensorflow
endif # TENSORFLOW
@ -1536,11 +1536,9 @@ tabvector_test_CPPFLAGS = $(unittest_CPPFLAGS)
tabvector_test_LDADD = $(TESS_LIBS)
tatweel_test_SOURCES = unittest/tatweel_test.cc
if TENSORFLOW
tatweel_test_SOURCES += unittest/third_party/utf/rune.c
tatweel_test_SOURCES += unittest/util/utf8/unicodetext.cc
tatweel_test_SOURCES += unittest/util/utf8/unilib.cc
endif # TENSORFLOW
tatweel_test_CPPFLAGS = $(unittest_CPPFLAGS)
tatweel_test_LDADD = $(TRAINING_LIBS)

View File

@ -18,11 +18,11 @@
#include "gtest/gtest.h"
#include "log.h" // for LOG
const char *FLAGS_test_tmpdir = "./tmp";
static const char *FLAGS_test_tmpdir = "./tmp";
namespace tesseract {
void trim(std::string &s) {
static inline void trim(std::string &s) {
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
return !std::isspace(ch);
}));
@ -77,6 +77,7 @@ public:
if (!(condition)) \
LOG(FATAL) << "Check failed: " #condition " "
// Binary comparison checks. Each macro parenthesizes both operands, applies
// the named comparison (==, >=, >, <, <=) and forwards the result to CHECK.
# define CHECK_EQ(test, value) CHECK((test) == (value))
# define CHECK_GE(test, value) CHECK((test) >= (value))
# define CHECK_GT(test, value) CHECK((test) > (value))
# define CHECK_LT(test, value) CHECK((test) < (value))
# define CHECK_LE(test, value) CHECK((test) <= (value))

View File

@ -16,12 +16,15 @@ limitations under the License.
#ifndef SYNTAXNET_BASE_H_
#define SYNTAXNET_BASE_H_
#include <map>
#include <functional>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#ifdef INCLUDE_TENSORFLOW
#include "google/protobuf/util/message_differencer.h"
#include "tensorflow/core/lib/core/status.h"
@ -31,11 +34,14 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/protobuf.h"
#endif
using std::map;
using std::pair;
using std::unordered_map;
using std::unordered_set;
using std::vector;
#ifdef INCLUDE_TENSORFLOW
using tensorflow::int16;
using tensorflow::int32;
using tensorflow::int64;
@ -47,10 +53,13 @@ using tensorflow::uint32;
using tensorflow::uint64;
using tensorflow::uint8;
using tensorflow::protobuf::TextFormat;
#endif
typedef signed int char32;
using std::string;
#ifdef INCLUDE_TENSORFLOW
using tensorflow::StringPiece;
#endif
// namespace syntaxnet

View File

@ -19,9 +19,7 @@
#include "include_gunit.h"
#include "trie.h"
#include "unicharset.h"
#ifdef INCLUDE_TENSORFLOW
# include "util/utf8/unicodetext.h" // for UnicodeText
#endif
#include "util/utf8/unicodetext.h" // for UnicodeText
namespace tesseract {
@ -42,10 +40,9 @@ protected:
}
TatweelTest() {
#ifdef INCLUDE_TENSORFLOW
std::string filename = TestDataNameToPath("ara.wordlist");
if (file_exists(filename.c_str())) {
std::string wordlist(u8"\u0640");
std::string wordlist("\u0640");
CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
// Put all the unicodes in the unicharset_.
UnicodeText text;
@ -53,14 +50,13 @@ protected:
int num_tatweel = 0;
for (auto it = text.begin(); it != text.end(); ++it) {
std::string utf8 = it.get_utf8_string();
if (utf8.find(u8"\u0640") != std::string::npos)
if (utf8.find("\u0640") != std::string::npos)
++num_tatweel;
unicharset_.unichar_insert(utf8.c_str());
}
LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
EXPECT_GT(num_tatweel, 0);
}
#endif
}
std::string TestDataNameToPath(const std::string &name) {

View File

@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "include_gunit.h"
#include "util/utf8/unicodetext.h"
#include <string.h> // for memcpy, NULL, memcmp, etc
@ -172,10 +173,12 @@ void UnicodeText::Repr::append(const char *bytes, int byte_length) {
size_ += byte_length;
}
#ifdef INCLUDE_TENSORFLOW
// Returns a printf-style description of this Repr for debugging:
// the address of the object, the data_ pointer, size_, capacity_, and the
// label "Owned" when ours_ is true or "Alias" otherwise.
// Formatting is done with tensorflow::strings::Printf.
string UnicodeText::Repr::DebugString() const {
return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
capacity_, ours_ ? "Owned" : "Alias");
}
#endif
// *************** UnicodeText ******************
@ -310,17 +313,24 @@ UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
const_iterator start_pos) const {
// Due to the magic of the UTF8 encoding, searching for a sequence of
// letters is equivalent to substring search.
#ifdef INCLUDE_TENSORFLOW
StringPiece searching(utf8_data(), utf8_length());
StringPiece look_piece(look.utf8_data(), look.utf8_length());
#endif
LOG(FATAL) << "Not implemented";
#ifdef INCLUDE_TENSORFLOW
// StringPiece::size_type found =
// searching.find(look_piece, start_pos.utf8_data() - utf8_data());
StringPiece::size_type found = StringPiece::npos;
if (found == StringPiece::npos)
return end();
return const_iterator(utf8_data() + found);
#else
return end();
#endif
}
#ifdef INCLUDE_TENSORFLOW
bool UnicodeText::HasReplacementChar() const {
// Equivalent to:
// UnicodeText replacement_char;
@ -332,6 +342,7 @@ bool UnicodeText::HasReplacementChar() const {
// return searching.find(looking_for) != StringPiece::npos;
return false;
}
#endif
// ----- other methods -----
@ -371,10 +382,12 @@ bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
}
#ifdef INCLUDE_TENSORFLOW
// Returns a printf-style description of this UnicodeText for debugging:
// the address of the object, the value of size() (printed as "chars="),
// and the text produced by repr_.DebugString().
// Formatting is done with tensorflow::strings::Printf.
string UnicodeText::DebugString() const {
return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
repr_.DebugString().c_str());
}
#endif
// ******************* UnicodeText::const_iterator *********************
@ -479,6 +492,7 @@ UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
return const_iterator(p);
}
#ifdef INCLUDE_TENSORFLOW
// Returns the string "{iter <ptr>}" where <ptr> is it_ formatted with %p
// by tensorflow::strings::Printf.
string UnicodeText::const_iterator::DebugString() const {
return tensorflow::strings::Printf("{iter %p}", it_);
}
@ -492,3 +506,4 @@ string CodepointString(const UnicodeText &t) {
tensorflow::strings::Appendf(&s, "%X ", *it++);
return s;
}
#endif

View File

@ -29,13 +29,14 @@ namespace UniLib {
// (i.e., is not a surrogate codepoint). See also
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
inline bool IsValidCodepoint(char32 c) {
return (static_cast<uint32>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
return (static_cast<uint32_t>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
}
// Returns true if 'str' is the start of a structurally valid UTF-8
// sequence and is not a surrogate codepoint. Returns false if str.empty()
// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]).
#ifdef INCLUDE_TENSORFLOW
inline bool IsUTF8ValidCodepoint(StringPiece str) {
char32 c;
int consumed;
@ -43,6 +44,7 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) {
return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) &&
IsValidCodepoint(c);
}
#endif
// Returns the length (number of bytes) of the Unicode code point
// starting at src, based on inspecting just that one byte. This