unittest: Enable more code for tatweel_test without requiring TensorFlow

Signed-off-by: Stefan Weil <sw@weilnetz.de>
2025-01-19 15:03:45 +08:00 · 2021-08-08 12:10:20 +02:00 · 2021-08-08 12:10:20 +02:00 · 63c12a9ee5
commit 63c12a9ee5
parent c1180a8bc0
6 changed files with 34 additions and 13 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -1156,9 +1156,9 @@ unittest_CPPFLAGS += $(pangocairo_CFLAGS)
 endif # ENABLE_TRAINING
 unittest_CPPFLAGS += -I$(top_srcdir)/src/viewer
 unittest_CPPFLAGS += -I$(top_srcdir)/src/wordrec
+unittest_CPPFLAGS += -I$(top_srcdir)/unittest
 if TENSORFLOW
 unittest_CPPFLAGS += -DINCLUDE_TENSORFLOW
-unittest_CPPFLAGS += -I$(top_srcdir)/unittest
 unittest_CPPFLAGS += -I/usr/include/tensorflow
 endif # TENSORFLOW

@@ -1536,11 +1536,9 @@ tabvector_test_CPPFLAGS = $(unittest_CPPFLAGS)
 tabvector_test_LDADD = $(TESS_LIBS)

 tatweel_test_SOURCES = unittest/tatweel_test.cc
-if TENSORFLOW
 tatweel_test_SOURCES += unittest/third_party/utf/rune.c
 tatweel_test_SOURCES += unittest/util/utf8/unicodetext.cc
 tatweel_test_SOURCES += unittest/util/utf8/unilib.cc
-endif # TENSORFLOW
 tatweel_test_CPPFLAGS = $(unittest_CPPFLAGS)
 tatweel_test_LDADD = $(TRAINING_LIBS)

--- a/unittest/include_gunit.h
+++ b/unittest/include_gunit.h
@@ -18,11 +18,11 @@
 #include "gtest/gtest.h"
 #include "log.h" // for LOG

-const char *FLAGS_test_tmpdir = "./tmp";
+static const char *FLAGS_test_tmpdir = "./tmp";

 namespace tesseract {

-void trim(std::string &s) {
+static inline void trim(std::string &s) {
  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
    return !std::isspace(ch);
  }));
@@ -77,6 +77,7 @@ public:
    if (!(condition))      \
    LOG(FATAL) << "Check failed: " #condition " "
 #  define CHECK_EQ(test, value) CHECK((test) == (value))
+#  define CHECK_GE(test, value) CHECK((test) >= (value))
 #  define CHECK_GT(test, value) CHECK((test) > (value))
 #  define CHECK_LT(test, value) CHECK((test) < (value))
 #  define CHECK_LE(test, value) CHECK((test) <= (value))
--- a/unittest/syntaxnet/base.h
+++ b/unittest/syntaxnet/base.h
@@ -16,12 +16,15 @@ limitations under the License.
 #ifndef SYNTAXNET_BASE_H_
 #define SYNTAXNET_BASE_H_

+#include <map>
 #include <functional>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>

+#ifdef INCLUDE_TENSORFLOW
+
 #include "google/protobuf/util/message_differencer.h"

 #include "tensorflow/core/lib/core/status.h"
@@ -31,11 +34,14 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"

+#endif
+
 using std::map;
 using std::pair;
 using std::unordered_map;
 using std::unordered_set;
 using std::vector;
+#ifdef INCLUDE_TENSORFLOW
 using tensorflow::int16;
 using tensorflow::int32;
 using tensorflow::int64;
@@ -47,10 +53,13 @@ using tensorflow::uint32;
 using tensorflow::uint64;
 using tensorflow::uint8;
 using tensorflow::protobuf::TextFormat;
+#endif
 typedef signed int char32;

 using std::string;
+#ifdef INCLUDE_TENSORFLOW
 using tensorflow::StringPiece;
+#endif

 // namespace syntaxnet

--- a/unittest/tatweel_test.cc
+++ b/unittest/tatweel_test.cc
@@ -19,9 +19,7 @@
 #include "include_gunit.h"
 #include "trie.h"
 #include "unicharset.h"
-#ifdef INCLUDE_TENSORFLOW
-#  include "util/utf8/unicodetext.h" // for UnicodeText
-#endif
+#include "util/utf8/unicodetext.h" // for UnicodeText

 namespace tesseract {

@@ -42,10 +40,9 @@ protected:
  }

  TatweelTest() {
-#ifdef INCLUDE_TENSORFLOW
    std::string filename = TestDataNameToPath("ara.wordlist");
    if (file_exists(filename.c_str())) {
-      std::string wordlist(u8"\u0640");
+      std::string wordlist("\u0640");
      CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
      // Put all the unicodes in the unicharset_.
      UnicodeText text;
@@ -53,14 +50,13 @@ protected:
      int num_tatweel = 0;
      for (auto it = text.begin(); it != text.end(); ++it) {
        std::string utf8 = it.get_utf8_string();
-        if (utf8.find(u8"\u0640") != std::string::npos)
+        if (utf8.find("\u0640") != std::string::npos)
          ++num_tatweel;
        unicharset_.unichar_insert(utf8.c_str());
      }
      LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
      EXPECT_GT(num_tatweel, 0);
    }
-#endif
  }

  std::string TestDataNameToPath(const std::string &name) {
--- a/unittest/util/utf8/unicodetext.cc
+++ b/unittest/util/utf8/unicodetext.cc
@@ -14,6 +14,7 @@
 * limitations under the License.
 */

+#include "include_gunit.h"
 #include "util/utf8/unicodetext.h"

 #include <string.h>  // for memcpy, NULL, memcmp, etc
@@ -172,10 +173,12 @@ void UnicodeText::Repr::append(const char *bytes, int byte_length) {
  size_ += byte_length;
 }

+#ifdef INCLUDE_TENSORFLOW
 string UnicodeText::Repr::DebugString() const {
  return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
                                     capacity_, ours_ ? "Owned" : "Alias");
 }
+#endif

 // *************** UnicodeText ******************

@@ -310,17 +313,24 @@ UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
                                                    const_iterator start_pos) const {
  // Due to the magic of the UTF8 encoding, searching for a sequence of
  // letters is equivalent to substring search.
+#ifdef INCLUDE_TENSORFLOW
  StringPiece searching(utf8_data(), utf8_length());
  StringPiece look_piece(look.utf8_data(), look.utf8_length());
+#endif
  LOG(FATAL) << "Not implemented";
+#ifdef INCLUDE_TENSORFLOW
  // StringPiece::size_type found =
  //    searching.find(look_piece, start_pos.utf8_data() - utf8_data());
  StringPiece::size_type found = StringPiece::npos;
  if (found == StringPiece::npos)
    return end();
  return const_iterator(utf8_data() + found);
+#else
+  return end();
+#endif
 }

+#ifdef INCLUDE_TENSORFLOW
 bool UnicodeText::HasReplacementChar() const {
  // Equivalent to:
  //   UnicodeText replacement_char;
@@ -332,6 +342,7 @@ bool UnicodeText::HasReplacementChar() const {
  // return searching.find(looking_for) != StringPiece::npos;
  return false;
 }
+#endif

 // ----- other methods -----

@@ -371,10 +382,12 @@ bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
  return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
 }

+#ifdef INCLUDE_TENSORFLOW
 string UnicodeText::DebugString() const {
  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
                                     repr_.DebugString().c_str());
 }
+#endif

 // ******************* UnicodeText::const_iterator *********************

@@ -479,6 +492,7 @@ UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
  return const_iterator(p);
 }

+#ifdef INCLUDE_TENSORFLOW
 string UnicodeText::const_iterator::DebugString() const {
  return tensorflow::strings::Printf("{iter %p}", it_);
 }
@@ -492,3 +506,4 @@ string CodepointString(const UnicodeText &t) {
    tensorflow::strings::Appendf(&s, "%X ", *it++);
  return s;
 }
+#endif
--- a/unittest/util/utf8/unilib_utf8_utils.h
+++ b/unittest/util/utf8/unilib_utf8_utils.h
@@ -29,13 +29,14 @@ namespace UniLib {
 // (i.e., is not a surrogate codepoint). See also
 // IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
 inline bool IsValidCodepoint(char32 c) {
-  return (static_cast<uint32>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
+  return (static_cast<uint32_t>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
 }

 // Returns true if 'str' is the start of a structurally valid UTF-8
 // sequence and is not a surrogate codepoint. Returns false if str.empty()
 // or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
 // will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]).
+#ifdef INCLUDE_TENSORFLOW
 inline bool IsUTF8ValidCodepoint(StringPiece str) {
  char32 c;
  int consumed;
@@ -43,6 +44,7 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) {
  return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) &&
         IsValidCodepoint(c);
 }
+#endif

 // Returns the length (number of bytes) of the Unicode code point
 // starting at src, based on inspecting just that one byte. This