mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 15:03:45 +08:00
unittest: Enable more code for tatweel_test without requiring Tensorflow
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
c1180a8bc0
commit
63c12a9ee5
@ -1156,9 +1156,9 @@ unittest_CPPFLAGS += $(pangocairo_CFLAGS)
|
||||
endif # ENABLE_TRAINING
|
||||
unittest_CPPFLAGS += -I$(top_srcdir)/src/viewer
|
||||
unittest_CPPFLAGS += -I$(top_srcdir)/src/wordrec
|
||||
unittest_CPPFLAGS += -I$(top_srcdir)/unittest
|
||||
if TENSORFLOW
|
||||
unittest_CPPFLAGS += -DINCLUDE_TENSORFLOW
|
||||
unittest_CPPFLAGS += -I$(top_srcdir)/unittest
|
||||
unittest_CPPFLAGS += -I/usr/include/tensorflow
|
||||
endif # TENSORFLOW
|
||||
|
||||
@ -1536,11 +1536,9 @@ tabvector_test_CPPFLAGS = $(unittest_CPPFLAGS)
|
||||
tabvector_test_LDADD = $(TESS_LIBS)
|
||||
|
||||
tatweel_test_SOURCES = unittest/tatweel_test.cc
|
||||
if TENSORFLOW
|
||||
tatweel_test_SOURCES += unittest/third_party/utf/rune.c
|
||||
tatweel_test_SOURCES += unittest/util/utf8/unicodetext.cc
|
||||
tatweel_test_SOURCES += unittest/util/utf8/unilib.cc
|
||||
endif # TENSORFLOW
|
||||
tatweel_test_CPPFLAGS = $(unittest_CPPFLAGS)
|
||||
tatweel_test_LDADD = $(TRAINING_LIBS)
|
||||
|
||||
|
@ -18,11 +18,11 @@
|
||||
#include "gtest/gtest.h"
|
||||
#include "log.h" // for LOG
|
||||
|
||||
const char *FLAGS_test_tmpdir = "./tmp";
|
||||
static const char *FLAGS_test_tmpdir = "./tmp";
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
void trim(std::string &s) {
|
||||
static inline void trim(std::string &s) {
|
||||
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
|
||||
return !std::isspace(ch);
|
||||
}));
|
||||
@ -77,6 +77,7 @@ public:
|
||||
if (!(condition)) \
|
||||
LOG(FATAL) << "Check failed: " #condition " "
|
||||
// Relational check helpers: each one forwards the comparison of `test`
// against `value` to CHECK, with both operands parenthesized so that
// expression arguments keep their intended precedence.
# define CHECK_EQ(test, value) CHECK((test) == (value))
|
||||
# define CHECK_GE(test, value) CHECK((test) >= (value))
|
||||
# define CHECK_GT(test, value) CHECK((test) > (value))
|
||||
# define CHECK_LT(test, value) CHECK((test) < (value))
|
||||
# define CHECK_LE(test, value) CHECK((test) <= (value))
|
||||
|
@ -16,12 +16,15 @@ limitations under the License.
|
||||
#ifndef SYNTAXNET_BASE_H_
|
||||
#define SYNTAXNET_BASE_H_
|
||||
|
||||
#include <map>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
|
||||
#include "google/protobuf/util/message_differencer.h"
|
||||
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
@ -31,11 +34,14 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/protobuf.h"
|
||||
|
||||
#endif
|
||||
|
||||
using std::map;
|
||||
using std::pair;
|
||||
using std::unordered_map;
|
||||
using std::unordered_set;
|
||||
using std::vector;
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
using tensorflow::int16;
|
||||
using tensorflow::int32;
|
||||
using tensorflow::int64;
|
||||
@ -47,10 +53,13 @@ using tensorflow::uint32;
|
||||
using tensorflow::uint64;
|
||||
using tensorflow::uint8;
|
||||
using tensorflow::protobuf::TextFormat;
|
||||
#endif
|
||||
typedef signed int char32;
|
||||
|
||||
using std::string;
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
using tensorflow::StringPiece;
|
||||
#endif
|
||||
|
||||
// namespace syntaxnet
|
||||
|
||||
|
@ -19,9 +19,7 @@
|
||||
#include "include_gunit.h"
|
||||
#include "trie.h"
|
||||
#include "unicharset.h"
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
# include "util/utf8/unicodetext.h" // for UnicodeText
|
||||
#endif
|
||||
#include "util/utf8/unicodetext.h" // for UnicodeText
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -42,10 +40,9 @@ protected:
|
||||
}
|
||||
|
||||
TatweelTest() {
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
std::string filename = TestDataNameToPath("ara.wordlist");
|
||||
if (file_exists(filename.c_str())) {
|
||||
std::string wordlist(u8"\u0640");
|
||||
std::string wordlist("\u0640");
|
||||
CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
|
||||
// Put all the unicodes in the unicharset_.
|
||||
UnicodeText text;
|
||||
@ -53,14 +50,13 @@ protected:
|
||||
int num_tatweel = 0;
|
||||
for (auto it = text.begin(); it != text.end(); ++it) {
|
||||
std::string utf8 = it.get_utf8_string();
|
||||
if (utf8.find(u8"\u0640") != std::string::npos)
|
||||
if (utf8.find("\u0640") != std::string::npos)
|
||||
++num_tatweel;
|
||||
unicharset_.unichar_insert(utf8.c_str());
|
||||
}
|
||||
LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
|
||||
EXPECT_GT(num_tatweel, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
std::string TestDataNameToPath(const std::string &name) {
|
||||
|
@ -14,6 +14,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "include_gunit.h"
|
||||
#include "util/utf8/unicodetext.h"
|
||||
|
||||
#include <string.h> // for memcpy, NULL, memcmp, etc
|
||||
@ -172,10 +173,12 @@ void UnicodeText::Repr::append(const char *bytes, int byte_length) {
|
||||
size_ += byte_length;
|
||||
}
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
// Builds a one-line debugging dump of this Repr: its own address, the data
// pointer, size, capacity, and whether the ours_ flag marks the buffer as
// "Owned" or "Alias".
string UnicodeText::Repr::DebugString() const {
  const char *ownership = ours_ ? "Owned" : "Alias";
  return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
                                     capacity_, ownership);
}
|
||||
#endif
|
||||
|
||||
// *************** UnicodeText ******************
|
||||
|
||||
@ -310,17 +313,24 @@ UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
|
||||
const_iterator start_pos) const {
|
||||
// Due to the magic of the UTF8 encoding, searching for a sequence of
|
||||
// letters is equivalent to substring search.
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
StringPiece searching(utf8_data(), utf8_length());
|
||||
StringPiece look_piece(look.utf8_data(), look.utf8_length());
|
||||
#endif
|
||||
LOG(FATAL) << "Not implemented";
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
// StringPiece::size_type found =
|
||||
// searching.find(look_piece, start_pos.utf8_data() - utf8_data());
|
||||
StringPiece::size_type found = StringPiece::npos;
|
||||
if (found == StringPiece::npos)
|
||||
return end();
|
||||
return const_iterator(utf8_data() + found);
|
||||
#else
|
||||
return end();
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
bool UnicodeText::HasReplacementChar() const {
|
||||
// Equivalent to:
|
||||
// UnicodeText replacement_char;
|
||||
@ -332,6 +342,7 @@ bool UnicodeText::HasReplacementChar() const {
|
||||
// return searching.find(looking_for) != StringPiece::npos;
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
// ----- other methods -----
|
||||
|
||||
@ -371,10 +382,12 @@ bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
|
||||
return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
|
||||
}
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
// Builds a one-line debugging dump of this UnicodeText: its address, the
// value of size(), and the debug string of the underlying repr_.
string UnicodeText::DebugString() const {
  const string repr_description = repr_.DebugString();
  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
                                     repr_description.c_str());
}
|
||||
#endif
|
||||
|
||||
// ******************* UnicodeText::const_iterator *********************
|
||||
|
||||
@ -479,6 +492,7 @@ UnicodeText::const_iterator UnicodeText::MakeIterator(const char *p) const {
|
||||
return const_iterator(p);
|
||||
}
|
||||
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
// Formats the iterator's current position pointer (it_) for debugging.
string UnicodeText::const_iterator::DebugString() const {
  string description = tensorflow::strings::Printf("{iter %p}", it_);
  return description;
}
|
||||
@ -492,3 +506,4 @@ string CodepointString(const UnicodeText &t) {
|
||||
tensorflow::strings::Appendf(&s, "%X ", *it++);
|
||||
return s;
|
||||
}
|
||||
#endif
|
||||
|
@ -29,13 +29,14 @@ namespace UniLib {
|
||||
// (i.e., is not a surrogate codepoint). See also
|
||||
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
|
||||
inline bool IsValidCodepoint(char32 c) {
  // Accept [0, 0xD7FF] and [0xE000, 0x10FFFF]; the gap between them is the
  // surrogate range. Casting to unsigned first turns a negative `c` into a
  // large value, so it fails the first test (and, being negative, the second).
  // uint32_t is used rather than the `uint32` alias because the latter is only
  // declared when building with INCLUDE_TENSORFLOW.
  return (static_cast<uint32_t>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
}
|
||||
|
||||
// Returns true if 'str' is the start of a structurally valid UTF-8
|
||||
// sequence and is not a surrogate codepoint. Returns false if str.empty()
|
||||
// or if str.length() < UniLib::OneCharLen(str[0]). Otherwise, this function
|
||||
// will access 1-4 bytes of src, where n is UniLib::OneCharLen(src[0]).
|
||||
#ifdef INCLUDE_TENSORFLOW
|
||||
inline bool IsUTF8ValidCodepoint(StringPiece str) {
|
||||
char32 c;
|
||||
int consumed;
|
||||
@ -43,6 +44,7 @@ inline bool IsUTF8ValidCodepoint(StringPiece str) {
|
||||
return !str.empty() && isvalidcharntorune(str.data(), str.size(), &c, &consumed) &&
|
||||
IsValidCodepoint(c);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Returns the length (number of bytes) of the Unicode code point
|
||||
// starting at src, based on inspecting just that one byte. This
|
||||
|
Loading…
Reference in New Issue
Block a user