Merge pull request #2169 from Shreeshrii/master

Fix for unicharcompress_test
This commit is contained in:
Stefan Weil 2019-01-19 09:05:39 +01:00 committed by GitHub
commit 9d6978b258
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 20 deletions

View File

@ -127,6 +127,7 @@ if ENABLE_TRAINING
check_PROGRAMS += commandlineflags_test
check_PROGRAMS += unichar_test
check_PROGRAMS += unicharset_test
check_PROGRAMS += unicharcompress_test
check_PROGRAMS += validator_test
endif
@ -242,6 +243,9 @@ tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
unichar_test_SOURCES = unichar_test.cc
unichar_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
unicharcompress_test_SOURCES = unicharcompress_test.cc
unicharcompress_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
unicharset_test_SOURCES = unicharset_test.cc
unicharset_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

View File

@ -21,10 +21,21 @@ const char* FLAGS_test_tmpdir = ".";
class file : public tesseract::File {
public:
// Create a file and write a string to it.
static bool WriteStringToFile(const std::string& contents, const std::string& filename) {
  // Hands both arguments straight to File::WriteStringToFileOrDie. That helper
  // returns nothing we inspect here, so this wrapper always reports success
  // once the call comes back.
  // NOTE(review): the "OrDie" name suggests the helper aborts on failure
  // instead of returning; confirm against tesseract::File.
  File::WriteStringToFileOrDie(contents, filename);
  const bool reported_success = true;
  return reported_success;
}
static bool GetContents(const std::string& filename, std::string* out, int /*options*/) {
  // Reads `filename` into `*out` via File::ReadFileToString and returns that
  // call's result. The trailing int argument is accepted but never read.
  const bool read_ok = File::ReadFileToString(filename, out);
  return read_ok;
}
static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) {
  // Writes `contents` to the file called `name`. Note the argument order flips
  // when forwarding: WriteStringToFile takes (contents, filename).
  // The bool argument is accepted but never read.
  const bool write_ok = WriteStringToFile(contents, name);
  return write_ok;
}
static int Defaults() {
  // The value callers pass as the options argument of GetContents/SetContents;
  // it is always zero.
  constexpr int kDefaultOptions = 0;
  return kDefaultOptions;
}

View File

@ -8,10 +8,19 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "unicharcompress.h"
#include "gunit.h"
#include "printf.h"
#include <string>
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "allheaders.h"
#include "include_gunit.h"
#include "log.h" // for LOG
#include "serialis.h"
#include "tprintf.h"
#include "unicharcompress.h"
namespace tesseract {
namespace {
@ -19,14 +28,14 @@ namespace {
class UnicharcompressTest : public ::testing::Test {
protected:
// Loads and compresses the given unicharset.
void LoadUnicharset(const string& unicharset_name) {
string radical_stroke_file =
file::JoinPath(FLAGS_test_srcdir, "langdata/radical-stroke.txt");
string unicharset_file =
file::JoinPath(FLAGS_test_srcdir, "testdata", unicharset_name);
string uni_data;
void LoadUnicharset(const std::string& unicharset_name) {
std::string radical_stroke_file =
file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
std::string unicharset_file =
file::JoinPath(TESTDATA_DIR, unicharset_name);
std::string uni_data;
CHECK_OK(file::GetContents(unicharset_file, &uni_data, file::Defaults()));
string radical_data;
std::string radical_data;
CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
file::Defaults()));
CHECK(
@ -39,10 +48,10 @@ class UnicharcompressTest : public ::testing::Test {
RecodedCharID code;
compressed_.EncodeUnichar(null_char_, &code);
encoded_null_char_ = code(0);
string output_name = file::JoinPath(
std::string output_name = file::JoinPath(
FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
STRING encoding = compressed_.GetEncodingAsString(unicharset_);
string encoding_str(&encoding[0], encoding.size());
std::string encoding_str(&encoding[0], encoding.size());
CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
LOG(INFO) << "Wrote encoding to:" << output_name;
}
@ -57,12 +66,12 @@ class UnicharcompressTest : public ::testing::Test {
EXPECT_TRUE(compressed_.DeSerialize(&rfp));
}
// Returns true if the lang is in CJK.
bool IsCJKLang(const string& lang) {
bool IsCJKLang(const std::string& lang) {
  // Language codes treated as CJK; the match is exact and case-sensitive.
  static const char* const kCjkLangs[] = {"chi_sim", "chi_tra", "kor", "jpn"};
  for (const char* cjk_code : kCjkLangs) {
    if (lang == cjk_code) {
      return true;
    }
  }
  return false;
}
// Returns true if the lang is Indic.
bool IsIndicLang(const string& lang) {
bool IsIndicLang(const std::string& lang) {
return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
@ -71,13 +80,13 @@ class UnicharcompressTest : public ::testing::Test {
}
// Expects the appropriate results from the compressed_ unicharset_.
void ExpectCorrect(const string& lang) {
void ExpectCorrect(const std::string& lang) {
// Count the number of times each code is used in each element of
// RecodedCharID.
RecodedCharID zeros;
for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
int code_range = compressed_.code_range();
std::vector<RecodedCharID> times_seen(code_range, zeros);
std::vector<RecodedCharID> times_seen(code_range, zeros);
for (int u = 0; u <= unicharset_.size(); ++u) {
if (u != UNICHAR_SPACE && u != null_char_ &&
(u == unicharset_.size() || (unicharset_.has_special_codes() &&
@ -227,9 +236,9 @@ TEST_F(UnicharcompressTest, GetEncodingAsString) {
LoadUnicharset("trivial.unicharset");
ExpectCorrect("trivial");
STRING encoding = compressed_.GetEncodingAsString(unicharset_);
string encoding_str(&encoding[0], encoding.length());
std::vector<string> lines =
strings::Split(encoding_str, "\n", strings::SkipEmpty());
std::string encoding_str(&encoding[0], encoding.length());
std::vector<std::string> lines =
absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
EXPECT_EQ(5, lines.size());
// The first line is always space.
EXPECT_EQ("0\t ", lines[0]);

View File

@ -77,7 +77,7 @@ TEST(UnicharsetTest, Multibyte) {
EXPECT_EQ(u.size(), 9);
EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
// The first two bytes of this std::string is \u0627, which matches id 3;
// The first two bytes of this string are \u0627, which matches id 3;
EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
// Individual f and i are not present, but they are there as a pair.