Merge pull request #2169 from Shreeshrii/master

Fix for unicharcompress_test
2025-01-19 06:53:36 +08:00 · 2019-01-19 09:05:39 +01:00 · 2019-01-19 09:05:39 +01:00 · 9d6978b258
commit 9d6978b258
parent 58447c0d52 0ae8fdc859
4 changed files with 44 additions and 20 deletions
--- a/unittest/Makefile.am
+++ b/unittest/Makefile.am
@@ -127,6 +127,7 @@ if ENABLE_TRAINING
 check_PROGRAMS += commandlineflags_test
 check_PROGRAMS += unichar_test
 check_PROGRAMS += unicharset_test
+check_PROGRAMS += unicharcompress_test
 check_PROGRAMS += validator_test
 endif

@@ -242,6 +243,9 @@ tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 unichar_test_SOURCES = unichar_test.cc
 unichar_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

+unicharcompress_test_SOURCES = unicharcompress_test.cc
+unicharcompress_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
+
 unicharset_test_SOURCES = unicharset_test.cc
 unicharset_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

--- a/unittest/include_gunit.h
+++ b/unittest/include_gunit.h
@@ -21,10 +21,21 @@ const char* FLAGS_test_tmpdir = ".";

 class file : public tesseract::File {
 public:
+
+// Create a file and write a string to it.
+  static bool WriteStringToFile(const std::string& contents, const std::string& filename) {
+    File::WriteStringToFileOrDie(contents, filename);
+    return true;
+  }
+
  static bool GetContents(const std::string& filename, std::string* out, int) {
    return File::ReadFileToString(filename, out);
  }

+  static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) {
+    return WriteStringToFile(contents, name);
+  }
+
  static int Defaults() {
    return 0;
  }
--- a/unittest/unicharcompress_test.cc
+++ b/unittest/unicharcompress_test.cc
@@ -8,10 +8,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "unicharcompress.h"
-#include "gunit.h"
-#include "printf.h"
+
+#include <string>
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "allheaders.h"
+
+#include "include_gunit.h"
+#include "log.h"                        // for LOG
 #include "serialis.h"
+#include "tprintf.h"
+#include "unicharcompress.h"

 namespace tesseract {
 namespace {
@@ -19,14 +28,14 @@ namespace {
 class UnicharcompressTest : public ::testing::Test {
 protected:
  // Loads and compresses the given unicharset.
-  void LoadUnicharset(const string& unicharset_name) {
-    string radical_stroke_file =
-        file::JoinPath(FLAGS_test_srcdir, "langdata/radical-stroke.txt");
-    string unicharset_file =
-        file::JoinPath(FLAGS_test_srcdir, "testdata", unicharset_name);
-    string uni_data;
+  void LoadUnicharset(const std::string& unicharset_name) {
+    std::string radical_stroke_file =
+        file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
+    std::string unicharset_file =
+        file::JoinPath(TESTDATA_DIR, unicharset_name);
+    std::string uni_data;
    CHECK_OK(file::GetContents(unicharset_file, &uni_data, file::Defaults()));
-    string radical_data;
+    std::string radical_data;
    CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
                               file::Defaults()));
    CHECK(
@@ -39,10 +48,10 @@ class UnicharcompressTest : public ::testing::Test {
    RecodedCharID code;
    compressed_.EncodeUnichar(null_char_, &code);
    encoded_null_char_ = code(0);
-    string output_name = file::JoinPath(
+    std::string output_name = file::JoinPath(
        FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
    STRING encoding = compressed_.GetEncodingAsString(unicharset_);
-    string encoding_str(&encoding[0], encoding.size());
+    std::string encoding_str(&encoding[0], encoding.size());
    CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
    LOG(INFO) << "Wrote encoding to:" << output_name;
  }
@@ -57,12 +66,12 @@ class UnicharcompressTest : public ::testing::Test {
    EXPECT_TRUE(compressed_.DeSerialize(&rfp));
  }
  // Returns true if the lang is in CJK.
-  bool IsCJKLang(const string& lang) {
+  bool IsCJKLang(const std::string& lang) {
    return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" ||
           lang == "jpn";
  }
  // Returns true if the lang is Indic.
-  bool IsIndicLang(const string& lang) {
+  bool IsIndicLang(const std::string& lang) {
    return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
           lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
           lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
@@ -71,13 +80,13 @@ class UnicharcompressTest : public ::testing::Test {
  }

  // Expects the appropriate results from the compressed_  unicharset_.
-  void ExpectCorrect(const string& lang) {
+  void ExpectCorrect(const std::string& lang) {
    // Count the number of times each code is used in each element of
    // RecodedCharID.
    RecodedCharID zeros;
    for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
    int code_range = compressed_.code_range();
-    std::vector<RecodedCharID> times_seen(code_range, zeros);
+   std::vector<RecodedCharID> times_seen(code_range, zeros);
    for (int u = 0; u <= unicharset_.size(); ++u) {
      if (u != UNICHAR_SPACE && u != null_char_ &&
          (u == unicharset_.size() || (unicharset_.has_special_codes() &&
@@ -227,9 +236,9 @@ TEST_F(UnicharcompressTest, GetEncodingAsString) {
  LoadUnicharset("trivial.unicharset");
  ExpectCorrect("trivial");
  STRING encoding = compressed_.GetEncodingAsString(unicharset_);
-  string encoding_str(&encoding[0], encoding.length());
-  std::vector<string> lines =
-      strings::Split(encoding_str, "\n", strings::SkipEmpty());
+  std::string encoding_str(&encoding[0], encoding.length());
+  std::vector<std::string> lines =
+      absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
  EXPECT_EQ(5, lines.size());
  // The first line is always space.
  EXPECT_EQ("0\t ", lines[0]);
--- a/unittest/unicharset_test.cc
+++ b/unittest/unicharset_test.cc
@@ -77,7 +77,7 @@ TEST(UnicharsetTest, Multibyte) {
  EXPECT_EQ(u.size(), 9);
  EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
  EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
-  // The first two bytes of this std::string is \u0627, which matches id 3;
+  // The first two bytes of this string is \u0627, which matches id 3;
  EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
  EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
  // Individual f and i are not present, but they are there as a pair.