mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 06:53:36 +08:00
Merge pull request #2169 from Shreeshrii/master
Fix for unicharcompress_test
This commit is contained in:
commit
9d6978b258
@ -127,6 +127,7 @@ if ENABLE_TRAINING
|
||||
check_PROGRAMS += commandlineflags_test
|
||||
check_PROGRAMS += unichar_test
|
||||
check_PROGRAMS += unicharset_test
|
||||
check_PROGRAMS += unicharcompress_test
|
||||
check_PROGRAMS += validator_test
|
||||
endif
|
||||
|
||||
@ -242,6 +243,9 @@ tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
unichar_test_SOURCES = unichar_test.cc
|
||||
unichar_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
|
||||
|
||||
unicharcompress_test_SOURCES = unicharcompress_test.cc
|
||||
unicharcompress_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
|
||||
|
||||
unicharset_test_SOURCES = unicharset_test.cc
|
||||
unicharset_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
|
||||
|
||||
|
@ -21,10 +21,21 @@ const char* FLAGS_test_tmpdir = ".";
|
||||
|
||||
class file : public tesseract::File {
|
||||
public:
|
||||
|
||||
// Create a file and write a string to it.
|
||||
static bool WriteStringToFile(const std::string& contents, const std::string& filename) {
|
||||
File::WriteStringToFileOrDie(contents, filename);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool GetContents(const std::string& filename, std::string* out, int) {
|
||||
return File::ReadFileToString(filename, out);
|
||||
}
|
||||
|
||||
static bool SetContents(const std::string& name, const std::string& contents, bool /*is_default*/) {
|
||||
return WriteStringToFile(contents, name);
|
||||
}
|
||||
|
||||
// Placeholder value for the last argument of GetContents / SetContents, both
// of which ignore it. Always returns 0.
static int Defaults() {
  return 0;
}
|
||||
|
@ -8,10 +8,19 @@
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#include "unicharcompress.h"
|
||||
#include "gunit.h"
|
||||
#include "printf.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "absl/strings/ascii.h"
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_split.h"
|
||||
#include "allheaders.h"
|
||||
|
||||
#include "include_gunit.h"
|
||||
#include "log.h" // for LOG
|
||||
#include "serialis.h"
|
||||
#include "tprintf.h"
|
||||
#include "unicharcompress.h"
|
||||
|
||||
namespace tesseract {
|
||||
namespace {
|
||||
@ -19,14 +28,14 @@ namespace {
|
||||
class UnicharcompressTest : public ::testing::Test {
|
||||
protected:
|
||||
// Loads and compresses the given unicharset.
|
||||
void LoadUnicharset(const string& unicharset_name) {
|
||||
string radical_stroke_file =
|
||||
file::JoinPath(FLAGS_test_srcdir, "langdata/radical-stroke.txt");
|
||||
string unicharset_file =
|
||||
file::JoinPath(FLAGS_test_srcdir, "testdata", unicharset_name);
|
||||
string uni_data;
|
||||
void LoadUnicharset(const std::string& unicharset_name) {
|
||||
std::string radical_stroke_file =
|
||||
file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
|
||||
std::string unicharset_file =
|
||||
file::JoinPath(TESTDATA_DIR, unicharset_name);
|
||||
std::string uni_data;
|
||||
CHECK_OK(file::GetContents(unicharset_file, &uni_data, file::Defaults()));
|
||||
string radical_data;
|
||||
std::string radical_data;
|
||||
CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
|
||||
file::Defaults()));
|
||||
CHECK(
|
||||
@ -39,10 +48,10 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
RecodedCharID code;
|
||||
compressed_.EncodeUnichar(null_char_, &code);
|
||||
encoded_null_char_ = code(0);
|
||||
string output_name = file::JoinPath(
|
||||
std::string output_name = file::JoinPath(
|
||||
FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
|
||||
STRING encoding = compressed_.GetEncodingAsString(unicharset_);
|
||||
string encoding_str(&encoding[0], encoding.size());
|
||||
std::string encoding_str(&encoding[0], encoding.size());
|
||||
CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
|
||||
LOG(INFO) << "Wrote encoding to:" << output_name;
|
||||
}
|
||||
@ -57,12 +66,12 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
EXPECT_TRUE(compressed_.DeSerialize(&rfp));
|
||||
}
|
||||
// Returns true if the lang is in CJK.
// The match is an exact, case-sensitive comparison against the codes
// "chi_sim", "chi_tra", "kor" and "jpn"; any other string (including the
// empty string) yields false.
bool IsCJKLang(const std::string& lang) {
  return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" ||
         lang == "jpn";
}
|
||||
// Returns true if the lang is Indic.
|
||||
bool IsIndicLang(const string& lang) {
|
||||
bool IsIndicLang(const std::string& lang) {
|
||||
return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
|
||||
lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
|
||||
lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
|
||||
@ -71,13 +80,13 @@ class UnicharcompressTest : public ::testing::Test {
|
||||
}
|
||||
|
||||
// Expects the appropriate results from the compressed_ unicharset_.
|
||||
void ExpectCorrect(const string& lang) {
|
||||
void ExpectCorrect(const std::string& lang) {
|
||||
// Count the number of times each code is used in each element of
|
||||
// RecodedCharID.
|
||||
RecodedCharID zeros;
|
||||
for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
|
||||
int code_range = compressed_.code_range();
|
||||
std::vector<RecodedCharID> times_seen(code_range, zeros);
|
||||
std::vector<RecodedCharID> times_seen(code_range, zeros);
|
||||
for (int u = 0; u <= unicharset_.size(); ++u) {
|
||||
if (u != UNICHAR_SPACE && u != null_char_ &&
|
||||
(u == unicharset_.size() || (unicharset_.has_special_codes() &&
|
||||
@ -227,9 +236,9 @@ TEST_F(UnicharcompressTest, GetEncodingAsString) {
|
||||
LoadUnicharset("trivial.unicharset");
|
||||
ExpectCorrect("trivial");
|
||||
STRING encoding = compressed_.GetEncodingAsString(unicharset_);
|
||||
string encoding_str(&encoding[0], encoding.length());
|
||||
std::vector<string> lines =
|
||||
strings::Split(encoding_str, "\n", strings::SkipEmpty());
|
||||
std::string encoding_str(&encoding[0], encoding.length());
|
||||
std::vector<std::string> lines =
|
||||
absl::StrSplit(encoding_str, "\n", absl::SkipEmpty());
|
||||
EXPECT_EQ(5, lines.size());
|
||||
// The first line is always space.
|
||||
EXPECT_EQ("0\t ", lines[0]);
|
||||
|
@ -77,7 +77,7 @@ TEST(UnicharsetTest, Multibyte) {
|
||||
EXPECT_EQ(u.size(), 9);
|
||||
EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
|
||||
EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
|
||||
// The first two bytes of this std::string is \u0627, which matches id 3;
|
||||
// The first two bytes of this string is \u0627, which matches id 3;
|
||||
EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
|
||||
EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
|
||||
// Individual f and i are not present, but they are there as a pair.
|
||||
|
Loading…
Reference in New Issue
Block a user