2019-01-25 22:05:57 +08:00
|
|
|
// (C) Copyright 2017, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2020-12-28 06:11:13 +08:00
|
|
|
#include "include_gunit.h"
|
2021-03-13 05:06:34 +08:00
|
|
|
#include "log.h" // for LOG
|
2020-12-28 06:11:13 +08:00
|
|
|
|
2019-01-25 22:05:57 +08:00
|
|
|
#include "matrix.h"
|
2021-03-13 05:06:34 +08:00
|
|
|
#include "normstrngs.h"
|
2019-01-25 22:05:57 +08:00
|
|
|
#include "pageres.h"
|
|
|
|
#include "ratngs.h"
|
2021-03-13 05:06:34 +08:00
|
|
|
#include "recodebeam.h"
|
2019-01-25 22:05:57 +08:00
|
|
|
#include "unicharcompress.h"
|
|
|
|
#include "unicharset_training_utils.h"
|
|
|
|
|
2020-12-31 16:03:56 +08:00
|
|
|
#include "helpers.h"
|
2020-12-28 06:11:13 +08:00
|
|
|
|
2020-12-27 17:41:48 +08:00
|
|
|
namespace tesseract {
|
2018-08-24 21:07:48 +08:00
|
|
|
|
|
|
|
// Exclusive upper bound on the unichar ids that the per-language tests put
// into their transcription.
constexpr int kNumChars = 100;
// Number of extra timesteps appended after the encoded transcription.
constexpr int kPadding = 64;

// Dictionary test data ("Gets words right.").
//   Network top choice:         "Gef s wordsright."
//   Desired phrase:             "Gets words right."
//   Competing dictionary phrase: "Get swords right."
// The synthetic network makes these mistakes:
//   - 'f' scores higher than 't' in "Get";
//   - the spaces after "Gef" and after "s" are weak;
//   - the space between "words" and "right" is weak (the top choice is null).
// Each *Tops / *2nds array is nullptr-terminated; the matching score arrays
// hold one entry per string.
const char *kGWRTops[] = {
    "G", "e", "f", " ", "s", " ",
    "w", "o", "r", "d", "s", "",
    "r", "i", "g", "h", "t", ".",
    nullptr};
const float kGWRTopScores[] = {
    0.99, 0.85, 0.87, 0.55, 0.99, 0.65,
    0.89, 0.99, 0.99, 0.99, 0.99, 0.95,
    0.99, 0.90, 0.90, 0.90, 0.95, 0.75};
const char *kGWR2nds[] = {
    "C", "c", "t", "", "S", "",
    "W", "O", "t", "h", "S", " ",
    "t", "I", "9", "b", "f", ",",
    nullptr};
const float kGWR2ndScores[] = {
    0.01, 0.10, 0.12, 0.42, 0.01, 0.25,
    0.10, 0.01, 0.01, 0.01, 0.01, 0.05,
    0.01, 0.09, 0.09, 0.09, 0.05, 0.25};

// Chinese dictionary test data: first and second choice per position.
const char *kZHTops[] = {"实", "学", "储", "啬", "投", "学", "生", nullptr};
const float kZHTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98};
const char *kZH2nds[] = {"学", "储", "投", "生", "学", "生", "实", nullptr};
const float kZH2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};

// Vietnamese multi-code test data: first and second choice per position.
const char *kViTops[] = {"v", "ậ", "y", " ", "t", "ộ", "i", nullptr};
const float kViTopScores[] = {0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97};
const char *kVi2nds[] = {"V", "a", "v", "", "l", "o", "", nullptr};
const float kVi2ndScores[] = {0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01};
|
|
|
|
|
|
|
|
class RecodeBeamTest : public ::testing::Test {
|
2021-03-13 05:06:34 +08:00
|
|
|
protected:
|
2021-03-22 15:26:05 +08:00
|
|
|
void SetUp() override {
|
2019-05-17 00:12:06 +08:00
|
|
|
std::locale::global(std::locale(""));
|
2020-12-31 01:17:58 +08:00
|
|
|
file::MakeTmpdir();
|
2019-05-17 00:12:06 +08:00
|
|
|
}
|
|
|
|
|
2018-08-24 21:07:48 +08:00
|
|
|
RecodeBeamTest() : lstm_dict_(&ccutil_) {}
|
2021-03-22 15:26:05 +08:00
|
|
|
~RecodeBeamTest() override {
|
2021-03-13 05:06:34 +08:00
|
|
|
lstm_dict_.End();
|
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
|
|
|
|
// Loads and compresses the given unicharset.
|
2021-03-13 05:06:34 +08:00
|
|
|
void LoadUnicharset(const std::string &unicharset_name) {
|
|
|
|
std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
|
|
|
|
std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string radical_data;
|
2021-03-13 05:06:34 +08:00
|
|
|
CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
|
2019-08-12 19:07:15 +08:00
|
|
|
CHECK(ccutil_.unicharset.load_from_file(unicharset_file.c_str()));
|
2021-03-13 05:06:34 +08:00
|
|
|
unichar_null_char_ =
|
|
|
|
ccutil_.unicharset.has_special_codes() ? UNICHAR_BROKEN : ccutil_.unicharset.size();
|
2021-03-15 05:52:52 +08:00
|
|
|
std::string radical_str(radical_data.c_str());
|
2021-03-13 05:06:34 +08:00
|
|
|
EXPECT_TRUE(recoder_.ComputeEncoding(ccutil_.unicharset, unichar_null_char_, &radical_str));
|
2018-08-24 21:07:48 +08:00
|
|
|
RecodedCharID code;
|
|
|
|
recoder_.EncodeUnichar(unichar_null_char_, &code);
|
|
|
|
encoded_null_char_ = code(0);
|
|
|
|
// Space should encode as itself.
|
|
|
|
recoder_.EncodeUnichar(UNICHAR_SPACE, &code);
|
|
|
|
EXPECT_EQ(UNICHAR_SPACE, code(0));
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string output_name = file::JoinPath(FLAGS_test_tmpdir, "testenc.txt");
|
2021-03-15 05:52:52 +08:00
|
|
|
std::string encoding = recoder_.GetEncodingAsString(ccutil_.unicharset);
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string encoding_str(&encoding[0], encoding.size());
|
2018-08-24 21:07:48 +08:00
|
|
|
CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
|
2019-01-25 22:05:57 +08:00
|
|
|
LOG(INFO) << "Wrote encoding to:" << output_name << "\n";
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
// Loads the dictionary.
|
2021-03-13 05:06:34 +08:00
|
|
|
void LoadDict(const std::string &lang) {
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string traineddata_name = lang + ".traineddata";
|
2021-03-13 05:06:34 +08:00
|
|
|
std::string traineddata_file = file::JoinPath(TESTDATA_DIR, traineddata_name);
|
2018-09-29 15:27:12 +08:00
|
|
|
lstm_dict_.SetupForLoad(nullptr);
|
2018-08-24 21:07:48 +08:00
|
|
|
tesseract::TessdataManager mgr;
|
|
|
|
mgr.Init(traineddata_file.c_str());
|
|
|
|
lstm_dict_.LoadLSTM(lang.c_str(), &mgr);
|
|
|
|
lstm_dict_.FinishLoad();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Expects the appropriate results from the compressed_ ccutil_.unicharset.
|
2021-03-13 05:06:34 +08:00
|
|
|
void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output,
|
2021-03-18 22:40:47 +08:00
|
|
|
const std::vector<int> &transcription) {
|
2018-08-24 21:07:48 +08:00
|
|
|
// Get the utf8 string of the transcription.
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string truth_utf8;
|
2021-03-22 15:32:09 +08:00
|
|
|
for (int i : transcription) {
|
|
|
|
truth_utf8 += ccutil_.unicharset.id_to_unichar(i);
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
PointerVector<WERD_RES> words;
|
2018-09-29 15:27:12 +08:00
|
|
|
ExpectCorrect(output, truth_utf8, nullptr, &words);
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
2021-03-13 05:06:34 +08:00
|
|
|
void ExpectCorrect(const GENERIC_2D_ARRAY<float> &output, const std::string &truth_utf8,
|
|
|
|
Dict *dict, PointerVector<WERD_RES> *words) {
|
2018-08-24 21:07:48 +08:00
|
|
|
RecodeBeamSearch beam_search(recoder_, encoded_null_char_, false, dict);
|
|
|
|
beam_search.Decode(output, 3.5, -0.125, -25.0, nullptr);
|
|
|
|
// Uncomment and/or change nullptr above to &ccutil_.unicharset to debug:
|
|
|
|
// beam_search.DebugBeams(ccutil_.unicharset);
|
2021-01-07 18:57:49 +08:00
|
|
|
std::vector<int> labels, xcoords;
|
2018-08-24 21:07:48 +08:00
|
|
|
beam_search.ExtractBestPathAsLabels(&labels, &xcoords);
|
2021-03-13 05:06:34 +08:00
|
|
|
LOG(INFO) << "Labels size = " << labels.size() << " coords " << xcoords.size() << "\n";
|
2018-08-24 21:07:48 +08:00
|
|
|
// Now decode using recoder_.
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string decoded;
|
2018-08-24 21:07:48 +08:00
|
|
|
int end = 1;
|
2021-03-22 16:01:54 +08:00
|
|
|
for (unsigned start = 0; start < labels.size(); start = end) {
|
2018-08-24 21:07:48 +08:00
|
|
|
RecodedCharID code;
|
2021-03-22 16:01:54 +08:00
|
|
|
unsigned index = start;
|
2018-08-24 21:07:48 +08:00
|
|
|
int uni_id = INVALID_UNICHAR_ID;
|
|
|
|
do {
|
|
|
|
code.Set(code.length(), labels[index++]);
|
|
|
|
uni_id = recoder_.DecodeUnichar(code);
|
2021-03-13 05:06:34 +08:00
|
|
|
} while (index < labels.size() && code.length() < RecodedCharID::kMaxCodeLen &&
|
|
|
|
(uni_id == INVALID_UNICHAR_ID || !recoder_.IsValidFirstCode(labels[index])));
|
|
|
|
EXPECT_NE(INVALID_UNICHAR_ID, uni_id) << "index=" << index << "/" << labels.size();
|
2019-05-17 00:12:06 +08:00
|
|
|
// To the extent of truth_utf8, we expect decoded to match, but if
|
2018-08-24 21:07:48 +08:00
|
|
|
// transcription is shorter, that is OK too, as we may just be testing
|
|
|
|
// that we get a valid sequence when padded with random data.
|
2021-03-22 15:48:50 +08:00
|
|
|
if (uni_id != unichar_null_char_ && decoded.size() < truth_utf8.size()) {
|
2018-08-24 21:07:48 +08:00
|
|
|
decoded += ccutil_.unicharset.id_to_unichar(uni_id);
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
end = index;
|
|
|
|
}
|
|
|
|
EXPECT_EQ(truth_utf8, decoded);
|
|
|
|
|
|
|
|
// Check that ExtractBestPathAsUnicharIds does the same thing.
|
2021-01-07 18:57:49 +08:00
|
|
|
std::vector<int> unichar_ids;
|
|
|
|
std::vector<float> certainties, ratings;
|
2021-03-13 05:06:34 +08:00
|
|
|
beam_search.ExtractBestPathAsUnicharIds(false, &ccutil_.unicharset, &unichar_ids, &certainties,
|
2018-08-24 21:07:48 +08:00
|
|
|
&ratings, &xcoords);
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string u_decoded;
|
2018-08-24 21:07:48 +08:00
|
|
|
float total_rating = 0.0f;
|
2021-03-22 16:01:54 +08:00
|
|
|
for (unsigned u = 0; u < unichar_ids.size(); ++u) {
|
2018-08-24 21:07:48 +08:00
|
|
|
// To the extent of truth_utf8, we expect decoded to match, but if
|
|
|
|
// transcription is shorter, that is OK too, as we may just be testing
|
|
|
|
// that we get a valid sequence when padded with random data.
|
|
|
|
if (u_decoded.size() < truth_utf8.size()) {
|
2021-03-13 05:06:34 +08:00
|
|
|
const char *str = ccutil_.unicharset.id_to_unichar(unichar_ids[u]);
|
2018-08-24 21:07:48 +08:00
|
|
|
total_rating += ratings[u];
|
2021-08-04 16:21:01 +08:00
|
|
|
LOG(INFO) << u << ":u_id=" << unichar_ids[u] << "=" << str << ", c="
|
|
|
|
<< certainties[u] << ", r=" << ratings[u] << "r_sum="
|
|
|
|
<< total_rating << " @" << xcoords[u] << "\n";
|
2021-03-22 15:48:50 +08:00
|
|
|
if (str[0] == ' ') {
|
2021-03-13 05:06:34 +08:00
|
|
|
total_rating = 0.0f;
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
u_decoded += str;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPECT_EQ(truth_utf8, u_decoded);
|
|
|
|
|
|
|
|
// Check that ExtractBestPathAsWords does the same thing.
|
|
|
|
TBOX line_box(0, 0, 100, 10);
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
2021-03-13 05:06:34 +08:00
|
|
|
beam_search.ExtractBestPathAsWords(line_box, 1.0f, false, &ccutil_.unicharset, words);
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string w_decoded;
|
2018-08-24 21:07:48 +08:00
|
|
|
for (int w = 0; w < words->size(); ++w) {
|
2021-03-13 05:06:34 +08:00
|
|
|
const WERD_RES *word = (*words)[w];
|
2018-08-24 21:07:48 +08:00
|
|
|
if (w_decoded.size() < truth_utf8.size()) {
|
2021-03-22 15:48:50 +08:00
|
|
|
if (!w_decoded.empty() && word->word->space()) {
|
2021-03-13 05:06:34 +08:00
|
|
|
w_decoded += " ";
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2019-09-25 16:07:51 +08:00
|
|
|
w_decoded += word->best_choice->unichar_string().c_str();
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
2021-08-04 16:21:01 +08:00
|
|
|
LOG(INFO) << "Word:" << w << " = " << word->best_choice->unichar_string()
|
|
|
|
<< ", c=" << word->best_choice->certainty() << ", r=" << word->best_choice->rating()
|
|
|
|
<< ", perm=" << word->best_choice->permuter() << "\n";
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string w_trunc(w_decoded.data(), truth_utf8.size());
|
2018-08-24 21:07:48 +08:00
|
|
|
if (truth_utf8 != w_trunc) {
|
|
|
|
tesseract::NormalizeUTF8String(
|
|
|
|
tesseract::UnicodeNormMode::kNFKD, tesseract::OCRNorm::kNormalize,
|
|
|
|
tesseract::GraphemeNorm::kNone, w_decoded.c_str(), &w_decoded);
|
|
|
|
w_trunc.assign(w_decoded.data(), truth_utf8.size());
|
|
|
|
}
|
|
|
|
EXPECT_EQ(truth_utf8, w_trunc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Generates easy encoding of the given unichar_ids, and pads with at least
|
|
|
|
// padding of random data.
|
2021-03-18 22:40:47 +08:00
|
|
|
GENERIC_2D_ARRAY<float> GenerateRandomPaddedOutputs(const std::vector<int> &unichar_ids,
|
2021-03-13 05:06:34 +08:00
|
|
|
int padding) {
|
2018-08-24 21:07:48 +08:00
|
|
|
int width = unichar_ids.size() * 2 * RecodedCharID::kMaxCodeLen;
|
|
|
|
int num_codes = recoder_.code_range();
|
|
|
|
GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
|
|
|
|
// Fill with random data.
|
|
|
|
TRand random;
|
|
|
|
for (int t = 0; t < width; ++t) {
|
2021-03-22 15:48:50 +08:00
|
|
|
for (int i = 0; i < num_codes; ++i) {
|
2018-08-24 21:07:48 +08:00
|
|
|
outputs(t, i) = random.UnsignedRand(0.25);
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
int t = 0;
|
2021-03-22 15:32:09 +08:00
|
|
|
for (int unichar_id : unichar_ids) {
|
2018-08-24 21:07:48 +08:00
|
|
|
RecodedCharID code;
|
2021-03-22 15:32:09 +08:00
|
|
|
int len = recoder_.EncodeUnichar(unichar_id, &code);
|
2018-08-24 21:07:48 +08:00
|
|
|
EXPECT_NE(0, len);
|
|
|
|
for (int j = 0; j < len; ++j) {
|
|
|
|
// Make the desired answer a clear winner.
|
|
|
|
if (j > 0 && code(j) == code(j - 1)) {
|
|
|
|
// We will collapse adjacent equal codes so put a null in between.
|
|
|
|
outputs(t++, encoded_null_char_) = 1.0f;
|
|
|
|
}
|
|
|
|
outputs(t++, code(j)) = 1.0f;
|
|
|
|
}
|
|
|
|
// Put a 0 as a null char in between.
|
|
|
|
outputs(t++, encoded_null_char_) = 1.0f;
|
|
|
|
}
|
|
|
|
// Normalize the probs.
|
|
|
|
for (int t = 0; t < width; ++t) {
|
|
|
|
double sum = 0.0;
|
2021-03-22 15:48:50 +08:00
|
|
|
for (int i = 0; i < num_codes; ++i) {
|
2021-03-13 05:06:34 +08:00
|
|
|
sum += outputs(t, i);
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
|
|
|
for (int i = 0; i < num_codes; ++i) {
|
2021-03-13 05:06:34 +08:00
|
|
|
outputs(t, i) /= sum;
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return outputs;
|
|
|
|
}
|
|
|
|
// Encodes a utf8 string (character) as unichar_id, then recodes, and sets
|
|
|
|
// the score for the appropriate sequence of codes, returning the ending t.
|
2021-03-13 05:06:34 +08:00
|
|
|
int EncodeUTF8(const char *utf8_str, float score, int start_t, TRand *random,
|
|
|
|
GENERIC_2D_ARRAY<float> *outputs) {
|
2018-08-24 21:07:48 +08:00
|
|
|
int t = start_t;
|
2021-01-07 18:57:49 +08:00
|
|
|
std::vector<int> unichar_ids;
|
2021-03-13 05:06:34 +08:00
|
|
|
EXPECT_TRUE(ccutil_.unicharset.encode_string(utf8_str, true, &unichar_ids, nullptr, nullptr));
|
2018-08-24 21:07:48 +08:00
|
|
|
if (unichar_ids.empty() || utf8_str[0] == '\0') {
|
|
|
|
unichar_ids.clear();
|
|
|
|
unichar_ids.push_back(unichar_null_char_);
|
|
|
|
}
|
|
|
|
int num_ids = unichar_ids.size();
|
|
|
|
for (int u = 0; u < num_ids; ++u) {
|
|
|
|
RecodedCharID code;
|
|
|
|
int len = recoder_.EncodeUnichar(unichar_ids[u], &code);
|
|
|
|
EXPECT_NE(0, len);
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
|
|
// Apply the desired score.
|
|
|
|
(*outputs)(t++, code(i)) = score;
|
2021-03-13 05:06:34 +08:00
|
|
|
if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
|
2018-08-24 21:07:48 +08:00
|
|
|
int dups = static_cast<int>(random->UnsignedRand(3.0));
|
|
|
|
for (int d = 0; d < dups; ++d) {
|
|
|
|
// Duplicate the desired score.
|
|
|
|
(*outputs)(t++, code(i)) = score;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2021-03-13 05:06:34 +08:00
|
|
|
if (random != nullptr && t + (num_ids - u) * RecodedCharID::kMaxCodeLen < outputs->dim1()) {
|
2018-08-24 21:07:48 +08:00
|
|
|
int dups = static_cast<int>(random->UnsignedRand(3.0));
|
|
|
|
for (int d = 0; d < dups; ++d) {
|
|
|
|
// Add a random number of nulls as well.
|
|
|
|
(*outputs)(t++, encoded_null_char_) = score;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return t;
|
|
|
|
}
|
|
|
|
// Generates an encoding of the given 4 arrays as synthetic network scores.
|
|
|
|
// uses scores1 for chars1 and scores2 for chars2, and everything else gets
|
|
|
|
// the leftovers shared out equally. Note that empty string encodes as the
|
|
|
|
// null_char_.
|
2021-03-13 05:06:34 +08:00
|
|
|
GENERIC_2D_ARRAY<float> GenerateSyntheticOutputs(const char *chars1[], const float scores1[],
|
|
|
|
const char *chars2[], const float scores2[],
|
|
|
|
TRand *random) {
|
2018-08-24 21:07:48 +08:00
|
|
|
int width = 0;
|
2021-03-22 15:48:50 +08:00
|
|
|
while (chars1[width] != nullptr) {
|
2021-03-13 05:06:34 +08:00
|
|
|
++width;
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
int padding = width * RecodedCharID::kMaxCodeLen;
|
|
|
|
int num_codes = recoder_.code_range();
|
|
|
|
GENERIC_2D_ARRAY<float> outputs(width + padding, num_codes, 0.0f);
|
|
|
|
int t = 0;
|
|
|
|
for (int i = 0; i < width; ++i) {
|
|
|
|
// In case there is overlap in the codes between 1st and 2nd choice, it
|
|
|
|
// is better to encode the 2nd choice first.
|
|
|
|
int end_t2 = EncodeUTF8(chars2[i], scores2[i], t, random, &outputs);
|
|
|
|
int end_t1 = EncodeUTF8(chars1[i], scores1[i], t, random, &outputs);
|
|
|
|
// Advance t to the max end, setting everything else to the leftovers.
|
|
|
|
int max_t = std::max(end_t1, end_t2);
|
|
|
|
while (t < max_t) {
|
|
|
|
double total_score = 0.0;
|
2021-03-22 15:48:50 +08:00
|
|
|
for (int j = 0; j < num_codes; ++j) {
|
2021-03-13 05:06:34 +08:00
|
|
|
total_score += outputs(t, j);
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
double null_remainder = (1.0 - total_score) / 2.0;
|
|
|
|
double remainder = null_remainder / (num_codes - 2);
|
|
|
|
if (outputs(t, encoded_null_char_) < null_remainder) {
|
|
|
|
outputs(t, encoded_null_char_) += null_remainder;
|
|
|
|
} else {
|
|
|
|
remainder += remainder;
|
|
|
|
}
|
|
|
|
for (int j = 0; j < num_codes; ++j) {
|
2021-03-22 15:48:50 +08:00
|
|
|
if (outputs(t, j) == 0.0f) {
|
2021-03-13 05:06:34 +08:00
|
|
|
outputs(t, j) = remainder;
|
2021-03-22 15:48:50 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
++t;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Fill the rest with null chars.
|
|
|
|
while (t < width + padding) {
|
|
|
|
outputs(t++, encoded_null_char_) = 1.0f;
|
|
|
|
}
|
|
|
|
return outputs;
|
|
|
|
}
|
|
|
|
UnicharCompress recoder_;
|
2019-07-10 22:52:27 +08:00
|
|
|
int unichar_null_char_ = 0;
|
|
|
|
int encoded_null_char_ = 0;
|
2018-08-24 21:07:48 +08:00
|
|
|
CCUtil ccutil_;
|
|
|
|
Dict lstm_dict_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Checks that an easy, padded output is decoded correctly for both the
// traditional (chi_tra) and the simplified (chi_sim) Chinese unicharsets.
TEST_F(RecodeBeamTest, DoesChinese) {
  // Runs the same check for one unicharset: load it, build the transcription
  // of every non-special unichar id below kNumChars, and expect the beam
  // search to reproduce it.
  const auto check_unicharset = [this](const char *banner, const char *unicharset_name) {
    LOG(INFO) << banner << "\n";
    LoadUnicharset(unicharset_name);
    std::vector<int> truth_ids;
    for (int id = SPECIAL_UNICHAR_CODES_COUNT; id < kNumChars; ++id) {
      truth_ids.push_back(id);
    }
    const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
    ExpectCorrect(scores, truth_ids);
  };

  check_unicharset("Testing chi_tra", "chi_tra.unicharset");
  check_unicharset("Testing chi_sim", "chi_sim.unicharset");
}
|
|
|
|
|
|
|
|
// Checks that an easy, padded output is decoded correctly for the Japanese
// unicharset.
TEST_F(RecodeBeamTest, DoesJapanese) {
  LOG(INFO) << "Testing jpn" << "\n";
  LoadUnicharset("jpn.unicharset");
  // The truth is every unichar id from the first non-special one up to (but
  // excluding) kNumChars.
  std::vector<int> truth_ids;
  int next_id = SPECIAL_UNICHAR_CODES_COUNT;
  while (next_id < kNumChars) {
    truth_ids.push_back(next_id);
    ++next_id;
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
|
|
|
|
|
|
|
|
// Checks that an easy, padded output is decoded correctly for the Korean
// unicharset.
TEST_F(RecodeBeamTest, DoesKorean) {
  LOG(INFO) << "Testing kor" << "\n";
  LoadUnicharset("kor.unicharset");
  // The truth is every unichar id from the first non-special one up to (but
  // excluding) kNumChars.
  std::vector<int> truth_ids;
  int next_id = SPECIAL_UNICHAR_CODES_COUNT;
  while (next_id < kNumChars) {
    truth_ids.push_back(next_id);
    ++next_id;
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
|
|
|
|
|
|
|
|
// Checks that an easy, padded output is decoded correctly for the Kannada
// unicharset.
TEST_F(RecodeBeamTest, DoesKannada) {
  LOG(INFO) << "Testing kan" << "\n";
  LoadUnicharset("kan.unicharset");
  // The truth is every unichar id from the first non-special one up to (but
  // excluding) kNumChars.
  std::vector<int> truth_ids;
  int next_id = SPECIAL_UNICHAR_CODES_COUNT;
  while (next_id < kNumChars) {
    truth_ids.push_back(next_id);
    ++next_id;
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
|
|
|
|
|
|
|
|
// Checks that an easy, padded output is decoded correctly for the Marathi
// unicharset.
TEST_F(RecodeBeamTest, DoesMarathi) {
  LOG(INFO) << "Testing mar" << "\n";
  LoadUnicharset("mar.unicharset");
  // The truth is every unichar id from the first non-special one up to (but
  // excluding) kNumChars.
  std::vector<int> truth_ids;
  int next_id = SPECIAL_UNICHAR_CODES_COUNT;
  while (next_id < kNumChars) {
    truth_ids.push_back(next_id);
    ++next_id;
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
|
|
|
|
|
2019-01-25 22:05:57 +08:00
|
|
|
// Checks that an easy, padded output is decoded correctly for the English
// unicharset.
TEST_F(RecodeBeamTest, DoesEnglish) {
  LOG(INFO) << "Testing eng" << "\n";
  LoadUnicharset("eng.unicharset");
  // The truth is every unichar id from the first non-special one up to (but
  // excluding) kNumChars.
  std::vector<int> truth_ids;
  int next_id = SPECIAL_UNICHAR_CODES_COUNT;
  while (next_id < kNumChars) {
    truth_ids.push_back(next_id);
    ++next_id;
  }
  const GENERIC_2D_ARRAY<float> scores = GenerateRandomPaddedOutputs(truth_ids, kPadding);
  ExpectCorrect(scores, truth_ids);
}
|
|
|
|
|
|
|
|
// Decodes the synthetic "Gets words right." outputs twice: without a
// dictionary the raw top choices are expected, with the eng_beam dictionary
// the desired phrase is expected.
TEST_F(RecodeBeamTest, DISABLED_EngDictionary) {
  LOG(INFO) << "Testing eng dictionary" << "\n";
  LoadUnicharset("eng_beam.unicharset");
  const GENERIC_2D_ARRAY<float> scores =
      GenerateSyntheticOutputs(kGWRTops, kGWRTopScores, kGWR2nds, kGWR2ndScores, nullptr);

  // Without a dictionary the result is simply the concatenated top choices.
  std::string top_choice_text;
  for (const char *const *top = kGWRTops; *top != nullptr; ++top) {
    top_choice_text += *top;
  }
  PointerVector<WERD_RES> words;
  ExpectCorrect(scores, top_choice_text, nullptr, &words);

  // Now try again with the dictionary.
  LoadDict("eng_beam");
  ExpectCorrect(scores, "Gets words right.", &lstm_dict_, &words);
}
|
|
|
|
|
2019-01-25 22:05:57 +08:00
|
|
|
// Decodes the synthetic Chinese outputs without and then with the zh_hans
// dictionary, checking how the text is split into words and which permuter
// each word reports.
TEST_F(RecodeBeamTest, DISABLED_ChiDictionary) {
  LOG(INFO) << "Testing zh_hans dictionary" << "\n";
  LoadUnicharset("zh_hans.unicharset");
  const GENERIC_2D_ARRAY<float> scores =
      GenerateSyntheticOutputs(kZHTops, kZHTopScores, kZH2nds, kZH2ndScores, nullptr);
  PointerVector<WERD_RES> words;

  // Without a dictionary each character is an individual word, with
  // permuter = top choice.
  ExpectCorrect(scores, "实学储啬投学生", nullptr, &words);
  EXPECT_EQ(7, words.size());
  for (int w = 0; w < words.size(); ++w) {
    EXPECT_EQ(TOP_CHOICE_PERM, words[w]->best_choice->permuter());
  }

  // Now try again with the dictionary.
  LoadDict("zh_hans");
  ExpectCorrect(scores, "实学储啬投学生", &lstm_dict_, &words);

  // Expected segmentation: the text of each word and its permuter.
  struct ExpectedWord {
    const char *text;
    int permuter;
  };
  const ExpectedWord expected_words[] = {{"实学", SYSTEM_DAWG_PERM},
                                         {"储", TOP_CHOICE_PERM},
                                         {"啬", TOP_CHOICE_PERM},
                                         {"投", TOP_CHOICE_PERM},
                                         {"学生", SYSTEM_DAWG_PERM}};
  const int num_expected = 5;
  EXPECT_EQ(num_expected, words.size());
  // Compare only the words that exist so a wrong count does not index past
  // the end of either list.
  for (int w = 0; w < num_expected && w < words.size(); ++w) {
    const WERD_CHOICE *choice = words[w]->best_choice;
    EXPECT_STREQ(expected_words[w].text, choice->unichar_string().c_str());
    EXPECT_EQ(expected_words[w].permuter, choice->permuter());
  }
}
|
|
|
|
|
|
|
|
// Tests that a recoder built with decomposed unicode allows true ctc
|
|
|
|
// arbitrary duplicates and inserted nulls inside the multicode sequence.
|
2019-01-25 22:05:57 +08:00
|
|
|
TEST_F(RecodeBeamTest, DISABLED_MultiCodeSequences) {
|
2021-03-13 05:06:34 +08:00
|
|
|
LOG(INFO) << "Testing duplicates in multi-code sequences"
|
|
|
|
<< "\n";
|
2018-08-24 21:07:48 +08:00
|
|
|
LoadUnicharset("vie.d.unicharset");
|
|
|
|
tesseract::SetupBasicProperties(false, true, &ccutil_.unicharset);
|
|
|
|
TRand random;
|
2021-03-13 05:06:34 +08:00
|
|
|
GENERIC_2D_ARRAY<float> outputs =
|
|
|
|
GenerateSyntheticOutputs(kViTops, kViTopScores, kVi2nds, kVi2ndScores, &random);
|
2018-08-24 21:07:48 +08:00
|
|
|
PointerVector<WERD_RES> words;
|
2019-01-25 22:05:57 +08:00
|
|
|
std::string truth_str;
|
2021-03-13 05:06:34 +08:00
|
|
|
tesseract::NormalizeUTF8String(tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize,
|
|
|
|
tesseract::GraphemeNorm::kNone, "vậy tội", &truth_str);
|
2018-08-24 21:07:48 +08:00
|
|
|
ExpectCorrect(outputs, truth_str, nullptr, &words);
|
|
|
|
}
|
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
} // namespace tesseract
|