2016-11-08 07:38:07 +08:00
|
|
|
///////////////////////////////////////////////////////////////////////
// File:        unicharcompress.h
// Description: Unicode re-encoding using a sequence of smaller numbers in
//              place of a single large code for CJK, similarly for Indic,
//              and dissection of ligatures for other scripts.
// Author:      Ray Smith
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
|
|
|
|
#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
|
|
|
|
|
2021-03-13 03:35:02 +08:00
|
|
|
#include <unordered_map>
|
2021-03-15 16:01:55 +08:00
|
|
|
#include <vector>
|
2020-12-29 18:28:50 +08:00
|
|
|
#include "serialis.h"
|
2016-11-08 07:38:07 +08:00
|
|
|
#include "unicharset.h"
|
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
|
|
|
// Trivial class to hold the code for a recoded unichar-id.
|
|
|
|
class RecodedCharID {
|
2021-03-13 03:35:02 +08:00
|
|
|
public:
|
2016-11-08 07:38:07 +08:00
|
|
|
// The maximum length of a code.
|
|
|
|
static const int kMaxCodeLen = 9;
|
|
|
|
|
2017-07-26 00:40:44 +08:00
|
|
|
RecodedCharID() : self_normalized_(1), length_(0) {
|
2016-11-08 07:38:07 +08:00
|
|
|
memset(code_, 0, sizeof(code_));
|
|
|
|
}
|
2021-03-13 03:35:02 +08:00
|
|
|
void Truncate(int length) {
|
|
|
|
length_ = length;
|
|
|
|
}
|
2016-11-08 07:38:07 +08:00
|
|
|
// Sets the code value at the given index in the code.
|
|
|
|
void Set(int index, int value) {
|
|
|
|
code_[index] = value;
|
2021-03-13 03:35:02 +08:00
|
|
|
if (length_ <= index)
|
|
|
|
length_ = index + 1;
|
2016-11-08 07:38:07 +08:00
|
|
|
}
|
|
|
|
// Shorthand for setting codes of length 3, as all Hangul and Han codes are
|
|
|
|
// length 3.
|
|
|
|
void Set3(int code0, int code1, int code2) {
|
|
|
|
length_ = 3;
|
|
|
|
code_[0] = code0;
|
|
|
|
code_[1] = code1;
|
|
|
|
code_[2] = code2;
|
|
|
|
}
|
|
|
|
// Accessors
|
2021-03-13 03:35:02 +08:00
|
|
|
int length() const {
|
|
|
|
return length_;
|
|
|
|
}
|
|
|
|
int operator()(int index) const {
|
|
|
|
return code_[index];
|
|
|
|
}
|
2016-11-08 07:38:07 +08:00
|
|
|
|
|
|
|
// Writes to the given file. Returns false in case of error.
|
2021-03-13 03:35:02 +08:00
|
|
|
bool Serialize(TFile *fp) const {
|
|
|
|
return fp->Serialize(&self_normalized_) && fp->Serialize(&length_) &&
|
2018-07-18 19:55:58 +08:00
|
|
|
fp->Serialize(&code_[0], length_);
|
2016-11-08 07:38:07 +08:00
|
|
|
}
|
|
|
|
// Reads from the given file. Returns false in case of error.
|
2021-03-13 03:35:02 +08:00
|
|
|
bool DeSerialize(TFile *fp) {
|
|
|
|
return fp->DeSerialize(&self_normalized_) && fp->DeSerialize(&length_) &&
|
2018-07-18 19:55:58 +08:00
|
|
|
fp->DeSerialize(&code_[0], length_);
|
2016-11-08 07:38:07 +08:00
|
|
|
}
|
2021-03-13 03:35:02 +08:00
|
|
|
bool operator==(const RecodedCharID &other) const {
|
|
|
|
if (length_ != other.length_)
|
|
|
|
return false;
|
2016-11-08 07:38:07 +08:00
|
|
|
for (int i = 0; i < length_; ++i) {
|
2021-03-13 03:35:02 +08:00
|
|
|
if (code_[i] != other.code_[i])
|
|
|
|
return false;
|
2016-11-08 07:38:07 +08:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
// Hash functor for RecodedCharID.
|
|
|
|
struct RecodedCharIDHash {
|
2021-03-13 03:35:02 +08:00
|
|
|
uint64_t operator()(const RecodedCharID &code) const {
|
2019-09-10 21:46:20 +08:00
|
|
|
uint64_t result = 0;
|
2016-11-08 07:38:07 +08:00
|
|
|
for (int i = 0; i < code.length_; ++i) {
|
2019-09-10 21:46:20 +08:00
|
|
|
result ^= static_cast<uint64_t>(code(i)) << (7 * i);
|
2016-11-08 07:38:07 +08:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2021-03-13 03:35:02 +08:00
|
|
|
private:
|
2016-11-08 07:38:07 +08:00
|
|
|
// True if this code is self-normalizing, ie is the master entry for indices
|
2018-03-14 02:01:40 +08:00
|
|
|
// that map to the same code. Has boolean value, but int8_t for serialization.
|
|
|
|
int8_t self_normalized_;
|
2016-11-08 07:38:07 +08:00
|
|
|
// The number of elements in use in code_;
|
2018-03-14 02:01:40 +08:00
|
|
|
int32_t length_;
|
2016-11-08 07:38:07 +08:00
|
|
|
// The re-encoded form of the unichar-id to which this RecodedCharID relates.
|
2018-03-14 02:01:40 +08:00
|
|
|
int32_t code_[kMaxCodeLen];
|
2016-11-08 07:38:07 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
// Class holds a "compression" of a unicharset to simplify the learning problem
// for a neural-network-based classifier.
// Objectives:
// 1 (CJK): Ids of a unicharset with a large number of classes are expressed as
//          a sequence of 3 codes with far fewer values.
//          This is achieved using the Jamo coding for Hangul and the Unicode
//          Radical-Stroke-index for Han.
// 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code
//            as the unicode sequence (but coded in a more compact space).
// 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing
//               and not significantly distinct shapes (quotes) together, i.e.
//               represent the fi ligature as the f-i pair, and fold u+2019 and
//               friends all onto the ascii single quote (').
// 4 The null character and mapping to target activations:
//    To save horizontal coding space, the compressed codes are generally mapped
//    to target network activations without intervening null characters, BUT
//    in the case of ligatures, such as ff, null characters have to be included,
//    so the existence of repeated codes is detected at codebook-building time,
//    and null characters are embedded directly into the codes, so the rest of
//    the system doesn't need to worry about the problem (much). There is still
//    an effect on the range of ways in which the target activations can be
//    generated.
//
// The computed code values are compact (no unused values), and, for CJK,
// unique (each code position uses a disjoint set of values from each other code
// position). For non-CJK, the same code value CAN be used in multiple
// positions, e.g. the ff ligature is converted to <f> <nullchar> <f>, where <f>
// is the same code as is used for the single f.
|
2020-12-31 21:31:10 +08:00
|
|
|
class TESS_API UnicharCompress {
|
2021-03-13 03:35:02 +08:00
|
|
|
public:
|
2016-11-08 07:38:07 +08:00
|
|
|
UnicharCompress();
|
2021-03-13 03:35:02 +08:00
|
|
|
UnicharCompress(const UnicharCompress &src);
|
2016-11-08 07:38:07 +08:00
|
|
|
~UnicharCompress();
|
2021-03-13 03:35:02 +08:00
|
|
|
UnicharCompress &operator=(const UnicharCompress &src);
|
2016-11-08 07:38:07 +08:00
|
|
|
|
|
|
|
// The 1st Hangul unicode.
|
|
|
|
static const int kFirstHangul = 0xac00;
|
|
|
|
// The number of Hangul unicodes.
|
|
|
|
static const int kNumHangul = 11172;
|
|
|
|
// The number of Jamos for each of the 3 parts of a Hangul character, being
|
|
|
|
// the Leading consonant, Vowel and Trailing consonant.
|
|
|
|
static const int kLCount = 19;
|
|
|
|
static const int kVCount = 21;
|
|
|
|
static const int kTCount = 28;
|
|
|
|
|
|
|
|
// Computes the encoding for the given unicharset. It is a requirement that
|
|
|
|
// the file training/langdata/radical-stroke.txt have been read into the
|
|
|
|
// input string radical_stroke_table.
|
|
|
|
// Returns false if the encoding cannot be constructed.
|
2021-03-15 03:36:20 +08:00
|
|
|
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table);
|
2016-11-08 07:38:07 +08:00
|
|
|
// Sets up an encoder that doesn't change the unichars at all, so it just
|
|
|
|
// passes them through unchanged.
|
2021-03-13 03:35:02 +08:00
|
|
|
void SetupPassThrough(const UNICHARSET &unicharset);
|
2016-11-08 07:38:07 +08:00
|
|
|
// Sets up an encoder directly using the given encoding vector, which maps
|
|
|
|
// unichar_ids to the given codes.
|
2021-03-13 03:35:02 +08:00
|
|
|
void SetupDirect(const std::vector<RecodedCharID> &codes);
|
2016-11-08 07:38:07 +08:00
|
|
|
|
|
|
|
// Returns the number of different values that can be used in a code, ie
|
|
|
|
// 1 + the maximum value that will ever be used by an RecodedCharID code in
|
|
|
|
// any position in its array.
|
2021-03-13 03:35:02 +08:00
|
|
|
int code_range() const {
|
|
|
|
return code_range_;
|
|
|
|
}
|
2016-11-08 07:38:07 +08:00
|
|
|
|
|
|
|
// Encodes a single unichar_id. Returns the length of the code, (or zero if
|
|
|
|
// invalid input), and the encoding itself in code.
|
2021-03-13 03:35:02 +08:00
|
|
|
int EncodeUnichar(int unichar_id, RecodedCharID *code) const;
|
2016-11-08 07:38:07 +08:00
|
|
|
// Decodes code, returning the original unichar-id, or
|
2017-07-26 00:40:44 +08:00
|
|
|
// INVALID_UNICHAR_ID if the input is invalid.
|
2021-03-13 03:35:02 +08:00
|
|
|
int DecodeUnichar(const RecodedCharID &code) const;
|
2016-11-08 07:38:07 +08:00
|
|
|
// Returns true if the given code is a valid start or single code.
|
2021-03-13 03:35:02 +08:00
|
|
|
bool IsValidFirstCode(int code) const {
|
|
|
|
return is_valid_start_[code];
|
|
|
|
}
|
2016-11-08 07:38:07 +08:00
|
|
|
// Returns a list of valid non-final next codes for a given prefix code,
|
|
|
|
// which may be empty.
|
2021-03-15 16:01:55 +08:00
|
|
|
const std::vector<int> *GetNextCodes(const RecodedCharID &code) const {
|
2016-11-08 07:38:07 +08:00
|
|
|
auto it = next_codes_.find(code);
|
2016-12-13 00:20:28 +08:00
|
|
|
return it == next_codes_.end() ? nullptr : it->second;
|
2016-11-08 07:38:07 +08:00
|
|
|
}
|
|
|
|
// Returns a list of valid final codes for a given prefix code, which may
|
|
|
|
// be empty.
|
2021-03-15 16:01:55 +08:00
|
|
|
const std::vector<int> *GetFinalCodes(const RecodedCharID &code) const {
|
2016-11-08 07:38:07 +08:00
|
|
|
auto it = final_codes_.find(code);
|
2016-12-13 00:20:28 +08:00
|
|
|
return it == final_codes_.end() ? nullptr : it->second;
|
2016-11-08 07:38:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Writes to the given file. Returns false in case of error.
|
2021-03-13 03:35:02 +08:00
|
|
|
bool Serialize(TFile *fp) const;
|
2016-11-08 07:38:07 +08:00
|
|
|
// Reads from the given file. Returns false in case of error.
|
2017-05-04 07:09:44 +08:00
|
|
|
|
2021-03-13 03:35:02 +08:00
|
|
|
bool DeSerialize(TFile *fp);
|
2016-11-08 07:38:07 +08:00
|
|
|
|
2021-03-14 06:15:18 +08:00
|
|
|
// Returns a string containing a text file that describes the encoding thus:
|
2016-11-08 07:38:07 +08:00
|
|
|
// <index>[,<index>]*<tab><UTF8-str><newline>
|
|
|
|
// In words, a comma-separated list of one or more indices, followed by a tab
|
|
|
|
// and the UTF-8 string that the code represents per line. Most simple scripts
|
|
|
|
// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
|
|
|
|
// and the Indic scripts will contain a many-to-many mapping.
|
|
|
|
// See the class comment above for details.
|
2021-03-14 06:15:18 +08:00
|
|
|
std::string GetEncodingAsString(const UNICHARSET &unicharset) const;
|
2016-11-08 07:38:07 +08:00
|
|
|
|
|
|
|
// Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
|
|
|
|
// Note that the returned values are 0-based indices, NOT unicode Jamo.
|
|
|
|
// Returns false if the input is not in the Hangul unicode range.
|
2021-03-13 03:35:02 +08:00
|
|
|
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing);
|
2016-11-08 07:38:07 +08:00
|
|
|
|
2021-03-13 03:35:02 +08:00
|
|
|
private:
|
2016-11-08 07:38:07 +08:00
|
|
|
// Renumbers codes to eliminate unused values.
|
|
|
|
void DefragmentCodeValues(int encoded_null);
|
|
|
|
// Computes the value of code_range_ from the encoder_.
|
|
|
|
void ComputeCodeRange();
|
|
|
|
// Initializes the decoding hash_map from the encoder_ array.
|
|
|
|
void SetupDecoder();
|
|
|
|
// Frees allocated memory.
|
|
|
|
void Cleanup();
|
|
|
|
|
|
|
|
// The encoder that maps a unichar-id to a sequence of small codes.
|
|
|
|
// encoder_ is the only part that is serialized. The rest is computed on load.
|
2021-03-04 03:22:00 +08:00
|
|
|
std::vector<RecodedCharID> encoder_;
|
2016-11-08 07:38:07 +08:00
|
|
|
// Decoder converts the output of encoder back to a unichar-id.
|
2021-03-04 03:22:00 +08:00
|
|
|
std::unordered_map<RecodedCharID, int, RecodedCharID::RecodedCharIDHash> decoder_;
|
2016-11-08 07:38:07 +08:00
|
|
|
// True if the index is a valid single or start code.
|
2021-03-15 16:01:55 +08:00
|
|
|
std::vector<bool> is_valid_start_;
|
2016-11-08 07:38:07 +08:00
|
|
|
// Maps a prefix code to a list of valid next codes.
|
|
|
|
// The map owns the vectors.
|
2021-03-15 16:01:55 +08:00
|
|
|
std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
|
2016-11-08 07:38:07 +08:00
|
|
|
next_codes_;
|
|
|
|
// Maps a prefix code to a list of valid final codes.
|
|
|
|
// The map owns the vectors.
|
2021-03-15 16:01:55 +08:00
|
|
|
std::unordered_map<RecodedCharID, std::vector<int> *, RecodedCharID::RecodedCharIDHash>
|
2016-11-08 07:38:07 +08:00
|
|
|
final_codes_;
|
|
|
|
// Max of any value in encoder_ + 1.
|
|
|
|
int code_range_;
|
|
|
|
};
|
|
|
|
|
2021-03-13 03:35:02 +08:00
|
|
|
} // namespace tesseract.
|
2016-11-08 07:38:07 +08:00
|
|
|
|
2021-03-13 03:35:02 +08:00
|
|
|
#endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_
|