mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-12 21:53:25 +08:00
Changed the way unicharsets are handled to allow support for the ™ character. Can find the issue where it was requested.
This commit is contained in:
parent
4efc539f51
commit
b0ead95d64
@ -24,6 +24,7 @@
|
||||
|
||||
#include "ratngs.h"
|
||||
|
||||
#include <string>
|
||||
#include "blobs.h"
|
||||
#include "callcpp.h"
|
||||
#include "genericvector.h"
|
||||
@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
|
||||
: unicharset_(&unicharset){
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
GenericVector<char> lengths;
|
||||
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
|
||||
string cleaned = unicharset.CleanupString(src_string);
|
||||
if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
|
||||
NULL)) {
|
||||
lengths.push_back('\0');
|
||||
STRING src_lengths = &lengths[0];
|
||||
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
|
||||
this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
|
||||
} else { // There must have been an invalid unichar in the string.
|
||||
this->init(8);
|
||||
this->make_bad();
|
||||
|
@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
|
||||
// Insert the corresponding correct ngram into the unicharset.
|
||||
// Unicharset code assumes that the "base" ngram is inserted into
|
||||
// the unicharset before fragments of this ngram are inserted.
|
||||
unicharset->unichar_insert(replacement_string);
|
||||
unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
|
||||
ambig_spec->correct_ngram_id =
|
||||
unicharset->unichar_to_id(replacement_string);
|
||||
if (replacement_ambig_part_size > 1) {
|
||||
@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
|
||||
} else {
|
||||
STRING frag_str = CHAR_FRAGMENT::to_string(
|
||||
replacement_string, i, test_ambig_part_size, false);
|
||||
unicharset->unichar_insert(frag_str.string());
|
||||
unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
|
||||
unichar_id = unicharset->unichar_to_id(frag_str.string());
|
||||
}
|
||||
ambig_spec->correct_fragments[i] = unichar_id;
|
||||
|
@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
|
||||
direct_set.clear();
|
||||
radicals.clear();
|
||||
// Always keep space as 0;
|
||||
direct_set.unichar_insert(" ");
|
||||
direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
|
||||
// Null char is next if we have one.
|
||||
if (null_id >= 0) {
|
||||
direct_set.unichar_insert(kNullChar);
|
||||
@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
|
||||
if (it != radical_map.end()) {
|
||||
// This is Han. Convert to radical, stroke, index.
|
||||
if (!radicals.contains_unichar(it->second.radical.string())) {
|
||||
radicals.unichar_insert(it->second.radical.string());
|
||||
radicals.unichar_insert(it->second.radical.string(),
|
||||
OldUncleanUnichars::kTrue);
|
||||
}
|
||||
int radical = radicals.unichar_to_id(it->second.radical.string());
|
||||
int num_strokes = it->second.num_strokes;
|
||||
|
@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
|
||||
delete[] nodes;
|
||||
}
|
||||
|
||||
// Search the given unichar representation in the tree. Each character in the
|
||||
// string is interpreted as an index in an array of nodes.
|
||||
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
|
||||
const char* current_char = unichar_repr;
|
||||
UNICHARMAP_NODE* current_nodes = nodes;
|
||||
|
||||
assert(*unichar_repr != '\0');
|
||||
|
||||
do {
|
||||
if (*(current_char + 1) == '\0')
|
||||
return current_nodes[static_cast<unsigned char>(*current_char)].id;
|
||||
current_nodes =
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
||||
++current_char;
|
||||
} while (true);
|
||||
}
|
||||
|
||||
// Search the given unichar representation in the tree, using length characters
|
||||
// from it maximum. Each character in the string is interpreted as an index in
|
||||
// an array of nodes.
|
||||
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
|
||||
int length) const {
|
||||
const char* current_char = unichar_repr;
|
||||
UNICHARMAP_NODE* current_nodes = nodes;
|
||||
|
||||
assert(*unichar_repr != '\0');
|
||||
assert(length > 0 && length <= UNICHAR_LEN);
|
||||
|
||||
int index = 0;
|
||||
if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
|
||||
do {
|
||||
if (length == 1 || *(current_char + 1) == '\0')
|
||||
return current_nodes[static_cast<unsigned char>(*current_char)].id;
|
||||
if (index + 1 >= length || unichar_repr[index + 1] == '\0')
|
||||
return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
|
||||
current_nodes =
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
||||
++current_char;
|
||||
--length;
|
||||
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
|
||||
++index;
|
||||
} while (true);
|
||||
}
|
||||
|
||||
@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
|
||||
// string is interpreted as an index in an array of nodes.
|
||||
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
|
||||
const char* current_char = unichar_repr;
|
||||
if (*current_char == '\0') return;
|
||||
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
|
||||
|
||||
assert(*unichar_repr != '\0');
|
||||
assert(id >= 0);
|
||||
|
||||
do {
|
||||
if (*current_nodes_pointer == 0)
|
||||
*current_nodes_pointer = new UNICHARMAP_NODE[256];
|
||||
if (*(current_char + 1) == '\0') {
|
||||
if (current_char[1] == '\0') {
|
||||
(*current_nodes_pointer)
|
||||
[static_cast<unsigned char>(*current_char)].id = id;
|
||||
return;
|
||||
@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
|
||||
} while (true);
|
||||
}
|
||||
|
||||
// Search the given unichar representation in the tree. Each character in the
|
||||
// string is interpreted as an index in an array of nodes. Stop once the tree
|
||||
// does not have anymore nodes or once we found the right unichar_repr.
|
||||
bool UNICHARMAP::contains(const char* const unichar_repr) const {
|
||||
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
|
||||
|
||||
const char* current_char = unichar_repr;
|
||||
UNICHARMAP_NODE* current_nodes = nodes;
|
||||
|
||||
while (current_nodes != 0 && *(current_char + 1) != '\0') {
|
||||
current_nodes =
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
||||
++current_char;
|
||||
}
|
||||
return current_nodes != 0 && *(current_char + 1) == '\0' &&
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
|
||||
}
|
||||
|
||||
// Search the given unichar representation in the tree, using length characters
|
||||
// from it maximum. Each character in the string is interpreted as an index in
|
||||
// an array of nodes. Stop once the tree does not have anymore nodes or once we
|
||||
@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
|
||||
int length) const {
|
||||
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
|
||||
if (length <= 0 || length > UNICHAR_LEN) return false;
|
||||
|
||||
const char* current_char = unichar_repr;
|
||||
int index = 0;
|
||||
if (index >= length || unichar_repr[index] == '\0') return false;
|
||||
UNICHARMAP_NODE* current_nodes = nodes;
|
||||
|
||||
while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
|
||||
while (current_nodes != 0 && index + 1 < length &&
|
||||
unichar_repr[index + 1] != '\0') {
|
||||
current_nodes =
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
||||
--length;
|
||||
++current_char;
|
||||
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
|
||||
++index;
|
||||
}
|
||||
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
|
||||
return current_nodes != 0 &&
|
||||
(index + 1 >= length || unichar_repr[index + 1] == '\0') &&
|
||||
current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
|
||||
}
|
||||
|
||||
// Return the minimum number of characters that must be used from this string
|
||||
// to obtain a match in the UNICHARMAP.
|
||||
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
|
||||
const char* current_char = unichar_repr;
|
||||
if (*current_char == '\0') return 0;
|
||||
UNICHARMAP_NODE* current_nodes = nodes;
|
||||
|
||||
while (current_nodes != NULL && *current_char != '\0') {
|
||||
|
@ -36,21 +36,12 @@ class UNICHARMAP {
|
||||
// with the given id. The length of the representation MUST be non-zero.
|
||||
void insert(const char* const unichar_repr, UNICHAR_ID id);
|
||||
|
||||
// Return the id associated with the given unichar representation,
|
||||
// this representation MUST exist within the UNICHARMAP.
|
||||
// The length of the representation MUST be non-zero.
|
||||
UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
|
||||
|
||||
// Return the id associated with the given unichar representation,
|
||||
// this representation MUST exist within the UNICHARMAP. The first
|
||||
// length characters (maximum) from unichar_repr are used. The length
|
||||
// MUST be non-zero.
|
||||
UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
|
||||
|
||||
// Return true if the given unichar representation is already present in the
|
||||
// UNICHARMAP. The length of the representation MUST be non-zero.
|
||||
bool contains(const char* const unichar_repr) const;
|
||||
|
||||
// Return true if the given unichar representation is already present in the
|
||||
// UNICHARMAP. The first length characters (maximum) from unichar_repr are
|
||||
// used. The length MUST be non-zero.
|
||||
|
@ -67,6 +67,15 @@ const char* UNICHARSET::kCustomLigatures[][2] = {
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
// List of mappings to make when ingesting strings from the outside.
|
||||
// The substitutions clean up text that should exist for rendering of
|
||||
// synthetic data, but not in the recognition set.
|
||||
const char* UNICHARSET::kCleanupMaps[][2] = {
|
||||
{"\u0640", ""}, // TATWEEL is deleted.
|
||||
{"\ufb01", "fi"}, // fi ligature->fi pair.
|
||||
{"\ufb02", "fl"}, // fl ligature->fl pair.
|
||||
{nullptr, nullptr}};
|
||||
|
||||
// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
|
||||
const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
|
||||
" ",
|
||||
@ -196,15 +205,21 @@ void UNICHARSET::reserve(int unichars_number) {
|
||||
|
||||
UNICHAR_ID
|
||||
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
|
||||
return ids.contains(unichar_repr) ?
|
||||
ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
|
||||
string cleaned =
|
||||
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
||||
return ids.contains(cleaned.data(), cleaned.size())
|
||||
? ids.unichar_to_id(cleaned.data(), cleaned.size())
|
||||
: INVALID_UNICHAR_ID;
|
||||
}
|
||||
|
||||
UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
|
||||
int length) const {
|
||||
assert(length > 0 && length <= UNICHAR_LEN);
|
||||
return ids.contains(unichar_repr, length) ?
|
||||
ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
|
||||
string cleaned(unichar_repr, length);
|
||||
if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
|
||||
return ids.contains(cleaned.data(), cleaned.size())
|
||||
? ids.unichar_to_id(cleaned.data(), cleaned.size())
|
||||
: INVALID_UNICHAR_ID;
|
||||
}
|
||||
|
||||
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
||||
@ -235,6 +250,9 @@ bool UNICHARSET::encodable_string(const char *str,
|
||||
// the rest of the string is still encoded.
|
||||
// If lengths is not NULL, then it is filled with the corresponding
|
||||
// byte length of each encoded UNICHAR_ID.
|
||||
// WARNING: Caller must guarantee that str has already been cleaned of codes
|
||||
// that do not belong in the unicharset, or encoding may fail.
|
||||
// Use CleanupString to perform the cleaning.
|
||||
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
|
||||
GenericVector<UNICHAR_ID>* encoding,
|
||||
GenericVector<char>* lengths,
|
||||
@ -429,7 +447,7 @@ void UNICHARSET::CopyFrom(const UNICHARSET& src) {
|
||||
for (int ch = 0; ch < src.size_used; ++ch) {
|
||||
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
||||
const char* utf8 = src.id_to_unichar(ch);
|
||||
unichar_insert(utf8);
|
||||
unichar_insert_backwards_compatible(utf8);
|
||||
unichars[ch].properties.ExpandRangesFrom(src_props);
|
||||
}
|
||||
// Set properties, including mirror and other_case, WITHOUT reordering
|
||||
@ -445,24 +463,13 @@ void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
|
||||
for (int ch = 0; ch < src.size_used; ++ch) {
|
||||
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
||||
const char* utf8 = src.id_to_unichar(ch);
|
||||
if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
|
||||
// Only use fully valid entries.
|
||||
tprintf("Bad properties for index %d, char %s: "
|
||||
"%d,%d %d,%d %g,%g %g,%g %g,%g\n",
|
||||
ch, utf8, src_props.min_bottom, src_props.max_bottom,
|
||||
src_props.min_top, src_props.max_top,
|
||||
src_props.width, src_props.width_sd,
|
||||
src_props.bearing, src_props.bearing_sd,
|
||||
src_props.advance, src_props.advance_sd);
|
||||
continue;
|
||||
}
|
||||
int id = size_used;
|
||||
if (contains_unichar(utf8)) {
|
||||
id = unichar_to_id(utf8);
|
||||
// Just expand current ranges.
|
||||
unichars[id].properties.ExpandRangesFrom(src_props);
|
||||
} else {
|
||||
unichar_insert(utf8);
|
||||
unichar_insert_backwards_compatible(utf8);
|
||||
unichars[id].properties.SetRangesEmpty();
|
||||
}
|
||||
}
|
||||
@ -613,40 +620,55 @@ char UNICHARSET::get_chartype(UNICHAR_ID id) const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
||||
if (!ids.contains(unichar_repr)) {
|
||||
if (strlen(unichar_repr) > UNICHAR_LEN) {
|
||||
fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
|
||||
int(strlen(unichar_repr)), unichar_repr);
|
||||
void UNICHARSET::unichar_insert(const char* const unichar_repr,
|
||||
OldUncleanUnichars old_style) {
|
||||
if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
|
||||
string cleaned =
|
||||
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
||||
if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
|
||||
const char* str = cleaned.c_str();
|
||||
GenericVector<int> encoding;
|
||||
if (!old_style_included_ &&
|
||||
encode_string(str, true, &encoding, nullptr, nullptr))
|
||||
return;
|
||||
}
|
||||
if (size_used == size_reserved) {
|
||||
if (size_used == 0)
|
||||
reserve(8);
|
||||
else
|
||||
reserve(2 * size_used);
|
||||
}
|
||||
|
||||
strcpy(unichars[size_used].representation, unichar_repr);
|
||||
int index = 0;
|
||||
do {
|
||||
if (index > UNICHAR_LEN) {
|
||||
fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
|
||||
unichar_repr);
|
||||
return;
|
||||
}
|
||||
unichars[size_used].representation[index++] = *str++;
|
||||
} while (*str != '\0');
|
||||
unichars[size_used].representation[index] = '\0';
|
||||
this->set_script(size_used, null_script);
|
||||
// If the given unichar_repr represents a fragmented character, set
|
||||
// fragment property to a pointer to CHAR_FRAGMENT class instance with
|
||||
// information parsed from the unichar representation. Use the script
|
||||
// of the base unichar for the fragmented character if possible.
|
||||
CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
|
||||
CHAR_FRAGMENT* frag =
|
||||
CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
|
||||
this->unichars[size_used].properties.fragment = frag;
|
||||
if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
|
||||
this->unichars[size_used].properties.script_id =
|
||||
this->get_script(frag->get_unichar());
|
||||
}
|
||||
this->unichars[size_used].properties.enabled = true;
|
||||
ids.insert(unichar_repr, size_used);
|
||||
ids.insert(unichars[size_used].representation, size_used);
|
||||
++size_used;
|
||||
}
|
||||
}
|
||||
|
||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
|
||||
return ids.contains(unichar_repr);
|
||||
string cleaned =
|
||||
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
||||
return ids.contains(cleaned.data(), cleaned.size());
|
||||
}
|
||||
|
||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
|
||||
@ -654,7 +676,9 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr,
|
||||
if (length == 0) {
|
||||
return false;
|
||||
}
|
||||
return ids.contains(unichar_repr, length);
|
||||
string cleaned(unichar_repr, length);
|
||||
if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
|
||||
return ids.contains(cleaned.data(), cleaned.size());
|
||||
}
|
||||
|
||||
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
|
||||
@ -840,7 +864,7 @@ bool UNICHARSET::load_via_fgets(
|
||||
if (strcmp(unichar, "NULL") == 0)
|
||||
this->unichar_insert(" ");
|
||||
else
|
||||
this->unichar_insert(unichar);
|
||||
this->unichar_insert_backwards_compatible(unichar);
|
||||
|
||||
this->set_isalpha(id, properties & ISALPHA_MASK);
|
||||
this->set_islower(id, properties & ISLOWER_MASK);
|
||||
@ -1088,3 +1112,32 @@ int UNICHARSET::get_script_id_from_name(const char* script_name) const {
|
||||
}
|
||||
return 0; // 0 is always the null_script
|
||||
}
|
||||
|
||||
// Removes/replaces content that belongs in rendered text, but not in the
|
||||
// unicharset.
|
||||
/* static */
|
||||
string UNICHARSET::CleanupString(const char* utf8_str, int length) {
|
||||
string result;
|
||||
result.reserve(length);
|
||||
char ch;
|
||||
while ((ch = *utf8_str) != '\0' && --length >= 0) {
|
||||
int key_index = 0;
|
||||
const char* key;
|
||||
while ((key = kCleanupMaps[key_index][0]) != nullptr) {
|
||||
int match = 0;
|
||||
while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
|
||||
if (key[match] == '\0') {
|
||||
utf8_str += match;
|
||||
break;
|
||||
}
|
||||
++key_index;
|
||||
}
|
||||
if (key == nullptr) {
|
||||
result.push_back(ch);
|
||||
++utf8_str;
|
||||
} else {
|
||||
result.append(kCleanupMaps[key_index][1]);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -39,6 +39,13 @@ enum SpecialUnicharCodes {
|
||||
SPECIAL_UNICHAR_CODES_COUNT
|
||||
};
|
||||
|
||||
// Boolean flag for unichar_insert. It's a bit of a double negative to allow
|
||||
// the default value to be false.
|
||||
enum class OldUncleanUnichars {
|
||||
kFalse,
|
||||
kTrue,
|
||||
};
|
||||
|
||||
class CHAR_FRAGMENT {
|
||||
public:
|
||||
// Minimum number of characters used for fragment representation.
|
||||
@ -190,7 +197,7 @@ class UNICHARSET {
|
||||
// Use encode_string in preference to repeatedly calling step.
|
||||
int step(const char* str) const;
|
||||
|
||||
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
|
||||
// Returns true if the given UTF-8 string is encodable with this UNICHARSET.
|
||||
// If not encodable, write the first byte offset which cannot be converted
|
||||
// into the second (return) argument.
|
||||
bool encodable_string(const char *str, int *first_bad_position) const;
|
||||
@ -207,6 +214,9 @@ class UNICHARSET {
|
||||
// If encoded_length is not NULL then on return it contains the length of
|
||||
// str that was encoded. (if give_up_on_failure the location of the first
|
||||
// failure, otherwise strlen(str).)
|
||||
// WARNING: Caller must guarantee that str has already been cleaned of codes
|
||||
// that do not belong in the unicharset, or encoding may fail.
|
||||
// Use CleanupString to perform the cleaning.
|
||||
bool encode_string(const char* str, bool give_up_on_failure,
|
||||
GenericVector<UNICHAR_ID>* encoding,
|
||||
GenericVector<char>* lengths,
|
||||
@ -226,6 +236,13 @@ class UNICHARSET {
|
||||
// by its hex unicodes.
|
||||
static STRING debug_utf8_str(const char* str);
|
||||
|
||||
// Removes/replaces content that belongs in rendered text, but not in the
|
||||
// unicharset.
|
||||
static string CleanupString(const char* utf8_str) {
|
||||
return CleanupString(utf8_str, strlen(utf8_str));
|
||||
}
|
||||
static string CleanupString(const char* utf8_str, int length);
|
||||
|
||||
// Return a STRING containing debug information on the unichar, including
|
||||
// the id_to_unichar, its hex unicodes and the properties.
|
||||
STRING debug_str(UNICHAR_ID id) const;
|
||||
@ -233,8 +250,29 @@ class UNICHARSET {
|
||||
return debug_str(unichar_to_id(unichar_repr));
|
||||
}
|
||||
|
||||
// Add a unichar representation to the set.
|
||||
void unichar_insert(const char* const unichar_repr);
|
||||
// Adds a unichar representation to the set. If old_style is true, then
|
||||
// TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
|
||||
// characters are ignored/skipped as if they don't exist and n-grams that
|
||||
// can already be encoded are not added.
|
||||
void unichar_insert(const char* const unichar_repr,
|
||||
OldUncleanUnichars old_style);
|
||||
void unichar_insert(const char* const unichar_repr) {
|
||||
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
|
||||
}
|
||||
// Adds a unichar representation to the set. Avoids setting old_style to true,
|
||||
// unless it is necessary to make the new unichar get added.
|
||||
void unichar_insert_backwards_compatible(const char* const unichar_repr) {
|
||||
string cleaned = CleanupString(unichar_repr);
|
||||
if (cleaned != unichar_repr) {
|
||||
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
||||
} else {
|
||||
int old_size = size();
|
||||
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
|
||||
if (size() == old_size) {
|
||||
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return true if the given unichar id exists within the set.
|
||||
// Relies on the fact that unichar ids are contiguous in the unicharset.
|
||||
@ -282,6 +320,7 @@ class UNICHARSET {
|
||||
top_bottom_set_ = false;
|
||||
script_has_upper_lower_ = false;
|
||||
script_has_xheight_ = false;
|
||||
old_style_included_ = false;
|
||||
null_sid_ = 0;
|
||||
common_sid_ = 0;
|
||||
latin_sid_ = 0;
|
||||
@ -743,7 +782,7 @@ class UNICHARSET {
|
||||
// unichar representation represents a character fragment.
|
||||
const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
|
||||
if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
|
||||
!ids.contains(unichar_repr)) {
|
||||
!ids.contains(unichar_repr, false)) {
|
||||
return NULL;
|
||||
}
|
||||
return get_fragment(unichar_to_id(unichar_repr));
|
||||
@ -965,6 +1004,11 @@ class UNICHARSET {
|
||||
bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
|
||||
bool skip_fragments);
|
||||
|
||||
// List of mappings to make when ingesting strings from the outside.
|
||||
// The substitutions clean up text that should exist for rendering of
|
||||
// synthetic data, but not in the recognition set.
|
||||
static const char* kCleanupMaps[][2];
|
||||
|
||||
UNICHAR_SLOT* unichars;
|
||||
UNICHARMAP ids;
|
||||
int size_used;
|
||||
@ -980,6 +1024,8 @@ class UNICHARSET {
|
||||
// True if the unicharset has a significant mean-line with significant
|
||||
// ascenders above that.
|
||||
bool script_has_xheight_;
|
||||
// True if the set contains chars that would be changed by the cleanup.
|
||||
bool old_style_included_;
|
||||
|
||||
// A few convenient script name-to-id mapping without using hash.
|
||||
// These are initialized when unicharset file is loaded. Anything
|
||||
|
@ -170,6 +170,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
|
||||
tprintf("Training parameters:\n Debug interval = %d,"
|
||||
" weights = %g, learning rate = %g, momentum=%g\n",
|
||||
debug_interval_, weight_range_, learning_rate_, momentum_);
|
||||
tprintf("null char=%d\n", null_char_);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -733,7 +734,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
|
||||
GenericVector<int> internal_labels;
|
||||
labels->truncate(0);
|
||||
if (!simple_text) labels->push_back(null_char);
|
||||
if (unicharset.encode_string(str.string(), true, &internal_labels, NULL,
|
||||
string cleaned = unicharset.CleanupString(str.string());
|
||||
if (unicharset.encode_string(cleaned.c_str(), true, &internal_labels, NULL,
|
||||
&err_index)) {
|
||||
bool success = true;
|
||||
for (int i = 0; i < internal_labels.size(); ++i) {
|
||||
@ -759,8 +761,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
|
||||
if (success) return true;
|
||||
}
|
||||
tprintf("Encoding of string failed! Failure bytes:");
|
||||
while (err_index < str.length()) {
|
||||
tprintf(" %x", str[err_index++]);
|
||||
while (err_index < cleaned.size()) {
|
||||
tprintf(" %x", cleaned[err_index++]);
|
||||
}
|
||||
tprintf("\n");
|
||||
return false;
|
||||
@ -813,8 +815,9 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData* trainingdata,
|
||||
training_iteration() % debug_interval_ == 0;
|
||||
GenericVector<int> truth_labels;
|
||||
if (!EncodeString(trainingdata->transcription(), &truth_labels)) {
|
||||
tprintf("Can't encode transcription: %s\n",
|
||||
trainingdata->transcription().string());
|
||||
tprintf("Can't encode transcription: '%s' in language '%s'\n",
|
||||
trainingdata->transcription().string(),
|
||||
trainingdata->language().string());
|
||||
return UNENCODABLE;
|
||||
}
|
||||
int w = 0;
|
||||
|
@ -409,9 +409,7 @@ using tesseract::SpanUTF8NotWhitespace;
|
||||
using tesseract::SpanUTF8Whitespace;
|
||||
using tesseract::StringRenderer;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
|
||||
|
||||
int Main() {
|
||||
if (FLAGS_list_available_fonts) {
|
||||
const std::vector<string>& all_fonts = FontUtils::ListAvailableFonts();
|
||||
for (unsigned int i = 0; i < all_fonts.size(); ++i) {
|
||||
@ -543,8 +541,9 @@ int main(int argc, char** argv) {
|
||||
const char *curr_pos = str8 + offsets[i].first;
|
||||
int ngram_len = offsets[i].second;
|
||||
// Skip words that contain characters not found in unicharset.
|
||||
string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
|
||||
if (!FLAGS_unicharset_file.empty() &&
|
||||
!unicharset.encodable_string(curr_pos, nullptr)) {
|
||||
!unicharset.encodable_string(cleaned.c_str(), nullptr)) {
|
||||
continue;
|
||||
}
|
||||
rand_utf8.append(curr_pos, ngram_len);
|
||||
@ -665,3 +664,8 @@ int main(int argc, char** argv) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Program entry point: parses the command-line flags, then runs Main() and
// returns its result as the process exit status instead of discarding it.
int main(int argc, char** argv) {
  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
  return Main();
}
|
||||
|
Loading…
Reference in New Issue
Block a user