mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-12 21:53:25 +08:00
Changed the way unicharsets are handled to allow support for the ™ character. Can find the issue where it was requested.
This commit is contained in:
parent
4efc539f51
commit
b0ead95d64
@ -24,6 +24,7 @@
|
|||||||
|
|
||||||
#include "ratngs.h"
|
#include "ratngs.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
#include "blobs.h"
|
#include "blobs.h"
|
||||||
#include "callcpp.h"
|
#include "callcpp.h"
|
||||||
#include "genericvector.h"
|
#include "genericvector.h"
|
||||||
@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
|
|||||||
: unicharset_(&unicharset){
|
: unicharset_(&unicharset){
|
||||||
GenericVector<UNICHAR_ID> encoding;
|
GenericVector<UNICHAR_ID> encoding;
|
||||||
GenericVector<char> lengths;
|
GenericVector<char> lengths;
|
||||||
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
|
string cleaned = unicharset.CleanupString(src_string);
|
||||||
|
if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
|
||||||
|
NULL)) {
|
||||||
lengths.push_back('\0');
|
lengths.push_back('\0');
|
||||||
STRING src_lengths = &lengths[0];
|
STRING src_lengths = &lengths[0];
|
||||||
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
|
this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
|
||||||
} else { // There must have been an invalid unichar in the string.
|
} else { // There must have been an invalid unichar in the string.
|
||||||
this->init(8);
|
this->init(8);
|
||||||
this->make_bad();
|
this->make_bad();
|
||||||
|
@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
|
|||||||
// Insert the corresponding correct ngram into the unicharset.
|
// Insert the corresponding correct ngram into the unicharset.
|
||||||
// Unicharset code assumes that the "base" ngram is inserted into
|
// Unicharset code assumes that the "base" ngram is inserted into
|
||||||
// the unicharset before fragments of this ngram are inserted.
|
// the unicharset before fragments of this ngram are inserted.
|
||||||
unicharset->unichar_insert(replacement_string);
|
unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
|
||||||
ambig_spec->correct_ngram_id =
|
ambig_spec->correct_ngram_id =
|
||||||
unicharset->unichar_to_id(replacement_string);
|
unicharset->unichar_to_id(replacement_string);
|
||||||
if (replacement_ambig_part_size > 1) {
|
if (replacement_ambig_part_size > 1) {
|
||||||
@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
|
|||||||
} else {
|
} else {
|
||||||
STRING frag_str = CHAR_FRAGMENT::to_string(
|
STRING frag_str = CHAR_FRAGMENT::to_string(
|
||||||
replacement_string, i, test_ambig_part_size, false);
|
replacement_string, i, test_ambig_part_size, false);
|
||||||
unicharset->unichar_insert(frag_str.string());
|
unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
|
||||||
unichar_id = unicharset->unichar_to_id(frag_str.string());
|
unichar_id = unicharset->unichar_to_id(frag_str.string());
|
||||||
}
|
}
|
||||||
ambig_spec->correct_fragments[i] = unichar_id;
|
ambig_spec->correct_fragments[i] = unichar_id;
|
||||||
|
@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
|
|||||||
direct_set.clear();
|
direct_set.clear();
|
||||||
radicals.clear();
|
radicals.clear();
|
||||||
// Always keep space as 0;
|
// Always keep space as 0;
|
||||||
direct_set.unichar_insert(" ");
|
direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
|
||||||
// Null char is next if we have one.
|
// Null char is next if we have one.
|
||||||
if (null_id >= 0) {
|
if (null_id >= 0) {
|
||||||
direct_set.unichar_insert(kNullChar);
|
direct_set.unichar_insert(kNullChar);
|
||||||
@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
|
|||||||
if (it != radical_map.end()) {
|
if (it != radical_map.end()) {
|
||||||
// This is Han. Convert to radical, stroke, index.
|
// This is Han. Convert to radical, stroke, index.
|
||||||
if (!radicals.contains_unichar(it->second.radical.string())) {
|
if (!radicals.contains_unichar(it->second.radical.string())) {
|
||||||
radicals.unichar_insert(it->second.radical.string());
|
radicals.unichar_insert(it->second.radical.string(),
|
||||||
|
OldUncleanUnichars::kTrue);
|
||||||
}
|
}
|
||||||
int radical = radicals.unichar_to_id(it->second.radical.string());
|
int radical = radicals.unichar_to_id(it->second.radical.string());
|
||||||
int num_strokes = it->second.num_strokes;
|
int num_strokes = it->second.num_strokes;
|
||||||
|
@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
|
|||||||
delete[] nodes;
|
delete[] nodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Search the given unichar representation in the tree. Each character in the
|
|
||||||
// string is interpreted as an index in an array of nodes.
|
|
||||||
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
|
|
||||||
const char* current_char = unichar_repr;
|
|
||||||
UNICHARMAP_NODE* current_nodes = nodes;
|
|
||||||
|
|
||||||
assert(*unichar_repr != '\0');
|
|
||||||
|
|
||||||
do {
|
|
||||||
if (*(current_char + 1) == '\0')
|
|
||||||
return current_nodes[static_cast<unsigned char>(*current_char)].id;
|
|
||||||
current_nodes =
|
|
||||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
|
||||||
++current_char;
|
|
||||||
} while (true);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Search the given unichar representation in the tree, using length characters
|
// Search the given unichar representation in the tree, using length characters
|
||||||
// from it maximum. Each character in the string is interpreted as an index in
|
// from it maximum. Each character in the string is interpreted as an index in
|
||||||
// an array of nodes.
|
// an array of nodes.
|
||||||
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
|
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
|
||||||
int length) const {
|
int length) const {
|
||||||
const char* current_char = unichar_repr;
|
|
||||||
UNICHARMAP_NODE* current_nodes = nodes;
|
UNICHARMAP_NODE* current_nodes = nodes;
|
||||||
|
|
||||||
assert(*unichar_repr != '\0');
|
assert(*unichar_repr != '\0');
|
||||||
assert(length > 0 && length <= UNICHAR_LEN);
|
assert(length > 0 && length <= UNICHAR_LEN);
|
||||||
|
|
||||||
|
int index = 0;
|
||||||
|
if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
|
||||||
do {
|
do {
|
||||||
if (length == 1 || *(current_char + 1) == '\0')
|
if (index + 1 >= length || unichar_repr[index + 1] == '\0')
|
||||||
return current_nodes[static_cast<unsigned char>(*current_char)].id;
|
return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
|
||||||
current_nodes =
|
current_nodes =
|
||||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
|
||||||
++current_char;
|
++index;
|
||||||
--length;
|
|
||||||
} while (true);
|
} while (true);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
|
|||||||
// string is interpreted as an index in an array of nodes.
|
// string is interpreted as an index in an array of nodes.
|
||||||
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
|
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
|
||||||
const char* current_char = unichar_repr;
|
const char* current_char = unichar_repr;
|
||||||
|
if (*current_char == '\0') return;
|
||||||
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
|
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
|
||||||
|
|
||||||
assert(*unichar_repr != '\0');
|
|
||||||
assert(id >= 0);
|
|
||||||
|
|
||||||
do {
|
do {
|
||||||
if (*current_nodes_pointer == 0)
|
if (*current_nodes_pointer == 0)
|
||||||
*current_nodes_pointer = new UNICHARMAP_NODE[256];
|
*current_nodes_pointer = new UNICHARMAP_NODE[256];
|
||||||
if (*(current_char + 1) == '\0') {
|
if (current_char[1] == '\0') {
|
||||||
(*current_nodes_pointer)
|
(*current_nodes_pointer)
|
||||||
[static_cast<unsigned char>(*current_char)].id = id;
|
[static_cast<unsigned char>(*current_char)].id = id;
|
||||||
return;
|
return;
|
||||||
@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
|
|||||||
} while (true);
|
} while (true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Search the given unichar representation in the tree. Each character in the
|
|
||||||
// string is interpreted as an index in an array of nodes. Stop once the tree
|
|
||||||
// does not have anymore nodes or once we found the right unichar_repr.
|
|
||||||
bool UNICHARMAP::contains(const char* const unichar_repr) const {
|
|
||||||
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
|
|
||||||
|
|
||||||
const char* current_char = unichar_repr;
|
|
||||||
UNICHARMAP_NODE* current_nodes = nodes;
|
|
||||||
|
|
||||||
while (current_nodes != 0 && *(current_char + 1) != '\0') {
|
|
||||||
current_nodes =
|
|
||||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
|
||||||
++current_char;
|
|
||||||
}
|
|
||||||
return current_nodes != 0 && *(current_char + 1) == '\0' &&
|
|
||||||
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Search the given unichar representation in the tree, using length characters
|
// Search the given unichar representation in the tree, using length characters
|
||||||
// from it maximum. Each character in the string is interpreted as an index in
|
// from it maximum. Each character in the string is interpreted as an index in
|
||||||
// an array of nodes. Stop once the tree does not have anymore nodes or once we
|
// an array of nodes. Stop once the tree does not have anymore nodes or once we
|
||||||
@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
|
|||||||
int length) const {
|
int length) const {
|
||||||
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
|
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
|
||||||
if (length <= 0 || length > UNICHAR_LEN) return false;
|
if (length <= 0 || length > UNICHAR_LEN) return false;
|
||||||
|
int index = 0;
|
||||||
const char* current_char = unichar_repr;
|
if (index >= length || unichar_repr[index] == '\0') return false;
|
||||||
UNICHARMAP_NODE* current_nodes = nodes;
|
UNICHARMAP_NODE* current_nodes = nodes;
|
||||||
|
|
||||||
while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
|
while (current_nodes != 0 && index + 1 < length &&
|
||||||
|
unichar_repr[index + 1] != '\0') {
|
||||||
current_nodes =
|
current_nodes =
|
||||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
|
||||||
--length;
|
++index;
|
||||||
++current_char;
|
|
||||||
}
|
}
|
||||||
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
|
return current_nodes != 0 &&
|
||||||
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
|
(index + 1 >= length || unichar_repr[index + 1] == '\0') &&
|
||||||
|
current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the minimum number of characters that must be used from this string
|
// Return the minimum number of characters that must be used from this string
|
||||||
// to obtain a match in the UNICHARMAP.
|
// to obtain a match in the UNICHARMAP.
|
||||||
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
|
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
|
||||||
const char* current_char = unichar_repr;
|
const char* current_char = unichar_repr;
|
||||||
|
if (*current_char == '\0') return 0;
|
||||||
UNICHARMAP_NODE* current_nodes = nodes;
|
UNICHARMAP_NODE* current_nodes = nodes;
|
||||||
|
|
||||||
while (current_nodes != NULL && *current_char != '\0') {
|
while (current_nodes != NULL && *current_char != '\0') {
|
||||||
|
@ -36,21 +36,12 @@ class UNICHARMAP {
|
|||||||
// with the given id. The length of the representation MUST be non-zero.
|
// with the given id. The length of the representation MUST be non-zero.
|
||||||
void insert(const char* const unichar_repr, UNICHAR_ID id);
|
void insert(const char* const unichar_repr, UNICHAR_ID id);
|
||||||
|
|
||||||
// Return the id associated with the given unichar representation,
|
|
||||||
// this representation MUST exist within the UNICHARMAP.
|
|
||||||
// The length of the representation MUST be non-zero.
|
|
||||||
UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
|
|
||||||
|
|
||||||
// Return the id associated with the given unichar representation,
|
// Return the id associated with the given unichar representation,
|
||||||
// this representation MUST exist within the UNICHARMAP. The first
|
// this representation MUST exist within the UNICHARMAP. The first
|
||||||
// length characters (maximum) from unichar_repr are used. The length
|
// length characters (maximum) from unichar_repr are used. The length
|
||||||
// MUST be non-zero.
|
// MUST be non-zero.
|
||||||
UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
|
UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
|
||||||
|
|
||||||
// Return true if the given unichar representation is already present in the
|
|
||||||
// UNICHARMAP. The length of the representation MUST be non-zero.
|
|
||||||
bool contains(const char* const unichar_repr) const;
|
|
||||||
|
|
||||||
// Return true if the given unichar representation is already present in the
|
// Return true if the given unichar representation is already present in the
|
||||||
// UNICHARMAP. The first length characters (maximum) from unichar_repr are
|
// UNICHARMAP. The first length characters (maximum) from unichar_repr are
|
||||||
// used. The length MUST be non-zero.
|
// used. The length MUST be non-zero.
|
||||||
|
@ -67,6 +67,15 @@ const char* UNICHARSET::kCustomLigatures[][2] = {
|
|||||||
{NULL, NULL}
|
{NULL, NULL}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// List of mappings to make when ingesting strings from the outside.
|
||||||
|
// The substitutions clean up text that should exist for rendering of
|
||||||
|
// synthetic data, but not in the recognition set.
|
||||||
|
const char* UNICHARSET::kCleanupMaps[][2] = {
|
||||||
|
{"\u0640", ""}, // TATWEEL is deleted.
|
||||||
|
{"\ufb01", "fi"}, // fi ligature->fi pair.
|
||||||
|
{"\ufb02", "fl"}, // fl ligature->fl pair.
|
||||||
|
{nullptr, nullptr}};
|
||||||
|
|
||||||
// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
|
// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
|
||||||
const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
|
const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
|
||||||
" ",
|
" ",
|
||||||
@ -196,15 +205,21 @@ void UNICHARSET::reserve(int unichars_number) {
|
|||||||
|
|
||||||
UNICHAR_ID
|
UNICHAR_ID
|
||||||
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
|
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
|
||||||
return ids.contains(unichar_repr) ?
|
string cleaned =
|
||||||
ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
|
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
||||||
|
return ids.contains(cleaned.data(), cleaned.size())
|
||||||
|
? ids.unichar_to_id(cleaned.data(), cleaned.size())
|
||||||
|
: INVALID_UNICHAR_ID;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the id of the unichar held in the first length bytes (at most) of
// unichar_repr, or INVALID_UNICHAR_ID if it is not in the set. Unless old-style
// unichars have been inserted, the bytes are passed through CleanupString
// before the lookup.
UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                     int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
  // length is only an upper bound, so find the real extent first and never
  // copy bytes that lie beyond the terminating NUL.
  int used = 0;
  while (used < length && unichar_repr[used] != '\0') ++used;
  const string cleaned = old_style_included_
                             ? string(unichar_repr, used)
                             : CleanupString(unichar_repr, used);
  return ids.contains(cleaned.data(), cleaned.size())
             ? ids.unichar_to_id(cleaned.data(), cleaned.size())
             : INVALID_UNICHAR_ID;
}
|
||||||
|
|
||||||
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
||||||
@ -235,6 +250,9 @@ bool UNICHARSET::encodable_string(const char *str,
|
|||||||
// the rest of the string is still encoded.
|
// the rest of the string is still encoded.
|
||||||
// If lengths is not NULL, then it is filled with the corresponding
|
// If lengths is not NULL, then it is filled with the corresponding
|
||||||
// byte length of each encoded UNICHAR_ID.
|
// byte length of each encoded UNICHAR_ID.
|
||||||
|
// WARNING: Caller must guarantee that str has already been cleaned of codes
|
||||||
|
// that do not belong in the unicharset, or encoding may fail.
|
||||||
|
// Use CleanupString to perform the cleaning.
|
||||||
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
|
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
|
||||||
GenericVector<UNICHAR_ID>* encoding,
|
GenericVector<UNICHAR_ID>* encoding,
|
||||||
GenericVector<char>* lengths,
|
GenericVector<char>* lengths,
|
||||||
@ -429,7 +447,7 @@ void UNICHARSET::CopyFrom(const UNICHARSET& src) {
|
|||||||
for (int ch = 0; ch < src.size_used; ++ch) {
|
for (int ch = 0; ch < src.size_used; ++ch) {
|
||||||
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
||||||
const char* utf8 = src.id_to_unichar(ch);
|
const char* utf8 = src.id_to_unichar(ch);
|
||||||
unichar_insert(utf8);
|
unichar_insert_backwards_compatible(utf8);
|
||||||
unichars[ch].properties.ExpandRangesFrom(src_props);
|
unichars[ch].properties.ExpandRangesFrom(src_props);
|
||||||
}
|
}
|
||||||
// Set properties, including mirror and other_case, WITHOUT reordering
|
// Set properties, including mirror and other_case, WITHOUT reordering
|
||||||
@ -445,24 +463,13 @@ void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
|
|||||||
for (int ch = 0; ch < src.size_used; ++ch) {
|
for (int ch = 0; ch < src.size_used; ++ch) {
|
||||||
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
||||||
const char* utf8 = src.id_to_unichar(ch);
|
const char* utf8 = src.id_to_unichar(ch);
|
||||||
if (ch >= SPECIAL_UNICHAR_CODES_COUNT && src_props.AnyRangeEmpty()) {
|
|
||||||
// Only use fully valid entries.
|
|
||||||
tprintf("Bad properties for index %d, char %s: "
|
|
||||||
"%d,%d %d,%d %g,%g %g,%g %g,%g\n",
|
|
||||||
ch, utf8, src_props.min_bottom, src_props.max_bottom,
|
|
||||||
src_props.min_top, src_props.max_top,
|
|
||||||
src_props.width, src_props.width_sd,
|
|
||||||
src_props.bearing, src_props.bearing_sd,
|
|
||||||
src_props.advance, src_props.advance_sd);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
int id = size_used;
|
int id = size_used;
|
||||||
if (contains_unichar(utf8)) {
|
if (contains_unichar(utf8)) {
|
||||||
id = unichar_to_id(utf8);
|
id = unichar_to_id(utf8);
|
||||||
// Just expand current ranges.
|
// Just expand current ranges.
|
||||||
unichars[id].properties.ExpandRangesFrom(src_props);
|
unichars[id].properties.ExpandRangesFrom(src_props);
|
||||||
} else {
|
} else {
|
||||||
unichar_insert(utf8);
|
unichar_insert_backwards_compatible(utf8);
|
||||||
unichars[id].properties.SetRangesEmpty();
|
unichars[id].properties.SetRangesEmpty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -613,40 +620,55 @@ char UNICHARSET::get_chartype(UNICHAR_ID id) const {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
void UNICHARSET::unichar_insert(const char* const unichar_repr,
|
||||||
if (!ids.contains(unichar_repr)) {
|
OldUncleanUnichars old_style) {
|
||||||
if (strlen(unichar_repr) > UNICHAR_LEN) {
|
if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
|
||||||
fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
|
string cleaned =
|
||||||
int(strlen(unichar_repr)), unichar_repr);
|
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
||||||
|
if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
|
||||||
|
const char* str = cleaned.c_str();
|
||||||
|
GenericVector<int> encoding;
|
||||||
|
if (!old_style_included_ &&
|
||||||
|
encode_string(str, true, &encoding, nullptr, nullptr))
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
if (size_used == size_reserved) {
|
if (size_used == size_reserved) {
|
||||||
if (size_used == 0)
|
if (size_used == 0)
|
||||||
reserve(8);
|
reserve(8);
|
||||||
else
|
else
|
||||||
reserve(2 * size_used);
|
reserve(2 * size_used);
|
||||||
}
|
}
|
||||||
|
int index = 0;
|
||||||
strcpy(unichars[size_used].representation, unichar_repr);
|
do {
|
||||||
|
if (index > UNICHAR_LEN) {
|
||||||
|
fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
|
||||||
|
unichar_repr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
unichars[size_used].representation[index++] = *str++;
|
||||||
|
} while (*str != '\0');
|
||||||
|
unichars[size_used].representation[index] = '\0';
|
||||||
this->set_script(size_used, null_script);
|
this->set_script(size_used, null_script);
|
||||||
// If the given unichar_repr represents a fragmented character, set
|
// If the given unichar_repr represents a fragmented character, set
|
||||||
// fragment property to a pointer to CHAR_FRAGMENT class instance with
|
// fragment property to a pointer to CHAR_FRAGMENT class instance with
|
||||||
// information parsed from the unichar representation. Use the script
|
// information parsed from the unichar representation. Use the script
|
||||||
// of the base unichar for the fragmented character if possible.
|
// of the base unichar for the fragmented character if possible.
|
||||||
CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
|
CHAR_FRAGMENT* frag =
|
||||||
|
CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
|
||||||
this->unichars[size_used].properties.fragment = frag;
|
this->unichars[size_used].properties.fragment = frag;
|
||||||
if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
|
if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
|
||||||
this->unichars[size_used].properties.script_id =
|
this->unichars[size_used].properties.script_id =
|
||||||
this->get_script(frag->get_unichar());
|
this->get_script(frag->get_unichar());
|
||||||
}
|
}
|
||||||
this->unichars[size_used].properties.enabled = true;
|
this->unichars[size_used].properties.enabled = true;
|
||||||
ids.insert(unichar_repr, size_used);
|
ids.insert(unichars[size_used].representation, size_used);
|
||||||
++size_used;
|
++size_used;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
|
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
|
||||||
return ids.contains(unichar_repr);
|
string cleaned =
|
||||||
|
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
||||||
|
return ids.contains(cleaned.data(), cleaned.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
|
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
|
||||||
@ -654,7 +676,9 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr,
|
|||||||
if (length == 0) {
|
if (length == 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return ids.contains(unichar_repr, length);
|
string cleaned(unichar_repr, length);
|
||||||
|
if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
|
||||||
|
return ids.contains(cleaned.data(), cleaned.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
|
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
|
||||||
@ -840,7 +864,7 @@ bool UNICHARSET::load_via_fgets(
|
|||||||
if (strcmp(unichar, "NULL") == 0)
|
if (strcmp(unichar, "NULL") == 0)
|
||||||
this->unichar_insert(" ");
|
this->unichar_insert(" ");
|
||||||
else
|
else
|
||||||
this->unichar_insert(unichar);
|
this->unichar_insert_backwards_compatible(unichar);
|
||||||
|
|
||||||
this->set_isalpha(id, properties & ISALPHA_MASK);
|
this->set_isalpha(id, properties & ISALPHA_MASK);
|
||||||
this->set_islower(id, properties & ISLOWER_MASK);
|
this->set_islower(id, properties & ISLOWER_MASK);
|
||||||
@ -1088,3 +1112,32 @@ int UNICHARSET::get_script_id_from_name(const char* script_name) const {
|
|||||||
}
|
}
|
||||||
return 0; // 0 is always the null_script
|
return 0; // 0 is always the null_script
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Removes/replaces content that belongs in rendered text, but not in the
|
||||||
|
// unicharset.
|
||||||
|
/* static */
|
||||||
|
string UNICHARSET::CleanupString(const char* utf8_str, int length) {
|
||||||
|
string result;
|
||||||
|
result.reserve(length);
|
||||||
|
char ch;
|
||||||
|
while ((ch = *utf8_str) != '\0' && --length >= 0) {
|
||||||
|
int key_index = 0;
|
||||||
|
const char* key;
|
||||||
|
while ((key = kCleanupMaps[key_index][0]) != nullptr) {
|
||||||
|
int match = 0;
|
||||||
|
while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
|
||||||
|
if (key[match] == '\0') {
|
||||||
|
utf8_str += match;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
++key_index;
|
||||||
|
}
|
||||||
|
if (key == nullptr) {
|
||||||
|
result.push_back(ch);
|
||||||
|
++utf8_str;
|
||||||
|
} else {
|
||||||
|
result.append(kCleanupMaps[key_index][1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
@ -39,6 +39,13 @@ enum SpecialUnicharCodes {
|
|||||||
SPECIAL_UNICHAR_CODES_COUNT
|
SPECIAL_UNICHAR_CODES_COUNT
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Boolean flag for unichar_insert. It's a bit of a double negative to allow
|
||||||
|
// the default value to be false.
|
||||||
|
enum class OldUncleanUnichars {
|
||||||
|
kFalse,
|
||||||
|
kTrue,
|
||||||
|
};
|
||||||
|
|
||||||
class CHAR_FRAGMENT {
|
class CHAR_FRAGMENT {
|
||||||
public:
|
public:
|
||||||
// Minimum number of characters used for fragment representation.
|
// Minimum number of characters used for fragment representation.
|
||||||
@ -190,7 +197,7 @@ class UNICHARSET {
|
|||||||
// Use encode_string in preference to repeatedly calling step.
|
// Use encode_string in preference to repeatedly calling step.
|
||||||
int step(const char* str) const;
|
int step(const char* str) const;
|
||||||
|
|
||||||
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
|
// Returns true if the given UTF-8 string is encodable with this UNICHARSET.
|
||||||
// If not encodable, write the first byte offset which cannot be converted
|
// If not encodable, write the first byte offset which cannot be converted
|
||||||
// into the second (return) argument.
|
// into the second (return) argument.
|
||||||
bool encodable_string(const char *str, int *first_bad_position) const;
|
bool encodable_string(const char *str, int *first_bad_position) const;
|
||||||
@ -207,6 +214,9 @@ class UNICHARSET {
|
|||||||
// If encoded_length is not NULL then on return it contains the length of
|
// If encoded_length is not NULL then on return it contains the length of
|
||||||
// str that was encoded. (if give_up_on_failure the location of the first
|
// str that was encoded. (if give_up_on_failure the location of the first
|
||||||
// failure, otherwise strlen(str).)
|
// failure, otherwise strlen(str).)
|
||||||
|
// WARNING: Caller must guarantee that str has already been cleaned of codes
|
||||||
|
// that do not belong in the unicharset, or encoding may fail.
|
||||||
|
// Use CleanupString to perform the cleaning.
|
||||||
bool encode_string(const char* str, bool give_up_on_failure,
|
bool encode_string(const char* str, bool give_up_on_failure,
|
||||||
GenericVector<UNICHAR_ID>* encoding,
|
GenericVector<UNICHAR_ID>* encoding,
|
||||||
GenericVector<char>* lengths,
|
GenericVector<char>* lengths,
|
||||||
@ -226,6 +236,13 @@ class UNICHARSET {
|
|||||||
// by its hex unicodes.
|
// by its hex unicodes.
|
||||||
static STRING debug_utf8_str(const char* str);
|
static STRING debug_utf8_str(const char* str);
|
||||||
|
|
||||||
|
// Removes/replaces content that belongs in rendered text, but not in the
|
||||||
|
// unicharset.
|
||||||
|
static string CleanupString(const char* utf8_str) {
|
||||||
|
return CleanupString(utf8_str, strlen(utf8_str));
|
||||||
|
}
|
||||||
|
static string CleanupString(const char* utf8_str, int length);
|
||||||
|
|
||||||
// Return a STRING containing debug information on the unichar, including
|
// Return a STRING containing debug information on the unichar, including
|
||||||
// the id_to_unichar, its hex unicodes and the properties.
|
// the id_to_unichar, its hex unicodes and the properties.
|
||||||
STRING debug_str(UNICHAR_ID id) const;
|
STRING debug_str(UNICHAR_ID id) const;
|
||||||
@ -233,8 +250,29 @@ class UNICHARSET {
|
|||||||
return debug_str(unichar_to_id(unichar_repr));
|
return debug_str(unichar_to_id(unichar_repr));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a unichar representation to the set.
|
// Adds a unichar representation to the set. If old_style is true, then
|
||||||
void unichar_insert(const char* const unichar_repr);
|
// TATWEEL characters are kept and n-grams are allowed. Otherwise TATWEEL
|
||||||
|
// characters are ignored/skipped as if they don't exist and n-grams that
|
||||||
|
// can already be encoded are not added.
|
||||||
|
void unichar_insert(const char* const unichar_repr,
|
||||||
|
OldUncleanUnichars old_style);
|
||||||
|
void unichar_insert(const char* const unichar_repr) {
|
||||||
|
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
|
||||||
|
}
|
||||||
|
// Adds a unichar representation to the set. Avoids setting old_style to true,
|
||||||
|
// unless it is necessary to make the new unichar get added.
|
||||||
|
void unichar_insert_backwards_compatible(const char* const unichar_repr) {
|
||||||
|
string cleaned = CleanupString(unichar_repr);
|
||||||
|
if (cleaned != unichar_repr) {
|
||||||
|
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
||||||
|
} else {
|
||||||
|
int old_size = size();
|
||||||
|
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
|
||||||
|
if (size() == old_size) {
|
||||||
|
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Return true if the given unichar id exists within the set.
|
// Return true if the given unichar id exists within the set.
|
||||||
// Relies on the fact that unichar ids are contiguous in the unicharset.
|
// Relies on the fact that unichar ids are contiguous in the unicharset.
|
||||||
@ -282,6 +320,7 @@ class UNICHARSET {
|
|||||||
top_bottom_set_ = false;
|
top_bottom_set_ = false;
|
||||||
script_has_upper_lower_ = false;
|
script_has_upper_lower_ = false;
|
||||||
script_has_xheight_ = false;
|
script_has_xheight_ = false;
|
||||||
|
old_style_included_ = false;
|
||||||
null_sid_ = 0;
|
null_sid_ = 0;
|
||||||
common_sid_ = 0;
|
common_sid_ = 0;
|
||||||
latin_sid_ = 0;
|
latin_sid_ = 0;
|
||||||
@ -743,7 +782,7 @@ class UNICHARSET {
|
|||||||
// unichar representation represents a character fragment.
|
// unichar representation represents a character fragment.
|
||||||
const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
|
const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
|
||||||
if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
|
if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
|
||||||
!ids.contains(unichar_repr)) {
|
!ids.contains(unichar_repr, false)) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return get_fragment(unichar_to_id(unichar_repr));
|
return get_fragment(unichar_to_id(unichar_repr));
|
||||||
@ -965,6 +1004,11 @@ class UNICHARSET {
|
|||||||
bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
|
bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
|
||||||
bool skip_fragments);
|
bool skip_fragments);
|
||||||
|
|
||||||
|
// List of mappings to make when ingesting strings from the outside.
|
||||||
|
// The substitutions clean up text that should exist for rendering of
|
||||||
|
// synthetic data, but not in the recognition set.
|
||||||
|
static const char* kCleanupMaps[][2];
|
||||||
|
|
||||||
UNICHAR_SLOT* unichars;
|
UNICHAR_SLOT* unichars;
|
||||||
UNICHARMAP ids;
|
UNICHARMAP ids;
|
||||||
int size_used;
|
int size_used;
|
||||||
@ -980,6 +1024,8 @@ class UNICHARSET {
|
|||||||
// True if the unicharset has a significant mean-line with significant
|
// True if the unicharset has a significant mean-line with significant
|
||||||
// ascenders above that.
|
// ascenders above that.
|
||||||
bool script_has_xheight_;
|
bool script_has_xheight_;
|
||||||
|
// True if the set contains chars that would be changed by the cleanup.
|
||||||
|
bool old_style_included_;
|
||||||
|
|
||||||
// A few convenient script name-to-id mapping without using hash.
|
// A few convenient script name-to-id mapping without using hash.
|
||||||
// These are initialized when unicharset file is loaded. Anything
|
// These are initialized when unicharset file is loaded. Anything
|
||||||
|
@ -170,6 +170,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,
|
|||||||
tprintf("Training parameters:\n Debug interval = %d,"
|
tprintf("Training parameters:\n Debug interval = %d,"
|
||||||
" weights = %g, learning rate = %g, momentum=%g\n",
|
" weights = %g, learning rate = %g, momentum=%g\n",
|
||||||
debug_interval_, weight_range_, learning_rate_, momentum_);
|
debug_interval_, weight_range_, learning_rate_, momentum_);
|
||||||
|
tprintf("null char=%d\n", null_char_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -733,7 +734,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
|
|||||||
GenericVector<int> internal_labels;
|
GenericVector<int> internal_labels;
|
||||||
labels->truncate(0);
|
labels->truncate(0);
|
||||||
if (!simple_text) labels->push_back(null_char);
|
if (!simple_text) labels->push_back(null_char);
|
||||||
if (unicharset.encode_string(str.string(), true, &internal_labels, NULL,
|
string cleaned = unicharset.CleanupString(str.string());
|
||||||
|
if (unicharset.encode_string(cleaned.c_str(), true, &internal_labels, NULL,
|
||||||
&err_index)) {
|
&err_index)) {
|
||||||
bool success = true;
|
bool success = true;
|
||||||
for (int i = 0; i < internal_labels.size(); ++i) {
|
for (int i = 0; i < internal_labels.size(); ++i) {
|
||||||
@ -759,8 +761,8 @@ bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset,
|
|||||||
if (success) return true;
|
if (success) return true;
|
||||||
}
|
}
|
||||||
tprintf("Encoding of string failed! Failure bytes:");
|
tprintf("Encoding of string failed! Failure bytes:");
|
||||||
while (err_index < str.length()) {
|
while (err_index < cleaned.size()) {
|
||||||
tprintf(" %x", str[err_index++]);
|
tprintf(" %x", cleaned[err_index++]);
|
||||||
}
|
}
|
||||||
tprintf("\n");
|
tprintf("\n");
|
||||||
return false;
|
return false;
|
||||||
@ -813,8 +815,9 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData* trainingdata,
|
|||||||
training_iteration() % debug_interval_ == 0;
|
training_iteration() % debug_interval_ == 0;
|
||||||
GenericVector<int> truth_labels;
|
GenericVector<int> truth_labels;
|
||||||
if (!EncodeString(trainingdata->transcription(), &truth_labels)) {
|
if (!EncodeString(trainingdata->transcription(), &truth_labels)) {
|
||||||
tprintf("Can't encode transcription: %s\n",
|
tprintf("Can't encode transcription: '%s' in language '%s'\n",
|
||||||
trainingdata->transcription().string());
|
trainingdata->transcription().string(),
|
||||||
|
trainingdata->language().string());
|
||||||
return UNENCODABLE;
|
return UNENCODABLE;
|
||||||
}
|
}
|
||||||
int w = 0;
|
int w = 0;
|
||||||
|
@ -409,9 +409,7 @@ using tesseract::SpanUTF8NotWhitespace;
|
|||||||
using tesseract::SpanUTF8Whitespace;
|
using tesseract::SpanUTF8Whitespace;
|
||||||
using tesseract::StringRenderer;
|
using tesseract::StringRenderer;
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int Main() {
|
||||||
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
|
|
||||||
|
|
||||||
if (FLAGS_list_available_fonts) {
|
if (FLAGS_list_available_fonts) {
|
||||||
const std::vector<string>& all_fonts = FontUtils::ListAvailableFonts();
|
const std::vector<string>& all_fonts = FontUtils::ListAvailableFonts();
|
||||||
for (unsigned int i = 0; i < all_fonts.size(); ++i) {
|
for (unsigned int i = 0; i < all_fonts.size(); ++i) {
|
||||||
@ -543,8 +541,9 @@ int main(int argc, char** argv) {
|
|||||||
const char *curr_pos = str8 + offsets[i].first;
|
const char *curr_pos = str8 + offsets[i].first;
|
||||||
int ngram_len = offsets[i].second;
|
int ngram_len = offsets[i].second;
|
||||||
// Skip words that contain characters not found in unicharset.
|
// Skip words that contain characters not found in unicharset.
|
||||||
|
string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
|
||||||
if (!FLAGS_unicharset_file.empty() &&
|
if (!FLAGS_unicharset_file.empty() &&
|
||||||
!unicharset.encodable_string(curr_pos, nullptr)) {
|
!unicharset.encodable_string(cleaned.c_str(), nullptr)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
rand_utf8.append(curr_pos, ngram_len);
|
rand_utf8.append(curr_pos, ngram_len);
|
||||||
@ -665,3 +664,8 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Program entry point: parses the command-line flags, then runs Main() and
// returns its result as the process exit status. Returning the value matters:
// falling off the end of main() always yields 0, which would hide any
// non-zero status reported by Main().
int main(int argc, char** argv) {
  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
  return Main();
}
|
||||||
|
Loading…
Reference in New Issue
Block a user