mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
Removed debug messages, forward compatibility of traineddata files, further bug fix.
This commit is contained in:
parent
a303ab9d00
commit
44122698d7
@ -1,6 +1,11 @@
|
||||
/**********************************************************************
|
||||
* File: pageres.cpp (Formerly page_res.c)
|
||||
* Description: Results classes used by control.c
|
||||
* Description: Hierarchy of results classes from PAGE_RES to WERD_RES
|
||||
* and an iterator class to iterate over the words.
|
||||
* Main purposes:
|
||||
* Easy way to iterate over the words without a 3-nested loop.
|
||||
* Holds data used during word recognition.
|
||||
* Holds information about alternative spacing paths.
|
||||
* Author: Phil Cheatle
|
||||
* Created: Tue Sep 22 08:42:49 BST 1992
|
||||
*
|
||||
@ -1478,8 +1483,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
|
||||
WERD* real_word = word_res->word;
|
||||
if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
|
||||
real_word->set_flag(W_FUZZY_SP, true);
|
||||
tprintf("Made word fuzzy at:");
|
||||
real_word->bounding_box().print();
|
||||
if (word_res->combination) {
|
||||
// The next word should be the corresponding part of combo, but we have
|
||||
// already stepped past it, so find it by search.
|
||||
@ -1493,8 +1496,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
|
||||
ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
|
||||
!real_word->flag(W_FUZZY_NON));
|
||||
real_word->set_flag(W_FUZZY_SP, true);
|
||||
tprintf("Made part of combo word fuzzy at:");
|
||||
real_word->bounding_box().print();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -50,7 +50,10 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
|
||||
ReverseN(&actual_tessdata_num_entries_,
|
||||
sizeof(actual_tessdata_num_entries_));
|
||||
}
|
||||
ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
|
||||
if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
|
||||
// For forward compatibility, truncate to the number we can handle.
|
||||
actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
|
||||
}
|
||||
fread(offset_table_, sizeof(inT64),
|
||||
actual_tessdata_num_entries_, data_file_);
|
||||
if (swap_) {
|
||||
|
@ -215,34 +215,6 @@ int UNICHARSET::step(const char* str) const {
|
||||
if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
|
||||
return lengths[0];
|
||||
}
|
||||
// As step except constraining the search to unichar-ids that are
|
||||
// self-normalized. Unlike step, does not encode the whole string, therefore
|
||||
// should be used on short strings (like those obtained from
|
||||
// get_normed_unichar.)
|
||||
// Returns the number of chars of str consumed by the first step, or 0 if no
// acceptable step exists. Candidate lengths are tried from the shortest match
// upwards, and the function recurses on the remainder of the string.
int UNICHARSET::normed_step(const char* str) const {
|
||||
// Find the length of the first matching unicharset member.
|
||||
int length = ids.minmatch(str);
|
||||
if (length == 0)
|
||||
return 0; // Empty string or illegal char.
|
||||
|
||||
// Try successively longer prefixes, up to UNICHAR_LEN chars.
while (length <= UNICHAR_LEN) {
|
||||
if (ids.contains(str, length)) {
|
||||
int matched_id = unichar_to_id(str, length);
|
||||
const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id);
|
||||
// A "good start" is an id whose normed_ids list is exactly itself,
// i.e. it is self-normalized.
bool good_start = matched_norms.size() == 1 &&
|
||||
matched_norms[0] == matched_id;
|
||||
if (str[length] == '\0') {
|
||||
// The prefix is the whole string: accept it only if self-normalized.
return good_start ? length : 0;
|
||||
}
|
||||
// Otherwise accept this prefix if the remainder can also be stepped.
// NOTE(review): good_start is not consulted on this path, only in the
// end-of-string case above -- confirm that is intended.
if (normed_step(str + length) > 0)
|
||||
return length; // This length works!
|
||||
} else if (str[length] == '\0') {
|
||||
return 0; // Ran out of string.
|
||||
}
|
||||
++length;
|
||||
}
|
||||
// No prefix of up to UNICHAR_LEN chars produced an acceptable step.
return 0;
|
||||
}
|
||||
|
||||
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
|
||||
// If not encodable, write the first byte offset which cannot be converted
|
||||
@ -375,19 +347,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
||||
// stored in the file, and needs to be set when the UNICHARSET is loaded.
|
||||
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
|
||||
unichars[unichar_id].properties.normed_ids.truncate(0);
|
||||
int length = unichars[unichar_id].properties.normed.length();
|
||||
const char* normed_str = unichars[unichar_id].properties.normed.string();
|
||||
int step = 0;
|
||||
for (int offset = 0; offset < length; offset+= step) {
|
||||
step = normed_step(normed_str + offset);
|
||||
if (step == 0) {
|
||||
if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
|
||||
unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
|
||||
} else if (!encode_string(unichars[unichar_id].properties.normed.string(),
|
||||
true, &unichars[unichar_id].properties.normed_ids,
|
||||
NULL, NULL)) {
|
||||
unichars[unichar_id].properties.normed_ids.truncate(0);
|
||||
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
|
||||
break;
|
||||
}
|
||||
int normed_id = unichar_to_id(normed_str + offset, step);
|
||||
ASSERT_HOST(normed_id >= 0);
|
||||
unichars[unichar_id].properties.normed_ids.push_back(normed_id);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1015,6 +981,24 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if there are any repeated unicodes in the normalized
|
||||
// text of any unichar-id in the unicharset.
|
||||
// Scans the normalized text of each unichar-id and returns true as soon as
// one is found in which two adjacent unicodes are equal; returns false if
// none is found.
bool UNICHARSET::AnyRepeatedUnicodes() const {
|
||||
// When the set has special codes, skip the first
// SPECIAL_UNICHAR_CODES_COUNT ids.
int start_id = 0;
|
||||
if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
|
||||
for (int id = start_id; id < size_used; ++id) {
|
||||
// Convert to unicodes.
|
||||
GenericVector<int> unicodes;
|
||||
// Ids whose normalized text fails UTF8ToUnicode, or yields fewer than two
// unicodes, are skipped.
if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
|
||||
unicodes.size() > 1) {
|
||||
// Only neighbouring positions are compared, so a repeat separated by
// another unicode is not reported.
for (int u = 1; u < unicodes.size(); ++u) {
|
||||
if (unicodes[u - 1] == unicodes[u]) return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int UNICHARSET::add_script(const char* script) {
|
||||
for (int i = 0; i < script_table_size_used; ++i) {
|
||||
if (strcmp(script, script_table[i]) == 0)
|
||||
|
@ -190,11 +190,6 @@ class UNICHARSET {
|
||||
// WARNING: this function now encodes the whole string for precision.
|
||||
// Use encode_string in preference to repeatedly calling step.
|
||||
int step(const char* str) const;
|
||||
// As step except constraining the search to unichar-ids that are
|
||||
// self-normalized. Unlike step, does not encode the whole string, therefore
|
||||
// should be used on short strings (like those obtained from
|
||||
// get_normed_unichar.)
|
||||
int normed_step(const char* str) const;
|
||||
|
||||
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
|
||||
// If not encodable, write the first byte offset which cannot be converted
|
||||
@ -678,6 +673,10 @@ class UNICHARSET {
|
||||
kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
|
||||
}
|
||||
|
||||
// Returns true if there are any repeated unicodes in the normalized
|
||||
// text of any unichar-id in the unicharset.
|
||||
bool AnyRepeatedUnicodes() const;
|
||||
|
||||
// Return a pointer to the CHAR_FRAGMENT class if the given
|
||||
// unichar id represents a character fragment.
|
||||
const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
|
||||
@ -775,6 +774,7 @@ class UNICHARSET {
|
||||
|
||||
// Returns normalized version of unichar with the given unichar_id.
|
||||
// The returned pointer is either a string literal or the string owned by the
// unichars entry; unichar_id is not range-checked here.
const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
|
||||
// When the set has special codes, the UNICHAR_SPACE id always reports a
// single literal space instead of its stored normed string.
if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " ";
|
||||
return unichars[unichar_id].properties.normed.string();
|
||||
}
|
||||
// Returns a vector of UNICHAR_IDs that represent the ids of the normalized
|
||||
|
Loading…
Reference in New Issue
Block a user