mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
Merge pull request #3592 from stweil/unsigned
Fix compiler warnings (mainly signed / unsigned mismatches) and modernize some code
This commit is contained in:
commit
5a36943de4
@ -243,7 +243,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
|
|||||||
std::vector<BLOB_CHOICE *> blob_choices;
|
std::vector<BLOB_CHOICE *> blob_choices;
|
||||||
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
|
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
|
||||||
auto rating = static_cast<float>(INT8_MAX);
|
auto rating = static_cast<float>(INT8_MAX);
|
||||||
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
|
for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
|
||||||
// The rating and certainty are not quite arbitrary. Since
|
// The rating and certainty are not quite arbitrary. Since
|
||||||
// select_blob_to_chop uses the worst certainty to choose, they all have
|
// select_blob_to_chop uses the worst certainty to choose, they all have
|
||||||
// to be different, so starting with INT8_MAX, subtract 1/8 for each blob
|
// to be different, so starting with INT8_MAX, subtract 1/8 for each blob
|
||||||
@ -257,7 +257,7 @@ void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block,
|
|||||||
rating -= 0.125f;
|
rating -= 0.125f;
|
||||||
}
|
}
|
||||||
const double e = exp(1.0); // The base of natural logs.
|
const double e = exp(1.0); // The base of natural logs.
|
||||||
int blob_number;
|
unsigned blob_number;
|
||||||
int right_chop_index = 0;
|
int right_chop_index = 0;
|
||||||
if (!assume_fixed_pitch_char_segment) {
|
if (!assume_fixed_pitch_char_segment) {
|
||||||
// We only chop if the language is not fixed pitch like CJK.
|
// We only chop if the language is not fixed pitch like CJK.
|
||||||
@ -613,8 +613,8 @@ bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WER
|
|||||||
/// @param best_rating
|
/// @param best_rating
|
||||||
/// @param best_segmentation
|
/// @param best_segmentation
|
||||||
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
||||||
int choices_length, const std::vector<UNICHAR_ID> &target_text,
|
unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
|
||||||
int text_index, float rating, std::vector<int> *segmentation,
|
unsigned text_index, float rating, std::vector<int> *segmentation,
|
||||||
float *best_rating, std::vector<int> *best_segmentation) {
|
float *best_rating, std::vector<int> *best_segmentation) {
|
||||||
const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
|
const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
|
||||||
for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
|
for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
|
||||||
@ -625,12 +625,12 @@ void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, in
|
|||||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
|
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
|
||||||
const BLOB_CHOICE *choice = choice_it.data();
|
const BLOB_CHOICE *choice = choice_it.data();
|
||||||
choice_rating = choice->rating();
|
choice_rating = choice->rating();
|
||||||
UNICHAR_ID class_id = choice->unichar_id();
|
auto class_id = choice->unichar_id();
|
||||||
if (class_id == target_text[text_index]) {
|
if (class_id == target_text[text_index]) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Search ambigs table.
|
// Search ambigs table.
|
||||||
if (class_id < table.size() && table[class_id] != nullptr) {
|
if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
|
||||||
AmbigSpec_IT spec_it(table[class_id]);
|
AmbigSpec_IT spec_it(table[class_id]);
|
||||||
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
|
for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
|
||||||
const AmbigSpec *ambig_spec = spec_it.data();
|
const AmbigSpec *ambig_spec = spec_it.data();
|
||||||
|
@ -227,7 +227,7 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (word->word->tess_failed) {
|
if (word->word->tess_failed) {
|
||||||
int s;
|
unsigned s;
|
||||||
for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
|
for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
|
||||||
}
|
}
|
||||||
// If all are failed, skip it. Image words are skipped by this test.
|
// If all are failed, skip it. Image words are skipped by this test.
|
||||||
@ -727,7 +727,7 @@ void Tesseract::script_pos_pass(PAGE_RES *page_res) {
|
|||||||
// Scan for upper/lower.
|
// Scan for upper/lower.
|
||||||
int num_upper = 0;
|
int num_upper = 0;
|
||||||
int num_lower = 0;
|
int num_lower = 0;
|
||||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word->best_choice->length(); ++i) {
|
||||||
if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
|
if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
|
||||||
++num_upper;
|
++num_upper;
|
||||||
} else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
|
} else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
|
||||||
@ -743,7 +743,7 @@ void Tesseract::script_pos_pass(PAGE_RES *page_res) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Helper finds the gap between the index word and the next.
|
// Helper finds the gap between the index word and the next.
|
||||||
static void WordGap(const PointerVector<WERD_RES> &words, int index, int *right, int *next_left) {
|
static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
|
||||||
*right = -INT32_MAX;
|
*right = -INT32_MAX;
|
||||||
*next_left = INT32_MAX;
|
*next_left = INT32_MAX;
|
||||||
if (index < words.size()) {
|
if (index < words.size()) {
|
||||||
@ -756,13 +756,13 @@ static void WordGap(const PointerVector<WERD_RES> &words, int index, int *right,
|
|||||||
|
|
||||||
// Factored helper computes the rating, certainty, badness and validity of
|
// Factored helper computes the rating, certainty, badness and validity of
|
||||||
// the permuter of the words in [first_index, end_index).
|
// the permuter of the words in [first_index, end_index).
|
||||||
static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, int first_index, int end_index,
|
static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,
|
||||||
float *rating, float *certainty, bool *bad, bool *valid_permuter) {
|
float *rating, float *certainty, bool *bad, bool *valid_permuter) {
|
||||||
if (end_index <= first_index) {
|
if (end_index <= first_index) {
|
||||||
*bad = true;
|
*bad = true;
|
||||||
*valid_permuter = false;
|
*valid_permuter = false;
|
||||||
}
|
}
|
||||||
for (int index = first_index; index < end_index && index < words.size(); ++index) {
|
for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {
|
||||||
WERD_CHOICE *choice = words[index]->best_choice;
|
WERD_CHOICE *choice = words[index]->best_choice;
|
||||||
if (choice == nullptr) {
|
if (choice == nullptr) {
|
||||||
*bad = true;
|
*bad = true;
|
||||||
@ -790,11 +790,11 @@ static int SelectBestWords(double rating_ratio, double certainty_margin, bool de
|
|||||||
// boundary at the end.
|
// boundary at the end.
|
||||||
std::vector<WERD_RES *> out_words;
|
std::vector<WERD_RES *> out_words;
|
||||||
// Index into each word vector (best, new).
|
// Index into each word vector (best, new).
|
||||||
int b = 0, n = 0;
|
unsigned b = 0, n = 0;
|
||||||
int num_best = 0, num_new = 0;
|
int num_best = 0, num_new = 0;
|
||||||
while (b < best_words->size() || n < new_words->size()) {
|
while (b < best_words->size() || n < new_words->size()) {
|
||||||
// Start of the current run in each.
|
// Start of the current run in each.
|
||||||
int start_b = b, start_n = n;
|
auto start_b = b, start_n = n;
|
||||||
while (b < best_words->size() || n < new_words->size()) {
|
while (b < best_words->size() || n < new_words->size()) {
|
||||||
int b_right = -INT32_MAX;
|
int b_right = -INT32_MAX;
|
||||||
int next_b_left = INT32_MAX;
|
int next_b_left = INT32_MAX;
|
||||||
@ -884,7 +884,7 @@ int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recog
|
|||||||
*in_word = nullptr;
|
*in_word = nullptr;
|
||||||
}
|
}
|
||||||
if (debug) {
|
if (debug) {
|
||||||
for (int i = 0; i < new_words.size(); ++i) {
|
for (unsigned i = 0; i < new_words.size(); ++i) {
|
||||||
new_words[i]->DebugTopChoice("Lang result");
|
new_words[i]->DebugTopChoice("Lang result");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -896,7 +896,7 @@ int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recog
|
|||||||
|
|
||||||
// Helper returns true if all the words are acceptable.
|
// Helper returns true if all the words are acceptable.
|
||||||
static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
|
static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
|
||||||
for (int w = 0; w < words.size(); ++w) {
|
for (unsigned w = 0; w < words.size(); ++w) {
|
||||||
if (words[w]->tess_failed || !words[w]->tess_accepted) {
|
if (words[w]->tess_failed || !words[w]->tess_accepted) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1597,10 +1597,10 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *b
|
|||||||
word->fix_hyphens();
|
word->fix_hyphens();
|
||||||
}
|
}
|
||||||
/* Don't trust fix_quotes! - though I think I've fixed the bug */
|
/* Don't trust fix_quotes! - though I think I've fixed the bug */
|
||||||
if (word->best_choice->length() != word->box_word->length()) {
|
if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
|
||||||
tprintf(
|
tprintf(
|
||||||
"POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
|
"POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
|
||||||
" #Blobs=%d\n",
|
" #Blobs=%u\n",
|
||||||
word->best_choice->debug_string().c_str(), word->best_choice->length(),
|
word->best_choice->debug_string().c_str(), word->best_choice->length(),
|
||||||
word->box_word->length());
|
word->box_word->length());
|
||||||
}
|
}
|
||||||
@ -1621,7 +1621,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *b
|
|||||||
static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
|
static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
|
||||||
// Find the corresponding best BLOB_CHOICE from any position in the word_res.
|
// Find the corresponding best BLOB_CHOICE from any position in the word_res.
|
||||||
BLOB_CHOICE *best_choice = nullptr;
|
BLOB_CHOICE *best_choice = nullptr;
|
||||||
for (int i = 0; i < word_res->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
|
||||||
BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
|
BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
|
||||||
if (choice != nullptr) {
|
if (choice != nullptr) {
|
||||||
if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
|
if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
|
||||||
@ -1637,7 +1637,7 @@ static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_re
|
|||||||
// in the best_choice.
|
// in the best_choice.
|
||||||
static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
|
static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
|
||||||
WERD_CHOICE *word = word_res->best_choice;
|
WERD_CHOICE *word = word_res->best_choice;
|
||||||
for (int i = 0; i < word_res->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
|
||||||
BLOB_CHOICE *choice =
|
BLOB_CHOICE *choice =
|
||||||
FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
|
FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
|
||||||
if (choice == nullptr) {
|
if (choice == nullptr) {
|
||||||
@ -1646,7 +1646,7 @@ static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Correct any incorrect results in word.
|
// Correct any incorrect results in word.
|
||||||
for (int i = 0; i < word->length(); ++i) {
|
for (unsigned i = 0; i < word->length(); ++i) {
|
||||||
if (word->unichar_id(i) != blob_choice->unichar_id()) {
|
if (word->unichar_id(i) != blob_choice->unichar_id()) {
|
||||||
word->set_unichar_id(blob_choice->unichar_id(), i);
|
word->set_unichar_id(blob_choice->unichar_id(), i);
|
||||||
}
|
}
|
||||||
@ -1666,7 +1666,7 @@ void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
|
|||||||
|
|
||||||
// Find the frequency of each unique character in the word.
|
// Find the frequency of each unique character in the word.
|
||||||
SortHelper<UNICHAR_ID> rep_ch(word.length());
|
SortHelper<UNICHAR_ID> rep_ch(word.length());
|
||||||
for (int i = 0; i < word.length(); ++i) {
|
for (unsigned i = 0; i < word.length(); ++i) {
|
||||||
rep_ch.Add(word.unichar_id(i), 1);
|
rep_ch.Add(word.unichar_id(i), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1951,7 +1951,7 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
|
|||||||
if (tessedit_debug_fonts) {
|
if (tessedit_debug_fonts) {
|
||||||
tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
|
tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
|
||||||
}
|
}
|
||||||
for (int b = 0; b < word->best_choice->length(); ++b) {
|
for (unsigned b = 0; b < word->best_choice->length(); ++b) {
|
||||||
const BLOB_CHOICE *choice = word->GetBlobChoice(b);
|
const BLOB_CHOICE *choice = word->GetBlobChoice(b);
|
||||||
if (choice == nullptr) {
|
if (choice == nullptr) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -64,7 +64,7 @@ int16_t Tesseract::word_outline_errs(WERD_RES *word) {
|
|||||||
int16_t err_count = 0;
|
int16_t err_count = 0;
|
||||||
|
|
||||||
if (word->rebuild_word != nullptr) {
|
if (word->rebuild_word != nullptr) {
|
||||||
for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
|
for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
|
||||||
TBLOB *blob = word->rebuild_word->blobs[b];
|
TBLOB *blob = word->rebuild_word->blobs[b];
|
||||||
err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
|
err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
|
||||||
i++;
|
i++;
|
||||||
@ -911,7 +911,7 @@ bool Tesseract::noise_outlines(TWERD *word) {
|
|||||||
int16_t max_dimension;
|
int16_t max_dimension;
|
||||||
float small_limit = kBlnXHeight * crunch_small_outlines_size;
|
float small_limit = kBlnXHeight * crunch_small_outlines_size;
|
||||||
|
|
||||||
for (int b = 0; b < word->NumBlobs(); ++b) {
|
for (unsigned b = 0; b < word->NumBlobs(); ++b) {
|
||||||
TBLOB *blob = word->blobs[b];
|
TBLOB *blob = word->blobs[b];
|
||||||
for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
|
for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
|
||||||
outline_count++;
|
outline_count++;
|
||||||
|
@ -742,7 +742,7 @@ int EquationDetect::CountAlignment(const std::vector<int> &sorted_vec, const int
|
|||||||
|
|
||||||
// Search right side.
|
// Search right side.
|
||||||
index = pos + 1 - sorted_vec.begin();
|
index = pos + 1 - sorted_vec.begin();
|
||||||
while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
|
while (static_cast<size_t>(index) < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -262,7 +262,7 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
|||||||
int16_t total_score = 0;
|
int16_t total_score = 0;
|
||||||
int16_t word_count = 0;
|
int16_t word_count = 0;
|
||||||
int16_t done_word_count = 0;
|
int16_t done_word_count = 0;
|
||||||
int16_t i;
|
int i;
|
||||||
int16_t offset;
|
int16_t offset;
|
||||||
int16_t prev_word_score = 0;
|
int16_t prev_word_score = 0;
|
||||||
bool prev_word_done = false;
|
bool prev_word_done = false;
|
||||||
@ -684,7 +684,6 @@ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
|
|||||||
|
|
||||||
int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
|
int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
|
||||||
float noise_score[512];
|
float noise_score[512];
|
||||||
int i;
|
|
||||||
int min_noise_blob; // 1st contender
|
int min_noise_blob; // 1st contender
|
||||||
int max_noise_blob; // last contender
|
int max_noise_blob; // last contender
|
||||||
int non_noise_count;
|
int non_noise_count;
|
||||||
@ -697,7 +696,7 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Normalised.
|
// Normalised.
|
||||||
int blob_count = word_res->box_word->length();
|
auto blob_count = word_res->box_word->length();
|
||||||
ASSERT_HOST(blob_count <= 512);
|
ASSERT_HOST(blob_count <= 512);
|
||||||
if (blob_count < 5) {
|
if (blob_count < 5) {
|
||||||
return -1; // too short to split
|
return -1; // too short to split
|
||||||
@ -712,7 +711,7 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
|
for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
|
||||||
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
||||||
if (word_res->reject_map[i].accepted()) {
|
if (word_res->reject_map[i].accepted()) {
|
||||||
noise_score[i] = non_noise_limit;
|
noise_score[i] = non_noise_limit;
|
||||||
@ -731,7 +730,8 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
|
|||||||
/* Now find the worst one which is far enough away from the end of the word */
|
/* Now find the worst one which is far enough away from the end of the word */
|
||||||
|
|
||||||
non_noise_count = 0;
|
non_noise_count = 0;
|
||||||
for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
|
int i;
|
||||||
|
for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
|
||||||
if (noise_score[i] >= non_noise_limit) {
|
if (noise_score[i] >= non_noise_limit) {
|
||||||
non_noise_count++;
|
non_noise_count++;
|
||||||
}
|
}
|
||||||
@ -760,7 +760,7 @@ int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score
|
|||||||
|
|
||||||
*worst_noise_score = small_limit;
|
*worst_noise_score = small_limit;
|
||||||
worst_noise_blob = -1;
|
worst_noise_blob = -1;
|
||||||
for (i = min_noise_blob; i <= max_noise_blob; i++) {
|
for (auto i = min_noise_blob; i <= max_noise_blob; i++) {
|
||||||
if (noise_score[i] < *worst_noise_score) {
|
if (noise_score[i] < *worst_noise_score) {
|
||||||
worst_noise_blob = i;
|
worst_noise_blob = i;
|
||||||
*worst_noise_score = noise_score[i];
|
*worst_noise_score = noise_score[i];
|
||||||
@ -838,7 +838,6 @@ int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
|||||||
WERD_RES_IT word_it(&word_res_list);
|
WERD_RES_IT word_it(&word_res_list);
|
||||||
WERD_RES *word;
|
WERD_RES *word;
|
||||||
int16_t score = 0;
|
int16_t score = 0;
|
||||||
int16_t i;
|
|
||||||
float small_limit = kBlnXHeight * fixsp_small_outlines_size;
|
float small_limit = kBlnXHeight * fixsp_small_outlines_size;
|
||||||
|
|
||||||
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
|
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
|
||||||
@ -849,9 +848,9 @@ int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
|||||||
if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
|
if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
|
||||||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||||
word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
|
word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
|
||||||
int num_blobs = word->rebuild_word->NumBlobs();
|
auto num_blobs = word->rebuild_word->NumBlobs();
|
||||||
UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
|
UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
|
||||||
for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
|
for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
|
||||||
TBLOB *blob = word->rebuild_word->blobs[i];
|
TBLOB *blob = word->rebuild_word->blobs[i];
|
||||||
if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
|
if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
|
||||||
score -= 1; // penalise possibly erroneous non-space
|
score -= 1; // penalise possibly erroneous non-space
|
||||||
|
@ -269,22 +269,14 @@ void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
|
|||||||
if (stopper_dict == nullptr) {
|
if (stopper_dict == nullptr) {
|
||||||
stopper_dict = &getDict();
|
stopper_dict = &getDict();
|
||||||
}
|
}
|
||||||
bool any_nonspace_delimited = false;
|
for (unsigned w = 0; w < words->size(); ++w) {
|
||||||
for (int w = 0; w < words->size(); ++w) {
|
|
||||||
WERD_RES *word = (*words)[w];
|
|
||||||
if (word->best_choice != nullptr && word->best_choice->ContainsAnyNonSpaceDelimited()) {
|
|
||||||
any_nonspace_delimited = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (int w = 0; w < words->size(); ++w) {
|
|
||||||
WERD_RES *word = (*words)[w];
|
WERD_RES *word = (*words)[w];
|
||||||
if (word->best_choice == nullptr) {
|
if (word->best_choice == nullptr) {
|
||||||
// It is a dud.
|
// It is a dud.
|
||||||
word->SetupFake(lstm_recognizer_->GetUnicharset());
|
word->SetupFake(lstm_recognizer_->GetUnicharset());
|
||||||
} else {
|
} else {
|
||||||
// Set the best state.
|
// Set the best state.
|
||||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word->best_choice->length(); ++i) {
|
||||||
int length = word->best_choice->state(i);
|
int length = word->best_choice->state(i);
|
||||||
word->best_state.push_back(length);
|
word->best_state.push_back(length);
|
||||||
}
|
}
|
||||||
|
@ -335,10 +335,10 @@ char *LTRResultIterator::WordNormedUTF8Text() const {
|
|||||||
WERD_CHOICE *best_choice = it_->word()->best_choice;
|
WERD_CHOICE *best_choice = it_->word()->best_choice;
|
||||||
const UNICHARSET *unicharset = it_->word()->uch_set;
|
const UNICHARSET *unicharset = it_->word()->uch_set;
|
||||||
ASSERT_HOST(best_choice != nullptr);
|
ASSERT_HOST(best_choice != nullptr);
|
||||||
for (int i = 0; i < best_choice->length(); ++i) {
|
for (unsigned i = 0; i < best_choice->length(); ++i) {
|
||||||
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
|
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
|
||||||
}
|
}
|
||||||
int length = ocr_text.length() + 1;
|
auto length = ocr_text.length() + 1;
|
||||||
char *result = new char[length];
|
char *result = new char[length];
|
||||||
strncpy(result, ocr_text.c_str(), length);
|
strncpy(result, ocr_text.c_str(), length);
|
||||||
return result;
|
return result;
|
||||||
@ -404,7 +404,7 @@ ChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {
|
|||||||
strcmp(word_res_->CTC_symbol_choices[0][0].first, " ")) {
|
strcmp(word_res_->CTC_symbol_choices[0][0].first, " ")) {
|
||||||
blanks_before_word_ = 0;
|
blanks_before_word_ = 0;
|
||||||
}
|
}
|
||||||
auto index = *tstep_index_;
|
unsigned index = *tstep_index_;
|
||||||
index += blanks_before_word_;
|
index += blanks_before_word_;
|
||||||
if (index < word_res_->CTC_symbol_choices.size()) {
|
if (index < word_res_->CTC_symbol_choices.size()) {
|
||||||
LSTM_choices_ = &word_res_->CTC_symbol_choices[index];
|
LSTM_choices_ = &word_res_->CTC_symbol_choices[index];
|
||||||
@ -484,7 +484,7 @@ float ChoiceIterator::Confidence() const {
|
|||||||
|
|
||||||
// Returns the set of timesteps which belong to the current symbol
|
// Returns the set of timesteps which belong to the current symbol
|
||||||
std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
|
std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
|
||||||
int offset = *tstep_index_ + blanks_before_word_;
|
unsigned offset = *tstep_index_ + blanks_before_word_;
|
||||||
if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
|
if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -381,7 +381,7 @@ bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
|
|||||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() && choice == nullptr;
|
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() && choice == nullptr;
|
||||||
choice_it.forward()) {
|
choice_it.forward()) {
|
||||||
int choice_script = choice_it.data()->script_id();
|
int choice_script = choice_it.data()->script_id();
|
||||||
int s = 0;
|
unsigned s = 0;
|
||||||
for (s = 0; s < allowed_scripts_->size(); ++s) {
|
for (s = 0; s < allowed_scripts_->size(); ++s) {
|
||||||
if ((*allowed_scripts_)[s] == choice_script) {
|
if ((*allowed_scripts_)[s] == choice_script) {
|
||||||
choice = choice_it.data();
|
choice = choice_it.data();
|
||||||
@ -477,7 +477,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) {
|
|||||||
int id = choice->script_id();
|
int id = choice->script_id();
|
||||||
if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
|
if (allowed_scripts_ != nullptr && !allowed_scripts_->empty()) {
|
||||||
// Check that the choice is in an allowed script.
|
// Check that the choice is in an allowed script.
|
||||||
int s = 0;
|
size_t s = 0;
|
||||||
for (s = 0; s < allowed_scripts_->size(); ++s) {
|
for (s = 0; s < allowed_scripts_->size(); ++s) {
|
||||||
if ((*allowed_scripts_)[s] == id) {
|
if ((*allowed_scripts_)[s] == id) {
|
||||||
break;
|
break;
|
||||||
|
@ -101,7 +101,6 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
|
|||||||
bool force_eol) { // override tilde crunch?
|
bool force_eol) { // override tilde crunch?
|
||||||
WERD_RES *word = page_res_it.word();
|
WERD_RES *word = page_res_it.word();
|
||||||
const UNICHARSET &uchset = *word->uch_set;
|
const UNICHARSET &uchset = *word->uch_set;
|
||||||
int i;
|
|
||||||
bool need_reject = false;
|
bool need_reject = false;
|
||||||
UNICHAR_ID space = uchset.unichar_to_id(" ");
|
UNICHAR_ID space = uchset.unichar_to_id(" ");
|
||||||
|
|
||||||
@ -181,7 +180,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
|
|||||||
if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
|
if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
|
||||||
if (tessedit_zero_rejection) {
|
if (tessedit_zero_rejection) {
|
||||||
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
||||||
for (i = 0; i < word->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word->best_choice->length(); ++i) {
|
||||||
if (word->reject_map[i].rejected()) {
|
if (word->reject_map[i].rejected()) {
|
||||||
word->reject_map[i].setrej_minimal_rej_accept();
|
word->reject_map[i].setrej_minimal_rej_accept();
|
||||||
}
|
}
|
||||||
@ -189,7 +188,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
|
|||||||
}
|
}
|
||||||
if (tessedit_minimal_rejection) {
|
if (tessedit_minimal_rejection) {
|
||||||
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
||||||
for (i = 0; i < word->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word->best_choice->length(); ++i) {
|
||||||
if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
|
if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
|
||||||
word->reject_map[i].setrej_minimal_rej_accept();
|
word->reject_map[i].setrej_minimal_rej_accept();
|
||||||
}
|
}
|
||||||
@ -365,7 +364,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
|
|||||||
|
|
||||||
int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
|
int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (int i = 0; i < word.length(); ++i) {
|
for (unsigned i = 0; i < word.length(); ++i) {
|
||||||
if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
|
if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
@ -375,7 +374,7 @@ int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
|
|||||||
|
|
||||||
int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
|
int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (int i = 0; i < word.length(); ++i) {
|
for (unsigned i = 0; i < word.length(); ++i) {
|
||||||
if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
|
if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
|
||||||
word.unicharset()->get_isdigit(word.unichar_id(i))) {
|
word.unicharset()->get_isdigit(word.unichar_id(i))) {
|
||||||
count++;
|
count++;
|
||||||
|
@ -612,12 +612,12 @@ void PageIterator::BeginWord(int offset) {
|
|||||||
// is already baseline denormalized.
|
// is already baseline denormalized.
|
||||||
word_length_ = word_res->best_choice->length();
|
word_length_ = word_res->best_choice->length();
|
||||||
if (word_res->box_word != nullptr) {
|
if (word_res->box_word != nullptr) {
|
||||||
if (word_res->box_word->length() != word_length_) {
|
if (word_res->box_word->length() != static_cast<unsigned>(word_length_)) {
|
||||||
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ", word_length_,
|
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ", word_length_,
|
||||||
word_res->best_choice->unichar_string().c_str(), word_res->box_word->length());
|
word_res->best_choice->unichar_string().c_str(), word_res->box_word->length());
|
||||||
word_res->box_word->bounding_box().print();
|
word_res->box_word->bounding_box().print();
|
||||||
}
|
}
|
||||||
ASSERT_HOST(word_res->box_word->length() == word_length_);
|
ASSERT_HOST(word_res->box_word->length() == static_cast<unsigned>(word_length_));
|
||||||
}
|
}
|
||||||
word_ = nullptr;
|
word_ = nullptr;
|
||||||
// We will be iterating the box_word.
|
// We will be iterating the box_word.
|
||||||
|
@ -40,10 +40,10 @@ void Tesseract::PrerecAllWordsPar(const std::vector<WordData> &words) {
|
|||||||
std::vector<BlobData> blobs;
|
std::vector<BlobData> blobs;
|
||||||
for (const auto &w : words) {
|
for (const auto &w : words) {
|
||||||
if (w.word->ratings != nullptr && w.word->ratings->get(0, 0) == nullptr) {
|
if (w.word->ratings != nullptr && w.word->ratings->get(0, 0) == nullptr) {
|
||||||
for (int s = 0; s < w.lang_words.size(); ++s) {
|
for (size_t s = 0; s < w.lang_words.size(); ++s) {
|
||||||
Tesseract *sub = s < sub_langs_.size() ? sub_langs_[s] : this;
|
Tesseract *sub = s < sub_langs_.size() ? sub_langs_[s] : this;
|
||||||
const WERD_RES &word = *w.lang_words[s];
|
const WERD_RES &word = *w.lang_words[s];
|
||||||
for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
|
for (unsigned b = 0; b < word.chopped_word->NumBlobs(); ++b) {
|
||||||
blobs.emplace_back(b, sub, word);
|
blobs.emplace_back(b, sub, word);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -73,7 +73,7 @@ static int Epsilon(int space_pix) {
|
|||||||
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
|
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
|
||||||
const std::vector<RowScratchRegisters> *rows, int row_start,
|
const std::vector<RowScratchRegisters> *rows, int row_start,
|
||||||
int row_end) {
|
int row_end) {
|
||||||
if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
|
if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
|
||||||
tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
|
tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
|
||||||
rows->size());
|
rows->size());
|
||||||
return false;
|
return false;
|
||||||
@ -94,8 +94,8 @@ static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *fun
|
|||||||
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
|
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
|
||||||
std::vector<int> max_col_widths;
|
std::vector<int> max_col_widths;
|
||||||
for (const auto &row : rows) {
|
for (const auto &row : rows) {
|
||||||
int num_columns = row.size();
|
auto num_columns = row.size();
|
||||||
for (int c = 0; c < num_columns; c++) {
|
for (size_t c = 0; c < num_columns; c++) {
|
||||||
int num_unicodes = 0;
|
int num_unicodes = 0;
|
||||||
for (char i : row[c]) {
|
for (char i : row[c]) {
|
||||||
if ((i & 0xC0) != 0x80) {
|
if ((i & 0xC0) != 0x80) {
|
||||||
@ -285,7 +285,7 @@ bool AsciiLikelyListItem(const std::string &word) {
|
|||||||
// ========== Brain Dead Language Model (Tesseract Version) ================
|
// ========== Brain Dead Language Model (Tesseract Version) ================
|
||||||
|
|
||||||
// Return the first Unicode Codepoint from werd[pos].
|
// Return the first Unicode Codepoint from werd[pos].
|
||||||
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
|
static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
|
||||||
if (!u || !werd || pos > werd->length()) {
|
if (!u || !werd || pos > werd->length()) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -297,33 +297,32 @@ int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
|
|||||||
class UnicodeSpanSkipper {
|
class UnicodeSpanSkipper {
|
||||||
public:
|
public:
|
||||||
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
|
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
|
||||||
: u_(unicharset), word_(word) {
|
: u_(unicharset), word_(word), wordlen_(word->length()) {
|
||||||
wordlen_ = word->length();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Given an input position, return the first position >= pos not punc.
|
// Given an input position, return the first position >= pos not punc.
|
||||||
int SkipPunc(int pos);
|
unsigned SkipPunc(unsigned pos);
|
||||||
// Given an input position, return the first position >= pos not digit.
|
// Given an input position, return the first position >= pos not digit.
|
||||||
int SkipDigits(int pos);
|
unsigned SkipDigits(unsigned pos);
|
||||||
// Given an input position, return the first position >= pos not roman.
|
// Given an input position, return the first position >= pos not roman.
|
||||||
int SkipRomans(int pos);
|
unsigned SkipRomans(unsigned pos);
|
||||||
// Given an input position, return the first position >= pos not alpha.
|
// Given an input position, return the first position >= pos not alpha.
|
||||||
int SkipAlpha(int pos);
|
unsigned SkipAlpha(unsigned pos);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const UNICHARSET *u_;
|
const UNICHARSET *u_;
|
||||||
const WERD_CHOICE *word_;
|
const WERD_CHOICE *word_;
|
||||||
int wordlen_;
|
unsigned wordlen_;
|
||||||
};
|
};
|
||||||
|
|
||||||
int UnicodeSpanSkipper::SkipPunc(int pos) {
|
unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
|
||||||
while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
|
while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
|
||||||
pos++;
|
pos++;
|
||||||
}
|
}
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
int UnicodeSpanSkipper::SkipDigits(int pos) {
|
unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
|
||||||
while (pos < wordlen_ &&
|
while (pos < wordlen_ &&
|
||||||
(u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
|
(u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
|
||||||
pos++;
|
pos++;
|
||||||
@ -331,7 +330,7 @@ int UnicodeSpanSkipper::SkipDigits(int pos) {
|
|||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
int UnicodeSpanSkipper::SkipRomans(int pos) {
|
unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
|
||||||
const char *kRomans = "ivxlmdIVXLMD";
|
const char *kRomans = "ivxlmdIVXLMD";
|
||||||
while (pos < wordlen_) {
|
while (pos < wordlen_) {
|
||||||
int ch = UnicodeFor(u_, word_, pos);
|
int ch = UnicodeFor(u_, word_, pos);
|
||||||
@ -343,7 +342,7 @@ int UnicodeSpanSkipper::SkipRomans(int pos) {
|
|||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
int UnicodeSpanSkipper::SkipAlpha(int pos) {
|
unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
|
||||||
while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
|
while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
|
||||||
pos++;
|
pos++;
|
||||||
}
|
}
|
||||||
@ -386,13 +385,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
|
|||||||
|
|
||||||
UnicodeSpanSkipper m(u, werd);
|
UnicodeSpanSkipper m(u, werd);
|
||||||
int num_segments = 0;
|
int num_segments = 0;
|
||||||
int pos = 0;
|
unsigned pos = 0;
|
||||||
while (pos < werd->length() && num_segments < 3) {
|
while (pos < werd->length() && num_segments < 3) {
|
||||||
int numeral_start = m.SkipPunc(pos);
|
auto numeral_start = m.SkipPunc(pos);
|
||||||
if (numeral_start > pos + 1) {
|
if (numeral_start > pos + 1) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
int numeral_end = m.SkipRomans(numeral_start);
|
auto numeral_end = m.SkipRomans(numeral_start);
|
||||||
if (numeral_end == numeral_start) {
|
if (numeral_end == numeral_start) {
|
||||||
numeral_end = m.SkipDigits(numeral_start);
|
numeral_end = m.SkipDigits(numeral_start);
|
||||||
if (numeral_end == numeral_start) {
|
if (numeral_end == numeral_start) {
|
||||||
@ -2353,7 +2352,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
|||||||
LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
|
LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
|
||||||
bool pass2a_was_useful =
|
bool pass2a_was_useful =
|
||||||
leftovers2.size() > 1 ||
|
leftovers2.size() > 1 ||
|
||||||
(leftovers2.size() == 1 && (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
|
(leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
|
||||||
if (pass2a_was_useful) {
|
if (pass2a_was_useful) {
|
||||||
for (auto &leftover2 : leftovers2) {
|
for (auto &leftover2 : leftovers2) {
|
||||||
StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
|
StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
|
||||||
|
@ -34,9 +34,6 @@ class WERD_CHOICE;
|
|||||||
TESS_API
|
TESS_API
|
||||||
bool AsciiLikelyListItem(const std::string &word);
|
bool AsciiLikelyListItem(const std::string &word);
|
||||||
|
|
||||||
// Return the first Unicode Codepoint from werd[pos].
|
|
||||||
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
|
||||||
|
|
||||||
// Set right word attributes given either a unicharset and werd or a utf8
|
// Set right word attributes given either a unicharset and werd or a utf8
|
||||||
// string.
|
// string.
|
||||||
TESS_API
|
TESS_API
|
||||||
|
@ -94,9 +94,6 @@ void Tesseract::set_done(WERD_RES *word, int16_t pass) {
|
|||||||
* Sets a reject map for the word.
|
* Sets a reject map for the word.
|
||||||
*************************************************************************/
|
*************************************************************************/
|
||||||
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
|
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
|
||||||
int i;
|
|
||||||
int offset;
|
|
||||||
|
|
||||||
flip_0O(word);
|
flip_0O(word);
|
||||||
check_debug_pt(word, -1); // For trap only
|
check_debug_pt(word, -1); // For trap only
|
||||||
set_done(word, pass); // Set acceptance
|
set_done(word, pass); // Set acceptance
|
||||||
@ -145,7 +142,7 @@ void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
|
|||||||
// PASSED TEST
|
// PASSED TEST
|
||||||
} else if (best_choice->permuter() == NUMBER_PERM) {
|
} else if (best_choice->permuter() == NUMBER_PERM) {
|
||||||
if (rej_alphas_in_number_perm) {
|
if (rej_alphas_in_number_perm) {
|
||||||
for (i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
|
for (int i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0';
|
||||||
offset += best_choice->unichar_lengths()[i++]) {
|
offset += best_choice->unichar_lengths()[i++]) {
|
||||||
if (word->reject_map[i].accepted() &&
|
if (word->reject_map[i].accepted() &&
|
||||||
word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
|
word->uch_set->get_isalpha(best_choice->unichar_string().c_str() + offset,
|
||||||
@ -210,7 +207,7 @@ void Tesseract::reject_I_1_L(WERD_RES *word) {
|
|||||||
|
|
||||||
void reject_poor_matches(WERD_RES *word) {
|
void reject_poor_matches(WERD_RES *word) {
|
||||||
float threshold = compute_reject_threshold(word->best_choice);
|
float threshold = compute_reject_threshold(word->best_choice);
|
||||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word->best_choice->length(); ++i) {
|
||||||
if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
|
if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) {
|
||||||
word->reject_map[i].setrej_tess_failure();
|
word->reject_map[i].setrej_tess_failure();
|
||||||
} else if (word->best_choice->certainty(i) < threshold) {
|
} else if (word->best_choice->certainty(i) < threshold) {
|
||||||
@ -232,16 +229,16 @@ float compute_reject_threshold(WERD_CHOICE *word) {
|
|||||||
float bestgap = 0.0f; // biggest gap
|
float bestgap = 0.0f; // biggest gap
|
||||||
float gapstart; // bottom of gap
|
float gapstart; // bottom of gap
|
||||||
|
|
||||||
int blob_count = word->length();
|
auto blob_count = word->length();
|
||||||
std::vector<float> ratings;
|
std::vector<float> ratings;
|
||||||
ratings.reserve(blob_count);
|
ratings.reserve(blob_count);
|
||||||
for (int i = 0; i < blob_count; ++i) {
|
for (unsigned i = 0; i < blob_count; ++i) {
|
||||||
ratings.push_back(word->certainty(i));
|
ratings.push_back(word->certainty(i));
|
||||||
}
|
}
|
||||||
std::sort(ratings.begin(), ratings.end());
|
std::sort(ratings.begin(), ratings.end());
|
||||||
gapstart = ratings[0] - 1; // all reject if none better
|
gapstart = ratings[0] - 1; // all reject if none better
|
||||||
if (blob_count >= 3) {
|
if (blob_count >= 3) {
|
||||||
for (int index = 0; index < blob_count - 1; index++) {
|
for (unsigned index = 0; index < blob_count - 1; index++) {
|
||||||
if (ratings[index + 1] - ratings[index] > bestgap) {
|
if (ratings[index + 1] - ratings[index] > bestgap) {
|
||||||
bestgap = ratings[index + 1] - ratings[index];
|
bestgap = ratings[index + 1] - ratings[index];
|
||||||
// find biggest
|
// find biggest
|
||||||
@ -514,14 +511,12 @@ bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_len
|
|||||||
* Don't unreject LONE accepted 1Il conflict set chars
|
* Don't unreject LONE accepted 1Il conflict set chars
|
||||||
*************************************************************************/
|
*************************************************************************/
|
||||||
void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
||||||
int i = 0;
|
|
||||||
int offset;
|
|
||||||
int word_len = word->reject_map.length();
|
int word_len = word->reject_map.length();
|
||||||
const char *s = word->best_choice->unichar_string().c_str();
|
const char *s = word->best_choice->unichar_string().c_str();
|
||||||
const char *lengths = word->best_choice->unichar_lengths().c_str();
|
const char *lengths = word->best_choice->unichar_lengths().c_str();
|
||||||
bool accepted_1Il = false;
|
bool accepted_1Il = false;
|
||||||
|
|
||||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||||
if (word->reject_map[i].accepted()) {
|
if (word->reject_map[i].accepted()) {
|
||||||
if (conflict_set_I_l_1.contains(s[offset])) {
|
if (conflict_set_I_l_1.contains(s[offset])) {
|
||||||
accepted_1Il = true;
|
accepted_1Il = true;
|
||||||
@ -537,7 +532,7 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
|||||||
return; // Nothing to worry about
|
return; // Nothing to worry about
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
for (int i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||||
if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
|
if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted()) {
|
||||||
word->reject_map[i].setrej_postNN_1Il();
|
word->reject_map[i].setrej_postNN_1Il();
|
||||||
}
|
}
|
||||||
@ -547,7 +542,7 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
|||||||
int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
|
int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
const WERD_CHOICE *best_choice = word_res->best_choice;
|
const WERD_CHOICE *best_choice = word_res->best_choice;
|
||||||
for (int i = 0; i < word_res->reject_map.length(); ++i) {
|
for (unsigned i = 0; i < word_res->reject_map.length(); ++i) {
|
||||||
if ((word_res->reject_map[i].accepted()) &&
|
if ((word_res->reject_map[i].accepted()) &&
|
||||||
(word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
|
(word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
|
||||||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
|
word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
|
||||||
@ -568,9 +563,6 @@ void Tesseract::reject_mostly_rejects(WERD_RES *word) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
|
bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
|
||||||
int16_t char_quality;
|
|
||||||
int16_t accepted_char_quality;
|
|
||||||
|
|
||||||
if (word->best_choice->unichar_lengths().length() <= 1) {
|
if (word->best_choice->unichar_lengths().length() <= 1) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -580,15 +572,17 @@ bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
|
UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
|
||||||
for (int i = 1; i < word->best_choice->length(); ++i) {
|
for (unsigned i = 1; i < word->best_choice->length(); ++i) {
|
||||||
if (word->best_choice->unichar_id(i) != uch_id) {
|
if (word->best_choice->unichar_id(i) != uch_id) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int16_t char_quality;
|
||||||
|
int16_t accepted_char_quality;
|
||||||
word_char_quality(word, &char_quality, &accepted_char_quality);
|
word_char_quality(word, &char_quality, &accepted_char_quality);
|
||||||
|
|
||||||
if ((word->best_choice->unichar_lengths().length() == char_quality) &&
|
if ((word->best_choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
|
||||||
(char_quality == accepted_char_quality)) {
|
(char_quality == accepted_char_quality)) {
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
@ -607,7 +601,6 @@ int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
|
|||||||
// in word_res->best_choice.
|
// in word_res->best_choice.
|
||||||
void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||||
int i;
|
|
||||||
int prev_right = -9999;
|
int prev_right = -9999;
|
||||||
int next_left;
|
int next_left;
|
||||||
TBOX out_box;
|
TBOX out_box;
|
||||||
@ -617,9 +610,9 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
auto num_blobs = word_res->rebuild_word->NumBlobs();
|
||||||
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
|
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
|
||||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||||
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
||||||
out_box = blob->bounding_box();
|
out_box = blob->bounding_box();
|
||||||
if (i + 1 == num_blobs) {
|
if (i + 1 == num_blobs) {
|
||||||
@ -666,15 +659,14 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
|||||||
// in word_res->best_choice.
|
// in word_res->best_choice.
|
||||||
void Tesseract::flip_0O(WERD_RES *word_res) {
|
void Tesseract::flip_0O(WERD_RES *word_res) {
|
||||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||||
int i;
|
|
||||||
TBOX out_box;
|
TBOX out_box;
|
||||||
|
|
||||||
if (!tessedit_flip_0O) {
|
if (!tessedit_flip_0O) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
auto num_blobs = word_res->rebuild_word->NumBlobs();
|
||||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||||
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
TBLOB *blob = word_res->rebuild_word->blobs[i];
|
||||||
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
|
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
|
||||||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
|
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
|
||||||
@ -691,7 +683,7 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
|
|||||||
unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
|
unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
|
||||||
return; // 0 or O are not present/enabled in unicharset
|
return; // 0 or O are not present/enabled in unicharset
|
||||||
}
|
}
|
||||||
for (i = 1; i < best_choice->length(); ++i) {
|
for (unsigned i = 1; i < best_choice->length(); ++i) {
|
||||||
if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
|
if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
|
||||||
/* A0A */
|
/* A0A */
|
||||||
if ((i + 1) < best_choice->length() &&
|
if ((i + 1) < best_choice->length() &&
|
||||||
|
@ -228,7 +228,7 @@ void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
|
|||||||
i = j;
|
i = j;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ASSERT_HOST(blob_indices->size() == word_length_);
|
ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
|
static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
|
||||||
@ -501,7 +501,7 @@ bool ResultIterator::Next(PageIteratorLevel level) {
|
|||||||
case RIL_SYMBOL: {
|
case RIL_SYMBOL: {
|
||||||
std::vector<int> blob_order;
|
std::vector<int> blob_order;
|
||||||
CalculateBlobOrder(&blob_order);
|
CalculateBlobOrder(&blob_order);
|
||||||
int next_blob = 0;
|
unsigned next_blob = 0;
|
||||||
while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
|
while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
|
||||||
next_blob++;
|
next_blob++;
|
||||||
}
|
}
|
||||||
|
@ -502,13 +502,13 @@ WERD_RES *Tesseract::TrySuperscriptSplits(int num_chopped_leading, float leading
|
|||||||
*/
|
*/
|
||||||
bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
|
bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold,
|
||||||
int *left_ok, int *right_ok) const {
|
int *left_ok, int *right_ok) const {
|
||||||
int initial_ok_run_count = 0;
|
unsigned initial_ok_run_count = 0;
|
||||||
int ok_run_count = 0;
|
unsigned ok_run_count = 0;
|
||||||
float worst_certainty = 0.0f;
|
float worst_certainty = 0.0f;
|
||||||
const WERD_CHOICE &wc = *word.best_choice;
|
const WERD_CHOICE &wc = *word.best_choice;
|
||||||
|
|
||||||
const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
|
const UnicityTable<FontInfo> &fontinfo_table = get_fontinfo_table();
|
||||||
for (int i = 0; i < wc.length(); i++) {
|
for (unsigned i = 0; i < wc.length(); i++) {
|
||||||
TBLOB *blob = word.rebuild_word->blobs[i];
|
TBLOB *blob = word.rebuild_word->blobs[i];
|
||||||
UNICHAR_ID unichar_id = wc.unichar_id(i);
|
UNICHAR_ID unichar_id = wc.unichar_id(i);
|
||||||
float char_certainty = wc.certainty(i);
|
float char_certainty = wc.certainty(i);
|
||||||
|
@ -593,7 +593,7 @@ public:
|
|||||||
void recog_word_recursive(WERD_RES *word);
|
void recog_word_recursive(WERD_RES *word);
|
||||||
void recog_word(WERD_RES *word);
|
void recog_word(WERD_RES *word);
|
||||||
void split_and_recog_word(WERD_RES *word);
|
void split_and_recog_word(WERD_RES *word);
|
||||||
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
|
void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,
|
||||||
BlamerBundle **orig_blamer_bundle) const;
|
BlamerBundle **orig_blamer_bundle) const;
|
||||||
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const;
|
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const;
|
||||||
//// fixspace.cpp ///////////////////////////////////////////////////////
|
//// fixspace.cpp ///////////////////////////////////////////////////////
|
||||||
@ -722,8 +722,8 @@ public:
|
|||||||
// vector holding classification results for a sequence of consecutive
|
// vector holding classification results for a sequence of consecutive
|
||||||
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
|
||||||
void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
void SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
|
||||||
int choices_length, const std::vector<UNICHAR_ID> &target_text,
|
unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
|
||||||
int text_index, float rating, std::vector<int> *segmentation,
|
unsigned text_index, float rating, std::vector<int> *segmentation,
|
||||||
float *best_rating, std::vector<int> *best_segmentation);
|
float *best_rating, std::vector<int> *best_segmentation);
|
||||||
// Counts up the labelled words and the blobs within.
|
// Counts up the labelled words and the blobs within.
|
||||||
// Deletes all unused or emptied words, counting the unused ones.
|
// Deletes all unused or emptied words, counting the unused ones.
|
||||||
|
@ -47,14 +47,7 @@ void Tesseract::recog_word(WERD_RES *word) {
|
|||||||
ASSERT_HOST(!word->chopped_word->blobs.empty());
|
ASSERT_HOST(!word->chopped_word->blobs.empty());
|
||||||
recog_word_recursive(word);
|
recog_word_recursive(word);
|
||||||
word->SetupBoxWord();
|
word->SetupBoxWord();
|
||||||
if (word->best_choice->length() != word->box_word->length()) {
|
ASSERT_HOST(static_cast<unsigned>(word->best_choice->length()) == word->box_word->length());
|
||||||
tprintf(
|
|
||||||
"recog_word ASSERT FAIL String:\"%s\"; "
|
|
||||||
"Strlen=%d; #Blobs=%d\n",
|
|
||||||
word->best_choice->debug_string().c_str(), word->best_choice->length(),
|
|
||||||
word->box_word->length());
|
|
||||||
}
|
|
||||||
ASSERT_HOST(word->best_choice->length() == word->box_word->length());
|
|
||||||
// Check that the ratings matrix size matches the sum of all the
|
// Check that the ratings matrix size matches the sum of all the
|
||||||
// segmentation states.
|
// segmentation states.
|
||||||
if (!word->StatesAllValid()) {
|
if (!word->StatesAllValid()) {
|
||||||
@ -82,7 +75,7 @@ void Tesseract::recog_word(WERD_RES *word) {
|
|||||||
// Factored out from control.cpp
|
// Factored out from control.cpp
|
||||||
ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
|
ASSERT_HOST((word->best_choice == nullptr) == (word->raw_choice == nullptr));
|
||||||
if (word->best_choice == nullptr || word->best_choice->empty() ||
|
if (word->best_choice == nullptr || word->best_choice->empty() ||
|
||||||
static_cast<int>(strspn(word->best_choice->unichar_string().c_str(), " ")) ==
|
strspn(word->best_choice->unichar_string().c_str(), " ") ==
|
||||||
word->best_choice->length()) {
|
word->best_choice->length()) {
|
||||||
word->tess_failed = true;
|
word->tess_failed = true;
|
||||||
word->reject_map.initialise(word->box_word->length());
|
word->reject_map.initialise(word->box_word->length());
|
||||||
@ -99,7 +92,7 @@ void Tesseract::recog_word(WERD_RES *word) {
|
|||||||
* Convert the output back to editor form.
|
* Convert the output back to editor form.
|
||||||
**********************************************************************/
|
**********************************************************************/
|
||||||
void Tesseract::recog_word_recursive(WERD_RES *word) {
|
void Tesseract::recog_word_recursive(WERD_RES *word) {
|
||||||
int word_length = word->chopped_word->NumBlobs(); // no of blobs
|
auto word_length = word->chopped_word->NumBlobs(); // no of blobs
|
||||||
if (word_length > MAX_UNDIVIDED_LENGTH) {
|
if (word_length > MAX_UNDIVIDED_LENGTH) {
|
||||||
return split_and_recog_word(word);
|
return split_and_recog_word(word);
|
||||||
}
|
}
|
||||||
@ -134,7 +127,7 @@ void Tesseract::split_and_recog_word(WERD_RES *word) {
|
|||||||
// Find the biggest blob gap in the chopped_word.
|
// Find the biggest blob gap in the chopped_word.
|
||||||
int bestgap = -INT32_MAX;
|
int bestgap = -INT32_MAX;
|
||||||
int split_index = 0;
|
int split_index = 0;
|
||||||
for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
|
for (unsigned b = 1; b < word->chopped_word->NumBlobs(); ++b) {
|
||||||
TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
|
TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
|
||||||
TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
|
TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
|
||||||
int gap = blob_box.left() - prev_box.right();
|
int gap = blob_box.left() - prev_box.right();
|
||||||
@ -167,7 +160,7 @@ void Tesseract::split_and_recog_word(WERD_RES *word) {
|
|||||||
* and will now be owned by the caller. New blamer bundles are forged for the
|
* and will now be owned by the caller. New blamer bundles are forged for the
|
||||||
* two pieces.
|
* two pieces.
|
||||||
**********************************************************************/
|
**********************************************************************/
|
||||||
void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
|
void Tesseract::split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece,
|
||||||
BlamerBundle **orig_blamer_bundle) const {
|
BlamerBundle **orig_blamer_bundle) const {
|
||||||
ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
|
ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());
|
||||||
|
|
||||||
@ -181,7 +174,7 @@ void Tesseract::split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece,
|
|||||||
TWERD *chopped = word->chopped_word;
|
TWERD *chopped = word->chopped_word;
|
||||||
auto *chopped2 = new TWERD;
|
auto *chopped2 = new TWERD;
|
||||||
chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
|
chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
|
||||||
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
|
for (auto i = split_pt; i < chopped->NumBlobs(); ++i) {
|
||||||
chopped2->blobs.push_back(chopped->blobs[i]);
|
chopped2->blobs.push_back(chopped->blobs[i]);
|
||||||
}
|
}
|
||||||
chopped->blobs.resize(split_pt);
|
chopped->blobs.resize(split_pt);
|
||||||
|
@ -205,9 +205,8 @@ std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
|
|||||||
|
|
||||||
int r;
|
int r;
|
||||||
if (method == ThresholdMethod::Sauvola) {
|
if (method == ThresholdMethod::Sauvola) {
|
||||||
bool b;
|
|
||||||
int window_size;
|
int window_size;
|
||||||
b = api->GetIntVariable("thresholding_window_size", &window_size);
|
api->GetIntVariable("thresholding_window_size", &window_size);
|
||||||
int half_window_size = window_size / 2;
|
int half_window_size = window_size / 2;
|
||||||
// factor for image division into tiles; >= 1
|
// factor for image division into tiles; >= 1
|
||||||
l_int32 nx, ny;
|
l_int32 nx, ny;
|
||||||
@ -226,19 +225,18 @@ std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
|
|||||||
}
|
}
|
||||||
|
|
||||||
double kfactor;
|
double kfactor;
|
||||||
b = api->GetDoubleVariable("thresholding_kfactor", &kfactor);
|
api->GetDoubleVariable("thresholding_kfactor", &kfactor);
|
||||||
r = pixSauvolaBinarizeTiled(pix_grey, half_window_size, kfactor, nx, ny,
|
r = pixSauvolaBinarizeTiled(pix_grey, half_window_size, kfactor, nx, ny,
|
||||||
(PIX**)pix_thresholds,
|
(PIX**)pix_thresholds,
|
||||||
(PIX**)pix_binary);
|
(PIX**)pix_binary);
|
||||||
} else { // if (method == ThresholdMethod::AdaptiveOtsu)
|
} else { // if (method == ThresholdMethod::AdaptiveOtsu)
|
||||||
bool b;
|
|
||||||
int tile_size;
|
int tile_size;
|
||||||
b = api->GetIntVariable("thresholding_tile_size", &tile_size);
|
api->GetIntVariable("thresholding_tile_size", &tile_size);
|
||||||
int smooth_size;
|
int smooth_size;
|
||||||
b = api->GetIntVariable("thresholding_smooth_size", &smooth_size);
|
api->GetIntVariable("thresholding_smooth_size", &smooth_size);
|
||||||
int half_smooth_size = smooth_size / 2;
|
int half_smooth_size = smooth_size / 2;
|
||||||
double score_fraction;
|
double score_fraction;
|
||||||
b = api->GetDoubleVariable("thresholding_score_fraction", &score_fraction);
|
api->GetDoubleVariable("thresholding_score_fraction", &score_fraction);
|
||||||
r = pixOtsuAdaptiveThreshold(pix_grey, tile_size, tile_size,
|
r = pixOtsuAdaptiveThreshold(pix_grey, tile_size, tile_size,
|
||||||
half_smooth_size, half_smooth_size,
|
half_smooth_size, half_smooth_size,
|
||||||
score_fraction,
|
score_fraction,
|
||||||
|
@ -72,7 +72,7 @@ void BlamerBundle::SetWordTruth(const UNICHARSET &unicharset, const char *truth_
|
|||||||
std::vector<char> lengths;
|
std::vector<char> lengths;
|
||||||
unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);
|
unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);
|
||||||
int total_length = 0;
|
int total_length = 0;
|
||||||
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
|
for (size_t i = 0; i < encoding.size(); total_length += lengths[i++]) {
|
||||||
std::string uch(truth_str + total_length);
|
std::string uch(truth_str + total_length);
|
||||||
uch.resize(lengths[i] - total_length);
|
uch.resize(lengths[i] - total_length);
|
||||||
UNICHAR_ID id = encoding[i];
|
UNICHAR_ID id = encoding[i];
|
||||||
@ -119,7 +119,7 @@ bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE *word_choice) const {
|
|||||||
}
|
}
|
||||||
const UNICHARSET *uni_set = word_choice->unicharset();
|
const UNICHARSET *uni_set = word_choice->unicharset();
|
||||||
std::string normed_choice_str;
|
std::string normed_choice_str;
|
||||||
for (int i = 0; i < word_choice->length(); ++i) {
|
for (unsigned i = 0; i < word_choice->length(); ++i) {
|
||||||
normed_choice_str += uni_set->get_normed_unichar(word_choice->unichar_id(i));
|
normed_choice_str += uni_set->get_normed_unichar(word_choice->unichar_id(i));
|
||||||
}
|
}
|
||||||
std::string truth_str = TruthString();
|
std::string truth_str = TruthString();
|
||||||
@ -155,7 +155,7 @@ void BlamerBundle::SetupNormTruthWord(const DENORM &denorm) {
|
|||||||
TPOINT botright;
|
TPOINT botright;
|
||||||
TPOINT norm_topleft;
|
TPOINT norm_topleft;
|
||||||
TPOINT norm_botright;
|
TPOINT norm_botright;
|
||||||
for (int b = 0; b < truth_word_.length(); ++b) {
|
for (unsigned b = 0; b < truth_word_.length(); ++b) {
|
||||||
const TBOX &box = truth_word_.BlobBox(b);
|
const TBOX &box = truth_word_.BlobBox(b);
|
||||||
topleft.x = box.left();
|
topleft.x = box.left();
|
||||||
topleft.y = box.top();
|
topleft.y = box.top();
|
||||||
@ -175,8 +175,7 @@ void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, Blam
|
|||||||
BlamerBundle *bundle2) const {
|
BlamerBundle *bundle2) const {
|
||||||
std::string debug_str;
|
std::string debug_str;
|
||||||
// Find truth boxes that correspond to the split in the blobs.
|
// Find truth boxes that correspond to the split in the blobs.
|
||||||
int b;
|
unsigned begin2_truth_index = 0;
|
||||||
int begin2_truth_index = -1;
|
|
||||||
if (incorrect_result_reason_ != IRR_NO_TRUTH && truth_has_char_boxes_) {
|
if (incorrect_result_reason_ != IRR_NO_TRUTH && truth_has_char_boxes_) {
|
||||||
debug_str = "Looking for truth split at";
|
debug_str = "Looking for truth split at";
|
||||||
debug_str += " end1_x " + std::to_string(word1_right);
|
debug_str += " end1_x " + std::to_string(word1_right);
|
||||||
@ -184,7 +183,7 @@ void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, Blam
|
|||||||
debug_str += "\nnorm_truth_word boxes:\n";
|
debug_str += "\nnorm_truth_word boxes:\n";
|
||||||
if (norm_truth_word_.length() > 1) {
|
if (norm_truth_word_.length() > 1) {
|
||||||
norm_truth_word_.BlobBox(0).print_to_str(debug_str);
|
norm_truth_word_.BlobBox(0).print_to_str(debug_str);
|
||||||
for (b = 1; b < norm_truth_word_.length(); ++b) {
|
for (unsigned b = 1; b < norm_truth_word_.length(); ++b) {
|
||||||
norm_truth_word_.BlobBox(b).print_to_str(debug_str);
|
norm_truth_word_.BlobBox(b).print_to_str(debug_str);
|
||||||
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) < norm_box_tolerance_) &&
|
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) < norm_box_tolerance_) &&
|
||||||
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) < norm_box_tolerance_)) {
|
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) < norm_box_tolerance_)) {
|
||||||
@ -204,7 +203,7 @@ void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, Blam
|
|||||||
bundle2->truth_has_char_boxes_ = true;
|
bundle2->truth_has_char_boxes_ = true;
|
||||||
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
|
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
|
||||||
BlamerBundle *curr_bb = bundle1;
|
BlamerBundle *curr_bb = bundle1;
|
||||||
for (b = 0; b < norm_truth_word_.length(); ++b) {
|
for (unsigned b = 0; b < norm_truth_word_.length(); ++b) {
|
||||||
if (b == begin2_truth_index) {
|
if (b == begin2_truth_index) {
|
||||||
curr_bb = bundle2;
|
curr_bb = bundle2;
|
||||||
}
|
}
|
||||||
@ -264,7 +263,7 @@ void BlamerBundle::BlameClassifier(const UNICHARSET &unicharset, const TBOX &blo
|
|||||||
return; // Nothing to do here.
|
return; // Nothing to do here.
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int b = 0; b < norm_truth_word_.length(); ++b) {
|
for (unsigned b = 0; b < norm_truth_word_.length(); ++b) {
|
||||||
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
|
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
|
||||||
// Note that we are more strict on the bounding box boundaries here
|
// Note that we are more strict on the bounding box boundaries here
|
||||||
// than in other places (chopper, segmentation search), since we do
|
// than in other places (chopper, segmentation search), since we do
|
||||||
@ -313,7 +312,7 @@ void BlamerBundle::SetChopperBlame(const WERD_RES *word, bool debug) {
|
|||||||
}
|
}
|
||||||
bool missing_chop = false;
|
bool missing_chop = false;
|
||||||
int num_blobs = word->chopped_word->blobs.size();
|
int num_blobs = word->chopped_word->blobs.size();
|
||||||
int box_index = 0;
|
unsigned box_index = 0;
|
||||||
int blob_index = 0;
|
int blob_index = 0;
|
||||||
int16_t truth_x = -1;
|
int16_t truth_x = -1;
|
||||||
while (box_index < truth_word_.length() && blob_index < num_blobs) {
|
while (box_index < truth_word_.length() && blob_index < num_blobs) {
|
||||||
@ -366,7 +365,7 @@ void BlamerBundle::BlameClassifierOrLangModel(const WERD_RES *word, const UNICHA
|
|||||||
if (valid_permuter) {
|
if (valid_permuter) {
|
||||||
// Find out whether best choice is a top choice.
|
// Find out whether best choice is a top choice.
|
||||||
best_choice_is_dict_and_top_choice_ = true;
|
best_choice_is_dict_and_top_choice_ = true;
|
||||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
for (unsigned i = 0; i < word->best_choice->length(); ++i) {
|
||||||
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
|
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
|
||||||
ASSERT_HOST(!blob_choice_it.empty());
|
ASSERT_HOST(!blob_choice_it.empty());
|
||||||
BLOB_CHOICE *first_choice = nullptr;
|
BLOB_CHOICE *first_choice = nullptr;
|
||||||
@ -414,7 +413,7 @@ void BlamerBundle::SetupCorrectSegmentation(const TWERD *word, bool debug) {
|
|||||||
}
|
}
|
||||||
int blob_index = 0;
|
int blob_index = 0;
|
||||||
int16_t next_box_x = word->blobs[blob_index]->bounding_box().right();
|
int16_t next_box_x = word->blobs[blob_index]->bounding_box().right();
|
||||||
for (int truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.length();
|
for (unsigned truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.length();
|
||||||
++blob_index) {
|
++blob_index) {
|
||||||
++next_box_col;
|
++next_box_col;
|
||||||
int16_t curr_box_x = next_box_x;
|
int16_t curr_box_x = next_box_x;
|
||||||
@ -477,7 +476,7 @@ void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *rati
|
|||||||
// Fill pain points for any unclassifed blob corresponding to the
|
// Fill pain points for any unclassifed blob corresponding to the
|
||||||
// correct segmentation state.
|
// correct segmentation state.
|
||||||
debug_str += "Correct segmentation:\n";
|
debug_str += "Correct segmentation:\n";
|
||||||
for (int idx = 0; idx < correct_segmentation_cols_.size(); ++idx) {
|
for (unsigned idx = 0; idx < correct_segmentation_cols_.size(); ++idx) {
|
||||||
debug_str += "col=" + std::to_string(correct_segmentation_cols_[idx]);
|
debug_str += "col=" + std::to_string(correct_segmentation_cols_[idx]);
|
||||||
debug_str += " row=" + std::to_string(correct_segmentation_rows_[idx]);
|
debug_str += " row=" + std::to_string(correct_segmentation_rows_[idx]);
|
||||||
debug_str += "\n";
|
debug_str += "\n";
|
||||||
|
@ -871,12 +871,15 @@ TBOX TWERD::bounding_box() const {
|
|||||||
|
|
||||||
// Merges the blobs from start to end, not including end, and deletes
|
// Merges the blobs from start to end, not including end, and deletes
|
||||||
// the blobs between start and end.
|
// the blobs between start and end.
|
||||||
void TWERD::MergeBlobs(int start, int end) {
|
void TWERD::MergeBlobs(unsigned start, unsigned end) {
|
||||||
if (start >= blobs.size() - 1) {
|
if (end > blobs.size()) {
|
||||||
|
end = blobs.size();
|
||||||
|
}
|
||||||
|
if (start >= end) {
|
||||||
return; // Nothing to do.
|
return; // Nothing to do.
|
||||||
}
|
}
|
||||||
TESSLINE *outline = blobs[start]->outlines;
|
TESSLINE *outline = blobs[start]->outlines;
|
||||||
for (int i = start + 1; i < end && i < blobs.size(); ++i) {
|
for (auto i = start + 1; i < end; ++i) {
|
||||||
TBLOB *next_blob = blobs[i];
|
TBLOB *next_blob = blobs[i];
|
||||||
// Take the outlines from the next blob.
|
// Take the outlines from the next blob.
|
||||||
if (outline == nullptr) {
|
if (outline == nullptr) {
|
||||||
@ -895,7 +898,7 @@ void TWERD::MergeBlobs(int start, int end) {
|
|||||||
}
|
}
|
||||||
// Remove dead blobs from the vector.
|
// Remove dead blobs from the vector.
|
||||||
// TODO: optimize.
|
// TODO: optimize.
|
||||||
for (int i = start + 1; i < end && start + 1 < blobs.size(); ++i) {
|
for (auto i = start + 1; i < end && start + 1 < blobs.size(); ++i) {
|
||||||
blobs.erase(blobs.begin() + start + 1);
|
blobs.erase(blobs.begin() + start + 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -446,14 +446,14 @@ struct TWERD {
|
|||||||
void ComputeBoundingBoxes();
|
void ComputeBoundingBoxes();
|
||||||
|
|
||||||
// Returns the number of blobs in the word.
|
// Returns the number of blobs in the word.
|
||||||
int NumBlobs() const {
|
unsigned NumBlobs() const {
|
||||||
return blobs.size();
|
return blobs.size();
|
||||||
}
|
}
|
||||||
TBOX bounding_box() const;
|
TBOX bounding_box() const;
|
||||||
|
|
||||||
// Merges the blobs from start to end, not including end, and deletes
|
// Merges the blobs from start to end, not including end, and deletes
|
||||||
// the blobs between start and end.
|
// the blobs between start and end.
|
||||||
void MergeBlobs(int start, int end);
|
void MergeBlobs(unsigned start, unsigned end);
|
||||||
|
|
||||||
#ifndef GRAPHICS_DISABLED
|
#ifndef GRAPHICS_DISABLED
|
||||||
void plot(ScrollView *window);
|
void plot(ScrollView *window);
|
||||||
|
@ -46,7 +46,7 @@ void BoxWord::CopyFrom(const BoxWord &src) {
|
|||||||
length_ = src.length_;
|
length_ = src.length_;
|
||||||
boxes_.clear();
|
boxes_.clear();
|
||||||
boxes_.reserve(length_);
|
boxes_.reserve(length_);
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
boxes_.push_back(src.boxes_[i]);
|
boxes_.push_back(src.boxes_[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -60,7 +60,7 @@ BoxWord *BoxWord::CopyFromNormalized(TWERD *tessword) {
|
|||||||
// Allocate memory.
|
// Allocate memory.
|
||||||
boxword->boxes_.reserve(boxword->length_);
|
boxword->boxes_.reserve(boxword->length_);
|
||||||
|
|
||||||
for (int b = 0; b < boxword->length_; ++b) {
|
for (unsigned b = 0; b < boxword->length_; ++b) {
|
||||||
TBLOB *tblob = tessword->blobs[b];
|
TBLOB *tblob = tessword->blobs[b];
|
||||||
TBOX blob_box;
|
TBOX blob_box;
|
||||||
for (TESSLINE *outline = tblob->outlines; outline != nullptr; outline = outline->next) {
|
for (TESSLINE *outline = tblob->outlines; outline != nullptr; outline = outline->next) {
|
||||||
@ -89,7 +89,7 @@ BoxWord *BoxWord::CopyFromNormalized(TWERD *tessword) {
|
|||||||
// expanding slightly, then clipping to the blobs from the original_word
|
// expanding slightly, then clipping to the blobs from the original_word
|
||||||
// that overlap. If not null, the block provides the inverse rotation.
|
// that overlap. If not null, the block provides the inverse rotation.
|
||||||
void BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) {
|
void BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) {
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
TBOX box = boxes_[i];
|
TBOX box = boxes_[i];
|
||||||
// Expand by a single pixel, as the poly approximation error is 1 pixel.
|
// Expand by a single pixel, as the poly approximation error is 1 pixel.
|
||||||
box = TBOX(box.left() - 1, box.bottom() - 1, box.right() + 1, box.top() + 1);
|
box = TBOX(box.left() - 1, box.bottom() - 1, box.right() + 1, box.top() + 1);
|
||||||
@ -130,18 +130,18 @@ void BoxWord::ClipToOriginalWord(const BLOCK *block, WERD *original_word) {
|
|||||||
|
|
||||||
// Merges the boxes from start to end, not including end, and deletes
|
// Merges the boxes from start to end, not including end, and deletes
|
||||||
// the boxes between start and end.
|
// the boxes between start and end.
|
||||||
void BoxWord::MergeBoxes(int start, int end) {
|
void BoxWord::MergeBoxes(unsigned start, unsigned end) {
|
||||||
start = ClipToRange(start, 0, length_);
|
start = ClipToRange(start, 0U, length_);
|
||||||
end = ClipToRange(end, 0, length_);
|
end = ClipToRange(end, 0U, length_);
|
||||||
if (end <= start + 1) {
|
if (end <= start + 1) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (int i = start + 1; i < end; ++i) {
|
for (unsigned i = start + 1; i < end; ++i) {
|
||||||
boxes_[start] += boxes_[i];
|
boxes_[start] += boxes_[i];
|
||||||
}
|
}
|
||||||
int shrinkage = end - 1 - start;
|
int shrinkage = end - 1 - start;
|
||||||
length_ -= shrinkage;
|
length_ -= shrinkage;
|
||||||
for (int i = start + 1; i < length_; ++i) {
|
for (unsigned i = start + 1; i < length_; ++i) {
|
||||||
boxes_[i] = boxes_[i + shrinkage];
|
boxes_[i] = boxes_[i + shrinkage];
|
||||||
}
|
}
|
||||||
boxes_.resize(length_);
|
boxes_.resize(length_);
|
||||||
@ -149,7 +149,7 @@ void BoxWord::MergeBoxes(int start, int end) {
|
|||||||
|
|
||||||
// Inserts a new box before the given index.
|
// Inserts a new box before the given index.
|
||||||
// Recomputes the bounding box.
|
// Recomputes the bounding box.
|
||||||
void BoxWord::InsertBox(int index, const TBOX &box) {
|
void BoxWord::InsertBox(unsigned index, const TBOX &box) {
|
||||||
if (index < length_) {
|
if (index < length_) {
|
||||||
boxes_.insert(boxes_.begin() + index, box);
|
boxes_.insert(boxes_.begin() + index, box);
|
||||||
} else {
|
} else {
|
||||||
@ -161,15 +161,15 @@ void BoxWord::InsertBox(int index, const TBOX &box) {
|
|||||||
|
|
||||||
// Changes the box at the given index to the new box.
|
// Changes the box at the given index to the new box.
|
||||||
// Recomputes the bounding box.
|
// Recomputes the bounding box.
|
||||||
void BoxWord::ChangeBox(int index, const TBOX &box) {
|
void BoxWord::ChangeBox(unsigned index, const TBOX &box) {
|
||||||
boxes_[index] = box;
|
boxes_[index] = box;
|
||||||
ComputeBoundingBox();
|
ComputeBoundingBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deletes the box with the given index, and shuffles up the rest.
|
// Deletes the box with the given index, and shuffles up the rest.
|
||||||
// Recomputes the bounding box.
|
// Recomputes the bounding box.
|
||||||
void BoxWord::DeleteBox(int index) {
|
void BoxWord::DeleteBox(unsigned index) {
|
||||||
ASSERT_HOST(0 <= index && index < length_);
|
ASSERT_HOST(index < length_);
|
||||||
boxes_.erase(boxes_.begin() + index);
|
boxes_.erase(boxes_.begin() + index);
|
||||||
--length_;
|
--length_;
|
||||||
ComputeBoundingBox();
|
ComputeBoundingBox();
|
||||||
@ -185,7 +185,7 @@ void BoxWord::DeleteAllBoxes() {
|
|||||||
// Computes the bounding box of the word.
|
// Computes the bounding box of the word.
|
||||||
void BoxWord::ComputeBoundingBox() {
|
void BoxWord::ComputeBoundingBox() {
|
||||||
bbox_ = TBOX();
|
bbox_ = TBOX();
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
bbox_ += boxes_[i];
|
bbox_ += boxes_[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -194,7 +194,7 @@ void BoxWord::ComputeBoundingBox() {
|
|||||||
// for each blob index where the bounding boxes match.
|
// for each blob index where the bounding boxes match.
|
||||||
// The callback is deleted on completion.
|
// The callback is deleted on completion.
|
||||||
void BoxWord::ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const {
|
void BoxWord::ProcessMatchedBlobs(const TWERD &other, std::function<void(int)> cb) const {
|
||||||
for (int i = 0; i < length_ && i < other.NumBlobs(); ++i) {
|
for (unsigned i = 0; i < length_ && i < other.NumBlobs(); ++i) {
|
||||||
TBOX blob_box = other.blobs[i]->bounding_box();
|
TBOX blob_box = other.blobs[i]->bounding_box();
|
||||||
if (blob_box == boxes_[i]) {
|
if (blob_box == boxes_[i]) {
|
||||||
cb(i);
|
cb(i);
|
||||||
|
@ -52,19 +52,19 @@ public:
|
|||||||
|
|
||||||
// Merges the boxes from start to end, not including end, and deletes
|
// Merges the boxes from start to end, not including end, and deletes
|
||||||
// the boxes between start and end.
|
// the boxes between start and end.
|
||||||
void MergeBoxes(int start, int end);
|
void MergeBoxes(unsigned start, unsigned end);
|
||||||
|
|
||||||
// Inserts a new box before the given index.
|
// Inserts a new box before the given index.
|
||||||
// Recomputes the bounding box.
|
// Recomputes the bounding box.
|
||||||
void InsertBox(int index, const TBOX &box);
|
void InsertBox(unsigned index, const TBOX &box);
|
||||||
|
|
||||||
// Changes the box at the given index to the new box.
|
// Changes the box at the given index to the new box.
|
||||||
// Recomputes the bounding box.
|
// Recomputes the bounding box.
|
||||||
void ChangeBox(int index, const TBOX &box);
|
void ChangeBox(unsigned index, const TBOX &box);
|
||||||
|
|
||||||
// Deletes the box with the given index, and shuffles up the rest.
|
// Deletes the box with the given index, and shuffles up the rest.
|
||||||
// Recomputes the bounding box.
|
// Recomputes the bounding box.
|
||||||
void DeleteBox(int index);
|
void DeleteBox(unsigned index);
|
||||||
|
|
||||||
// Deletes all the boxes stored in BoxWord.
|
// Deletes all the boxes stored in BoxWord.
|
||||||
void DeleteAllBoxes();
|
void DeleteAllBoxes();
|
||||||
@ -77,10 +77,10 @@ public:
|
|||||||
const TBOX &bounding_box() const {
|
const TBOX &bounding_box() const {
|
||||||
return bbox_;
|
return bbox_;
|
||||||
}
|
}
|
||||||
int length() const {
|
unsigned length() const {
|
||||||
return length_;
|
return length_;
|
||||||
}
|
}
|
||||||
const TBOX &BlobBox(int index) const {
|
const TBOX &BlobBox(unsigned index) const {
|
||||||
return boxes_[index];
|
return boxes_[index];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,7 +88,7 @@ private:
|
|||||||
void ComputeBoundingBox();
|
void ComputeBoundingBox();
|
||||||
|
|
||||||
TBOX bbox_;
|
TBOX bbox_;
|
||||||
int length_;
|
unsigned length_;
|
||||||
std::vector<TBOX> boxes_;
|
std::vector<TBOX> boxes_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -145,7 +145,7 @@ double DetLineFit::ConstrainedFit(const FCOORD &direction, double min_dist, doub
|
|||||||
if (debug) {
|
if (debug) {
|
||||||
tprintf("Constrained fit to dir %g, %g = %d, %d :%zu distances:\n", direction.x(), direction.y(),
|
tprintf("Constrained fit to dir %g, %g = %d, %d :%zu distances:\n", direction.x(), direction.y(),
|
||||||
line_pt->x(), line_pt->y(), distances_.size());
|
line_pt->x(), line_pt->y(), distances_.size());
|
||||||
for (int i = 0; i < distances_.size(); ++i) {
|
for (unsigned i = 0; i < distances_.size(); ++i) {
|
||||||
tprintf("%d: %d, %d -> %g\n", i, distances_[i].data().x(), distances_[i].data().y(),
|
tprintf("%d: %d, %d -> %g\n", i, distances_[i].data().x(), distances_[i].data().y(),
|
||||||
distances_[i].key());
|
distances_[i].key());
|
||||||
}
|
}
|
||||||
@ -260,7 +260,7 @@ void DetLineFit::ComputeDistances(const ICOORD &start, const ICOORD &end) {
|
|||||||
// Compute the distance of each point from the line.
|
// Compute the distance of each point from the line.
|
||||||
int prev_abs_dist = 0;
|
int prev_abs_dist = 0;
|
||||||
int prev_dot = 0;
|
int prev_dot = 0;
|
||||||
for (int i = 0; i < pts_.size(); ++i) {
|
for (unsigned i = 0; i < pts_.size(); ++i) {
|
||||||
ICOORD pt_vector = pts_[i].pt;
|
ICOORD pt_vector = pts_[i].pt;
|
||||||
pt_vector -= start;
|
pt_vector -= start;
|
||||||
int dot = line_vector % pt_vector;
|
int dot = line_vector % pt_vector;
|
||||||
|
@ -83,7 +83,7 @@ bool FontInfoTable::SetContainsMultipleFontProperties(
|
|||||||
}
|
}
|
||||||
int first_font = font_set[0].fontinfo_id;
|
int first_font = font_set[0].fontinfo_id;
|
||||||
uint32_t properties = at(first_font).properties;
|
uint32_t properties = at(first_font).properties;
|
||||||
for (int f = 1; f < font_set.size(); ++f) {
|
for (unsigned f = 1; f < font_set.size(); ++f) {
|
||||||
if (at(font_set[f].fontinfo_id).properties != properties) {
|
if (at(font_set[f].fontinfo_id).properties != properties) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -95,7 +95,7 @@ bool FontInfoTable::SetContainsMultipleFontProperties(
|
|||||||
void FontInfoTable::MoveSpacingInfoFrom(FontInfoTable *other) {
|
void FontInfoTable::MoveSpacingInfoFrom(FontInfoTable *other) {
|
||||||
using namespace std::placeholders; // for _1, _2
|
using namespace std::placeholders; // for _1, _2
|
||||||
set_clear_callback(std::bind(FontInfoDeleteCallback, _1));
|
set_clear_callback(std::bind(FontInfoDeleteCallback, _1));
|
||||||
for (int i = 0; i < other->size(); ++i) {
|
for (unsigned i = 0; i < other->size(); ++i) {
|
||||||
std::vector<FontSpacingInfo *> *spacing_vec = other->at(i).spacing_vec;
|
std::vector<FontSpacingInfo *> *spacing_vec = other->at(i).spacing_vec;
|
||||||
if (spacing_vec != nullptr) {
|
if (spacing_vec != nullptr) {
|
||||||
int target_index = get_index(other->at(i));
|
int target_index = get_index(other->at(i));
|
||||||
@ -117,7 +117,7 @@ void FontInfoTable::MoveTo(UnicityTable<FontInfo> *target) {
|
|||||||
target->clear();
|
target->clear();
|
||||||
using namespace std::placeholders; // for _1, _2
|
using namespace std::placeholders; // for _1, _2
|
||||||
target->set_clear_callback(std::bind(FontInfoDeleteCallback, _1));
|
target->set_clear_callback(std::bind(FontInfoDeleteCallback, _1));
|
||||||
for (int i = 0; i < size(); ++i) {
|
for (unsigned i = 0; i < size(); ++i) {
|
||||||
// Bit copy the FontInfo and steal all the pointers.
|
// Bit copy the FontInfo and steal all the pointers.
|
||||||
target->push_back(at(i));
|
target->push_back(at(i));
|
||||||
at(i).name = nullptr;
|
at(i).name = nullptr;
|
||||||
|
@ -292,7 +292,7 @@ void ImageData::Display() const {
|
|||||||
}
|
}
|
||||||
win->TextAttributes("Arial", text_size, false, false, false);
|
win->TextAttributes("Arial", text_size, false, false, false);
|
||||||
if (!boxes_.empty()) {
|
if (!boxes_.empty()) {
|
||||||
for (int b = 0; b < boxes_.size(); ++b) {
|
for (unsigned b = 0; b < boxes_.size(); ++b) {
|
||||||
boxes_[b].plot(win);
|
boxes_[b].plot(win);
|
||||||
win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
|
win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
|
||||||
}
|
}
|
||||||
@ -312,7 +312,7 @@ void ImageData::Display() const {
|
|||||||
void ImageData::AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
|
void ImageData::AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
|
||||||
const std::vector<int> &box_pages) {
|
const std::vector<int> &box_pages) {
|
||||||
// Copy the boxes and make the transcription.
|
// Copy the boxes and make the transcription.
|
||||||
for (int i = 0; i < box_pages.size(); ++i) {
|
for (unsigned i = 0; i < box_pages.size(); ++i) {
|
||||||
if (page_number_ >= 0 && box_pages[i] != page_number_) {
|
if (page_number_ >= 0 && box_pages[i] != page_number_) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -483,7 +483,7 @@ bool DocumentData::IsPageAvailable(int index, ImageData **page) {
|
|||||||
}
|
}
|
||||||
if (num_pages > 0) {
|
if (num_pages > 0) {
|
||||||
index = Modulo(index, num_pages);
|
index = Modulo(index, num_pages);
|
||||||
if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) {
|
if (pages_offset_ <= index && static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {
|
||||||
*page = pages_[index - pages_offset_]; // Page is available already.
|
*page = pages_[index - pages_offset_]; // Page is available already.
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,7 @@
|
|||||||
/**********************************************************************
|
/**********************************************************************
|
||||||
* File: mod128.h (Formerly dir128.h)
|
* File: mod128.h (Formerly dir128.h)
|
||||||
* Description: Header for class which implements modulo arithmetic.
|
* Description: Header for class which implements modulo arithmetic.
|
||||||
* Author: Ray Smith
|
* Author: Ray Smith
|
||||||
* Created: Tue Mar 26 17:48:13 GMT 1991
|
|
||||||
*
|
*
|
||||||
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
||||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
@ -81,7 +80,6 @@ public:
|
|||||||
return dir;
|
return dir;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
int8_t dir; // a direction
|
int8_t dir; // a direction
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -449,18 +449,18 @@ bool WERD_RES::IsAmbiguous() {
|
|||||||
// Returns true if the ratings matrix size matches the sum of each of the
|
// Returns true if the ratings matrix size matches the sum of each of the
|
||||||
// segmentation states.
|
// segmentation states.
|
||||||
bool WERD_RES::StatesAllValid() {
|
bool WERD_RES::StatesAllValid() {
|
||||||
int ratings_dim = ratings->dimension();
|
unsigned ratings_dim = ratings->dimension();
|
||||||
if (raw_choice->TotalOfStates() != ratings_dim) {
|
if (raw_choice->TotalOfStates() != ratings_dim) {
|
||||||
tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
|
tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",
|
||||||
raw_choice->TotalOfStates(), ratings_dim);
|
raw_choice->TotalOfStates(), ratings_dim);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
WERD_CHOICE_IT it(&best_choices);
|
WERD_CHOICE_IT it(&best_choices);
|
||||||
int index = 0;
|
unsigned index = 0;
|
||||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
|
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
|
||||||
WERD_CHOICE *choice = it.data();
|
WERD_CHOICE *choice = it.data();
|
||||||
if (choice->TotalOfStates() != ratings_dim) {
|
if (choice->TotalOfStates() != ratings_dim) {
|
||||||
tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n", index,
|
tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n", index,
|
||||||
choice->TotalOfStates(), ratings_dim);
|
choice->TotalOfStates(), ratings_dim);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -522,19 +522,19 @@ void WERD_RES::FilterWordChoices(int debug_level) {
|
|||||||
// Since the two words may use different segmentations of the chunks, we
|
// Since the two words may use different segmentations of the chunks, we
|
||||||
// iterate over the chunks to find out whether a comparable blob
|
// iterate over the chunks to find out whether a comparable blob
|
||||||
// classification is much worse than the best result.
|
// classification is much worse than the best result.
|
||||||
int i = 0, j = 0, chunk = 0;
|
unsigned i = 0, j = 0, chunk = 0;
|
||||||
// Each iteration of the while deals with 1 chunk. On entry choice_chunk
|
// Each iteration of the while deals with 1 chunk. On entry choice_chunk
|
||||||
// and best_chunk are the indices of the first chunk in the NEXT blob,
|
// and best_chunk are the indices of the first chunk in the NEXT blob,
|
||||||
// i.e. we don't have to increment i, j while chunk < choice_chunk and
|
// i.e. we don't have to increment i, j while chunk < choice_chunk and
|
||||||
// best_chunk respectively.
|
// best_chunk respectively.
|
||||||
int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
|
auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
|
||||||
while (i < choice->length() && j < best_choice->length()) {
|
while (i < choice->length() && j < best_choice->length()) {
|
||||||
if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
|
if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
|
||||||
choice->certainty(i) - best_choice->certainty(j) < threshold) {
|
choice->certainty(i) - best_choice->certainty(j) < threshold) {
|
||||||
if (debug_level >= 2) {
|
if (debug_level >= 2) {
|
||||||
choice->print("WorstCertaintyDiffWorseThan");
|
choice->print("WorstCertaintyDiffWorseThan");
|
||||||
tprintf(
|
tprintf(
|
||||||
"i %d j %d Choice->Blob[i].Certainty %.4g"
|
"i %u j %u Choice->Blob[i].Certainty %.4g"
|
||||||
" WorstOtherChoiceCertainty %g Threshold %g\n",
|
" WorstOtherChoiceCertainty %g Threshold %g\n",
|
||||||
i, j, choice->certainty(i), best_choice->certainty(j), threshold);
|
i, j, choice->certainty(i), best_choice->certainty(j), threshold);
|
||||||
tprintf("Discarding bad choice #%d\n", index);
|
tprintf("Discarding bad choice #%d\n", index);
|
||||||
@ -561,7 +561,7 @@ void WERD_RES::ComputeAdaptionThresholds(float certainty_scale, float min_rating
|
|||||||
int end_chunk = best_choice->state(0);
|
int end_chunk = best_choice->state(0);
|
||||||
int end_raw_chunk = raw_choice->state(0);
|
int end_raw_chunk = raw_choice->state(0);
|
||||||
int raw_blob = 0;
|
int raw_blob = 0;
|
||||||
for (int i = 0; i < best_choice->length(); i++, thresholds++) {
|
for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
|
||||||
float avg_rating = 0.0f;
|
float avg_rating = 0.0f;
|
||||||
int num_error_chunks = 0;
|
int num_error_chunks = 0;
|
||||||
|
|
||||||
@ -737,8 +737,8 @@ int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
// Returns the width of a gap between the specified blob and the next one.
|
// Returns the width of a gap between the specified blob and the next one.
|
||||||
int WERD_RES::GetBlobsGap(int blob_index) const {
|
int WERD_RES::GetBlobsGap(unsigned blob_index) const {
|
||||||
if (blob_index < 0 || blob_index >= blob_gaps.size()) {
|
if (blob_index >= blob_gaps.size()) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return blob_gaps[blob_index];
|
return blob_gaps[blob_index];
|
||||||
@ -748,8 +748,8 @@ int WERD_RES::GetBlobsGap(int blob_index) const {
|
|||||||
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
||||||
// Borrowed pointer, so do not delete. May return nullptr if there is no
|
// Borrowed pointer, so do not delete. May return nullptr if there is no
|
||||||
// BLOB_CHOICE matching the unichar_id at the given index.
|
// BLOB_CHOICE matching the unichar_id at the given index.
|
||||||
BLOB_CHOICE *WERD_RES::GetBlobChoice(int index) const {
|
BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const {
|
||||||
if (index < 0 || index >= best_choice->length()) {
|
if (index >= best_choice->length()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
BLOB_CHOICE_LIST *choices = GetBlobChoices(index);
|
BLOB_CHOICE_LIST *choices = GetBlobChoices(index);
|
||||||
@ -826,7 +826,7 @@ void WERD_RES::RebuildBestState() {
|
|||||||
}
|
}
|
||||||
best_state.clear();
|
best_state.clear();
|
||||||
int start = 0;
|
int start = 0;
|
||||||
for (int i = 0; i < best_choice->length(); ++i) {
|
for (unsigned i = 0; i < best_choice->length(); ++i) {
|
||||||
int length = best_choice->state(i);
|
int length = best_choice->state(i);
|
||||||
best_state.push_back(length);
|
best_state.push_back(length);
|
||||||
if (length > 1) {
|
if (length > 1) {
|
||||||
@ -847,10 +847,10 @@ void WERD_RES::CloneChoppedToRebuild() {
|
|||||||
delete rebuild_word;
|
delete rebuild_word;
|
||||||
rebuild_word = new TWERD(*chopped_word);
|
rebuild_word = new TWERD(*chopped_word);
|
||||||
SetupBoxWord();
|
SetupBoxWord();
|
||||||
int word_len = box_word->length();
|
auto word_len = box_word->length();
|
||||||
best_state.reserve(word_len);
|
best_state.reserve(word_len);
|
||||||
correct_text.reserve(word_len);
|
correct_text.reserve(word_len);
|
||||||
for (int i = 0; i < word_len; ++i) {
|
for (unsigned i = 0; i < word_len; ++i) {
|
||||||
best_state.push_back(1);
|
best_state.push_back(1);
|
||||||
correct_text.emplace_back("");
|
correct_text.emplace_back("");
|
||||||
}
|
}
|
||||||
@ -886,14 +886,14 @@ void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
|
|||||||
// providing a single classifier result for each blob.
|
// providing a single classifier result for each blob.
|
||||||
// The BLOB_CHOICEs are consumed and the word takes ownership.
|
// The BLOB_CHOICEs are consumed and the word takes ownership.
|
||||||
// The number of blobs in the box_word must match blob_count.
|
// The number of blobs in the box_word must match blob_count.
|
||||||
void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE **choices) {
|
void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {
|
||||||
// Setup the WERD_RES.
|
// Setup the WERD_RES.
|
||||||
ASSERT_HOST(box_word != nullptr);
|
ASSERT_HOST(box_word != nullptr);
|
||||||
ASSERT_HOST(blob_count == box_word->length());
|
ASSERT_HOST(blob_count == box_word->length());
|
||||||
ClearWordChoices();
|
ClearWordChoices();
|
||||||
ClearRatings();
|
ClearRatings();
|
||||||
ratings = new MATRIX(blob_count, 1);
|
ratings = new MATRIX(blob_count, 1);
|
||||||
for (int c = 0; c < blob_count; ++c) {
|
for (unsigned c = 0; c < blob_count; ++c) {
|
||||||
auto *choice_list = new BLOB_CHOICE_LIST;
|
auto *choice_list = new BLOB_CHOICE_LIST;
|
||||||
BLOB_CHOICE_IT choice_it(choice_list);
|
BLOB_CHOICE_IT choice_it(choice_list);
|
||||||
choice_it.add_after_then_move(choices[c]);
|
choice_it.add_after_then_move(choices[c]);
|
||||||
@ -936,7 +936,7 @@ void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
|
|||||||
void WERD_RES::BestChoiceToCorrectText() {
|
void WERD_RES::BestChoiceToCorrectText() {
|
||||||
correct_text.clear();
|
correct_text.clear();
|
||||||
ASSERT_HOST(best_choice != nullptr);
|
ASSERT_HOST(best_choice != nullptr);
|
||||||
for (int i = 0; i < best_choice->length(); ++i) {
|
for (unsigned i = 0; i < best_choice->length(); ++i) {
|
||||||
UNICHAR_ID choice_id = best_choice->unichar_id(i);
|
UNICHAR_ID choice_id = best_choice->unichar_id(i);
|
||||||
const char *blob_choice = uch_set->id_to_unichar(choice_id);
|
const char *blob_choice = uch_set->id_to_unichar(choice_id);
|
||||||
correct_text.emplace_back(blob_choice);
|
correct_text.emplace_back(blob_choice);
|
||||||
@ -952,7 +952,7 @@ bool WERD_RES::ConditionalBlobMerge(std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR
|
|||||||
std::function<bool(const TBOX &, const TBOX &)> box_cb) {
|
std::function<bool(const TBOX &, const TBOX &)> box_cb) {
|
||||||
ASSERT_HOST(best_choice->empty() || ratings != nullptr);
|
ASSERT_HOST(best_choice->empty() || ratings != nullptr);
|
||||||
bool modified = false;
|
bool modified = false;
|
||||||
for (int i = 0; i + 1 < best_choice->length(); ++i) {
|
for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
|
||||||
UNICHAR_ID new_id = class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
|
UNICHAR_ID new_id = class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
|
||||||
if (new_id != INVALID_UNICHAR_ID &&
|
if (new_id != INVALID_UNICHAR_ID &&
|
||||||
(box_cb == nullptr || box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
|
(box_cb == nullptr || box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
|
||||||
@ -979,7 +979,7 @@ bool WERD_RES::ConditionalBlobMerge(std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR
|
|||||||
|
|
||||||
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
|
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
|
||||||
// all the data to account for the change.
|
// all the data to account for the change.
|
||||||
void WERD_RES::MergeAdjacentBlobs(int index) {
|
void WERD_RES::MergeAdjacentBlobs(unsigned index) {
|
||||||
if (reject_map.length() == best_choice->length()) {
|
if (reject_map.length() == best_choice->length()) {
|
||||||
reject_map.remove_pos(index);
|
reject_map.remove_pos(index);
|
||||||
}
|
}
|
||||||
@ -1072,7 +1072,7 @@ UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
|
|||||||
void WERD_RES::merge_tess_fails() {
|
void WERD_RES::merge_tess_fails() {
|
||||||
using namespace std::placeholders; // for _1, _2
|
using namespace std::placeholders; // for _1, _2
|
||||||
if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2), nullptr)) {
|
if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2), nullptr)) {
|
||||||
int len = best_choice->length();
|
unsigned len = best_choice->length();
|
||||||
ASSERT_HOST(reject_map.length() == len);
|
ASSERT_HOST(reject_map.length() == len);
|
||||||
ASSERT_HOST(box_word->length() == len);
|
ASSERT_HOST(box_word->length() == len);
|
||||||
}
|
}
|
||||||
@ -1083,7 +1083,7 @@ void WERD_RES::merge_tess_fails() {
|
|||||||
bool WERD_RES::PiecesAllNatural(int start, int count) const {
|
bool WERD_RES::PiecesAllNatural(int start, int count) const {
|
||||||
// all seams must have no splits.
|
// all seams must have no splits.
|
||||||
for (int index = start; index < start + count - 1; ++index) {
|
for (int index = start; index < start + count - 1; ++index) {
|
||||||
if (index >= 0 && index < seam_array.size()) {
|
if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) {
|
||||||
SEAM *seam = seam_array[index];
|
SEAM *seam = seam_array[index];
|
||||||
if (seam != nullptr && seam->HasAnySplits()) {
|
if (seam != nullptr && seam->HasAnySplits()) {
|
||||||
return false;
|
return false;
|
||||||
@ -1278,7 +1278,7 @@ static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words, i
|
|||||||
TBOX clipped_box;
|
TBOX clipped_box;
|
||||||
TBOX current_box = words[w_index]->word->bounding_box();
|
TBOX current_box = words[w_index]->word->bounding_box();
|
||||||
TBOX next_box;
|
TBOX next_box;
|
||||||
if (w_index + 1 < words.size() && words[w_index + 1] != nullptr &&
|
if (static_cast<size_t>(w_index + 1) < words.size() && words[w_index + 1] != nullptr &&
|
||||||
words[w_index + 1]->word != nullptr) {
|
words[w_index + 1]->word != nullptr) {
|
||||||
next_box = words[w_index + 1]->word->bounding_box();
|
next_box = words[w_index + 1]->word->bounding_box();
|
||||||
}
|
}
|
||||||
@ -1383,7 +1383,7 @@ void PAGE_RES_IT::ReplaceCurrentWord(tesseract::PointerVector<WERD_RES> *words)
|
|||||||
C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
|
C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
|
||||||
rej_b_it.sort(&C_BLOB::SortByXMiddle);
|
rej_b_it.sort(&C_BLOB::SortByXMiddle);
|
||||||
TBOX clip_box;
|
TBOX clip_box;
|
||||||
for (int w = 0; w < words->size(); ++w) {
|
for (size_t w = 0; w < words->size(); ++w) {
|
||||||
WERD_RES *word_w = (*words)[w];
|
WERD_RES *word_w = (*words)[w];
|
||||||
clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
|
clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
|
||||||
// Compute blob boundaries.
|
// Compute blob boundaries.
|
||||||
@ -1401,7 +1401,7 @@ void PAGE_RES_IT::ReplaceCurrentWord(tesseract::PointerVector<WERD_RES> *words)
|
|||||||
C_BLOB_IT dest_it(word_w->word->cblob_list());
|
C_BLOB_IT dest_it(word_w->word->cblob_list());
|
||||||
// Build the box word as we move the blobs.
|
// Build the box word as we move the blobs.
|
||||||
auto *box_word = new tesseract::BoxWord;
|
auto *box_word = new tesseract::BoxWord;
|
||||||
for (int i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
|
for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
|
||||||
int end_x = blob_ends[i];
|
int end_x = blob_ends[i];
|
||||||
TBOX blob_box;
|
TBOX blob_box;
|
||||||
// Add the blobs up to end_x.
|
// Add the blobs up to end_x.
|
||||||
|
@ -357,12 +357,12 @@ public:
|
|||||||
// This matters for mirrorable characters such as parentheses. We recognize
|
// This matters for mirrorable characters such as parentheses. We recognize
|
||||||
// characters purely based on their shape on the page, and by default produce
|
// characters purely based on their shape on the page, and by default produce
|
||||||
// the corresponding unicode for a left-to-right context.
|
// the corresponding unicode for a left-to-right context.
|
||||||
const char *BestUTF8(int blob_index, bool in_rtl_context) const {
|
const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const {
|
||||||
if (blob_index < 0 || best_choice == nullptr || blob_index >= best_choice->length()) {
|
if (best_choice == nullptr || blob_index >= best_choice->length()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
UNICHAR_ID id = best_choice->unichar_id(blob_index);
|
UNICHAR_ID id = best_choice->unichar_id(blob_index);
|
||||||
if (id < 0 || id >= uch_set->size()) {
|
if (static_cast<unsigned>(id) >= uch_set->size()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
UNICHAR_ID mirrored = uch_set->get_mirror(id);
|
UNICHAR_ID mirrored = uch_set->get_mirror(id);
|
||||||
@ -372,19 +372,19 @@ public:
|
|||||||
return uch_set->id_to_unichar_ext(id);
|
return uch_set->id_to_unichar_ext(id);
|
||||||
}
|
}
|
||||||
// Returns the UTF-8 string for the given blob index in the raw_choice word.
|
// Returns the UTF-8 string for the given blob index in the raw_choice word.
|
||||||
const char *RawUTF8(int blob_index) const {
|
const char *RawUTF8(unsigned blob_index) const {
|
||||||
if (blob_index < 0 || blob_index >= raw_choice->length()) {
|
if (blob_index >= raw_choice->length()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
UNICHAR_ID id = raw_choice->unichar_id(blob_index);
|
UNICHAR_ID id = raw_choice->unichar_id(blob_index);
|
||||||
if (id < 0 || id >= uch_set->size()) {
|
if (static_cast<unsigned>(id) >= uch_set->size()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
return uch_set->id_to_unichar(id);
|
return uch_set->id_to_unichar(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
UNICHARSET::Direction SymbolDirection(int blob_index) const {
|
UNICHARSET::Direction SymbolDirection(unsigned blob_index) const {
|
||||||
if (best_choice == nullptr || blob_index >= best_choice->length() || blob_index < 0) {
|
if (best_choice == nullptr || blob_index >= best_choice->length()) {
|
||||||
return UNICHARSET::U_OTHER_NEUTRAL;
|
return UNICHARSET::U_OTHER_NEUTRAL;
|
||||||
}
|
}
|
||||||
return uch_set->get_direction(best_choice->unichar_id(blob_index));
|
return uch_set->get_direction(best_choice->unichar_id(blob_index));
|
||||||
@ -394,9 +394,9 @@ public:
|
|||||||
if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1) {
|
if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (int id = 0; id < best_choice->length(); id++) {
|
for (unsigned id = 0; id < best_choice->length(); id++) {
|
||||||
int unichar_id = best_choice->unichar_id(id);
|
unsigned unichar_id = best_choice->unichar_id(id);
|
||||||
if (unichar_id < 0 || unichar_id >= uch_set->size()) {
|
if (unichar_id >= uch_set->size()) {
|
||||||
continue; // Ignore illegal chars.
|
continue; // Ignore illegal chars.
|
||||||
}
|
}
|
||||||
UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
|
UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
|
||||||
@ -411,9 +411,9 @@ public:
|
|||||||
if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1) {
|
if (uch_set == nullptr || best_choice == nullptr || best_choice->length() < 1) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (int id = 0; id < best_choice->length(); id++) {
|
for (unsigned id = 0; id < best_choice->length(); id++) {
|
||||||
int unichar_id = best_choice->unichar_id(id);
|
unsigned unichar_id = best_choice->unichar_id(id);
|
||||||
if (unichar_id < 0 || unichar_id >= uch_set->size()) {
|
if (unichar_id >= uch_set->size()) {
|
||||||
continue; // Ignore illegal chars.
|
continue; // Ignore illegal chars.
|
||||||
}
|
}
|
||||||
UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
|
UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
|
||||||
@ -550,13 +550,13 @@ public:
|
|||||||
// inclusive.
|
// inclusive.
|
||||||
int GetBlobsWidth(int start_blob, int last_blob) const;
|
int GetBlobsWidth(int start_blob, int last_blob) const;
|
||||||
// Returns the width of a gap between the specified blob and the next one.
|
// Returns the width of a gap between the specified blob and the next one.
|
||||||
int GetBlobsGap(int blob_index) const;
|
int GetBlobsGap(unsigned blob_index) const;
|
||||||
|
|
||||||
// Returns the BLOB_CHOICE corresponding to the given index in the
|
// Returns the BLOB_CHOICE corresponding to the given index in the
|
||||||
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
||||||
// Borrowed pointer, so do not delete. May return nullptr if there is no
|
// Borrowed pointer, so do not delete. May return nullptr if there is no
|
||||||
// BLOB_CHOICE matching the unichar_id at the given index.
|
// BLOB_CHOICE matching the unichar_id at the given index.
|
||||||
BLOB_CHOICE *GetBlobChoice(int index) const;
|
BLOB_CHOICE *GetBlobChoice(unsigned index) const;
|
||||||
|
|
||||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
|
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
|
||||||
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
||||||
@ -602,7 +602,7 @@ public:
|
|||||||
// providing a single classifier result for each blob.
|
// providing a single classifier result for each blob.
|
||||||
// The BLOB_CHOICEs are consumed and the word takes ownership.
|
// The BLOB_CHOICEs are consumed and the word takes ownership.
|
||||||
// The number of blobs in the box_word must match blob_count.
|
// The number of blobs in the box_word must match blob_count.
|
||||||
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices);
|
void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices);
|
||||||
|
|
||||||
// Creates a WERD_CHOICE for the word using the top choices from the leading
|
// Creates a WERD_CHOICE for the word using the top choices from the leading
|
||||||
// diagonal of the ratings matrix.
|
// diagonal of the ratings matrix.
|
||||||
@ -621,7 +621,7 @@ public:
|
|||||||
|
|
||||||
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
|
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
|
||||||
// all the data to account for the change.
|
// all the data to account for the change.
|
||||||
void MergeAdjacentBlobs(int index);
|
void MergeAdjacentBlobs(unsigned index);
|
||||||
|
|
||||||
// Callback helper for fix_quotes returns a double quote if both
|
// Callback helper for fix_quotes returns a double quote if both
|
||||||
// arguments are quote, otherwise INVALID_UNICHAR_ID.
|
// arguments are quote, otherwise INVALID_UNICHAR_ID.
|
||||||
|
@ -243,7 +243,7 @@ void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float sr
|
|||||||
this->init(src_lengths ? strlen(src_lengths) : src_string_len);
|
this->init(src_lengths ? strlen(src_lengths) : src_string_len);
|
||||||
length_ = reserved_;
|
length_ = reserved_;
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
int unichar_length = src_lengths ? src_lengths[i] : 1;
|
int unichar_length = src_lengths ? src_lengths[i] : 1;
|
||||||
unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);
|
unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);
|
||||||
state_[i] = 1;
|
state_[i] = 1;
|
||||||
@ -270,7 +270,7 @@ const char *WERD_CHOICE::permuter_name() const {
|
|||||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
||||||
// taken from the appropriate cell in the ratings MATRIX.
|
// taken from the appropriate cell in the ratings MATRIX.
|
||||||
// Borrowed pointer, so do not delete.
|
// Borrowed pointer, so do not delete.
|
||||||
BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(int index, MATRIX *ratings) const {
|
BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const {
|
||||||
MATRIX_COORD coord = MatrixCoord(index);
|
MATRIX_COORD coord = MatrixCoord(index);
|
||||||
BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);
|
BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);
|
||||||
if (result == nullptr) {
|
if (result == nullptr) {
|
||||||
@ -282,9 +282,9 @@ BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(int index, MATRIX *ratings) const {
|
|||||||
|
|
||||||
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
||||||
// MATRIX for the given index into the word.
|
// MATRIX for the given index into the word.
|
||||||
MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const {
|
MATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const {
|
||||||
int col = 0;
|
int col = 0;
|
||||||
for (int i = 0; i < index; ++i) {
|
for (unsigned i = 0; i < index; ++i) {
|
||||||
col += state_[i];
|
col += state_[i];
|
||||||
}
|
}
|
||||||
int row = col + state_[index] - 1;
|
int row = col + state_[index] - 1;
|
||||||
@ -293,7 +293,7 @@ MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const {
|
|||||||
|
|
||||||
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
||||||
// unit fragment lengths, but setting the state for this index to blob_count.
|
// unit fragment lengths, but setting the state for this index to blob_count.
|
||||||
void WERD_CHOICE::set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice) {
|
void WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) {
|
||||||
unichar_ids_[index] = blob_choice->unichar_id();
|
unichar_ids_[index] = blob_choice->unichar_id();
|
||||||
script_pos_[index] = tesseract::SP_NORMAL;
|
script_pos_[index] = tesseract::SP_NORMAL;
|
||||||
state_[index] = blob_count;
|
state_[index] = blob_count;
|
||||||
@ -306,7 +306,7 @@ void WERD_CHOICE::set_blob_choice(int index, int blob_count, const BLOB_CHOICE *
|
|||||||
* Returns true if unichar_ids_ contain the given unichar_id, false otherwise.
|
* Returns true if unichar_ids_ contain the given unichar_id, false otherwise.
|
||||||
*/
|
*/
|
||||||
bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
|
bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
if (unichar_ids_[i] == unichar_id) {
|
if (unichar_ids_[i] == unichar_id) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -321,8 +321,8 @@ bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
|
|||||||
* and updates length_ and fragment_lengths_ to reflect this change.
|
* and updates length_ and fragment_lengths_ to reflect this change.
|
||||||
* Note: this function does not modify rating_ and certainty_.
|
* Note: this function does not modify rating_ and certainty_.
|
||||||
*/
|
*/
|
||||||
void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
void WERD_CHOICE::remove_unichar_ids(unsigned start, int num) {
|
||||||
ASSERT_HOST(start >= 0 && start + num <= length_);
|
ASSERT_HOST(start + num <= length_);
|
||||||
// Accumulate the states to account for the merged blobs.
|
// Accumulate the states to account for the merged blobs.
|
||||||
for (int i = 0; i < num; ++i) {
|
for (int i = 0; i < num; ++i) {
|
||||||
if (start > 0) {
|
if (start > 0) {
|
||||||
@ -331,7 +331,7 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
|||||||
state_[start + num] += state_[start + i];
|
state_[start + num] += state_[start + i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (int i = start; i + num < length_; ++i) {
|
for (unsigned i = start; i + num < length_; ++i) {
|
||||||
unichar_ids_[i] = unichar_ids_[i + num];
|
unichar_ids_[i] = unichar_ids_[i + num];
|
||||||
script_pos_[i] = script_pos_[i + num];
|
script_pos_[i] = script_pos_[i + num];
|
||||||
state_[i] = state_[i + num];
|
state_[i] = state_[i + num];
|
||||||
@ -346,7 +346,7 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
|||||||
* Reverses and mirrors unichars in unichar_ids.
|
* Reverses and mirrors unichars in unichar_ids.
|
||||||
*/
|
*/
|
||||||
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
|
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
|
||||||
for (int i = 0; i < length_ / 2; ++i) {
|
for (unsigned i = 0; i < length_ / 2; ++i) {
|
||||||
UNICHAR_ID tmp_id = unichar_ids_[i];
|
UNICHAR_ID tmp_id = unichar_ids_[i];
|
||||||
unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);
|
unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);
|
||||||
unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);
|
unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);
|
||||||
@ -363,16 +363,15 @@ void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
|
|||||||
* enclose the core portion of this word -- the part after stripping
|
* enclose the core portion of this word -- the part after stripping
|
||||||
* punctuation from the left and right.
|
* punctuation from the left and right.
|
||||||
*/
|
*/
|
||||||
void WERD_CHOICE::punct_stripped(int *start, int *end) const {
|
void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const {
|
||||||
*start = 0;
|
*start = 0;
|
||||||
*end = length() - 1;
|
*end = length();
|
||||||
while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
|
while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
|
||||||
(*start)++;
|
(*start)++;
|
||||||
}
|
}
|
||||||
while (*end > -1 && unicharset()->get_ispunctuation(unichar_id(*end))) {
|
while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
|
||||||
(*end)--;
|
(*end)--;
|
||||||
}
|
}
|
||||||
(*end)++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
|
void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
|
||||||
@ -390,14 +389,14 @@ void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
|
|||||||
*pend = end;
|
*pend = end;
|
||||||
}
|
}
|
||||||
|
|
||||||
WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
|
WERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const {
|
||||||
ASSERT_HOST(start >= 0 && start <= length_);
|
ASSERT_HOST(start <= length_);
|
||||||
ASSERT_HOST(end >= 0 && end <= length_);
|
ASSERT_HOST(end <= length_);
|
||||||
if (end < start) {
|
if (end < start) {
|
||||||
end = start;
|
end = start;
|
||||||
}
|
}
|
||||||
WERD_CHOICE retval(unicharset_, end - start);
|
WERD_CHOICE retval(unicharset_, end - start);
|
||||||
for (int i = start; i < end; i++) {
|
for (auto i = start; i < end; i++) {
|
||||||
retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
|
retval.append_unichar_id_space_allocated(unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
|
||||||
}
|
}
|
||||||
return retval;
|
return retval;
|
||||||
@ -409,8 +408,7 @@ WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
|
|||||||
* Returns true if unichar_ids contain at least one "strongly" RTL unichar.
|
* Returns true if unichar_ids contain at least one "strongly" RTL unichar.
|
||||||
*/
|
*/
|
||||||
bool WERD_CHOICE::has_rtl_unichar_id() const {
|
bool WERD_CHOICE::has_rtl_unichar_id() const {
|
||||||
int i;
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
for (i = 0; i < length_; ++i) {
|
|
||||||
UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
|
UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
|
||||||
if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
|
if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
|
||||||
return true;
|
return true;
|
||||||
@ -430,7 +428,7 @@ void WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_le
|
|||||||
if (word_lengths_str != nullptr) {
|
if (word_lengths_str != nullptr) {
|
||||||
*word_lengths_str = "";
|
*word_lengths_str = "";
|
||||||
}
|
}
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
|
const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
|
||||||
*word_str += ch;
|
*word_str += ch;
|
||||||
if (word_lengths_str != nullptr) {
|
if (word_lengths_str != nullptr) {
|
||||||
@ -466,7 +464,7 @@ WERD_CHOICE &WERD_CHOICE::operator+=(const WERD_CHOICE &second) {
|
|||||||
this->double_the_size();
|
this->double_the_size();
|
||||||
}
|
}
|
||||||
const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();
|
const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();
|
||||||
for (int i = 0; i < second.length(); ++i) {
|
for (unsigned i = 0; i < second.length(); ++i) {
|
||||||
unichar_ids_[length_ + i] = other_unichar_ids[i];
|
unichar_ids_[length_ + i] = other_unichar_ids[i];
|
||||||
state_[length_ + i] = second.state_[i];
|
state_[length_ + i] = second.state_[i];
|
||||||
certainties_[length_ + i] = second.certainties_[i];
|
certainties_[length_ + i] = second.certainties_[i];
|
||||||
@ -504,7 +502,7 @@ WERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) {
|
|||||||
|
|
||||||
unicharset_ = source.unicharset_;
|
unicharset_ = source.unicharset_;
|
||||||
const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();
|
const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();
|
||||||
for (int i = 0; i < source.length(); ++i) {
|
for (unsigned i = 0; i < source.length(); ++i) {
|
||||||
unichar_ids_[i] = other_unichar_ids[i];
|
unichar_ids_[i] = other_unichar_ids[i];
|
||||||
state_[i] = source.state_[i];
|
state_[i] = source.state_[i];
|
||||||
certainties_[i] = source.certainties_[i];
|
certainties_[i] = source.certainties_[i];
|
||||||
@ -528,17 +526,17 @@ WERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) {
|
|||||||
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
|
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
|
||||||
void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
|
void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
|
||||||
// Initialize to normal.
|
// Initialize to normal.
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
script_pos_[i] = tesseract::SP_NORMAL;
|
script_pos_[i] = tesseract::SP_NORMAL;
|
||||||
}
|
}
|
||||||
if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
|
if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
int position_counts[4] = {0, 0, 0, 0};
|
unsigned position_counts[4] = {0, 0, 0, 0};
|
||||||
|
|
||||||
int chunk_index = 0;
|
int chunk_index = 0;
|
||||||
for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
|
for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
|
||||||
TBLOB *tblob = word->blobs[chunk_index];
|
TBLOB *tblob = word->blobs[chunk_index];
|
||||||
int uni_id = unichar_id(blob_index);
|
int uni_id = unichar_id(blob_index);
|
||||||
TBOX blob_box = tblob->bounding_box();
|
TBOX blob_box = tblob->bounding_box();
|
||||||
@ -557,18 +555,19 @@ void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
|
|||||||
}
|
}
|
||||||
// If almost everything looks like a superscript or subscript,
|
// If almost everything looks like a superscript or subscript,
|
||||||
// we most likely just got the baseline wrong.
|
// we most likely just got the baseline wrong.
|
||||||
if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
|
if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ ||
|
||||||
position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
|
4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) {
|
||||||
if (debug >= 2) {
|
if (debug >= 2) {
|
||||||
tprintf(
|
tprintf(
|
||||||
"Most characters of %s are subscript or superscript.\n"
|
"Most characters of %s are subscript or superscript.\n"
|
||||||
"That seems wrong, so I'll assume we got the baseline wrong\n",
|
"That seems wrong, so I'll assume we got the baseline wrong\n",
|
||||||
unichar_string().c_str());
|
unichar_string().c_str());
|
||||||
}
|
}
|
||||||
for (int i = 0; i < length_; i++) {
|
for (unsigned i = 0; i < length_; i++) {
|
||||||
ScriptPos sp = script_pos_[i];
|
ScriptPos sp = script_pos_[i];
|
||||||
if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
|
if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
|
||||||
position_counts[sp]--;
|
ASSERT_HOST(position_counts[sp] > 0);
|
||||||
|
position_counts[sp]--;
|
||||||
position_counts[tesseract::SP_NORMAL]++;
|
position_counts[tesseract::SP_NORMAL]++;
|
||||||
script_pos_[i] = tesseract::SP_NORMAL;
|
script_pos_[i] = tesseract::SP_NORMAL;
|
||||||
}
|
}
|
||||||
@ -578,7 +577,7 @@ void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
|
|||||||
if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) {
|
if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) {
|
||||||
tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
|
tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
|
||||||
int chunk_index = 0;
|
int chunk_index = 0;
|
||||||
for (int blob_index = 0; blob_index < length_; ++blob_index) {
|
for (unsigned blob_index = 0; blob_index < length_; ++blob_index) {
|
||||||
if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
|
if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
|
||||||
TBLOB *tblob = word->blobs[chunk_index];
|
TBLOB *tblob = word->blobs[chunk_index];
|
||||||
ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));
|
ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));
|
||||||
@ -590,7 +589,7 @@ void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
|
|||||||
|
|
||||||
// Sets all the script_pos_ positions to the given position.
|
// Sets all the script_pos_ positions to the given position.
|
||||||
void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
|
void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
script_pos_[i] = position;
|
script_pos_[i] = position;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -629,13 +628,9 @@ ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unic
|
|||||||
|
|
||||||
// Returns the script-id (eg Han) of the dominant script in the word.
|
// Returns the script-id (eg Han) of the dominant script in the word.
|
||||||
int WERD_CHOICE::GetTopScriptID() const {
|
int WERD_CHOICE::GetTopScriptID() const {
|
||||||
int max_script = unicharset_->get_script_table_size();
|
unsigned max_script = unicharset_->get_script_table_size();
|
||||||
int *sid = new int[max_script];
|
std::vector<unsigned> sid(max_script);
|
||||||
int x;
|
for (unsigned x = 0; x < length_; ++x) {
|
||||||
for (x = 0; x < max_script; x++) {
|
|
||||||
sid[x] = 0;
|
|
||||||
}
|
|
||||||
for (x = 0; x < length_; ++x) {
|
|
||||||
int script_id = unicharset_->get_script(unichar_id(x));
|
int script_id = unicharset_->get_script(unichar_id(x));
|
||||||
sid[script_id]++;
|
sid[script_id]++;
|
||||||
}
|
}
|
||||||
@ -652,8 +647,8 @@ int WERD_CHOICE::GetTopScriptID() const {
|
|||||||
}
|
}
|
||||||
// Note that high script ID overrides lower one on a tie, thus biasing
|
// Note that high script ID overrides lower one on a tie, thus biasing
|
||||||
// towards non-Common script (if sorted that way in unicharset file).
|
// towards non-Common script (if sorted that way in unicharset file).
|
||||||
int max_sid = 0;
|
unsigned max_sid = 0;
|
||||||
for (x = 1; x < max_script; x++) {
|
for (unsigned x = 1; x < max_script; x++) {
|
||||||
if (sid[x] >= sid[max_sid]) {
|
if (sid[x] >= sid[max_sid]) {
|
||||||
max_sid = x;
|
max_sid = x;
|
||||||
}
|
}
|
||||||
@ -661,14 +656,13 @@ int WERD_CHOICE::GetTopScriptID() const {
|
|||||||
if (sid[max_sid] < length_ / 2) {
|
if (sid[max_sid] < length_ / 2) {
|
||||||
max_sid = unicharset_->null_sid();
|
max_sid = unicharset_->null_sid();
|
||||||
}
|
}
|
||||||
delete[] sid;
|
|
||||||
return max_sid;
|
return max_sid;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fixes the state_ for a chop at the given blob_posiiton.
|
// Fixes the state_ for a chop at the given blob_posiiton.
|
||||||
void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
|
void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
|
||||||
int total_chunks = 0;
|
int total_chunks = 0;
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
total_chunks += state_[i];
|
total_chunks += state_[i];
|
||||||
if (total_chunks > blob_position) {
|
if (total_chunks > blob_position) {
|
||||||
++state_[i];
|
++state_[i];
|
||||||
@ -678,9 +672,9 @@ void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns the sum of all the state elements, being the total number of blobs.
|
// Returns the sum of all the state elements, being the total number of blobs.
|
||||||
int WERD_CHOICE::TotalOfStates() const {
|
unsigned WERD_CHOICE::TotalOfStates() const {
|
||||||
int total_chunks = 0;
|
unsigned total_chunks = 0;
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
total_chunks += state_[i];
|
total_chunks += state_[i];
|
||||||
}
|
}
|
||||||
return total_chunks;
|
return total_chunks;
|
||||||
@ -693,25 +687,25 @@ int WERD_CHOICE::TotalOfStates() const {
|
|||||||
*/
|
*/
|
||||||
void WERD_CHOICE::print(const char *msg) const {
|
void WERD_CHOICE::print(const char *msg) const {
|
||||||
tprintf("%s : ", msg);
|
tprintf("%s : ", msg);
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
||||||
}
|
}
|
||||||
tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
|
tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
|
||||||
adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
|
adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
|
||||||
tprintf("pos");
|
tprintf("pos");
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
tprintf("\t%s", ScriptPosToString(script_pos_[i]));
|
tprintf("\t%s", ScriptPosToString(script_pos_[i]));
|
||||||
}
|
}
|
||||||
tprintf("\nstr");
|
tprintf("\nstr");
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
||||||
}
|
}
|
||||||
tprintf("\nstate:");
|
tprintf("\nstate:");
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
tprintf("\t%d ", state_[i]);
|
tprintf("\t%d ", state_[i]);
|
||||||
}
|
}
|
||||||
tprintf("\nC");
|
tprintf("\nC");
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
tprintf("\t%.3f", certainties_[i]);
|
tprintf("\t%.3f", certainties_[i]);
|
||||||
}
|
}
|
||||||
tprintf("\n");
|
tprintf("\n");
|
||||||
@ -720,7 +714,7 @@ void WERD_CHOICE::print(const char *msg) const {
|
|||||||
// Prints the segmentation state with an introductory message.
|
// Prints the segmentation state with an introductory message.
|
||||||
void WERD_CHOICE::print_state(const char *msg) const {
|
void WERD_CHOICE::print_state(const char *msg) const {
|
||||||
tprintf("%s", msg);
|
tprintf("%s", msg);
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
tprintf(" %d", state_[i]);
|
tprintf(" %d", state_[i]);
|
||||||
}
|
}
|
||||||
tprintf("\n");
|
tprintf("\n");
|
||||||
@ -741,7 +735,7 @@ void WERD_CHOICE::DisplaySegmentation(TWERD *word) {
|
|||||||
prev_drawn_state.clear();
|
prev_drawn_state.clear();
|
||||||
prev_drawn_state.resize(length_);
|
prev_drawn_state.resize(length_);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
if (prev_drawn_state[i] != state_[i]) {
|
if (prev_drawn_state[i] != state_[i]) {
|
||||||
already_done = false;
|
already_done = false;
|
||||||
}
|
}
|
||||||
@ -760,7 +754,7 @@ void WERD_CHOICE::DisplaySegmentation(TWERD *word) {
|
|||||||
|
|
||||||
TBOX bbox;
|
TBOX bbox;
|
||||||
int blob_index = 0;
|
int blob_index = 0;
|
||||||
for (int c = 0; c < length_; ++c) {
|
for (unsigned c = 0; c < length_; ++c) {
|
||||||
auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);
|
auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);
|
||||||
for (int i = 0; i < state_[c]; ++i, ++blob_index) {
|
for (int i = 0; i < state_[c]; ++i, ++blob_index) {
|
||||||
TBLOB *blob = word->blobs[blob_index];
|
TBLOB *blob = word->blobs[blob_index];
|
||||||
@ -780,14 +774,14 @@ bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOI
|
|||||||
if (word2.unicharset() != uchset) {
|
if (word2.unicharset() != uchset) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int w1start, w1end;
|
unsigned w1start, w1end;
|
||||||
word1.punct_stripped(&w1start, &w1end);
|
word1.punct_stripped(&w1start, &w1end);
|
||||||
int w2start, w2end;
|
unsigned w2start, w2end;
|
||||||
word2.punct_stripped(&w2start, &w2end);
|
word2.punct_stripped(&w2start, &w2end);
|
||||||
if (w1end - w1start != w2end - w2start) {
|
if (w1end - w1start != w2end - w2start) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < w1end - w1start; i++) {
|
for (unsigned i = 0; i < w1end - w1start; i++) {
|
||||||
if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
|
if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
|
||||||
uchset->to_lower(word2.unichar_id(w2start + i))) {
|
uchset->to_lower(word2.unichar_id(w2start + i))) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -280,7 +280,7 @@ public:
|
|||||||
bool empty() const {
|
bool empty() const {
|
||||||
return length_ == 0;
|
return length_ == 0;
|
||||||
}
|
}
|
||||||
inline int length() const {
|
inline unsigned length() const {
|
||||||
return length_;
|
return length_;
|
||||||
}
|
}
|
||||||
float adjust_factor() const {
|
float adjust_factor() const {
|
||||||
@ -292,15 +292,15 @@ public:
|
|||||||
inline const std::vector<UNICHAR_ID> &unichar_ids() const {
|
inline const std::vector<UNICHAR_ID> &unichar_ids() const {
|
||||||
return unichar_ids_;
|
return unichar_ids_;
|
||||||
}
|
}
|
||||||
inline UNICHAR_ID unichar_id(int index) const {
|
inline UNICHAR_ID unichar_id(unsigned index) const {
|
||||||
assert(index < length_);
|
assert(index < length_);
|
||||||
return unichar_ids_[index];
|
return unichar_ids_[index];
|
||||||
}
|
}
|
||||||
inline int state(int index) const {
|
inline unsigned state(unsigned index) const {
|
||||||
return state_[index];
|
return state_[index];
|
||||||
}
|
}
|
||||||
ScriptPos BlobPosition(int index) const {
|
ScriptPos BlobPosition(unsigned index) const {
|
||||||
if (index < 0 || index >= length_) {
|
if (index >= length_) {
|
||||||
return SP_NORMAL;
|
return SP_NORMAL;
|
||||||
}
|
}
|
||||||
return script_pos_[index];
|
return script_pos_[index];
|
||||||
@ -311,7 +311,7 @@ public:
|
|||||||
inline float certainty() const {
|
inline float certainty() const {
|
||||||
return certainty_;
|
return certainty_;
|
||||||
}
|
}
|
||||||
inline float certainty(int index) const {
|
inline float certainty(unsigned index) const {
|
||||||
return certainties_[index];
|
return certainties_[index];
|
||||||
}
|
}
|
||||||
inline float min_x_height() const {
|
inline float min_x_height() const {
|
||||||
@ -331,13 +331,13 @@ public:
|
|||||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
||||||
// taken from the appropriate cell in the ratings MATRIX.
|
// taken from the appropriate cell in the ratings MATRIX.
|
||||||
// Borrowed pointer, so do not delete.
|
// Borrowed pointer, so do not delete.
|
||||||
BLOB_CHOICE_LIST *blob_choices(int index, MATRIX *ratings) const;
|
BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const;
|
||||||
|
|
||||||
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
||||||
// MATRIX for the given index into the word.
|
// MATRIX for the given index into the word.
|
||||||
MATRIX_COORD MatrixCoord(int index) const;
|
MATRIX_COORD MatrixCoord(unsigned index) const;
|
||||||
|
|
||||||
inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
|
inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) {
|
||||||
assert(index < length_);
|
assert(index < length_);
|
||||||
unichar_ids_[index] = unichar_id;
|
unichar_ids_[index] = unichar_id;
|
||||||
}
|
}
|
||||||
@ -359,7 +359,7 @@ public:
|
|||||||
// Note: this function should only be used if all the fields
|
// Note: this function should only be used if all the fields
|
||||||
// are populated manually with set_* functions (rather than
|
// are populated manually with set_* functions (rather than
|
||||||
// (copy)constructors and append_* functions).
|
// (copy)constructors and append_* functions).
|
||||||
inline void set_length(int len) {
|
inline void set_length(unsigned len) {
|
||||||
ASSERT_HOST(reserved_ >= len);
|
ASSERT_HOST(reserved_ >= len);
|
||||||
length_ = len;
|
length_ = len;
|
||||||
}
|
}
|
||||||
@ -379,7 +379,7 @@ public:
|
|||||||
|
|
||||||
/// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
|
/// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
|
||||||
/// fragment_length_ arrays. Sets other values to default (blank) values.
|
/// fragment_length_ arrays. Sets other values to default (blank) values.
|
||||||
inline void init(int reserved) {
|
inline void init(unsigned reserved) {
|
||||||
reserved_ = reserved;
|
reserved_ = reserved;
|
||||||
if (reserved > 0) {
|
if (reserved > 0) {
|
||||||
unichar_ids_.resize(reserved);
|
unichar_ids_.resize(reserved);
|
||||||
@ -431,7 +431,7 @@ public:
|
|||||||
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty);
|
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty);
|
||||||
|
|
||||||
inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty,
|
inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty,
|
||||||
int index) {
|
unsigned index) {
|
||||||
assert(index < length_);
|
assert(index < length_);
|
||||||
unichar_ids_[index] = unichar_id;
|
unichar_ids_[index] = unichar_id;
|
||||||
state_[index] = blob_count;
|
state_[index] = blob_count;
|
||||||
@ -444,14 +444,14 @@ public:
|
|||||||
}
|
}
|
||||||
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
||||||
// unit fragment lengths, but setting the state for this index to blob_count.
|
// unit fragment lengths, but setting the state for this index to blob_count.
|
||||||
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice);
|
void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice);
|
||||||
|
|
||||||
bool contains_unichar_id(UNICHAR_ID unichar_id) const;
|
bool contains_unichar_id(UNICHAR_ID unichar_id) const;
|
||||||
void remove_unichar_ids(int index, int num);
|
void remove_unichar_ids(unsigned index, int num);
|
||||||
inline void remove_last_unichar_id() {
|
inline void remove_last_unichar_id() {
|
||||||
--length_;
|
--length_;
|
||||||
}
|
}
|
||||||
inline void remove_unichar_id(int index) {
|
inline void remove_unichar_id(unsigned index) {
|
||||||
this->remove_unichar_ids(index, 1);
|
this->remove_unichar_ids(index, 1);
|
||||||
}
|
}
|
||||||
bool has_rtl_unichar_id() const;
|
bool has_rtl_unichar_id() const;
|
||||||
@ -460,7 +460,7 @@ public:
|
|||||||
// Returns the half-open interval of unichar_id indices [start, end) which
|
// Returns the half-open interval of unichar_id indices [start, end) which
|
||||||
// enclose the core portion of this word -- the part after stripping
|
// enclose the core portion of this word -- the part after stripping
|
||||||
// punctuation from the left and right.
|
// punctuation from the left and right.
|
||||||
void punct_stripped(int *start_core, int *end_core) const;
|
void punct_stripped(unsigned *start_core, unsigned *end_core) const;
|
||||||
|
|
||||||
// Returns the indices [start, end) containing the core of the word, stripped
|
// Returns the indices [start, end) containing the core of the word, stripped
|
||||||
// of any superscript digits on either side. (i.e., the non-footnote part
|
// of any superscript digits on either side. (i.e., the non-footnote part
|
||||||
@ -469,12 +469,12 @@ public:
|
|||||||
|
|
||||||
// Return a copy of this WERD_CHOICE with the choices [start, end).
|
// Return a copy of this WERD_CHOICE with the choices [start, end).
|
||||||
// The result is useful only for checking against a dictionary.
|
// The result is useful only for checking against a dictionary.
|
||||||
WERD_CHOICE shallow_copy(int start, int end) const;
|
WERD_CHOICE shallow_copy(unsigned start, unsigned end) const;
|
||||||
|
|
||||||
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const;
|
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const;
|
||||||
std::string debug_string() const {
|
std::string debug_string() const {
|
||||||
std::string word_str;
|
std::string word_str;
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
word_str += unicharset_->debug_str(unichar_ids_[i]);
|
word_str += unicharset_->debug_str(unichar_ids_[i]);
|
||||||
word_str += " ";
|
word_str += " ";
|
||||||
}
|
}
|
||||||
@ -482,7 +482,7 @@ public:
|
|||||||
}
|
}
|
||||||
// Returns true if any unichar_id in the word is a non-space-delimited char.
|
// Returns true if any unichar_id in the word is a non-space-delimited char.
|
||||||
bool ContainsAnyNonSpaceDelimited() const {
|
bool ContainsAnyNonSpaceDelimited() const {
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
|
if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -491,7 +491,7 @@ public:
|
|||||||
}
|
}
|
||||||
// Returns true if the word is all spaces.
|
// Returns true if the word is all spaces.
|
||||||
bool IsAllSpaces() const {
|
bool IsAllSpaces() const {
|
||||||
for (int i = 0; i < length_; ++i) {
|
for (unsigned i = 0; i < length_; ++i) {
|
||||||
if (unichar_ids_[i] != UNICHAR_SPACE) {
|
if (unichar_ids_[i] != UNICHAR_SPACE) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -552,7 +552,7 @@ public:
|
|||||||
void UpdateStateForSplit(int blob_position);
|
void UpdateStateForSplit(int blob_position);
|
||||||
|
|
||||||
// Returns the sum of all the state elements, being the total number of blobs.
|
// Returns the sum of all the state elements, being the total number of blobs.
|
||||||
int TotalOfStates() const;
|
unsigned TotalOfStates() const;
|
||||||
|
|
||||||
void print() const {
|
void print() const {
|
||||||
this->print("");
|
this->print("");
|
||||||
@ -591,8 +591,8 @@ private:
|
|||||||
std::vector<ScriptPos> script_pos_; // Normal/Sub/Superscript of each unichar.
|
std::vector<ScriptPos> script_pos_; // Normal/Sub/Superscript of each unichar.
|
||||||
std::vector<int> state_; // Number of blobs in each unichar.
|
std::vector<int> state_; // Number of blobs in each unichar.
|
||||||
std::vector<float> certainties_; // Certainty of each unichar.
|
std::vector<float> certainties_; // Certainty of each unichar.
|
||||||
int reserved_; // size of the above arrays
|
unsigned reserved_; // size of the above arrays
|
||||||
int length_; // word length
|
unsigned length_; // word length
|
||||||
// Factor that was used to adjust the rating.
|
// Factor that was used to adjust the rating.
|
||||||
float adjust_factor_;
|
float adjust_factor_;
|
||||||
// Rating is the sum of the ratings of the individual blobs in the word.
|
// Rating is the sum of the ratings of the individual blobs in the word.
|
||||||
|
@ -58,22 +58,20 @@ void REJ::full_print(FILE *fp) const {
|
|||||||
|
|
||||||
REJMAP &REJMAP::operator=(const REJMAP &source) {
|
REJMAP &REJMAP::operator=(const REJMAP &source) {
|
||||||
initialise(source.len);
|
initialise(source.len);
|
||||||
for (int i = 0; i < len; i++) {
|
for (unsigned i = 0; i < len; i++) {
|
||||||
ptr[i] = source.ptr[i];
|
ptr[i] = source.ptr[i];
|
||||||
}
|
}
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::initialise(int16_t length) {
|
void REJMAP::initialise(uint16_t length) {
|
||||||
ptr = std::make_unique<REJ[]>(length);
|
ptr = std::make_unique<REJ[]>(length);
|
||||||
len = length;
|
len = length;
|
||||||
}
|
}
|
||||||
|
|
||||||
int16_t REJMAP::accept_count() const { // How many accepted?
|
int16_t REJMAP::accept_count() const { // How many accepted?
|
||||||
int i;
|
|
||||||
int16_t count = 0;
|
int16_t count = 0;
|
||||||
|
for (unsigned i = 0; i < len; i++) {
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
@ -82,7 +80,7 @@ int16_t REJMAP::accept_count() const { // How many accepted?
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool REJMAP::recoverable_rejects() const { // Any non perm rejs?
|
bool REJMAP::recoverable_rejects() const { // Any non perm rejs?
|
||||||
for (int i = 0; i < len; i++) {
|
for (unsigned i = 0; i < len; i++) {
|
||||||
if (ptr[i].recoverable()) {
|
if (ptr[i].recoverable()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -91,7 +89,7 @@ bool REJMAP::recoverable_rejects() const { // Any non perm rejs?
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool REJMAP::quality_recoverable_rejects() const { // Any potential rejs?
|
bool REJMAP::quality_recoverable_rejects() const { // Any potential rejs?
|
||||||
for (int i = 0; i < len; i++) {
|
for (unsigned i = 0; i < len; i++) {
|
||||||
if (ptr[i].accept_if_good_quality()) {
|
if (ptr[i].accept_if_good_quality()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -100,9 +98,8 @@ bool REJMAP::quality_recoverable_rejects() const { // Any potential rejs?
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::remove_pos( // Cut out an element
|
void REJMAP::remove_pos( // Cut out an element
|
||||||
int16_t pos // element to remove
|
uint16_t pos // element to remove
|
||||||
) {
|
) {
|
||||||
ASSERT_HOST(pos >= 0);
|
|
||||||
ASSERT_HOST(pos < len);
|
ASSERT_HOST(pos < len);
|
||||||
ASSERT_HOST(len > 0);
|
ASSERT_HOST(len > 0);
|
||||||
|
|
||||||
@ -113,45 +110,34 @@ void REJMAP::remove_pos( // Cut out an element
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::print(FILE *fp) const {
|
void REJMAP::print(FILE *fp) const {
|
||||||
int i;
|
fputc('"', fp);
|
||||||
char buff[512];
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
fputc( ptr[i].display_char(), fp);
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
buff[i] = ptr[i].display_char();
|
|
||||||
}
|
}
|
||||||
buff[i] = '\0';
|
fputc('"', fp);
|
||||||
fprintf(fp, "\"%s\"", buff);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::full_print(FILE *fp) const {
|
void REJMAP::full_print(FILE *fp) const {
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
ptr[i].full_print(fp);
|
ptr[i].full_print(fp);
|
||||||
fprintf(fp, "\n");
|
fprintf(fp, "\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_small_xht() { // Reject whole word
|
void REJMAP::rej_word_small_xht() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
ptr[i].setrej_small_xht();
|
ptr[i].setrej_small_xht();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_tess_failure() { // Reject whole word
|
void REJMAP::rej_word_tess_failure() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
ptr[i].setrej_tess_failure();
|
ptr[i].setrej_tess_failure();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_not_tess_accepted() { // Reject whole word
|
void REJMAP::rej_word_not_tess_accepted() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_not_tess_accepted();
|
ptr[i].setrej_not_tess_accepted();
|
||||||
}
|
}
|
||||||
@ -159,9 +145,7 @@ void REJMAP::rej_word_not_tess_accepted() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_contains_blanks() { // Reject whole word
|
void REJMAP::rej_word_contains_blanks() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_contains_blanks();
|
ptr[i].setrej_contains_blanks();
|
||||||
}
|
}
|
||||||
@ -169,9 +153,7 @@ void REJMAP::rej_word_contains_blanks() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_bad_permuter() { // Reject whole word
|
void REJMAP::rej_word_bad_permuter() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_bad_permuter();
|
ptr[i].setrej_bad_permuter();
|
||||||
}
|
}
|
||||||
@ -179,9 +161,7 @@ void REJMAP::rej_word_bad_permuter() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_xht_fixup() { // Reject whole word
|
void REJMAP::rej_word_xht_fixup() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_xht_fixup();
|
ptr[i].setrej_xht_fixup();
|
||||||
}
|
}
|
||||||
@ -189,9 +169,7 @@ void REJMAP::rej_word_xht_fixup() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_no_alphanums() { // Reject whole word
|
void REJMAP::rej_word_no_alphanums() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_no_alphanums();
|
ptr[i].setrej_no_alphanums();
|
||||||
}
|
}
|
||||||
@ -199,9 +177,7 @@ void REJMAP::rej_word_no_alphanums() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_mostly_rej() { // Reject whole word
|
void REJMAP::rej_word_mostly_rej() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_mostly_rej();
|
ptr[i].setrej_mostly_rej();
|
||||||
}
|
}
|
||||||
@ -209,9 +185,7 @@ void REJMAP::rej_word_mostly_rej() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_bad_quality() { // Reject whole word
|
void REJMAP::rej_word_bad_quality() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_bad_quality();
|
ptr[i].setrej_bad_quality();
|
||||||
}
|
}
|
||||||
@ -219,9 +193,7 @@ void REJMAP::rej_word_bad_quality() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_doc_rej() { // Reject whole word
|
void REJMAP::rej_word_doc_rej() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_doc_rej();
|
ptr[i].setrej_doc_rej();
|
||||||
}
|
}
|
||||||
@ -229,9 +201,7 @@ void REJMAP::rej_word_doc_rej() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_block_rej() { // Reject whole word
|
void REJMAP::rej_word_block_rej() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_block_rej();
|
ptr[i].setrej_block_rej();
|
||||||
}
|
}
|
||||||
@ -239,9 +209,7 @@ void REJMAP::rej_word_block_rej() { // Reject whole word
|
|||||||
}
|
}
|
||||||
|
|
||||||
void REJMAP::rej_word_row_rej() { // Reject whole word
|
void REJMAP::rej_word_row_rej() { // Reject whole word
|
||||||
int i;
|
for (unsigned i = 0; i < len; i++) {
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
if (ptr[i].accepted()) {
|
if (ptr[i].accepted()) {
|
||||||
ptr[i].setrej_row_rej();
|
ptr[i].setrej_row_rej();
|
||||||
}
|
}
|
||||||
|
@ -309,10 +309,10 @@ public:
|
|||||||
|
|
||||||
class REJMAP {
|
class REJMAP {
|
||||||
std::unique_ptr<REJ[]> ptr; // ptr to the chars
|
std::unique_ptr<REJ[]> ptr; // ptr to the chars
|
||||||
int16_t len; // Number of chars
|
uint16_t len = 0; // Number of chars
|
||||||
|
|
||||||
public:
|
public:
|
||||||
REJMAP() : len(0) {}
|
REJMAP() = default;
|
||||||
|
|
||||||
REJMAP(const REJMAP &rejmap) {
|
REJMAP(const REJMAP &rejmap) {
|
||||||
*this = rejmap;
|
*this = rejmap;
|
||||||
@ -321,16 +321,16 @@ public:
|
|||||||
REJMAP &operator=(const REJMAP &source);
|
REJMAP &operator=(const REJMAP &source);
|
||||||
|
|
||||||
// Sets up the ptr array to length, whatever it was before.
|
// Sets up the ptr array to length, whatever it was before.
|
||||||
void initialise(int16_t length);
|
void initialise(uint16_t length);
|
||||||
|
|
||||||
REJ &operator[]( // access function
|
REJ &operator[]( // access function
|
||||||
int16_t index) const // map index
|
uint16_t index) const // map index
|
||||||
{
|
{
|
||||||
ASSERT_HOST(index < len);
|
ASSERT_HOST(index < len);
|
||||||
return ptr[index]; // no bounds checks
|
return ptr[index]; // no bounds checks
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t length() const { // map length
|
uint16_t length() const { // map length
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -340,8 +340,8 @@ public:
|
|||||||
return len - accept_count();
|
return len - accept_count();
|
||||||
}
|
}
|
||||||
|
|
||||||
void remove_pos( // Cut out an element
|
// Cut out an element.
|
||||||
int16_t pos); // element to remove
|
void remove_pos(uint16_t pos);
|
||||||
|
|
||||||
void print(FILE *fp) const;
|
void print(FILE *fp) const;
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ void SEAM::UndoSeam(TBLOB *blob, TBLOB *other_blob) const {
|
|||||||
// Prints everything in *this SEAM.
|
// Prints everything in *this SEAM.
|
||||||
void SEAM::Print(const char *label) const {
|
void SEAM::Print(const char *label) const {
|
||||||
tprintf("%s", label);
|
tprintf("%s", label);
|
||||||
tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ", priority_, location_.x, location_.y, widthp_, widthn_);
|
tprintf(" %6.2f @ (%d,%d), p=%u, n=%u ", priority_, location_.x, location_.y, widthp_, widthn_);
|
||||||
for (int s = 0; s < num_splits_; ++s) {
|
for (int s = 0; s < num_splits_; ++s) {
|
||||||
splits_[s].Print();
|
splits_[s].Print();
|
||||||
if (s + 1 < num_splits_) {
|
if (s + 1 < num_splits_) {
|
||||||
@ -263,7 +263,7 @@ void start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array) {
|
|||||||
seam_array->clear();
|
seam_array->clear();
|
||||||
TPOINT location;
|
TPOINT location;
|
||||||
|
|
||||||
for (int b = 1; b < word->NumBlobs(); ++b) {
|
for (unsigned b = 1; b < word->NumBlobs(); ++b) {
|
||||||
TBOX bbox = word->blobs[b - 1]->bounding_box();
|
TBOX bbox = word->blobs[b - 1]->bounding_box();
|
||||||
TBOX nbox = word->blobs[b]->bounding_box();
|
TBOX nbox = word->blobs[b]->bounding_box();
|
||||||
location.x = (bbox.right() + nbox.left()) / 2;
|
location.x = (bbox.right() + nbox.left()) / 2;
|
||||||
|
@ -34,10 +34,10 @@ class SEAM {
|
|||||||
public:
|
public:
|
||||||
// A seam with no splits
|
// A seam with no splits
|
||||||
SEAM(float priority, const TPOINT &location)
|
SEAM(float priority, const TPOINT &location)
|
||||||
: priority_(priority), location_(location), widthp_(0), widthn_(0), num_splits_(0) {}
|
: priority_(priority), location_(location), num_splits_(0) {}
|
||||||
// A seam with a single split point.
|
// A seam with a single split point.
|
||||||
SEAM(float priority, const TPOINT &location, const SPLIT &split)
|
SEAM(float priority, const TPOINT &location, const SPLIT &split)
|
||||||
: priority_(priority), location_(location), widthp_(0), widthn_(0), num_splits_(1) {
|
: priority_(priority), location_(location), num_splits_(1) {
|
||||||
splits_[0] = split;
|
splits_[0] = split;
|
||||||
}
|
}
|
||||||
// Default copy constructor, operator= and destructor are OK!
|
// Default copy constructor, operator= and destructor are OK!
|
||||||
@ -191,8 +191,8 @@ private:
|
|||||||
// A range such that all splits in *this SEAM are contained within blobs in
|
// A range such that all splits in *this SEAM are contained within blobs in
|
||||||
// the range [index - widthn_,index + widthp_] where index is the index of
|
// the range [index - widthn_,index + widthp_] where index is the index of
|
||||||
// this SEAM in the seams vector.
|
// this SEAM in the seams vector.
|
||||||
int8_t widthp_;
|
uint8_t widthp_ = 0;
|
||||||
int8_t widthn_;
|
uint8_t widthn_ = 0;
|
||||||
// Number of splits_ that are used.
|
// Number of splits_ that are used.
|
||||||
uint8_t num_splits_;
|
uint8_t num_splits_;
|
||||||
// Set of pairs of points that are the ends of each split in the SEAM.
|
// Set of pairs of points that are the ends of each split in the SEAM.
|
||||||
|
@ -521,12 +521,12 @@ int STATS::top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (total_count > least_count || modes.size() < max_modes) {
|
if (total_count > least_count || modes.size() < static_cast<size_t>(max_modes)) {
|
||||||
// We definitely want this mode, so if we have enough discard the least.
|
// We definitely want this mode, so if we have enough discard the least.
|
||||||
if (modes.size() == max_modes) {
|
if (modes.size() == static_cast<size_t>(max_modes)) {
|
||||||
modes.resize(max_modes - 1);
|
modes.resize(max_modes - 1);
|
||||||
}
|
}
|
||||||
int target_index = 0;
|
size_t target_index = 0;
|
||||||
// Linear search for the target insertion point.
|
// Linear search for the target insertion point.
|
||||||
while (target_index < modes.size() && modes[target_index].data() >= total_count) {
|
while (target_index < modes.size() && modes[target_index].data() >= total_count) {
|
||||||
++target_index;
|
++target_index;
|
||||||
|
@ -49,7 +49,7 @@ AmbigSpec::AmbigSpec() {
|
|||||||
|
|
||||||
// Initializes the ambigs by adding a nullptr pointer to each table.
|
// Initializes the ambigs by adding a nullptr pointer to each table.
|
||||||
void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption) {
|
void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption) {
|
||||||
for (int i = 0; i < unicharset.size(); ++i) {
|
for (unsigned i = 0; i < unicharset.size(); ++i) {
|
||||||
replace_ambigs_.push_back(nullptr);
|
replace_ambigs_.push_back(nullptr);
|
||||||
dang_ambigs_.push_back(nullptr);
|
dang_ambigs_.push_back(nullptr);
|
||||||
one_to_one_definite_ambigs_.push_back(nullptr);
|
one_to_one_definite_ambigs_.push_back(nullptr);
|
||||||
@ -72,7 +72,6 @@ void UnicharAmbigs::LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *uni
|
|||||||
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambig_file,
|
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambig_file,
|
||||||
int debug_level, bool use_ambigs_for_adaption,
|
int debug_level, bool use_ambigs_for_adaption,
|
||||||
UNICHARSET *unicharset) {
|
UNICHARSET *unicharset) {
|
||||||
int i, j;
|
|
||||||
UnicharIdVector *adaption_ambigs_entry;
|
UnicharIdVector *adaption_ambigs_entry;
|
||||||
if (debug_level) {
|
if (debug_level) {
|
||||||
tprintf("Reading ambiguities\n");
|
tprintf("Reading ambiguities\n");
|
||||||
@ -130,7 +129,7 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
|
|||||||
// Silently ignore invalid strings, as before, so it is safe to use a
|
// Silently ignore invalid strings, as before, so it is safe to use a
|
||||||
// universal ambigs file.
|
// universal ambigs file.
|
||||||
if (unicharset->encode_string(replacement_string, true, &encoding, nullptr, nullptr)) {
|
if (unicharset->encode_string(replacement_string, true, &encoding, nullptr, nullptr)) {
|
||||||
for (i = 0; i < test_ambig_part_size; ++i) {
|
for (int i = 0; i < test_ambig_part_size; ++i) {
|
||||||
if (ambigs_for_adaption_[test_unichar_ids[i]] == nullptr) {
|
if (ambigs_for_adaption_[test_unichar_ids[i]] == nullptr) {
|
||||||
ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
|
ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
|
||||||
}
|
}
|
||||||
@ -139,6 +138,7 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
|
|||||||
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
|
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
|
||||||
// Add the new unichar id to adaption_ambigs_entry (only if the
|
// Add the new unichar id to adaption_ambigs_entry (only if the
|
||||||
// vector does not already contain it) keeping it in sorted order.
|
// vector does not already contain it) keeping it in sorted order.
|
||||||
|
size_t j;
|
||||||
for (j = 0;
|
for (j = 0;
|
||||||
j < adaption_ambigs_entry->size() && (*adaption_ambigs_entry)[j] > id_to_insert;
|
j < adaption_ambigs_entry->size() && (*adaption_ambigs_entry)[j] > id_to_insert;
|
||||||
++j) {
|
++j) {
|
||||||
@ -160,12 +160,12 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
|
|||||||
|
|
||||||
// Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
|
// Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
|
||||||
if (use_ambigs_for_adaption) {
|
if (use_ambigs_for_adaption) {
|
||||||
for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
|
for (size_t i = 0; i < ambigs_for_adaption_.size(); ++i) {
|
||||||
adaption_ambigs_entry = ambigs_for_adaption_[i];
|
adaption_ambigs_entry = ambigs_for_adaption_[i];
|
||||||
if (adaption_ambigs_entry == nullptr) {
|
if (adaption_ambigs_entry == nullptr) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
|
for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) {
|
||||||
UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
|
UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
|
||||||
if (reverse_ambigs_for_adaption_[ambig_id] == nullptr) {
|
if (reverse_ambigs_for_adaption_[ambig_id] == nullptr) {
|
||||||
reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
|
reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
|
||||||
@ -179,7 +179,7 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
|
|||||||
if (debug_level > 1) {
|
if (debug_level > 1) {
|
||||||
for (int tbl = 0; tbl < 2; ++tbl) {
|
for (int tbl = 0; tbl < 2; ++tbl) {
|
||||||
const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
|
const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
|
||||||
for (i = 0; i < print_table.size(); ++i) {
|
for (size_t i = 0; i < print_table.size(); ++i) {
|
||||||
AmbigSpec_LIST *lst = print_table[i];
|
AmbigSpec_LIST *lst = print_table[i];
|
||||||
if (lst == nullptr) {
|
if (lst == nullptr) {
|
||||||
continue;
|
continue;
|
||||||
@ -202,12 +202,12 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambi
|
|||||||
for (int vec_id = 0; vec_id < 2; ++vec_id) {
|
for (int vec_id = 0; vec_id < 2; ++vec_id) {
|
||||||
const std::vector<UnicharIdVector *> &vec =
|
const std::vector<UnicharIdVector *> &vec =
|
||||||
(vec_id == 0) ? ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
|
(vec_id == 0) ? ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
|
||||||
for (i = 0; i < vec.size(); ++i) {
|
for (size_t i = 0; i < vec.size(); ++i) {
|
||||||
adaption_ambigs_entry = vec[i];
|
adaption_ambigs_entry = vec[i];
|
||||||
if (adaption_ambigs_entry != nullptr) {
|
if (adaption_ambigs_entry != nullptr) {
|
||||||
tprintf("%sAmbigs for adaption for %s:\n", (vec_id == 0) ? "" : "Reverse ",
|
tprintf("%sAmbigs for adaption for %s:\n", (vec_id == 0) ? "" : "Reverse ",
|
||||||
unicharset->debug_str(i).c_str());
|
unicharset->debug_str(i).c_str());
|
||||||
for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
|
for (size_t j = 0; j < adaption_ambigs_entry->size(); ++j) {
|
||||||
tprintf("%s ", unicharset->debug_str((*adaption_ambigs_entry)[j]).c_str());
|
tprintf("%s ", unicharset->debug_str((*adaption_ambigs_entry)[j]).c_str());
|
||||||
}
|
}
|
||||||
tprintf("\n");
|
tprintf("\n");
|
||||||
@ -246,7 +246,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(int line_num, int version, int debug_leve
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Copy encoded string to output.
|
// Copy encoded string to output.
|
||||||
for (int i = 0; i < unichars.size(); ++i) {
|
for (size_t i = 0; i < unichars.size(); ++i) {
|
||||||
test_unichar_ids[i] = unichars[i];
|
test_unichar_ids[i] = unichars[i];
|
||||||
}
|
}
|
||||||
test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;
|
test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;
|
||||||
|
@ -71,7 +71,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Return the size used.
|
// Return the size used.
|
||||||
int size() const {
|
unsigned size() const {
|
||||||
return size_used_;
|
return size_used_;
|
||||||
}
|
}
|
||||||
// Workaround to avoid g++ -Wsign-compare warnings.
|
// Workaround to avoid g++ -Wsign-compare warnings.
|
||||||
@ -308,7 +308,7 @@ inline bool SaveDataToFile(const GenericVector<char> &data, const char *filename
|
|||||||
if (fp == nullptr) {
|
if (fp == nullptr) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool result = static_cast<int>(fwrite(&data[0], 1, data.size(), fp)) == data.size();
|
bool result = fwrite(&data[0], 1, data.size(), fp) == data.size();
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -373,7 +373,7 @@ public:
|
|||||||
}
|
}
|
||||||
PointerVector<T> &operator+=(const PointerVector &other) {
|
PointerVector<T> &operator+=(const PointerVector &other) {
|
||||||
this->reserve(this->size_used_ + other.size_used_);
|
this->reserve(this->size_used_ + other.size_used_);
|
||||||
for (int i = 0; i < other.size(); ++i) {
|
for (unsigned i = 0; i < other.size(); ++i) {
|
||||||
this->push_back(new T(*other.data_[i]));
|
this->push_back(new T(*other.data_[i]));
|
||||||
}
|
}
|
||||||
return *this;
|
return *this;
|
||||||
@ -681,7 +681,7 @@ void GenericVector<T>::operator+=(const T &t) {
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
GenericVector<T> &GenericVector<T>::operator+=(const GenericVector &other) {
|
GenericVector<T> &GenericVector<T>::operator+=(const GenericVector &other) {
|
||||||
this->reserve(size_used_ + other.size_used_);
|
this->reserve(size_used_ + other.size_used_);
|
||||||
for (int i = 0; i < other.size(); ++i) {
|
for (unsigned i = 0; i < other.size(); ++i) {
|
||||||
this->operator+=(other.data_[i]);
|
this->operator+=(other.data_[i]);
|
||||||
}
|
}
|
||||||
return *this;
|
return *this;
|
||||||
@ -757,7 +757,7 @@ bool GenericVector<T>::read(TFile *f, std::function<bool(TFile *, T *)> cb) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (f->FReadEndian(data_, sizeof(T), size_used_) != size_used_) {
|
if (f->FReadEndian(data_, sizeof(T), size_used_) != static_cast<unsigned>(size_used_)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -123,7 +123,7 @@ void IndexMapBiDi::Setup() {
|
|||||||
}
|
}
|
||||||
compact_map_.clear();
|
compact_map_.clear();
|
||||||
compact_map_.resize(compact_size, -1);
|
compact_map_.resize(compact_size, -1);
|
||||||
for (int i = 0; i < sparse_map_.size(); ++i) {
|
for (size_t i = 0; i < sparse_map_.size(); ++i) {
|
||||||
if (sparse_map_[i] >= 0) {
|
if (sparse_map_[i] >= 0) {
|
||||||
compact_map_[sparse_map_[i]] = i;
|
compact_map_[sparse_map_[i]] = i;
|
||||||
}
|
}
|
||||||
@ -187,7 +187,7 @@ void IndexMapBiDi::CompleteMerges() {
|
|||||||
// Re-generate the compact_map leaving holes for unused indices.
|
// Re-generate the compact_map leaving holes for unused indices.
|
||||||
compact_map_.clear();
|
compact_map_.clear();
|
||||||
compact_map_.resize(compact_size, -1);
|
compact_map_.resize(compact_size, -1);
|
||||||
for (int i = 0; i < sparse_map_.size(); ++i) {
|
for (size_t i = 0; i < sparse_map_.size(); ++i) {
|
||||||
if (sparse_map_[i] >= 0) {
|
if (sparse_map_[i] >= 0) {
|
||||||
if (compact_map_[sparse_map_[i]] == -1) {
|
if (compact_map_[sparse_map_[i]] == -1) {
|
||||||
compact_map_[sparse_map_[i]] = i;
|
compact_map_[sparse_map_[i]] = i;
|
||||||
@ -198,7 +198,7 @@ void IndexMapBiDi::CompleteMerges() {
|
|||||||
// index went to in the compacted map.
|
// index went to in the compacted map.
|
||||||
std::vector<int32_t> tmp_compact_map(compact_size, -1);
|
std::vector<int32_t> tmp_compact_map(compact_size, -1);
|
||||||
compact_size = 0;
|
compact_size = 0;
|
||||||
for (int i = 0; i < compact_map_.size(); ++i) {
|
for (size_t i = 0; i < compact_map_.size(); ++i) {
|
||||||
if (compact_map_[i] >= 0) {
|
if (compact_map_[i] >= 0) {
|
||||||
tmp_compact_map[i] = compact_size;
|
tmp_compact_map[i] = compact_size;
|
||||||
compact_map_[compact_size++] = compact_map_[i];
|
compact_map_[compact_size++] = compact_map_[i];
|
||||||
@ -222,8 +222,8 @@ bool IndexMapBiDi::Serialize(FILE *fp) const {
|
|||||||
// then each additional sparse entry needs to be stored.
|
// then each additional sparse entry needs to be stored.
|
||||||
// Normally we store only the compact map to save space.
|
// Normally we store only the compact map to save space.
|
||||||
std::vector<int32_t> remaining_pairs;
|
std::vector<int32_t> remaining_pairs;
|
||||||
for (int i = 0; i < sparse_map_.size(); ++i) {
|
for (unsigned i = 0; i < sparse_map_.size(); ++i) {
|
||||||
if (sparse_map_[i] >= 0 && compact_map_[sparse_map_[i]] != i) {
|
if (sparse_map_[i] >= 0 && static_cast<unsigned>(compact_map_[sparse_map_[i]]) != i) {
|
||||||
remaining_pairs.push_back(i);
|
remaining_pairs.push_back(i);
|
||||||
remaining_pairs.push_back(sparse_map_[i]);
|
remaining_pairs.push_back(sparse_map_[i]);
|
||||||
}
|
}
|
||||||
@ -243,10 +243,10 @@ bool IndexMapBiDi::DeSerialize(bool swap, FILE *fp) {
|
|||||||
}
|
}
|
||||||
sparse_map_.clear();
|
sparse_map_.clear();
|
||||||
sparse_map_.resize(sparse_size_, -1);
|
sparse_map_.resize(sparse_size_, -1);
|
||||||
for (int i = 0; i < compact_map_.size(); ++i) {
|
for (unsigned i = 0; i < compact_map_.size(); ++i) {
|
||||||
sparse_map_[compact_map_[i]] = i;
|
sparse_map_[compact_map_[i]] = i;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < remaining_pairs.size(); ++i) {
|
for (size_t i = 0; i < remaining_pairs.size(); ++i) {
|
||||||
int sparse_index = remaining_pairs[i++];
|
int sparse_index = remaining_pairs[i++];
|
||||||
sparse_map_[sparse_index] = remaining_pairs[i];
|
sparse_map_[sparse_index] = remaining_pairs[i];
|
||||||
}
|
}
|
||||||
|
@ -55,13 +55,13 @@ bool SaveDataToFile(const std::vector<char> &data, const char *filename) {
|
|||||||
if (fp == nullptr) {
|
if (fp == nullptr) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool result = static_cast<int>(fwrite(&data[0], 1, data.size(), fp)) == data.size();
|
bool result = fwrite(&data[0], 1, data.size(), fp) == data.size();
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
TFile::TFile()
|
TFile::TFile() {
|
||||||
: data_(nullptr), offset_(0), data_is_owned_(false), is_writing_(false), swap_(false) {}
|
}
|
||||||
|
|
||||||
TFile::~TFile() {
|
TFile::~TFile() {
|
||||||
if (data_is_owned_) {
|
if (data_is_owned_) {
|
||||||
@ -152,7 +152,7 @@ bool TFile::Open(const char *filename, FileReader reader) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool TFile::Open(const char *data, int size) {
|
bool TFile::Open(const char *data, size_t size) {
|
||||||
offset_ = 0;
|
offset_ = 0;
|
||||||
if (!data_is_owned_) {
|
if (!data_is_owned_) {
|
||||||
data_ = new std::vector<char>;
|
data_ = new std::vector<char>;
|
||||||
@ -181,7 +181,7 @@ bool TFile::Open(FILE *fp, int64_t end_offset) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int size = end_offset - current_pos;
|
size_t size = end_offset - current_pos;
|
||||||
is_writing_ = false;
|
is_writing_ = false;
|
||||||
swap_ = false;
|
swap_ = false;
|
||||||
if (!data_is_owned_) {
|
if (!data_is_owned_) {
|
||||||
@ -189,7 +189,7 @@ bool TFile::Open(FILE *fp, int64_t end_offset) {
|
|||||||
data_is_owned_ = true;
|
data_is_owned_ = true;
|
||||||
}
|
}
|
||||||
data_->resize(size); // TODO: optimize no init
|
data_->resize(size); // TODO: optimize no init
|
||||||
return static_cast<int>(fread(&(*data_)[0], 1, size, fp)) == size;
|
return fread(&(*data_)[0], 1, size, fp) == size;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *TFile::FGets(char *buffer, int buffer_size) {
|
char *TFile::FGets(char *buffer, int buffer_size) {
|
||||||
@ -207,21 +207,20 @@ char *TFile::FGets(char *buffer, int buffer_size) {
|
|||||||
return size > 0 ? buffer : nullptr;
|
return size > 0 ? buffer : nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
int TFile::FReadEndian(void *buffer, size_t size, int count) {
|
size_t TFile::FReadEndian(void *buffer, size_t size, size_t count) {
|
||||||
int num_read = FRead(buffer, size, count);
|
auto num_read = FRead(buffer, size, count);
|
||||||
if (swap_ && size != 1) {
|
if (swap_ && size != 1) {
|
||||||
char *char_buffer = static_cast<char *>(buffer);
|
char *char_buffer = static_cast<char *>(buffer);
|
||||||
for (int i = 0; i < num_read; ++i, char_buffer += size) {
|
for (size_t i = 0; i < num_read; ++i, char_buffer += size) {
|
||||||
ReverseN(char_buffer, size);
|
ReverseN(char_buffer, size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return num_read;
|
return num_read;
|
||||||
}
|
}
|
||||||
|
|
||||||
int TFile::FRead(void *buffer, size_t size, int count) {
|
size_t TFile::FRead(void *buffer, size_t size, size_t count) {
|
||||||
ASSERT_HOST(!is_writing_);
|
ASSERT_HOST(!is_writing_);
|
||||||
ASSERT_HOST(size > 0);
|
ASSERT_HOST(size > 0);
|
||||||
ASSERT_HOST(count >= 0);
|
|
||||||
size_t required_size;
|
size_t required_size;
|
||||||
if (SIZE_MAX / size <= count) {
|
if (SIZE_MAX / size <= count) {
|
||||||
// Avoid integer overflow.
|
// Avoid integer overflow.
|
||||||
@ -270,10 +269,9 @@ bool TFile::CloseWrite(const char *filename, FileWriter writer) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int TFile::FWrite(const void *buffer, size_t size, int count) {
|
size_t TFile::FWrite(const void *buffer, size_t size, size_t count) {
|
||||||
ASSERT_HOST(is_writing_);
|
ASSERT_HOST(is_writing_);
|
||||||
ASSERT_HOST(size > 0);
|
ASSERT_HOST(size > 0);
|
||||||
ASSERT_HOST(count >= 0);
|
|
||||||
ASSERT_HOST(SIZE_MAX / size > count);
|
ASSERT_HOST(SIZE_MAX / size > count);
|
||||||
size_t total = size * count;
|
size_t total = size * count;
|
||||||
const char *buf = static_cast<const char *>(buffer);
|
const char *buf = static_cast<const char *>(buffer);
|
||||||
|
@ -76,7 +76,7 @@ public:
|
|||||||
// Note that mixed read/write is not supported.
|
// Note that mixed read/write is not supported.
|
||||||
bool Open(const char *filename, FileReader reader);
|
bool Open(const char *filename, FileReader reader);
|
||||||
// From an existing memory buffer.
|
// From an existing memory buffer.
|
||||||
bool Open(const char *data, int size);
|
bool Open(const char *data, size_t size);
|
||||||
// From an open file and an end offset.
|
// From an open file and an end offset.
|
||||||
bool Open(FILE *fp, int64_t end_offset);
|
bool Open(FILE *fp, int64_t end_offset);
|
||||||
// Sets the value of the swap flag, so that FReadEndian does the right thing.
|
// Sets the value of the swap flag, so that FReadEndian does the right thing.
|
||||||
@ -92,7 +92,7 @@ public:
|
|||||||
//bool DeSerialize(std::vector<std::string> &data);
|
//bool DeSerialize(std::vector<std::string> &data);
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool DeSerialize(T *data, size_t count = 1) {
|
bool DeSerialize(T *data, size_t count = 1) {
|
||||||
return FReadEndian(data, sizeof(T), count) == static_cast<int>(count);
|
return FReadEndian(data, sizeof(T), count) == count;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool DeSerialize(std::vector<T> &data) {
|
bool DeSerialize(std::vector<T> &data) {
|
||||||
@ -155,7 +155,7 @@ public:
|
|||||||
bool Serialize(const std::vector<char> &data);
|
bool Serialize(const std::vector<char> &data);
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool Serialize(const T *data, size_t count = 1) {
|
bool Serialize(const T *data, size_t count = 1) {
|
||||||
return FWrite(data, sizeof(T), count) == static_cast<int>(count);
|
return FWrite(data, sizeof(T), count) == count;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool Serialize(const std::vector<T> &data) {
|
bool Serialize(const std::vector<T> &data) {
|
||||||
@ -207,9 +207,9 @@ public:
|
|||||||
// Replicates fread, followed by a swap of the bytes if needed, returning the
|
// Replicates fread, followed by a swap of the bytes if needed, returning the
|
||||||
// number of items read. If swap_ is true then the count items will each have
|
// number of items read. If swap_ is true then the count items will each have
|
||||||
// size bytes reversed.
|
// size bytes reversed.
|
||||||
int FReadEndian(void *buffer, size_t size, int count);
|
size_t FReadEndian(void *buffer, size_t size, size_t count);
|
||||||
// Replicates fread, returning the number of items read.
|
// Replicates fread, returning the number of items read.
|
||||||
int FRead(void *buffer, size_t size, int count);
|
size_t FRead(void *buffer, size_t size, size_t count);
|
||||||
// Resets the TFile as if it has been Opened, but nothing read.
|
// Resets the TFile as if it has been Opened, but nothing read.
|
||||||
// Only allowed while reading!
|
// Only allowed while reading!
|
||||||
void Rewind();
|
void Rewind();
|
||||||
@ -222,19 +222,19 @@ public:
|
|||||||
|
|
||||||
// Replicates fwrite, returning the number of items written.
|
// Replicates fwrite, returning the number of items written.
|
||||||
// To use fprintf, use snprintf and FWrite.
|
// To use fprintf, use snprintf and FWrite.
|
||||||
int FWrite(const void *buffer, size_t size, int count);
|
size_t FWrite(const void *buffer, size_t size, size_t count);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// The buffered data from the file.
|
// The buffered data from the file.
|
||||||
std::vector<char> *data_;
|
std::vector<char> *data_ = nullptr;
|
||||||
// The number of bytes used so far.
|
// The number of bytes used so far.
|
||||||
int offset_;
|
unsigned offset_ = 0;
|
||||||
// True if the data_ pointer is owned by *this.
|
// True if the data_ pointer is owned by *this.
|
||||||
bool data_is_owned_;
|
bool data_is_owned_ = false;
|
||||||
// True if the TFile is open for writing.
|
// True if the TFile is open for writing.
|
||||||
bool is_writing_;
|
bool is_writing_ = false;
|
||||||
// True if bytes need to be swapped in FReadEndian.
|
// True if bytes need to be swapped in FReadEndian.
|
||||||
bool swap_;
|
bool swap_ = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace tesseract.
|
} // namespace tesseract.
|
||||||
|
@ -61,7 +61,7 @@ static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
|
std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
|
||||||
for (int i = 1; i < entries.size(); ++i) {
|
for (size_t i = 1; i < entries.size(); ++i) {
|
||||||
int radical = strtol(&entries[i][0], &end, 10);
|
int radical = strtol(&entries[i][0], &end, 10);
|
||||||
if (*end != '\0') {
|
if (*end != '\0') {
|
||||||
return false;
|
return false;
|
||||||
@ -78,7 +78,7 @@ static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map
|
|||||||
// is unlikely to want to use it again.
|
// is unlikely to want to use it again.
|
||||||
static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {
|
static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {
|
||||||
std::vector<std::string> lines = split(radical_data, '\n');
|
std::vector<std::string> lines = split(radical_data, '\n');
|
||||||
for (int i = 0; i < lines.size(); ++i) {
|
for (unsigned i = 0; i < lines.size(); ++i) {
|
||||||
if (!DecodeRadicalLine(lines[i], radical_map)) {
|
if (!DecodeRadicalLine(lines[i], radical_map)) {
|
||||||
tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str());
|
tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str());
|
||||||
return false;
|
return false;
|
||||||
@ -132,10 +132,10 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
|
|||||||
// to measure the number of radicals and strokes, initially we use the same
|
// to measure the number of radicals and strokes, initially we use the same
|
||||||
// code range for all 3 Han code positions, and fix them after.
|
// code range for all 3 Han code positions, and fix them after.
|
||||||
int han_offset = hangul_offset + kTotalJamos;
|
int han_offset = hangul_offset + kTotalJamos;
|
||||||
for (int u = 0; u <= unicharset.size(); ++u) {
|
for (unsigned u = 0; u <= unicharset.size(); ++u) {
|
||||||
// We special-case allow null_id to be equal to unicharset.size() in case
|
// We special-case allow null_id to be equal to unicharset.size() in case
|
||||||
// there is no space in unicharset for it.
|
// there is no space in unicharset for it.
|
||||||
if (u == unicharset.size() && u != null_id) {
|
if (u == unicharset.size() && static_cast<int>(u) != null_id) {
|
||||||
break; // Finished
|
break; // Finished
|
||||||
}
|
}
|
||||||
RecodedCharID code;
|
RecodedCharID code;
|
||||||
@ -173,7 +173,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
|
|||||||
// Special cases.
|
// Special cases.
|
||||||
if (u == UNICHAR_SPACE) {
|
if (u == UNICHAR_SPACE) {
|
||||||
code.Set(0, 0); // Space.
|
code.Set(0, 0); // Space.
|
||||||
} else if (u == null_id ||
|
} else if (static_cast<int>(u) == null_id ||
|
||||||
(unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
|
(unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
|
||||||
code.Set(0, direct_set.unichar_to_id(kNullChar));
|
code.Set(0, direct_set.unichar_to_id(kNullChar));
|
||||||
} else {
|
} else {
|
||||||
@ -207,7 +207,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
|
|||||||
int code_offset = 0;
|
int code_offset = 0;
|
||||||
for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
|
for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
|
||||||
int max_offset = 0;
|
int max_offset = 0;
|
||||||
for (int u = 0; u < unicharset.size(); ++u) {
|
for (unsigned u = 0; u < unicharset.size(); ++u) {
|
||||||
RecodedCharID *code = &encoder_[u];
|
RecodedCharID *code = &encoder_[u];
|
||||||
if (code->length() <= i) {
|
if (code->length() <= i) {
|
||||||
continue;
|
continue;
|
||||||
@ -229,7 +229,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
|
|||||||
// passes them through unchanged.
|
// passes them through unchanged.
|
||||||
void UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) {
|
void UnicharCompress::SetupPassThrough(const UNICHARSET &unicharset) {
|
||||||
std::vector<RecodedCharID> codes;
|
std::vector<RecodedCharID> codes;
|
||||||
for (int u = 0; u < unicharset.size(); ++u) {
|
for (unsigned u = 0; u < unicharset.size(); ++u) {
|
||||||
RecodedCharID code;
|
RecodedCharID code;
|
||||||
code.Set(0, u);
|
code.Set(0, u);
|
||||||
codes.push_back(code);
|
codes.push_back(code);
|
||||||
@ -265,10 +265,10 @@ void UnicharCompress::DefragmentCodeValues(int encoded_null) {
|
|||||||
}
|
}
|
||||||
// Compute offsets based on code use.
|
// Compute offsets based on code use.
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
for (int i = 0; i < offsets.size(); ++i) {
|
for (unsigned i = 0; i < offsets.size(); ++i) {
|
||||||
// If not used, decrement everything above here.
|
// If not used, decrement everything above here.
|
||||||
// We are moving encoded_null to the end, so it is not "used".
|
// We are moving encoded_null to the end, so it is not "used".
|
||||||
if (offsets[i] == 0 || i == encoded_null) {
|
if (offsets[i] == 0 || i == static_cast<unsigned>(encoded_null)) {
|
||||||
--offset;
|
--offset;
|
||||||
} else {
|
} else {
|
||||||
offsets[i] = offset;
|
offsets[i] = offset;
|
||||||
@ -292,8 +292,8 @@ void UnicharCompress::DefragmentCodeValues(int encoded_null) {
|
|||||||
|
|
||||||
// Encodes a single unichar_id. Returns the length of the code, or zero if
|
// Encodes a single unichar_id. Returns the length of the code, or zero if
|
||||||
// invalid input, and the encoding itself
|
// invalid input, and the encoding itself
|
||||||
int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID *code) const {
|
int UnicharCompress::EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const {
|
||||||
if (unichar_id < 0 || unichar_id >= encoder_.size()) {
|
if (unichar_id >= encoder_.size()) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
*code = encoder_[unichar_id];
|
*code = encoder_[unichar_id];
|
||||||
@ -338,7 +338,7 @@ bool UnicharCompress::DeSerialize(TFile *fp) {
|
|||||||
// See the class comment above for details.
|
// See the class comment above for details.
|
||||||
std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const {
|
std::string UnicharCompress::GetEncodingAsString(const UNICHARSET &unicharset) const {
|
||||||
std::string encoding;
|
std::string encoding;
|
||||||
for (int c = 0; c < encoder_.size(); ++c) {
|
for (unsigned c = 0; c < encoder_.size(); ++c) {
|
||||||
const RecodedCharID &code = encoder_[c];
|
const RecodedCharID &code = encoder_[c];
|
||||||
if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
|
if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
|
||||||
// Don't show the duplicate entry.
|
// Don't show the duplicate entry.
|
||||||
@ -397,7 +397,7 @@ void UnicharCompress::SetupDecoder() {
|
|||||||
Cleanup();
|
Cleanup();
|
||||||
is_valid_start_.clear();
|
is_valid_start_.clear();
|
||||||
is_valid_start_.resize(code_range_);
|
is_valid_start_.resize(code_range_);
|
||||||
for (int c = 0; c < encoder_.size(); ++c) {
|
for (unsigned c = 0; c < encoder_.size(); ++c) {
|
||||||
const RecodedCharID &code = encoder_[c];
|
const RecodedCharID &code = encoder_[c];
|
||||||
decoder_[code] = c;
|
decoder_[code] = c;
|
||||||
is_valid_start_[code(0)] = true;
|
is_valid_start_[code(0)] = true;
|
||||||
|
@ -174,7 +174,7 @@ public:
|
|||||||
|
|
||||||
// Encodes a single unichar_id. Returns the length of the code, (or zero if
|
// Encodes a single unichar_id. Returns the length of the code, (or zero if
|
||||||
// invalid input), and the encoding itself in code.
|
// invalid input), and the encoding itself in code.
|
||||||
int EncodeUnichar(int unichar_id, RecodedCharID *code) const;
|
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const;
|
||||||
// Decodes code, returning the original unichar-id, or
|
// Decodes code, returning the original unichar-id, or
|
||||||
// INVALID_UNICHAR_ID if the input is invalid.
|
// INVALID_UNICHAR_ID if the input is invalid.
|
||||||
int DecodeUnichar(const RecodedCharID &code) const;
|
int DecodeUnichar(const RecodedCharID &code) const;
|
||||||
|
@ -272,7 +272,7 @@ const char *UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
|
|||||||
if (id == INVALID_UNICHAR_ID) {
|
if (id == INVALID_UNICHAR_ID) {
|
||||||
return INVALID_UNICHAR;
|
return INVALID_UNICHAR;
|
||||||
}
|
}
|
||||||
ASSERT_HOST(id < this->size());
|
ASSERT_HOST(static_cast<unsigned>(id) < this->size());
|
||||||
return unichars[id].representation;
|
return unichars[id].representation;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -280,7 +280,7 @@ const char *UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
|
|||||||
if (id == INVALID_UNICHAR_ID) {
|
if (id == INVALID_UNICHAR_ID) {
|
||||||
return INVALID_UNICHAR;
|
return INVALID_UNICHAR;
|
||||||
}
|
}
|
||||||
ASSERT_HOST(id < this->size());
|
ASSERT_HOST(static_cast<unsigned>(id) < this->size());
|
||||||
// Resolve from the kCustomLigatures table if this is a private encoding.
|
// Resolve from the kCustomLigatures table if this is a private encoding.
|
||||||
if (get_isprivate(id)) {
|
if (get_isprivate(id)) {
|
||||||
const char *ch = id_to_unichar(id);
|
const char *ch = id_to_unichar(id);
|
||||||
@ -384,7 +384,7 @@ void UNICHARSET::set_ranges_empty() {
|
|||||||
// everything set. The unicharsets don't have to be the same, and graphemes
|
// everything set. The unicharsets don't have to be the same, and graphemes
|
||||||
// are correctly accounted for.
|
// are correctly accounted for.
|
||||||
void UNICHARSET::PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src) {
|
void UNICHARSET::PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src) {
|
||||||
for (int ch = start_index; ch < unichars.size(); ++ch) {
|
for (unsigned ch = start_index; ch < unichars.size(); ++ch) {
|
||||||
const char *utf8 = id_to_unichar(ch);
|
const char *utf8 = id_to_unichar(ch);
|
||||||
UNICHAR_PROPERTIES properties;
|
UNICHAR_PROPERTIES properties;
|
||||||
if (src.GetStrProperties(utf8, &properties)) {
|
if (src.GetStrProperties(utf8, &properties)) {
|
||||||
@ -481,7 +481,7 @@ void UNICHARSET::encode_string(const char *str, int str_index, int str_length,
|
|||||||
std::vector<UNICHAR_ID> *encoding, std::vector<char> *lengths,
|
std::vector<UNICHAR_ID> *encoding, std::vector<char> *lengths,
|
||||||
unsigned *best_total_length, std::vector<UNICHAR_ID> *best_encoding,
|
unsigned *best_total_length, std::vector<UNICHAR_ID> *best_encoding,
|
||||||
std::vector<char> *best_lengths) const {
|
std::vector<char> *best_lengths) const {
|
||||||
if (str_index > *best_total_length) {
|
if (str_index > static_cast<int>(*best_total_length)) {
|
||||||
// This is the best result so far.
|
// This is the best result so far.
|
||||||
*best_total_length = str_index;
|
*best_total_length = str_index;
|
||||||
*best_encoding = *encoding;
|
*best_encoding = *encoding;
|
||||||
@ -506,7 +506,7 @@ void UNICHARSET::encode_string(const char *str, int str_index, int str_length,
|
|||||||
lengths->push_back(length);
|
lengths->push_back(length);
|
||||||
encode_string(str, str_index + length, str_length, encoding, lengths, best_total_length,
|
encode_string(str, str_index + length, str_length, encoding, lengths, best_total_length,
|
||||||
best_encoding, best_lengths);
|
best_encoding, best_lengths);
|
||||||
if (*best_total_length == str_length) {
|
if (static_cast<int>(*best_total_length) == str_length) {
|
||||||
return; // Tail recursion success!
|
return; // Tail recursion success!
|
||||||
}
|
}
|
||||||
// Failed with that length, truncate back and try again.
|
// Failed with that length, truncate back and try again.
|
||||||
@ -695,9 +695,9 @@ bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
|
|||||||
bool UNICHARSET::save_to_string(std::string &str) const {
|
bool UNICHARSET::save_to_string(std::string &str) const {
|
||||||
const int kFileBufSize = 1024;
|
const int kFileBufSize = 1024;
|
||||||
char buffer[kFileBufSize + 1];
|
char buffer[kFileBufSize + 1];
|
||||||
snprintf(buffer, kFileBufSize, "%d\n", this->size());
|
snprintf(buffer, kFileBufSize, "%zu\n", this->size());
|
||||||
str = buffer;
|
str = buffer;
|
||||||
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
|
for (unsigned id = 0; id < this->size(); ++id) {
|
||||||
int min_bottom, max_bottom, min_top, max_top;
|
int min_bottom, max_bottom, min_top, max_top;
|
||||||
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
|
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
|
||||||
float width, width_sd;
|
float width, width_sd;
|
||||||
@ -883,7 +883,7 @@ void UNICHARSET::post_load_setup() {
|
|||||||
int x_height_alphas = 0;
|
int x_height_alphas = 0;
|
||||||
int cap_height_alphas = 0;
|
int cap_height_alphas = 0;
|
||||||
top_bottom_set_ = false;
|
top_bottom_set_ = false;
|
||||||
for (UNICHAR_ID id = 0; id < unichars.size(); ++id) {
|
for (unsigned id = 0; id < unichars.size(); ++id) {
|
||||||
int min_bottom = 0;
|
int min_bottom = 0;
|
||||||
int max_bottom = UINT8_MAX;
|
int max_bottom = UINT8_MAX;
|
||||||
int min_top = 0;
|
int min_top = 0;
|
||||||
@ -1012,7 +1012,7 @@ bool UNICHARSET::AnyRepeatedUnicodes() const {
|
|||||||
if (has_special_codes()) {
|
if (has_special_codes()) {
|
||||||
start_id = SPECIAL_UNICHAR_CODES_COUNT;
|
start_id = SPECIAL_UNICHAR_CODES_COUNT;
|
||||||
}
|
}
|
||||||
for (int id = start_id; id < unichars.size(); ++id) {
|
for (unsigned id = start_id; id < unichars.size(); ++id) {
|
||||||
// Convert to unicodes.
|
// Convert to unicodes.
|
||||||
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
|
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
|
||||||
for (size_t u = 1; u < unicodes.size(); ++u) {
|
for (size_t u = 1; u < unicodes.size(); ++u) {
|
||||||
|
@ -283,7 +283,7 @@ public:
|
|||||||
if (cleaned != unichar_repr) {
|
if (cleaned != unichar_repr) {
|
||||||
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
||||||
} else {
|
} else {
|
||||||
int old_size = size();
|
auto old_size = size();
|
||||||
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
|
unichar_insert(unichar_repr, OldUncleanUnichars::kFalse);
|
||||||
if (size() == old_size) {
|
if (size() == old_size) {
|
||||||
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
unichar_insert(unichar_repr, OldUncleanUnichars::kTrue);
|
||||||
@ -345,7 +345,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Return the size of the set (the number of different UNICHAR it holds).
|
// Return the size of the set (the number of different UNICHAR it holds).
|
||||||
int size() const {
|
size_t size() const {
|
||||||
return unichars.size();
|
return unichars.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -99,7 +99,7 @@ ADAPT_TEMPLATES_STRUCT::ADAPT_TEMPLATES_STRUCT(UNICHARSET &unicharset) {
|
|||||||
NumNonEmptyClasses = 0;
|
NumNonEmptyClasses = 0;
|
||||||
|
|
||||||
/* Insert an empty class for each unichar id in unicharset */
|
/* Insert an empty class for each unichar id in unicharset */
|
||||||
for (int i = 0; i < MAX_NUM_CLASSES; i++) {
|
for (unsigned i = 0; i < MAX_NUM_CLASSES; i++) {
|
||||||
Class[i] = nullptr;
|
Class[i] = nullptr;
|
||||||
if (i < unicharset.size()) {
|
if (i < unicharset.size()) {
|
||||||
AddAdaptedClass(this, new ADAPT_CLASS_STRUCT, i);
|
AddAdaptedClass(this, new ADAPT_CLASS_STRUCT, i);
|
||||||
@ -108,7 +108,7 @@ ADAPT_TEMPLATES_STRUCT::ADAPT_TEMPLATES_STRUCT(UNICHARSET &unicharset) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ADAPT_TEMPLATES_STRUCT::~ADAPT_TEMPLATES_STRUCT() {
|
ADAPT_TEMPLATES_STRUCT::~ADAPT_TEMPLATES_STRUCT() {
|
||||||
for (int i = 0; i < (Templates)->NumClasses; i++) {
|
for (unsigned i = 0; i < (Templates)->NumClasses; i++) {
|
||||||
delete Class[i];
|
delete Class[i];
|
||||||
}
|
}
|
||||||
delete Templates;
|
delete Templates;
|
||||||
@ -160,11 +160,11 @@ void Classify::PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templat
|
|||||||
fprintf(File, " Id NC NPC NP NPP\n");
|
fprintf(File, " Id NC NPC NP NPP\n");
|
||||||
fprintf(File, "------------------------\n");
|
fprintf(File, "------------------------\n");
|
||||||
|
|
||||||
for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
|
for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
|
||||||
IClass = Templates->Templates->Class[i];
|
IClass = Templates->Templates->Class[i];
|
||||||
AClass = Templates->Class[i];
|
AClass = Templates->Class[i];
|
||||||
if (!IsEmptyAdaptedClass(AClass)) {
|
if (!IsEmptyAdaptedClass(AClass)) {
|
||||||
fprintf(File, "%5d %s %3d %3d %3d %3zd\n", i, unicharset.id_to_unichar(i), IClass->NumConfigs,
|
fprintf(File, "%5u %s %3d %3d %3d %3zd\n", i, unicharset.id_to_unichar(i), IClass->NumConfigs,
|
||||||
AClass->NumPermConfigs, IClass->NumProtos,
|
AClass->NumPermConfigs, IClass->NumProtos,
|
||||||
IClass->NumProtos - AClass->TempProtos->size());
|
IClass->NumProtos - AClass->TempProtos->size());
|
||||||
}
|
}
|
||||||
@ -242,7 +242,7 @@ ADAPT_TEMPLATES_STRUCT *Classify::ReadAdaptedTemplates(TFile *fp) {
|
|||||||
Templates->Templates = ReadIntTemplates(fp);
|
Templates->Templates = ReadIntTemplates(fp);
|
||||||
|
|
||||||
/* then read in the adaptive info for each class */
|
/* then read in the adaptive info for each class */
|
||||||
for (int i = 0; i < (Templates->Templates)->NumClasses; i++) {
|
for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
|
||||||
Templates->Class[i] = ReadAdaptedClass(fp);
|
Templates->Class[i] = ReadAdaptedClass(fp);
|
||||||
}
|
}
|
||||||
return (Templates);
|
return (Templates);
|
||||||
@ -343,8 +343,6 @@ void WriteAdaptedClass(FILE *File, ADAPT_CLASS_STRUCT *Class, int NumConfigs) {
|
|||||||
* @note Globals: none
|
* @note Globals: none
|
||||||
*/
|
*/
|
||||||
void Classify::WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) {
|
void Classify::WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates) {
|
||||||
int i;
|
|
||||||
|
|
||||||
/* first write the high level adaptive template struct */
|
/* first write the high level adaptive template struct */
|
||||||
fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);
|
fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);
|
||||||
|
|
||||||
@ -352,7 +350,7 @@ void Classify::WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templat
|
|||||||
WriteIntTemplates(File, Templates->Templates, unicharset);
|
WriteIntTemplates(File, Templates->Templates, unicharset);
|
||||||
|
|
||||||
/* then write out the adaptive info for each class */
|
/* then write out the adaptive info for each class */
|
||||||
for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
|
for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
|
||||||
WriteAdaptedClass(File, Templates->Class[i], Templates->Templates->Class[i]->NumConfigs);
|
WriteAdaptedClass(File, Templates->Class[i], Templates->Templates->Class[i]->NumConfigs);
|
||||||
}
|
}
|
||||||
} /* WriteAdaptedTemplates */
|
} /* WriteAdaptedTemplates */
|
||||||
|
@ -143,7 +143,7 @@ inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
|
|||||||
-----------------------------------------------------------------------------*/
|
-----------------------------------------------------------------------------*/
|
||||||
// Returns the index of the given id in results, if present, or the size of the
|
// Returns the index of the given id in results, if present, or the size of the
|
||||||
// vector (index it will go at) if not present.
|
// vector (index it will go at) if not present.
|
||||||
static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
|
static unsigned FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
|
||||||
for (unsigned i = 0; i < results.match.size(); i++) {
|
for (unsigned i = 0; i < results.match.size(); i++) {
|
||||||
if (results.match[i].unichar_id == id) {
|
if (results.match[i].unichar_id == id) {
|
||||||
return i;
|
return i;
|
||||||
@ -155,7 +155,7 @@ static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
|
|||||||
// Returns the current rating for a unichar id if we have rated it, defaulting
|
// Returns the current rating for a unichar id if we have rated it, defaulting
|
||||||
// to WORST_POSSIBLE_RATING.
|
// to WORST_POSSIBLE_RATING.
|
||||||
static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
|
static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
|
||||||
int index = FindScoredUnichar(id, results);
|
unsigned index = FindScoredUnichar(id, results);
|
||||||
if (index >= results.match.size()) {
|
if (index >= results.match.size()) {
|
||||||
return WORST_POSSIBLE_RATING;
|
return WORST_POSSIBLE_RATING;
|
||||||
}
|
}
|
||||||
@ -323,7 +323,7 @@ void Classify::LearnWord(const char *fontname, WERD_RES *word) {
|
|||||||
pieces_all_natural);
|
pieces_all_natural);
|
||||||
|
|
||||||
std::string full_string;
|
std::string full_string;
|
||||||
for (int i = 0; i < tokens.size(); i++) {
|
for (unsigned i = 0; i < tokens.size(); i++) {
|
||||||
full_string += tokens[i];
|
full_string += tokens[i];
|
||||||
if (i != tokens.size() - 1) {
|
if (i != tokens.size() - 1) {
|
||||||
full_string += ' ';
|
full_string += ' ';
|
||||||
@ -578,7 +578,7 @@ void Classify::InitAdaptiveClassifier(TessdataManager *mgr) {
|
|||||||
tprintf("\n");
|
tprintf("\n");
|
||||||
PrintAdaptedTemplates(stdout, AdaptedTemplates);
|
PrintAdaptedTemplates(stdout, AdaptedTemplates);
|
||||||
|
|
||||||
for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
|
for (unsigned i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
|
||||||
BaselineCutoffs[i] = CharNormCutoffs[i];
|
BaselineCutoffs[i] = CharNormCutoffs[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -807,7 +807,7 @@ bool Classify::AdaptableWord(WERD_RES *word) {
|
|||||||
if (word->best_choice == nullptr) {
|
if (word->best_choice == nullptr) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int BestChoiceLength = word->best_choice->length();
|
auto BestChoiceLength = word->best_choice->length();
|
||||||
float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
|
float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
|
||||||
return // rules that apply in general - simplest to compute first
|
return // rules that apply in general - simplest to compute first
|
||||||
BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() &&
|
BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() &&
|
||||||
@ -979,7 +979,7 @@ void Classify::DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class) {
|
|||||||
* @param[out] results results to add new result to
|
* @param[out] results results to add new result to
|
||||||
*/
|
*/
|
||||||
void Classify::AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results) {
|
void Classify::AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results) {
|
||||||
int old_match = FindScoredUnichar(new_result.unichar_id, *results);
|
auto old_match = FindScoredUnichar(new_result.unichar_id, *results);
|
||||||
|
|
||||||
if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
|
if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
|
||||||
(old_match < results->match.size() &&
|
(old_match < results->match.size() &&
|
||||||
@ -1120,7 +1120,7 @@ void Classify::ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, boo
|
|||||||
// by int_result. In this case, build a vector of UnicharRating to
|
// by int_result. In this case, build a vector of UnicharRating to
|
||||||
// gather together different font-ids for each unichar. Also covers case1.
|
// gather together different font-ids for each unichar. Also covers case1.
|
||||||
std::vector<UnicharRating> mapped_results;
|
std::vector<UnicharRating> mapped_results;
|
||||||
for (int f = 0; f < int_result->fonts.size(); ++f) {
|
for (unsigned f = 0; f < int_result->fonts.size(); ++f) {
|
||||||
int shape_id = int_result->fonts[f].fontinfo_id;
|
int shape_id = int_result->fonts[f].fontinfo_id;
|
||||||
const Shape &shape = shape_table_->GetShape(shape_id);
|
const Shape &shape = shape_table_->GetShape(shape_id);
|
||||||
for (int c = 0; c < shape.size(); ++c) {
|
for (int c = 0; c < shape.size(); ++c) {
|
||||||
@ -1283,7 +1283,7 @@ int Classify::CharNormClassifier(TBLOB *blob, const TrainingSample &sample,
|
|||||||
int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
|
int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
|
||||||
std::vector<UnicharRating> *results) {
|
std::vector<UnicharRating> *results) {
|
||||||
results->clear();
|
results->clear();
|
||||||
auto *adapt_results = new ADAPT_RESULTS();
|
std::unique_ptr<ADAPT_RESULTS> adapt_results(new ADAPT_RESULTS());
|
||||||
adapt_results->Initialize();
|
adapt_results->Initialize();
|
||||||
// Compute the bounding box of the features.
|
// Compute the bounding box of the features.
|
||||||
uint32_t num_features = sample.num_features();
|
uint32_t num_features = sample.num_features();
|
||||||
@ -1293,16 +1293,15 @@ int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const Trai
|
|||||||
sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
|
sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
|
||||||
// Compute the char_norm_array from the saved cn_feature.
|
// Compute the char_norm_array from the saved cn_feature.
|
||||||
FEATURE norm_feature = sample.GetCNFeature();
|
FEATURE norm_feature = sample.GetCNFeature();
|
||||||
auto *char_norm_array = new uint8_t[unicharset.size()];
|
std::vector<uint8_t> char_norm_array(unicharset.size());
|
||||||
int num_pruner_classes = std::max(unicharset.size(), PreTrainedTemplates->NumClasses);
|
auto num_pruner_classes = std::max(static_cast<unsigned>(unicharset.size()), PreTrainedTemplates->NumClasses);
|
||||||
auto *pruner_norm_array = new uint8_t[num_pruner_classes];
|
std::vector<uint8_t> pruner_norm_array(num_pruner_classes);
|
||||||
adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
|
adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
|
||||||
ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array, pruner_norm_array);
|
ComputeCharNormArrays(norm_feature, PreTrainedTemplates, &char_norm_array[0], &pruner_norm_array[0]);
|
||||||
|
|
||||||
PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), pruner_norm_array,
|
PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), &pruner_norm_array[0],
|
||||||
shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
|
shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
|
||||||
&adapt_results->CPResults);
|
&adapt_results->CPResults);
|
||||||
delete[] pruner_norm_array;
|
|
||||||
if (keep_this >= 0) {
|
if (keep_this >= 0) {
|
||||||
adapt_results->CPResults[0].Class = keep_this;
|
adapt_results->CPResults[0].Class = keep_this;
|
||||||
adapt_results->CPResults.resize(1);
|
adapt_results->CPResults.resize(1);
|
||||||
@ -1314,9 +1313,9 @@ int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const Trai
|
|||||||
results->push_back(UnicharRating(class_id, 1.0f - it.Rating));
|
results->push_back(UnicharRating(class_id, 1.0f - it.Rating));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
MasterMatcher(PreTrainedTemplates, num_features, sample.features(), char_norm_array, nullptr,
|
MasterMatcher(PreTrainedTemplates, num_features, sample.features(), &char_norm_array[0], nullptr,
|
||||||
matcher_debug_flags, classify_integer_matcher_multiplier, blob_box,
|
matcher_debug_flags, classify_integer_matcher_multiplier, blob_box,
|
||||||
adapt_results->CPResults, adapt_results);
|
adapt_results->CPResults, adapt_results.get());
|
||||||
// Convert master matcher results to output format.
|
// Convert master matcher results to output format.
|
||||||
for (auto &i : adapt_results->match) {
|
for (auto &i : adapt_results->match) {
|
||||||
results->push_back(i);
|
results->push_back(i);
|
||||||
@ -1325,8 +1324,6 @@ int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const Trai
|
|||||||
std::sort(results->begin(), results->end(), SortDescendingRating);
|
std::sort(results->begin(), results->end(), SortDescendingRating);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
delete[] char_norm_array;
|
|
||||||
delete adapt_results;
|
|
||||||
return num_features;
|
return num_features;
|
||||||
} /* CharNormTrainingSample */
|
} /* CharNormTrainingSample */
|
||||||
|
|
||||||
@ -1627,17 +1624,17 @@ int Classify::GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLA
|
|||||||
void Classify::ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,
|
void Classify::ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,
|
||||||
uint8_t *char_norm_array, uint8_t *pruner_array) {
|
uint8_t *char_norm_array, uint8_t *pruner_array) {
|
||||||
ComputeIntCharNormArray(*norm_feature, char_norm_array);
|
ComputeIntCharNormArray(*norm_feature, char_norm_array);
|
||||||
if (pruner_array != nullptr) {
|
//if (pruner_array != nullptr) {
|
||||||
if (shape_table_ == nullptr) {
|
if (shape_table_ == nullptr) {
|
||||||
ComputeIntCharNormArray(*norm_feature, pruner_array);
|
ComputeIntCharNormArray(*norm_feature, pruner_array);
|
||||||
} else {
|
} else {
|
||||||
memset(pruner_array, UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0]));
|
memset(&pruner_array[0], UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0]));
|
||||||
// Each entry in the pruner norm array is the MIN of all the entries of
|
// Each entry in the pruner norm array is the MIN of all the entries of
|
||||||
// the corresponding unichars in the CharNormArray.
|
// the corresponding unichars in the CharNormArray.
|
||||||
for (int id = 0; id < templates->NumClasses; ++id) {
|
for (unsigned id = 0; id < templates->NumClasses; ++id) {
|
||||||
int font_set_id = templates->Class[id]->font_set_id;
|
int font_set_id = templates->Class[id]->font_set_id;
|
||||||
const FontSet &fs = fontset_table_.at(font_set_id);
|
const FontSet &fs = fontset_table_.at(font_set_id);
|
||||||
for (int config = 0; config < fs.size(); ++config) {
|
for (unsigned config = 0; config < fs.size(); ++config) {
|
||||||
const Shape &shape = shape_table_->GetShape(fs[config]);
|
const Shape &shape = shape_table_->GetShape(fs[config]);
|
||||||
for (int c = 0; c < shape.size(); ++c) {
|
for (int c = 0; c < shape.size(); ++c) {
|
||||||
if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) {
|
if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) {
|
||||||
@ -1647,7 +1644,7 @@ void Classify::ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
//}
|
||||||
delete norm_feature;
|
delete norm_feature;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2117,11 +2114,11 @@ int Classify::ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_confi
|
|||||||
// Converts a shape_table_ index to a classifier class_id index (not a
|
// Converts a shape_table_ index to a classifier class_id index (not a
|
||||||
// unichar-id!). Uses a search, so not fast.
|
// unichar-id!). Uses a search, so not fast.
|
||||||
int Classify::ShapeIDToClassID(int shape_id) const {
|
int Classify::ShapeIDToClassID(int shape_id) const {
|
||||||
for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
|
for (unsigned id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
|
||||||
int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
|
int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
|
||||||
ASSERT_HOST(font_set_id >= 0);
|
ASSERT_HOST(font_set_id >= 0);
|
||||||
const FontSet &fs = fontset_table_.at(font_set_id);
|
const FontSet &fs = fontset_table_.at(font_set_id);
|
||||||
for (int config = 0; config < fs.size(); ++config) {
|
for (unsigned config = 0; config < fs.size(); ++config) {
|
||||||
if (fs[config] == shape_id) {
|
if (fs[config] == shape_id) {
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
@ -1489,7 +1489,7 @@ CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
|
|||||||
*
|
*
|
||||||
* @return Pointer to the new sample data structure
|
* @return Pointer to the new sample data structure
|
||||||
*/
|
*/
|
||||||
SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID) {
|
SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
// see if the samples have already been clustered - if so trap an error
|
// see if the samples have already been clustered - if so trap an error
|
||||||
|
@ -95,7 +95,7 @@ struct CLUSTERER {
|
|||||||
KDTREE *KDTree; // for optimal nearest neighbor searching
|
KDTREE *KDTree; // for optimal nearest neighbor searching
|
||||||
CLUSTER *Root; // ptr to root cluster of cluster tree
|
CLUSTER *Root; // ptr to root cluster of cluster tree
|
||||||
LIST ProtoList; // list of prototypes
|
LIST ProtoList; // list of prototypes
|
||||||
int32_t NumChar; // # of characters represented by samples
|
uint32_t NumChar; // # of characters represented by samples
|
||||||
// cache of reusable histograms by distribution type and number of buckets.
|
// cache of reusable histograms by distribution type and number of buckets.
|
||||||
BUCKETS *bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
|
BUCKETS *bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
|
||||||
};
|
};
|
||||||
@ -116,7 +116,7 @@ TESS_API
|
|||||||
CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]);
|
CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]);
|
||||||
|
|
||||||
TESS_API
|
TESS_API
|
||||||
SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID);
|
SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID);
|
||||||
|
|
||||||
TESS_API
|
TESS_API
|
||||||
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
|
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
|
||||||
|
@ -57,7 +57,7 @@ void Classify::ClearCharNormArray(uint8_t *char_norm_array) {
|
|||||||
*/
|
*/
|
||||||
void Classify::ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature,
|
void Classify::ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature,
|
||||||
uint8_t *char_norm_array) {
|
uint8_t *char_norm_array) {
|
||||||
for (int i = 0; i < unicharset.size(); i++) {
|
for (unsigned i = 0; i < unicharset.size(); i++) {
|
||||||
if (i < PreTrainedTemplates->NumClasses) {
|
if (i < PreTrainedTemplates->NumClasses) {
|
||||||
int norm_adjust =
|
int norm_adjust =
|
||||||
static_cast<int>(INT_CHAR_NORM_RANGE * ComputeNormMatch(i, norm_feature, false));
|
static_cast<int>(INT_CHAR_NORM_RANGE * ComputeNormMatch(i, norm_feature, false));
|
||||||
|
@ -165,7 +165,7 @@ public:
|
|||||||
void ComputeScores(const INT_TEMPLATES_STRUCT *int_templates, int num_features,
|
void ComputeScores(const INT_TEMPLATES_STRUCT *int_templates, int num_features,
|
||||||
const INT_FEATURE_STRUCT *features) {
|
const INT_FEATURE_STRUCT *features) {
|
||||||
num_features_ = num_features;
|
num_features_ = num_features;
|
||||||
int num_pruners = int_templates->NumClassPruners;
|
auto num_pruners = int_templates->NumClassPruners;
|
||||||
for (int f = 0; f < num_features; ++f) {
|
for (int f = 0; f < num_features; ++f) {
|
||||||
const INT_FEATURE_STRUCT *feature = &features[f];
|
const INT_FEATURE_STRUCT *feature = &features[f];
|
||||||
// Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
|
// Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
|
||||||
@ -175,7 +175,7 @@ public:
|
|||||||
int class_id = 0;
|
int class_id = 0;
|
||||||
// Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
|
// Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
|
||||||
// we need a collection of them, indexed by pruner_set.
|
// we need a collection of them, indexed by pruner_set.
|
||||||
for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
|
for (unsigned pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
|
||||||
// Look up quantized feature in a 3-D array, an array of weights for
|
// Look up quantized feature in a 3-D array, an array of weights for
|
||||||
// each class.
|
// each class.
|
||||||
const uint32_t *pruner_word_ptr = int_templates->ClassPruners[pruner_set]->p[x][y][theta];
|
const uint32_t *pruner_word_ptr = int_templates->ClassPruners[pruner_set]->p[x][y][theta];
|
||||||
|
@ -221,7 +221,7 @@ void AddIntClass(INT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, INT_CLASS_ST
|
|||||||
int Pruner;
|
int Pruner;
|
||||||
|
|
||||||
assert(LegalClassId(ClassId));
|
assert(LegalClassId(ClassId));
|
||||||
if (ClassId != Templates->NumClasses) {
|
if (static_cast<unsigned>(ClassId) != Templates->NumClasses) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"Please make sure that classes are added to templates"
|
"Please make sure that classes are added to templates"
|
||||||
" in increasing order of ClassIds\n");
|
" in increasing order of ClassIds\n");
|
||||||
@ -491,13 +491,12 @@ INT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos,
|
|||||||
const UNICHARSET &target_unicharset) {
|
const UNICHARSET &target_unicharset) {
|
||||||
CLASS_TYPE FClass;
|
CLASS_TYPE FClass;
|
||||||
INT_CLASS_STRUCT *IClass;
|
INT_CLASS_STRUCT *IClass;
|
||||||
int ClassId;
|
|
||||||
int ProtoId;
|
int ProtoId;
|
||||||
int ConfigId;
|
int ConfigId;
|
||||||
|
|
||||||
auto IntTemplates = new INT_TEMPLATES_STRUCT;
|
auto IntTemplates = new INT_TEMPLATES_STRUCT;
|
||||||
|
|
||||||
for (ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
|
for (unsigned ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
|
||||||
FClass = &(FloatProtos[ClassId]);
|
FClass = &(FloatProtos[ClassId]);
|
||||||
if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
|
if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
|
||||||
strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
|
strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
|
||||||
@ -507,7 +506,7 @@ INT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos,
|
|||||||
assert(UnusedClassIdIn(IntTemplates, ClassId));
|
assert(UnusedClassIdIn(IntTemplates, ClassId));
|
||||||
IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs);
|
IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs);
|
||||||
FontSet fs{FClass->font_set.size()};
|
FontSet fs{FClass->font_set.size()};
|
||||||
for (int i = 0; i < fs.size(); ++i) {
|
for (unsigned i = 0; i < fs.size(); ++i) {
|
||||||
fs[i] = FClass->font_set.at(i);
|
fs[i] = FClass->font_set.at(i);
|
||||||
}
|
}
|
||||||
if (this->fontset_table_.contains(fs)) {
|
if (this->fontset_table_.contains(fs)) {
|
||||||
@ -613,10 +612,10 @@ INT_TEMPLATES_STRUCT::INT_TEMPLATES_STRUCT() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
INT_TEMPLATES_STRUCT::~INT_TEMPLATES_STRUCT() {
|
INT_TEMPLATES_STRUCT::~INT_TEMPLATES_STRUCT() {
|
||||||
for (int i = 0; i < NumClasses; i++) {
|
for (unsigned i = 0; i < NumClasses; i++) {
|
||||||
delete Class[i];
|
delete Class[i];
|
||||||
}
|
}
|
||||||
for (int i = 0; i < NumClassPruners; i++) {
|
for (unsigned i = 0; i < NumClassPruners; i++) {
|
||||||
delete ClassPruners[i];
|
delete ClassPruners[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -630,9 +629,7 @@ INT_TEMPLATES_STRUCT::~INT_TEMPLATES_STRUCT() {
|
|||||||
* @note Globals: none
|
* @note Globals: none
|
||||||
*/
|
*/
|
||||||
INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
||||||
int i, j, w, x, y, z;
|
int j, w, x, y, z;
|
||||||
int unicharset_size;
|
|
||||||
int version_id = 0;
|
|
||||||
INT_TEMPLATES_STRUCT *Templates;
|
INT_TEMPLATES_STRUCT *Templates;
|
||||||
CLASS_PRUNER_STRUCT *Pruner;
|
CLASS_PRUNER_STRUCT *Pruner;
|
||||||
INT_CLASS_STRUCT *Class;
|
INT_CLASS_STRUCT *Class;
|
||||||
@ -645,25 +642,29 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
uint32_t SetBitsForMask = // word with NUM_BITS_PER_CLASS
|
uint32_t SetBitsForMask = // word with NUM_BITS_PER_CLASS
|
||||||
(1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
|
(1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
|
||||||
uint32_t Mask, NewMask, ClassBits;
|
uint32_t Mask, NewMask, ClassBits;
|
||||||
int MaxNumConfigs = MAX_NUM_CONFIGS;
|
unsigned MaxNumConfigs = MAX_NUM_CONFIGS;
|
||||||
int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
|
unsigned WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
|
||||||
|
|
||||||
/* first read the high level template struct */
|
/* first read the high level template struct */
|
||||||
Templates = new INT_TEMPLATES_STRUCT;
|
Templates = new INT_TEMPLATES_STRUCT;
|
||||||
// Read Templates in parts for 64 bit compatibility.
|
// Read Templates in parts for 64 bit compatibility.
|
||||||
|
uint32_t unicharset_size;
|
||||||
if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1) {
|
if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1) {
|
||||||
tprintf("Bad read of inttemp!\n");
|
tprintf("Bad read of inttemp!\n");
|
||||||
}
|
}
|
||||||
if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1) != 1 ||
|
int32_t version_id = 0;
|
||||||
|
if (fp->FReadEndian(&version_id, sizeof(version_id), 1) != 1 ||
|
||||||
fp->FReadEndian(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners), 1) != 1) {
|
fp->FReadEndian(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners), 1) != 1) {
|
||||||
tprintf("Bad read of inttemp!\n");
|
tprintf("Bad read of inttemp!\n");
|
||||||
}
|
}
|
||||||
if (Templates->NumClasses < 0) {
|
if (version_id < 0) {
|
||||||
// This file has a version id!
|
// This file has a version id!
|
||||||
version_id = -Templates->NumClasses;
|
version_id = -version_id;
|
||||||
if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1) != 1) {
|
if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1) != 1) {
|
||||||
tprintf("Bad read of inttemp!\n");
|
tprintf("Bad read of inttemp!\n");
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
Templates->NumClasses = version_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (version_id < 3) {
|
if (version_id < 3) {
|
||||||
@ -683,8 +684,8 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* then read in the class pruners */
|
/* then read in the class pruners */
|
||||||
const int kNumBuckets = NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR;
|
const unsigned kNumBuckets = NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR;
|
||||||
for (i = 0; i < Templates->NumClassPruners; i++) {
|
for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
|
||||||
Pruner = new CLASS_PRUNER_STRUCT;
|
Pruner = new CLASS_PRUNER_STRUCT;
|
||||||
if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) != kNumBuckets) {
|
if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) != kNumBuckets) {
|
||||||
tprintf("Bad read of inttemp!\n");
|
tprintf("Bad read of inttemp!\n");
|
||||||
@ -700,19 +701,19 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
if (version_id < 2) {
|
if (version_id < 2) {
|
||||||
// Allocate enough class pruners to cover all the class ids.
|
// Allocate enough class pruners to cover all the class ids.
|
||||||
max_class_id = 0;
|
max_class_id = 0;
|
||||||
for (i = 0; i < Templates->NumClasses; i++) {
|
for (unsigned i = 0; i < Templates->NumClasses; i++) {
|
||||||
if (ClassIdFor[i] > max_class_id) {
|
if (ClassIdFor[i] > max_class_id) {
|
||||||
max_class_id = ClassIdFor[i];
|
max_class_id = ClassIdFor[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
|
for (int i = 0; i <= CPrunerIdFor(max_class_id); i++) {
|
||||||
Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
|
Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
|
||||||
memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
|
memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
|
||||||
}
|
}
|
||||||
// Convert class pruners from the old format (indexed by class index)
|
// Convert class pruners from the old format (indexed by class index)
|
||||||
// to the new format (indexed by class id).
|
// to the new format (indexed by class id).
|
||||||
last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
|
last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
|
||||||
for (i = 0; i < Templates->NumClassPruners; i++) {
|
for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
|
||||||
for (x = 0; x < NUM_CP_BUCKETS; x++) {
|
for (x = 0; x < NUM_CP_BUCKETS; x++) {
|
||||||
for (y = 0; y < NUM_CP_BUCKETS; y++) {
|
for (y = 0; y < NUM_CP_BUCKETS; y++) {
|
||||||
for (z = 0; z < NUM_CP_BUCKETS; z++) {
|
for (z = 0; z < NUM_CP_BUCKETS; z++) {
|
||||||
@ -750,13 +751,13 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (i = 0; i < Templates->NumClassPruners; i++) {
|
for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
|
||||||
delete TempClassPruner[i];
|
delete TempClassPruner[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* then read in each class */
|
/* then read in each class */
|
||||||
for (i = 0; i < Templates->NumClasses; i++) {
|
for (unsigned i = 0; i < Templates->NumClasses; i++) {
|
||||||
/* first read in the high level struct for the class */
|
/* first read in the high level struct for the class */
|
||||||
Class = new INT_CLASS_STRUCT;
|
Class = new INT_CLASS_STRUCT;
|
||||||
if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||
|
if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||
|
||||||
@ -773,7 +774,7 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
|
unsigned num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
|
||||||
ASSERT_HOST(num_configs <= MaxNumConfigs);
|
ASSERT_HOST(num_configs <= MaxNumConfigs);
|
||||||
if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) != num_configs) {
|
if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) != num_configs) {
|
||||||
tprintf("Bad read of inttemp!\n");
|
tprintf("Bad read of inttemp!\n");
|
||||||
@ -797,7 +798,7 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
/* then read in the proto sets */
|
/* then read in the proto sets */
|
||||||
for (j = 0; j < Class->NumProtoSets; j++) {
|
for (j = 0; j < Class->NumProtoSets; j++) {
|
||||||
auto ProtoSet = new PROTO_SET_STRUCT;
|
auto ProtoSet = new PROTO_SET_STRUCT;
|
||||||
int num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
|
unsigned num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
|
||||||
if (fp->FReadEndian(&ProtoSet->ProtoPruner, sizeof(ProtoSet->ProtoPruner[0][0][0]),
|
if (fp->FReadEndian(&ProtoSet->ProtoPruner, sizeof(ProtoSet->ProtoPruner[0][0][0]),
|
||||||
num_buckets) != num_buckets) {
|
num_buckets) != num_buckets) {
|
||||||
tprintf("Bad read of inttemp!\n");
|
tprintf("Bad read of inttemp!\n");
|
||||||
@ -830,7 +831,7 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
ClassForClassId(Templates, 0)->font_set_id = -1;
|
ClassForClassId(Templates, 0)->font_set_id = -1;
|
||||||
Templates->NumClasses++;
|
Templates->NumClasses++;
|
||||||
/* make sure the classes are contiguous */
|
/* make sure the classes are contiguous */
|
||||||
for (i = 0; i < MAX_NUM_CLASSES; i++) {
|
for (unsigned i = 0; i < MAX_NUM_CLASSES; i++) {
|
||||||
if (i < Templates->NumClasses) {
|
if (i < Templates->NumClasses) {
|
||||||
if (ClassForClassId(Templates, i) == nullptr) {
|
if (ClassForClassId(Templates, i) == nullptr) {
|
||||||
fprintf(stderr, "Non-contiguous class ids in inttemp\n");
|
fprintf(stderr, "Non-contiguous class ids in inttemp\n");
|
||||||
@ -838,7 +839,7 @@ INT_TEMPLATES_STRUCT *Classify::ReadIntTemplates(TFile *fp) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (ClassForClassId(Templates, i) != nullptr) {
|
if (ClassForClassId(Templates, i) != nullptr) {
|
||||||
fprintf(stderr, "Class id %d exceeds NumClassesIn (Templates) %d\n", i,
|
fprintf(stderr, "Class id %u exceeds NumClassesIn (Templates) %u\n", i,
|
||||||
Templates->NumClasses);
|
Templates->NumClasses);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
@ -919,15 +920,14 @@ void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window) {
|
|||||||
*/
|
*/
|
||||||
void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates,
|
void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates,
|
||||||
const UNICHARSET &target_unicharset) {
|
const UNICHARSET &target_unicharset) {
|
||||||
int i, j;
|
|
||||||
INT_CLASS_STRUCT *Class;
|
INT_CLASS_STRUCT *Class;
|
||||||
int unicharset_size = target_unicharset.size();
|
auto unicharset_size = target_unicharset.size();
|
||||||
int version_id = -5; // When negated by the reader -1 becomes +1 etc.
|
int version_id = -5; // When negated by the reader -1 becomes +1 etc.
|
||||||
|
|
||||||
if (Templates->NumClasses != unicharset_size) {
|
if (Templates->NumClasses != unicharset_size) {
|
||||||
tprintf(
|
tprintf(
|
||||||
"Warning: executing WriteIntTemplates() with %d classes in"
|
"Warning: executing WriteIntTemplates() with %d classes in"
|
||||||
" Templates, while target_unicharset size is %d\n",
|
" Templates, while target_unicharset size is %zu\n",
|
||||||
Templates->NumClasses, unicharset_size);
|
Templates->NumClasses, unicharset_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -938,12 +938,12 @@ void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates,
|
|||||||
fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);
|
fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);
|
||||||
|
|
||||||
/* then write out the class pruners */
|
/* then write out the class pruners */
|
||||||
for (i = 0; i < Templates->NumClassPruners; i++) {
|
for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
|
||||||
fwrite(Templates->ClassPruners[i], sizeof(CLASS_PRUNER_STRUCT), 1, File);
|
fwrite(Templates->ClassPruners[i], sizeof(CLASS_PRUNER_STRUCT), 1, File);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* then write out each class */
|
/* then write out each class */
|
||||||
for (i = 0; i < Templates->NumClasses; i++) {
|
for (unsigned i = 0; i < Templates->NumClasses; i++) {
|
||||||
Class = Templates->Class[i];
|
Class = Templates->Class[i];
|
||||||
|
|
||||||
/* first write out the high level struct for the class */
|
/* first write out the high level struct for the class */
|
||||||
@ -951,7 +951,7 @@ void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates,
|
|||||||
fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
|
fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
|
||||||
ASSERT_HOST(Class->NumConfigs == this->fontset_table_.at(Class->font_set_id).size());
|
ASSERT_HOST(Class->NumConfigs == this->fontset_table_.at(Class->font_set_id).size());
|
||||||
fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
|
fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
|
||||||
for (j = 0; j < Class->NumConfigs; ++j) {
|
for (int j = 0; j < Class->NumConfigs; ++j) {
|
||||||
fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);
|
fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -961,7 +961,7 @@ void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* then write out the proto sets */
|
/* then write out the proto sets */
|
||||||
for (j = 0; j < Class->NumProtoSets; j++) {
|
for (int j = 0; j < Class->NumProtoSets; j++) {
|
||||||
fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);
|
fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -991,7 +991,7 @@ void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates,
|
|||||||
* @note Globals: none
|
* @note Globals: none
|
||||||
*/
|
*/
|
||||||
float BucketStart(int Bucket, float Offset, int NumBuckets) {
|
float BucketStart(int Bucket, float Offset, int NumBuckets) {
|
||||||
return ((static_cast<float>(Bucket) / NumBuckets) - Offset);
|
return static_cast<float>(Bucket) / NumBuckets - Offset;
|
||||||
|
|
||||||
} /* BucketStart */
|
} /* BucketStart */
|
||||||
|
|
||||||
@ -1007,7 +1007,7 @@ float BucketStart(int Bucket, float Offset, int NumBuckets) {
|
|||||||
* @note Globals: none
|
* @note Globals: none
|
||||||
*/
|
*/
|
||||||
float BucketEnd(int Bucket, float Offset, int NumBuckets) {
|
float BucketEnd(int Bucket, float Offset, int NumBuckets) {
|
||||||
return ((static_cast<float>(Bucket + 1) / NumBuckets) - Offset);
|
return static_cast<float>(Bucket + 1) / NumBuckets - Offset;
|
||||||
} /* BucketEnd */
|
} /* BucketEnd */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1180,7 +1180,7 @@ CLASS_ID Classify::GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *
|
|||||||
*shape_id = atoi(ev->parameter);
|
*shape_id = atoi(ev->parameter);
|
||||||
*adaptive_on = false;
|
*adaptive_on = false;
|
||||||
*pretrained_on = true;
|
*pretrained_on = true;
|
||||||
if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
|
if (*shape_id >= 0 && static_cast<unsigned>(*shape_id) < shape_table_->NumShapes()) {
|
||||||
int font_id;
|
int font_id;
|
||||||
shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id, &font_id);
|
shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id, &font_id);
|
||||||
tprintf("Shape %d, first unichar=%d, font=%d\n", *shape_id, unichar_id, font_id);
|
tprintf("Shape %d, first unichar=%d, font=%d\n", *shape_id, unichar_id, font_id);
|
||||||
@ -1208,7 +1208,7 @@ CLASS_ID Classify::GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *
|
|||||||
*shape_id = -1;
|
*shape_id = -1;
|
||||||
return unichar_id;
|
return unichar_id;
|
||||||
}
|
}
|
||||||
for (int s = 0; s < shape_table_->NumShapes(); ++s) {
|
for (unsigned s = 0; s < shape_table_->NumShapes(); ++s) {
|
||||||
if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
|
if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
|
||||||
tprintf("%s\n", shape_table_->DebugStr(s).c_str());
|
tprintf("%s\n", shape_table_->DebugStr(s).c_str());
|
||||||
}
|
}
|
||||||
|
@ -106,8 +106,8 @@ struct INT_CLASS_STRUCT {
|
|||||||
struct TESS_API INT_TEMPLATES_STRUCT {
|
struct TESS_API INT_TEMPLATES_STRUCT {
|
||||||
INT_TEMPLATES_STRUCT();
|
INT_TEMPLATES_STRUCT();
|
||||||
~INT_TEMPLATES_STRUCT();
|
~INT_TEMPLATES_STRUCT();
|
||||||
int NumClasses;
|
unsigned NumClasses;
|
||||||
int NumClassPruners;
|
unsigned NumClassPruners;
|
||||||
INT_CLASS_STRUCT *Class[MAX_NUM_CLASSES];
|
INT_CLASS_STRUCT *Class[MAX_NUM_CLASSES];
|
||||||
CLASS_PRUNER_STRUCT *ClassPruners[MAX_NUM_CLASS_PRUNERS];
|
CLASS_PRUNER_STRUCT *ClassPruners[MAX_NUM_CLASS_PRUNERS];
|
||||||
};
|
};
|
||||||
|
@ -204,13 +204,13 @@ void ShapeClassifier::FilterDuplicateUnichars(std::vector<ShapeRating> *results)
|
|||||||
std::vector<ShapeRating> filtered_results;
|
std::vector<ShapeRating> filtered_results;
|
||||||
// Copy results to filtered results and knock out duplicate unichars.
|
// Copy results to filtered results and knock out duplicate unichars.
|
||||||
const ShapeTable *shapes = GetShapeTable();
|
const ShapeTable *shapes = GetShapeTable();
|
||||||
for (int r = 0; r < results->size(); ++r) {
|
for (unsigned r = 0; r < results->size(); ++r) {
|
||||||
if (r > 0) {
|
if (r > 0) {
|
||||||
const Shape &shape_r = shapes->GetShape((*results)[r].shape_id);
|
const Shape &shape_r = shapes->GetShape((*results)[r].shape_id);
|
||||||
int c;
|
int c;
|
||||||
for (c = 0; c < shape_r.size(); ++c) {
|
for (c = 0; c < shape_r.size(); ++c) {
|
||||||
int unichar_id = shape_r[c].unichar_id;
|
int unichar_id = shape_r[c].unichar_id;
|
||||||
int s;
|
unsigned s;
|
||||||
for (s = 0; s < r; ++s) {
|
for (s = 0; s < r; ++s) {
|
||||||
const Shape &shape_s = shapes->GetShape((*results)[s].shape_id);
|
const Shape &shape_s = shapes->GetShape((*results)[s].shape_id);
|
||||||
if (shape_s.ContainsUnichar(unichar_id)) {
|
if (shape_s.ContainsUnichar(unichar_id)) {
|
||||||
|
@ -37,8 +37,8 @@ namespace tesseract {
|
|||||||
// Returns -1 if the unichar_id is not found
|
// Returns -1 if the unichar_id is not found
|
||||||
int ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results,
|
int ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results,
|
||||||
const ShapeTable &shape_table, UNICHAR_ID unichar_id) {
|
const ShapeTable &shape_table, UNICHAR_ID unichar_id) {
|
||||||
for (int r = 0; r < results.size(); ++r) {
|
for (unsigned r = 0; r < results.size(); ++r) {
|
||||||
const int shape_id = results[r].shape_id;
|
const auto shape_id = results[r].shape_id;
|
||||||
const Shape &shape = shape_table.GetShape(shape_id);
|
const Shape &shape = shape_table.GetShape(shape_id);
|
||||||
if (shape.ContainsUnichar(unichar_id)) {
|
if (shape.ContainsUnichar(unichar_id)) {
|
||||||
return r;
|
return r;
|
||||||
@ -53,7 +53,7 @@ int ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results,
|
|||||||
// Returns -1 if the unichar_id is not found
|
// Returns -1 if the unichar_id is not found
|
||||||
int UnicharRating::FirstResultWithUnichar(const std::vector<UnicharRating> &results,
|
int UnicharRating::FirstResultWithUnichar(const std::vector<UnicharRating> &results,
|
||||||
UNICHAR_ID unichar_id) {
|
UNICHAR_ID unichar_id) {
|
||||||
for (int r = 0; r < results.size(); ++r) {
|
for (unsigned r = 0; r < results.size(); ++r) {
|
||||||
if (results[r].unichar_id == unichar_id) {
|
if (results[r].unichar_id == unichar_id) {
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
@ -122,7 +122,7 @@ void Shape::AddToShape(int unichar_id, int font_id) {
|
|||||||
// Adds everything in other to this.
|
// Adds everything in other to this.
|
||||||
void Shape::AddShape(const Shape &other) {
|
void Shape::AddShape(const Shape &other) {
|
||||||
for (const auto &unichar : other.unichars_) {
|
for (const auto &unichar : other.unichars_) {
|
||||||
for (int f = 0; f < unichar.font_ids.size(); ++f) {
|
for (unsigned f = 0; f < unichar.font_ids.size(); ++f) {
|
||||||
AddToShape(unichar.unichar_id, unichar.font_ids[f]);
|
AddToShape(unichar.unichar_id, unichar.font_ids[f]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -229,7 +229,7 @@ bool Shape::IsEqualUnichars(Shape *other) {
|
|||||||
if (!other->unichars_sorted_) {
|
if (!other->unichars_sorted_) {
|
||||||
other->SortUnichars();
|
other->SortUnichars();
|
||||||
}
|
}
|
||||||
for (int c = 0; c < unichars_.size(); ++c) {
|
for (unsigned c = 0; c < unichars_.size(); ++c) {
|
||||||
if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) {
|
if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -289,8 +289,8 @@ void ShapeTable::ReMapClassIds(const std::vector<int> &unicharset_map) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns a string listing the classes/fonts in a shape.
|
// Returns a string listing the classes/fonts in a shape.
|
||||||
std::string ShapeTable::DebugStr(int shape_id) const {
|
std::string ShapeTable::DebugStr(unsigned shape_id) const {
|
||||||
if (shape_id < 0 || shape_id >= shape_table_.size()) {
|
if (shape_id >= shape_table_.size()) {
|
||||||
return "INVALID_UNICHAR_ID";
|
return "INVALID_UNICHAR_ID";
|
||||||
}
|
}
|
||||||
const Shape &shape = GetShape(shape_id);
|
const Shape &shape = GetShape(shape_id);
|
||||||
@ -326,7 +326,7 @@ std::string ShapeTable::SummaryStr() const {
|
|||||||
int max_unichars = 0;
|
int max_unichars = 0;
|
||||||
int num_multi_shapes = 0;
|
int num_multi_shapes = 0;
|
||||||
int num_master_shapes = 0;
|
int num_master_shapes = 0;
|
||||||
for (int s = 0; s < shape_table_.size(); ++s) {
|
for (unsigned s = 0; s < shape_table_.size(); ++s) {
|
||||||
if (MasterDestinationIndex(s) != s) {
|
if (MasterDestinationIndex(s) != s) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -348,8 +348,8 @@ std::string ShapeTable::SummaryStr() const {
|
|||||||
|
|
||||||
// Adds a new shape starting with the given unichar_id and font_id.
|
// Adds a new shape starting with the given unichar_id and font_id.
|
||||||
// Returns the assigned index.
|
// Returns the assigned index.
|
||||||
int ShapeTable::AddShape(int unichar_id, int font_id) {
|
unsigned ShapeTable::AddShape(int unichar_id, int font_id) {
|
||||||
int index = shape_table_.size();
|
auto index = shape_table_.size();
|
||||||
auto *shape = new Shape;
|
auto *shape = new Shape;
|
||||||
shape->AddToShape(unichar_id, font_id);
|
shape->AddToShape(unichar_id, font_id);
|
||||||
shape_table_.push_back(shape);
|
shape_table_.push_back(shape);
|
||||||
@ -359,8 +359,8 @@ int ShapeTable::AddShape(int unichar_id, int font_id) {
|
|||||||
|
|
||||||
// Adds a copy of the given shape unless it is already present.
|
// Adds a copy of the given shape unless it is already present.
|
||||||
// Returns the assigned index or index of existing shape if already present.
|
// Returns the assigned index or index of existing shape if already present.
|
||||||
int ShapeTable::AddShape(const Shape &other) {
|
unsigned ShapeTable::AddShape(const Shape &other) {
|
||||||
int index;
|
unsigned index;
|
||||||
for (index = 0; index < shape_table_.size() && !(other == *shape_table_[index]); ++index) {
|
for (index = 0; index < shape_table_.size() && !(other == *shape_table_[index]); ++index) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -373,21 +373,21 @@ int ShapeTable::AddShape(const Shape &other) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Removes the shape given by the shape index.
|
// Removes the shape given by the shape index.
|
||||||
void ShapeTable::DeleteShape(int shape_id) {
|
void ShapeTable::DeleteShape(unsigned shape_id) {
|
||||||
delete shape_table_[shape_id];
|
delete shape_table_[shape_id];
|
||||||
shape_table_.erase(shape_table_.begin() + shape_id);
|
shape_table_.erase(shape_table_.begin() + shape_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adds a font_id to the given existing shape index for the given
|
// Adds a font_id to the given existing shape index for the given
|
||||||
// unichar_id. If the unichar_id is not in the shape, it is added.
|
// unichar_id. If the unichar_id is not in the shape, it is added.
|
||||||
void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
|
void ShapeTable::AddToShape(unsigned shape_id, int unichar_id, int font_id) {
|
||||||
Shape &shape = *shape_table_[shape_id];
|
Shape &shape = *shape_table_[shape_id];
|
||||||
shape.AddToShape(unichar_id, font_id);
|
shape.AddToShape(unichar_id, font_id);
|
||||||
num_fonts_ = std::max(num_fonts_, font_id + 1);
|
num_fonts_ = std::max(num_fonts_, font_id + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adds the given shape to the existing shape with the given index.
|
// Adds the given shape to the existing shape with the given index.
|
||||||
void ShapeTable::AddShapeToShape(int shape_id, const Shape &other) {
|
void ShapeTable::AddShapeToShape(unsigned shape_id, const Shape &other) {
|
||||||
Shape &shape = *shape_table_[shape_id];
|
Shape &shape = *shape_table_[shape_id];
|
||||||
shape.AddShape(other);
|
shape.AddShape(other);
|
||||||
num_fonts_ = 0;
|
num_fonts_ = 0;
|
||||||
@ -398,7 +398,7 @@ void ShapeTable::AddShapeToShape(int shape_id, const Shape &other) {
|
|||||||
// If font_id < 0, the font_id is ignored and the first shape that matches
|
// If font_id < 0, the font_id is ignored and the first shape that matches
|
||||||
// the unichar_id is returned.
|
// the unichar_id is returned.
|
||||||
int ShapeTable::FindShape(int unichar_id, int font_id) const {
|
int ShapeTable::FindShape(int unichar_id, int font_id) const {
|
||||||
for (int s = 0; s < shape_table_.size(); ++s) {
|
for (unsigned s = 0; s < shape_table_.size(); ++s) {
|
||||||
const Shape &shape = GetShape(s);
|
const Shape &shape = GetShape(s);
|
||||||
for (int c = 0; c < shape.size(); ++c) {
|
for (int c = 0; c < shape.size(); ++c) {
|
||||||
if (shape[c].unichar_id == unichar_id) {
|
if (shape[c].unichar_id == unichar_id) {
|
||||||
@ -417,7 +417,7 @@ int ShapeTable::FindShape(int unichar_id, int font_id) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns the first unichar_id and font_id in the given shape.
|
// Returns the first unichar_id and font_id in the given shape.
|
||||||
void ShapeTable::GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const {
|
void ShapeTable::GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const {
|
||||||
const UnicharAndFonts &unichar_and_fonts = (*shape_table_[shape_id])[0];
|
const UnicharAndFonts &unichar_and_fonts = (*shape_table_[shape_id])[0];
|
||||||
*unichar_id = unichar_and_fonts.unichar_id;
|
*unichar_id = unichar_and_fonts.unichar_id;
|
||||||
*font_id = unichar_and_fonts.font_ids[0];
|
*font_id = unichar_and_fonts.font_ids[0];
|
||||||
@ -428,7 +428,7 @@ void ShapeTable::GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font
|
|||||||
int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shapes) {
|
int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shapes) {
|
||||||
BitVector shape_map(master_shapes.NumShapes());
|
BitVector shape_map(master_shapes.NumShapes());
|
||||||
for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
|
for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
|
||||||
for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
|
for (unsigned f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
|
||||||
int c = shape[u_ind].unichar_id;
|
int c = shape[u_ind].unichar_id;
|
||||||
int f = shape[u_ind].font_ids[f_ind];
|
int f = shape[u_ind].font_ids[f_ind];
|
||||||
int master_id = master_shapes.FindShape(c, f);
|
int master_id = master_shapes.FindShape(c, f);
|
||||||
@ -440,7 +440,7 @@ int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shap
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
int num_masters = 0;
|
int num_masters = 0;
|
||||||
for (int s = 0; s < master_shapes.NumShapes(); ++s) {
|
for (unsigned s = 0; s < master_shapes.NumShapes(); ++s) {
|
||||||
if (shape_map[s]) {
|
if (shape_map[s]) {
|
||||||
AddShape(master_shapes.GetShape(s));
|
AddShape(master_shapes.GetShape(s));
|
||||||
++num_masters;
|
++num_masters;
|
||||||
@ -450,14 +450,14 @@ int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shap
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if the shapes are already merged.
|
// Returns true if the shapes are already merged.
|
||||||
bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const {
|
bool ShapeTable::AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const {
|
||||||
return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
|
return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if any shape contains multiple unichars.
|
// Returns true if any shape contains multiple unichars.
|
||||||
bool ShapeTable::AnyMultipleUnichars() const {
|
bool ShapeTable::AnyMultipleUnichars() const {
|
||||||
int num_shapes = NumShapes();
|
auto num_shapes = NumShapes();
|
||||||
for (int s1 = 0; s1 < num_shapes; ++s1) {
|
for (unsigned s1 = 0; s1 < num_shapes; ++s1) {
|
||||||
if (MasterDestinationIndex(s1) != s1) {
|
if (MasterDestinationIndex(s1) != s1) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -482,11 +482,11 @@ int ShapeTable::MaxNumUnichars() const {
|
|||||||
|
|
||||||
// Merges shapes with a common unichar over the [start, end) interval.
|
// Merges shapes with a common unichar over the [start, end) interval.
|
||||||
// Assumes single unichar per shape.
|
// Assumes single unichar per shape.
|
||||||
void ShapeTable::ForceFontMerges(int start, int end) {
|
void ShapeTable::ForceFontMerges(unsigned start, unsigned end) {
|
||||||
for (int s1 = start; s1 < end; ++s1) {
|
for (unsigned s1 = start; s1 < end; ++s1) {
|
||||||
if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
|
if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
|
||||||
int unichar_id = GetShape(s1)[0].unichar_id;
|
int unichar_id = GetShape(s1)[0].unichar_id;
|
||||||
for (int s2 = s1 + 1; s2 < end; ++s2) {
|
for (auto s2 = s1 + 1; s2 < end; ++s2) {
|
||||||
if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
|
if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
|
||||||
unichar_id == GetShape(s2)[0].unichar_id) {
|
unichar_id == GetShape(s2)[0].unichar_id) {
|
||||||
MergeShapes(s1, s2);
|
MergeShapes(s1, s2);
|
||||||
@ -500,13 +500,13 @@ void ShapeTable::ForceFontMerges(int start, int end) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns the number of unichars in the master shape.
|
// Returns the number of unichars in the master shape.
|
||||||
int ShapeTable::MasterUnicharCount(int shape_id) const {
|
unsigned ShapeTable::MasterUnicharCount(unsigned shape_id) const {
|
||||||
int master_id = MasterDestinationIndex(shape_id);
|
int master_id = MasterDestinationIndex(shape_id);
|
||||||
return GetShape(master_id).size();
|
return GetShape(master_id).size();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the sum of the font counts in the master shape.
|
// Returns the sum of the font counts in the master shape.
|
||||||
int ShapeTable::MasterFontCount(int shape_id) const {
|
int ShapeTable::MasterFontCount(unsigned shape_id) const {
|
||||||
int master_id = MasterDestinationIndex(shape_id);
|
int master_id = MasterDestinationIndex(shape_id);
|
||||||
const Shape &shape = GetShape(master_id);
|
const Shape &shape = GetShape(master_id);
|
||||||
int font_count = 0;
|
int font_count = 0;
|
||||||
@ -517,7 +517,7 @@ int ShapeTable::MasterFontCount(int shape_id) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns the number of unichars that would result from merging the shapes.
|
// Returns the number of unichars that would result from merging the shapes.
|
||||||
int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
|
int ShapeTable::MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const {
|
||||||
// Do it the easy way for now.
|
// Do it the easy way for now.
|
||||||
int master_id1 = MasterDestinationIndex(shape_id1);
|
int master_id1 = MasterDestinationIndex(shape_id1);
|
||||||
int master_id2 = MasterDestinationIndex(shape_id2);
|
int master_id2 = MasterDestinationIndex(shape_id2);
|
||||||
@ -527,9 +527,9 @@ int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Merges two shape_ids, leaving shape_id2 marked as merged.
|
// Merges two shape_ids, leaving shape_id2 marked as merged.
|
||||||
void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
|
void ShapeTable::MergeShapes(unsigned shape_id1, unsigned shape_id2) {
|
||||||
int master_id1 = MasterDestinationIndex(shape_id1);
|
auto master_id1 = MasterDestinationIndex(shape_id1);
|
||||||
int master_id2 = MasterDestinationIndex(shape_id2);
|
auto master_id2 = MasterDestinationIndex(shape_id2);
|
||||||
// Point master_id2 (and all merged shapes) to master_id1.
|
// Point master_id2 (and all merged shapes) to master_id1.
|
||||||
shape_table_[master_id2]->set_destination_index(master_id1);
|
shape_table_[master_id2]->set_destination_index(master_id1);
|
||||||
// Add all the shapes of master_id2 to master_id1.
|
// Add all the shapes of master_id2 to master_id1.
|
||||||
@ -537,7 +537,7 @@ void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Swaps two shape_ids.
|
// Swaps two shape_ids.
|
||||||
void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
|
void ShapeTable::SwapShapes(unsigned shape_id1, unsigned shape_id2) {
|
||||||
Shape *tmp = shape_table_[shape_id1];
|
Shape *tmp = shape_table_[shape_id1];
|
||||||
shape_table_[shape_id1] = shape_table_[shape_id2];
|
shape_table_[shape_id1] = shape_table_[shape_id2];
|
||||||
shape_table_[shape_id2] = tmp;
|
shape_table_[shape_id2] = tmp;
|
||||||
@ -545,12 +545,12 @@ void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
|
|||||||
|
|
||||||
// Returns the destination of this shape, (if merged), taking into account
|
// Returns the destination of this shape, (if merged), taking into account
|
||||||
// the fact that the destination may itself have been merged.
|
// the fact that the destination may itself have been merged.
|
||||||
int ShapeTable::MasterDestinationIndex(int shape_id) const {
|
unsigned ShapeTable::MasterDestinationIndex(unsigned shape_id) const {
|
||||||
int dest_id = shape_table_[shape_id]->destination_index();
|
auto dest_id = shape_table_[shape_id]->destination_index();
|
||||||
if (dest_id == shape_id || dest_id < 0) {
|
if (static_cast<unsigned>(dest_id) == shape_id || dest_id < 0) {
|
||||||
return shape_id; // Is master already.
|
return shape_id; // Is master already.
|
||||||
}
|
}
|
||||||
int master_id = shape_table_[dest_id]->destination_index();
|
auto master_id = shape_table_[dest_id]->destination_index();
|
||||||
if (master_id == dest_id || master_id < 0) {
|
if (master_id == dest_id || master_id < 0) {
|
||||||
return dest_id; // Dest is the master and shape_id points to it.
|
return dest_id; // Dest is the master and shape_id points to it.
|
||||||
}
|
}
|
||||||
@ -559,7 +559,7 @@ int ShapeTable::MasterDestinationIndex(int shape_id) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns false if the unichars in neither shape is a subset of the other.
|
// Returns false if the unichars in neither shape is a subset of the other.
|
||||||
bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
|
bool ShapeTable::SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const {
|
||||||
const Shape &shape1 = GetShape(shape_id1);
|
const Shape &shape1 = GetShape(shape_id1);
|
||||||
const Shape &shape2 = GetShape(shape_id2);
|
const Shape &shape2 = GetShape(shape_id2);
|
||||||
int c1, c2;
|
int c1, c2;
|
||||||
@ -579,7 +579,7 @@ bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns false if the unichars in neither shape is a subset of the other.
|
// Returns false if the unichars in neither shape is a subset of the other.
|
||||||
bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const {
|
bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const {
|
||||||
const Shape &merge1 = GetShape(merge_id1);
|
const Shape &merge1 = GetShape(merge_id1);
|
||||||
const Shape &merge2 = GetShape(merge_id2);
|
const Shape &merge2 = GetShape(merge_id2);
|
||||||
const Shape &shape = GetShape(shape_id);
|
const Shape &shape = GetShape(shape_id);
|
||||||
@ -606,7 +606,7 @@ bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if the unichar sets are equal between the shapes.
|
// Returns true if the unichar sets are equal between the shapes.
|
||||||
bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
|
bool ShapeTable::EqualUnichars(unsigned shape_id1, unsigned shape_id2) const {
|
||||||
const Shape &shape1 = GetShape(shape_id1);
|
const Shape &shape1 = GetShape(shape_id1);
|
||||||
const Shape &shape2 = GetShape(shape_id2);
|
const Shape &shape2 = GetShape(shape_id2);
|
||||||
for (int c1 = 0; c1 < shape1.size(); ++c1) {
|
for (int c1 = 0; c1 < shape1.size(); ++c1) {
|
||||||
@ -625,7 +625,7 @@ bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if the unichar sets are equal between the shapes.
|
// Returns true if the unichar sets are equal between the shapes.
|
||||||
bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const {
|
bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const {
|
||||||
const Shape &merge1 = GetShape(merge_id1);
|
const Shape &merge1 = GetShape(merge_id1);
|
||||||
const Shape &merge2 = GetShape(merge_id2);
|
const Shape &merge2 = GetShape(merge_id2);
|
||||||
const Shape &shape = GetShape(shape_id);
|
const Shape &shape = GetShape(shape_id);
|
||||||
@ -651,7 +651,7 @@ bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if there is a common unichar between the shapes.
|
// Returns true if there is a common unichar between the shapes.
|
||||||
bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
|
bool ShapeTable::CommonUnichars(unsigned shape_id1, unsigned shape_id2) const {
|
||||||
const Shape &shape1 = GetShape(shape_id1);
|
const Shape &shape1 = GetShape(shape_id1);
|
||||||
const Shape &shape2 = GetShape(shape_id2);
|
const Shape &shape2 = GetShape(shape_id2);
|
||||||
for (int c1 = 0; c1 < shape1.size(); ++c1) {
|
for (int c1 = 0; c1 < shape1.size(); ++c1) {
|
||||||
@ -664,7 +664,7 @@ bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns true if there is a common font id between the shapes.
|
// Returns true if there is a common font id between the shapes.
|
||||||
bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const {
|
bool ShapeTable::CommonFont(unsigned shape_id1, unsigned shape_id2) const {
|
||||||
const Shape &shape1 = GetShape(shape_id1);
|
const Shape &shape1 = GetShape(shape_id1);
|
||||||
const Shape &shape2 = GetShape(shape_id2);
|
const Shape &shape2 = GetShape(shape_id2);
|
||||||
for (int c1 = 0; c1 < shape1.size(); ++c1) {
|
for (int c1 = 0; c1 < shape1.size(); ++c1) {
|
||||||
@ -685,7 +685,7 @@ void ShapeTable::AppendMasterShapes(const ShapeTable &other, std::vector<int> *s
|
|||||||
shape_map->clear();
|
shape_map->clear();
|
||||||
shape_map->resize(other.NumShapes(), -1);
|
shape_map->resize(other.NumShapes(), -1);
|
||||||
}
|
}
|
||||||
for (int s = 0; s < other.shape_table_.size(); ++s) {
|
for (unsigned s = 0; s < other.shape_table_.size(); ++s) {
|
||||||
if (other.shape_table_[s]->destination_index() < 0) {
|
if (other.shape_table_[s]->destination_index() < 0) {
|
||||||
int index = AddShape(*other.shape_table_[s]);
|
int index = AddShape(*other.shape_table_[s]);
|
||||||
if (shape_map != nullptr) {
|
if (shape_map != nullptr) {
|
||||||
|
@ -245,7 +245,7 @@ public:
|
|||||||
bool DeSerialize(TFile *fp);
|
bool DeSerialize(TFile *fp);
|
||||||
|
|
||||||
// Accessors.
|
// Accessors.
|
||||||
int NumShapes() const {
|
unsigned NumShapes() const {
|
||||||
return shape_table_.size();
|
return shape_table_.size();
|
||||||
}
|
}
|
||||||
const UNICHARSET &unicharset() const {
|
const UNICHARSET &unicharset() const {
|
||||||
@ -263,36 +263,36 @@ public:
|
|||||||
// Useful in conjunction with set_unicharset.
|
// Useful in conjunction with set_unicharset.
|
||||||
void ReMapClassIds(const std::vector<int> &unicharset_map);
|
void ReMapClassIds(const std::vector<int> &unicharset_map);
|
||||||
// Returns a string listing the classes/fonts in a shape.
|
// Returns a string listing the classes/fonts in a shape.
|
||||||
std::string DebugStr(int shape_id) const;
|
std::string DebugStr(unsigned shape_id) const;
|
||||||
// Returns a debug string summarizing the table.
|
// Returns a debug string summarizing the table.
|
||||||
std::string SummaryStr() const;
|
std::string SummaryStr() const;
|
||||||
|
|
||||||
// Adds a new shape starting with the given unichar_id and font_id.
|
// Adds a new shape starting with the given unichar_id and font_id.
|
||||||
// Returns the assigned index.
|
// Returns the assigned index.
|
||||||
int AddShape(int unichar_id, int font_id);
|
unsigned AddShape(int unichar_id, int font_id);
|
||||||
// Adds a copy of the given shape unless it is already present.
|
// Adds a copy of the given shape unless it is already present.
|
||||||
// Returns the assigned index or index of existing shape if already present.
|
// Returns the assigned index or index of existing shape if already present.
|
||||||
int AddShape(const Shape &other);
|
unsigned AddShape(const Shape &other);
|
||||||
// Removes the shape given by the shape index. All indices above are changed!
|
// Removes the shape given by the shape index. All indices above are changed!
|
||||||
void DeleteShape(int shape_id);
|
void DeleteShape(unsigned shape_id);
|
||||||
// Adds a font_id to the given existing shape index for the given
|
// Adds a font_id to the given existing shape index for the given
|
||||||
// unichar_id. If the unichar_id is not in the shape, it is added.
|
// unichar_id. If the unichar_id is not in the shape, it is added.
|
||||||
void AddToShape(int shape_id, int unichar_id, int font_id);
|
void AddToShape(unsigned shape_id, int unichar_id, int font_id);
|
||||||
// Adds the given shape to the existing shape with the given index.
|
// Adds the given shape to the existing shape with the given index.
|
||||||
void AddShapeToShape(int shape_id, const Shape &other);
|
void AddShapeToShape(unsigned shape_id, const Shape &other);
|
||||||
// Returns the id of the shape that contains the given unichar and font.
|
// Returns the id of the shape that contains the given unichar and font.
|
||||||
// If not found, returns -1.
|
// If not found, returns -1.
|
||||||
// If font_id < 0, the font_id is ignored and the first shape that matches
|
// If font_id < 0, the font_id is ignored and the first shape that matches
|
||||||
// the unichar_id is returned.
|
// the unichar_id is returned.
|
||||||
int FindShape(int unichar_id, int font_id) const;
|
int FindShape(int unichar_id, int font_id) const;
|
||||||
// Returns the first unichar_id and font_id in the given shape.
|
// Returns the first unichar_id and font_id in the given shape.
|
||||||
void GetFirstUnicharAndFont(int shape_id, int *unichar_id, int *font_id) const;
|
void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const;
|
||||||
|
|
||||||
// Accessors for the Shape with the given shape_id.
|
// Accessors for the Shape with the given shape_id.
|
||||||
const Shape &GetShape(int shape_id) const {
|
const Shape &GetShape(unsigned shape_id) const {
|
||||||
return *shape_table_[shape_id];
|
return *shape_table_[shape_id];
|
||||||
}
|
}
|
||||||
Shape *MutableShape(int shape_id) {
|
Shape *MutableShape(unsigned shape_id) {
|
||||||
return shape_table_[shape_id];
|
return shape_table_[shape_id];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -301,24 +301,24 @@ public:
|
|||||||
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes);
|
int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes);
|
||||||
|
|
||||||
// Returns true if the shapes are already merged.
|
// Returns true if the shapes are already merged.
|
||||||
bool AlreadyMerged(int shape_id1, int shape_id2) const;
|
bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const;
|
||||||
// Returns true if any shape contains multiple unichars.
|
// Returns true if any shape contains multiple unichars.
|
||||||
bool AnyMultipleUnichars() const;
|
bool AnyMultipleUnichars() const;
|
||||||
// Returns the maximum number of unichars over all shapes.
|
// Returns the maximum number of unichars over all shapes.
|
||||||
int MaxNumUnichars() const;
|
int MaxNumUnichars() const;
|
||||||
// Merges shapes with a common unichar over the [start, end) interval.
|
// Merges shapes with a common unichar over the [start, end) interval.
|
||||||
// Assumes single unichar per shape.
|
// Assumes single unichar per shape.
|
||||||
void ForceFontMerges(int start, int end);
|
void ForceFontMerges(unsigned start, unsigned end);
|
||||||
// Returns the number of unichars in the master shape.
|
// Returns the number of unichars in the master shape.
|
||||||
int MasterUnicharCount(int shape_id) const;
|
unsigned MasterUnicharCount(unsigned shape_id) const;
|
||||||
// Returns the sum of the font counts in the master shape.
|
// Returns the sum of the font counts in the master shape.
|
||||||
int MasterFontCount(int shape_id) const;
|
int MasterFontCount(unsigned shape_id) const;
|
||||||
// Returns the number of unichars that would result from merging the shapes.
|
// Returns the number of unichars that would result from merging the shapes.
|
||||||
int MergedUnicharCount(int shape_id1, int shape_id2) const;
|
int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const;
|
||||||
// Merges two shape_ids, leaving shape_id2 marked as merged.
|
// Merges two shape_ids, leaving shape_id2 marked as merged.
|
||||||
void MergeShapes(int shape_id1, int shape_id2);
|
void MergeShapes(unsigned shape_id1, unsigned shape_id2);
|
||||||
// Swaps two shape_ids.
|
// Swaps two shape_ids.
|
||||||
void SwapShapes(int shape_id1, int shape_id2);
|
void SwapShapes(unsigned shape_id1, unsigned shape_id2);
|
||||||
// Appends the master shapes from other to this.
|
// Appends the master shapes from other to this.
|
||||||
// Used to create a clean ShapeTable from a merged one, or to create a
|
// Used to create a clean ShapeTable from a merged one, or to create a
|
||||||
// copy of a ShapeTable.
|
// copy of a ShapeTable.
|
||||||
@ -330,19 +330,19 @@ public:
|
|||||||
// Returns the destination of this shape, (if merged), taking into account
|
// Returns the destination of this shape, (if merged), taking into account
|
||||||
// the fact that the destination may itself have been merged.
|
// the fact that the destination may itself have been merged.
|
||||||
// For a non-merged shape, returns the input shape_id.
|
// For a non-merged shape, returns the input shape_id.
|
||||||
int MasterDestinationIndex(int shape_id) const;
|
unsigned MasterDestinationIndex(unsigned shape_id) const;
|
||||||
|
|
||||||
// Returns false if the unichars in neither shape is a subset of the other..
|
// Returns false if the unichars in neither shape is a subset of the other..
|
||||||
bool SubsetUnichar(int shape_id1, int shape_id2) const;
|
bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const;
|
||||||
// Returns false if the unichars in neither shape is a subset of the other..
|
// Returns false if the unichars in neither shape is a subset of the other..
|
||||||
bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const;
|
bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const;
|
||||||
// Returns true if the unichar sets are equal between the shapes.
|
// Returns true if the unichar sets are equal between the shapes.
|
||||||
bool EqualUnichars(int shape_id1, int shape_id2) const;
|
bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const;
|
||||||
bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const;
|
bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const;
|
||||||
// Returns true if there is a common unichar between the shapes.
|
// Returns true if there is a common unichar between the shapes.
|
||||||
bool CommonUnichars(int shape_id1, int shape_id2) const;
|
bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const;
|
||||||
// Returns true if there is a common font id between the shapes.
|
// Returns true if there is a common font id between the shapes.
|
||||||
bool CommonFont(int shape_id1, int shape_id2) const;
|
bool CommonFont(unsigned shape_id1, unsigned shape_id2) const;
|
||||||
|
|
||||||
// Adds the unichars of the given shape_id to the vector of results. Any
|
// Adds the unichars of the given shape_id to the vector of results. Any
|
||||||
// unichar_id that is already present just has the fonts added to the
|
// unichar_id that is already present just has the fonts added to the
|
||||||
|
@ -44,9 +44,8 @@ const int case_state_table[6][4] = {
|
|||||||
|
|
||||||
int Dict::case_ok(const WERD_CHOICE &word) const {
|
int Dict::case_ok(const WERD_CHOICE &word) const {
|
||||||
int state = 0;
|
int state = 0;
|
||||||
int x;
|
|
||||||
const UNICHARSET *unicharset = word.unicharset();
|
const UNICHARSET *unicharset = word.unicharset();
|
||||||
for (x = 0; x < word.length(); ++x) {
|
for (unsigned x = 0; x < word.length(); ++x) {
|
||||||
UNICHAR_ID ch_id = word.unichar_id(x);
|
UNICHAR_ID ch_id = word.unichar_id(x);
|
||||||
if (unicharset->get_isupper(ch_id)) {
|
if (unicharset->get_isupper(ch_id)) {
|
||||||
state = case_state_table[state][1];
|
state = case_state_table[state][1];
|
||||||
@ -69,7 +68,7 @@ bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharse
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int num_alphanum = 0;
|
int num_alphanum = 0;
|
||||||
for (int x = 0; x < word.length(); ++x) {
|
for (unsigned x = 0; x < word.length(); ++x) {
|
||||||
num_alphanum +=
|
num_alphanum +=
|
||||||
(unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
|
(unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
|
||||||
}
|
}
|
||||||
|
@ -136,10 +136,7 @@ void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Dawg::match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const {
|
bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, UNICHAR_ID wildcard) const {
|
||||||
EDGE_REF edge;
|
|
||||||
int32_t word_end;
|
|
||||||
|
|
||||||
if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
|
if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
|
||||||
bool any_matched = false;
|
bool any_matched = false;
|
||||||
NodeChildVector vec;
|
NodeChildVector vec;
|
||||||
@ -153,8 +150,8 @@ bool Dawg::match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_
|
|||||||
word->set_unichar_id(wildcard, index);
|
word->set_unichar_id(wildcard, index);
|
||||||
return any_matched;
|
return any_matched;
|
||||||
} else {
|
} else {
|
||||||
word_end = index == word->length() - 1;
|
auto word_end = index == word->length() - 1;
|
||||||
edge = edge_char_of(node, word->unichar_id(index), word_end);
|
auto edge = edge_char_of(node, word->unichar_id(index), word_end);
|
||||||
if (edge != NO_EDGE) { // normal edge in DAWG
|
if (edge != NO_EDGE) { // normal edge in DAWG
|
||||||
node = next_node(edge);
|
node = next_node(edge);
|
||||||
if (word_end) {
|
if (word_end) {
|
||||||
|
@ -277,7 +277,7 @@ protected:
|
|||||||
/// the *'s in this string are interpreted as wildcards.
|
/// the *'s in this string are interpreted as wildcards.
|
||||||
/// WERD_CHOICE param is not passed by const so that wildcard searches
|
/// WERD_CHOICE param is not passed by const so that wildcard searches
|
||||||
/// can modify it and work without having to copy WERD_CHOICEs.
|
/// can modify it and work without having to copy WERD_CHOICEs.
|
||||||
bool match_words(WERD_CHOICE *word, int32_t index, NODE_REF node, UNICHAR_ID wildcard) const;
|
bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, UNICHAR_ID wildcard) const;
|
||||||
|
|
||||||
// Recursively iterate over all words in a dawg (see public iterate_words).
|
// Recursively iterate over all words in a dawg (see public iterate_words).
|
||||||
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore,
|
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore,
|
||||||
|
@ -364,7 +364,7 @@ bool Dict::FinishLoad() {
|
|||||||
successors_.reserve(dawgs_.size());
|
successors_.reserve(dawgs_.size());
|
||||||
for (auto dawg : dawgs_) {
|
for (auto dawg : dawgs_) {
|
||||||
auto *lst = new SuccessorList();
|
auto *lst = new SuccessorList();
|
||||||
for (int j = 0; j < dawgs_.size(); ++j) {
|
for (unsigned j = 0; j < dawgs_.size(); ++j) {
|
||||||
const Dawg *other = dawgs_[j];
|
const Dawg *other = dawgs_[j];
|
||||||
if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
|
if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
|
||||||
kDawgSuccessors[dawg->type()][other->type()]) {
|
kDawgSuccessors[dawg->type()][other->type()]) {
|
||||||
@ -432,7 +432,7 @@ int Dict::def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset,
|
|||||||
// Go over the active_dawgs vector and insert DawgPosition records
|
// Go over the active_dawgs vector and insert DawgPosition records
|
||||||
// with the updated ref (an edge with the corresponding unichar id) into
|
// with the updated ref (an edge with the corresponding unichar id) into
|
||||||
// dawg_args->updated_pos.
|
// dawg_args->updated_pos.
|
||||||
for (int a = 0; a < dawg_args->active_dawgs->size(); ++a) {
|
for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
|
||||||
const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
|
const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
|
||||||
const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
|
const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
|
||||||
const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
|
const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
|
||||||
@ -608,11 +608,10 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA
|
|||||||
// beginning of the word. If hyphenated() returns true, copy the entries
|
// beginning of the word. If hyphenated() returns true, copy the entries
|
||||||
// from hyphen_active_dawgs_ instead.
|
// from hyphen_active_dawgs_ instead.
|
||||||
void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const {
|
void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const {
|
||||||
int i;
|
|
||||||
if (hyphenated()) {
|
if (hyphenated()) {
|
||||||
*active_dawgs = hyphen_active_dawgs_;
|
*active_dawgs = hyphen_active_dawgs_;
|
||||||
if (dawg_debug_level >= 3) {
|
if (dawg_debug_level >= 3) {
|
||||||
for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
|
for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) {
|
||||||
tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
|
tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
|
||||||
hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref);
|
hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref);
|
||||||
}
|
}
|
||||||
@ -626,7 +625,7 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern
|
|||||||
bool punc_dawg_available = (punc_dawg_ != nullptr) &&
|
bool punc_dawg_available = (punc_dawg_ != nullptr) &&
|
||||||
punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
|
punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
|
||||||
|
|
||||||
for (int i = 0; i < dawgs_.size(); i++) {
|
for (unsigned i = 0; i < dawgs_.size(); i++) {
|
||||||
if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
|
if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
|
||||||
int dawg_ty = dawgs_[i]->type();
|
int dawg_ty = dawgs_[i]->type();
|
||||||
bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
|
bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
|
||||||
@ -666,7 +665,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
|
|||||||
if (best_choice.length() >= kDocDictMaxRepChars) {
|
if (best_choice.length() >= kDocDictMaxRepChars) {
|
||||||
int num_rep_chars = 1;
|
int num_rep_chars = 1;
|
||||||
UNICHAR_ID uch_id = best_choice.unichar_id(0);
|
UNICHAR_ID uch_id = best_choice.unichar_id(0);
|
||||||
for (int i = 1; i < best_choice.length(); ++i) {
|
for (unsigned i = 1; i < best_choice.length(); ++i) {
|
||||||
if (best_choice.unichar_id(i) != uch_id) {
|
if (best_choice.unichar_id(i) != uch_id) {
|
||||||
num_rep_chars = 1;
|
num_rep_chars = 1;
|
||||||
uch_id = best_choice.unichar_id(i);
|
uch_id = best_choice.unichar_id(i);
|
||||||
@ -841,7 +840,7 @@ bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) cons
|
|||||||
|
|
||||||
// Extract the core word from the middle of each word with any digits
|
// Extract the core word from the middle of each word with any digits
|
||||||
// replaced with question marks.
|
// replaced with question marks.
|
||||||
int w1start, w1end, w2start, w2end;
|
unsigned w1start, w1end, w2start, w2end;
|
||||||
word1.punct_stripped(&w1start, &w1end);
|
word1.punct_stripped(&w1start, &w1end);
|
||||||
word2.punct_stripped(&w2start, &w2end);
|
word2.punct_stripped(&w2start, &w2end);
|
||||||
|
|
||||||
@ -857,7 +856,7 @@ bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) cons
|
|||||||
const UNICHARSET &uchset = getUnicharset();
|
const UNICHARSET &uchset = getUnicharset();
|
||||||
std::vector<UNICHAR_ID> bigram_string;
|
std::vector<UNICHAR_ID> bigram_string;
|
||||||
bigram_string.reserve(w1end + w2end + 1);
|
bigram_string.reserve(w1end + w2end + 1);
|
||||||
for (int i = w1start; i < w1end; i++) {
|
for (auto i = w1start; i < w1end; i++) {
|
||||||
const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));
|
const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));
|
||||||
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
|
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
|
||||||
bigram_string.push_back(question_unichar_id_);
|
bigram_string.push_back(question_unichar_id_);
|
||||||
@ -866,7 +865,7 @@ bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) cons
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
bigram_string.push_back(UNICHAR_SPACE);
|
bigram_string.push_back(UNICHAR_SPACE);
|
||||||
for (int i = w2start; i < w2end; i++) {
|
for (auto i = w2start; i < w2end; i++) {
|
||||||
const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));
|
const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));
|
||||||
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
|
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
|
||||||
bigram_string.push_back(question_unichar_id_);
|
bigram_string.push_back(question_unichar_id_);
|
||||||
@ -885,11 +884,10 @@ bool Dict::valid_punctuation(const WERD_CHOICE &word) {
|
|||||||
if (word.empty()) {
|
if (word.empty()) {
|
||||||
return NO_PERM;
|
return NO_PERM;
|
||||||
}
|
}
|
||||||
int i;
|
|
||||||
WERD_CHOICE new_word(word.unicharset());
|
WERD_CHOICE new_word(word.unicharset());
|
||||||
int last_index = word.length() - 1;
|
auto last_index = word.length() - 1;
|
||||||
int new_len = 0;
|
int new_len = 0;
|
||||||
for (i = 0; i <= last_index; ++i) {
|
for (unsigned i = 0; i <= last_index; ++i) {
|
||||||
UNICHAR_ID unichar_id = (word.unichar_id(i));
|
UNICHAR_ID unichar_id = (word.unichar_id(i));
|
||||||
if (getUnicharset().get_ispunctuation(unichar_id)) {
|
if (getUnicharset().get_ispunctuation(unichar_id)) {
|
||||||
new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
|
new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
|
||||||
@ -901,7 +899,7 @@ bool Dict::valid_punctuation(const WERD_CHOICE &word) {
|
|||||||
new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
|
new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (i = 0; i < dawgs_.size(); ++i) {
|
for (unsigned i = 0; i < dawgs_.size(); ++i) {
|
||||||
if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
|
if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
|
||||||
dawgs_[i]->word_in_dawg(new_word)) {
|
dawgs_[i]->word_in_dawg(new_word)) {
|
||||||
return true;
|
return true;
|
||||||
|
@ -46,7 +46,7 @@ void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &
|
|||||||
float *limit, WERD_CHOICE *best_choice, int *attempts_left,
|
float *limit, WERD_CHOICE *best_choice, int *attempts_left,
|
||||||
void *void_more_args) {
|
void *void_more_args) {
|
||||||
auto *more_args = static_cast<DawgArgs *>(void_more_args);
|
auto *more_args = static_cast<DawgArgs *>(void_more_args);
|
||||||
word_ending = (char_choice_index == char_choices.size() - 1);
|
word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
|
||||||
int word_index = word->length() - 1;
|
int word_index = word->length() - 1;
|
||||||
if (best_choice->rating() < *limit) {
|
if (best_choice->rating() < *limit) {
|
||||||
return;
|
return;
|
||||||
@ -73,7 +73,7 @@ void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &
|
|||||||
DawgPositionVector unigram_updated_dawgs;
|
DawgPositionVector unigram_updated_dawgs;
|
||||||
DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
|
DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
|
||||||
// Check unigrams in the ngram with letter_is_okay().
|
// Check unigrams in the ngram with letter_is_okay().
|
||||||
for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
|
for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
|
||||||
UNICHAR_ID uch_id = encoding[i];
|
UNICHAR_ID uch_id = encoding[i];
|
||||||
ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
|
ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
|
||||||
++num_unigrams;
|
++num_unigrams;
|
||||||
@ -195,7 +195,7 @@ void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &cha
|
|||||||
debug, char_choice_index, *limit, word->rating(), word->certainty(),
|
debug, char_choice_index, *limit, word->rating(), word->certainty(),
|
||||||
word->debug_string().c_str());
|
word->debug_string().c_str());
|
||||||
}
|
}
|
||||||
if (char_choice_index < char_choices.size()) {
|
if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
|
||||||
BLOB_CHOICE_IT blob_choice_it;
|
BLOB_CHOICE_IT blob_choice_it;
|
||||||
blob_choice_it.set_to_list(char_choices.at(char_choice_index));
|
blob_choice_it.set_to_list(char_choices.at(char_choice_index));
|
||||||
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
|
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
|
||||||
@ -226,7 +226,7 @@ void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char
|
|||||||
const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
|
const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
|
||||||
float certainties[], float *limit, WERD_CHOICE *best_choice,
|
float certainties[], float *limit, WERD_CHOICE *best_choice,
|
||||||
int *attempts_left, void *more_args) {
|
int *attempts_left, void *more_args) {
|
||||||
int word_ending = (char_choice_index == char_choices.size() - 1);
|
auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
|
||||||
|
|
||||||
// Deal with fragments.
|
// Deal with fragments.
|
||||||
CHAR_FRAGMENT_INFO char_frag_info;
|
CHAR_FRAGMENT_INFO char_frag_info;
|
||||||
|
@ -164,7 +164,6 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
|
|||||||
// Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
|
// Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
|
||||||
// for each unichar id in BestChoice.
|
// for each unichar id in BestChoice.
|
||||||
BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
|
BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
|
||||||
int i;
|
|
||||||
bool ambigs_found = false;
|
bool ambigs_found = false;
|
||||||
// For each position in best_choice:
|
// For each position in best_choice:
|
||||||
// -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
|
// -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
|
||||||
@ -190,7 +189,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
|
|||||||
// unichar id for the corresponding position in best_choice.
|
// unichar id for the corresponding position in best_choice.
|
||||||
// best_choice consisting from only the original letters will
|
// best_choice consisting from only the original letters will
|
||||||
// have a rating of 0.0.
|
// have a rating of 0.0.
|
||||||
for (i = 0; i < best_choice->length(); ++i) {
|
for (unsigned i = 0; i < best_choice->length(); ++i) {
|
||||||
auto *lst = new BLOB_CHOICE_LIST();
|
auto *lst = new BLOB_CHOICE_LIST();
|
||||||
BLOB_CHOICE_IT lst_it(lst);
|
BLOB_CHOICE_IT lst_it(lst);
|
||||||
// TODO(rays/antonova) Put real xheights and y shifts here.
|
// TODO(rays/antonova) Put real xheights and y shifts here.
|
||||||
@ -201,10 +200,9 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
|
|||||||
}
|
}
|
||||||
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
|
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
|
||||||
int wrong_ngram_index;
|
int wrong_ngram_index;
|
||||||
int next_index;
|
|
||||||
int blob_index = 0;
|
int blob_index = 0;
|
||||||
for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {
|
for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {
|
||||||
UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
|
auto curr_unichar_id = best_choice->unichar_id(i);
|
||||||
if (stopper_debug_level > 2) {
|
if (stopper_debug_level > 2) {
|
||||||
tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous",
|
tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous",
|
||||||
getUnicharset().debug_str(curr_unichar_id).c_str());
|
getUnicharset().debug_str(curr_unichar_id).c_str());
|
||||||
@ -212,7 +210,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
|
|||||||
int num_wrong_blobs = best_choice->state(i);
|
int num_wrong_blobs = best_choice->state(i);
|
||||||
wrong_ngram_index = 0;
|
wrong_ngram_index = 0;
|
||||||
wrong_ngram[wrong_ngram_index] = curr_unichar_id;
|
wrong_ngram[wrong_ngram_index] = curr_unichar_id;
|
||||||
if (curr_unichar_id == INVALID_UNICHAR_ID || curr_unichar_id >= table.size() ||
|
if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() ||
|
||||||
table[curr_unichar_id] == nullptr) {
|
table[curr_unichar_id] == nullptr) {
|
||||||
continue; // there is no ambig spec for this unichar id
|
continue; // there is no ambig spec for this unichar id
|
||||||
}
|
}
|
||||||
@ -272,6 +270,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
|
|||||||
}
|
}
|
||||||
spec_it.forward();
|
spec_it.forward();
|
||||||
} else if (compare == -1) {
|
} else if (compare == -1) {
|
||||||
|
unsigned next_index;
|
||||||
if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
|
if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
|
||||||
((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) {
|
((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) {
|
||||||
// Add the next unichar id to wrong_ngram and keep looking for
|
// Add the next unichar id to wrong_ngram and keep looking for
|
||||||
@ -293,7 +292,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
|
|||||||
if (ambigs_found) {
|
if (ambigs_found) {
|
||||||
if (stopper_debug_level > 2) {
|
if (stopper_debug_level > 2) {
|
||||||
tprintf("\nResulting ambig_blob_choices:\n");
|
tprintf("\nResulting ambig_blob_choices:\n");
|
||||||
for (i = 0; i < ambig_blob_choices.size(); ++i) {
|
for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) {
|
||||||
print_ratings_list("", ambig_blob_choices.at(i), getUnicharset());
|
print_ratings_list("", ambig_blob_choices.at(i), getUnicharset());
|
||||||
tprintf("\n");
|
tprintf("\n");
|
||||||
}
|
}
|
||||||
@ -310,7 +309,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_r
|
|||||||
// the capability to produce classifications combined from character
|
// the capability to produce classifications combined from character
|
||||||
// fragments is added to other functions.
|
// fragments is added to other functions.
|
||||||
int orig_i = 0;
|
int orig_i = 0;
|
||||||
for (i = 0; i < alt_word->length(); ++i) {
|
for (unsigned i = 0; i < alt_word->length(); ++i) {
|
||||||
const UNICHARSET &uchset = getUnicharset();
|
const UNICHARSET &uchset = getUnicharset();
|
||||||
bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i));
|
bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i));
|
||||||
UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
|
UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
|
||||||
@ -444,7 +443,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
|
|||||||
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
|
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
|
||||||
int shortest = INT32_MAX;
|
int shortest = INT32_MAX;
|
||||||
int curr_len = 0;
|
int curr_len = 0;
|
||||||
for (int w = 0; w < WordChoice.length(); ++w) {
|
for (unsigned w = 0; w < WordChoice.length(); ++w) {
|
||||||
if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
|
if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
|
||||||
curr_len++;
|
curr_len++;
|
||||||
} else if (curr_len > 0) {
|
} else if (curr_len > 0) {
|
||||||
|
@ -71,7 +71,7 @@ bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bo
|
|||||||
if (node_ref == NO_EDGE) {
|
if (node_ref == NO_EDGE) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
assert(node_ref < nodes_.size());
|
assert(static_cast<size_t>(node_ref) < nodes_.size());
|
||||||
EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges
|
EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges
|
||||||
: nodes_[node_ref]->backward_edges;
|
: nodes_[node_ref]->backward_edges;
|
||||||
int vec_size = vec.size();
|
int vec_size = vec.size();
|
||||||
@ -111,7 +111,7 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in
|
|||||||
bool word_end, UNICHAR_ID unichar_id) {
|
bool word_end, UNICHAR_ID unichar_id) {
|
||||||
EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges)
|
EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges)
|
||||||
: &(nodes_[node1]->backward_edges);
|
: &(nodes_[node1]->backward_edges);
|
||||||
int search_index;
|
unsigned search_index;
|
||||||
if (node1 == 0 && direction == FORWARD_EDGE) {
|
if (node1 == 0 && direction == FORWARD_EDGE) {
|
||||||
search_index = 0; // find the index to make the add sorted
|
search_index = 0; // find the index to make the add sorted
|
||||||
while (search_index < vec->size() &&
|
while (search_index < vec->size() &&
|
||||||
@ -164,7 +164,7 @@ bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *re
|
|||||||
ASSERT_HOST(repetitions->size() == word.length());
|
ASSERT_HOST(repetitions->size() == word.length());
|
||||||
}
|
}
|
||||||
// Make sure the word does not contain invalid unchar ids.
|
// Make sure the word does not contain invalid unchar ids.
|
||||||
for (int i = 0; i < word.length(); ++i) {
|
for (unsigned i = 0; i < word.length(); ++i) {
|
||||||
if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {
|
if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -175,7 +175,6 @@ bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *re
|
|||||||
NODE_REF the_next_node;
|
NODE_REF the_next_node;
|
||||||
bool marker_flag = false;
|
bool marker_flag = false;
|
||||||
EDGE_INDEX edge_index;
|
EDGE_INDEX edge_index;
|
||||||
int i;
|
|
||||||
int32_t still_finding_chars = true;
|
int32_t still_finding_chars = true;
|
||||||
int32_t word_end = false;
|
int32_t word_end = false;
|
||||||
bool add_failed = false;
|
bool add_failed = false;
|
||||||
@ -186,6 +185,7 @@ bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *re
|
|||||||
}
|
}
|
||||||
|
|
||||||
UNICHAR_ID unichar_id;
|
UNICHAR_ID unichar_id;
|
||||||
|
unsigned i;
|
||||||
for (i = 0; i < word.length() - 1; ++i) {
|
for (i = 0; i < word.length() - 1; ++i) {
|
||||||
unichar_id = word.unichar_id(i);
|
unichar_id = word.unichar_id(i);
|
||||||
marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
|
marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
|
||||||
@ -417,6 +417,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset)
|
|||||||
if (*str_ptr == '\\') { // regular '\' unichar that was escaped
|
if (*str_ptr == '\\') { // regular '\' unichar that was escaped
|
||||||
curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
|
curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
|
||||||
} else {
|
} else {
|
||||||
|
#if 0 // TODO: This code should be enabled if kSaneNumConcreteChars != 0.
|
||||||
if (word.length() < kSaneNumConcreteChars) {
|
if (word.length() < kSaneNumConcreteChars) {
|
||||||
tprintf(
|
tprintf(
|
||||||
"Please provide at least %d concrete characters at the"
|
"Please provide at least %d concrete characters at the"
|
||||||
@ -425,6 +426,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset)
|
|||||||
failed = true;
|
failed = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
// Parse character class from expression.
|
// Parse character class from expression.
|
||||||
curr_unichar_id = character_class_to_pattern(*str_ptr);
|
curr_unichar_id = character_class_to_pattern(*str_ptr);
|
||||||
}
|
}
|
||||||
@ -508,21 +510,16 @@ SquishedDawg *Trie::trie_to_dawg() {
|
|||||||
if (debug_level_ > 2) {
|
if (debug_level_ > 2) {
|
||||||
print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
|
print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
|
||||||
}
|
}
|
||||||
auto reduced_nodes = new bool[nodes_.size()];
|
std::vector<bool> reduced_nodes(nodes_.size());
|
||||||
for (int i = 0; i < nodes_.size(); i++) {
|
|
||||||
reduced_nodes[i] = false;
|
|
||||||
}
|
|
||||||
this->reduce_node_input(0, reduced_nodes);
|
this->reduce_node_input(0, reduced_nodes);
|
||||||
delete[] reduced_nodes;
|
|
||||||
|
|
||||||
if (debug_level_ > 2) {
|
if (debug_level_ > 2) {
|
||||||
print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
|
print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
|
||||||
}
|
}
|
||||||
// Build a translation map from node indices in nodes_ vector to
|
// Build a translation map from node indices in nodes_ vector to
|
||||||
// their target indices in EDGE_ARRAY.
|
// their target indices in EDGE_ARRAY.
|
||||||
auto *node_ref_map = new NODE_REF[nodes_.size() + 1];
|
std::vector<NODE_REF> node_ref_map(nodes_.size() + 1);
|
||||||
int i, j;
|
unsigned i;
|
||||||
node_ref_map[0] = 0;
|
|
||||||
for (i = 0; i < nodes_.size(); ++i) {
|
for (i = 0; i < nodes_.size(); ++i) {
|
||||||
node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
|
node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
|
||||||
}
|
}
|
||||||
@ -535,10 +532,10 @@ SquishedDawg *Trie::trie_to_dawg() {
|
|||||||
for (i = 0; i < nodes_.size(); ++i) {
|
for (i = 0; i < nodes_.size(); ++i) {
|
||||||
TRIE_NODE_RECORD *node_ptr = nodes_[i];
|
TRIE_NODE_RECORD *node_ptr = nodes_[i];
|
||||||
int end = node_ptr->forward_edges.size();
|
int end = node_ptr->forward_edges.size();
|
||||||
for (j = 0; j < end; ++j) {
|
for (int j = 0; j < end; ++j) {
|
||||||
EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
|
EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
|
||||||
NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
|
NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
|
||||||
ASSERT_HOST(node_ref < nodes_.size());
|
ASSERT_HOST(static_cast<size_t>(node_ref) < nodes_.size());
|
||||||
UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
|
UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
|
||||||
link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
|
link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
|
||||||
end_of_word_from_edge_rec(edge_rec), unichar_id);
|
end_of_word_from_edge_rec(edge_rec), unichar_id);
|
||||||
@ -548,7 +545,6 @@ SquishedDawg *Trie::trie_to_dawg() {
|
|||||||
++edge_array_ptr;
|
++edge_array_ptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
delete[] node_ref_map;
|
|
||||||
|
|
||||||
return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,
|
return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,
|
||||||
debug_level_);
|
debug_level_);
|
||||||
@ -571,10 +567,9 @@ bool Trie::eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
|
|||||||
// Translate all edges going to/from next_node2 to go to/from next_node1.
|
// Translate all edges going to/from next_node2 to go to/from next_node1.
|
||||||
EDGE_RECORD *edge_ptr = nullptr;
|
EDGE_RECORD *edge_ptr = nullptr;
|
||||||
EDGE_INDEX edge_index;
|
EDGE_INDEX edge_index;
|
||||||
int i;
|
|
||||||
// The backward link in node to next_node2 will be zeroed out by the caller.
|
// The backward link in node to next_node2 will be zeroed out by the caller.
|
||||||
// Copy all the backward links in next_node2 to node next_node1
|
// Copy all the backward links in next_node2 to node next_node1
|
||||||
for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
|
for (unsigned i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
|
||||||
const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
|
const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
|
||||||
NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
|
NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
|
||||||
UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
|
UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
|
||||||
@ -599,13 +594,13 @@ bool Trie::eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
|
bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
|
||||||
EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes) {
|
EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes) {
|
||||||
if (debug_level_ > 1) {
|
if (debug_level_ > 1) {
|
||||||
tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
|
tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
|
||||||
}
|
}
|
||||||
// Compare each of the edge pairs with the given unichar_id.
|
// Compare each of the edge pairs with the given unichar_id.
|
||||||
bool did_something = false;
|
bool did_something = false;
|
||||||
for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
|
for (unsigned i = edge_index; i < backward_edges->size() - 1; ++i) {
|
||||||
// Find the first edge that can be eliminated.
|
// Find the first edge that can be eliminated.
|
||||||
UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
|
UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
|
||||||
while (i < backward_edges->size()) {
|
while (i < backward_edges->size()) {
|
||||||
@ -625,7 +620,7 @@ bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, N
|
|||||||
}
|
}
|
||||||
const EDGE_RECORD &edge_rec = (*backward_edges)[i];
|
const EDGE_RECORD &edge_rec = (*backward_edges)[i];
|
||||||
// Compare it to the rest of the edges with the given unichar_id.
|
// Compare it to the rest of the edges with the given unichar_id.
|
||||||
for (int j = i + 1; j < backward_edges->size(); ++j) {
|
for (auto j = i + 1; j < backward_edges->size(); ++j) {
|
||||||
const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
|
const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
|
||||||
if (DeadEdge(next_edge_rec)) {
|
if (DeadEdge(next_edge_rec)) {
|
||||||
continue;
|
continue;
|
||||||
@ -662,7 +657,7 @@ void Trie::sort_edges(EDGE_VECTOR *edges) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Trie::reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes) {
|
void Trie::reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes) {
|
||||||
EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
|
EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
|
||||||
sort_edges(&backward_edges);
|
sort_edges(&backward_edges);
|
||||||
if (debug_level_ > 1) {
|
if (debug_level_ > 1) {
|
||||||
@ -671,7 +666,7 @@ void Trie::reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EDGE_INDEX edge_index = 0;
|
EDGE_INDEX edge_index = 0;
|
||||||
while (edge_index < backward_edges.size()) {
|
while (static_cast<size_t>(edge_index) < backward_edges.size()) {
|
||||||
if (DeadEdge(backward_edges[edge_index])) {
|
if (DeadEdge(backward_edges[edge_index])) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -679,7 +674,7 @@ void Trie::reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes) {
|
|||||||
while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) {
|
while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) {
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
while (++edge_index < backward_edges.size()) {
|
while (static_cast<size_t>(++edge_index) < backward_edges.size()) {
|
||||||
UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
|
UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
|
||||||
if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) {
|
if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) {
|
||||||
break;
|
break;
|
||||||
|
@ -36,7 +36,6 @@ class UNICHARSET;
|
|||||||
// typedefs to int and restrict the casts to extracting these values from
|
// typedefs to int and restrict the casts to extracting these values from
|
||||||
// the 64 bit EDGE_RECORD.
|
// the 64 bit EDGE_RECORD.
|
||||||
using EDGE_INDEX = int64_t; // index of an edge in a given node
|
using EDGE_INDEX = int64_t; // index of an edge in a given node
|
||||||
using NODE_MARKER = bool *;
|
|
||||||
using EDGE_VECTOR = std::vector<EDGE_RECORD>;
|
using EDGE_VECTOR = std::vector<EDGE_RECORD>;
|
||||||
|
|
||||||
struct TRIE_NODE_RECORD {
|
struct TRIE_NODE_RECORD {
|
||||||
@ -383,7 +382,7 @@ protected:
|
|||||||
// caller when all edges with this letter have been reduced.
|
// caller when all edges with this letter have been reduced.
|
||||||
// Returns true if further reduction is possible with this same letter.
|
// Returns true if further reduction is possible with this same letter.
|
||||||
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
|
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
|
||||||
EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes);
|
EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in
|
* Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in
|
||||||
@ -394,7 +393,7 @@ protected:
|
|||||||
void sort_edges(EDGE_VECTOR *edges);
|
void sort_edges(EDGE_VECTOR *edges);
|
||||||
|
|
||||||
/** Eliminates any redundant edges from this node in the Trie. */
|
/** Eliminates any redundant edges from this node in the Trie. */
|
||||||
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
|
void reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes);
|
||||||
|
|
||||||
// Returns the pattern unichar id for the given character class code.
|
// Returns the pattern unichar id for the given character class code.
|
||||||
UNICHAR_ID character_class_to_pattern(char ch);
|
UNICHAR_ID character_class_to_pattern(char ch);
|
||||||
|
@ -269,7 +269,7 @@ void LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, boo
|
|||||||
}
|
}
|
||||||
search_->segmentTimestepsByCharacters();
|
search_->segmentTimestepsByCharacters();
|
||||||
unsigned char_it = 0;
|
unsigned char_it = 0;
|
||||||
for (int i = 0; i < words->size(); ++i) {
|
for (size_t i = 0; i < words->size(); ++i) {
|
||||||
for (int j = 0; j < words->at(i)->end; ++j) {
|
for (int j = 0; j < words->at(i)->end; ++j) {
|
||||||
if (char_it < search_->ctc_choices.size()) {
|
if (char_it < search_->ctc_choices.size()) {
|
||||||
words->at(i)->CTC_symbol_choices.push_back(search_->ctc_choices[char_it]);
|
words->at(i)->CTC_symbol_choices.push_back(search_->ctc_choices[char_it]);
|
||||||
|
@ -120,14 +120,14 @@ bool Parallel::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch
|
|||||||
#endif
|
#endif
|
||||||
debug = false;
|
debug = false;
|
||||||
}
|
}
|
||||||
int stack_size = stack_.size();
|
auto stack_size = stack_.size();
|
||||||
if (type_ == NT_PAR_2D_LSTM) {
|
if (type_ == NT_PAR_2D_LSTM) {
|
||||||
// Special case, run parallel in parallel.
|
// Special case, run parallel in parallel.
|
||||||
std::vector<NetworkScratch::IO> in_deltas(stack_size);
|
std::vector<NetworkScratch::IO> in_deltas(stack_size);
|
||||||
std::vector<NetworkScratch::IO> out_deltas(stack_size);
|
std::vector<NetworkScratch::IO> out_deltas(stack_size);
|
||||||
// Split the forward deltas for each stack element.
|
// Split the forward deltas for each stack element.
|
||||||
int feature_offset = 0;
|
int feature_offset = 0;
|
||||||
for (int i = 0; i < stack_.size(); ++i) {
|
for (unsigned i = 0; i < stack_.size(); ++i) {
|
||||||
int num_features = stack_[i]->NumOutputs();
|
int num_features = stack_[i]->NumOutputs();
|
||||||
in_deltas[i].Resize(fwd_deltas, num_features, scratch);
|
in_deltas[i].Resize(fwd_deltas, num_features, scratch);
|
||||||
out_deltas[i].Resize(fwd_deltas, stack_[i]->NumInputs(), scratch);
|
out_deltas[i].Resize(fwd_deltas, stack_[i]->NumInputs(), scratch);
|
||||||
@ -137,11 +137,11 @@ bool Parallel::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch
|
|||||||
#ifdef _OPENMP
|
#ifdef _OPENMP
|
||||||
# pragma omp parallel for num_threads(stack_size)
|
# pragma omp parallel for num_threads(stack_size)
|
||||||
#endif
|
#endif
|
||||||
for (int i = 0; i < stack_size; ++i) {
|
for (unsigned i = 0; i < stack_size; ++i) {
|
||||||
stack_[i]->Backward(debug, *in_deltas[i], scratch, i == 0 ? back_deltas : out_deltas[i]);
|
stack_[i]->Backward(debug, *in_deltas[i], scratch, i == 0 ? back_deltas : out_deltas[i]);
|
||||||
}
|
}
|
||||||
if (needs_to_backprop_) {
|
if (needs_to_backprop_) {
|
||||||
for (int i = 1; i < stack_size; ++i) {
|
for (unsigned i = 1; i < stack_size; ++i) {
|
||||||
back_deltas->AddAllToFloat(*out_deltas[i]);
|
back_deltas->AddAllToFloat(*out_deltas[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -152,7 +152,7 @@ bool Parallel::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch
|
|||||||
// back_deltas.
|
// back_deltas.
|
||||||
NetworkScratch::IO out_deltas;
|
NetworkScratch::IO out_deltas;
|
||||||
int feature_offset = 0;
|
int feature_offset = 0;
|
||||||
for (int i = 0; i < stack_.size(); ++i) {
|
for (unsigned i = 0; i < stack_.size(); ++i) {
|
||||||
int num_features = stack_[i]->NumOutputs();
|
int num_features = stack_[i]->NumOutputs();
|
||||||
in_deltas->CopyUnpacking(fwd_deltas, feature_offset, num_features);
|
in_deltas->CopyUnpacking(fwd_deltas, feature_offset, num_features);
|
||||||
feature_offset += num_features;
|
feature_offset += num_features;
|
||||||
|
@ -142,7 +142,7 @@ void Plumbing::DebugWeights() {
|
|||||||
|
|
||||||
// Returns a set of strings representing the layer-ids of all layers below.
|
// Returns a set of strings representing the layer-ids of all layers below.
|
||||||
void Plumbing::EnumerateLayers(const std::string *prefix, std::vector<std::string> &layers) const {
|
void Plumbing::EnumerateLayers(const std::string *prefix, std::vector<std::string> &layers) const {
|
||||||
for (int i = 0; i < stack_.size(); ++i) {
|
for (size_t i = 0; i < stack_.size(); ++i) {
|
||||||
std::string layer_name;
|
std::string layer_name;
|
||||||
if (prefix) {
|
if (prefix) {
|
||||||
layer_name = *prefix;
|
layer_name = *prefix;
|
||||||
@ -161,7 +161,7 @@ void Plumbing::EnumerateLayers(const std::string *prefix, std::vector<std::strin
|
|||||||
Network *Plumbing::GetLayer(const char *id) const {
|
Network *Plumbing::GetLayer(const char *id) const {
|
||||||
char *next_id;
|
char *next_id;
|
||||||
int index = strtol(id, &next_id, 10);
|
int index = strtol(id, &next_id, 10);
|
||||||
if (index < 0 || index >= stack_.size()) {
|
if (index < 0 || static_cast<unsigned>(index) >= stack_.size()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
if (stack_[index]->IsPlumbingType()) {
|
if (stack_[index]->IsPlumbingType()) {
|
||||||
@ -176,7 +176,7 @@ Network *Plumbing::GetLayer(const char *id) const {
|
|||||||
float *Plumbing::LayerLearningRatePtr(const char *id) {
|
float *Plumbing::LayerLearningRatePtr(const char *id) {
|
||||||
char *next_id;
|
char *next_id;
|
||||||
int index = strtol(id, &next_id, 10);
|
int index = strtol(id, &next_id, 10);
|
||||||
if (index < 0 || index >= stack_.size()) {
|
if (index < 0 || static_cast<unsigned>(index) >= stack_.size()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
if (stack_[index]->IsPlumbingType()) {
|
if (stack_[index]->IsPlumbingType()) {
|
||||||
@ -184,7 +184,7 @@ float *Plumbing::LayerLearningRatePtr(const char *id) {
|
|||||||
ASSERT_HOST(*next_id == ':');
|
ASSERT_HOST(*next_id == ':');
|
||||||
return plumbing->LayerLearningRatePtr(next_id + 1);
|
return plumbing->LayerLearningRatePtr(next_id + 1);
|
||||||
}
|
}
|
||||||
if (index >= learning_rates_.size()) {
|
if (static_cast<unsigned>(index) >= learning_rates_.size()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
return &learning_rates_[index];
|
return &learning_rates_[index];
|
||||||
@ -238,7 +238,7 @@ bool Plumbing::DeSerialize(TFile *fp) {
|
|||||||
// Updates the weights using the given learning rate, momentum and adam_beta.
|
// Updates the weights using the given learning rate, momentum and adam_beta.
|
||||||
// num_samples is used in the adam computation iff use_adam_ is true.
|
// num_samples is used in the adam computation iff use_adam_ is true.
|
||||||
void Plumbing::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {
|
void Plumbing::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {
|
||||||
for (int i = 0; i < stack_.size(); ++i) {
|
for (size_t i = 0; i < stack_.size(); ++i) {
|
||||||
if (network_flags_ & NF_LAYER_SPECIFIC_LR) {
|
if (network_flags_ & NF_LAYER_SPECIFIC_LR) {
|
||||||
if (i < learning_rates_.size()) {
|
if (i < learning_rates_.size()) {
|
||||||
learning_rate = learning_rates_[i];
|
learning_rate = learning_rates_[i];
|
||||||
@ -259,7 +259,7 @@ void Plumbing::CountAlternators(const Network &other, TFloat *same, TFloat *chan
|
|||||||
ASSERT_HOST(other.type() == type_);
|
ASSERT_HOST(other.type() == type_);
|
||||||
const auto *plumbing = static_cast<const Plumbing *>(&other);
|
const auto *plumbing = static_cast<const Plumbing *>(&other);
|
||||||
ASSERT_HOST(plumbing->stack_.size() == stack_.size());
|
ASSERT_HOST(plumbing->stack_.size() == stack_.size());
|
||||||
for (int i = 0; i < stack_.size(); ++i) {
|
for (size_t i = 0; i < stack_.size(); ++i) {
|
||||||
stack_[i]->CountAlternators(*plumbing->stack_[i], same, changed);
|
stack_[i]->CountAlternators(*plumbing->stack_[i], same, changed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -122,7 +122,7 @@ void RecodeBeamSearch::DecodeSecondaryBeams(const NetworkIO &output, double dict
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int width = output.Width();
|
int width = output.Width();
|
||||||
int bucketNumber = 0;
|
unsigned bucketNumber = 0;
|
||||||
for (int t = 0; t < width; ++t) {
|
for (int t = 0; t < width; ++t) {
|
||||||
while ((bucketNumber + 1) < character_boundaries_.size() &&
|
while ((bucketNumber + 1) < character_boundaries_.size() &&
|
||||||
t >= character_boundaries_[bucketNumber + 1]) {
|
t >= character_boundaries_[bucketNumber + 1]) {
|
||||||
@ -160,7 +160,7 @@ void RecodeBeamSearch::SaveMostCertainChoices(const float *outputs, int num_outp
|
|||||||
}
|
}
|
||||||
|
|
||||||
void RecodeBeamSearch::segmentTimestepsByCharacters() {
|
void RecodeBeamSearch::segmentTimestepsByCharacters() {
|
||||||
for (int i = 1; i < character_boundaries_.size(); ++i) {
|
for (unsigned i = 1; i < character_boundaries_.size(); ++i) {
|
||||||
std::vector<std::vector<std::pair<const char *, float>>> segment;
|
std::vector<std::vector<std::pair<const char *, float>>> segment;
|
||||||
for (int j = character_boundaries_[i - 1]; j < character_boundaries_[i]; ++j) {
|
for (int j = character_boundaries_[i - 1]; j < character_boundaries_[i]; ++j) {
|
||||||
segment.push_back(timesteps[j]);
|
segment.push_back(timesteps[j]);
|
||||||
@ -183,7 +183,7 @@ RecodeBeamSearch::combineSegmentedTimesteps(
|
|||||||
void RecodeBeamSearch::calculateCharBoundaries(std::vector<int> *starts, std::vector<int> *ends,
|
void RecodeBeamSearch::calculateCharBoundaries(std::vector<int> *starts, std::vector<int> *ends,
|
||||||
std::vector<int> *char_bounds_, int maxWidth) {
|
std::vector<int> *char_bounds_, int maxWidth) {
|
||||||
char_bounds_->push_back(0);
|
char_bounds_->push_back(0);
|
||||||
for (int i = 0; i < ends->size(); ++i) {
|
for (unsigned i = 0; i < ends->size(); ++i) {
|
||||||
int middle = ((*starts)[i + 1] - (*ends)[i]) / 2;
|
int middle = ((*starts)[i + 1] - (*ends)[i]) / 2;
|
||||||
char_bounds_->push_back((*ends)[i] + middle);
|
char_bounds_->push_back((*ends)[i] + middle);
|
||||||
}
|
}
|
||||||
@ -339,7 +339,7 @@ void RecodeBeamSearch::PrintBeam2(bool uids, int num_outputs, const UNICHARSET *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
int ct = 0;
|
int ct = 0;
|
||||||
int cb = 1;
|
unsigned cb = 1;
|
||||||
for (std::vector<const RecodeNode *> layer : topology) {
|
for (std::vector<const RecodeNode *> layer : topology) {
|
||||||
if (cb >= character_boundaries_.size()) {
|
if (cb >= character_boundaries_.size()) {
|
||||||
break;
|
break;
|
||||||
@ -399,7 +399,7 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
|
|||||||
// new beam is calculated based on the results from the original beam.
|
// new beam is calculated based on the results from the original beam.
|
||||||
std::vector<RecodeBeam *> &currentBeam = secondary_beam_.empty() ? beam_ : secondary_beam_;
|
std::vector<RecodeBeam *> &currentBeam = secondary_beam_.empty() ? beam_ : secondary_beam_;
|
||||||
character_boundaries_[0] = 0;
|
character_boundaries_[0] = 0;
|
||||||
for (int j = 1; j < character_boundaries_.size(); ++j) {
|
for (unsigned j = 1; j < character_boundaries_.size(); ++j) {
|
||||||
std::vector<int> unichar_ids;
|
std::vector<int> unichar_ids;
|
||||||
std::vector<float> certs;
|
std::vector<float> certs;
|
||||||
std::vector<float> ratings;
|
std::vector<float> ratings;
|
||||||
@ -434,18 +434,19 @@ void RecodeBeamSearch::extractSymbolChoices(const UNICHARSET *unicharset) {
|
|||||||
}
|
}
|
||||||
if (!unichar_ids.empty()) {
|
if (!unichar_ids.empty()) {
|
||||||
int bestPos = 0;
|
int bestPos = 0;
|
||||||
for (int i = 1; i < unichar_ids.size(); ++i) {
|
for (unsigned i = 1; i < unichar_ids.size(); ++i) {
|
||||||
if (ratings[i] < ratings[bestPos]) {
|
if (ratings[i] < ratings[bestPos]) {
|
||||||
bestPos = i;
|
bestPos = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// TODO: bestCode is currently unused (see commit 2dd5d0d60).
|
#if 0 // TODO: bestCode is currently unused (see commit 2dd5d0d60).
|
||||||
int bestCode = -10;
|
int bestCode = -10;
|
||||||
for (auto &node : best_nodes) {
|
for (auto &node : best_nodes) {
|
||||||
if (node->unichar_id == unichar_ids[bestPos]) {
|
if (node->unichar_id == unichar_ids[bestPos]) {
|
||||||
bestCode = node->code;
|
bestCode = node->code;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
// Exclude the best choice for the followup decoding.
|
// Exclude the best choice for the followup decoding.
|
||||||
std::unordered_set<int> excludeCodeList;
|
std::unordered_set<int> excludeCodeList;
|
||||||
for (auto &best_node : best_nodes) {
|
for (auto &best_node : best_nodes) {
|
||||||
@ -619,7 +620,7 @@ WERD_RES *RecodeBeamSearch::InitializeWord(bool leading_space, const TBOX &line_
|
|||||||
C_BLOB_LIST blobs;
|
C_BLOB_LIST blobs;
|
||||||
C_BLOB_IT b_it(&blobs);
|
C_BLOB_IT b_it(&blobs);
|
||||||
for (int i = word_start; i < word_end; ++i) {
|
for (int i = word_start; i < word_end; ++i) {
|
||||||
if (character_boundaries_.size() > (i + 1)) {
|
if (static_cast<unsigned>(i + 1) < character_boundaries_.size()) {
|
||||||
TBOX box(static_cast<int16_t>(std::floor(character_boundaries_[i] * scale_factor)) +
|
TBOX box(static_cast<int16_t>(std::floor(character_boundaries_[i] * scale_factor)) +
|
||||||
line_box.left(),
|
line_box.left(),
|
||||||
line_box.bottom(),
|
line_box.bottom(),
|
||||||
@ -714,7 +715,7 @@ void RecodeBeamSearch::ComputeSecTopN(std::unordered_set<int> *exList, const flo
|
|||||||
void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio,
|
void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio,
|
||||||
double cert_offset, double worst_dict_cert,
|
double cert_offset, double worst_dict_cert,
|
||||||
const UNICHARSET *charset, bool debug) {
|
const UNICHARSET *charset, bool debug) {
|
||||||
if (t == beam_.size()) {
|
if (t == static_cast<int>(beam_.size())) {
|
||||||
beam_.push_back(new RecodeBeam);
|
beam_.push_back(new RecodeBeam);
|
||||||
}
|
}
|
||||||
RecodeBeam *step = beam_[t];
|
RecodeBeam *step = beam_[t];
|
||||||
@ -783,7 +784,7 @@ void RecodeBeamSearch::DecodeStep(const float *outputs, int t, double dict_ratio
|
|||||||
void RecodeBeamSearch::DecodeSecondaryStep(const float *outputs, int t, double dict_ratio,
|
void RecodeBeamSearch::DecodeSecondaryStep(const float *outputs, int t, double dict_ratio,
|
||||||
double cert_offset, double worst_dict_cert,
|
double cert_offset, double worst_dict_cert,
|
||||||
const UNICHARSET *charset, bool debug) {
|
const UNICHARSET *charset, bool debug) {
|
||||||
if (t == secondary_beam_.size()) {
|
if (t == static_cast<int>(secondary_beam_.size())) {
|
||||||
secondary_beam_.push_back(new RecodeBeam);
|
secondary_beam_.push_back(new RecodeBeam);
|
||||||
}
|
}
|
||||||
RecodeBeam *step = secondary_beam_[t];
|
RecodeBeam *step = secondary_beam_[t];
|
||||||
@ -1280,9 +1281,9 @@ void RecodeBeamSearch::ExtractPath(const RecodeNode *node, std::vector<const Rec
|
|||||||
// Helper prints debug information on the given lattice path.
|
// Helper prints debug information on the given lattice path.
|
||||||
void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset,
|
void RecodeBeamSearch::DebugPath(const UNICHARSET *unicharset,
|
||||||
const std::vector<const RecodeNode *> &path) const {
|
const std::vector<const RecodeNode *> &path) const {
|
||||||
for (int c = 0; c < path.size(); ++c) {
|
for (unsigned c = 0; c < path.size(); ++c) {
|
||||||
const RecodeNode &node = *path[c];
|
const RecodeNode &node = *path[c];
|
||||||
tprintf("%d ", c);
|
tprintf("%u ", c);
|
||||||
node.Print(null_char_, *unicharset, 1);
|
node.Print(null_char_, *unicharset, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1294,9 +1295,9 @@ void RecodeBeamSearch::DebugUnicharPath(const UNICHARSET *unicharset,
|
|||||||
const std::vector<float> &certs,
|
const std::vector<float> &certs,
|
||||||
const std::vector<float> &ratings,
|
const std::vector<float> &ratings,
|
||||||
const std::vector<int> &xcoords) const {
|
const std::vector<int> &xcoords) const {
|
||||||
int num_ids = unichar_ids.size();
|
auto num_ids = unichar_ids.size();
|
||||||
double total_rating = 0.0;
|
double total_rating = 0.0;
|
||||||
for (int c = 0; c < num_ids; ++c) {
|
for (unsigned c = 0; c < num_ids; ++c) {
|
||||||
int coord = xcoords[c];
|
int coord = xcoords[c];
|
||||||
tprintf("%d %d=%s r=%g, c=%g, s=%d, e=%d, perm=%d\n", coord, unichar_ids[c],
|
tprintf("%d %d=%s r=%g, c=%g, s=%d, e=%d, perm=%d\n", coord, unichar_ids[c],
|
||||||
unicharset->debug_str(unichar_ids[c]).c_str(), ratings[c], certs[c],
|
unicharset->debug_str(unichar_ids[c]).c_str(), ratings[c], certs[c],
|
||||||
|
@ -160,16 +160,16 @@ bool Series::Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *s
|
|||||||
// Splits the series after the given index, returning the two parts and
|
// Splits the series after the given index, returning the two parts and
|
||||||
// deletes itself. The first part, up to network with index last_start, goes
|
// deletes itself. The first part, up to network with index last_start, goes
|
||||||
// into start, and the rest goes into end.
|
// into start, and the rest goes into end.
|
||||||
void Series::SplitAt(int last_start, Series **start, Series **end) {
|
void Series::SplitAt(unsigned last_start, Series **start, Series **end) {
|
||||||
*start = nullptr;
|
*start = nullptr;
|
||||||
*end = nullptr;
|
*end = nullptr;
|
||||||
if (last_start < 0 || last_start >= stack_.size()) {
|
if (last_start >= stack_.size()) {
|
||||||
tprintf("Invalid split index %d must be in range [0,%zu]!\n", last_start, stack_.size() - 1);
|
tprintf("Invalid split index %u must be in range [0,%zu]!\n", last_start, stack_.size() - 1);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
auto *master_series = new Series("MasterSeries");
|
auto *master_series = new Series("MasterSeries");
|
||||||
auto *boosted_series = new Series("BoostedSeries");
|
auto *boosted_series = new Series("BoostedSeries");
|
||||||
for (int s = 0; s <= last_start; ++s) {
|
for (unsigned s = 0; s <= last_start; ++s) {
|
||||||
if (s + 1 == stack_.size() && stack_[s]->type() == NT_SOFTMAX) {
|
if (s + 1 == stack_.size() && stack_[s]->type() == NT_SOFTMAX) {
|
||||||
// Change the softmax to a tanh.
|
// Change the softmax to a tanh.
|
||||||
auto *fc = static_cast<FullyConnected *>(stack_[s]);
|
auto *fc = static_cast<FullyConnected *>(stack_[s]);
|
||||||
@ -178,7 +178,7 @@ void Series::SplitAt(int last_start, Series **start, Series **end) {
|
|||||||
master_series->AddToStack(stack_[s]);
|
master_series->AddToStack(stack_[s]);
|
||||||
stack_[s] = nullptr;
|
stack_[s] = nullptr;
|
||||||
}
|
}
|
||||||
for (int s = last_start + 1; s < stack_.size(); ++s) {
|
for (unsigned s = last_start + 1; s < stack_.size(); ++s) {
|
||||||
boosted_series->AddToStack(stack_[s]);
|
boosted_series->AddToStack(stack_[s]);
|
||||||
stack_[s] = nullptr;
|
stack_[s] = nullptr;
|
||||||
}
|
}
|
||||||
|
@ -82,7 +82,7 @@ public:
|
|||||||
// deletes itself. The first part, up to network with index last_start, goes
|
// deletes itself. The first part, up to network with index last_start, goes
|
||||||
// into start, and the rest goes into end.
|
// into start, and the rest goes into end.
|
||||||
TESS_API
|
TESS_API
|
||||||
void SplitAt(int last_start, Series **start, Series **end);
|
void SplitAt(unsigned last_start, Series **start, Series **end);
|
||||||
|
|
||||||
// Appends the elements of the src series to this, removing from src and
|
// Appends the elements of the src series to this, removing from src and
|
||||||
// deleting it.
|
// deleting it.
|
||||||
|
@ -238,7 +238,7 @@ double BaselineRow::AdjustBaselineToGrid(int debug, const FCOORD &direction, dou
|
|||||||
// Find the displacement_modes_ entry nearest to the grid.
|
// Find the displacement_modes_ entry nearest to the grid.
|
||||||
double best_error = 0.0;
|
double best_error = 0.0;
|
||||||
int best_index = -1;
|
int best_index = -1;
|
||||||
for (int i = 0; i < displacement_modes_.size(); ++i) {
|
for (unsigned i = 0; i < displacement_modes_.size(); ++i) {
|
||||||
double blob_y = displacement_modes_[i];
|
double blob_y = displacement_modes_[i];
|
||||||
double error = BaselineBlock::SpacingModelError(blob_y, line_spacing, line_offset);
|
double error = BaselineBlock::SpacingModelError(blob_y, line_spacing, line_offset);
|
||||||
if (debug > 1) {
|
if (debug > 1) {
|
||||||
@ -482,9 +482,9 @@ void BaselineBlock::ParallelizeBaselines(double default_block_skew) {
|
|||||||
// Enforce the line spacing model on all lines that don't yet have a good
|
// Enforce the line spacing model on all lines that don't yet have a good
|
||||||
// baseline.
|
// baseline.
|
||||||
// Start by finding the row that is best fitted to the model.
|
// Start by finding the row that is best fitted to the model.
|
||||||
int best_row = 0;
|
unsigned best_row = 0;
|
||||||
double best_error = SpacingModelError(rows_[0]->PerpDisp(direction), line_spacing_, line_offset_);
|
double best_error = SpacingModelError(rows_[0]->PerpDisp(direction), line_spacing_, line_offset_);
|
||||||
for (int r = 1; r < rows_.size(); ++r) {
|
for (unsigned r = 1; r < rows_.size(); ++r) {
|
||||||
double error = SpacingModelError(rows_[r]->PerpDisp(direction), line_spacing_, line_offset_);
|
double error = SpacingModelError(rows_[r]->PerpDisp(direction), line_spacing_, line_offset_);
|
||||||
if (error < best_error) {
|
if (error < best_error) {
|
||||||
best_error = error;
|
best_error = error;
|
||||||
@ -493,7 +493,7 @@ void BaselineBlock::ParallelizeBaselines(double default_block_skew) {
|
|||||||
}
|
}
|
||||||
// Starting at the best fitting row, work outwards, syncing the offset.
|
// Starting at the best fitting row, work outwards, syncing the offset.
|
||||||
double offset = line_offset_;
|
double offset = line_offset_;
|
||||||
for (int r = best_row + 1; r < rows_.size(); ++r) {
|
for (auto r = best_row + 1; r < rows_.size(); ++r) {
|
||||||
offset = rows_[r]->AdjustBaselineToGrid(debug_level_, direction, line_spacing_, offset);
|
offset = rows_[r]->AdjustBaselineToGrid(debug_level_, direction, line_spacing_, offset);
|
||||||
}
|
}
|
||||||
offset = line_offset_;
|
offset = line_offset_;
|
||||||
@ -516,7 +516,7 @@ void BaselineBlock::SetupBlockParameters() const {
|
|||||||
}
|
}
|
||||||
// Setup the parameters on all the rows.
|
// Setup the parameters on all the rows.
|
||||||
TO_ROW_IT row_it(block_->get_rows());
|
TO_ROW_IT row_it(block_->get_rows());
|
||||||
for (int r = 0; r < rows_.size(); ++r, row_it.forward()) {
|
for (unsigned r = 0; r < rows_.size(); ++r, row_it.forward()) {
|
||||||
BaselineRow *row = rows_[r];
|
BaselineRow *row = rows_[r];
|
||||||
TO_ROW *to_row = row_it.data();
|
TO_ROW *to_row = row_it.data();
|
||||||
row->SetupOldLineParameters(to_row);
|
row->SetupOldLineParameters(to_row);
|
||||||
@ -637,7 +637,7 @@ bool BaselineBlock::ComputeLineSpacing() {
|
|||||||
double max_baseline_error = kMaxBaselineError * line_spacing_;
|
double max_baseline_error = kMaxBaselineError * line_spacing_;
|
||||||
int non_trivial_gaps = 0;
|
int non_trivial_gaps = 0;
|
||||||
int fitting_gaps = 0;
|
int fitting_gaps = 0;
|
||||||
for (int i = 1; i < row_positions.size(); ++i) {
|
for (unsigned i = 1; i < row_positions.size(); ++i) {
|
||||||
double row_gap = fabs(row_positions[i - 1] - row_positions[i]);
|
double row_gap = fabs(row_positions[i - 1] - row_positions[i]);
|
||||||
if (row_gap > max_baseline_error) {
|
if (row_gap > max_baseline_error) {
|
||||||
++non_trivial_gaps;
|
++non_trivial_gaps;
|
||||||
@ -677,7 +677,7 @@ void BaselineBlock::ComputeBaselinePositions(const FCOORD &direction,
|
|||||||
// of the spacings between adjacent overlapping textlines.
|
// of the spacings between adjacent overlapping textlines.
|
||||||
void BaselineBlock::EstimateLineSpacing() {
|
void BaselineBlock::EstimateLineSpacing() {
|
||||||
std::vector<float> spacings;
|
std::vector<float> spacings;
|
||||||
for (int r = 0; r < rows_.size(); ++r) {
|
for (unsigned r = 0; r < rows_.size(); ++r) {
|
||||||
BaselineRow *row = rows_[r];
|
BaselineRow *row = rows_[r];
|
||||||
// Exclude silly lines.
|
// Exclude silly lines.
|
||||||
if (fabs(row->BaselineAngle()) > M_PI * 0.25) {
|
if (fabs(row->BaselineAngle()) > M_PI * 0.25) {
|
||||||
@ -685,7 +685,7 @@ void BaselineBlock::EstimateLineSpacing() {
|
|||||||
}
|
}
|
||||||
// Find the first row after row that overlaps it significantly.
|
// Find the first row after row that overlaps it significantly.
|
||||||
const TBOX &row_box = row->bounding_box();
|
const TBOX &row_box = row->bounding_box();
|
||||||
int r2;
|
unsigned r2;
|
||||||
for (r2 = r + 1; r2 < rows_.size() && !row_box.major_x_overlap(rows_[r2]->bounding_box());
|
for (r2 = r + 1; r2 < rows_.size() && !row_box.major_x_overlap(rows_[r2]->bounding_box());
|
||||||
++r2) {
|
++r2) {
|
||||||
;
|
;
|
||||||
@ -786,8 +786,8 @@ double BaselineBlock::FitLineSpacingModel(const std::vector<double> &positions,
|
|||||||
}
|
}
|
||||||
// Get the median offset.
|
// Get the median offset.
|
||||||
if (debug_level_ > 2) {
|
if (debug_level_ > 2) {
|
||||||
for (int i = 0; i < offsets.size(); ++i) {
|
for (unsigned i = 0; i < offsets.size(); ++i) {
|
||||||
tprintf("%d: %g\n", i, offsets[i]);
|
tprintf("%u: %g\n", i, offsets[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*c_out = MedianOfCircularValues(*m_out, offsets);
|
*c_out = MedianOfCircularValues(*m_out, offsets);
|
||||||
|
@ -137,13 +137,13 @@ public:
|
|||||||
|
|
||||||
float EstimateYFor(float x, float r) {
|
float EstimateYFor(float x, float r) {
|
||||||
ASSERT_HOST(finalized_);
|
ASSERT_HOST(finalized_);
|
||||||
int start = 0, end = values_.size();
|
unsigned start = 0, end = values_.size();
|
||||||
// Because the number of samples (used_) is assumed to be small,
|
// Because the number of samples (used_) is assumed to be small,
|
||||||
// just use linear search to find values within the range.
|
// just use linear search to find values within the range.
|
||||||
while (start < values_.size() && values_[start].x < x * (1.0 - r)) {
|
while (start < values_.size() && values_[start].x < x * (1 - r)) {
|
||||||
start++;
|
start++;
|
||||||
}
|
}
|
||||||
while (end - 1 >= 0 && values_[end - 1].x > x * (1.0 + r)) {
|
while (end > 0 && values_[end - 1].x > x * (1 + r)) {
|
||||||
end--;
|
end--;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -157,7 +157,7 @@ public:
|
|||||||
// Compute weighted average of the values.
|
// Compute weighted average of the values.
|
||||||
float rc = 0;
|
float rc = 0;
|
||||||
int vote = 0;
|
int vote = 0;
|
||||||
for (int i = start; i < end; i++) {
|
for (auto i = start; i < end; i++) {
|
||||||
rc += values_[i].vote * x * values_[i].y / values_[i].x;
|
rc += values_[i].vote * x * values_[i].y / values_[i].x;
|
||||||
vote += values_[i].vote;
|
vote += values_[i].vote;
|
||||||
}
|
}
|
||||||
@ -457,8 +457,8 @@ private:
|
|||||||
|
|
||||||
// Cleanup chars that are already merged to others.
|
// Cleanup chars that are already merged to others.
|
||||||
void DeleteChars() {
|
void DeleteChars() {
|
||||||
int index = 0;
|
unsigned index = 0;
|
||||||
for (int i = 0; i < characters_.size(); ++i) {
|
for (unsigned i = 0; i < characters_.size(); ++i) {
|
||||||
if (!characters_[i].delete_flag()) {
|
if (!characters_[i].delete_flag()) {
|
||||||
if (index != i) {
|
if (index != i) {
|
||||||
characters_[index] = characters_[i];
|
characters_[index] = characters_[i];
|
||||||
|
@ -1528,8 +1528,8 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction
|
|||||||
// By iteratively including the next smallest distance across the vectors,
|
// By iteratively including the next smallest distance across the vectors,
|
||||||
// (as in a merge sort) we can use the vector indices as counts of each type
|
// (as in a merge sort) we can use the vector indices as counts of each type
|
||||||
// and find the nearest set of objects that give us a definite decision.
|
// and find the nearest set of objects that give us a definite decision.
|
||||||
int counts[NPT_COUNT];
|
unsigned counts[NPT_COUNT];
|
||||||
memset(counts, 0, sizeof(counts[0]) * NPT_COUNT);
|
memset(counts, 0, sizeof(counts));
|
||||||
// If there is image in the search box, tip the balance in image's favor.
|
// If there is image in the search box, tip the balance in image's favor.
|
||||||
int image_bias = image_region ? kSmoothDecisionMargin / 2 : 0;
|
int image_bias = image_region ? kSmoothDecisionMargin / 2 : 0;
|
||||||
BlobRegionType text_dir = part.blob_type();
|
BlobRegionType text_dir = part.blob_type();
|
||||||
@ -1551,15 +1551,15 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(BlobNeighbourDir direction
|
|||||||
}
|
}
|
||||||
*best_distance = min_dist;
|
*best_distance = min_dist;
|
||||||
if (debug) {
|
if (debug) {
|
||||||
tprintf("Totals: htext=%d+%d, vtext=%d+%d, image=%d+%d, at dist=%d\n", counts[NPT_HTEXT],
|
tprintf("Totals: htext=%u+%u, vtext=%u+%u, image=%u+%u, at dist=%d\n", counts[NPT_HTEXT],
|
||||||
counts[NPT_WEAK_HTEXT], counts[NPT_VTEXT], counts[NPT_WEAK_VTEXT], counts[NPT_IMAGE],
|
counts[NPT_WEAK_HTEXT], counts[NPT_VTEXT], counts[NPT_WEAK_VTEXT], counts[NPT_IMAGE],
|
||||||
image_bias, min_dist);
|
image_bias, min_dist);
|
||||||
}
|
}
|
||||||
// See if we have a decision yet.
|
// See if we have a decision yet.
|
||||||
int image_count = counts[NPT_IMAGE];
|
auto image_count = counts[NPT_IMAGE];
|
||||||
int htext_score =
|
auto htext_score =
|
||||||
counts[NPT_HTEXT] + counts[NPT_WEAK_HTEXT] - (image_count + counts[NPT_WEAK_VTEXT]);
|
counts[NPT_HTEXT] + counts[NPT_WEAK_HTEXT] - (image_count + counts[NPT_WEAK_VTEXT]);
|
||||||
int vtext_score =
|
auto vtext_score =
|
||||||
counts[NPT_VTEXT] + counts[NPT_WEAK_VTEXT] - (image_count + counts[NPT_WEAK_HTEXT]);
|
counts[NPT_VTEXT] + counts[NPT_WEAK_VTEXT] - (image_count + counts[NPT_WEAK_HTEXT]);
|
||||||
if (image_count > 0 && image_bias - htext_score >= kSmoothDecisionMargin &&
|
if (image_count > 0 && image_bias - htext_score >= kSmoothDecisionMargin &&
|
||||||
image_bias - vtext_score >= kSmoothDecisionMargin) {
|
image_bias - vtext_score >= kSmoothDecisionMargin) {
|
||||||
|
@ -187,7 +187,7 @@ void ColPartitionSet::AddToColumnSetsIfUnique(PartSetVector *column_sets, WidthC
|
|||||||
delete this;
|
delete this;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < column_sets->size(); ++i) {
|
for (unsigned i = 0; i < column_sets->size(); ++i) {
|
||||||
ColPartitionSet *columns = column_sets->at(i);
|
ColPartitionSet *columns = column_sets->at(i);
|
||||||
// In ordering the column set candidates, good_coverage_ is king,
|
// In ordering the column set candidates, good_coverage_ is king,
|
||||||
// followed by good_column_count_ and then bad_coverage_.
|
// followed by good_column_count_ and then bad_coverage_.
|
||||||
|
@ -1295,7 +1295,7 @@ void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
|
|||||||
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
||||||
row = row_it.data();
|
row = row_it.data();
|
||||||
// Compute the xheight of this row if it has not been computed before.
|
// Compute the xheight of this row if it has not been computed before.
|
||||||
if (row->xheight <= 0.0) {
|
if (row->xheight <= 0) {
|
||||||
compute_row_xheight(row, block->block->classify_rotation(), gradient, block->line_size);
|
compute_row_xheight(row, block->block->classify_rotation(), gradient, block->line_size);
|
||||||
}
|
}
|
||||||
ROW_CATEGORY row_category = get_row_category(row);
|
ROW_CATEGORY row_category = get_row_category(row);
|
||||||
@ -1349,10 +1349,10 @@ void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
|
|||||||
xheight = static_cast<float>(textord_min_xheight);
|
xheight = static_cast<float>(textord_min_xheight);
|
||||||
corrected_xheight = true;
|
corrected_xheight = true;
|
||||||
}
|
}
|
||||||
if (corrected_xheight || ascrise <= 0.0) {
|
if (corrected_xheight || ascrise <= 0) {
|
||||||
ascrise = xheight * asc_frac_xheight;
|
ascrise = xheight * asc_frac_xheight;
|
||||||
}
|
}
|
||||||
if (corrected_xheight || descdrop >= 0.0) {
|
if (corrected_xheight || descdrop >= 0) {
|
||||||
descdrop = -(xheight * desc_frac_xheight);
|
descdrop = -(xheight * desc_frac_xheight);
|
||||||
}
|
}
|
||||||
block->xheight = xheight;
|
block->xheight = xheight;
|
||||||
@ -1397,7 +1397,7 @@ void Textord::compute_row_xheight(TO_ROW *row, // row to do
|
|||||||
&heights, &floating_heights, textord_single_height_mode && rotation.y() == 0.0, min_height,
|
&heights, &floating_heights, textord_single_height_mode && rotation.y() == 0.0, min_height,
|
||||||
max_height, &(row->xheight), &(row->ascrise));
|
max_height, &(row->xheight), &(row->ascrise));
|
||||||
row->descdrop = 0.0f;
|
row->descdrop = 0.0f;
|
||||||
if (row->xheight > 0.0) {
|
if (row->xheight > 0) {
|
||||||
row->descdrop =
|
row->descdrop =
|
||||||
static_cast<float>(compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
|
static_cast<float>(compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
|
||||||
}
|
}
|
||||||
@ -1699,7 +1699,7 @@ void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdr
|
|||||||
// -- the row does not have ascenders or descenders, but its xheight
|
// -- the row does not have ascenders or descenders, but its xheight
|
||||||
// is close to the average block xheight (e.g. row with "www.mmm.com")
|
// is close to the average block xheight (e.g. row with "www.mmm.com")
|
||||||
if (row_category == ROW_ASCENDERS_FOUND) {
|
if (row_category == ROW_ASCENDERS_FOUND) {
|
||||||
if (row->descdrop >= 0.0) {
|
if (row->descdrop >= 0) {
|
||||||
row->descdrop = row->xheight * (descdrop / xheight);
|
row->descdrop = row->xheight * (descdrop / xheight);
|
||||||
}
|
}
|
||||||
} else if (row_category == ROW_INVALID ||
|
} else if (row_category == ROW_INVALID ||
|
||||||
|
@ -89,13 +89,13 @@ void StructuredTable::set_max_text_height(int height) {
|
|||||||
bool StructuredTable::is_lined() const {
|
bool StructuredTable::is_lined() const {
|
||||||
return is_lined_;
|
return is_lined_;
|
||||||
}
|
}
|
||||||
int StructuredTable::row_count() const {
|
unsigned StructuredTable::row_count() const {
|
||||||
return cell_y_.empty() ? 0 : cell_y_.size() - 1;
|
return cell_y_.empty() ? 0 : cell_y_.size() - 1;
|
||||||
}
|
}
|
||||||
int StructuredTable::column_count() const {
|
unsigned StructuredTable::column_count() const {
|
||||||
return cell_x_.empty() ? 0 : cell_x_.size() - 1;
|
return cell_x_.empty() ? 0 : cell_x_.size() - 1;
|
||||||
}
|
}
|
||||||
int StructuredTable::cell_count() const {
|
unsigned StructuredTable::cell_count() const {
|
||||||
return row_count() * column_count();
|
return row_count() * column_count();
|
||||||
}
|
}
|
||||||
void StructuredTable::set_bounding_box(const TBOX &box) {
|
void StructuredTable::set_bounding_box(const TBOX &box) {
|
||||||
@ -110,12 +110,12 @@ int StructuredTable::median_cell_height() {
|
|||||||
int StructuredTable::median_cell_width() {
|
int StructuredTable::median_cell_width() {
|
||||||
return median_cell_width_;
|
return median_cell_width_;
|
||||||
}
|
}
|
||||||
int StructuredTable::row_height(int row) const {
|
int StructuredTable::row_height(unsigned row) const {
|
||||||
ASSERT_HOST(0 <= row && row < row_count());
|
ASSERT_HOST(row < row_count());
|
||||||
return cell_y_[row + 1] - cell_y_[row];
|
return cell_y_[row + 1] - cell_y_[row];
|
||||||
}
|
}
|
||||||
int StructuredTable::column_width(int column) const {
|
int StructuredTable::column_width(unsigned column) const {
|
||||||
ASSERT_HOST(0 <= column && column < column_count());
|
ASSERT_HOST(column < column_count());
|
||||||
return cell_x_[column + 1] - cell_x_[column];
|
return cell_x_[column + 1] - cell_x_[column];
|
||||||
}
|
}
|
||||||
int StructuredTable::space_above() const {
|
int StructuredTable::space_above() const {
|
||||||
@ -234,16 +234,16 @@ int StructuredTable::CountFilledCellsInRow(int row) {
|
|||||||
int StructuredTable::CountFilledCellsInColumn(int column) {
|
int StructuredTable::CountFilledCellsInColumn(int column) {
|
||||||
return CountFilledCells(0, row_count() - 1, column, column);
|
return CountFilledCells(0, row_count() - 1, column, column);
|
||||||
}
|
}
|
||||||
int StructuredTable::CountFilledCells(int row_start, int row_end, int column_start,
|
int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start,
|
||||||
int column_end) {
|
unsigned column_end) {
|
||||||
ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
|
ASSERT_HOST(row_start <= row_end && row_end < row_count());
|
||||||
ASSERT_HOST(0 <= column_start && column_start <= column_end && column_end < column_count());
|
ASSERT_HOST(column_start <= column_end && column_end < column_count());
|
||||||
int cell_count = 0;
|
int cell_count = 0;
|
||||||
TBOX cell_box;
|
TBOX cell_box;
|
||||||
for (int row = row_start; row <= row_end; ++row) {
|
for (unsigned row = row_start; row <= row_end; ++row) {
|
||||||
cell_box.set_bottom(cell_y_[row]);
|
cell_box.set_bottom(cell_y_[row]);
|
||||||
cell_box.set_top(cell_y_[row + 1]);
|
cell_box.set_top(cell_y_[row + 1]);
|
||||||
for (int col = column_start; col <= column_end; ++col) {
|
for (unsigned col = column_start; col <= column_end; ++col) {
|
||||||
cell_box.set_left(cell_x_[col]);
|
cell_box.set_left(cell_x_[col]);
|
||||||
cell_box.set_right(cell_x_[col + 1]);
|
cell_box.set_right(cell_x_[col + 1]);
|
||||||
if (CountPartitions(cell_box) > 0) {
|
if (CountPartitions(cell_box) > 0) {
|
||||||
@ -258,8 +258,8 @@ int StructuredTable::CountFilledCells(int row_start, int row_end, int column_sta
|
|||||||
// This can filter out large whitespace caused by growing tables too far
|
// This can filter out large whitespace caused by growing tables too far
|
||||||
// and page numbers.
|
// and page numbers.
|
||||||
bool StructuredTable::VerifyRowFilled(int row) {
|
bool StructuredTable::VerifyRowFilled(int row) {
|
||||||
for (int i = 0; i < column_count(); ++i) {
|
for (unsigned i = 0; i < column_count(); ++i) {
|
||||||
double area_filled = CalculateCellFilledPercentage(row, i);
|
auto area_filled = CalculateCellFilledPercentage(row, i);
|
||||||
if (area_filled >= kMinFilledArea) {
|
if (area_filled >= kMinFilledArea) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -269,9 +269,9 @@ bool StructuredTable::VerifyRowFilled(int row) {
|
|||||||
|
|
||||||
// Finds the filled area in a cell.
|
// Finds the filled area in a cell.
|
||||||
// Assume ColPartitions do not overlap for simplicity (even though they do).
|
// Assume ColPartitions do not overlap for simplicity (even though they do).
|
||||||
double StructuredTable::CalculateCellFilledPercentage(int row, int column) {
|
double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) {
|
||||||
ASSERT_HOST(0 <= row && row <= row_count());
|
ASSERT_HOST(row <= row_count());
|
||||||
ASSERT_HOST(0 <= column && column <= column_count());
|
ASSERT_HOST(column <= column_count());
|
||||||
const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);
|
const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);
|
||||||
ASSERT_HOST(!kCellBox.null_box());
|
ASSERT_HOST(!kCellBox.null_box());
|
||||||
|
|
||||||
@ -532,10 +532,10 @@ void StructuredTable::CalculateStats() {
|
|||||||
STATS height_stats(0, kMaxCellHeight + 1);
|
STATS height_stats(0, kMaxCellHeight + 1);
|
||||||
STATS width_stats(0, kMaxCellWidth + 1);
|
STATS width_stats(0, kMaxCellWidth + 1);
|
||||||
|
|
||||||
for (int i = 0; i < row_count(); ++i) {
|
for (unsigned i = 0; i < row_count(); ++i) {
|
||||||
height_stats.add(row_height(i), column_count());
|
height_stats.add(row_height(i), column_count());
|
||||||
}
|
}
|
||||||
for (int i = 0; i < column_count(); ++i) {
|
for (unsigned i = 0; i < column_count(); ++i) {
|
||||||
width_stats.add(column_width(i), row_count());
|
width_stats.add(column_width(i), row_count());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -617,8 +617,8 @@ void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
|
|||||||
ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
|
ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
|
||||||
|
|
||||||
locations->push_back(min_list.at(0));
|
locations->push_back(min_list.at(0));
|
||||||
int min_index = 0;
|
unsigned min_index = 0;
|
||||||
int max_index = 0;
|
unsigned max_index = 0;
|
||||||
int stacked_partitions = 0;
|
int stacked_partitions = 0;
|
||||||
int last_cross_position = INT32_MAX;
|
int last_cross_position = INT32_MAX;
|
||||||
// max_index will expire after min_index.
|
// max_index will expire after min_index.
|
||||||
@ -904,7 +904,7 @@ bool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, Structure
|
|||||||
const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
|
const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
|
||||||
// Keeps track of the most columns in an accepted table. The resulting table
|
// Keeps track of the most columns in an accepted table. The resulting table
|
||||||
// may be less than the max, but we don't want to stray too far.
|
// may be less than the max, but we don't want to stray too far.
|
||||||
int best_cols = 0;
|
unsigned best_cols = 0;
|
||||||
// Make sure we find a good border.
|
// Make sure we find a good border.
|
||||||
bool found_good_border = false;
|
bool found_good_border = false;
|
||||||
|
|
||||||
|
@ -86,15 +86,15 @@ public:
|
|||||||
// Basic accessors. Some are treated as attributes despite having indirect
|
// Basic accessors. Some are treated as attributes despite having indirect
|
||||||
// representation.
|
// representation.
|
||||||
bool is_lined() const;
|
bool is_lined() const;
|
||||||
int row_count() const;
|
unsigned row_count() const;
|
||||||
int column_count() const;
|
unsigned column_count() const;
|
||||||
int cell_count() const;
|
unsigned cell_count() const;
|
||||||
void set_bounding_box(const TBOX &box);
|
void set_bounding_box(const TBOX &box);
|
||||||
const TBOX &bounding_box() const;
|
const TBOX &bounding_box() const;
|
||||||
int median_cell_height();
|
int median_cell_height();
|
||||||
int median_cell_width();
|
int median_cell_width();
|
||||||
int row_height(int row) const;
|
int row_height(unsigned row) const;
|
||||||
int column_width(int column) const;
|
int column_width(unsigned column) const;
|
||||||
int space_above() const;
|
int space_above() const;
|
||||||
int space_below() const;
|
int space_below() const;
|
||||||
|
|
||||||
@ -120,7 +120,7 @@ public:
|
|||||||
int CountFilledCells();
|
int CountFilledCells();
|
||||||
int CountFilledCellsInRow(int row);
|
int CountFilledCellsInRow(int row);
|
||||||
int CountFilledCellsInColumn(int column);
|
int CountFilledCellsInColumn(int column);
|
||||||
int CountFilledCells(int row_start, int row_end, int column_start, int column_end);
|
int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end);
|
||||||
|
|
||||||
// Makes sure that at least one cell in a row has substantial area filled.
|
// Makes sure that at least one cell in a row has substantial area filled.
|
||||||
// This can filter out large whitespace caused by growing tables too far
|
// This can filter out large whitespace caused by growing tables too far
|
||||||
@ -128,7 +128,7 @@ public:
|
|||||||
// (currently bugged for some reason).
|
// (currently bugged for some reason).
|
||||||
bool VerifyRowFilled(int row);
|
bool VerifyRowFilled(int row);
|
||||||
// Finds the filled area in a cell.
|
// Finds the filled area in a cell.
|
||||||
double CalculateCellFilledPercentage(int row, int column);
|
double CalculateCellFilledPercentage(unsigned row, unsigned column);
|
||||||
|
|
||||||
// Debug display, draws the table in the given color. If the table is not
|
// Debug display, draws the table in the given color. If the table is not
|
||||||
// valid, the table and "best" grid lines are still drawn in the given color.
|
// valid, the table and "best" grid lines are still drawn in the given color.
|
||||||
|
@ -434,7 +434,6 @@ CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIS
|
|||||||
const char *program_feature_type) {
|
const char *program_feature_type) {
|
||||||
uint16_t N;
|
uint16_t N;
|
||||||
CLUSTERER *Clusterer;
|
CLUSTERER *Clusterer;
|
||||||
int32_t CharID;
|
|
||||||
LIST FeatureList = nullptr;
|
LIST FeatureList = nullptr;
|
||||||
FEATURE_SET FeatureSet = nullptr;
|
FEATURE_SET FeatureSet = nullptr;
|
||||||
|
|
||||||
@ -443,7 +442,7 @@ CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIS
|
|||||||
Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
|
Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
|
||||||
|
|
||||||
FeatureList = char_sample->List;
|
FeatureList = char_sample->List;
|
||||||
CharID = 0;
|
uint32_t CharID = 0;
|
||||||
std::vector<float> Sample;
|
std::vector<float> Sample;
|
||||||
iterate(FeatureList) {
|
iterate(FeatureList) {
|
||||||
FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
|
FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
|
||||||
|
@ -592,7 +592,7 @@ CLUSTERER *MasterTrainer::SetupForClustering(const ShapeTable &shape_table,
|
|||||||
for (it.Begin(); !it.AtEnd(); it.Next()) {
|
for (it.Begin(); !it.AtEnd(); it.Next()) {
|
||||||
sample_ptrs.push_back(&it.GetSample());
|
sample_ptrs.push_back(&it.GetSample());
|
||||||
}
|
}
|
||||||
int sample_id = 0;
|
uint32_t sample_id = 0;
|
||||||
for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
|
for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
|
||||||
const TrainingSample *sample = sample_ptrs[i];
|
const TrainingSample *sample = sample_ptrs[i];
|
||||||
uint32_t num_features = sample->num_micro_features();
|
uint32_t num_features = sample->num_micro_features();
|
||||||
|
@ -38,7 +38,7 @@ int main(int argc, char **argv) {
|
|||||||
for (int arg = 1; arg < argc - 1; ++arg) {
|
for (int arg = 1; arg < argc - 1; ++arg) {
|
||||||
// Load the input unicharset
|
// Load the input unicharset
|
||||||
if (input_unicharset.load_from_file(argv[arg])) {
|
if (input_unicharset.load_from_file(argv[arg])) {
|
||||||
printf("Loaded unicharset of size %d from file %s\n", input_unicharset.size(), argv[arg]);
|
printf("Loaded unicharset of size %zu from file %s\n", input_unicharset.size(), argv[arg]);
|
||||||
result_unicharset.AppendOtherUnicharset(input_unicharset);
|
result_unicharset.AppendOtherUnicharset(input_unicharset);
|
||||||
} else {
|
} else {
|
||||||
printf("Failed to load unicharset from file %s!!\n", argv[arg]);
|
printf("Failed to load unicharset from file %s!!\n", argv[arg]);
|
||||||
|
@ -264,7 +264,7 @@ SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_
|
|||||||
}
|
}
|
||||||
|
|
||||||
SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob,
|
SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob,
|
||||||
WERD_RES *word_res, int *blob_number) {
|
WERD_RES *word_res, unsigned *blob_number) {
|
||||||
TWERD *word = word_res->chopped_word;
|
TWERD *word = word_res->chopped_word;
|
||||||
for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
|
for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
|
||||||
TBLOB *blob = word->blobs[*blob_number];
|
TBLOB *blob = word->blobs[*blob_number];
|
||||||
@ -301,7 +301,7 @@ SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*blob_number = -1;
|
*blob_number = UINT_MAX;
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -319,24 +319,25 @@ SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic
|
|||||||
*/
|
*/
|
||||||
SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
|
SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
|
||||||
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
|
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
|
||||||
int *blob_number) {
|
unsigned *blob_number) {
|
||||||
float rating_ceiling = FLT_MAX;
|
float rating_ceiling = FLT_MAX;
|
||||||
SEAM *seam = nullptr;
|
SEAM *seam = nullptr;
|
||||||
do {
|
do {
|
||||||
*blob_number = select_blob_to_split_from_fixpt(fixpt);
|
auto blob = select_blob_to_split_from_fixpt(fixpt);
|
||||||
if (chop_debug) {
|
if (chop_debug) {
|
||||||
tprintf("blob_number from fixpt = %d\n", *blob_number);
|
tprintf("blob_number from fixpt = %d\n", blob);
|
||||||
}
|
}
|
||||||
bool split_point_from_dict = (*blob_number != -1);
|
bool split_point_from_dict = (blob != -1);
|
||||||
if (split_point_from_dict) {
|
if (split_point_from_dict) {
|
||||||
fixpt->clear();
|
fixpt->clear();
|
||||||
} else {
|
} else {
|
||||||
*blob_number = select_blob_to_split(blob_choices, rating_ceiling, split_next_to_fragment);
|
blob = select_blob_to_split(blob_choices, rating_ceiling, split_next_to_fragment);
|
||||||
}
|
}
|
||||||
if (chop_debug) {
|
if (chop_debug) {
|
||||||
tprintf("blob_number = %d\n", *blob_number);
|
tprintf("blob_number = %d\n", blob);
|
||||||
}
|
}
|
||||||
if (*blob_number == -1) {
|
*blob_number = blob;
|
||||||
|
if (blob == -1) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -365,7 +366,7 @@ SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices,
|
|||||||
*/
|
*/
|
||||||
SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,
|
SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,
|
||||||
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
|
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
|
||||||
int *blob_number) {
|
unsigned *blob_number) {
|
||||||
if (prioritize_division) {
|
if (prioritize_division) {
|
||||||
return chop_overlapping_blob(boxes, true, word_res, blob_number);
|
return chop_overlapping_blob(boxes, true, word_res, blob_number);
|
||||||
} else {
|
} else {
|
||||||
@ -445,7 +446,7 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
|
|||||||
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
|
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
|
||||||
LMPainPoints *pain_points,
|
LMPainPoints *pain_points,
|
||||||
std::vector<SegSearchPending> *pending) {
|
std::vector<SegSearchPending> *pending) {
|
||||||
int blob_number;
|
unsigned blob_number;
|
||||||
do { // improvement loop.
|
do { // improvement loop.
|
||||||
// Make a simple vector of BLOB_CHOICEs to make it easy to pick which
|
// Make a simple vector of BLOB_CHOICEs to make it easy to pick which
|
||||||
// one to chop.
|
// one to chop.
|
||||||
@ -522,12 +523,11 @@ void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
|
|||||||
int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
|
int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
|
||||||
float rating_ceiling, bool split_next_to_fragment) {
|
float rating_ceiling, bool split_next_to_fragment) {
|
||||||
BLOB_CHOICE *blob_choice;
|
BLOB_CHOICE *blob_choice;
|
||||||
int x;
|
|
||||||
float worst = -FLT_MAX;
|
float worst = -FLT_MAX;
|
||||||
int worst_index = -1;
|
int worst_index = -1;
|
||||||
float worst_near_fragment = -FLT_MAX;
|
float worst_near_fragment = -FLT_MAX;
|
||||||
int worst_index_near_fragment = -1;
|
int worst_index_near_fragment = -1;
|
||||||
const CHAR_FRAGMENT **fragments = nullptr;
|
std::vector<const CHAR_FRAGMENT *> fragments;
|
||||||
|
|
||||||
if (chop_debug) {
|
if (chop_debug) {
|
||||||
if (rating_ceiling < FLT_MAX) {
|
if (rating_ceiling < FLT_MAX) {
|
||||||
@ -538,7 +538,7 @@ int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (split_next_to_fragment && blob_choices.size() > 0) {
|
if (split_next_to_fragment && blob_choices.size() > 0) {
|
||||||
fragments = new const CHAR_FRAGMENT *[blob_choices.size()];
|
fragments.resize(blob_choices.size());
|
||||||
if (blob_choices[0] != nullptr) {
|
if (blob_choices[0] != nullptr) {
|
||||||
fragments[0] = getDict().getUnicharset().get_fragment(blob_choices[0]->unichar_id());
|
fragments[0] = getDict().getUnicharset().get_fragment(blob_choices[0]->unichar_id());
|
||||||
} else {
|
} else {
|
||||||
@ -546,9 +546,8 @@ int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (x = 0; x < blob_choices.size(); ++x) {
|
for (unsigned x = 0; x < blob_choices.size(); ++x) {
|
||||||
if (blob_choices[x] == nullptr) {
|
if (blob_choices[x] == nullptr) {
|
||||||
delete[] fragments;
|
|
||||||
return x;
|
return x;
|
||||||
} else {
|
} else {
|
||||||
blob_choice = blob_choices[x];
|
blob_choice = blob_choices[x];
|
||||||
@ -591,7 +590,6 @@ int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
delete[] fragments;
|
|
||||||
// TODO(daria): maybe a threshold of badness for
|
// TODO(daria): maybe a threshold of badness for
|
||||||
// worst_near_fragment would be useful.
|
// worst_near_fragment would be useful.
|
||||||
return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index;
|
return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index;
|
||||||
|
@ -828,10 +828,9 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(bool word_end, int curr_c
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
int i;
|
// Check that the path terminated before the current character is a word.
|
||||||
// Check a that the path terminated before the current character is a word.
|
|
||||||
bool has_word_ending = false;
|
bool has_word_ending = false;
|
||||||
for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
|
for (unsigned i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
|
||||||
const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
|
const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
|
||||||
const Dawg *pdawg = pos.dawg_index < 0 ? nullptr : dict_->GetDawg(pos.dawg_index);
|
const Dawg *pdawg = pos.dawg_index < 0 ? nullptr : dict_->GetDawg(pos.dawg_index);
|
||||||
if (pdawg == nullptr || pos.back_to_punc) {
|
if (pdawg == nullptr || pos.back_to_punc) {
|
||||||
@ -860,7 +859,7 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(bool word_end, int curr_c
|
|||||||
// like don't.
|
// like don't.
|
||||||
const auto &normed_ids = dict_->getUnicharset().normed_ids(b.unichar_id());
|
const auto &normed_ids = dict_->getUnicharset().normed_ids(b.unichar_id());
|
||||||
DawgPositionVector tmp_active_dawgs;
|
DawgPositionVector tmp_active_dawgs;
|
||||||
for (int i = 0; i < normed_ids.size(); ++i) {
|
for (unsigned i = 0; i < normed_ids.size(); ++i) {
|
||||||
if (language_model_debug_level > 2) {
|
if (language_model_debug_level > 2) {
|
||||||
tprintf("Test Letter OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
|
tprintf("Test Letter OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
|
||||||
}
|
}
|
||||||
|
@ -154,7 +154,7 @@ bool ParamsModel::SaveToFile(const char *full_path) const {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool all_good = true;
|
bool all_good = true;
|
||||||
for (int i = 0; i < weights.size(); i++) {
|
for (unsigned i = 0; i < weights.size(); i++) {
|
||||||
if (fprintf(fp, "%s %f\n", kParamsTrainingFeatureTypeName[i], weights[i]) < 0) {
|
if (fprintf(fp, "%s %f\n", kParamsTrainingFeatureTypeName[i], weights[i]) < 0) {
|
||||||
all_good = false;
|
all_good = false;
|
||||||
}
|
}
|
||||||
|
@ -164,8 +164,8 @@ void Wordrec::UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
|
|||||||
LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
|
LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
|
||||||
BlamerBundle *blamer_bundle) {
|
BlamerBundle *blamer_bundle) {
|
||||||
MATRIX *ratings = word_res->ratings;
|
MATRIX *ratings = word_res->ratings;
|
||||||
ASSERT_HOST(ratings->dimension() == pending->size());
|
ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == pending->size());
|
||||||
ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
|
ASSERT_HOST(static_cast<unsigned>(ratings->dimension()) == best_choice_bundle->beam.size());
|
||||||
for (int col = starting_col; col < ratings->dimension(); ++col) {
|
for (int col = starting_col; col < ratings->dimension(); ++col) {
|
||||||
if (!(*pending)[col].WorkToDo()) {
|
if (!(*pending)[col].WorkToDo()) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -338,13 +338,13 @@ public:
|
|||||||
SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
|
SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
|
||||||
const std::vector<SEAM *> &seams);
|
const std::vector<SEAM *> &seams);
|
||||||
SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
|
SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
|
||||||
int *blob_number);
|
unsigned *blob_number);
|
||||||
SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
|
SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
|
||||||
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
|
bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
|
||||||
int *blob_number);
|
unsigned *blob_number);
|
||||||
SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
|
SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
|
||||||
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
|
const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
|
||||||
int *blob_number);
|
unsigned *blob_number);
|
||||||
void chop_word_main(WERD_RES *word);
|
void chop_word_main(WERD_RES *word);
|
||||||
void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
|
void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
|
||||||
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
|
BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
|
||||||
|
Loading…
Reference in New Issue
Block a user