Fixed issue #633 (multi-language mode

This commit is contained in:
Ray Smith 2017-01-25 15:58:39 -08:00
parent ca16a08c10
commit b453f74e01
5 changed files with 70 additions and 43 deletions

View File

@ -754,16 +754,32 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
}
}
// Factored helper considers the indexed word and updates all the pointed
// values.
static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
float* rating, float* certainty, bool* bad,
bool* valid_permuter, int* right, int* next_left) {
// Helper finds the gap between the index word and the next.
static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
int* next_left) {
*right = -MAX_INT32;
*next_left = MAX_INT32;
if (index < words.size()) {
*right = words[index]->word->bounding_box().right();
if (index + 1 < words.size())
*next_left = words[index + 1]->word->bounding_box().left();
}
}
// Factored helper computes the rating, certainty, badness and validity of
// the permuter of the words in [first_index, end_index).
static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
int first_index, int end_index, float* rating,
float* certainty, bool* bad,
bool* valid_permuter) {
if (end_index <= first_index) {
*bad = true;
*valid_permuter = false;
}
for (int index = first_index; index < end_index && index < words.size();
++index) {
WERD_CHOICE* choice = words[index]->best_choice;
if (choice == NULL) {
if (choice == nullptr) {
*bad = true;
} else {
*rating += choice->rating();
@ -771,12 +787,6 @@ static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
if (!Dict::valid_word_permuter(choice->permuter(), false))
*valid_permuter = false;
}
*right = words[index]->word->bounding_box().right();
if (index + 1 < words.size())
*next_left = words[index + 1]->word->bounding_box().left();
} else {
*valid_permuter = false;
*bad = true;
}
}
@ -801,24 +811,13 @@ static int SelectBestWords(double rating_ratio,
while (b < best_words->size() || n < new_words->size()) {
// Start of the current run in each.
int start_b = b, start_n = n;
// Rating of the current run in each.
float b_rating = 0.0f, n_rating = 0.0f;
// Certainty of the current run in each.
float b_certainty = 0.0f, n_certainty = 0.0f;
// True if any word is missing its best choice.
bool b_bad = false, n_bad = false;
// True if all words have a valid permuter.
bool b_valid_permuter = true, n_valid_permuter = true;
while (b < best_words->size() || n < new_words->size()) {
int b_right = -MAX_INT32;
int next_b_left = MAX_INT32;
EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
&b_valid_permuter, &b_right, &next_b_left);
WordGap(*best_words, b, &b_right, &next_b_left);
int n_right = -MAX_INT32;
int next_n_left = MAX_INT32;
EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
&n_valid_permuter, &n_right, &next_n_left);
WordGap(*new_words, n, &n_right, &next_n_left);
if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
// The word breaks overlap. [start_b,b] and [start_n, n] match.
break;
@ -830,6 +829,20 @@ static int SelectBestWords(double rating_ratio,
else
++n;
}
// Rating of the current run in each.
float b_rating = 0.0f, n_rating = 0.0f;
// Certainty of the current run in each.
float b_certainty = 0.0f, n_certainty = 0.0f;
// True if any word is missing its best choice.
bool b_bad = false, n_bad = false;
// True if all words have a valid permuter.
bool b_valid_permuter = true, n_valid_permuter = true;
int end_b = b < best_words->size() ? b + 1 : b;
int end_n = n < new_words->size() ? n + 1 : n;
EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
&b_bad, &b_valid_permuter);
EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
&n_bad, &n_valid_permuter);
bool new_better = false;
if (!n_bad && (b_bad || (n_certainty > b_certainty &&
n_rating < b_rating) ||
@ -837,7 +850,7 @@ static int SelectBestWords(double rating_ratio,
n_rating < b_rating * rating_ratio &&
n_certainty > b_certainty - certainty_margin))) {
// New is better.
for (int i = start_n; i <= n; ++i) {
for (int i = start_n; i < end_n; ++i) {
out_words.push_back((*new_words)[i]);
(*new_words)[i] = NULL;
++num_new;
@ -845,14 +858,12 @@ static int SelectBestWords(double rating_ratio,
new_better = true;
} else if (!b_bad) {
// Current best is better.
for (int i = start_b; i <= b; ++i) {
for (int i = start_b; i < end_b; ++i) {
out_words.push_back((*best_words)[i]);
(*best_words)[i] = NULL;
++num_best;
}
}
int end_b = b < best_words->size() ? b + 1 : b;
int end_n = n < new_words->size() ? n + 1 : n;
if (debug) {
tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
" valid dict: %d v %d\n",
@ -875,10 +886,9 @@ static int SelectBestWords(double rating_ratio,
// Returns positive if this recognizer found more new best words than the
// number kept from best_words.
int Tesseract::RetryWithLanguage(const WordData& word_data,
WordRecognizer recognizer,
WordRecognizer recognizer, bool debug,
WERD_RES** in_word,
PointerVector<WERD_RES>* best_words) {
bool debug = classify_debug_level;
if (debug) {
tprintf("Trying word using lang %s, oem %d\n",
lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
@ -1281,7 +1291,8 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
// Points to the best result. May be word or in lang_words.
WERD_RES* word = word_data->word;
clock_t start_t = clock();
if (classify_debug_level) {
bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
if (debug) {
tprintf("%s word with lang %s at:",
word->done ? "Already done" : "Processing",
most_recently_used_->lang.string());
@ -1300,12 +1311,12 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
most_recently_used_ != sub_langs_[sub]; ++sub) {}
}
most_recently_used_->RetryWithLanguage(
*word_data, recognizer, &word_data->lang_words[sub], &best_words);
*word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
Tesseract* best_lang_tess = most_recently_used_;
if (!WordsAcceptable(best_words)) {
// Try all the other languages to see if they are any better.
if (most_recently_used_ != this &&
this->RetryWithLanguage(*word_data, recognizer,
this->RetryWithLanguage(*word_data, recognizer, debug,
&word_data->lang_words[sub_langs_.size()],
&best_words) > 0) {
best_lang_tess = this;
@ -1313,7 +1324,7 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
++i) {
if (most_recently_used_ != sub_langs_[i] &&
sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
&word_data->lang_words[i],
&best_words) > 0) {
best_lang_tess = sub_langs_[i];

View File

@ -309,6 +309,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
word_certainty);
word->best_choice->print();
}
word->best_choice->set_certainty(word_certainty);
// Discard words that are impossibly bad, but allow a bit more for
// dictionary words, and keep bad words in non-space-delimited langs.
if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
@ -324,7 +325,6 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
// It is a dud.
word->SetupFake(lstm_recognizer_->GetUnicharset());
}
word->best_choice->set_certainty(word_certainty);
}
}
}

View File

@ -214,6 +214,8 @@ Tesseract::Tesseract()
BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
this->params()),
INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
this->params()),
BOOL_MEMBER(paragraph_text_based, true,
@ -636,6 +638,8 @@ Tesseract::~Tesseract() {
}
void Tesseract::Clear() {
STRING debug_name = imagebasename + "_debug.pdf";
pixa_debug_.WritePDF(debug_name.string());
pixDestroy(&pix_binary_);
pixDestroy(&pix_grey_);
pixDestroy(&pix_thresholds_);
@ -703,7 +707,7 @@ void Tesseract::PrepareForPageseg() {
// the newly splitted image.
splitter_.set_orig_pix(pix_binary());
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
if (splitter_.Split(true)) {
if (splitter_.Split(true, &pixa_debug_)) {
ASSERT_HOST(splitter_.splitted_image());
pixDestroy(&pix_binary_);
pix_binary_ = pixClone(splitter_.splitted_image());
@ -732,7 +736,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
splitter_.set_segmentation_block_list(block_list);
splitter_.set_ocr_split_strategy(max_ocr_strategy);
// Run the splitter for OCR
bool split_for_ocr = splitter_.Split(false);
bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
// Restore pix_binary to the binarized original pix for future reference.
ASSERT_HOST(splitter_.orig_pix());
pixDestroy(&pix_binary_);

View File

@ -28,11 +28,12 @@
#include "allheaders.h"
#include "control.h"
#include "docqual.h"
#include "debugpixa.h"
#include "devanagari_processing.h"
#include "docqual.h"
#include "genericvector.h"
#include "params.h"
#include "ocrclass.h"
#include "params.h"
#include "textord.h"
#include "wordrec.h"
@ -372,9 +373,8 @@ class Tesseract : public Wordrec {
// Helper to recognize the word using the given (language-specific) tesseract.
// Returns positive if this recognizer found more new best words than the
// number kept from best_words.
int RetryWithLanguage(const WordData& word_data,
WordRecognizer recognizer,
WERD_RES** in_word,
int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
bool debug, WERD_RES** in_word,
PointerVector<WERD_RES>* best_words);
// Moves good-looking "noise"/diacritics from the reject list to the main
// blob list on the current word. Returns true if anything was done, and
@ -907,6 +907,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(test_pt, false, "Test for point");
double_VAR_H(test_pt_x, 99999.99, "xcoord");
double_VAR_H(test_pt_y, 99999.99, "ycoord");
INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
BOOL_VAR_H(paragraph_text_based, true,
"Run paragraph detection on the post-text-recognition "
@ -1194,6 +1195,8 @@ class Tesseract : public Wordrec {
Pix* pix_original_;
// Thresholds that were used to generate the thresholded image from grey.
Pix* pix_thresholds_;
// Debug images. If non-empty, will be written on destruction.
DebugPixa pixa_debug_;
// Input image resolution after any scaling. The resolution is not well
// transmitted by operations on Pix, so we keep an independent record here.
int source_resolution_;

View File

@ -276,6 +276,15 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
}
if (t < width) {
int unichar_id = best_nodes[t]->unichar_id;
if (unichar_id == UNICHAR_SPACE && !certs->empty() &&
best_nodes[t]->permuter != NO_PERM) {
// All the rating and certainty go on the previous character except
// for the space itself.
if (certainty < certs->back()) certs->back() = certainty;
ratings->back() += rating;
certainty = 0.0;
rating = 0.0;
}
unichar_ids->push_back(unichar_id);
xcoords->push_back(t);
do {