mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Fixed issue #633 (multi-language mode)
This commit is contained in:
parent
ca16a08c10
commit
b453f74e01
@ -754,16 +754,32 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
|
||||
}
|
||||
}
|
||||
|
||||
// Factored helper considers the indexed word and updates all the pointed
|
||||
// values.
|
||||
static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
|
||||
float* rating, float* certainty, bool* bad,
|
||||
bool* valid_permuter, int* right, int* next_left) {
|
||||
// Helper finds the gap between the index word and the next.
|
||||
static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
|
||||
int* next_left) {
|
||||
*right = -MAX_INT32;
|
||||
*next_left = MAX_INT32;
|
||||
if (index < words.size()) {
|
||||
*right = words[index]->word->bounding_box().right();
|
||||
if (index + 1 < words.size())
|
||||
*next_left = words[index + 1]->word->bounding_box().left();
|
||||
}
|
||||
}
|
||||
|
||||
// Factored helper computes the rating, certainty, badness and validity of
|
||||
// the permuter of the words in [first_index, end_index).
|
||||
static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
|
||||
int first_index, int end_index, float* rating,
|
||||
float* certainty, bool* bad,
|
||||
bool* valid_permuter) {
|
||||
if (end_index <= first_index) {
|
||||
*bad = true;
|
||||
*valid_permuter = false;
|
||||
}
|
||||
for (int index = first_index; index < end_index && index < words.size();
|
||||
++index) {
|
||||
WERD_CHOICE* choice = words[index]->best_choice;
|
||||
if (choice == NULL) {
|
||||
if (choice == nullptr) {
|
||||
*bad = true;
|
||||
} else {
|
||||
*rating += choice->rating();
|
||||
@ -771,12 +787,6 @@ static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
|
||||
if (!Dict::valid_word_permuter(choice->permuter(), false))
|
||||
*valid_permuter = false;
|
||||
}
|
||||
*right = words[index]->word->bounding_box().right();
|
||||
if (index + 1 < words.size())
|
||||
*next_left = words[index + 1]->word->bounding_box().left();
|
||||
} else {
|
||||
*valid_permuter = false;
|
||||
*bad = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -801,24 +811,13 @@ static int SelectBestWords(double rating_ratio,
|
||||
while (b < best_words->size() || n < new_words->size()) {
|
||||
// Start of the current run in each.
|
||||
int start_b = b, start_n = n;
|
||||
// Rating of the current run in each.
|
||||
float b_rating = 0.0f, n_rating = 0.0f;
|
||||
// Certainty of the current run in each.
|
||||
float b_certainty = 0.0f, n_certainty = 0.0f;
|
||||
// True if any word is missing its best choice.
|
||||
bool b_bad = false, n_bad = false;
|
||||
// True if all words have a valid permuter.
|
||||
bool b_valid_permuter = true, n_valid_permuter = true;
|
||||
|
||||
while (b < best_words->size() || n < new_words->size()) {
|
||||
int b_right = -MAX_INT32;
|
||||
int next_b_left = MAX_INT32;
|
||||
EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
|
||||
&b_valid_permuter, &b_right, &next_b_left);
|
||||
WordGap(*best_words, b, &b_right, &next_b_left);
|
||||
int n_right = -MAX_INT32;
|
||||
int next_n_left = MAX_INT32;
|
||||
EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
|
||||
&n_valid_permuter, &n_right, &next_n_left);
|
||||
WordGap(*new_words, n, &n_right, &next_n_left);
|
||||
if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
|
||||
// The word breaks overlap. [start_b,b] and [start_n, n] match.
|
||||
break;
|
||||
@ -830,6 +829,20 @@ static int SelectBestWords(double rating_ratio,
|
||||
else
|
||||
++n;
|
||||
}
|
||||
// Rating of the current run in each.
|
||||
float b_rating = 0.0f, n_rating = 0.0f;
|
||||
// Certainty of the current run in each.
|
||||
float b_certainty = 0.0f, n_certainty = 0.0f;
|
||||
// True if any word is missing its best choice.
|
||||
bool b_bad = false, n_bad = false;
|
||||
// True if all words have a valid permuter.
|
||||
bool b_valid_permuter = true, n_valid_permuter = true;
|
||||
int end_b = b < best_words->size() ? b + 1 : b;
|
||||
int end_n = n < new_words->size() ? n + 1 : n;
|
||||
EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
|
||||
&b_bad, &b_valid_permuter);
|
||||
EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
|
||||
&n_bad, &n_valid_permuter);
|
||||
bool new_better = false;
|
||||
if (!n_bad && (b_bad || (n_certainty > b_certainty &&
|
||||
n_rating < b_rating) ||
|
||||
@ -837,7 +850,7 @@ static int SelectBestWords(double rating_ratio,
|
||||
n_rating < b_rating * rating_ratio &&
|
||||
n_certainty > b_certainty - certainty_margin))) {
|
||||
// New is better.
|
||||
for (int i = start_n; i <= n; ++i) {
|
||||
for (int i = start_n; i < end_n; ++i) {
|
||||
out_words.push_back((*new_words)[i]);
|
||||
(*new_words)[i] = NULL;
|
||||
++num_new;
|
||||
@ -845,14 +858,12 @@ static int SelectBestWords(double rating_ratio,
|
||||
new_better = true;
|
||||
} else if (!b_bad) {
|
||||
// Current best is better.
|
||||
for (int i = start_b; i <= b; ++i) {
|
||||
for (int i = start_b; i < end_b; ++i) {
|
||||
out_words.push_back((*best_words)[i]);
|
||||
(*best_words)[i] = NULL;
|
||||
++num_best;
|
||||
}
|
||||
}
|
||||
int end_b = b < best_words->size() ? b + 1 : b;
|
||||
int end_n = n < new_words->size() ? n + 1 : n;
|
||||
if (debug) {
|
||||
tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
|
||||
" valid dict: %d v %d\n",
|
||||
@ -875,10 +886,9 @@ static int SelectBestWords(double rating_ratio,
|
||||
// Returns positive if this recognizer found more new best words than the
|
||||
// number kept from best_words.
|
||||
int Tesseract::RetryWithLanguage(const WordData& word_data,
|
||||
WordRecognizer recognizer,
|
||||
WordRecognizer recognizer, bool debug,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* best_words) {
|
||||
bool debug = classify_debug_level;
|
||||
if (debug) {
|
||||
tprintf("Trying word using lang %s, oem %d\n",
|
||||
lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
|
||||
@ -1281,7 +1291,8 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
|
||||
// Points to the best result. May be word or in lang_words.
|
||||
WERD_RES* word = word_data->word;
|
||||
clock_t start_t = clock();
|
||||
if (classify_debug_level) {
|
||||
bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
|
||||
if (debug) {
|
||||
tprintf("%s word with lang %s at:",
|
||||
word->done ? "Already done" : "Processing",
|
||||
most_recently_used_->lang.string());
|
||||
@ -1300,12 +1311,12 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
|
||||
most_recently_used_ != sub_langs_[sub]; ++sub) {}
|
||||
}
|
||||
most_recently_used_->RetryWithLanguage(
|
||||
*word_data, recognizer, &word_data->lang_words[sub], &best_words);
|
||||
*word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
|
||||
Tesseract* best_lang_tess = most_recently_used_;
|
||||
if (!WordsAcceptable(best_words)) {
|
||||
// Try all the other languages to see if they are any better.
|
||||
if (most_recently_used_ != this &&
|
||||
this->RetryWithLanguage(*word_data, recognizer,
|
||||
this->RetryWithLanguage(*word_data, recognizer, debug,
|
||||
&word_data->lang_words[sub_langs_.size()],
|
||||
&best_words) > 0) {
|
||||
best_lang_tess = this;
|
||||
@ -1313,7 +1324,7 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
|
||||
for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
|
||||
++i) {
|
||||
if (most_recently_used_ != sub_langs_[i] &&
|
||||
sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
|
||||
sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
|
||||
&word_data->lang_words[i],
|
||||
&best_words) > 0) {
|
||||
best_lang_tess = sub_langs_[i];
|
||||
|
@ -309,6 +309,7 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
|
||||
word_certainty);
|
||||
word->best_choice->print();
|
||||
}
|
||||
word->best_choice->set_certainty(word_certainty);
|
||||
// Discard words that are impossibly bad, but allow a bit more for
|
||||
// dictionary words, and keep bad words in non-space-delimited langs.
|
||||
if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
|
||||
@ -324,7 +325,6 @@ void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
|
||||
// It is a dud.
|
||||
word->SetupFake(lstm_recognizer_->GetUnicharset());
|
||||
}
|
||||
word->best_choice->set_certainty(word_certainty);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -214,6 +214,8 @@ Tesseract::Tesseract()
|
||||
BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
|
||||
double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
|
||||
double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
|
||||
INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
|
||||
this->params()),
|
||||
INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
|
||||
this->params()),
|
||||
BOOL_MEMBER(paragraph_text_based, true,
|
||||
@ -636,6 +638,8 @@ Tesseract::~Tesseract() {
|
||||
}
|
||||
|
||||
void Tesseract::Clear() {
|
||||
STRING debug_name = imagebasename + "_debug.pdf";
|
||||
pixa_debug_.WritePDF(debug_name.string());
|
||||
pixDestroy(&pix_binary_);
|
||||
pixDestroy(&pix_grey_);
|
||||
pixDestroy(&pix_thresholds_);
|
||||
@ -703,7 +707,7 @@ void Tesseract::PrepareForPageseg() {
|
||||
// the newly splitted image.
|
||||
splitter_.set_orig_pix(pix_binary());
|
||||
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
|
||||
if (splitter_.Split(true)) {
|
||||
if (splitter_.Split(true, &pixa_debug_)) {
|
||||
ASSERT_HOST(splitter_.splitted_image());
|
||||
pixDestroy(&pix_binary_);
|
||||
pix_binary_ = pixClone(splitter_.splitted_image());
|
||||
@ -732,7 +736,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
|
||||
splitter_.set_segmentation_block_list(block_list);
|
||||
splitter_.set_ocr_split_strategy(max_ocr_strategy);
|
||||
// Run the splitter for OCR
|
||||
bool split_for_ocr = splitter_.Split(false);
|
||||
bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
|
||||
// Restore pix_binary to the binarized original pix for future reference.
|
||||
ASSERT_HOST(splitter_.orig_pix());
|
||||
pixDestroy(&pix_binary_);
|
||||
|
@ -28,11 +28,12 @@
|
||||
|
||||
#include "allheaders.h"
|
||||
#include "control.h"
|
||||
#include "docqual.h"
|
||||
#include "debugpixa.h"
|
||||
#include "devanagari_processing.h"
|
||||
#include "docqual.h"
|
||||
#include "genericvector.h"
|
||||
#include "params.h"
|
||||
#include "ocrclass.h"
|
||||
#include "params.h"
|
||||
#include "textord.h"
|
||||
#include "wordrec.h"
|
||||
|
||||
@ -372,9 +373,8 @@ class Tesseract : public Wordrec {
|
||||
// Helper to recognize the word using the given (language-specific) tesseract.
|
||||
// Returns positive if this recognizer found more new best words than the
|
||||
// number kept from best_words.
|
||||
int RetryWithLanguage(const WordData& word_data,
|
||||
WordRecognizer recognizer,
|
||||
WERD_RES** in_word,
|
||||
int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
|
||||
bool debug, WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* best_words);
|
||||
// Moves good-looking "noise"/diacritics from the reject list to the main
|
||||
// blob list on the current word. Returns true if anything was done, and
|
||||
@ -907,6 +907,7 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(test_pt, false, "Test for point");
|
||||
double_VAR_H(test_pt_x, 99999.99, "xcoord");
|
||||
double_VAR_H(test_pt_y, 99999.99, "ycoord");
|
||||
INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
|
||||
INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
|
||||
BOOL_VAR_H(paragraph_text_based, true,
|
||||
"Run paragraph detection on the post-text-recognition "
|
||||
@ -1194,6 +1195,8 @@ class Tesseract : public Wordrec {
|
||||
Pix* pix_original_;
|
||||
// Thresholds that were used to generate the thresholded image from grey.
|
||||
Pix* pix_thresholds_;
|
||||
// Debug images. If non-empty, will be written on destruction.
|
||||
DebugPixa pixa_debug_;
|
||||
// Input image resolution after any scaling. The resolution is not well
|
||||
// transmitted by operations on Pix, so we keep an independent record here.
|
||||
int source_resolution_;
|
||||
|
@ -276,6 +276,15 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
|
||||
}
|
||||
if (t < width) {
|
||||
int unichar_id = best_nodes[t]->unichar_id;
|
||||
if (unichar_id == UNICHAR_SPACE && !certs->empty() &&
|
||||
best_nodes[t]->permuter != NO_PERM) {
|
||||
// All the rating and certainty go on the previous character except
|
||||
// for the space itself.
|
||||
if (certainty < certs->back()) certs->back() = certainty;
|
||||
ratings->back() += rating;
|
||||
certainty = 0.0;
|
||||
rating = 0.0;
|
||||
}
|
||||
unichar_ids->push_back(unichar_id);
|
||||
xcoords->push_back(t);
|
||||
do {
|
||||
|
Loading…
Reference in New Issue
Block a user