mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 09:52:40 +08:00
Major refactor of control.cpp to enable line recognition
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1147 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
e249d7bcb2
commit
dbf6197471
@ -790,6 +790,10 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
|
||||
* Runs page layout analysis in the mode set by SetPageSegMode.
|
||||
* May optionally be called prior to Recognize to get access to just
|
||||
* the page layout results. Returns an iterator to the results.
|
||||
* If merge_similar_words is true, words are combined where suitable for use
|
||||
* with a line recognizer. Use if you want to use AnalyseLayout to find the
|
||||
* textlines, and then want to process textline fragments with an external
|
||||
* line recognizer.
|
||||
* Returns NULL on error or an empty page.
|
||||
* The returned iterator must be deleted after use.
|
||||
* WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
@ -797,11 +801,11 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
|
||||
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
|
||||
* DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
*/
|
||||
PageIterator* TessBaseAPI::AnalyseLayout() {
|
||||
PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
|
||||
if (FindLines() == 0) {
|
||||
if (block_list_->empty())
|
||||
return NULL; // The page was empty.
|
||||
page_res_ = new PAGE_RES(block_list_, NULL);
|
||||
page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL);
|
||||
DetectParagraphs(false);
|
||||
return new PageIterator(
|
||||
page_res_, tesseract_, thresholder_->GetScaleFactor(),
|
||||
@ -823,18 +827,22 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
|
||||
if (page_res_ != NULL)
|
||||
delete page_res_;
|
||||
if (block_list_->empty()) {
|
||||
page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_);
|
||||
page_res_ = new PAGE_RES(false, block_list_,
|
||||
&tesseract_->prev_word_best_choice_);
|
||||
return 0; // Empty page.
|
||||
}
|
||||
|
||||
tesseract_->SetBlackAndWhitelist();
|
||||
recognition_done_ = true;
|
||||
if (tesseract_->tessedit_resegment_from_line_boxes)
|
||||
if (tesseract_->tessedit_resegment_from_line_boxes) {
|
||||
page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_);
|
||||
else if (tesseract_->tessedit_resegment_from_boxes)
|
||||
} else if (tesseract_->tessedit_resegment_from_boxes) {
|
||||
page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
|
||||
else
|
||||
page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_);
|
||||
} else {
|
||||
// TODO(rays) LSTM here.
|
||||
page_res_ = new PAGE_RES(false,
|
||||
block_list_, &tesseract_->prev_word_best_choice_);
|
||||
}
|
||||
if (tesseract_->tessedit_make_boxes_from_boxes) {
|
||||
tesseract_->CorrectClassifyWords(page_res_);
|
||||
return 0;
|
||||
@ -900,7 +908,8 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
|
||||
|
||||
recognition_done_ = true;
|
||||
|
||||
page_res_ = new PAGE_RES(block_list_, &(tesseract_->prev_word_best_choice_));
|
||||
page_res_ = new PAGE_RES(false, block_list_,
|
||||
&(tesseract_->prev_word_best_choice_));
|
||||
|
||||
PAGE_RES_IT page_res_it(page_res_);
|
||||
|
||||
@ -1977,7 +1986,10 @@ void TessBaseAPI::Threshold(Pix** pix) {
|
||||
// than over-estimate resolution.
|
||||
thresholder_->SetSourceYResolution(kMinCredibleResolution);
|
||||
}
|
||||
thresholder_->ThresholdToPix(pix);
|
||||
PageSegMode pageseg_mode =
|
||||
static_cast<PageSegMode>(
|
||||
static_cast<int>(tesseract_->tessedit_pageseg_mode));
|
||||
thresholder_->ThresholdToPix(pageseg_mode, pix);
|
||||
thresholder_->GetImageSizes(&rect_left_, &rect_top_,
|
||||
&rect_width_, &rect_height_,
|
||||
&image_width_, &image_height_);
|
||||
@ -2332,7 +2344,7 @@ void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
|
||||
|
||||
|
||||
PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
|
||||
PAGE_RES *page_res = new PAGE_RES(block_list,
|
||||
PAGE_RES *page_res = new PAGE_RES(false, block_list,
|
||||
&(tesseract_->prev_word_best_choice_));
|
||||
tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
|
||||
return page_res;
|
||||
@ -2341,7 +2353,7 @@ PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
|
||||
PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
|
||||
PAGE_RES* pass1_result) {
|
||||
if (!pass1_result)
|
||||
pass1_result = new PAGE_RES(block_list,
|
||||
pass1_result = new PAGE_RES(false, block_list,
|
||||
&(tesseract_->prev_word_best_choice_));
|
||||
tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
|
||||
return pass1_result;
|
||||
|
@ -484,14 +484,21 @@ class TESS_API TessBaseAPI {
|
||||
* Runs page layout analysis in the mode set by SetPageSegMode.
|
||||
* May optionally be called prior to Recognize to get access to just
|
||||
* the page layout results. Returns an iterator to the results.
|
||||
* Returns NULL on error.
|
||||
* If merge_similar_words is true, words are combined where suitable for use
|
||||
* with a line recognizer. Use if you want to use AnalyseLayout to find the
|
||||
* textlines, and then want to process textline fragments with an external
|
||||
* line recognizer.
|
||||
* Returns NULL on error or an empty page.
|
||||
* The returned iterator must be deleted after use.
|
||||
* WARNING! This class points to data held within the TessBaseAPI class, and
|
||||
* therefore can only be used while the TessBaseAPI class still exists and
|
||||
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
|
||||
* DetectOS, or anything else that changes the internal PAGE_RES.
|
||||
*/
|
||||
PageIterator* AnalyseLayout();
|
||||
PageIterator* AnalyseLayout() {
|
||||
return AnalyseLayout(false);
|
||||
}
|
||||
PageIterator* AnalyseLayout(bool merge_similar_words);
|
||||
|
||||
/**
|
||||
* Recognize the image from SetAndThresholdImage, generating Tesseract
|
||||
|
@ -110,30 +110,20 @@ static void clear_any_old_text(BLOCK_LIST *block_list) {
|
||||
PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
|
||||
bool find_segmentation,
|
||||
BLOCK_LIST *block_list) {
|
||||
int box_count = 0;
|
||||
int box_failures = 0;
|
||||
|
||||
FILE* box_file = OpenBoxFile(fname);
|
||||
TBOX box;
|
||||
GenericVector<TBOX> boxes;
|
||||
GenericVector<STRING> texts, full_texts;
|
||||
|
||||
bool found_box = true;
|
||||
while (found_box) {
|
||||
int line_number = 0; // Line number of the box file.
|
||||
STRING text, full_text;
|
||||
found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
|
||||
if (found_box) {
|
||||
++box_count;
|
||||
MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
|
||||
} else {
|
||||
full_text = "";
|
||||
}
|
||||
boxes.push_back(box);
|
||||
texts.push_back(text);
|
||||
full_texts.push_back(full_text);
|
||||
if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
|
||||
NULL)) {
|
||||
return NULL; // Can't do it.
|
||||
}
|
||||
|
||||
int box_count = boxes.size();
|
||||
int box_failures = 0;
|
||||
// Add an empty everything to the end.
|
||||
boxes.push_back(TBOX());
|
||||
texts.push_back(STRING());
|
||||
full_texts.push_back(STRING());
|
||||
|
||||
// In word mode, we use the boxes to make a word for each box, but
|
||||
// in blob mode we use the existing words and maximally chop them first.
|
||||
PAGE_RES* page_res = find_segmentation ?
|
||||
@ -239,7 +229,7 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
||||
}
|
||||
}
|
||||
}
|
||||
PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
|
||||
PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
WERD_RES* word_res;
|
||||
while ((word_res = pr_it.word()) != NULL) {
|
||||
|
@ -69,16 +69,11 @@ const double kMinRefitXHeightFraction = 0.5;
|
||||
namespace tesseract {
|
||||
void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
|
||||
TBOX &selection_box) {
|
||||
WERD *word;
|
||||
ROW *pseudo_row; // row of word
|
||||
BLOCK *pseudo_block; // block of word
|
||||
|
||||
word = make_pseudo_word(page_res, selection_box,
|
||||
pseudo_block, pseudo_row);
|
||||
if (word != NULL) {
|
||||
WERD_RES word_res(word);
|
||||
recog_interactive(pseudo_block, pseudo_row, &word_res);
|
||||
delete word;
|
||||
PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
|
||||
if (it != NULL) {
|
||||
recog_interactive(it);
|
||||
it->DeleteCurrentWord();
|
||||
delete it;
|
||||
}
|
||||
}
|
||||
|
||||
@ -92,19 +87,22 @@ void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
|
||||
* @param row row of word
|
||||
* @param word_res word to recognise
|
||||
*/
|
||||
BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
|
||||
inT16 char_qual;
|
||||
inT16 good_char_qual;
|
||||
|
||||
WordData word_data(block, row, word_res);
|
||||
WordData word_data(*pr_it);
|
||||
SetupWordPassN(2, &word_data);
|
||||
classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
|
||||
classify_word_and_language(&Tesseract::classify_word_pass2, pr_it,
|
||||
&word_data);
|
||||
if (tessedit_debug_quality_metrics) {
|
||||
word_char_quality(word_res, row, &char_qual, &good_char_qual);
|
||||
tprintf
|
||||
("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
|
||||
word_res->reject_map.length(), word_blob_quality(word_res, row),
|
||||
word_outline_errs(word_res), char_qual, good_char_qual);
|
||||
WERD_RES* word_res = pr_it->word();
|
||||
word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
|
||||
tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
|
||||
"char_quality: %d; good_char_quality: %d\n",
|
||||
word_res->reject_map.length(),
|
||||
word_blob_quality(word_res, pr_it->row()->row),
|
||||
word_outline_errs(word_res), char_qual, good_char_qual);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
@ -163,8 +161,6 @@ void Tesseract::SetupAllWordsPassN(int pass_n,
|
||||
PAGE_RES_IT page_res_it(page_res);
|
||||
for (page_res_it.restart_page(); page_res_it.word() != NULL;
|
||||
page_res_it.forward()) {
|
||||
if (pass_n == 1)
|
||||
page_res_it.word()->SetupFake(unicharset);
|
||||
if (target_word_box == NULL ||
|
||||
ProcessTargetWord(page_res_it.word()->word->bounding_box(),
|
||||
*target_word_box, word_config, 1)) {
|
||||
@ -180,33 +176,29 @@ void Tesseract::SetupAllWordsPassN(int pass_n,
|
||||
|
||||
// Sets up the single word ready for whichever engine is to be run.
|
||||
void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
|
||||
if (pass_n == 1 || !word->word->done || tessedit_training_tess) {
|
||||
if (pass_n == 2) {
|
||||
if (pass_n == 1 || !word->word->done) {
|
||||
if (pass_n == 1) {
|
||||
word->word->SetupForRecognition(unicharset, this, BestPix(),
|
||||
tessedit_ocr_engine_mode, NULL,
|
||||
classify_bln_numeric_mode,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
word->row, word->block);
|
||||
} else if (pass_n == 2) {
|
||||
// TODO(rays) Should we do this on pass1 too?
|
||||
word->word->caps_height = 0.0;
|
||||
if (word->word->x_height == 0.0f)
|
||||
word->word->x_height = word->row->x_height();
|
||||
}
|
||||
// Cube doesn't get set up for pass2.
|
||||
if (pass_n != 2 || tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
|
||||
word->word->SetupForRecognition(
|
||||
unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
|
||||
classify_bln_numeric_mode, textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx, word->row, word->block);
|
||||
}
|
||||
}
|
||||
if (!sub_langs_.empty()) {
|
||||
if (word->lang_words.size() != sub_langs_.size()) {
|
||||
// Setup the words for all the sub-languages now.
|
||||
WERD_RES empty;
|
||||
word->lang_words.init_to_size(sub_langs_.size(), empty);
|
||||
}
|
||||
for (int s = 0; s < sub_langs_.size(); ++s) {
|
||||
Tesseract* lang_t = sub_langs_[s];
|
||||
if (pass_n == 1 || (lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY &&
|
||||
(!word->lang_words[s].done || lang_t->tessedit_training_tess))) {
|
||||
word->lang_words[s].InitForRetryRecognition(*word->word);
|
||||
word->lang_words[s].SetupForRecognition(
|
||||
for (int s = 0; s <= sub_langs_.size(); ++s) {
|
||||
// The sub_langs_.size() entry is for the master language.
|
||||
Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
|
||||
WERD_RES* word_res = new WERD_RES;
|
||||
word_res->InitForRetryRecognition(*word->word);
|
||||
word->lang_words.push_back(word_res);
|
||||
// Cube doesn't get set up for pass2.
|
||||
if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
|
||||
word_res->SetupForRecognition(
|
||||
lang_t->unicharset, lang_t, BestPix(),
|
||||
lang_t->tessedit_ocr_engine_mode, NULL,
|
||||
lang_t->classify_bln_numeric_mode,
|
||||
@ -217,17 +209,19 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Runs word recognition on all the words.
|
||||
bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
|
||||
PAGE_RES_IT* pr_it,
|
||||
GenericVector<WordData>* words) {
|
||||
// TODO(rays) Before this loop can be parallelized (it would yield a massive
|
||||
// speed-up) all remaining member globals need to be converted to local/heap
|
||||
// (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
|
||||
// added. The results will be significantly different with adaption on, and
|
||||
// deterioration will need investigation.
|
||||
pr_it->restart_page();
|
||||
for (int w = 0; w < words->size(); ++w) {
|
||||
WordData* word = &(*words)[w];
|
||||
if (w > 0) word->prev_word = &(*words)[w - 1];
|
||||
if (monitor != NULL) {
|
||||
monitor->ocr_alive = TRUE;
|
||||
if (pass_n == 1)
|
||||
@ -244,16 +238,26 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (word->word->tess_failed) continue;
|
||||
if (word->word->tess_failed) {
|
||||
int s;
|
||||
for (s = 0; s < word->lang_words.size() &&
|
||||
word->lang_words[s]->tess_failed; ++s) {}
|
||||
// If all are failed, skip it. Image words are skipped by this test.
|
||||
if (s > word->lang_words.size()) continue;
|
||||
}
|
||||
// Sync pr_it with the wth WordData.
|
||||
while (pr_it->word() != NULL && pr_it->word() != word->word)
|
||||
pr_it->forward();
|
||||
ASSERT_HOST(pr_it->word() != NULL);
|
||||
WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
|
||||
: &Tesseract::classify_word_pass2;
|
||||
classify_word_and_language(recognizer, word);
|
||||
classify_word_and_language(recognizer, pr_it, word);
|
||||
if (tessedit_dump_choices) {
|
||||
word_dumper(NULL, word->row, word->word);
|
||||
tprintf("Pass%d: %s [%s]\n", pass_n,
|
||||
word->word->best_choice->unichar_string().string(),
|
||||
word->word->best_choice->debug_string().string());
|
||||
}
|
||||
pr_it->forward();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -326,12 +330,12 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
|
||||
most_recently_used_ = this;
|
||||
// Run pass 1 word recognition.
|
||||
if (!RecogAllWordsPassN(1, monitor, &words)) return false;
|
||||
if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
|
||||
// Pass 1 post-processing.
|
||||
while (page_res_it.word() != NULL) {
|
||||
for (page_res_it.restart_page(); page_res_it.word() != NULL;
|
||||
page_res_it.forward()) {
|
||||
if (page_res_it.word()->word->flag(W_REP_CHAR)) {
|
||||
fix_rep_char(&page_res_it);
|
||||
page_res_it.forward();
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -346,15 +350,14 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
page_res->misadaption_log.push_back(
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug());
|
||||
}
|
||||
|
||||
page_res_it.forward();
|
||||
}
|
||||
}
|
||||
|
||||
if (dopasses == 1) return true;
|
||||
|
||||
// ****************** Pass 2 *******************
|
||||
if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption) {
|
||||
if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
|
||||
tessedit_ocr_engine_mode != OEM_CUBE_ONLY ) {
|
||||
page_res_it.restart_page();
|
||||
GenericVector<WordData> words;
|
||||
SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
|
||||
@ -363,17 +366,7 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
}
|
||||
most_recently_used_ = this;
|
||||
// Run pass 2 word recognition.
|
||||
if (!RecogAllWordsPassN(2, monitor, &words)) return false;
|
||||
// Pass 2 post-processing.
|
||||
while (page_res_it.word() != NULL) {
|
||||
WERD_RES* word = page_res_it.word();
|
||||
if (word->word->flag(W_REP_CHAR) && !word->done) {
|
||||
fix_rep_char(&page_res_it);
|
||||
page_res_it.forward();
|
||||
continue;
|
||||
}
|
||||
page_res_it.forward();
|
||||
}
|
||||
if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
|
||||
}
|
||||
|
||||
// The next passes can only be run if tesseract has been used, as cube
|
||||
@ -407,8 +400,8 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
// ****************** Pass 9 *******************
|
||||
// Check the correctness of the final results.
|
||||
blamer_pass(page_res);
|
||||
script_pos_pass(page_res);
|
||||
}
|
||||
script_pos_pass(page_res);
|
||||
|
||||
// Write results pass.
|
||||
set_global_loc_code(LOC_WRITE_RESULTS);
|
||||
@ -745,166 +738,232 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
|
||||
}
|
||||
}
|
||||
|
||||
// Helper returns true if the new_word is better than the word, using a
|
||||
// simple test of better certainty AND rating (to reduce false positives
|
||||
// from cube) or a dictionary vs non-dictionary word.
|
||||
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word,
|
||||
double rating_ratio,
|
||||
double certainty_margin) {
|
||||
if (new_word.best_choice == NULL) {
|
||||
return false; // New one no good.
|
||||
// Factored helper considers the indexed word and updates all the pointed
|
||||
// values.
|
||||
static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
|
||||
float* rating, float* certainty, bool* bad,
|
||||
bool* valid_permuter, int* right, int* next_left) {
|
||||
*right = -MAX_INT32;
|
||||
*next_left = MAX_INT32;
|
||||
if (index < words.size()) {
|
||||
WERD_CHOICE* choice = words[index]->best_choice;
|
||||
if (choice == NULL) {
|
||||
*bad = true;
|
||||
} else {
|
||||
*rating += choice->rating();
|
||||
*certainty = MIN(*certainty, choice->certainty());
|
||||
if (!Dict::valid_word_permuter(choice->permuter(), false))
|
||||
*valid_permuter = false;
|
||||
}
|
||||
*right = words[index]->word->bounding_box().right();
|
||||
if (index + 1 < words.size())
|
||||
*next_left = words[index + 1]->word->bounding_box().left();
|
||||
} else {
|
||||
*valid_permuter = false;
|
||||
*bad = true;
|
||||
}
|
||||
if (word.best_choice == NULL) {
|
||||
return true; // Old one no good.
|
||||
}
|
||||
|
||||
// Helper chooses the best combination of words, transferring good ones from
|
||||
// new_words to best_words. To win, a new word must have (better rating and
|
||||
// certainty) or (better permuter status and rating within rating ratio and
|
||||
// certainty within certainty margin) than current best.
|
||||
// All the new_words are consumed (moved to best_words or deleted.)
|
||||
// The return value is the number of new_words used minus the number of
|
||||
// best_words that remain in the output.
|
||||
static int SelectBestWords(double rating_ratio,
|
||||
double certainty_margin,
|
||||
bool debug,
|
||||
PointerVector<WERD_RES>* new_words,
|
||||
PointerVector<WERD_RES>* best_words) {
|
||||
// Process the smallest groups of words that have an overlapping word
|
||||
// boundary at the end.
|
||||
GenericVector<WERD_RES*> out_words;
|
||||
// Index into each word vector (best, new).
|
||||
int b = 0, n = 0;
|
||||
int num_best = 0, num_new = 0;
|
||||
while (b < best_words->size() || n < new_words->size()) {
|
||||
// Start of the current run in each.
|
||||
int start_b = b, start_n = n;
|
||||
// Rating of the current run in each.
|
||||
float b_rating = 0.0f, n_rating = 0.0f;
|
||||
// Certainty of the current run in each.
|
||||
float b_certainty = 0.0f, n_certainty = 0.0f;
|
||||
// True if any word is missing its best choice.
|
||||
bool b_bad = false, n_bad = false;
|
||||
// True if all words have a valid permuter.
|
||||
bool b_valid_permuter = true, n_valid_permuter = true;
|
||||
|
||||
while (b < best_words->size() || n < new_words->size()) {
|
||||
int b_right = -MAX_INT32;
|
||||
int next_b_left = MAX_INT32;
|
||||
EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
|
||||
&b_valid_permuter, &b_right, &next_b_left);
|
||||
int n_right = -MAX_INT32;
|
||||
int next_n_left = MAX_INT32;
|
||||
EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
|
||||
&n_valid_permuter, &n_right, &next_n_left);
|
||||
if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
|
||||
// The word breaks overlap. [start_b,b] and [start_n, n] match.
|
||||
break;
|
||||
}
|
||||
// Keep searching for the matching word break.
|
||||
if ((b_right < n_right && b < best_words->size()) ||
|
||||
n == new_words->size())
|
||||
++b;
|
||||
else
|
||||
++n;
|
||||
}
|
||||
bool new_better = false;
|
||||
if (!n_bad && (b_bad || (n_certainty > b_certainty &&
|
||||
n_rating < b_rating) ||
|
||||
(!b_valid_permuter && n_valid_permuter &&
|
||||
n_rating < b_rating * rating_ratio &&
|
||||
n_certainty > b_certainty - certainty_margin))) {
|
||||
// New is better.
|
||||
for (int i = start_n; i <= n; ++i) {
|
||||
out_words.push_back((*new_words)[i]);
|
||||
(*new_words)[i] = NULL;
|
||||
++num_new;
|
||||
}
|
||||
new_better = true;
|
||||
} else if (!b_bad) {
|
||||
// Current best is better.
|
||||
for (int i = start_b; i <= b; ++i) {
|
||||
out_words.push_back((*best_words)[i]);
|
||||
(*best_words)[i] = NULL;
|
||||
++num_best;
|
||||
}
|
||||
}
|
||||
int end_b = b < best_words->size() ? b + 1 : b;
|
||||
int end_n = n < new_words->size() ? n + 1 : n;
|
||||
if (debug) {
|
||||
tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
|
||||
" valid dict: %d v %d\n",
|
||||
end_n - start_n, new_better ? "better" : "worse",
|
||||
end_b - start_b, n_rating, b_rating,
|
||||
n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
|
||||
}
|
||||
// Move on to the next group.
|
||||
b = end_b;
|
||||
n = end_n;
|
||||
}
|
||||
if (new_word.best_choice->certainty() > word.best_choice->certainty() &&
|
||||
new_word.best_choice->rating() < word.best_choice->rating()) {
|
||||
return true; // New word has better confidence.
|
||||
}
|
||||
if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
|
||||
Dict::valid_word_permuter(new_word.best_choice->permuter(), false) &&
|
||||
new_word.best_choice->rating() <
|
||||
word.best_choice->rating() * rating_ratio &&
|
||||
new_word.best_choice->certainty() >
|
||||
word.best_choice->certainty() - certainty_margin) {
|
||||
return true; // New word is from a dictionary.
|
||||
}
|
||||
return false; // New word is no better.
|
||||
// Transfer from out_words to best_words.
|
||||
best_words->clear();
|
||||
for (int i = 0; i < out_words.size(); ++i)
|
||||
best_words->push_back(out_words[i]);
|
||||
return num_new - num_best;
|
||||
}
|
||||
|
||||
// Helper to recognize the word using the given (language-specific) tesseract.
|
||||
// Returns true if the result was better than previously.
|
||||
bool Tesseract::RetryWithLanguage(const WERD_RES& best_word,
|
||||
WordData* word_data, WERD_RES* word,
|
||||
WordRecognizer recognizer) {
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
tprintf("Retrying word using lang %s, oem %d\n",
|
||||
// Returns positive if this recognizer found more new best words than the
|
||||
// number kept from best_words.
|
||||
int Tesseract::RetryWithLanguage(const WordData& word_data,
|
||||
WordRecognizer recognizer,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* best_words) {
|
||||
bool debug = classify_debug_level || cube_debug_level;
|
||||
if (debug) {
|
||||
tprintf("Trying word using lang %s, oem %d\n",
|
||||
lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
|
||||
}
|
||||
// Run the recognizer on the word.
|
||||
PointerVector<WERD_RES> new_words;
|
||||
(this->*recognizer)(word_data, in_word, &new_words);
|
||||
if (new_words.empty()) {
|
||||
// Transfer input word to new_words, as the classifier must have put
|
||||
// the result back in the input.
|
||||
new_words.push_back(*in_word);
|
||||
*in_word = NULL;
|
||||
}
|
||||
if (debug) {
|
||||
for (int i = 0; i < new_words.size(); ++i)
|
||||
new_words[i]->DebugTopChoice("Lang result");
|
||||
}
|
||||
// Initial version is a bit of a hack based on better certainty and rating
|
||||
// (to reduce false positives from cube) or a dictionary vs non-dictionary
|
||||
// word.
|
||||
(this->*recognizer)(word_data, word);
|
||||
bool new_is_better = NewWordBetter(best_word, *word,
|
||||
classify_max_rating_ratio,
|
||||
classify_max_certainty_margin);
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
if (word->best_choice == NULL) {
|
||||
tprintf("NULL result %s better!\n",
|
||||
new_is_better ? "IS" : "NOT");
|
||||
} else {
|
||||
tprintf("New result %s better:%s, r=%g, c=%g\n",
|
||||
new_is_better ? "IS" : "NOT",
|
||||
word->best_choice->unichar_string().string(),
|
||||
word->best_choice->rating(),
|
||||
word->best_choice->certainty());
|
||||
}
|
||||
return SelectBestWords(classify_max_rating_ratio,
|
||||
classify_max_certainty_margin,
|
||||
debug, &new_words, best_words);
|
||||
}
|
||||
|
||||
// Helper returns true if all the words are acceptable.
|
||||
static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
|
||||
for (int w = 0; w < words.size(); ++w) {
|
||||
if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
|
||||
}
|
||||
return new_is_better;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Generic function for classifying a word. Can be used either for pass1 or
|
||||
// pass2 according to the function passed to recognizer.
|
||||
// word block and row are the current location in the document's PAGE_RES.
|
||||
// word_data holds the word to be recognized, and its block and row, and
|
||||
// pr_it points to the word as well, in case we are running LSTM and it wants
|
||||
// to output multiple words.
|
||||
// Recognizes in the current language, and if successful that is all.
|
||||
// If recognition was not successful, tries all available languages until
|
||||
// it gets a successful result or runs out of languages. Keeps the best result.
|
||||
void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
PAGE_RES_IT* pr_it,
|
||||
WordData* word_data) {
|
||||
// Best result so far.
|
||||
PointerVector<WERD_RES> best_words;
|
||||
// Points to the best result. May be word or in lang_words.
|
||||
WERD_RES* word = word_data->word;
|
||||
clock_t start_t = clock();
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
tprintf("Processing word with lang %s at:",
|
||||
tprintf("%s word with lang %s at:",
|
||||
word->done ? "Already done" : "Processing",
|
||||
most_recently_used_->lang.string());
|
||||
word->word->bounding_box().print();
|
||||
}
|
||||
const char* result_type = "Initial";
|
||||
bool initially_done = !word->tess_failed && word->done;
|
||||
if (initially_done) {
|
||||
if (word->done) {
|
||||
// If done on pass1, leave it as-is.
|
||||
most_recently_used_ = word->tesseract;
|
||||
result_type = "Already done";
|
||||
} else {
|
||||
if (most_recently_used_ != this) {
|
||||
// Point to the word for most_recently_used_.
|
||||
for (int s = 0; s < sub_langs_.size(); ++s) {
|
||||
if (most_recently_used_ == sub_langs_[s]) {
|
||||
word = &word_data->lang_words[s];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
(most_recently_used_->*recognizer)(word_data, word);
|
||||
if (!word->tess_failed && word->tess_accepted)
|
||||
result_type = "Accepted";
|
||||
if (!word->tess_failed)
|
||||
most_recently_used_ = word->tesseract;
|
||||
return;
|
||||
}
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
|
||||
" xht=[%g,%g]\n",
|
||||
result_type,
|
||||
word->best_choice->unichar_string().string(),
|
||||
word->best_choice->rating(),
|
||||
word->best_choice->certainty(),
|
||||
word->tess_accepted, word->tess_would_adapt,
|
||||
word->best_choice->min_x_height(),
|
||||
word->best_choice->max_x_height());
|
||||
int sub = sub_langs_.size();
|
||||
if (most_recently_used_ != this) {
|
||||
// Get the index of the most_recently_used_.
|
||||
for (sub = 0; sub < sub_langs_.size() &&
|
||||
most_recently_used_ != sub_langs_[sub]; ++sub) {}
|
||||
}
|
||||
if (word->tess_failed || !word->tess_accepted) {
|
||||
most_recently_used_->RetryWithLanguage(
|
||||
*word_data, recognizer, &word_data->lang_words[sub], &best_words);
|
||||
Tesseract* best_lang_tess = most_recently_used_;
|
||||
if (!WordsAcceptable(best_words)) {
|
||||
// Try all the other languages to see if they are any better.
|
||||
Tesseract* previous_used = most_recently_used_;
|
||||
if (most_recently_used_ != this) {
|
||||
if (classify_debug_level) {
|
||||
tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
|
||||
}
|
||||
if (word_data->word->tesseract == this) {
|
||||
// This is pass1, and we are trying the main language.
|
||||
if (RetryWithLanguage(*word, word_data, word_data->word, recognizer)) {
|
||||
most_recently_used_ = this;
|
||||
word = word_data->word;
|
||||
}
|
||||
} else {
|
||||
// This is pass2, and we are trying the main language again, but it
|
||||
// has no word allocated to it, so we must re-initialize it.
|
||||
WERD_RES main_word(*word_data->word);
|
||||
main_word.InitForRetryRecognition(*word_data->word);
|
||||
main_word.SetupForRecognition(unicharset, this, BestPix(),
|
||||
tessedit_ocr_engine_mode, NULL,
|
||||
classify_bln_numeric_mode,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
word_data->row, word_data->block);
|
||||
if (RetryWithLanguage(*word, word_data, &main_word, recognizer)) {
|
||||
most_recently_used_ = this;
|
||||
word_data->word->ConsumeWordResults(&main_word);
|
||||
word = word_data->word;
|
||||
}
|
||||
}
|
||||
if (!word->tess_failed && word->tess_accepted)
|
||||
return; // No need to look at the others.
|
||||
if (most_recently_used_ != this &&
|
||||
this->RetryWithLanguage(*word_data, recognizer,
|
||||
&word_data->lang_words[sub_langs_.size()],
|
||||
&best_words) > 0) {
|
||||
best_lang_tess = this;
|
||||
}
|
||||
|
||||
for (int i = 0; i < sub_langs_.size(); ++i) {
|
||||
if (sub_langs_[i] != previous_used) {
|
||||
if (classify_debug_level) {
|
||||
tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
|
||||
i, sub_langs_[i]->lang.string());
|
||||
}
|
||||
if (sub_langs_[i]->RetryWithLanguage(*word, word_data,
|
||||
&word_data->lang_words[i],
|
||||
recognizer)) {
|
||||
most_recently_used_ = sub_langs_[i];
|
||||
word = &word_data->lang_words[i];
|
||||
if (!word->tess_failed && word->tess_accepted)
|
||||
break; // No need to look at the others.
|
||||
}
|
||||
for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
|
||||
++i) {
|
||||
if (most_recently_used_ != sub_langs_[i] &&
|
||||
sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
|
||||
&word_data->lang_words[i],
|
||||
&best_words) > 0) {
|
||||
best_lang_tess = sub_langs_[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
if (word != word_data->word) {
|
||||
// Move the result for the best language to the main word.
|
||||
word_data->word->ConsumeWordResults(word);
|
||||
most_recently_used_ = best_lang_tess;
|
||||
if (!best_words.empty()) {
|
||||
if (best_words.size() == 1 && !best_words[0]->combination) {
|
||||
// Move the best single result to the main word.
|
||||
word_data->word->ConsumeWordResults(best_words[0]);
|
||||
} else {
|
||||
// Words came from LSTM, and must be moved to the PAGE_RES properly.
|
||||
word_data->word = best_words.back();
|
||||
pr_it->ReplaceCurrentWord(&best_words);
|
||||
}
|
||||
ASSERT_HOST(word_data->word->box_word != NULL);
|
||||
} else {
|
||||
tprintf("no best words!!\n");
|
||||
}
|
||||
clock_t ocr_t = clock();
|
||||
if (tessedit_timing_debug) {
|
||||
@ -920,16 +979,19 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
* Baseline normalize the word and pass it to Tess.
|
||||
*/
|
||||
|
||||
void Tesseract::classify_word_pass1(WordData* word_data, WERD_RES* word) {
|
||||
ROW* row = word_data->row;
|
||||
BLOCK* block = word_data->block;
|
||||
prev_word_best_choice_ = word_data->prev_word != NULL
|
||||
? word_data->prev_word->word->best_choice : NULL;
|
||||
void Tesseract::classify_word_pass1(const WordData& word_data,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* out_words) {
|
||||
ROW* row = word_data.row;
|
||||
BLOCK* block = word_data.block;
|
||||
prev_word_best_choice_ = word_data.prev_word != NULL
|
||||
? word_data.prev_word->word->best_choice : NULL;
|
||||
// If we only intend to run cube - run it and return.
|
||||
if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
|
||||
cube_word_pass1(block, row, word);
|
||||
cube_word_pass1(block, row, *in_word);
|
||||
return;
|
||||
}
|
||||
WERD_RES* word = *in_word;
|
||||
match_word_pass_n(1, word, row, block);
|
||||
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
|
||||
word->tess_would_adapt = AdaptableWord(word);
|
||||
@ -1027,19 +1089,23 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
|
||||
* Control what to do with the word in pass 2
|
||||
*/
|
||||
|
||||
void Tesseract::classify_word_pass2(WordData* word_data, WERD_RES* word) {
|
||||
void Tesseract::classify_word_pass2(const WordData& word_data,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* out_words) {
|
||||
// Return if we do not want to run Tesseract.
|
||||
if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY &&
|
||||
tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED)
|
||||
tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED &&
|
||||
word_data.word->best_choice != NULL)
|
||||
return;
|
||||
ROW* row = word_data->row;
|
||||
BLOCK* block = word_data->block;
|
||||
prev_word_best_choice_ = word_data->prev_word != NULL
|
||||
? word_data->prev_word->word->best_choice : NULL;
|
||||
ROW* row = word_data.row;
|
||||
BLOCK* block = word_data.block;
|
||||
WERD_RES* word = *in_word;
|
||||
prev_word_best_choice_ = word_data.prev_word != NULL
|
||||
? word_data.prev_word->word->best_choice : NULL;
|
||||
|
||||
set_global_subloc_code(SUBLOC_NORM);
|
||||
check_debug_pt(word, 30);
|
||||
if (!word->done || tessedit_training_tess) {
|
||||
if (!word->done) {
|
||||
word->caps_height = 0.0;
|
||||
if (word->x_height == 0.0f)
|
||||
word->x_height = row->x_height();
|
||||
@ -1161,11 +1227,9 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
|
||||
const WERD_CHOICE &word = *(word_res->best_choice);
|
||||
|
||||
// Find the frequency of each unique character in the word.
|
||||
UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
|
||||
SortHelper<UNICHAR_ID> rep_ch(word.length());
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
if (word.unichar_id(i) != space)
|
||||
rep_ch.Add(word.unichar_id(i), 1);
|
||||
rep_ch.Add(word.unichar_id(i), 1);
|
||||
}
|
||||
|
||||
// Find the most frequent result.
|
||||
@ -1194,51 +1258,9 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
|
||||
++gap_count;
|
||||
prev_blob = blob;
|
||||
}
|
||||
if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) {
|
||||
// Needs spaces between.
|
||||
ExplodeRepeatedWord(best_choice, page_res_it);
|
||||
} else {
|
||||
// Just correct existing classification.
|
||||
CorrectRepcharChoices(best_choice, word_res);
|
||||
word_res->reject_map.initialise(word.length());
|
||||
}
|
||||
}
|
||||
|
||||
// Explode the word at the given iterator location into individual words
|
||||
// of a single given unichar_id defined by best_choice.
|
||||
// The original word is deleted, and the replacements copy most of their
|
||||
// fields from the original.
|
||||
void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
|
||||
PAGE_RES_IT* page_res_it) {
|
||||
WERD_RES *word_res = page_res_it->word();
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
|
||||
// Make a new word for each blob in the original.
|
||||
WERD* werd = word_res->word;
|
||||
C_BLOB_IT blob_it(werd->cblob_list());
|
||||
for (; !blob_it.empty(); blob_it.forward()) {
|
||||
bool first_blob = blob_it.at_first();
|
||||
bool last_blob = blob_it.at_last();
|
||||
WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
|
||||
blob_it.extract());
|
||||
// Note that blamer_bundle (truth information) is not copied, which is
|
||||
// desirable, since the newly inserted words would not have the original
|
||||
// bounding box corresponding to the one recorded in truth fields.
|
||||
WERD_RES* rep_word =
|
||||
page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
|
||||
// Setup the single char WERD_RES
|
||||
if (rep_word->SetupForRecognition(*word_res->uch_set, this, BestPix(),
|
||||
tessedit_ocr_engine_mode, NULL, false,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
page_res_it->row()->row,
|
||||
page_res_it->block()->block)) {
|
||||
rep_word->CloneChoppedToRebuild();
|
||||
BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
|
||||
rep_word->FakeClassifyWord(1, &blob_choice);
|
||||
}
|
||||
}
|
||||
page_res_it->DeleteCurrentWord();
|
||||
// Just correct existing classification.
|
||||
CorrectRepcharChoices(best_choice, word_res);
|
||||
word_res->reject_map.initialise(word.length());
|
||||
}
|
||||
|
||||
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
|
||||
@ -1405,16 +1427,19 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
|
||||
show_map_detail = TRUE;
|
||||
break;
|
||||
}
|
||||
tprintf(" \"%s\" ",
|
||||
word->best_choice->unichar_string().string());
|
||||
word->reject_map.print (debug_fp);
|
||||
tprintf ("\n");
|
||||
if (show_map_detail) {
|
||||
tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
|
||||
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
|
||||
tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
|
||||
word->reject_map[i].full_print(debug_fp);
|
||||
if (word->best_choice != NULL) {
|
||||
tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
|
||||
word->reject_map.print(debug_fp);
|
||||
tprintf("\n");
|
||||
if (show_map_detail) {
|
||||
tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
|
||||
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
|
||||
tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
|
||||
word->reject_map[i].full_print(debug_fp);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tprintf("null best choice\n");
|
||||
}
|
||||
tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
|
||||
tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
|
||||
|
@ -205,7 +205,8 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
|
||||
if ((!word->part_of_combo) && (word->box_word == NULL)) {
|
||||
WordData word_data(block, row, word);
|
||||
SetupWordPassN(2, &word_data);
|
||||
classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
|
||||
classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
|
||||
&word_data);
|
||||
}
|
||||
prev_word_best_choice_ = word->best_choice;
|
||||
}
|
||||
|
@ -30,15 +30,12 @@ namespace tesseract {
|
||||
void Tesseract::process_selected_words(
|
||||
PAGE_RES* page_res, // blocks to check
|
||||
TBOX & selection_box,
|
||||
BOOL8(tesseract::Tesseract::*word_processor)( // function to call
|
||||
BLOCK* block, ROW* row, WERD_RES* word_res)) {
|
||||
BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)) {
|
||||
for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
|
||||
page_res_it.forward()) {
|
||||
WERD* word = page_res_it.word()->word;
|
||||
if (word->bounding_box().overlap(selection_box)) {
|
||||
if (!((this->*word_processor)(page_res_it.block()->block,
|
||||
page_res_it.row()->row,
|
||||
page_res_it.word())))
|
||||
if (!(this->*word_processor)(&page_res_it))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -39,13 +39,11 @@ void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) {
|
||||
for (int w = 0; w < words.size(); ++w) {
|
||||
if (words[w].word->ratings != NULL &&
|
||||
words[w].word->ratings->get(0, 0) == NULL) {
|
||||
for (int b = 0; b < words[w].word->chopped_word->NumBlobs(); ++b) {
|
||||
blobs.push_back(BlobData(b, this, *words[w].word));
|
||||
}
|
||||
for (int s = 0; s < words[w].lang_words.size(); ++s) {
|
||||
const WERD_RES& word = words[w].lang_words[s];
|
||||
Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
|
||||
const WERD_RES& word = *words[w].lang_words[s];
|
||||
for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
|
||||
blobs.push_back(BlobData(b, sub_langs_[s], word));
|
||||
blobs.push_back(BlobData(b, sub, word));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -306,10 +306,7 @@ SVMenuNode *Tesseract::build_menu_new() {
|
||||
* Redisplay page
|
||||
*/
|
||||
void Tesseract::do_re_display(
|
||||
BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block,
|
||||
ROW* row,
|
||||
WERD_RES* word_res)) {
|
||||
PAGE_RES_IT pr_it(current_page_res);
|
||||
BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it)) {
|
||||
int block_count = 1;
|
||||
|
||||
image_win->Clear();
|
||||
@ -317,8 +314,9 @@ void Tesseract::do_re_display(
|
||||
image_win->Image(pix_binary_, 0, 0);
|
||||
}
|
||||
|
||||
PAGE_RES_IT pr_it(current_page_res);
|
||||
for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
|
||||
(this->*word_painter)(pr_it.block()->block, pr_it.row()->row, word);
|
||||
(this->*word_painter)(&pr_it);
|
||||
if (display_baselines && pr_it.row() != pr_it.prev_row())
|
||||
pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
|
||||
if (display_blocks && pr_it.block() != pr_it.prev_block())
|
||||
@ -714,11 +712,10 @@ void show_point(PAGE_RES* page_res, float x, float y) {
|
||||
#endif // GRAPHICS_DISABLED
|
||||
namespace tesseract {
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row,
|
||||
WERD_RES* word_res) {
|
||||
word_res->word->bounding_box().plot(image_win, ScrollView::BLACK,
|
||||
ScrollView::BLACK);
|
||||
return word_set_display(block, row, word_res);
|
||||
BOOL8 Tesseract:: word_blank_and_set_display(PAGE_RES_IT* pr_it) {
|
||||
pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
|
||||
ScrollView::BLACK);
|
||||
return word_set_display(pr_it);
|
||||
}
|
||||
|
||||
|
||||
@ -727,7 +724,8 @@ BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row,
|
||||
*
|
||||
* Normalize word and display in word window
|
||||
*/
|
||||
BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
BOOL8 Tesseract::word_bln_display(PAGE_RES_IT* pr_it) {
|
||||
WERD_RES* word_res = pr_it->word();
|
||||
if (word_res->chopped_word == NULL) {
|
||||
// Setup word normalization parameters.
|
||||
word_res->SetupForRecognition(unicharset, this, BestPix(),
|
||||
@ -735,7 +733,7 @@ BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
classify_bln_numeric_mode,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
row, block);
|
||||
pr_it->row()->row, pr_it->block()->block);
|
||||
}
|
||||
bln_word_window_handle()->Clear();
|
||||
display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
|
||||
@ -758,7 +756,8 @@ BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
*
|
||||
* Display a word according to its display modes
|
||||
*/
|
||||
BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
BOOL8 Tesseract::word_display(PAGE_RES_IT* pr_it) {
|
||||
WERD_RES* word_res = pr_it->word();
|
||||
WERD* word = word_res->word;
|
||||
TBOX word_bb; // word bounding box
|
||||
int word_height; // ht of word BB
|
||||
@ -918,14 +917,15 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
*
|
||||
* Dump members to the debug window
|
||||
*/
|
||||
BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
if (block != NULL) {
|
||||
BOOL8 Tesseract::word_dumper(PAGE_RES_IT* pr_it) {
|
||||
if (pr_it->block()->block != NULL) {
|
||||
tprintf("\nBlock data...\n");
|
||||
block->print(NULL, FALSE);
|
||||
pr_it->block()->block->print(NULL, FALSE);
|
||||
}
|
||||
tprintf("\nRow data...\n");
|
||||
row->print(NULL);
|
||||
pr_it->row()->row->print(NULL);
|
||||
tprintf("\nWord data...\n");
|
||||
WERD_RES* word_res = pr_it->word();
|
||||
word_res->word->print();
|
||||
if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
|
||||
word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
|
||||
@ -941,8 +941,8 @@ BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
*
|
||||
* Display word according to current display mode settings
|
||||
*/
|
||||
BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
WERD* word = word_res->word;
|
||||
BOOL8 Tesseract::word_set_display(PAGE_RES_IT* pr_it) {
|
||||
WERD* word = pr_it->word()->word;
|
||||
word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
|
||||
word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
|
||||
word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
|
||||
@ -950,26 +950,24 @@ BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
|
||||
word->set_display_flag(DF_BN_POLYGONAL,
|
||||
word_display_mode.bit(DF_BN_POLYGONAL));
|
||||
word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));
|
||||
return word_display(block, row, word_res);
|
||||
return word_display(pr_it);
|
||||
}
|
||||
|
||||
// page_res is non-const because the iterator doesn't know if you are going
|
||||
// to change the items it points to! Really a const here though.
|
||||
void Tesseract::blob_feature_display(PAGE_RES* page_res,
|
||||
const TBOX& selection_box) {
|
||||
ROW* row; // row of word
|
||||
BLOCK* block; // block of word
|
||||
WERD* word = make_pseudo_word(page_res, selection_box, block, row);
|
||||
if (word != NULL) {
|
||||
WERD_RES word_res(word);
|
||||
word_res.x_height = row->x_height();
|
||||
word_res.SetupForRecognition(unicharset, this, BestPix(),
|
||||
tessedit_ocr_engine_mode, NULL,
|
||||
classify_bln_numeric_mode,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
row, block);
|
||||
TWERD* bln_word = word_res.chopped_word;
|
||||
PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
|
||||
if (it != NULL) {
|
||||
WERD_RES* word_res = it->word();
|
||||
word_res->x_height = it->row()->row->x_height();
|
||||
word_res->SetupForRecognition(unicharset, this, BestPix(),
|
||||
tessedit_ocr_engine_mode, NULL,
|
||||
classify_bln_numeric_mode,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
it->row()->row, it->block()->block);
|
||||
TWERD* bln_word = word_res->chopped_word;
|
||||
TBLOB* bln_blob = bln_word->blobs[0];
|
||||
INT_FX_RESULT_STRUCT fx_info;
|
||||
GenericVector<INT_FEATURE_STRUCT> bl_features;
|
||||
@ -989,7 +987,8 @@ void Tesseract::blob_feature_display(PAGE_RES* page_res,
|
||||
RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
|
||||
cn_win->Update();
|
||||
|
||||
delete word;
|
||||
it->DeleteCurrentWord();
|
||||
delete it;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -51,15 +51,11 @@ FILE *Tesseract::init_recog_training(const STRING &fname) {
|
||||
|
||||
// Copies the bounding box from page_res_it->word() to the given TBOX.
|
||||
bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
|
||||
while (page_res_it->block() != NULL) {
|
||||
if (page_res_it->word() != NULL)
|
||||
break;
|
||||
while (page_res_it->block() != NULL && page_res_it->word() == NULL)
|
||||
page_res_it->forward();
|
||||
}
|
||||
|
||||
if (page_res_it->word() != NULL) {
|
||||
*tbox = page_res_it->word()->word->bounding_box();
|
||||
page_res_it->forward();
|
||||
|
||||
// If tbox->left() is negative, the training image has vertical text and
|
||||
// all the coordinates of bounding boxes of page_res are rotated by 90
|
||||
@ -109,26 +105,34 @@ void Tesseract::recog_training_segmented(const STRING &fname,
|
||||
// Align bottom left points of the TBOXes.
|
||||
while (keep_going &&
|
||||
!NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
|
||||
keep_going = (bbox.bottom() < tbox.bottom()) ?
|
||||
read_t(&page_res_it, &tbox) :
|
||||
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
||||
if (bbox.bottom() < tbox.bottom()) {
|
||||
page_res_it.forward();
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
} else {
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
|
||||
&bbox);
|
||||
}
|
||||
}
|
||||
while (keep_going &&
|
||||
!NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
|
||||
keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
|
||||
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
||||
if (bbox.left() > tbox.left()) {
|
||||
page_res_it.forward();
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
} else {
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
|
||||
&bbox);
|
||||
}
|
||||
}
|
||||
// OCR the word if top right points of the TBOXes are similar.
|
||||
if (keep_going &&
|
||||
NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
|
||||
NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
|
||||
ambigs_classify_and_output(page_res_it.prev_word(),
|
||||
page_res_it.prev_row(),
|
||||
page_res_it.prev_block(),
|
||||
label.string(), output_file);
|
||||
ambigs_classify_and_output(label.string(), &page_res_it, output_file);
|
||||
examined_words++;
|
||||
}
|
||||
page_res_it.forward();
|
||||
} while (keep_going);
|
||||
fclose(box_file);
|
||||
|
||||
// Set up scripts on all of the words that did not get sent to
|
||||
// ambigs_classify_and_output. They all should have, but if all the
|
||||
@ -196,16 +200,16 @@ static void PrintMatrixPaths(int col, int dim,
|
||||
// raw choice as a result of the classification. For words labeled with a
|
||||
// single unichar also outputs all alternatives from blob_choices of the
|
||||
// best choice.
|
||||
void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
|
||||
ROW_RES *row_res,
|
||||
BLOCK_RES *block_res,
|
||||
const char *label,
|
||||
void Tesseract::ambigs_classify_and_output(const char *label,
|
||||
PAGE_RES_IT* pr_it,
|
||||
FILE *output_file) {
|
||||
// Classify word.
|
||||
fflush(stdout);
|
||||
WordData word_data(block_res->block, row_res->row, werd_res);
|
||||
WordData word_data(*pr_it);
|
||||
SetupWordPassN(1, &word_data);
|
||||
classify_word_pass1(&word_data, werd_res);
|
||||
classify_word_and_language(&Tesseract::classify_word_pass1,
|
||||
pr_it, &word_data);
|
||||
WERD_RES* werd_res = word_data.word;
|
||||
WERD_CHOICE *best_choice = werd_res->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
|
||||
|
@ -96,8 +96,6 @@ Tesseract::Tesseract()
|
||||
" whose outlines overlap horizontally.", this->params()),
|
||||
BOOL_MEMBER(tessedit_display_outwords, false,
|
||||
"Draw output words", this->params()),
|
||||
BOOL_MEMBER(tessedit_training_tess, false,
|
||||
"Call Tess to learn blobs", this->params()),
|
||||
BOOL_MEMBER(tessedit_dump_choices, false,
|
||||
"Dump char choices", this->params()),
|
||||
BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
|
||||
@ -315,16 +313,6 @@ Tesseract::Tesseract()
|
||||
"Write .html hOCR output file", this->params()),
|
||||
BOOL_MEMBER(tessedit_create_pdf, false,
|
||||
"Write .pdf output file", this->params()),
|
||||
INT_MEMBER(tessedit_pdf_compression, 0,
|
||||
"Type of image compression in pdf output: "
|
||||
"0 - autoselection (default); "
|
||||
"1 - jpeg; "
|
||||
"2 - G4; "
|
||||
"3 - flate",
|
||||
this->params()),
|
||||
INT_MEMBER(tessedit_pdf_jpg_quality, 85,
|
||||
"Quality level of jpeg image compression in pdf output",
|
||||
this->params()),
|
||||
STRING_MEMBER(unrecognised_char, "|",
|
||||
"Output char for unidentified blobs", this->params()),
|
||||
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
|
||||
|
@ -31,20 +31,20 @@
|
||||
#include "textord.h"
|
||||
#include "wordrec.h"
|
||||
|
||||
class PAGE_RES;
|
||||
class PAGE_RES_IT;
|
||||
class BLOB_CHOICE_LIST_CLIST;
|
||||
class BLOCK_LIST;
|
||||
class CharSamp;
|
||||
class TO_BLOCK_LIST;
|
||||
class WERD_RES;
|
||||
class ROW;
|
||||
class TBOX;
|
||||
class SVMenuNode;
|
||||
struct Pix;
|
||||
class WERD_CHOICE;
|
||||
class WERD;
|
||||
class BLOB_CHOICE_LIST_CLIST;
|
||||
struct OSResults;
|
||||
class PAGE_RES;
|
||||
class PAGE_RES_IT;
|
||||
struct Pix;
|
||||
class ROW;
|
||||
class SVMenuNode;
|
||||
class TBOX;
|
||||
class TO_BLOCK_LIST;
|
||||
class WERD;
|
||||
class WERD_CHOICE;
|
||||
class WERD_RES;
|
||||
|
||||
|
||||
// Top-level class for all tesseract global instance data.
|
||||
@ -144,10 +144,19 @@ struct WordData {
|
||||
ROW* row;
|
||||
BLOCK* block;
|
||||
WordData* prev_word;
|
||||
GenericVector<WERD_RES> lang_words;
|
||||
PointerVector<WERD_RES> lang_words;
|
||||
};
|
||||
|
||||
typedef void (Tesseract::*WordRecognizer)(WordData* word_data, WERD_RES* word);
|
||||
// Definition of a Tesseract WordRecognizer. The WordData provides the context
|
||||
// of row/block, in_word holds an initialized, possibly pre-classified word,
|
||||
// that the recognizer may or may not consume (but if so it sets *in_word=NULL)
|
||||
// and produces one or more output words in out_words, which may be the
|
||||
// consumed in_word, or may be generated independently.
|
||||
// This api allows both a conventional tesseract classifier to work, or a
|
||||
// line-level classifier that generates multiple words from a merged input.
|
||||
typedef void (Tesseract::*WordRecognizer)(const WordData& word_data,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* out_words);
|
||||
|
||||
class Tesseract : public Wordrec {
|
||||
public:
|
||||
@ -279,6 +288,7 @@ class Tesseract : public Wordrec {
|
||||
void SetupWordPassN(int pass_n, WordData* word);
|
||||
// Runs word recognition on all the words.
|
||||
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
|
||||
PAGE_RES_IT* pr_it,
|
||||
GenericVector<WordData>* words);
|
||||
bool recog_all_words(PAGE_RES* page_res,
|
||||
ETEXT_DESC* monitor,
|
||||
@ -294,28 +304,35 @@ class Tesseract : public Wordrec {
|
||||
// Sets script positions and detects smallcaps on all output words.
|
||||
void script_pos_pass(PAGE_RES* page_res);
|
||||
// Helper to recognize the word using the given (language-specific) tesseract.
|
||||
// Returns true if the result was better than previously.
|
||||
bool RetryWithLanguage(const WERD_RES& best_word, WordData* word_data,
|
||||
WERD_RES* word, WordRecognizer recognizer);
|
||||
// Returns positive if this recognizer found more new best words than the
|
||||
// number kept from best_words.
|
||||
int RetryWithLanguage(const WordData& word_data,
|
||||
WordRecognizer recognizer,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* best_words);
|
||||
void classify_word_and_language(WordRecognizer recognizer,
|
||||
PAGE_RES_IT* pr_it,
|
||||
WordData* word_data);
|
||||
void classify_word_pass1(WordData* word_data, WERD_RES* word);
|
||||
void classify_word_pass1(const WordData& word_data,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* out_words);
|
||||
void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
|
||||
TBOX &selection_box);
|
||||
|
||||
void fix_rep_char(PAGE_RES_IT* page_res_it);
|
||||
void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it);
|
||||
|
||||
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
|
||||
const char *s,
|
||||
const char *lengths);
|
||||
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
|
||||
void classify_word_pass2(WordData* word_data, WERD_RES* word);
|
||||
void classify_word_pass2(const WordData& word_data,
|
||||
WERD_RES** in_word,
|
||||
PointerVector<WERD_RES>* out_words);
|
||||
void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
|
||||
WERD_RES* word, WERD_RES* new_word);
|
||||
bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
|
||||
bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
|
||||
BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res);
|
||||
BOOL8 recog_interactive(PAGE_RES_IT* pr_it);
|
||||
|
||||
// Set fonts of this word.
|
||||
void set_word_fonts(WERD_RES *word);
|
||||
@ -473,15 +490,13 @@ class Tesseract : public Wordrec {
|
||||
);
|
||||
void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
|
||||
void do_re_display(
|
||||
BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block,
|
||||
ROW* row,
|
||||
WERD_RES* word_res));
|
||||
BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res);
|
||||
BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res);
|
||||
BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
|
||||
BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
|
||||
BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
|
||||
BOOL8 word_display(PAGE_RES_IT* pr_it);
|
||||
BOOL8 word_bln_display(PAGE_RES_IT* pr_it);
|
||||
BOOL8 word_blank_and_set_display(PAGE_RES_IT* pr_its);
|
||||
BOOL8 word_set_display(PAGE_RES_IT* pr_it);
|
||||
// #ifndef GRAPHICS_DISABLED
|
||||
BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res);
|
||||
BOOL8 word_dumper(PAGE_RES_IT* pr_it);
|
||||
// #endif // GRAPHICS_DISABLED
|
||||
void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
|
||||
//// reject.h //////////////////////////////////////////////////////////
|
||||
@ -537,10 +552,7 @@ class Tesseract : public Wordrec {
|
||||
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
|
||||
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
|
||||
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
|
||||
void fix_fuzzy_space_list( //space explorer
|
||||
WERD_RES_LIST &best_perm,
|
||||
ROW *row,
|
||||
BLOCK* block);
|
||||
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
|
||||
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
|
||||
void fix_fuzzy_spaces( //find fuzzy words
|
||||
ETEXT_DESC *monitor, //progress monitor
|
||||
@ -583,9 +595,7 @@ class Tesseract : public Wordrec {
|
||||
PAGE_RES* page_res, // blocks to check
|
||||
//function to call
|
||||
TBOX & selection_box,
|
||||
BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block,
|
||||
ROW* row,
|
||||
WERD_RES* word_res));
|
||||
BOOL8 (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
|
||||
//// tessbox.cpp ///////////////////////////////////////////////////////
|
||||
void tess_add_doc_word( //test acceptability
|
||||
WERD_CHOICE *word_choice //after context
|
||||
@ -752,7 +762,6 @@ class Tesseract : public Wordrec {
|
||||
"Each bounding box is assumed to contain ngrams. Only"
|
||||
" learn the ngrams whose outlines overlap horizontally.");
|
||||
BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
|
||||
BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs");
|
||||
BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
|
||||
BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
|
||||
BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
|
||||
@ -908,13 +917,6 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
|
||||
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
|
||||
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
||||
INT_VAR_H(tessedit_pdf_compression, 0, "Type of image encoding in pdf output:"
|
||||
"0 - autoselection (default); "
|
||||
"1 - jpeg; "
|
||||
"2 - G4; "
|
||||
"3 - flate");
|
||||
INT_VAR_H(tessedit_pdf_jpg_quality, 85, "Quality level of jpeg image "
|
||||
"compression in pdf output");
|
||||
STRING_VAR_H(unrecognised_char, "|",
|
||||
"Output char for unidentified blobs");
|
||||
INT_VAR_H(suspect_level, 99, "Suspect marker level");
|
||||
@ -1046,10 +1048,8 @@ class Tesseract : public Wordrec {
|
||||
PAGE_RES *page_res,
|
||||
volatile ETEXT_DESC *monitor,
|
||||
FILE *output_file);
|
||||
void ambigs_classify_and_output(WERD_RES *werd_res,
|
||||
ROW_RES *row_res,
|
||||
BLOCK_RES *block_res,
|
||||
const char *label,
|
||||
void ambigs_classify_and_output(const char *label,
|
||||
PAGE_RES_IT* pr_it,
|
||||
FILE *output_file);
|
||||
|
||||
inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }
|
||||
|
@ -171,7 +171,7 @@ void ImageThresholder::SetImage(const Pix* pix) {
|
||||
// Threshold the source image as efficiently as possible to the output Pix.
|
||||
// Creates a Pix and sets pix to point to the resulting pointer.
|
||||
// Caller must use pixDestroy to free the created Pix.
|
||||
void ImageThresholder::ThresholdToPix(Pix** pix) {
|
||||
void ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) {
|
||||
if (pix_channels_ == 0) {
|
||||
// We have a binary image, so it just has to be cloned.
|
||||
*pix = GetPixRect();
|
||||
|
@ -20,7 +20,8 @@
|
||||
#ifndef TESSERACT_CCMAIN_THRESHOLDER_H__
|
||||
#define TESSERACT_CCMAIN_THRESHOLDER_H__
|
||||
|
||||
#include "platform.h"
|
||||
#include "platform.h"
|
||||
#include "publictypes.h"
|
||||
|
||||
struct Pix;
|
||||
|
||||
@ -116,7 +117,7 @@ class TESS_API ImageThresholder {
|
||||
/// Threshold the source image as efficiently as possible to the output Pix.
|
||||
/// Creates a Pix and sets pix to point to the resulting pointer.
|
||||
/// Caller must use pixDestroy to free the created Pix.
|
||||
virtual void ThresholdToPix(Pix** pix);
|
||||
virtual void ThresholdToPix(PageSegMode pageseg_mode, Pix** pix);
|
||||
|
||||
// Gets a pix that contains an 8 bit threshold value at each pixel. The
|
||||
// returned pix may be an integer reduction of the binary image such that
|
||||
|
@ -23,17 +23,15 @@
|
||||
* make_pseudo_word
|
||||
*
|
||||
* Make all the blobs inside a selection into a single word.
|
||||
* The word is always a copy and needs to be deleted.
|
||||
* The returned PAGE_RES_IT* it points to the new word. After use, call
|
||||
* it->DeleteCurrentWord() to delete the fake word, and then
|
||||
* delete it to get rid of the iterator itself.
|
||||
**********************************************************************/
|
||||
|
||||
WERD *make_pseudo_word(PAGE_RES* page_res, // Blocks to check.
|
||||
const TBOX &selection_box,
|
||||
BLOCK *&pseudo_block,
|
||||
ROW *&pseudo_row) { // Row of selection.
|
||||
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
C_BLOB_LIST new_blobs; // list of gathered blobs
|
||||
C_BLOB_IT new_blob_it = &new_blobs; // iterator
|
||||
WERD *pseudo_word; // fabricated word
|
||||
|
||||
for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
|
||||
word_res = pr_it.forward()) {
|
||||
@ -45,15 +43,17 @@ WERD *make_pseudo_word(PAGE_RES* page_res, // Blocks to check.
|
||||
C_BLOB* blob = blob_it.data();
|
||||
if (blob->bounding_box().overlap(selection_box)) {
|
||||
new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
|
||||
pseudo_row = pr_it.row()->row;
|
||||
pseudo_block = pr_it.block()->block;
|
||||
}
|
||||
}
|
||||
if (!new_blobs.empty()) {
|
||||
WERD* pseudo_word = new WERD(&new_blobs, 1, NULL);
|
||||
word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
|
||||
PAGE_RES_IT* it = new PAGE_RES_IT(page_res);
|
||||
while (it->word() != word_res && it->word() != NULL) it->forward();
|
||||
ASSERT_HOST(it->word() == word_res);
|
||||
return it;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!new_blobs.empty())
|
||||
pseudo_word = new WERD(&new_blobs, 1, NULL);
|
||||
else
|
||||
pseudo_word = NULL;
|
||||
return pseudo_word;
|
||||
return NULL;
|
||||
}
|
||||
|
@ -22,9 +22,6 @@
|
||||
|
||||
#include "pageres.h"
|
||||
|
||||
WERD *make_pseudo_word(PAGE_RES* page_res, // blocks to check
|
||||
const TBOX &selection_box,
|
||||
BLOCK *&pseudo_block,
|
||||
ROW *&pseudo_row);
|
||||
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box);
|
||||
|
||||
#endif
|
||||
|
@ -157,6 +157,13 @@ void BoxWord::InsertBox(int index, const TBOX& box) {
|
||||
ComputeBoundingBox();
|
||||
}
|
||||
|
||||
// Changes the box at the given index to the new box.
|
||||
// Recomputes the bounding box.
|
||||
void BoxWord::ChangeBox(int index, const TBOX& box) {
|
||||
boxes_[index] = box;
|
||||
ComputeBoundingBox();
|
||||
}
|
||||
|
||||
// Deletes the box with the given index, and shuffles up the rest.
|
||||
// Recomputes the bounding box.
|
||||
void BoxWord::DeleteBox(int index) {
|
||||
|
@ -63,6 +63,10 @@ class BoxWord {
|
||||
// Recomputes the bounding box.
|
||||
void InsertBox(int index, const TBOX& box);
|
||||
|
||||
// Changes the box at the given index to the new box.
|
||||
// Recomputes the bounding box.
|
||||
void ChangeBox(int index, const TBOX& box);
|
||||
|
||||
// Deletes the box with the given index, and shuffles up the rest.
|
||||
// Recomputes the bounding box.
|
||||
void DeleteBox(int index);
|
||||
|
@ -34,6 +34,13 @@ static const double kStopperAmbiguityThresholdGain = 8.0;
|
||||
static const double kStopperAmbiguityThresholdOffset = 1.5;
|
||||
// Max number of broken pieces to associate.
|
||||
const int kWordrecMaxNumJoinChunks = 4;
|
||||
// Max ratio of word box height to line size to allow it to be processed as
|
||||
// a line with other words.
|
||||
const double kMaxWordSizeRatio = 1.25;
|
||||
// Max ratio of line box height to line size to allow a new word to be added.
|
||||
const double kMaxLineSizeRatio = 1.25;
|
||||
// Max ratio of word gap to line size to allow a new word to be added.
|
||||
const double kMaxWordGapRatio = 2.0;
|
||||
|
||||
// Computes and returns a threshold of certainty difference used to determine
|
||||
// which words to keep, based on the adjustment factors of the two words.
|
||||
@ -49,6 +56,7 @@ static double StopperAmbigThreshold(double f1, double f2) {
|
||||
* Constructor for page results
|
||||
*************************************************************************/
|
||||
PAGE_RES::PAGE_RES(
|
||||
bool merge_similar_words,
|
||||
BLOCK_LIST *the_block_list,
|
||||
WERD_CHOICE **prev_word_best_choice_ptr) {
|
||||
Init();
|
||||
@ -56,7 +64,8 @@ PAGE_RES::PAGE_RES(
|
||||
BLOCK_RES_IT block_res_it(&block_res_list);
|
||||
for (block_it.mark_cycle_pt();
|
||||
!block_it.cycled_list(); block_it.forward()) {
|
||||
block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
|
||||
block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
|
||||
block_it.data()));
|
||||
}
|
||||
prev_word_best_choice = prev_word_best_choice_ptr;
|
||||
}
|
||||
@ -67,7 +76,7 @@ PAGE_RES::PAGE_RES(
|
||||
* Constructor for BLOCK results
|
||||
*************************************************************************/
|
||||
|
||||
BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
|
||||
BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
|
||||
ROW_IT row_it (the_block->row_list ());
|
||||
ROW_RES_IT row_res_it(&row_res_list);
|
||||
|
||||
@ -83,22 +92,20 @@ BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
|
||||
block = the_block;
|
||||
|
||||
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
||||
row_res_it.add_to_end(new ROW_RES(row_it.data()));
|
||||
row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* ROW_RES::ROW_RES
|
||||
*
|
||||
* Constructor for ROW results
|
||||
*************************************************************************/
|
||||
|
||||
ROW_RES::ROW_RES(ROW *the_row) {
|
||||
ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
|
||||
WERD_IT word_it(the_row->word_list());
|
||||
WERD_RES_IT word_res_it(&word_res_list);
|
||||
WERD_RES *combo = NULL; // current combination of fuzzies
|
||||
WERD_RES *word_res; // current word
|
||||
WERD *copy_word;
|
||||
|
||||
char_count = 0;
|
||||
@ -106,20 +113,48 @@ ROW_RES::ROW_RES(ROW *the_row) {
|
||||
whole_word_rej_count = 0;
|
||||
|
||||
row = the_row;
|
||||
bool add_next_word = false;
|
||||
TBOX union_box;
|
||||
float line_height = the_row->x_height() + the_row->ascenders() -
|
||||
the_row->descenders();
|
||||
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
|
||||
word_res = new WERD_RES(word_it.data());
|
||||
WERD_RES* word_res = new WERD_RES(word_it.data());
|
||||
word_res->x_height = the_row->x_height();
|
||||
|
||||
if (word_res->word->flag(W_FUZZY_NON)) {
|
||||
if (add_next_word) {
|
||||
ASSERT_HOST(combo != NULL);
|
||||
// We are adding this word to the combination.
|
||||
word_res->part_of_combo = TRUE;
|
||||
combo->copy_on(word_res);
|
||||
} else if (merge_similar_words) {
|
||||
union_box = word_res->word->bounding_box();
|
||||
add_next_word = !word_res->word->flag(W_REP_CHAR) &&
|
||||
union_box.height() <= line_height * kMaxWordSizeRatio;
|
||||
word_res->odd_size = !add_next_word;
|
||||
}
|
||||
if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
|
||||
WERD* next_word = word_it.data_relative(1);
|
||||
if (merge_similar_words) {
|
||||
if (add_next_word && !next_word->flag(W_REP_CHAR)) {
|
||||
// Next word will be added on if all of the following are true:
|
||||
// Not a rep char.
|
||||
// Box height small enough.
|
||||
// Union box height small enough.
|
||||
// Horizontal gap small enough.
|
||||
TBOX next_box = next_word->bounding_box();
|
||||
int prev_right = union_box.right();
|
||||
union_box += next_box;
|
||||
if (next_box.height() > line_height * kMaxWordSizeRatio ||
|
||||
union_box.height() > line_height * kMaxLineSizeRatio ||
|
||||
next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
|
||||
add_next_word = false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
add_next_word = next_word->flag(W_FUZZY_NON);
|
||||
}
|
||||
if (add_next_word) {
|
||||
if (combo == NULL) {
|
||||
copy_word = new WERD;
|
||||
//deep copy
|
||||
*copy_word = *(word_it.data());
|
||||
*copy_word = *(word_it.data()); // deep copy
|
||||
combo = new WERD_RES(copy_word);
|
||||
combo->x_height = the_row->x_height();
|
||||
combo->combination = TRUE;
|
||||
@ -208,6 +243,7 @@ void WERD_RES::CopySimpleFields(const WERD_RES& source) {
|
||||
done = source.done;
|
||||
unlv_crunch_mode = source.unlv_crunch_mode;
|
||||
small_caps = source.small_caps;
|
||||
odd_size = source.odd_size;
|
||||
italic = source.italic;
|
||||
bold = source.bold;
|
||||
fontinfo = source.fontinfo;
|
||||
@ -318,8 +354,7 @@ void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
TBOX box = b_it.data()->bounding_box();
|
||||
box_word->InsertBox(box_word->length(), box);
|
||||
fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
|
||||
-1, -1, -1, 0, 0, 0, BCC_FAKE);
|
||||
fake_choices[blob_id++] = new BLOB_CHOICE;
|
||||
}
|
||||
FakeClassifyWord(blob_count, fake_choices);
|
||||
delete [] fake_choices;
|
||||
@ -446,6 +481,13 @@ void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
|
||||
}
|
||||
}
|
||||
|
||||
// Prints the top choice along with the accepted/done flags.
|
||||
void WERD_RES::DebugTopChoice(const char* msg) const {
|
||||
tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
|
||||
tess_accepted, tess_would_adapt, done);
|
||||
best_choice->print(msg);
|
||||
}
|
||||
|
||||
// Removes from best_choices all choices which are not within a reasonable
|
||||
// range of the best choice.
|
||||
// TODO(rays) incorporate the information used here into the params training
|
||||
@ -830,6 +872,7 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
|
||||
}
|
||||
FakeWordFromRatings();
|
||||
reject_map.initialise(blob_count);
|
||||
done = true;
|
||||
}
|
||||
|
||||
// Creates a WERD_CHOICE for the word using the top choices from the leading
|
||||
@ -1038,6 +1081,7 @@ void WERD_RES::InitNonPointers() {
|
||||
done = FALSE;
|
||||
unlv_crunch_mode = CR_NONE;
|
||||
small_caps = false;
|
||||
odd_size = false;
|
||||
italic = FALSE;
|
||||
bold = FALSE;
|
||||
// The fontinfos and tesseract count as non-pointers as they point to
|
||||
@ -1239,6 +1283,159 @@ WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
|
||||
return new_res;
|
||||
}
|
||||
|
||||
// Helper computes the boundaries between blobs in the word. The blob bounds
|
||||
// are likely very poor, if they come from LSTM, where it only outputs the
|
||||
// character at one pixel within it, so we find the midpoints between them.
|
||||
static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
|
||||
GenericVector<int>* blob_ends) {
|
||||
C_BLOB_IT blob_it(word.word->cblob_list());
|
||||
for (int i = 0; i < word.best_state.size(); ++i) {
|
||||
int length = word.best_state[i];
|
||||
// Get the bounding box of the fake blobs
|
||||
TBOX blob_box = blob_it.data()->bounding_box();
|
||||
blob_it.forward();
|
||||
for (int b = 1; b < length; ++b) {
|
||||
blob_box += blob_it.data()->bounding_box();
|
||||
blob_it.forward();
|
||||
}
|
||||
// This blob_box is crap, so for now we are only looking for the
|
||||
// boundaries between them.
|
||||
int blob_end = MAX_INT32;
|
||||
if (!blob_it.at_first() || next_word_blobs != NULL) {
|
||||
if (blob_it.at_first())
|
||||
blob_it.set_to_list(next_word_blobs);
|
||||
blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
|
||||
}
|
||||
blob_ends->push_back(blob_end);
|
||||
}
|
||||
}
|
||||
|
||||
// Replaces the current WERD/WERD_RES with the given words. The given words
|
||||
// contain fake blobs that indicate the position of the characters. These are
|
||||
// replaced with real blobs from the current word as much as possible.
|
||||
void PAGE_RES_IT::ReplaceCurrentWord(
|
||||
tesseract::PointerVector<WERD_RES>* words) {
|
||||
WERD_RES* input_word = word();
|
||||
// Set the BOL/EOL flags on the words from the input word.
|
||||
if (input_word->word->flag(W_BOL)) {
|
||||
(*words)[0]->word->set_flag(W_BOL, true);
|
||||
} else {
|
||||
(*words)[0]->word->set_blanks(1);
|
||||
}
|
||||
words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
|
||||
|
||||
// Move the blobs from the input word to the new set of words.
|
||||
// If the input word_res is a combination, then the replacements will also be
|
||||
// combinations, and will own their own words. If the input word_res is not a
|
||||
// combination, then the final replacements will not be either, (although it
|
||||
// is allowed for the input words to be combinations) and their words
|
||||
// will get put on the row list. This maintains the ownership rules.
|
||||
WERD_IT w_it(row()->row->word_list());
|
||||
if (!input_word->combination) {
|
||||
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
||||
WERD* word = w_it.data();
|
||||
if (word == input_word->word)
|
||||
break;
|
||||
}
|
||||
// w_it is now set to the input_word's word.
|
||||
ASSERT_HOST(!w_it.cycled_list());
|
||||
}
|
||||
// Insert into the appropriate place in the ROW_RES.
|
||||
WERD_RES_IT wr_it(&row()->word_res_list);
|
||||
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
|
||||
WERD_RES* word = wr_it.data();
|
||||
if (word == input_word)
|
||||
break;
|
||||
}
|
||||
ASSERT_HOST(!wr_it.cycled_list());
|
||||
// Since we only have an estimate of the bounds between blobs, use the blob
|
||||
// x-middle as the determiner of where to put the blobs
|
||||
C_BLOB_IT src_b_it(input_word->word->cblob_list());
|
||||
src_b_it.sort(&C_BLOB::SortByXMiddle);
|
||||
C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
|
||||
rej_b_it.sort(&C_BLOB::SortByXMiddle);
|
||||
for (int w = 0; w < words->size(); ++w) {
|
||||
WERD_RES* word_w = (*words)[w];
|
||||
// Compute blob boundaries.
|
||||
GenericVector<int> blob_ends;
|
||||
C_BLOB_LIST* next_word_blobs =
|
||||
w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : NULL;
|
||||
ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
|
||||
// Delete the fake blobs on the current word.
|
||||
word_w->word->cblob_list()->clear();
|
||||
C_BLOB_IT dest_it(word_w->word->cblob_list());
|
||||
// Build the box word as we move the blobs.
|
||||
tesseract::BoxWord* box_word = new tesseract::BoxWord;
|
||||
for (int i = 0; i < blob_ends.size(); ++i) {
|
||||
int end_x = blob_ends[i];
|
||||
TBOX blob_box;
|
||||
// Add the blobs up to end_x.
|
||||
while (!src_b_it.empty() &&
|
||||
src_b_it.data()->bounding_box().x_middle() < end_x) {
|
||||
blob_box += src_b_it.data()->bounding_box();
|
||||
dest_it.add_after_then_move(src_b_it.extract());
|
||||
src_b_it.forward();
|
||||
}
|
||||
while (!rej_b_it.empty() &&
|
||||
rej_b_it.data()->bounding_box().x_middle() < end_x) {
|
||||
blob_box += rej_b_it.data()->bounding_box();
|
||||
dest_it.add_after_then_move(rej_b_it.extract());
|
||||
rej_b_it.forward();
|
||||
}
|
||||
// Clip to the previously computed bounds. Although imperfectly accurate,
|
||||
// it is good enough, and much more complicated to determine where else
|
||||
// to clip.
|
||||
if (i > 0 && blob_box.left() < blob_ends[i - 1])
|
||||
blob_box.set_left(blob_ends[i - 1]);
|
||||
if (blob_box.right() > end_x)
|
||||
blob_box.set_right(end_x);
|
||||
box_word->InsertBox(i, blob_box);
|
||||
}
|
||||
// Fix empty boxes. If a very joined blob sits over multiple characters,
|
||||
// then we will have some empty boxes from using the middle, so look for
|
||||
// overlaps.
|
||||
for (int i = 0; i < box_word->length(); ++i) {
|
||||
TBOX box = box_word->BlobBox(i);
|
||||
if (box.null_box()) {
|
||||
// Nothing has its middle in the bounds of this blob, so use anything
|
||||
// that overlaps.
|
||||
for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
|
||||
dest_it.forward()) {
|
||||
TBOX blob_box = dest_it.data()->bounding_box();
|
||||
if (blob_box.left() < blob_ends[i] &&
|
||||
(i == 0 || blob_box.right() >= blob_ends[i - 1])) {
|
||||
if (i > 0 && blob_box.left() < blob_ends[i - 1])
|
||||
blob_box.set_left(blob_ends[i - 1]);
|
||||
if (blob_box.right() > blob_ends[i])
|
||||
blob_box.set_right(blob_ends[i]);
|
||||
box_word->ChangeBox(i, blob_box);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
delete word_w->box_word;
|
||||
word_w->box_word = box_word;
|
||||
if (!input_word->combination) {
|
||||
// Insert word_w->word into the ROW. It doesn't own its word, so the
|
||||
// ROW needs to own it.
|
||||
w_it.add_before_stay_put(word_w->word);
|
||||
word_w->combination = false;
|
||||
}
|
||||
(*words)[w] = NULL; // We are taking ownership.
|
||||
wr_it.add_before_stay_put(word_w);
|
||||
}
|
||||
// We have taken ownership of the words.
|
||||
words->clear();
|
||||
// Delete the current word, which has been replaced. We could just call
|
||||
// DeleteCurrentWord, but that would iterate both lists again, and we know
|
||||
// we are already in the right place.
|
||||
if (!input_word->combination)
|
||||
delete w_it.extract();
|
||||
delete wr_it.extract();
|
||||
ResetWordIterator();
|
||||
}
|
||||
|
||||
// Deletes the current WERD_RES and its underlying WERD.
|
||||
void PAGE_RES_IT::DeleteCurrentWord() {
|
||||
// Check that this word is as we expect. part_of_combos are NEVER iterated
|
||||
@ -1298,18 +1495,30 @@ WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
|
||||
// Resets the word_res_it so that it is one past the next_word_res, as
|
||||
// it should be after internal_forward. If next_row_res != row_res,
|
||||
// then the next_word_res is in the next row, so there is no need to do
|
||||
// anything, since operations on the current word will not have disturbed
|
||||
// the word_res_it.
|
||||
// anything to word_res_it, but it is still a good idea to reset the pointers
|
||||
// word_res and prev_word_res, which are still in the current row.
|
||||
void PAGE_RES_IT::ResetWordIterator() {
|
||||
if (row_res == next_row_res) {
|
||||
// Reset the member iterator so it can move forward and detect the
|
||||
// cycled_list state correctly.
|
||||
word_res_it.move_to_first();
|
||||
word_res_it.mark_cycle_pt();
|
||||
while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res)
|
||||
while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) {
|
||||
if (prev_row_res == row_res)
|
||||
prev_word_res = word_res;
|
||||
word_res = word_res_it.data();
|
||||
word_res_it.forward();
|
||||
}
|
||||
ASSERT_HOST(!word_res_it.cycled_list());
|
||||
word_res_it.forward();
|
||||
} else {
|
||||
// word_res_it is OK, but reset word_res and prev_word_res if needed.
|
||||
WERD_RES_IT wr_it(&row_res->word_res_list);
|
||||
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
|
||||
if (prev_row_res == row_res)
|
||||
prev_word_res = word_res;
|
||||
word_res = wr_it.data();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -82,7 +82,8 @@ class PAGE_RES { // page result
|
||||
|
||||
PAGE_RES() { Init(); } // empty constructor
|
||||
|
||||
PAGE_RES(BLOCK_LIST *block_list, // real blocks
|
||||
PAGE_RES(bool merge_similar_words,
|
||||
BLOCK_LIST *block_list, // real blocks
|
||||
WERD_CHOICE **prev_word_best_choice_ptr);
|
||||
|
||||
~PAGE_RES () { // destructor
|
||||
@ -111,7 +112,7 @@ class BLOCK_RES:public ELIST_LINK {
|
||||
BLOCK_RES() {
|
||||
} // empty constructor
|
||||
|
||||
BLOCK_RES(BLOCK *the_block); // real block
|
||||
BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
|
||||
|
||||
~BLOCK_RES () { // destructor
|
||||
}
|
||||
@ -132,7 +133,7 @@ class ROW_RES:public ELIST_LINK {
|
||||
ROW_RES() {
|
||||
} // empty constructor
|
||||
|
||||
ROW_RES(ROW *the_row); // real row
|
||||
ROW_RES(bool merge_similar_words, ROW *the_row); // real row
|
||||
|
||||
~ROW_RES() { // destructor
|
||||
}
|
||||
@ -279,7 +280,8 @@ class WERD_RES : public ELIST_LINK {
|
||||
BOOL8 tess_accepted; // Tess thinks its ok?
|
||||
BOOL8 tess_would_adapt; // Tess would adapt?
|
||||
BOOL8 done; // ready for output?
|
||||
bool small_caps; // word appears to be small caps
|
||||
bool small_caps; // word appears to be small caps
|
||||
bool odd_size; // word is bigger than line or leader dots.
|
||||
inT8 italic;
|
||||
inT8 bold;
|
||||
// The fontinfos are pointers to data owned by the classifier.
|
||||
@ -486,6 +488,9 @@ class WERD_RES : public ELIST_LINK {
|
||||
// the word_to_debug.
|
||||
void DebugWordChoices(bool debug, const char* word_to_debug);
|
||||
|
||||
// Prints the top choice along with the accepted/done flags.
|
||||
void DebugTopChoice(const char* msg) const;
|
||||
|
||||
// Removes from best_choices all choices which are not within a reasonable
|
||||
// range of the best choice.
|
||||
void FilterWordChoices(int debug_level);
|
||||
@ -694,6 +699,11 @@ class PAGE_RES_IT {
|
||||
// the resulting WERD_RES is returned for further setup with best_choice etc.
|
||||
WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
|
||||
|
||||
// Replaces the current WERD/WERD_RES with the given words. The given words
|
||||
// contain fake blobs that indicate the position of the characters. These are
|
||||
// replaced with real blobs from the current word as much as possible.
|
||||
void ReplaceCurrentWord(tesseract::PointerVector<WERD_RES>* words);
|
||||
|
||||
// Deletes the current WERD_RES and its underlying WERD.
|
||||
void DeleteCurrentWord();
|
||||
|
||||
|
@ -164,28 +164,37 @@ enum PageSegMode {
|
||||
PSM_SINGLE_CHAR, ///< Treat the image as a single character.
|
||||
PSM_SPARSE_TEXT, ///< Find as much text as possible in no particular order.
|
||||
PSM_SPARSE_TEXT_OSD, ///< Sparse text with orientation and script det.
|
||||
PSM_RAW_LINE, ///< Treat the image as a single text line, bypassing
|
||||
///< hacks that are Tesseract-specific.
|
||||
|
||||
PSM_COUNT ///< Number of enum entries.
|
||||
};
|
||||
|
||||
/**
|
||||
* Macros that act on a PageSegMode to determine whether components of
|
||||
* Inline functions that act on a PageSegMode to determine whether components of
|
||||
* layout analysis are enabled.
|
||||
* *Depend critically on the order of elements of PageSegMode.*
|
||||
* NOTE that arg is an int for compatibility with INT_PARAM.
|
||||
*/
|
||||
#define PSM_OSD_ENABLED(pageseg_mode) ((pageseg_mode) <= PSM_AUTO_OSD || \
|
||||
(pageseg_mode) == PSM_SPARSE_TEXT_OSD)
|
||||
#define PSM_COL_FIND_ENABLED(pageseg_mode) \
|
||||
((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_AUTO)
|
||||
#define PSM_SPARSE(pageseg_mode) \
|
||||
((pageseg_mode) == PSM_SPARSE_TEXT || (pageseg_mode) == PSM_SPARSE_TEXT_OSD)
|
||||
#define PSM_BLOCK_FIND_ENABLED(pageseg_mode) \
|
||||
((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_COLUMN)
|
||||
#define PSM_LINE_FIND_ENABLED(pageseg_mode) \
|
||||
((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_BLOCK)
|
||||
#define PSM_WORD_FIND_ENABLED(pageseg_mode) \
|
||||
(((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_LINE) || \
|
||||
(pageseg_mode) == PSM_SPARSE_TEXT || (pageseg_mode) == PSM_SPARSE_TEXT_OSD)
|
||||
inline bool PSM_OSD_ENABLED(int pageseg_mode) {
|
||||
return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
|
||||
}
|
||||
inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
|
||||
return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
|
||||
}
|
||||
inline bool PSM_SPARSE(int pageseg_mode) {
|
||||
return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
|
||||
}
|
||||
inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
|
||||
return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
|
||||
}
|
||||
inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
|
||||
return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
|
||||
}
|
||||
inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
|
||||
return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
|
||||
pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
|
||||
}
|
||||
|
||||
/**
|
||||
* enum of the elements of the page hierarchy, used in ResultIterator
|
||||
|
@ -48,11 +48,11 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
{
|
||||
public:
|
||||
BLOB_CHOICE() {
|
||||
unichar_id_ = INVALID_UNICHAR_ID;
|
||||
unichar_id_ = UNICHAR_SPACE;
|
||||
fontinfo_id_ = -1;
|
||||
fontinfo_id2_ = -1;
|
||||
rating_ = MAX_FLOAT32;
|
||||
certainty_ = -MAX_FLOAT32;
|
||||
rating_ = 10.0;
|
||||
certainty_ = -1.0;
|
||||
script_id_ = -1;
|
||||
xgap_before_ = 0;
|
||||
xgap_after_ = 0;
|
||||
|
@ -78,6 +78,12 @@ class DLLSYM TBOX { // bounding box
|
||||
void set_right(int x) {
|
||||
top_right.set_x(x);
|
||||
}
|
||||
int x_middle() const {
|
||||
return (bot_left.x() + top_right.x()) / 2;
|
||||
}
|
||||
int y_middle() const {
|
||||
return (bot_left.y() + top_right.y()) / 2;
|
||||
}
|
||||
|
||||
const ICOORD &botleft() const { // access function
|
||||
return bot_left;
|
||||
|
@ -247,10 +247,11 @@ C_BLOB* C_BLOB::FakeBlob(const TBOX& box) {
|
||||
* Return the bounding box of the blob.
|
||||
**********************************************************************/
|
||||
|
||||
TBOX C_BLOB::bounding_box() { //bounding box
|
||||
C_OUTLINE *outline; //current outline
|
||||
C_OUTLINE_IT it = &outlines; //outlines of blob
|
||||
TBOX box; //bounding box
|
||||
TBOX C_BLOB::bounding_box() const { // bounding box
|
||||
C_OUTLINE *outline; // current outline
|
||||
// This is a read-only iteration of the outlines.
|
||||
C_OUTLINE_IT it = const_cast<C_OUTLINE_LIST*>(&outlines);
|
||||
TBOX box; // bounding box
|
||||
|
||||
for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
|
||||
outline = it.data ();
|
||||
|
@ -65,7 +65,7 @@ class C_BLOB:public ELIST_LINK
|
||||
return &outlines;
|
||||
}
|
||||
|
||||
TBOX bounding_box(); //compute bounding box
|
||||
TBOX bounding_box() const; // compute bounding box
|
||||
inT32 area(); //compute area
|
||||
inT32 perimeter(); // Total perimeter of outlines and 1st level children.
|
||||
inT32 outer_area(); //compute area
|
||||
@ -116,6 +116,14 @@ class C_BLOB:public ELIST_LINK
|
||||
return blob;
|
||||
}
|
||||
|
||||
static int SortByXMiddle(const void *v1, const void *v2) {
|
||||
const C_BLOB* blob1 = *reinterpret_cast<const C_BLOB* const *>(v1);
|
||||
const C_BLOB* blob2 = *reinterpret_cast<const C_BLOB* const *>(v2);
|
||||
return blob1->bounding_box().x_middle() -
|
||||
blob2->bounding_box().x_middle();
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
C_OUTLINE_LIST outlines; //master elements
|
||||
};
|
||||
|
@ -17,15 +17,17 @@
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "unicharset.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "params.h"
|
||||
#include "serialis.h"
|
||||
#include "tesscallback.h"
|
||||
#include "tprintf.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharset.h"
|
||||
#include "params.h"
|
||||
|
||||
// Special character used in representing character fragments.
|
||||
static const char kSeparator = '|';
|
||||
@ -448,11 +450,19 @@ void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
|
||||
}
|
||||
}
|
||||
|
||||
// Makes this a copy of src. Clears this completely first, so the automattic
|
||||
// ids will not be present in this if not in src.
|
||||
// Makes this a copy of src. Clears this completely first, so the automatic
|
||||
// ids will not be present in this if not in src. Does NOT reorder the set!
|
||||
void UNICHARSET::CopyFrom(const UNICHARSET& src) {
|
||||
clear();
|
||||
AppendOtherUnicharset(src);
|
||||
for (int ch = 0; ch < src.size_used; ++ch) {
|
||||
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
||||
const char* utf8 = src.id_to_unichar(ch);
|
||||
unichar_insert(utf8);
|
||||
unichars[ch].properties.ExpandRangesFrom(src_props);
|
||||
}
|
||||
// Set properties, including mirror and other_case, WITHOUT reordering
|
||||
// the unicharset.
|
||||
PartialSetPropertiesFromOther(0, src);
|
||||
}
|
||||
|
||||
// For each id in src, if it does not occur in this, add it, as in
|
||||
@ -689,8 +699,11 @@ bool UNICHARSET::eq(UNICHAR_ID unichar_id,
|
||||
return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
|
||||
}
|
||||
|
||||
bool UNICHARSET::save_to_file(FILE *file) const {
|
||||
fprintf(file, "%d\n", this->size());
|
||||
bool UNICHARSET::save_to_string(STRING *str) const {
|
||||
const int kFileBufSize = 1024;
|
||||
char buffer[kFileBufSize + 1];
|
||||
snprintf(buffer, kFileBufSize, "%d\n", this->size());
|
||||
*str = buffer;
|
||||
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
|
||||
int min_bottom, max_bottom, min_top, max_top;
|
||||
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
|
||||
@ -702,11 +715,11 @@ bool UNICHARSET::save_to_file(FILE *file) const {
|
||||
get_advance_range(id, &min_advance, &max_advance);
|
||||
unsigned int properties = this->get_properties(id);
|
||||
if (strcmp(this->id_to_unichar(id), " ") == 0) {
|
||||
fprintf(file, "%s %x %s %d\n", "NULL", properties,
|
||||
snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
|
||||
this->get_script_from_script_id(this->get_script(id)),
|
||||
this->get_other_case(id));
|
||||
} else {
|
||||
fprintf(file,
|
||||
snprintf(buffer, kFileBufSize,
|
||||
"%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
|
||||
this->id_to_unichar(id), properties,
|
||||
min_bottom, max_bottom, min_top, max_top, min_width, max_width,
|
||||
@ -716,10 +729,12 @@ bool UNICHARSET::save_to_file(FILE *file) const {
|
||||
this->get_mirror(id), this->get_normed_unichar(id),
|
||||
this->debug_str(id).string());
|
||||
}
|
||||
*str += buffer;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO(rays) Replace with TFile everywhere.
|
||||
class InMemoryFilePointer {
|
||||
public:
|
||||
InMemoryFilePointer(const char *memory, int mem_size)
|
||||
@ -776,6 +791,14 @@ bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
|
||||
return success;
|
||||
}
|
||||
|
||||
// Loads the unicharset from the given TFile by wrapping its FGets member in
// a callback and handing that to load_via_fgets. The callback is created and
// deleted here. Returns the result of load_via_fgets.
bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
  TessResultCallback2<char *, char *, int> *line_reader =
      NewPermanentTessCallback(file, &tesseract::TFile::FGets);
  const bool loaded = load_via_fgets(line_reader, skip_fragments);
  // A permanent callback is not self-deleting, so free it explicitly.
  delete line_reader;
  return loaded;
}
|
||||
|
||||
bool UNICHARSET::load_via_fgets(
|
||||
TessResultCallback2<char *, char *, int> *fgets_cb,
|
||||
bool skip_fragments) {
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "errcode.h"
|
||||
#include "genericvector.h"
|
||||
#include "helpers.h"
|
||||
#include "serialis.h"
|
||||
#include "strngs.h"
|
||||
#include "tesscallback.h"
|
||||
#include "unichar.h"
|
||||
@ -317,7 +318,22 @@ class UNICHARSET {
|
||||
|
||||
// Saves the content of the UNICHARSET to the given file.
|
||||
// Returns true if the operation is successful.
|
||||
bool save_to_file(FILE *file) const;
|
||||
bool save_to_file(FILE *file) const {
|
||||
STRING str;
|
||||
if (!save_to_string(&str)) return false;
|
||||
if (fwrite(&str[0], str.length(), 1, file) != 1) return false;
|
||||
return true;
|
||||
}
|
||||
// Serializes the unicharset with save_to_string and writes the resulting
// text to the TFile as a single FWrite item. Returns false if serialization
// fails or FWrite does not report exactly one item written.
bool save_to_file(tesseract::TFile *file) const {
  STRING serialized;
  if (!save_to_string(&serialized)) {
    return false;
  }
  return file->FWrite(&serialized[0], serialized.length(), 1) == 1;
}
|
||||
|
||||
// Saves the content of the UNICHARSET to the given STRING.
|
||||
// Returns true if the operation is successful.
|
||||
bool save_to_string(STRING *str) const;
|
||||
|
||||
// Load a unicharset from a unicharset file that has been loaded into
|
||||
// the given memory buffer.
|
||||
@ -348,6 +364,8 @@ class UNICHARSET {
|
||||
// Returns true if the operation is successful.
|
||||
bool load_from_file(FILE *file, bool skip_fragments);
|
||||
bool load_from_file(FILE *file) { return load_from_file(file, false); }
|
||||
bool load_from_file(tesseract::TFile *file, bool skip_fragments);
|
||||
|
||||
|
||||
// Sets up internal data after loading the file, based on the char
|
||||
// properties. Called from load_from_file, but also needs to be run
|
||||
|
@ -161,7 +161,8 @@ float MakeRowFromSubBlobs(TO_BLOCK* block, C_BLOB* blob, TO_ROW_IT* row_it) {
|
||||
* only a single blob, it makes 2 rows, in case the top-level blob
|
||||
* is a container of the real blobs to recognize.
|
||||
*/
|
||||
float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
|
||||
float make_single_row(ICOORD page_tr, bool allow_sub_blobs,
|
||||
TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
|
||||
BLOBNBOX_IT blob_it = &block->blobs;
|
||||
TO_ROW_IT row_it = block->get_rows();
|
||||
|
||||
@ -169,11 +170,17 @@ float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
|
||||
blob_it.add_list_after(&block->small_blobs);
|
||||
blob_it.add_list_after(&block->noise_blobs);
|
||||
blob_it.add_list_after(&block->large_blobs);
|
||||
if (block->blobs.singleton()) {
|
||||
if (block->blobs.singleton() && allow_sub_blobs) {
|
||||
blob_it.move_to_first();
|
||||
float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
|
||||
if (size > block->line_size)
|
||||
block->line_size = size;
|
||||
} else if (block->blobs.empty()) {
|
||||
// Make a fake blob.
|
||||
C_BLOB* blob = C_BLOB::FakeBlob(block->block->bounding_box());
|
||||
// The blobnbox owns the blob.
|
||||
BLOBNBOX* bblob = new BLOBNBOX(blob);
|
||||
blob_it.add_after_then_move(bblob);
|
||||
}
|
||||
MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
|
||||
// Fit an LMS line to the rows.
|
||||
|
@ -133,7 +133,7 @@ inline bool within_error_margin(float test, float num, float margin) {
|
||||
void fill_heights(TO_ROW *row, float gradient, int min_height,
|
||||
int max_height, STATS *heights, STATS *floating_heights);
|
||||
|
||||
float make_single_row(ICOORD page_tr, TO_BLOCK* block,
|
||||
float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK* block,
|
||||
TO_BLOCK_LIST* blocks);
|
||||
float make_rows(ICOORD page_tr, // top right
|
||||
TO_BLOCK_LIST *port_blocks);
|
||||
|
@ -317,8 +317,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
|
||||
if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
|
||||
gradient = make_rows(page_tr_, to_blocks);
|
||||
} else if (!PSM_SPARSE(pageseg_mode)) {
|
||||
// SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
|
||||
gradient = make_single_row(page_tr_, to_block, to_blocks);
|
||||
// RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
|
||||
gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE,
|
||||
to_block, to_blocks);
|
||||
}
|
||||
BaselineDetect baseline_detector(textord_baseline_debug,
|
||||
reskew, to_blocks);
|
||||
@ -339,7 +340,8 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
|
||||
make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
|
||||
to_block->get_rows(), to_block->block->row_list());
|
||||
}
|
||||
cleanup_blocks(blocks); // Remove empties.
|
||||
cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
|
||||
// Remove empties.
|
||||
|
||||
// Compute the margins for each row in the block, to be used later for
|
||||
// paragraph detection.
|
||||
|
@ -206,7 +206,7 @@ class Textord {
|
||||
// Must have at least one WERD.
|
||||
// WERDs contain a fake blob.
|
||||
void cleanup_nontext_block(BLOCK* block);
|
||||
void cleanup_blocks(BLOCK_LIST *blocks);
|
||||
void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);
|
||||
BOOL8 clean_noise_from_row(ROW *row);
|
||||
void clean_noise_from_words(ROW *row);
|
||||
// Remove outlines that are a tiny fraction in either width or height
|
||||
|
@ -360,9 +360,11 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
|
||||
// Non-text blocks must contain at least one row.
|
||||
ROW_IT row_it(block->row_list());
|
||||
if (row_it.empty()) {
|
||||
float height = block->bounding_box().height();
|
||||
inT32 zero = 0;
|
||||
ROW* row = new ROW(0, &zero, NULL, height / 2.0f, height / 4.0f,
|
||||
TBOX box = block->bounding_box();
|
||||
float height = box.height();
|
||||
inT32 xstarts[2] = {box.left(), box.right()};
|
||||
double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
|
||||
ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
|
||||
height / 4.0f, 0, 1);
|
||||
row_it.add_after_then_move(row);
|
||||
}
|
||||
@ -398,9 +400,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
|
||||
* Delete empty blocks, rows from the page.
|
||||
**********************************************************************/
|
||||
|
||||
void Textord::cleanup_blocks( //remove empties
|
||||
BLOCK_LIST *blocks //list
|
||||
) {
|
||||
void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
|
||||
BLOCK_IT block_it = blocks; //iterator
|
||||
ROW_IT row_it; //row iterator
|
||||
|
||||
@ -417,22 +417,24 @@ void Textord::cleanup_blocks( //remove empties
|
||||
}
|
||||
num_rows = 0;
|
||||
num_rows_all = 0;
|
||||
row_it.set_to_list(block->row_list());
|
||||
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
||||
++num_rows_all;
|
||||
clean_small_noise_from_words(row_it.data());
|
||||
if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
|
||||
clean_noise_from_row(row_it.data())) ||
|
||||
row_it.data()->word_list()->empty()) {
|
||||
delete row_it.extract(); // lose empty row.
|
||||
} else {
|
||||
if (textord_noise_rejwords)
|
||||
clean_noise_from_words(row_it.data());
|
||||
if (textord_blshift_maxshift >= 0)
|
||||
tweak_row_baseline(row_it.data(),
|
||||
textord_blshift_maxshift,
|
||||
textord_blshift_xfraction);
|
||||
++num_rows;
|
||||
if (clean_noise) {
|
||||
row_it.set_to_list(block->row_list());
|
||||
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
||||
++num_rows_all;
|
||||
clean_small_noise_from_words(row_it.data());
|
||||
if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
|
||||
clean_noise_from_row(row_it.data())) ||
|
||||
row_it.data()->word_list()->empty()) {
|
||||
delete row_it.extract(); // lose empty row.
|
||||
} else {
|
||||
if (textord_noise_rejwords)
|
||||
clean_noise_from_words(row_it.data());
|
||||
if (textord_blshift_maxshift >= 0)
|
||||
tweak_row_baseline(row_it.data(),
|
||||
textord_blshift_maxshift,
|
||||
textord_blshift_xfraction);
|
||||
++num_rows;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (block->row_list()->empty()) {
|
||||
|
@ -299,7 +299,7 @@ bool LanguageModel::UpdateState(
|
||||
//if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
|
||||
UNICHAR_ID unichar_id = choice->unichar_id();
|
||||
if (unicharset.get_fragment(unichar_id)) {
|
||||
continue; // skip fragments
|
||||
continue; // Skip fragments.
|
||||
}
|
||||
// Set top choice flags.
|
||||
LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
|
||||
@ -651,6 +651,8 @@ bool LanguageModel::AddViterbiStateEntry(
|
||||
ngram_info, (language_model_debug_level > 0) ?
|
||||
dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
|
||||
new_vse->cost = ComputeAdjustedPathCost(new_vse);
|
||||
if (language_model_debug_level >= 3)
|
||||
tprintf("Adjusted cost = %g\n", new_vse->cost);
|
||||
|
||||
// Invoke Top Choice language model component to make the final adjustments
|
||||
// to new_vse->top_choice_flags.
|
||||
@ -1311,7 +1313,7 @@ void LanguageModel::UpdateBestChoice(
|
||||
vse->dawg_info != NULL && vse->top_choice_flags);
|
||||
}
|
||||
}
|
||||
if (wordrec_display_segmentations) {
|
||||
if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
|
||||
word->DisplaySegmentation(word_res->chopped_word);
|
||||
}
|
||||
}
|
||||
|
@ -37,52 +37,16 @@ void Wordrec::DoSegSearch(WERD_RES* word_res) {
|
||||
void Wordrec::SegSearch(WERD_RES* word_res,
|
||||
BestChoiceBundle* best_choice_bundle,
|
||||
BlamerBundle* blamer_bundle) {
|
||||
if (segsearch_debug_level > 0) {
|
||||
tprintf("Starting SegSearch on ratings matrix%s:\n",
|
||||
wordrec_enable_assoc ? " (with assoc)" : "");
|
||||
word_res->ratings->print(getDict().getUnicharset());
|
||||
}
|
||||
LMPainPoints pain_points(segsearch_max_pain_points,
|
||||
segsearch_max_char_wh_ratio,
|
||||
assume_fixed_pitch_char_segment,
|
||||
&getDict(), segsearch_debug_level);
|
||||
|
||||
pain_points.GenerateInitial(word_res);
|
||||
|
||||
// Compute scaling factor that will help us recover blob outline length
|
||||
// from classifier rating and certainty for the blob.
|
||||
float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
|
||||
|
||||
language_model_->InitForWord(prev_word_best_choice_,
|
||||
assume_fixed_pitch_char_segment,
|
||||
segsearch_max_char_wh_ratio, rating_cert_scale);
|
||||
|
||||
// Initialize blamer-related information: map character boxes recorded in
|
||||
// blamer_bundle->norm_truth_word to the corresponding i,j indices in the
|
||||
// ratings matrix. We expect this step to succeed, since when running the
|
||||
// chopper we checked that the correct chops are present.
|
||||
if (blamer_bundle != NULL) {
|
||||
blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
|
||||
wordrec_debug_blamer);
|
||||
}
|
||||
|
||||
MATRIX_COORD pain_point;
|
||||
float pain_point_priority;
|
||||
|
||||
// pending[col] tells whether there is update work to do to combine
|
||||
// best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
|
||||
// As the language model state is updated, pending entries are modified to
|
||||
// minimize duplication of work. It is important that during the update the
|
||||
// children are considered in the non-decreasing order of their column, since
|
||||
// this guarantees that all the parents would be up to date before an update
|
||||
// of a child is done.
|
||||
GenericVector<SegSearchPending> pending;
|
||||
pending.init_to_size(word_res->ratings->dimension(), SegSearchPending());
|
||||
|
||||
// Search the ratings matrix for the initial best path.
|
||||
pending[0].SetColumnClassified();
|
||||
UpdateSegSearchNodes(rating_cert_scale, 0, &pending, word_res,
|
||||
&pain_points, best_choice_bundle, blamer_bundle);
|
||||
InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
|
||||
blamer_bundle);
|
||||
|
||||
if (!SegSearchDone(0)) { // find a better choice
|
||||
if (chop_enable && word_res->chopped_word != NULL) {
|
||||
@ -98,6 +62,9 @@ void Wordrec::SegSearch(WERD_RES* word_res,
|
||||
}
|
||||
}
|
||||
// Keep trying to find a better path by fixing the "pain points".
|
||||
|
||||
MATRIX_COORD pain_point;
|
||||
float pain_point_priority;
|
||||
int num_futile_classifications = 0;
|
||||
STRING blamer_debug;
|
||||
while (wordrec_enable_assoc &&
|
||||
@ -159,6 +126,72 @@ void Wordrec::SegSearch(WERD_RES* word_res,
|
||||
}
|
||||
}
|
||||
|
||||
// Setup and run just the initial segsearch on an established matrix,
|
||||
// without doing any additional chopping or joining.
|
||||
// Builds the local search state (pain points, pending vector and a
// BestChoiceBundle sized to the ratings matrix dimension), then calls
// InitialSegSearch once with no blamer bundle.
void Wordrec::WordSearch(WERD_RES* word_res) {
|
||||
LMPainPoints pain_points(segsearch_max_pain_points,
|
||||
segsearch_max_char_wh_ratio,
|
||||
assume_fixed_pitch_char_segment,
|
||||
&getDict(), segsearch_debug_level);
|
||||
// pending is left empty here; InitialSegSearch sizes it.
GenericVector<SegSearchPending> pending;
|
||||
BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
|
||||
// Run Segmentation Search.
|
||||
InitialSegSearch(word_res, &pain_points, &pending, &best_choice_bundle, NULL);
|
||||
// When segsearch debugging is enabled, print the ratings matrix as it
// stands after the search.
if (segsearch_debug_level > 0) {
|
||||
tprintf("Ending ratings matrix%s:\n",
|
||||
wordrec_enable_assoc ? " (with assoc)" : "");
|
||||
word_res->ratings->print(getDict().getUnicharset());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Setup and run just the initial segsearch on an established matrix,
|
||||
// without doing any additional chopping or joining.
|
||||
// (Internal factored version that can be used as part of the main SegSearch.)
|
||||
// Arguments, as used below:
//   word_res           - supplies the ratings matrix and the chopped word.
//   pain_points        - receives the initial pain points for word_res.
//   pending            - resized to the ratings dimension; column 0 is then
//                        marked as classified.
//   best_choice_bundle - forwarded to UpdateSegSearchNodes.
//   blamer_bundle      - may be NULL; when non-NULL its correct
//                        segmentation is set up from the chopped word.
void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
|
||||
GenericVector<SegSearchPending>* pending,
|
||||
BestChoiceBundle* best_choice_bundle,
|
||||
BlamerBundle* blamer_bundle) {
|
||||
// When segsearch debugging is enabled, print the ratings matrix before
// the search starts.
if (segsearch_debug_level > 0) {
|
||||
tprintf("Starting SegSearch on ratings matrix%s:\n",
|
||||
wordrec_enable_assoc ? " (with assoc)" : "");
|
||||
word_res->ratings->print(getDict().getUnicharset());
|
||||
}
|
||||
|
||||
pain_points->GenerateInitial(word_res);
|
||||
|
||||
// Compute scaling factor that will help us recover blob outline length
|
||||
// from classifier rating and certainty for the blob.
|
||||
float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
|
||||
|
||||
// Prepare the language model for this word, passing the previous word's
// best choice and the scale factor computed above.
language_model_->InitForWord(prev_word_best_choice_,
|
||||
assume_fixed_pitch_char_segment,
|
||||
segsearch_max_char_wh_ratio, rating_cert_scale);
|
||||
|
||||
// Initialize blamer-related information: map character boxes recorded in
|
||||
// blamer_bundle->norm_truth_word to the corresponding i,j indices in the
|
||||
// ratings matrix. We expect this step to succeed, since when running the
|
||||
// chopper we checked that the correct chops are present.
|
||||
if (blamer_bundle != NULL) {
|
||||
blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
|
||||
wordrec_debug_blamer);
|
||||
}
|
||||
|
||||
// pending[col] tells whether there is update work to do to combine
|
||||
// best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
|
||||
// As the language model state is updated, pending entries are modified to
|
||||
// minimize duplication of work. It is important that during the update the
|
||||
// children are considered in the non-decreasing order of their column, since
|
||||
// this guarantees that all the parents would be up to date before an update
|
||||
// of a child is done.
|
||||
pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
|
||||
|
||||
// Search the ratings matrix for the initial best path.
|
||||
// NOTE(review): indexing element 0 presumes the ratings dimension is at
// least 1 -- confirm that callers guarantee this.
(*pending)[0].SetColumnClassified();
|
||||
UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
|
||||
pain_points, best_choice_bundle, blamer_bundle);
|
||||
}
|
||||
|
||||
void Wordrec::UpdateSegSearchNodes(
|
||||
float rating_cert_scale,
|
||||
int starting_col,
|
||||
|
@ -266,11 +266,22 @@ class Wordrec : public Classify {
|
||||
// to combine blobs. Segmentation search will run only one "iteration"
|
||||
// on the classifications already recorded in chunks_record.ratings.
|
||||
//
|
||||
// Note: this function assumes that word, output_best_state,
|
||||
// best_char_choices and fixpt arguments are not NULL.
|
||||
// Note: this function assumes that word_res, best_choice_bundle arguments
|
||||
// are not NULL.
|
||||
void SegSearch(WERD_RES* word_res,
|
||||
BestChoiceBundle* best_choice_bundle,
|
||||
BlamerBundle* blamer_bundle);
|
||||
// Setup and run just the initial segsearch on an established matrix,
|
||||
// without doing any additional chopping or joining.
|
||||
void WordSearch(WERD_RES* word_res);
|
||||
|
||||
// Setup and run just the initial segsearch on an established matrix,
|
||||
// without doing any additional chopping or joining.
|
||||
// (Internal factored version that can be used as part of the main SegSearch.)
|
||||
void InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
|
||||
GenericVector<SegSearchPending>* pending,
|
||||
BestChoiceBundle* best_choice_bundle,
|
||||
BlamerBundle* blamer_bundle);
|
||||
|
||||
// Runs SegSearch() function (above) without needing a best_choice_bundle
|
||||
// or blamer_bundle. Used for testing.
|
||||
|
Loading…
Reference in New Issue
Block a user