Major refactor of control.cpp to enable line recognition

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1147 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2014-08-11 23:23:06 +00:00
parent e249d7bcb2
commit dbf6197471
34 changed files with 931 additions and 560 deletions

View File

@ -790,6 +790,10 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
* Runs page layout analysis in the mode set by SetPageSegMode. * Runs page layout analysis in the mode set by SetPageSegMode.
* May optionally be called prior to Recognize to get access to just * May optionally be called prior to Recognize to get access to just
* the page layout results. Returns an iterator to the results. * the page layout results. Returns an iterator to the results.
* If merge_similar_words is true, words are combined where suitable for use
* with a line recognizer. Use if you want to use AnalyseLayout to find the
* textlines, and then want to process textline fragments with an external
* line recognizer.
* Returns NULL on error or an empty page. * Returns NULL on error or an empty page.
* The returned iterator must be deleted after use. * The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and * WARNING! This class points to data held within the TessBaseAPI class, and
@ -797,11 +801,11 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
* DetectOS, or anything else that changes the internal PAGE_RES. * DetectOS, or anything else that changes the internal PAGE_RES.
*/ */
PageIterator* TessBaseAPI::AnalyseLayout() { PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
if (FindLines() == 0) { if (FindLines() == 0) {
if (block_list_->empty()) if (block_list_->empty())
return NULL; // The page was empty. return NULL; // The page was empty.
page_res_ = new PAGE_RES(block_list_, NULL); page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL);
DetectParagraphs(false); DetectParagraphs(false);
return new PageIterator( return new PageIterator(
page_res_, tesseract_, thresholder_->GetScaleFactor(), page_res_, tesseract_, thresholder_->GetScaleFactor(),
@ -823,18 +827,22 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
if (page_res_ != NULL) if (page_res_ != NULL)
delete page_res_; delete page_res_;
if (block_list_->empty()) { if (block_list_->empty()) {
page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_); page_res_ = new PAGE_RES(false, block_list_,
&tesseract_->prev_word_best_choice_);
return 0; // Empty page. return 0; // Empty page.
} }
tesseract_->SetBlackAndWhitelist(); tesseract_->SetBlackAndWhitelist();
recognition_done_ = true; recognition_done_ = true;
if (tesseract_->tessedit_resegment_from_line_boxes) if (tesseract_->tessedit_resegment_from_line_boxes) {
page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_); page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_);
else if (tesseract_->tessedit_resegment_from_boxes) } else if (tesseract_->tessedit_resegment_from_boxes) {
page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_); page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
else } else {
page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_); // TODO(rays) LSTM here.
page_res_ = new PAGE_RES(false,
block_list_, &tesseract_->prev_word_best_choice_);
}
if (tesseract_->tessedit_make_boxes_from_boxes) { if (tesseract_->tessedit_make_boxes_from_boxes) {
tesseract_->CorrectClassifyWords(page_res_); tesseract_->CorrectClassifyWords(page_res_);
return 0; return 0;
@ -900,7 +908,8 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
recognition_done_ = true; recognition_done_ = true;
page_res_ = new PAGE_RES(block_list_, &(tesseract_->prev_word_best_choice_)); page_res_ = new PAGE_RES(false, block_list_,
&(tesseract_->prev_word_best_choice_));
PAGE_RES_IT page_res_it(page_res_); PAGE_RES_IT page_res_it(page_res_);
@ -1977,7 +1986,10 @@ void TessBaseAPI::Threshold(Pix** pix) {
// than over-estimate resolution. // than over-estimate resolution.
thresholder_->SetSourceYResolution(kMinCredibleResolution); thresholder_->SetSourceYResolution(kMinCredibleResolution);
} }
thresholder_->ThresholdToPix(pix); PageSegMode pageseg_mode =
static_cast<PageSegMode>(
static_cast<int>(tesseract_->tessedit_pageseg_mode));
thresholder_->ThresholdToPix(pageseg_mode, pix);
thresholder_->GetImageSizes(&rect_left_, &rect_top_, thresholder_->GetImageSizes(&rect_left_, &rect_top_,
&rect_width_, &rect_height_, &rect_width_, &rect_height_,
&image_width_, &image_height_); &image_width_, &image_height_);
@ -2332,7 +2344,7 @@ void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) { PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
PAGE_RES *page_res = new PAGE_RES(block_list, PAGE_RES *page_res = new PAGE_RES(false, block_list,
&(tesseract_->prev_word_best_choice_)); &(tesseract_->prev_word_best_choice_));
tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1); tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
return page_res; return page_res;
@ -2341,7 +2353,7 @@ PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list, PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
PAGE_RES* pass1_result) { PAGE_RES* pass1_result) {
if (!pass1_result) if (!pass1_result)
pass1_result = new PAGE_RES(block_list, pass1_result = new PAGE_RES(false, block_list,
&(tesseract_->prev_word_best_choice_)); &(tesseract_->prev_word_best_choice_));
tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2); tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
return pass1_result; return pass1_result;

View File

@ -484,14 +484,21 @@ class TESS_API TessBaseAPI {
* Runs page layout analysis in the mode set by SetPageSegMode. * Runs page layout analysis in the mode set by SetPageSegMode.
* May optionally be called prior to Recognize to get access to just * May optionally be called prior to Recognize to get access to just
* the page layout results. Returns an iterator to the results. * the page layout results. Returns an iterator to the results.
* Returns NULL on error. * If merge_similar_words is true, words are combined where suitable for use
* with a line recognizer. Use if you want to use AnalyseLayout to find the
* textlines, and then want to process textline fragments with an external
* line recognizer.
* Returns NULL on error or an empty page.
* The returned iterator must be deleted after use. * The returned iterator must be deleted after use.
* WARNING! This class points to data held within the TessBaseAPI class, and * WARNING! This class points to data held within the TessBaseAPI class, and
* therefore can only be used while the TessBaseAPI class still exists and * therefore can only be used while the TessBaseAPI class still exists and
* has not been subjected to a call of Init, SetImage, Recognize, Clear, End * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
* DetectOS, or anything else that changes the internal PAGE_RES. * DetectOS, or anything else that changes the internal PAGE_RES.
*/ */
PageIterator* AnalyseLayout(); PageIterator* AnalyseLayout() {
return AnalyseLayout(false);
}
PageIterator* AnalyseLayout(bool merge_similar_words);
/** /**
* Recognize the image from SetAndThresholdImage, generating Tesseract * Recognize the image from SetAndThresholdImage, generating Tesseract

View File

@ -110,30 +110,20 @@ static void clear_any_old_text(BLOCK_LIST *block_list) {
PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
bool find_segmentation, bool find_segmentation,
BLOCK_LIST *block_list) { BLOCK_LIST *block_list) {
int box_count = 0;
int box_failures = 0;
FILE* box_file = OpenBoxFile(fname);
TBOX box;
GenericVector<TBOX> boxes; GenericVector<TBOX> boxes;
GenericVector<STRING> texts, full_texts; GenericVector<STRING> texts, full_texts;
if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
bool found_box = true; NULL)) {
while (found_box) { return NULL; // Can't do it.
int line_number = 0; // Line number of the box file.
STRING text, full_text;
found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
if (found_box) {
++box_count;
MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
} else {
full_text = "";
}
boxes.push_back(box);
texts.push_back(text);
full_texts.push_back(full_text);
} }
int box_count = boxes.size();
int box_failures = 0;
// Add an empty everything to the end.
boxes.push_back(TBOX());
texts.push_back(STRING());
full_texts.push_back(STRING());
// In word mode, we use the boxes to make a word for each box, but // In word mode, we use the boxes to make a word for each box, but
// in blob mode we use the existing words and maximally chop them first. // in blob mode we use the existing words and maximally chop them first.
PAGE_RES* page_res = find_segmentation ? PAGE_RES* page_res = find_segmentation ?
@ -239,7 +229,7 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
} }
} }
} }
PAGE_RES* page_res = new PAGE_RES(block_list, NULL); PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
PAGE_RES_IT pr_it(page_res); PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res; WERD_RES* word_res;
while ((word_res = pr_it.word()) != NULL) { while ((word_res = pr_it.word()) != NULL) {

View File

@ -69,16 +69,11 @@ const double kMinRefitXHeightFraction = 0.5;
namespace tesseract { namespace tesseract {
void Tesseract::recog_pseudo_word(PAGE_RES* page_res, void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
TBOX &selection_box) { TBOX &selection_box) {
WERD *word; PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
ROW *pseudo_row; // row of word if (it != NULL) {
BLOCK *pseudo_block; // block of word recog_interactive(it);
it->DeleteCurrentWord();
word = make_pseudo_word(page_res, selection_box, delete it;
pseudo_block, pseudo_row);
if (word != NULL) {
WERD_RES word_res(word);
recog_interactive(pseudo_block, pseudo_row, &word_res);
delete word;
} }
} }
@ -92,19 +87,22 @@ void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
* @param row row of word * @param row row of word
* @param word_res word to recognise * @param word_res word to recognise
*/ */
BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) { BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
inT16 char_qual; inT16 char_qual;
inT16 good_char_qual; inT16 good_char_qual;
WordData word_data(block, row, word_res); WordData word_data(*pr_it);
SetupWordPassN(2, &word_data); SetupWordPassN(2, &word_data);
classify_word_and_language(&Tesseract::classify_word_pass2, &word_data); classify_word_and_language(&Tesseract::classify_word_pass2, pr_it,
&word_data);
if (tessedit_debug_quality_metrics) { if (tessedit_debug_quality_metrics) {
word_char_quality(word_res, row, &char_qual, &good_char_qual); WERD_RES* word_res = pr_it->word();
tprintf word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n", tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
word_res->reject_map.length(), word_blob_quality(word_res, row), "char_quality: %d; good_char_quality: %d\n",
word_outline_errs(word_res), char_qual, good_char_qual); word_res->reject_map.length(),
word_blob_quality(word_res, pr_it->row()->row),
word_outline_errs(word_res), char_qual, good_char_qual);
} }
return TRUE; return TRUE;
} }
@ -163,8 +161,6 @@ void Tesseract::SetupAllWordsPassN(int pass_n,
PAGE_RES_IT page_res_it(page_res); PAGE_RES_IT page_res_it(page_res);
for (page_res_it.restart_page(); page_res_it.word() != NULL; for (page_res_it.restart_page(); page_res_it.word() != NULL;
page_res_it.forward()) { page_res_it.forward()) {
if (pass_n == 1)
page_res_it.word()->SetupFake(unicharset);
if (target_word_box == NULL || if (target_word_box == NULL ||
ProcessTargetWord(page_res_it.word()->word->bounding_box(), ProcessTargetWord(page_res_it.word()->word->bounding_box(),
*target_word_box, word_config, 1)) { *target_word_box, word_config, 1)) {
@ -180,33 +176,29 @@ void Tesseract::SetupAllWordsPassN(int pass_n,
// Sets up the single word ready for whichever engine is to be run. // Sets up the single word ready for whichever engine is to be run.
void Tesseract::SetupWordPassN(int pass_n, WordData* word) { void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
if (pass_n == 1 || !word->word->done || tessedit_training_tess) { if (pass_n == 1 || !word->word->done) {
if (pass_n == 2) { if (pass_n == 1) {
word->word->SetupForRecognition(unicharset, this, BestPix(),
tessedit_ocr_engine_mode, NULL,
classify_bln_numeric_mode,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
word->row, word->block);
} else if (pass_n == 2) {
// TODO(rays) Should we do this on pass1 too? // TODO(rays) Should we do this on pass1 too?
word->word->caps_height = 0.0; word->word->caps_height = 0.0;
if (word->word->x_height == 0.0f) if (word->word->x_height == 0.0f)
word->word->x_height = word->row->x_height(); word->word->x_height = word->row->x_height();
} }
// Cube doesn't get setup for pass2. for (int s = 0; s <= sub_langs_.size(); ++s) {
if (pass_n != 2 || tessedit_ocr_engine_mode != OEM_CUBE_ONLY) { // The sub_langs_.size() entry is for the master language.
word->word->SetupForRecognition( Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL, WERD_RES* word_res = new WERD_RES;
classify_bln_numeric_mode, textord_use_cjk_fp_model, word_res->InitForRetryRecognition(*word->word);
poly_allow_detailed_fx, word->row, word->block); word->lang_words.push_back(word_res);
} // Cube doesn't get setup for pass2.
} if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
if (!sub_langs_.empty()) { word_res->SetupForRecognition(
if (word->lang_words.size() != sub_langs_.size()) {
// Setup the words for all the sub-languages now.
WERD_RES empty;
word->lang_words.init_to_size(sub_langs_.size(), empty);
}
for (int s = 0; s < sub_langs_.size(); ++s) {
Tesseract* lang_t = sub_langs_[s];
if (pass_n == 1 || (lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY &&
(!word->lang_words[s].done || lang_t->tessedit_training_tess))) {
word->lang_words[s].InitForRetryRecognition(*word->word);
word->lang_words[s].SetupForRecognition(
lang_t->unicharset, lang_t, BestPix(), lang_t->unicharset, lang_t, BestPix(),
lang_t->tessedit_ocr_engine_mode, NULL, lang_t->tessedit_ocr_engine_mode, NULL,
lang_t->classify_bln_numeric_mode, lang_t->classify_bln_numeric_mode,
@ -217,17 +209,19 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
} }
} }
// Runs word recognition on all the words. // Runs word recognition on all the words.
bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
PAGE_RES_IT* pr_it,
GenericVector<WordData>* words) { GenericVector<WordData>* words) {
// TODO(rays) Before this loop can be parallelized (it would yield a massive // TODO(rays) Before this loop can be parallelized (it would yield a massive
// speed-up) all remaining member globals need to be converted to local/heap // speed-up) all remaining member globals need to be converted to local/heap
// (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
// added. The results will be significantly different with adaption on, and // added. The results will be significantly different with adaption on, and
// deterioration will need investigation. // deterioration will need investigation.
pr_it->restart_page();
for (int w = 0; w < words->size(); ++w) { for (int w = 0; w < words->size(); ++w) {
WordData* word = &(*words)[w]; WordData* word = &(*words)[w];
if (w > 0) word->prev_word = &(*words)[w - 1];
if (monitor != NULL) { if (monitor != NULL) {
monitor->ocr_alive = TRUE; monitor->ocr_alive = TRUE;
if (pass_n == 1) if (pass_n == 1)
@ -244,16 +238,26 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
return false; return false;
} }
} }
if (word->word->tess_failed) continue; if (word->word->tess_failed) {
int s;
for (s = 0; s < word->lang_words.size() &&
word->lang_words[s]->tess_failed; ++s) {}
// If all are failed, skip it. Image words are skipped by this test.
if (s > word->lang_words.size()) continue;
}
// Sync pr_it with the wth WordData.
while (pr_it->word() != NULL && pr_it->word() != word->word)
pr_it->forward();
ASSERT_HOST(pr_it->word() != NULL);
WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
: &Tesseract::classify_word_pass2; : &Tesseract::classify_word_pass2;
classify_word_and_language(recognizer, word); classify_word_and_language(recognizer, pr_it, word);
if (tessedit_dump_choices) { if (tessedit_dump_choices) {
word_dumper(NULL, word->row, word->word);
tprintf("Pass%d: %s [%s]\n", pass_n, tprintf("Pass%d: %s [%s]\n", pass_n,
word->word->best_choice->unichar_string().string(), word->word->best_choice->unichar_string().string(),
word->word->best_choice->debug_string().string()); word->word->best_choice->debug_string().string());
} }
pr_it->forward();
} }
return true; return true;
} }
@ -326,12 +330,12 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
most_recently_used_ = this; most_recently_used_ = this;
// Run pass 1 word recognition. // Run pass 1 word recognition.
if (!RecogAllWordsPassN(1, monitor, &words)) return false; if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
// Pass 1 post-processing. // Pass 1 post-processing.
while (page_res_it.word() != NULL) { for (page_res_it.restart_page(); page_res_it.word() != NULL;
page_res_it.forward()) {
if (page_res_it.word()->word->flag(W_REP_CHAR)) { if (page_res_it.word()->word->flag(W_REP_CHAR)) {
fix_rep_char(&page_res_it); fix_rep_char(&page_res_it);
page_res_it.forward();
continue; continue;
} }
@ -346,15 +350,14 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
page_res->misadaption_log.push_back( page_res->misadaption_log.push_back(
page_res_it.word()->blamer_bundle->misadaption_debug()); page_res_it.word()->blamer_bundle->misadaption_debug());
} }
page_res_it.forward();
} }
} }
if (dopasses == 1) return true; if (dopasses == 1) return true;
// ****************** Pass 2 ******************* // ****************** Pass 2 *******************
if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption) { if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
tessedit_ocr_engine_mode != OEM_CUBE_ONLY ) {
page_res_it.restart_page(); page_res_it.restart_page();
GenericVector<WordData> words; GenericVector<WordData> words;
SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words); SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
@ -363,17 +366,7 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
} }
most_recently_used_ = this; most_recently_used_ = this;
// Run pass 2 word recognition. // Run pass 2 word recognition.
if (!RecogAllWordsPassN(2, monitor, &words)) return false; if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
// Pass 2 post-processing.
while (page_res_it.word() != NULL) {
WERD_RES* word = page_res_it.word();
if (word->word->flag(W_REP_CHAR) && !word->done) {
fix_rep_char(&page_res_it);
page_res_it.forward();
continue;
}
page_res_it.forward();
}
} }
// The next passes can only be run if tesseract has been used, as cube // The next passes can only be run if tesseract has been used, as cube
@ -407,8 +400,8 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
// ****************** Pass 9 ******************* // ****************** Pass 9 *******************
// Check the correctness of the final results. // Check the correctness of the final results.
blamer_pass(page_res); blamer_pass(page_res);
script_pos_pass(page_res);
} }
script_pos_pass(page_res);
// Write results pass. // Write results pass.
set_global_loc_code(LOC_WRITE_RESULTS); set_global_loc_code(LOC_WRITE_RESULTS);
@ -745,166 +738,232 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) {
} }
} }
// Helper returns true if the new_word is better than the word, using a // Factored helper considers the indexed word and updates all the pointed
// simple test of better certainty AND rating (to reduce false positives // values.
// from cube) or a dictionary vs non-dictionary word. static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word, float* rating, float* certainty, bool* bad,
double rating_ratio, bool* valid_permuter, int* right, int* next_left) {
double certainty_margin) { *right = -MAX_INT32;
if (new_word.best_choice == NULL) { *next_left = MAX_INT32;
return false; // New one no good. if (index < words.size()) {
WERD_CHOICE* choice = words[index]->best_choice;
if (choice == NULL) {
*bad = true;
} else {
*rating += choice->rating();
*certainty = MIN(*certainty, choice->certainty());
if (!Dict::valid_word_permuter(choice->permuter(), false))
*valid_permuter = false;
}
*right = words[index]->word->bounding_box().right();
if (index + 1 < words.size())
*next_left = words[index + 1]->word->bounding_box().left();
} else {
*valid_permuter = false;
*bad = true;
} }
if (word.best_choice == NULL) { }
return true; // Old one no good.
// Helper chooses the best combination of words, transferring good ones from
// new_words to best_words. To win, a new word must have (better rating and
// certainty) or (better permuter status and rating within rating ratio and
// certainty within certainty margin) than current best.
// All the new_words are consumed (moved to best_words or deleted.)
// The return value is the number of new_words used minus the number of
// best_words that remain in the output.
static int SelectBestWords(double rating_ratio,
double certainty_margin,
bool debug,
PointerVector<WERD_RES>* new_words,
PointerVector<WERD_RES>* best_words) {
// Process the smallest groups of words that have an overlapping word
// boundary at the end.
GenericVector<WERD_RES*> out_words;
// Index into each word vector (best, new).
int b = 0, n = 0;
int num_best = 0, num_new = 0;
while (b < best_words->size() || n < new_words->size()) {
// Start of the current run in each.
int start_b = b, start_n = n;
// Rating of the current run in each.
float b_rating = 0.0f, n_rating = 0.0f;
// Certainty of the current run in each.
float b_certainty = 0.0f, n_certainty = 0.0f;
// True if any word is missing its best choice.
bool b_bad = false, n_bad = false;
// True if all words have a valid permuter.
bool b_valid_permuter = true, n_valid_permuter = true;
while (b < best_words->size() || n < new_words->size()) {
int b_right = -MAX_INT32;
int next_b_left = MAX_INT32;
EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
&b_valid_permuter, &b_right, &next_b_left);
int n_right = -MAX_INT32;
int next_n_left = MAX_INT32;
EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
&n_valid_permuter, &n_right, &next_n_left);
if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
// The word breaks overlap. [start_b,b] and [start_n, n] match.
break;
}
// Keep searching for the matching word break.
if ((b_right < n_right && b < best_words->size()) ||
n == new_words->size())
++b;
else
++n;
}
bool new_better = false;
if (!n_bad && (b_bad || (n_certainty > b_certainty &&
n_rating < b_rating) ||
(!b_valid_permuter && n_valid_permuter &&
n_rating < b_rating * rating_ratio &&
n_certainty > b_certainty - certainty_margin))) {
// New is better.
for (int i = start_n; i <= n; ++i) {
out_words.push_back((*new_words)[i]);
(*new_words)[i] = NULL;
++num_new;
}
new_better = true;
} else if (!b_bad) {
// Current best is better.
for (int i = start_b; i <= b; ++i) {
out_words.push_back((*best_words)[i]);
(*best_words)[i] = NULL;
++num_best;
}
}
int end_b = b < best_words->size() ? b + 1 : b;
int end_n = n < new_words->size() ? n + 1 : n;
if (debug) {
tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
" valid dict: %d v %d\n",
end_n - start_n, new_better ? "better" : "worse",
end_b - start_b, n_rating, b_rating,
n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
}
// Move on to the next group.
b = end_b;
n = end_n;
} }
if (new_word.best_choice->certainty() > word.best_choice->certainty() && // Transfer from out_words to best_words.
new_word.best_choice->rating() < word.best_choice->rating()) { best_words->clear();
return true; // New word has better confidence. for (int i = 0; i < out_words.size(); ++i)
} best_words->push_back(out_words[i]);
if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) && return num_new - num_best;
Dict::valid_word_permuter(new_word.best_choice->permuter(), false) &&
new_word.best_choice->rating() <
word.best_choice->rating() * rating_ratio &&
new_word.best_choice->certainty() >
word.best_choice->certainty() - certainty_margin) {
return true; // New word is from a dictionary.
}
return false; // New word is no better.
} }
// Helper to recognize the word using the given (language-specific) tesseract. // Helper to recognize the word using the given (language-specific) tesseract.
// Returns true if the result was better than previously. // Returns positive if this recognizer found more new best words than the
bool Tesseract::RetryWithLanguage(const WERD_RES& best_word, // number kept from best_words.
WordData* word_data, WERD_RES* word, int Tesseract::RetryWithLanguage(const WordData& word_data,
WordRecognizer recognizer) { WordRecognizer recognizer,
if (classify_debug_level || cube_debug_level) { WERD_RES** in_word,
tprintf("Retrying word using lang %s, oem %d\n", PointerVector<WERD_RES>* best_words) {
bool debug = classify_debug_level || cube_debug_level;
if (debug) {
tprintf("Trying word using lang %s, oem %d\n",
lang.string(), static_cast<int>(tessedit_ocr_engine_mode)); lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
} }
// Run the recognizer on the word. // Run the recognizer on the word.
PointerVector<WERD_RES> new_words;
(this->*recognizer)(word_data, in_word, &new_words);
if (new_words.empty()) {
// Transfer input word to new_words, as the classifier must have put
// the result back in the input.
new_words.push_back(*in_word);
*in_word = NULL;
}
if (debug) {
for (int i = 0; i < new_words.size(); ++i)
new_words[i]->DebugTopChoice("Lang result");
}
// Initial version is a bit of a hack based on better certainty and rating // Initial version is a bit of a hack based on better certainty and rating
// (to reduce false positives from cube) or a dictionary vs non-dictionary // (to reduce false positives from cube) or a dictionary vs non-dictionary
// word. // word.
(this->*recognizer)(word_data, word); return SelectBestWords(classify_max_rating_ratio,
bool new_is_better = NewWordBetter(best_word, *word, classify_max_certainty_margin,
classify_max_rating_ratio, debug, &new_words, best_words);
classify_max_certainty_margin); }
if (classify_debug_level || cube_debug_level) {
if (word->best_choice == NULL) { // Helper returns true if all the words are acceptable.
tprintf("NULL result %s better!\n", static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
new_is_better ? "IS" : "NOT"); for (int w = 0; w < words.size(); ++w) {
} else { if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
tprintf("New result %s better:%s, r=%g, c=%g\n",
new_is_better ? "IS" : "NOT",
word->best_choice->unichar_string().string(),
word->best_choice->rating(),
word->best_choice->certainty());
}
} }
return new_is_better; return true;
} }
// Generic function for classifying a word. Can be used either for pass1 or // Generic function for classifying a word. Can be used either for pass1 or
// pass2 according to the function passed to recognizer. // pass2 according to the function passed to recognizer.
// word block and row are the current location in the document's PAGE_RES. // word_data holds the word to be recognized, and its block and row, and
// pr_it points to the word as well, in case we are running LSTM and it wants
// to output multiple words.
// Recognizes in the current language, and if successful that is all. // Recognizes in the current language, and if successful that is all.
// If recognition was not successful, tries all available languages until // If recognition was not successful, tries all available languages until
// it gets a successful result or runs out of languages. Keeps the best result. // it gets a successful result or runs out of languages. Keeps the best result.
void Tesseract::classify_word_and_language(WordRecognizer recognizer, void Tesseract::classify_word_and_language(WordRecognizer recognizer,
PAGE_RES_IT* pr_it,
WordData* word_data) { WordData* word_data) {
// Best result so far.
PointerVector<WERD_RES> best_words;
// Points to the best result. May be word or in lang_words. // Points to the best result. May be word or in lang_words.
WERD_RES* word = word_data->word; WERD_RES* word = word_data->word;
clock_t start_t = clock(); clock_t start_t = clock();
if (classify_debug_level || cube_debug_level) { if (classify_debug_level || cube_debug_level) {
tprintf("Processing word with lang %s at:", tprintf("%s word with lang %s at:",
word->done ? "Already done" : "Processing",
most_recently_used_->lang.string()); most_recently_used_->lang.string());
word->word->bounding_box().print(); word->word->bounding_box().print();
} }
const char* result_type = "Initial"; if (word->done) {
bool initially_done = !word->tess_failed && word->done;
if (initially_done) {
// If done on pass1, leave it as-is. // If done on pass1, leave it as-is.
most_recently_used_ = word->tesseract; if (!word->tess_failed)
result_type = "Already done"; most_recently_used_ = word->tesseract;
} else { return;
if (most_recently_used_ != this) {
// Point to the word for most_recently_used_.
for (int s = 0; s < sub_langs_.size(); ++s) {
if (most_recently_used_ == sub_langs_[s]) {
word = &word_data->lang_words[s];
break;
}
}
}
(most_recently_used_->*recognizer)(word_data, word);
if (!word->tess_failed && word->tess_accepted)
result_type = "Accepted";
} }
if (classify_debug_level || cube_debug_level) { int sub = sub_langs_.size();
tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d" if (most_recently_used_ != this) {
" xht=[%g,%g]\n", // Get the index of the most_recently_used_.
result_type, for (sub = 0; sub < sub_langs_.size() &&
word->best_choice->unichar_string().string(), most_recently_used_ != sub_langs_[sub]; ++sub) {}
word->best_choice->rating(),
word->best_choice->certainty(),
word->tess_accepted, word->tess_would_adapt,
word->best_choice->min_x_height(),
word->best_choice->max_x_height());
} }
if (word->tess_failed || !word->tess_accepted) { most_recently_used_->RetryWithLanguage(
*word_data, recognizer, &word_data->lang_words[sub], &best_words);
Tesseract* best_lang_tess = most_recently_used_;
if (!WordsAcceptable(best_words)) {
// Try all the other languages to see if they are any better. // Try all the other languages to see if they are any better.
Tesseract* previous_used = most_recently_used_; if (most_recently_used_ != this &&
if (most_recently_used_ != this) { this->RetryWithLanguage(*word_data, recognizer,
if (classify_debug_level) { &word_data->lang_words[sub_langs_.size()],
tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string()); &best_words) > 0) {
} best_lang_tess = this;
if (word_data->word->tesseract == this) {
// This is pass1, and we are trying the main language.
if (RetryWithLanguage(*word, word_data, word_data->word, recognizer)) {
most_recently_used_ = this;
word = word_data->word;
}
} else {
// This is pass2, and we are trying the main language again, but it
// has no word allocated to it, so we must re-initialize it.
WERD_RES main_word(*word_data->word);
main_word.InitForRetryRecognition(*word_data->word);
main_word.SetupForRecognition(unicharset, this, BestPix(),
tessedit_ocr_engine_mode, NULL,
classify_bln_numeric_mode,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
word_data->row, word_data->block);
if (RetryWithLanguage(*word, word_data, &main_word, recognizer)) {
most_recently_used_ = this;
word_data->word->ConsumeWordResults(&main_word);
word = word_data->word;
}
}
if (!word->tess_failed && word->tess_accepted)
return; // No need to look at the others.
} }
for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
for (int i = 0; i < sub_langs_.size(); ++i) { ++i) {
if (sub_langs_[i] != previous_used) { if (most_recently_used_ != sub_langs_[i] &&
if (classify_debug_level) { sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,
tprintf("Retrying with sub-Tesseract[%d] lang: %s\n", &word_data->lang_words[i],
i, sub_langs_[i]->lang.string()); &best_words) > 0) {
} best_lang_tess = sub_langs_[i];
if (sub_langs_[i]->RetryWithLanguage(*word, word_data,
&word_data->lang_words[i],
recognizer)) {
most_recently_used_ = sub_langs_[i];
word = &word_data->lang_words[i];
if (!word->tess_failed && word->tess_accepted)
break; // No need to look at the others.
}
} }
} }
} }
if (word != word_data->word) { most_recently_used_ = best_lang_tess;
// Move the result for the best language to the main word. if (!best_words.empty()) {
word_data->word->ConsumeWordResults(word); if (best_words.size() == 1 && !best_words[0]->combination) {
// Move the best single result to the main word.
word_data->word->ConsumeWordResults(best_words[0]);
} else {
// Words came from LSTM, and must be moved to the PAGE_RES properly.
word_data->word = best_words.back();
pr_it->ReplaceCurrentWord(&best_words);
}
ASSERT_HOST(word_data->word->box_word != NULL);
} else {
tprintf("no best words!!\n");
} }
clock_t ocr_t = clock(); clock_t ocr_t = clock();
if (tessedit_timing_debug) { if (tessedit_timing_debug) {
@ -920,16 +979,19 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
* Baseline normalize the word and pass it to Tess. * Baseline normalize the word and pass it to Tess.
*/ */
void Tesseract::classify_word_pass1(WordData* word_data, WERD_RES* word) { void Tesseract::classify_word_pass1(const WordData& word_data,
ROW* row = word_data->row; WERD_RES** in_word,
BLOCK* block = word_data->block; PointerVector<WERD_RES>* out_words) {
prev_word_best_choice_ = word_data->prev_word != NULL ROW* row = word_data.row;
? word_data->prev_word->word->best_choice : NULL; BLOCK* block = word_data.block;
prev_word_best_choice_ = word_data.prev_word != NULL
? word_data.prev_word->word->best_choice : NULL;
// If we only intend to run cube - run it and return. // If we only intend to run cube - run it and return.
if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
cube_word_pass1(block, row, word); cube_word_pass1(block, row, *in_word);
return; return;
} }
WERD_RES* word = *in_word;
match_word_pass_n(1, word, row, block); match_word_pass_n(1, word, row, block);
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
word->tess_would_adapt = AdaptableWord(word); word->tess_would_adapt = AdaptableWord(word);
@ -1027,19 +1089,23 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
* Control what to do with the word in pass 2 * Control what to do with the word in pass 2
*/ */
void Tesseract::classify_word_pass2(WordData* word_data, WERD_RES* word) { void Tesseract::classify_word_pass2(const WordData& word_data,
WERD_RES** in_word,
PointerVector<WERD_RES>* out_words) {
// Return if we do not want to run Tesseract. // Return if we do not want to run Tesseract.
if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY &&
tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED) tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED &&
word_data.word->best_choice != NULL)
return; return;
ROW* row = word_data->row; ROW* row = word_data.row;
BLOCK* block = word_data->block; BLOCK* block = word_data.block;
prev_word_best_choice_ = word_data->prev_word != NULL WERD_RES* word = *in_word;
? word_data->prev_word->word->best_choice : NULL; prev_word_best_choice_ = word_data.prev_word != NULL
? word_data.prev_word->word->best_choice : NULL;
set_global_subloc_code(SUBLOC_NORM); set_global_subloc_code(SUBLOC_NORM);
check_debug_pt(word, 30); check_debug_pt(word, 30);
if (!word->done || tessedit_training_tess) { if (!word->done) {
word->caps_height = 0.0; word->caps_height = 0.0;
if (word->x_height == 0.0f) if (word->x_height == 0.0f)
word->x_height = row->x_height(); word->x_height = row->x_height();
@ -1161,11 +1227,9 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
const WERD_CHOICE &word = *(word_res->best_choice); const WERD_CHOICE &word = *(word_res->best_choice);
// Find the frequency of each unique character in the word. // Find the frequency of each unique character in the word.
UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
SortHelper<UNICHAR_ID> rep_ch(word.length()); SortHelper<UNICHAR_ID> rep_ch(word.length());
for (int i = 0; i < word.length(); ++i) { for (int i = 0; i < word.length(); ++i) {
if (word.unichar_id(i) != space) rep_ch.Add(word.unichar_id(i), 1);
rep_ch.Add(word.unichar_id(i), 1);
} }
// Find the most frequent result. // Find the most frequent result.
@ -1194,51 +1258,9 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
++gap_count; ++gap_count;
prev_blob = blob; prev_blob = blob;
} }
if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) { // Just correct existing classification.
// Needs spaces between. CorrectRepcharChoices(best_choice, word_res);
ExplodeRepeatedWord(best_choice, page_res_it); word_res->reject_map.initialise(word.length());
} else {
// Just correct existing classification.
CorrectRepcharChoices(best_choice, word_res);
word_res->reject_map.initialise(word.length());
}
}
// Explode the word at the given iterator location into individual words
// of a single given unichar_id defined by best_choice.
// The original word is deleted, and the replacements copy most of their
// fields from the original.
void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
PAGE_RES_IT* page_res_it) {
WERD_RES *word_res = page_res_it->word();
ASSERT_HOST(best_choice != NULL);
// Make a new word for each blob in the original.
WERD* werd = word_res->word;
C_BLOB_IT blob_it(werd->cblob_list());
for (; !blob_it.empty(); blob_it.forward()) {
bool first_blob = blob_it.at_first();
bool last_blob = blob_it.at_last();
WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
blob_it.extract());
// Note that blamer_bundle (truth information) is not copied, which is
// desirable, since the newly inserted words would not have the original
// bounding box corresponding to the one recorded in truth fields.
WERD_RES* rep_word =
page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
// Setup the single char WERD_RES
if (rep_word->SetupForRecognition(*word_res->uch_set, this, BestPix(),
tessedit_ocr_engine_mode, NULL, false,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
page_res_it->row()->row,
page_res_it->block()->block)) {
rep_word->CloneChoppedToRebuild();
BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
rep_word->FakeClassifyWord(1, &blob_choice);
}
}
page_res_it->DeleteCurrentWord();
} }
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
@ -1405,16 +1427,19 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
show_map_detail = TRUE; show_map_detail = TRUE;
break; break;
} }
tprintf(" \"%s\" ", if (word->best_choice != NULL) {
word->best_choice->unichar_string().string()); tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
word->reject_map.print (debug_fp); word->reject_map.print(debug_fp);
tprintf ("\n"); tprintf("\n");
if (show_map_detail) { if (show_map_detail) {
tprintf ("\"%s\"\n", word->best_choice->unichar_string().string()); tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
word->reject_map[i].full_print(debug_fp); word->reject_map[i].full_print(debug_fp);
}
} }
} else {
tprintf("null best choice\n");
} }
tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");

View File

@ -205,7 +205,8 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
if ((!word->part_of_combo) && (word->box_word == NULL)) { if ((!word->part_of_combo) && (word->box_word == NULL)) {
WordData word_data(block, row, word); WordData word_data(block, row, word);
SetupWordPassN(2, &word_data); SetupWordPassN(2, &word_data);
classify_word_and_language(&Tesseract::classify_word_pass2, &word_data); classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
&word_data);
} }
prev_word_best_choice_ = word->best_choice; prev_word_best_choice_ = word->best_choice;
} }

View File

@ -30,15 +30,12 @@ namespace tesseract {
void Tesseract::process_selected_words( void Tesseract::process_selected_words(
PAGE_RES* page_res, // blocks to check PAGE_RES* page_res, // blocks to check
TBOX & selection_box, TBOX & selection_box,
BOOL8(tesseract::Tesseract::*word_processor)( // function to call BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)) {
BLOCK* block, ROW* row, WERD_RES* word_res)) {
for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL; for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
page_res_it.forward()) { page_res_it.forward()) {
WERD* word = page_res_it.word()->word; WERD* word = page_res_it.word()->word;
if (word->bounding_box().overlap(selection_box)) { if (word->bounding_box().overlap(selection_box)) {
if (!((this->*word_processor)(page_res_it.block()->block, if (!(this->*word_processor)(&page_res_it))
page_res_it.row()->row,
page_res_it.word())))
return; return;
} }
} }

View File

@ -39,13 +39,11 @@ void Tesseract::PrerecAllWordsPar(const GenericVector<WordData>& words) {
for (int w = 0; w < words.size(); ++w) { for (int w = 0; w < words.size(); ++w) {
if (words[w].word->ratings != NULL && if (words[w].word->ratings != NULL &&
words[w].word->ratings->get(0, 0) == NULL) { words[w].word->ratings->get(0, 0) == NULL) {
for (int b = 0; b < words[w].word->chopped_word->NumBlobs(); ++b) {
blobs.push_back(BlobData(b, this, *words[w].word));
}
for (int s = 0; s < words[w].lang_words.size(); ++s) { for (int s = 0; s < words[w].lang_words.size(); ++s) {
const WERD_RES& word = words[w].lang_words[s]; Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
const WERD_RES& word = *words[w].lang_words[s];
for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) { for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
blobs.push_back(BlobData(b, sub_langs_[s], word)); blobs.push_back(BlobData(b, sub, word));
} }
} }
} }

View File

@ -306,10 +306,7 @@ SVMenuNode *Tesseract::build_menu_new() {
* Redisplay page * Redisplay page
*/ */
void Tesseract::do_re_display( void Tesseract::do_re_display(
BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block, BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it)) {
ROW* row,
WERD_RES* word_res)) {
PAGE_RES_IT pr_it(current_page_res);
int block_count = 1; int block_count = 1;
image_win->Clear(); image_win->Clear();
@ -317,8 +314,9 @@ void Tesseract::do_re_display(
image_win->Image(pix_binary_, 0, 0); image_win->Image(pix_binary_, 0, 0);
} }
PAGE_RES_IT pr_it(current_page_res);
for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) { for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
(this->*word_painter)(pr_it.block()->block, pr_it.row()->row, word); (this->*word_painter)(&pr_it);
if (display_baselines && pr_it.row() != pr_it.prev_row()) if (display_baselines && pr_it.row() != pr_it.prev_row())
pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN); pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
if (display_blocks && pr_it.block() != pr_it.prev_block()) if (display_blocks && pr_it.block() != pr_it.prev_block())
@ -714,11 +712,10 @@ void show_point(PAGE_RES* page_res, float x, float y) {
#endif // GRAPHICS_DISABLED #endif // GRAPHICS_DISABLED
namespace tesseract { namespace tesseract {
#ifndef GRAPHICS_DISABLED #ifndef GRAPHICS_DISABLED
BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row, BOOL8 Tesseract:: word_blank_and_set_display(PAGE_RES_IT* pr_it) {
WERD_RES* word_res) { pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
word_res->word->bounding_box().plot(image_win, ScrollView::BLACK, ScrollView::BLACK);
ScrollView::BLACK); return word_set_display(pr_it);
return word_set_display(block, row, word_res);
} }
@ -727,7 +724,8 @@ BOOL8 Tesseract:: word_blank_and_set_display(BLOCK* block, ROW* row,
* *
* Normalize word and display in word window * Normalize word and display in word window
*/ */
BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) { BOOL8 Tesseract::word_bln_display(PAGE_RES_IT* pr_it) {
WERD_RES* word_res = pr_it->word();
if (word_res->chopped_word == NULL) { if (word_res->chopped_word == NULL) {
// Setup word normalization parameters. // Setup word normalization parameters.
word_res->SetupForRecognition(unicharset, this, BestPix(), word_res->SetupForRecognition(unicharset, this, BestPix(),
@ -735,7 +733,7 @@ BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
classify_bln_numeric_mode, classify_bln_numeric_mode,
textord_use_cjk_fp_model, textord_use_cjk_fp_model,
poly_allow_detailed_fx, poly_allow_detailed_fx,
row, block); pr_it->row()->row, pr_it->block()->block);
} }
bln_word_window_handle()->Clear(); bln_word_window_handle()->Clear();
display_bln_lines(bln_word_window_handle(), ScrollView::CYAN, display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
@ -758,7 +756,8 @@ BOOL8 Tesseract::word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
* *
* Display a word according to its display modes * Display a word according to its display modes
*/ */
BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) { BOOL8 Tesseract::word_display(PAGE_RES_IT* pr_it) {
WERD_RES* word_res = pr_it->word();
WERD* word = word_res->word; WERD* word = word_res->word;
TBOX word_bb; // word bounding box TBOX word_bb; // word bounding box
int word_height; // ht of word BB int word_height; // ht of word BB
@ -918,14 +917,15 @@ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
* *
* Dump members to the debug window * Dump members to the debug window
*/ */
BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) { BOOL8 Tesseract::word_dumper(PAGE_RES_IT* pr_it) {
if (block != NULL) { if (pr_it->block()->block != NULL) {
tprintf("\nBlock data...\n"); tprintf("\nBlock data...\n");
block->print(NULL, FALSE); pr_it->block()->block->print(NULL, FALSE);
} }
tprintf("\nRow data...\n"); tprintf("\nRow data...\n");
row->print(NULL); pr_it->row()->row->print(NULL);
tprintf("\nWord data...\n"); tprintf("\nWord data...\n");
WERD_RES* word_res = pr_it->word();
word_res->word->print(); word_res->word->print();
if (word_res->blamer_bundle != NULL && wordrec_debug_blamer && if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) { word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
@ -941,8 +941,8 @@ BOOL8 Tesseract::word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res) {
* *
* Display word according to current display mode settings * Display word according to current display mode settings
*/ */
BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) { BOOL8 Tesseract::word_set_display(PAGE_RES_IT* pr_it) {
WERD* word = word_res->word; WERD* word = pr_it->word()->word;
word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX)); word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT)); word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL)); word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
@ -950,26 +950,24 @@ BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
word->set_display_flag(DF_BN_POLYGONAL, word->set_display_flag(DF_BN_POLYGONAL,
word_display_mode.bit(DF_BN_POLYGONAL)); word_display_mode.bit(DF_BN_POLYGONAL));
word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER)); word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));
return word_display(block, row, word_res); return word_display(pr_it);
} }
// page_res is non-const because the iterator doesn't know if you are going // page_res is non-const because the iterator doesn't know if you are going
// to change the items it points to! Really a const here though. // to change the items it points to! Really a const here though.
void Tesseract::blob_feature_display(PAGE_RES* page_res, void Tesseract::blob_feature_display(PAGE_RES* page_res,
const TBOX& selection_box) { const TBOX& selection_box) {
ROW* row; // row of word PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
BLOCK* block; // block of word if (it != NULL) {
WERD* word = make_pseudo_word(page_res, selection_box, block, row); WERD_RES* word_res = it->word();
if (word != NULL) { word_res->x_height = it->row()->row->x_height();
WERD_RES word_res(word); word_res->SetupForRecognition(unicharset, this, BestPix(),
word_res.x_height = row->x_height(); tessedit_ocr_engine_mode, NULL,
word_res.SetupForRecognition(unicharset, this, BestPix(), classify_bln_numeric_mode,
tessedit_ocr_engine_mode, NULL, textord_use_cjk_fp_model,
classify_bln_numeric_mode, poly_allow_detailed_fx,
textord_use_cjk_fp_model, it->row()->row, it->block()->block);
poly_allow_detailed_fx, TWERD* bln_word = word_res->chopped_word;
row, block);
TWERD* bln_word = word_res.chopped_word;
TBLOB* bln_blob = bln_word->blobs[0]; TBLOB* bln_blob = bln_word->blobs[0];
INT_FX_RESULT_STRUCT fx_info; INT_FX_RESULT_STRUCT fx_info;
GenericVector<INT_FEATURE_STRUCT> bl_features; GenericVector<INT_FEATURE_STRUCT> bl_features;
@ -989,7 +987,8 @@ void Tesseract::blob_feature_display(PAGE_RES* page_res,
RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN); RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
cn_win->Update(); cn_win->Update();
delete word; it->DeleteCurrentWord();
delete it;
} }
} }

View File

@ -51,15 +51,11 @@ FILE *Tesseract::init_recog_training(const STRING &fname) {
// Copies the bounding box from page_res_it->word() to the given TBOX. // Copies the bounding box from page_res_it->word() to the given TBOX.
bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
while (page_res_it->block() != NULL) { while (page_res_it->block() != NULL && page_res_it->word() == NULL)
if (page_res_it->word() != NULL)
break;
page_res_it->forward(); page_res_it->forward();
}
if (page_res_it->word() != NULL) { if (page_res_it->word() != NULL) {
*tbox = page_res_it->word()->word->bounding_box(); *tbox = page_res_it->word()->word->bounding_box();
page_res_it->forward();
// If tbox->left() is negative, the training image has vertical text and // If tbox->left() is negative, the training image has vertical text and
// all the coordinates of bounding boxes of page_res are rotated by 90 // all the coordinates of bounding boxes of page_res are rotated by 90
@ -109,26 +105,34 @@ void Tesseract::recog_training_segmented(const STRING &fname,
// Align bottom left points of the TBOXes. // Align bottom left points of the TBOXes.
while (keep_going && while (keep_going &&
!NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
keep_going = (bbox.bottom() < tbox.bottom()) ? if (bbox.bottom() < tbox.bottom()) {
read_t(&page_res_it, &tbox) : page_res_it.forward();
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); keep_going = read_t(&page_res_it, &tbox);
} else {
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
&bbox);
}
} }
while (keep_going && while (keep_going &&
!NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) : if (bbox.left() > tbox.left()) {
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); page_res_it.forward();
keep_going = read_t(&page_res_it, &tbox);
} else {
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
&bbox);
}
} }
// OCR the word if top right points of the TBOXes are similar. // OCR the word if top right points of the TBOXes are similar.
if (keep_going && if (keep_going &&
NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) { NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
ambigs_classify_and_output(page_res_it.prev_word(), ambigs_classify_and_output(label.string(), &page_res_it, output_file);
page_res_it.prev_row(),
page_res_it.prev_block(),
label.string(), output_file);
examined_words++; examined_words++;
} }
page_res_it.forward();
} while (keep_going); } while (keep_going);
fclose(box_file);
// Set up scripts on all of the words that did not get sent to // Set up scripts on all of the words that did not get sent to
// ambigs_classify_and_output. They all should have, but if all the // ambigs_classify_and_output. They all should have, but if all the
@ -196,16 +200,16 @@ static void PrintMatrixPaths(int col, int dim,
// raw choice as a result of the classification. For words labeled with a // raw choice as a result of the classification. For words labeled with a
// single unichar also outputs all alternatives from blob_choices of the // single unichar also outputs all alternatives from blob_choices of the
// best choice. // best choice.
void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res, void Tesseract::ambigs_classify_and_output(const char *label,
ROW_RES *row_res, PAGE_RES_IT* pr_it,
BLOCK_RES *block_res,
const char *label,
FILE *output_file) { FILE *output_file) {
// Classify word. // Classify word.
fflush(stdout); fflush(stdout);
WordData word_data(block_res->block, row_res->row, werd_res); WordData word_data(*pr_it);
SetupWordPassN(1, &word_data); SetupWordPassN(1, &word_data);
classify_word_pass1(&word_data, werd_res); classify_word_and_language(&Tesseract::classify_word_pass1,
pr_it, &word_data);
WERD_RES* werd_res = word_data.word;
WERD_CHOICE *best_choice = werd_res->best_choice; WERD_CHOICE *best_choice = werd_res->best_choice;
ASSERT_HOST(best_choice != NULL); ASSERT_HOST(best_choice != NULL);

View File

@ -96,8 +96,6 @@ Tesseract::Tesseract()
" whose outlines overlap horizontally.", this->params()), " whose outlines overlap horizontally.", this->params()),
BOOL_MEMBER(tessedit_display_outwords, false, BOOL_MEMBER(tessedit_display_outwords, false,
"Draw output words", this->params()), "Draw output words", this->params()),
BOOL_MEMBER(tessedit_training_tess, false,
"Call Tess to learn blobs", this->params()),
BOOL_MEMBER(tessedit_dump_choices, false, BOOL_MEMBER(tessedit_dump_choices, false,
"Dump char choices", this->params()), "Dump char choices", this->params()),
BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
@ -315,16 +313,6 @@ Tesseract::Tesseract()
"Write .html hOCR output file", this->params()), "Write .html hOCR output file", this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, BOOL_MEMBER(tessedit_create_pdf, false,
"Write .pdf output file", this->params()), "Write .pdf output file", this->params()),
INT_MEMBER(tessedit_pdf_compression, 0,
"Type of image compression in pdf output: "
"0 - autoselection (default); "
"1 - jpeg; "
"2 - G4; "
"3 - flate",
this->params()),
INT_MEMBER(tessedit_pdf_jpg_quality, 85,
"Quality level of jpeg image compression in pdf output",
this->params()),
STRING_MEMBER(unrecognised_char, "|", STRING_MEMBER(unrecognised_char, "|",
"Output char for unidentified blobs", this->params()), "Output char for unidentified blobs", this->params()),
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),

View File

@ -31,20 +31,20 @@
#include "textord.h" #include "textord.h"
#include "wordrec.h" #include "wordrec.h"
class PAGE_RES; class BLOB_CHOICE_LIST_CLIST;
class PAGE_RES_IT;
class BLOCK_LIST; class BLOCK_LIST;
class CharSamp; class CharSamp;
class TO_BLOCK_LIST;
class WERD_RES;
class ROW;
class TBOX;
class SVMenuNode;
struct Pix;
class WERD_CHOICE;
class WERD;
class BLOB_CHOICE_LIST_CLIST;
struct OSResults; struct OSResults;
class PAGE_RES;
class PAGE_RES_IT;
struct Pix;
class ROW;
class SVMenuNode;
class TBOX;
class TO_BLOCK_LIST;
class WERD;
class WERD_CHOICE;
class WERD_RES;
// Top-level class for all tesseract global instance data. // Top-level class for all tesseract global instance data.
@ -144,10 +144,19 @@ struct WordData {
ROW* row; ROW* row;
BLOCK* block; BLOCK* block;
WordData* prev_word; WordData* prev_word;
GenericVector<WERD_RES> lang_words; PointerVector<WERD_RES> lang_words;
}; };
typedef void (Tesseract::*WordRecognizer)(WordData* word_data, WERD_RES* word); // Definition of a Tesseract WordRecognizer. The WordData provides the context
// of row/block, in_word holds an initialized, possibly pre-classified word,
// that the recognizer may or may not consume (but if so it sets *in_word=NULL)
// and produces one or more output words in out_words, which may be the
// consumed in_word, or may be generated independently.
// This api allows both a conventional tesseract classifier to work, or a
// line-level classifier that generates multiple words from a merged input.
typedef void (Tesseract::*WordRecognizer)(const WordData& word_data,
WERD_RES** in_word,
PointerVector<WERD_RES>* out_words);
class Tesseract : public Wordrec { class Tesseract : public Wordrec {
public: public:
@ -279,6 +288,7 @@ class Tesseract : public Wordrec {
void SetupWordPassN(int pass_n, WordData* word); void SetupWordPassN(int pass_n, WordData* word);
// Runs word recognition on all the words. // Runs word recognition on all the words.
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
PAGE_RES_IT* pr_it,
GenericVector<WordData>* words); GenericVector<WordData>* words);
bool recog_all_words(PAGE_RES* page_res, bool recog_all_words(PAGE_RES* page_res,
ETEXT_DESC* monitor, ETEXT_DESC* monitor,
@ -294,28 +304,35 @@ class Tesseract : public Wordrec {
// Sets script positions and detects smallcaps on all output words. // Sets script positions and detects smallcaps on all output words.
void script_pos_pass(PAGE_RES* page_res); void script_pos_pass(PAGE_RES* page_res);
// Helper to recognize the word using the given (language-specific) tesseract. // Helper to recognize the word using the given (language-specific) tesseract.
// Returns true if the result was better than previously. // Returns positive if this recognizer found more new best words than the
bool RetryWithLanguage(const WERD_RES& best_word, WordData* word_data, // number kept from best_words.
WERD_RES* word, WordRecognizer recognizer); int RetryWithLanguage(const WordData& word_data,
WordRecognizer recognizer,
WERD_RES** in_word,
PointerVector<WERD_RES>* best_words);
void classify_word_and_language(WordRecognizer recognizer, void classify_word_and_language(WordRecognizer recognizer,
PAGE_RES_IT* pr_it,
WordData* word_data); WordData* word_data);
void classify_word_pass1(WordData* word_data, WERD_RES* word); void classify_word_pass1(const WordData& word_data,
WERD_RES** in_word,
PointerVector<WERD_RES>* out_words);
void recog_pseudo_word(PAGE_RES* page_res, // blocks to check void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
TBOX &selection_box); TBOX &selection_box);
void fix_rep_char(PAGE_RES_IT* page_res_it); void fix_rep_char(PAGE_RES_IT* page_res_it);
void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it);
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set, ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
const char *s, const char *s,
const char *lengths); const char *lengths);
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block); void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
void classify_word_pass2(WordData* word_data, WERD_RES* word); void classify_word_pass2(const WordData& word_data,
WERD_RES** in_word,
PointerVector<WERD_RES>* out_words);
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
WERD_RES* word, WERD_RES* new_word); WERD_RES* word, WERD_RES* new_word);
bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row); bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row); bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res); BOOL8 recog_interactive(PAGE_RES_IT* pr_it);
// Set fonts of this word. // Set fonts of this word.
void set_word_fonts(WERD_RES *word); void set_word_fonts(WERD_RES *word);
@ -473,15 +490,13 @@ class Tesseract : public Wordrec {
); );
void debug_word(PAGE_RES* page_res, const TBOX &selection_box); void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
void do_re_display( void do_re_display(
BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block, BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
ROW* row, BOOL8 word_display(PAGE_RES_IT* pr_it);
WERD_RES* word_res)); BOOL8 word_bln_display(PAGE_RES_IT* pr_it);
BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res); BOOL8 word_blank_and_set_display(PAGE_RES_IT* pr_its);
BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res); BOOL8 word_set_display(PAGE_RES_IT* pr_it);
BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
// #ifndef GRAPHICS_DISABLED // #ifndef GRAPHICS_DISABLED
BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res); BOOL8 word_dumper(PAGE_RES_IT* pr_it);
// #endif // GRAPHICS_DISABLED // #endif // GRAPHICS_DISABLED
void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box); void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
//// reject.h ////////////////////////////////////////////////////////// //// reject.h //////////////////////////////////////////////////////////
@ -537,10 +552,7 @@ class Tesseract : public Wordrec {
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block); void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list); inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block); void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
void fix_fuzzy_space_list( //space explorer void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
WERD_RES_LIST &best_perm,
ROW *row,
BLOCK* block);
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block); void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
void fix_fuzzy_spaces( //find fuzzy words void fix_fuzzy_spaces( //find fuzzy words
ETEXT_DESC *monitor, //progress monitor ETEXT_DESC *monitor, //progress monitor
@ -583,9 +595,7 @@ class Tesseract : public Wordrec {
PAGE_RES* page_res, // blocks to check PAGE_RES* page_res, // blocks to check
//function to call //function to call
TBOX & selection_box, TBOX & selection_box,
BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block, BOOL8 (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
ROW* row,
WERD_RES* word_res));
//// tessbox.cpp /////////////////////////////////////////////////////// //// tessbox.cpp ///////////////////////////////////////////////////////
void tess_add_doc_word( //test acceptability void tess_add_doc_word( //test acceptability
WERD_CHOICE *word_choice //after context WERD_CHOICE *word_choice //after context
@ -752,7 +762,6 @@ class Tesseract : public Wordrec {
"Each bounding box is assumed to contain ngrams. Only" "Each bounding box is assumed to contain ngrams. Only"
" learn the ngrams whose outlines overlap horizontally."); " learn the ngrams whose outlines overlap horizontally.");
BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words"); BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs");
BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices"); BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats"); BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
@ -908,13 +917,6 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
INT_VAR_H(tessedit_pdf_compression, 0, "Type of image encoding in pdf output:"
"0 - autoselection (default); "
"1 - jpeg; "
"2 - G4; "
"3 - flate");
INT_VAR_H(tessedit_pdf_jpg_quality, 85, "Quality level of jpeg image "
"compression in pdf output");
STRING_VAR_H(unrecognised_char, "|", STRING_VAR_H(unrecognised_char, "|",
"Output char for unidentified blobs"); "Output char for unidentified blobs");
INT_VAR_H(suspect_level, 99, "Suspect marker level"); INT_VAR_H(suspect_level, 99, "Suspect marker level");
@ -1046,10 +1048,8 @@ class Tesseract : public Wordrec {
PAGE_RES *page_res, PAGE_RES *page_res,
volatile ETEXT_DESC *monitor, volatile ETEXT_DESC *monitor,
FILE *output_file); FILE *output_file);
void ambigs_classify_and_output(WERD_RES *werd_res, void ambigs_classify_and_output(const char *label,
ROW_RES *row_res, PAGE_RES_IT* pr_it,
BLOCK_RES *block_res,
const char *label,
FILE *output_file); FILE *output_file);
inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; } inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }

View File

@ -171,7 +171,7 @@ void ImageThresholder::SetImage(const Pix* pix) {
// Threshold the source image as efficiently as possible to the output Pix. // Threshold the source image as efficiently as possible to the output Pix.
// Creates a Pix and sets pix to point to the resulting pointer. // Creates a Pix and sets pix to point to the resulting pointer.
// Caller must use pixDestroy to free the created Pix. // Caller must use pixDestroy to free the created Pix.
void ImageThresholder::ThresholdToPix(Pix** pix) { void ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) {
if (pix_channels_ == 0) { if (pix_channels_ == 0) {
// We have a binary image, so it just has to be cloned. // We have a binary image, so it just has to be cloned.
*pix = GetPixRect(); *pix = GetPixRect();

View File

@ -20,7 +20,8 @@
#ifndef TESSERACT_CCMAIN_THRESHOLDER_H__ #ifndef TESSERACT_CCMAIN_THRESHOLDER_H__
#define TESSERACT_CCMAIN_THRESHOLDER_H__ #define TESSERACT_CCMAIN_THRESHOLDER_H__
#include "platform.h" #include "platform.h"
#include "publictypes.h"
struct Pix; struct Pix;
@ -116,7 +117,7 @@ class TESS_API ImageThresholder {
/// Threshold the source image as efficiently as possible to the output Pix. /// Threshold the source image as efficiently as possible to the output Pix.
/// Creates a Pix and sets pix to point to the resulting pointer. /// Creates a Pix and sets pix to point to the resulting pointer.
/// Caller must use pixDestroy to free the created Pix. /// Caller must use pixDestroy to free the created Pix.
virtual void ThresholdToPix(Pix** pix); virtual void ThresholdToPix(PageSegMode pageseg_mode, Pix** pix);
// Gets a pix that contains an 8 bit threshold value at each pixel. The // Gets a pix that contains an 8 bit threshold value at each pixel. The
// returned pix may be an integer reduction of the binary image such that // returned pix may be an integer reduction of the binary image such that

View File

@ -23,17 +23,15 @@
* make_pseudo_word * make_pseudo_word
* *
* Make all the blobs inside a selection into a single word. * Make all the blobs inside a selection into a single word.
* The word is always a copy and needs to be deleted. * The returned PAGE_RES_IT* it points to the new word. After use, call
* it->DeleteCurrentWord() to delete the fake word, and then
* delete it to get rid of the iterator itself.
**********************************************************************/ **********************************************************************/
WERD *make_pseudo_word(PAGE_RES* page_res, // Blocks to check. PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
const TBOX &selection_box,
BLOCK *&pseudo_block,
ROW *&pseudo_row) { // Row of selection.
PAGE_RES_IT pr_it(page_res); PAGE_RES_IT pr_it(page_res);
C_BLOB_LIST new_blobs; // list of gathered blobs C_BLOB_LIST new_blobs; // list of gathered blobs
C_BLOB_IT new_blob_it = &new_blobs; // iterator C_BLOB_IT new_blob_it = &new_blobs; // iterator
WERD *pseudo_word; // fabricated word
for (WERD_RES* word_res = pr_it.word(); word_res != NULL; for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) { word_res = pr_it.forward()) {
@ -45,15 +43,17 @@ WERD *make_pseudo_word(PAGE_RES* page_res, // Blocks to check.
C_BLOB* blob = blob_it.data(); C_BLOB* blob = blob_it.data();
if (blob->bounding_box().overlap(selection_box)) { if (blob->bounding_box().overlap(selection_box)) {
new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob)); new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
pseudo_row = pr_it.row()->row;
pseudo_block = pr_it.block()->block;
} }
} }
if (!new_blobs.empty()) {
WERD* pseudo_word = new WERD(&new_blobs, 1, NULL);
word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
PAGE_RES_IT* it = new PAGE_RES_IT(page_res);
while (it->word() != word_res && it->word() != NULL) it->forward();
ASSERT_HOST(it->word() == word_res);
return it;
}
} }
} }
if (!new_blobs.empty()) return NULL;
pseudo_word = new WERD(&new_blobs, 1, NULL);
else
pseudo_word = NULL;
return pseudo_word;
} }

View File

@ -22,9 +22,6 @@
#include "pageres.h" #include "pageres.h"
WERD *make_pseudo_word(PAGE_RES* page_res, // blocks to check PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box);
const TBOX &selection_box,
BLOCK *&pseudo_block,
ROW *&pseudo_row);
#endif #endif

View File

@ -157,6 +157,13 @@ void BoxWord::InsertBox(int index, const TBOX& box) {
ComputeBoundingBox(); ComputeBoundingBox();
} }
// Changes the box at the given index to the new box.
// Recomputes the bounding box.
void BoxWord::ChangeBox(int index, const TBOX& box) {
boxes_[index] = box;
ComputeBoundingBox();
}
// Deletes the box with the given index, and shuffles up the rest. // Deletes the box with the given index, and shuffles up the rest.
// Recomputes the bounding box. // Recomputes the bounding box.
void BoxWord::DeleteBox(int index) { void BoxWord::DeleteBox(int index) {

View File

@ -63,6 +63,10 @@ class BoxWord {
// Recomputes the bounding box. // Recomputes the bounding box.
void InsertBox(int index, const TBOX& box); void InsertBox(int index, const TBOX& box);
// Changes the box at the given index to the new box.
// Recomputes the bounding box.
void ChangeBox(int index, const TBOX& box);
// Deletes the box with the given index, and shuffles up the rest. // Deletes the box with the given index, and shuffles up the rest.
// Recomputes the bounding box. // Recomputes the bounding box.
void DeleteBox(int index); void DeleteBox(int index);

View File

@ -34,6 +34,13 @@ static const double kStopperAmbiguityThresholdGain = 8.0;
static const double kStopperAmbiguityThresholdOffset = 1.5; static const double kStopperAmbiguityThresholdOffset = 1.5;
// Max number of broken pieces to associate. // Max number of broken pieces to associate.
const int kWordrecMaxNumJoinChunks = 4; const int kWordrecMaxNumJoinChunks = 4;
// The three ratios below are used by the ROW_RES constructor when
// merge_similar_words is true. In each case "line size" is the row's
// x_height + ascenders - descenders.
//
// Max ratio of word box height to line size to allow it to be processed as
// a line with other words. A word taller than this is not merged and is
// marked as odd_size.
const double kMaxWordSizeRatio = 1.25;
// Max ratio of line box height to line size to allow a new word to be added.
// The "line box" is the union of the boxes of the words merged so far plus
// the candidate next word.
const double kMaxLineSizeRatio = 1.25;
// Max ratio of word gap to line size to allow a new word to be added.
// The gap is measured from the right edge of the merged box so far to the
// left edge of the candidate next word.
const double kMaxWordGapRatio = 2.0;
// Computes and returns a threshold of certainty difference used to determine // Computes and returns a threshold of certainty difference used to determine
// which words to keep, based on the adjustment factors of the two words. // which words to keep, based on the adjustment factors of the two words.
@ -49,6 +56,7 @@ static double StopperAmbigThreshold(double f1, double f2) {
* Constructor for page results * Constructor for page results
*************************************************************************/ *************************************************************************/
PAGE_RES::PAGE_RES( PAGE_RES::PAGE_RES(
bool merge_similar_words,
BLOCK_LIST *the_block_list, BLOCK_LIST *the_block_list,
WERD_CHOICE **prev_word_best_choice_ptr) { WERD_CHOICE **prev_word_best_choice_ptr) {
Init(); Init();
@ -56,7 +64,8 @@ PAGE_RES::PAGE_RES(
BLOCK_RES_IT block_res_it(&block_res_list); BLOCK_RES_IT block_res_it(&block_res_list);
for (block_it.mark_cycle_pt(); for (block_it.mark_cycle_pt();
!block_it.cycled_list(); block_it.forward()) { !block_it.cycled_list(); block_it.forward()) {
block_res_it.add_to_end(new BLOCK_RES(block_it.data())); block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
block_it.data()));
} }
prev_word_best_choice = prev_word_best_choice_ptr; prev_word_best_choice = prev_word_best_choice_ptr;
} }
@ -67,7 +76,7 @@ PAGE_RES::PAGE_RES(
* Constructor for BLOCK results * Constructor for BLOCK results
*************************************************************************/ *************************************************************************/
BLOCK_RES::BLOCK_RES(BLOCK *the_block) { BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
ROW_IT row_it (the_block->row_list ()); ROW_IT row_it (the_block->row_list ());
ROW_RES_IT row_res_it(&row_res_list); ROW_RES_IT row_res_it(&row_res_list);
@ -83,22 +92,20 @@ BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
block = the_block; block = the_block;
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
row_res_it.add_to_end(new ROW_RES(row_it.data())); row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
} }
} }
/************************************************************************* /*************************************************************************
* ROW_RES::ROW_RES * ROW_RES::ROW_RES
* *
* Constructor for ROW results * Constructor for ROW results
*************************************************************************/ *************************************************************************/
ROW_RES::ROW_RES(ROW *the_row) { ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
WERD_IT word_it(the_row->word_list()); WERD_IT word_it(the_row->word_list());
WERD_RES_IT word_res_it(&word_res_list); WERD_RES_IT word_res_it(&word_res_list);
WERD_RES *combo = NULL; // current combination of fuzzies WERD_RES *combo = NULL; // current combination of fuzzies
WERD_RES *word_res; // current word
WERD *copy_word; WERD *copy_word;
char_count = 0; char_count = 0;
@ -106,20 +113,48 @@ ROW_RES::ROW_RES(ROW *the_row) {
whole_word_rej_count = 0; whole_word_rej_count = 0;
row = the_row; row = the_row;
bool add_next_word = false;
TBOX union_box;
float line_height = the_row->x_height() + the_row->ascenders() -
the_row->descenders();
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
word_res = new WERD_RES(word_it.data()); WERD_RES* word_res = new WERD_RES(word_it.data());
word_res->x_height = the_row->x_height(); word_res->x_height = the_row->x_height();
if (add_next_word) {
if (word_res->word->flag(W_FUZZY_NON)) {
ASSERT_HOST(combo != NULL); ASSERT_HOST(combo != NULL);
// We are adding this word to the combination.
word_res->part_of_combo = TRUE; word_res->part_of_combo = TRUE;
combo->copy_on(word_res); combo->copy_on(word_res);
} else if (merge_similar_words) {
union_box = word_res->word->bounding_box();
add_next_word = !word_res->word->flag(W_REP_CHAR) &&
union_box.height() <= line_height * kMaxWordSizeRatio;
word_res->odd_size = !add_next_word;
} }
if (word_it.data_relative(1)->flag(W_FUZZY_NON)) { WERD* next_word = word_it.data_relative(1);
if (merge_similar_words) {
if (add_next_word && !next_word->flag(W_REP_CHAR)) {
// Next word will be added on if all of the following are true:
// Not a rep char.
// Box height small enough.
// Union box height small enough.
// Horizontal gap small enough.
TBOX next_box = next_word->bounding_box();
int prev_right = union_box.right();
union_box += next_box;
if (next_box.height() > line_height * kMaxWordSizeRatio ||
union_box.height() > line_height * kMaxLineSizeRatio ||
next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
add_next_word = false;
}
}
} else {
add_next_word = next_word->flag(W_FUZZY_NON);
}
if (add_next_word) {
if (combo == NULL) { if (combo == NULL) {
copy_word = new WERD; copy_word = new WERD;
//deep copy *copy_word = *(word_it.data()); // deep copy
*copy_word = *(word_it.data());
combo = new WERD_RES(copy_word); combo = new WERD_RES(copy_word);
combo->x_height = the_row->x_height(); combo->x_height = the_row->x_height();
combo->combination = TRUE; combo->combination = TRUE;
@ -208,6 +243,7 @@ void WERD_RES::CopySimpleFields(const WERD_RES& source) {
done = source.done; done = source.done;
unlv_crunch_mode = source.unlv_crunch_mode; unlv_crunch_mode = source.unlv_crunch_mode;
small_caps = source.small_caps; small_caps = source.small_caps;
odd_size = source.odd_size;
italic = source.italic; italic = source.italic;
bold = source.bold; bold = source.bold;
fontinfo = source.fontinfo; fontinfo = source.fontinfo;
@ -318,8 +354,7 @@ void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
TBOX box = b_it.data()->bounding_box(); TBOX box = b_it.data()->bounding_box();
box_word->InsertBox(box_word->length(), box); box_word->InsertBox(box_word->length(), box);
fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f, fake_choices[blob_id++] = new BLOB_CHOICE;
-1, -1, -1, 0, 0, 0, BCC_FAKE);
} }
FakeClassifyWord(blob_count, fake_choices); FakeClassifyWord(blob_count, fake_choices);
delete [] fake_choices; delete [] fake_choices;
@ -446,6 +481,13 @@ void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
} }
} }
// Prints the top choice along with the accepted/done flags.
void WERD_RES::DebugTopChoice(const char* msg) const {
tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
tess_accepted, tess_would_adapt, done);
best_choice->print(msg);
}
// Removes from best_choices all choices which are not within a reasonable // Removes from best_choices all choices which are not within a reasonable
// range of the best choice. // range of the best choice.
// TODO(rays) incorporate the information used here into the params training // TODO(rays) incorporate the information used here into the params training
@ -830,6 +872,7 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
} }
FakeWordFromRatings(); FakeWordFromRatings();
reject_map.initialise(blob_count); reject_map.initialise(blob_count);
done = true;
} }
// Creates a WERD_CHOICE for the word using the top choices from the leading // Creates a WERD_CHOICE for the word using the top choices from the leading
@ -1038,6 +1081,7 @@ void WERD_RES::InitNonPointers() {
done = FALSE; done = FALSE;
unlv_crunch_mode = CR_NONE; unlv_crunch_mode = CR_NONE;
small_caps = false; small_caps = false;
odd_size = false;
italic = FALSE; italic = FALSE;
bold = FALSE; bold = FALSE;
// The fontinfos and tesseract count as non-pointers as they point to // The fontinfos and tesseract count as non-pointers as they point to
@ -1239,6 +1283,159 @@ WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
return new_res; return new_res;
} }
// Helper computes the boundaries between blobs in the word. The blob bounds
// are likely very poor, if they come from LSTM, where it only outputs the
// character at one pixel within it, so we find the midpoints between them.
static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
GenericVector<int>* blob_ends) {
C_BLOB_IT blob_it(word.word->cblob_list());
for (int i = 0; i < word.best_state.size(); ++i) {
int length = word.best_state[i];
// Get the bounding box of the fake blobs
TBOX blob_box = blob_it.data()->bounding_box();
blob_it.forward();
for (int b = 1; b < length; ++b) {
blob_box += blob_it.data()->bounding_box();
blob_it.forward();
}
// This blob_box is crap, so for now we are only looking for the
// boundaries between them.
int blob_end = MAX_INT32;
if (!blob_it.at_first() || next_word_blobs != NULL) {
if (blob_it.at_first())
blob_it.set_to_list(next_word_blobs);
blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
}
blob_ends->push_back(blob_end);
}
}
// Replaces the current WERD/WERD_RES with the given words. The given words
// contain fake blobs that indicate the position of the characters. These are
// replaced with real blobs from the current word as much as possible.
// Summary of what the code below does:
//  - Copies the BOL/EOL flags of the current word onto the first/last of the
//    replacement words.
//  - Moves the blobs from both the cblob_list and the rej_cblob_list of the
//    current word into the replacement words, choosing the destination by the
//    x-middle of each blob relative to the boundaries from ComputeBlobEnds.
//  - Builds a new BoxWord for each replacement word.
//  - Inserts each replacement WERD_RES before the current one in the row's
//    word_res_list, and (if the current word is not a combination) inserts
//    its WERD before the current WERD in the ROW's word list.
//  - Empties *words (ownership of the elements has been taken), deletes the
//    current WERD_RES (and its WERD if not a combination) and calls
//    ResetWordIterator().
// NOTE(review): (*words)[0] and words->back() are used unconditionally, so
// this appears to require a non-empty words vector - confirm with callers.
void PAGE_RES_IT::ReplaceCurrentWord(
tesseract::PointerVector<WERD_RES>* words) {
WERD_RES* input_word = word();
// Set the BOL/EOL flags on the words from the input word.
if (input_word->word->flag(W_BOL)) {
(*words)[0]->word->set_flag(W_BOL, true);
} else {
// Not at the beginning of a line: set the blank count of the first
// replacement word to 1.
(*words)[0]->word->set_blanks(1);
}
words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
// Move the blobs from the input word to the new set of words.
// If the input word_res is a combination, then the replacements will also be
// combinations, and will own their own words. If the input word_res is not a
// combination, then the final replacements will not be either, (although it
// is allowed for the input words to be combinations) and their words
// will get put on the row list. This maintains the ownership rules.
WERD_IT w_it(row()->row->word_list());
if (!input_word->combination) {
// Linear search of the ROW's word list for the WERD of the input word.
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
if (word == input_word->word)
break;
}
// w_it is now set to the input_word's word.
ASSERT_HOST(!w_it.cycled_list());
}
// Insert into the appropriate place in the ROW_RES.
// Linear search of the row's word_res_list for the input WERD_RES.
WERD_RES_IT wr_it(&row()->word_res_list);
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
WERD_RES* word = wr_it.data();
if (word == input_word)
break;
}
ASSERT_HOST(!wr_it.cycled_list());
// Since we only have an estimate of the bounds between blobs, use the blob
// x-middle as the determiner of where to put the blobs
// Both source lists are sorted by x-middle so that they can be consumed in
// a single left-to-right pass across all of the replacement words.
C_BLOB_IT src_b_it(input_word->word->cblob_list());
src_b_it.sort(&C_BLOB::SortByXMiddle);
C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
rej_b_it.sort(&C_BLOB::SortByXMiddle);
for (int w = 0; w < words->size(); ++w) {
WERD_RES* word_w = (*words)[w];
// Compute blob boundaries.
// The blobs of the following replacement word (if any) are passed in so
// that the end of the last character of word_w can be computed.
GenericVector<int> blob_ends;
C_BLOB_LIST* next_word_blobs =
w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : NULL;
ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
// Delete the fake blobs on the current word.
word_w->word->cblob_list()->clear();
C_BLOB_IT dest_it(word_w->word->cblob_list());
// Build the box word as we move the blobs.
tesseract::BoxWord* box_word = new tesseract::BoxWord;
for (int i = 0; i < blob_ends.size(); ++i) {
int end_x = blob_ends[i];
// Union of the boxes of the real blobs assigned to character i.
TBOX blob_box;
// Add the blobs up to end_x.
while (!src_b_it.empty() &&
src_b_it.data()->bounding_box().x_middle() < end_x) {
blob_box += src_b_it.data()->bounding_box();
dest_it.add_after_then_move(src_b_it.extract());
src_b_it.forward();
}
// Blobs from the rejected list are moved into the same destination list
// (the cblob_list of word_w) as the regular blobs.
while (!rej_b_it.empty() &&
rej_b_it.data()->bounding_box().x_middle() < end_x) {
blob_box += rej_b_it.data()->bounding_box();
dest_it.add_after_then_move(rej_b_it.extract());
rej_b_it.forward();
}
// Clip to the previously computed bounds. Although imperfectly accurate,
// it is good enough, and much more complicated to determine where else
// to clip.
if (i > 0 && blob_box.left() < blob_ends[i - 1])
blob_box.set_left(blob_ends[i - 1]);
if (blob_box.right() > end_x)
blob_box.set_right(end_x);
box_word->InsertBox(i, blob_box);
}
// Fix empty boxes. If a very joined blob sits over multiple characters,
// then we will have some empty boxes from using the middle, so look for
// overlaps.
for (int i = 0; i < box_word->length(); ++i) {
TBOX box = box_word->BlobBox(i);
if (box.null_box()) {
// Nothing has its middle in the bounds of this blob, so use anything
// that overlaps.
// The first blob of word_w that overlaps the x-range of character i is
// used, clipped to that range.
for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
dest_it.forward()) {
TBOX blob_box = dest_it.data()->bounding_box();
if (blob_box.left() < blob_ends[i] &&
(i == 0 || blob_box.right() >= blob_ends[i - 1])) {
if (i > 0 && blob_box.left() < blob_ends[i - 1])
blob_box.set_left(blob_ends[i - 1]);
if (blob_box.right() > blob_ends[i])
blob_box.set_right(blob_ends[i]);
box_word->ChangeBox(i, blob_box);
break;
}
}
}
}
// Swap in the newly built box word in place of the old one.
delete word_w->box_word;
word_w->box_word = box_word;
if (!input_word->combination) {
// Insert word_w->word into the ROW. It doesn't own its word, so the
// ROW needs to own it.
w_it.add_before_stay_put(word_w->word);
word_w->combination = false;
}
(*words)[w] = NULL; // We are taking ownership.
// Inserting before the current element while staying put keeps wr_it on
// the input word and keeps the replacement words in their given order.
wr_it.add_before_stay_put(word_w);
}
// We have taken ownership of the words.
words->clear();
// Delete the current word, which has been replaced. We could just call
// DeleteCurrentWord, but that would iterate both lists again, and we know
// we are already in the right place.
if (!input_word->combination)
delete w_it.extract();
delete wr_it.extract();
ResetWordIterator();
}
// Deletes the current WERD_RES and its underlying WERD. // Deletes the current WERD_RES and its underlying WERD.
void PAGE_RES_IT::DeleteCurrentWord() { void PAGE_RES_IT::DeleteCurrentWord() {
// Check that this word is as we expect. part_of_combos are NEVER iterated // Check that this word is as we expect. part_of_combos are NEVER iterated
@ -1298,18 +1495,30 @@ WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
// Resets the word_res_it so that it is one past the next_word_res, as // Resets the word_res_it so that it is one past the next_word_res, as
// it should be after internal_forward. If next_row_res != row_res, // it should be after internal_forward. If next_row_res != row_res,
// then the next_word_res is in the next row, so there is no need to do // then the next_word_res is in the next row, so there is no need to do
// anything, since operations on the current word will not have disturbed // anything to word_res_it, but it is still a good idea to reset the pointers
// the word_res_it. // word_res and prev_word_res, which are still in the current row.
void PAGE_RES_IT::ResetWordIterator() { void PAGE_RES_IT::ResetWordIterator() {
if (row_res == next_row_res) { if (row_res == next_row_res) {
// Reset the member iterator so it can move forward and detect the // Reset the member iterator so it can move forward and detect the
// cycled_list state correctly. // cycled_list state correctly.
word_res_it.move_to_first(); word_res_it.move_to_first();
word_res_it.mark_cycle_pt(); word_res_it.mark_cycle_pt();
while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) {
if (prev_row_res == row_res)
prev_word_res = word_res;
word_res = word_res_it.data();
word_res_it.forward(); word_res_it.forward();
}
ASSERT_HOST(!word_res_it.cycled_list()); ASSERT_HOST(!word_res_it.cycled_list());
word_res_it.forward(); word_res_it.forward();
} else {
// word_res_it is OK, but reset word_res and prev_word_res if needed.
WERD_RES_IT wr_it(&row_res->word_res_list);
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
if (prev_row_res == row_res)
prev_word_res = word_res;
word_res = wr_it.data();
}
} }
} }

View File

@ -82,7 +82,8 @@ class PAGE_RES { // page result
PAGE_RES() { Init(); } // empty constructor PAGE_RES() { Init(); } // empty constructor
PAGE_RES(BLOCK_LIST *block_list, // real blocks PAGE_RES(bool merge_similar_words,
BLOCK_LIST *block_list, // real blocks
WERD_CHOICE **prev_word_best_choice_ptr); WERD_CHOICE **prev_word_best_choice_ptr);
~PAGE_RES () { // destructor ~PAGE_RES () { // destructor
@ -111,7 +112,7 @@ class BLOCK_RES:public ELIST_LINK {
BLOCK_RES() { BLOCK_RES() {
} // empty constructor } // empty constructor
BLOCK_RES(BLOCK *the_block); // real block BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
~BLOCK_RES () { // destructor ~BLOCK_RES () { // destructor
} }
@ -132,7 +133,7 @@ class ROW_RES:public ELIST_LINK {
ROW_RES() { ROW_RES() {
} // empty constructor } // empty constructor
ROW_RES(ROW *the_row); // real row ROW_RES(bool merge_similar_words, ROW *the_row); // real row
~ROW_RES() { // destructor ~ROW_RES() { // destructor
} }
@ -279,7 +280,8 @@ class WERD_RES : public ELIST_LINK {
BOOL8 tess_accepted; // Tess thinks its ok? BOOL8 tess_accepted; // Tess thinks its ok?
BOOL8 tess_would_adapt; // Tess would adapt? BOOL8 tess_would_adapt; // Tess would adapt?
BOOL8 done; // ready for output? BOOL8 done; // ready for output?
bool small_caps; // word appears to be small caps bool small_caps; // word appears to be small caps
bool odd_size; // word is bigger than line or leader dots.
inT8 italic; inT8 italic;
inT8 bold; inT8 bold;
// The fontinfos are pointers to data owned by the classifier. // The fontinfos are pointers to data owned by the classifier.
@ -486,6 +488,9 @@ class WERD_RES : public ELIST_LINK {
// the word_to_debug. // the word_to_debug.
void DebugWordChoices(bool debug, const char* word_to_debug); void DebugWordChoices(bool debug, const char* word_to_debug);
// Prints the top choice along with the accepted/done flags.
void DebugTopChoice(const char* msg) const;
// Removes from best_choices all choices which are not within a reasonable // Removes from best_choices all choices which are not within a reasonable
// range of the best choice. // range of the best choice.
void FilterWordChoices(int debug_level); void FilterWordChoices(int debug_level);
@ -694,6 +699,11 @@ class PAGE_RES_IT {
// the resulting WERD_RES is returned for further setup with best_choice etc. // the resulting WERD_RES is returned for further setup with best_choice etc.
WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word); WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
// Replaces the current WERD/WERD_RES with the given words. The given words
// contain fake blobs that indicate the position of the characters. These are
// replaced with real blobs from the current word as much as possible.
void ReplaceCurrentWord(tesseract::PointerVector<WERD_RES>* words);
// Deletes the current WERD_RES and its underlying WERD. // Deletes the current WERD_RES and its underlying WERD.
void DeleteCurrentWord(); void DeleteCurrentWord();

View File

@ -164,28 +164,37 @@ enum PageSegMode {
PSM_SINGLE_CHAR, ///< Treat the image as a single character. PSM_SINGLE_CHAR, ///< Treat the image as a single character.
PSM_SPARSE_TEXT, ///< Find as much text as possible in no particular order. PSM_SPARSE_TEXT, ///< Find as much text as possible in no particular order.
PSM_SPARSE_TEXT_OSD, ///< Sparse text with orientation and script det. PSM_SPARSE_TEXT_OSD, ///< Sparse text with orientation and script det.
PSM_RAW_LINE, ///< Treat the image as a single text line, bypassing
///< hacks that are Tesseract-specific.
PSM_COUNT ///< Number of enum entries. PSM_COUNT ///< Number of enum entries.
}; };
/** /**
* Macros that act on a PageSegMode to determine whether components of * Inline functions that act on a PageSegMode to determine whether components of
* layout analysis are enabled. * layout analysis are enabled.
* *Depend critically on the order of elements of PageSegMode.* * *Depend critically on the order of elements of PageSegMode.*
* NOTE that arg is an int for compatibility with INT_PARAM.
*/ */
#define PSM_OSD_ENABLED(pageseg_mode) ((pageseg_mode) <= PSM_AUTO_OSD || \ inline bool PSM_OSD_ENABLED(int pageseg_mode) {
(pageseg_mode) == PSM_SPARSE_TEXT_OSD) return pageseg_mode <= PSM_AUTO_OSD || pageseg_mode == PSM_SPARSE_TEXT_OSD;
#define PSM_COL_FIND_ENABLED(pageseg_mode) \ }
((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_AUTO) inline bool PSM_COL_FIND_ENABLED(int pageseg_mode) {
#define PSM_SPARSE(pageseg_mode) \ return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_AUTO;
((pageseg_mode) == PSM_SPARSE_TEXT || (pageseg_mode) == PSM_SPARSE_TEXT_OSD) }
#define PSM_BLOCK_FIND_ENABLED(pageseg_mode) \ inline bool PSM_SPARSE(int pageseg_mode) {
((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_COLUMN) return pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
#define PSM_LINE_FIND_ENABLED(pageseg_mode) \ }
((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_BLOCK) inline bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode) {
#define PSM_WORD_FIND_ENABLED(pageseg_mode) \ return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_COLUMN;
(((pageseg_mode) >= PSM_AUTO_OSD && (pageseg_mode) <= PSM_SINGLE_LINE) || \ }
(pageseg_mode) == PSM_SPARSE_TEXT || (pageseg_mode) == PSM_SPARSE_TEXT_OSD) inline bool PSM_LINE_FIND_ENABLED(int pageseg_mode) {
return pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_BLOCK;
}
inline bool PSM_WORD_FIND_ENABLED(int pageseg_mode) {
return (pageseg_mode >= PSM_AUTO_OSD && pageseg_mode <= PSM_SINGLE_LINE) ||
pageseg_mode == PSM_SPARSE_TEXT || pageseg_mode == PSM_SPARSE_TEXT_OSD;
}
/** /**
* enum of the elements of the page hierarchy, used in ResultIterator * enum of the elements of the page hierarchy, used in ResultIterator

View File

@ -48,11 +48,11 @@ class BLOB_CHOICE: public ELIST_LINK
{ {
public: public:
BLOB_CHOICE() { BLOB_CHOICE() {
unichar_id_ = INVALID_UNICHAR_ID; unichar_id_ = UNICHAR_SPACE;
fontinfo_id_ = -1; fontinfo_id_ = -1;
fontinfo_id2_ = -1; fontinfo_id2_ = -1;
rating_ = MAX_FLOAT32; rating_ = 10.0;
certainty_ = -MAX_FLOAT32; certainty_ = -1.0;
script_id_ = -1; script_id_ = -1;
xgap_before_ = 0; xgap_before_ = 0;
xgap_after_ = 0; xgap_after_ = 0;

View File

@ -78,6 +78,12 @@ class DLLSYM TBOX { // bounding box
void set_right(int x) { void set_right(int x) {
top_right.set_x(x); top_right.set_x(x);
} }
// Returns the midpoint of bot_left.x() and top_right.x(), computed with
// integer division.
int x_middle() const {
  const int x_sum = bot_left.x() + top_right.x();
  return x_sum / 2;
}
// Returns the midpoint of bot_left.y() and top_right.y(), computed with
// integer division.
int y_middle() const {
  const int y_sum = bot_left.y() + top_right.y();
  return y_sum / 2;
}
const ICOORD &botleft() const { // access function const ICOORD &botleft() const { // access function
return bot_left; return bot_left;

View File

@ -247,10 +247,11 @@ C_BLOB* C_BLOB::FakeBlob(const TBOX& box) {
* Return the bounding box of the blob. * Return the bounding box of the blob.
**********************************************************************/ **********************************************************************/
TBOX C_BLOB::bounding_box() { //bounding box TBOX C_BLOB::bounding_box() const { // bounding box
C_OUTLINE *outline; //current outline C_OUTLINE *outline; // current outline
C_OUTLINE_IT it = &outlines; //outlines of blob // This is a read-only iteration of the outlines.
TBOX box; //bounding box C_OUTLINE_IT it = const_cast<C_OUTLINE_LIST*>(&outlines);
TBOX box; // bounding box
for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
outline = it.data (); outline = it.data ();

View File

@ -65,7 +65,7 @@ class C_BLOB:public ELIST_LINK
return &outlines; return &outlines;
} }
TBOX bounding_box(); //compute bounding box TBOX bounding_box() const; // compute bounding box
inT32 area(); //compute area inT32 area(); //compute area
inT32 perimeter(); // Total perimeter of outlines and 1st level children. inT32 perimeter(); // Total perimeter of outlines and 1st level children.
inT32 outer_area(); //compute area inT32 outer_area(); //compute area
@ -116,6 +116,14 @@ class C_BLOB:public ELIST_LINK
return blob; return blob;
} }
static int SortByXMiddle(const void *v1, const void *v2) {
const C_BLOB* blob1 = *reinterpret_cast<const C_BLOB* const *>(v1);
const C_BLOB* blob2 = *reinterpret_cast<const C_BLOB* const *>(v2);
return blob1->bounding_box().x_middle() -
blob2->bounding_box().x_middle();
}
private: private:
C_OUTLINE_LIST outlines; //master elements C_OUTLINE_LIST outlines; //master elements
}; };

View File

@ -17,15 +17,17 @@
// //
/////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////
#include "unicharset.h"
#include <assert.h> #include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include "params.h"
#include "serialis.h"
#include "tesscallback.h" #include "tesscallback.h"
#include "tprintf.h" #include "tprintf.h"
#include "unichar.h" #include "unichar.h"
#include "unicharset.h"
#include "params.h"
// Special character used in representing character fragments. // Special character used in representing character fragments.
static const char kSeparator = '|'; static const char kSeparator = '|';
@ -448,11 +450,19 @@ void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
} }
} }
// Makes this a copy of src. Clears this completely first, so the automattic // Makes this a copy of src. Clears this completely first, so the automatic
// ids will not be present in this if not in src. // ids will not be present in this if not in src. Does NOT reorder the set!
void UNICHARSET::CopyFrom(const UNICHARSET& src) { void UNICHARSET::CopyFrom(const UNICHARSET& src) {
clear(); clear();
AppendOtherUnicharset(src); for (int ch = 0; ch < src.size_used; ++ch) {
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
const char* utf8 = src.id_to_unichar(ch);
unichar_insert(utf8);
unichars[ch].properties.ExpandRangesFrom(src_props);
}
// Set properties, including mirror and other_case, WITHOUT reordering
// the unicharset.
PartialSetPropertiesFromOther(0, src);
} }
// For each id in src, if it does not occur in this, add it, as in // For each id in src, if it does not occur in this, add it, as in
@ -689,8 +699,11 @@ bool UNICHARSET::eq(UNICHAR_ID unichar_id,
return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0; return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
} }
bool UNICHARSET::save_to_file(FILE *file) const { bool UNICHARSET::save_to_string(STRING *str) const {
fprintf(file, "%d\n", this->size()); const int kFileBufSize = 1024;
char buffer[kFileBufSize + 1];
snprintf(buffer, kFileBufSize, "%d\n", this->size());
*str = buffer;
for (UNICHAR_ID id = 0; id < this->size(); ++id) { for (UNICHAR_ID id = 0; id < this->size(); ++id) {
int min_bottom, max_bottom, min_top, max_top; int min_bottom, max_bottom, min_top, max_top;
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
@ -702,11 +715,11 @@ bool UNICHARSET::save_to_file(FILE *file) const {
get_advance_range(id, &min_advance, &max_advance); get_advance_range(id, &min_advance, &max_advance);
unsigned int properties = this->get_properties(id); unsigned int properties = this->get_properties(id);
if (strcmp(this->id_to_unichar(id), " ") == 0) { if (strcmp(this->id_to_unichar(id), " ") == 0) {
fprintf(file, "%s %x %s %d\n", "NULL", properties, snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
this->get_script_from_script_id(this->get_script(id)), this->get_script_from_script_id(this->get_script(id)),
this->get_other_case(id)); this->get_other_case(id));
} else { } else {
fprintf(file, snprintf(buffer, kFileBufSize,
"%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n", "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
this->id_to_unichar(id), properties, this->id_to_unichar(id), properties,
min_bottom, max_bottom, min_top, max_top, min_width, max_width, min_bottom, max_bottom, min_top, max_top, min_width, max_width,
@ -716,10 +729,12 @@ bool UNICHARSET::save_to_file(FILE *file) const {
this->get_mirror(id), this->get_normed_unichar(id), this->get_mirror(id), this->get_normed_unichar(id),
this->debug_str(id).string()); this->debug_str(id).string());
} }
*str += buffer;
} }
return true; return true;
} }
// TODO(rays) Replace with TFile everywhere.
class InMemoryFilePointer { class InMemoryFilePointer {
public: public:
InMemoryFilePointer(const char *memory, int mem_size) InMemoryFilePointer(const char *memory, int mem_size)
@ -776,6 +791,14 @@ bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
return success; return success;
} }
// Loads the unicharset from a TFile: wraps TFile::FGets in a callback and
// hands it to load_via_fgets, whose result is returned.
bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
  // Created with NewPermanentTessCallback, so it is deleted here after use.
  TessResultCallback2<char *, char *, int> *line_reader =
      NewPermanentTessCallback(file, &tesseract::TFile::FGets);
  const bool loaded = load_via_fgets(line_reader, skip_fragments);
  delete line_reader;
  return loaded;
}
bool UNICHARSET::load_via_fgets( bool UNICHARSET::load_via_fgets(
TessResultCallback2<char *, char *, int> *fgets_cb, TessResultCallback2<char *, char *, int> *fgets_cb,
bool skip_fragments) { bool skip_fragments) {

View File

@ -23,6 +23,7 @@
#include "errcode.h" #include "errcode.h"
#include "genericvector.h" #include "genericvector.h"
#include "helpers.h" #include "helpers.h"
#include "serialis.h"
#include "strngs.h" #include "strngs.h"
#include "tesscallback.h" #include "tesscallback.h"
#include "unichar.h" #include "unichar.h"
@ -317,7 +318,22 @@ class UNICHARSET {
// Saves the content of the UNICHARSET to the given file. // Saves the content of the UNICHARSET to the given file.
// Returns true if the operation is successful. // Returns true if the operation is successful.
bool save_to_file(FILE *file) const; bool save_to_file(FILE *file) const {
STRING str;
if (!save_to_string(&str)) return false;
if (fwrite(&str[0], str.length(), 1, file) != 1) return false;
return true;
}
// Serializes the unicharset with save_to_string and writes the result to the
// TFile as a single FWrite item. Returns false if either step fails.
bool save_to_file(tesseract::TFile *file) const {
  STRING serialized;
  const bool saved =
      save_to_string(&serialized) &&
      file->FWrite(&serialized[0], serialized.length(), 1) == 1;
  return saved;
}
// Saves the content of the UNICHARSET to the given STRING.
// Returns true if the operation is successful.
bool save_to_string(STRING *str) const;
// Load a unicharset from a unicharset file that has been loaded into // Load a unicharset from a unicharset file that has been loaded into
// the given memory buffer. // the given memory buffer.
@ -348,6 +364,8 @@ class UNICHARSET {
// Returns true if the operation is successful. // Returns true if the operation is successful.
bool load_from_file(FILE *file, bool skip_fragments); bool load_from_file(FILE *file, bool skip_fragments);
bool load_from_file(FILE *file) { return load_from_file(file, false); } bool load_from_file(FILE *file) { return load_from_file(file, false); }
bool load_from_file(tesseract::TFile *file, bool skip_fragments);
// Sets up internal data after loading the file, based on the char // Sets up internal data after loading the file, based on the char
// properties. Called from load_from_file, but also needs to be run // properties. Called from load_from_file, but also needs to be run

View File

@ -161,7 +161,8 @@ float MakeRowFromSubBlobs(TO_BLOCK* block, C_BLOB* blob, TO_ROW_IT* row_it) {
* only a single blob, it makes 2 rows, in case the top-level blob * only a single blob, it makes 2 rows, in case the top-level blob
* is a container of the real blobs to recognize. * is a container of the real blobs to recognize.
*/ */
float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) { float make_single_row(ICOORD page_tr, bool allow_sub_blobs,
TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
BLOBNBOX_IT blob_it = &block->blobs; BLOBNBOX_IT blob_it = &block->blobs;
TO_ROW_IT row_it = block->get_rows(); TO_ROW_IT row_it = block->get_rows();
@ -169,11 +170,17 @@ float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
blob_it.add_list_after(&block->small_blobs); blob_it.add_list_after(&block->small_blobs);
blob_it.add_list_after(&block->noise_blobs); blob_it.add_list_after(&block->noise_blobs);
blob_it.add_list_after(&block->large_blobs); blob_it.add_list_after(&block->large_blobs);
if (block->blobs.singleton()) { if (block->blobs.singleton() && allow_sub_blobs) {
blob_it.move_to_first(); blob_it.move_to_first();
float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it); float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
if (size > block->line_size) if (size > block->line_size)
block->line_size = size; block->line_size = size;
} else if (block->blobs.empty()) {
// Make a fake blob.
C_BLOB* blob = C_BLOB::FakeBlob(block->block->bounding_box());
// The blobnbox owns the blob.
BLOBNBOX* bblob = new BLOBNBOX(blob);
blob_it.add_after_then_move(bblob);
} }
MakeRowFromBlobs(block->line_size, &blob_it, &row_it); MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
// Fit an LMS line to the rows. // Fit an LMS line to the rows.

View File

@ -133,7 +133,7 @@ inline bool within_error_margin(float test, float num, float margin) {
void fill_heights(TO_ROW *row, float gradient, int min_height, void fill_heights(TO_ROW *row, float gradient, int min_height,
int max_height, STATS *heights, STATS *floating_heights); int max_height, STATS *heights, STATS *floating_heights);
float make_single_row(ICOORD page_tr, TO_BLOCK* block, float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK* block,
TO_BLOCK_LIST* blocks); TO_BLOCK_LIST* blocks);
float make_rows(ICOORD page_tr, // top right float make_rows(ICOORD page_tr, // top right
TO_BLOCK_LIST *port_blocks); TO_BLOCK_LIST *port_blocks);

View File

@ -317,8 +317,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
if (PSM_LINE_FIND_ENABLED(pageseg_mode)) { if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
gradient = make_rows(page_tr_, to_blocks); gradient = make_rows(page_tr_, to_blocks);
} else if (!PSM_SPARSE(pageseg_mode)) { } else if (!PSM_SPARSE(pageseg_mode)) {
// SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row. // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
gradient = make_single_row(page_tr_, to_block, to_blocks); gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE,
to_block, to_blocks);
} }
BaselineDetect baseline_detector(textord_baseline_debug, BaselineDetect baseline_detector(textord_baseline_debug,
reskew, to_blocks); reskew, to_blocks);
@ -339,7 +340,8 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
make_single_word(pageseg_mode == PSM_SINGLE_CHAR, make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
to_block->get_rows(), to_block->block->row_list()); to_block->get_rows(), to_block->block->row_list());
} }
cleanup_blocks(blocks); // Remove empties. cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
// Remove empties.
// Compute the margins for each row in the block, to be used later for // Compute the margins for each row in the block, to be used later for
// paragraph detection. // paragraph detection.

View File

@ -206,7 +206,7 @@ class Textord {
// Must have at least one WERD. // Must have at least one WERD.
// WERDs contain a fake blob. // WERDs contain a fake blob.
void cleanup_nontext_block(BLOCK* block); void cleanup_nontext_block(BLOCK* block);
void cleanup_blocks(BLOCK_LIST *blocks); void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);
BOOL8 clean_noise_from_row(ROW *row); BOOL8 clean_noise_from_row(ROW *row);
void clean_noise_from_words(ROW *row); void clean_noise_from_words(ROW *row);
// Remove outlines that are a tiny fraction in either width or height // Remove outlines that are a tiny fraction in either width or height

View File

@ -360,9 +360,11 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
// Non-text blocks must contain at least one row. // Non-text blocks must contain at least one row.
ROW_IT row_it(block->row_list()); ROW_IT row_it(block->row_list());
if (row_it.empty()) { if (row_it.empty()) {
float height = block->bounding_box().height(); TBOX box = block->bounding_box();
inT32 zero = 0; float height = box.height();
ROW* row = new ROW(0, &zero, NULL, height / 2.0f, height / 4.0f, inT32 xstarts[2] = {box.left(), box.right()};
double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
height / 4.0f, 0, 1); height / 4.0f, 0, 1);
row_it.add_after_then_move(row); row_it.add_after_then_move(row);
} }
@ -398,9 +400,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
* Delete empty blocks, rows from the page. * Delete empty blocks, rows from the page.
**********************************************************************/ **********************************************************************/
void Textord::cleanup_blocks( //remove empties void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
BLOCK_LIST *blocks //list
) {
BLOCK_IT block_it = blocks; //iterator BLOCK_IT block_it = blocks; //iterator
ROW_IT row_it; //row iterator ROW_IT row_it; //row iterator
@ -417,22 +417,24 @@ void Textord::cleanup_blocks( //remove empties
} }
num_rows = 0; num_rows = 0;
num_rows_all = 0; num_rows_all = 0;
row_it.set_to_list(block->row_list()); if (clean_noise) {
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { row_it.set_to_list(block->row_list());
++num_rows_all; for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
clean_small_noise_from_words(row_it.data()); ++num_rows_all;
if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() && clean_small_noise_from_words(row_it.data());
clean_noise_from_row(row_it.data())) || if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
row_it.data()->word_list()->empty()) { clean_noise_from_row(row_it.data())) ||
delete row_it.extract(); // lose empty row. row_it.data()->word_list()->empty()) {
} else { delete row_it.extract(); // lose empty row.
if (textord_noise_rejwords) } else {
clean_noise_from_words(row_it.data()); if (textord_noise_rejwords)
if (textord_blshift_maxshift >= 0) clean_noise_from_words(row_it.data());
tweak_row_baseline(row_it.data(), if (textord_blshift_maxshift >= 0)
textord_blshift_maxshift, tweak_row_baseline(row_it.data(),
textord_blshift_xfraction); textord_blshift_maxshift,
++num_rows; textord_blshift_xfraction);
++num_rows;
}
} }
} }
if (block->row_list()->empty()) { if (block->row_list()->empty()) {

View File

@ -299,7 +299,7 @@ bool LanguageModel::UpdateState(
//if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue; //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
UNICHAR_ID unichar_id = choice->unichar_id(); UNICHAR_ID unichar_id = choice->unichar_id();
if (unicharset.get_fragment(unichar_id)) { if (unicharset.get_fragment(unichar_id)) {
continue; // skip fragments continue; // Skip fragments.
} }
// Set top choice flags. // Set top choice flags.
LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag; LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
@ -651,6 +651,8 @@ bool LanguageModel::AddViterbiStateEntry(
ngram_info, (language_model_debug_level > 0) ? ngram_info, (language_model_debug_level > 0) ?
dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL); dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
new_vse->cost = ComputeAdjustedPathCost(new_vse); new_vse->cost = ComputeAdjustedPathCost(new_vse);
if (language_model_debug_level >= 3)
tprintf("Adjusted cost = %g\n", new_vse->cost);
// Invoke Top Choice language model component to make the final adjustments // Invoke Top Choice language model component to make the final adjustments
// to new_vse->top_choice_flags. // to new_vse->top_choice_flags.
@ -1311,7 +1313,7 @@ void LanguageModel::UpdateBestChoice(
vse->dawg_info != NULL && vse->top_choice_flags); vse->dawg_info != NULL && vse->top_choice_flags);
} }
} }
if (wordrec_display_segmentations) { if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
word->DisplaySegmentation(word_res->chopped_word); word->DisplaySegmentation(word_res->chopped_word);
} }
} }

View File

@ -37,52 +37,16 @@ void Wordrec::DoSegSearch(WERD_RES* word_res) {
void Wordrec::SegSearch(WERD_RES* word_res, void Wordrec::SegSearch(WERD_RES* word_res,
BestChoiceBundle* best_choice_bundle, BestChoiceBundle* best_choice_bundle,
BlamerBundle* blamer_bundle) { BlamerBundle* blamer_bundle) {
if (segsearch_debug_level > 0) {
tprintf("Starting SegSearch on ratings matrix%s:\n",
wordrec_enable_assoc ? " (with assoc)" : "");
word_res->ratings->print(getDict().getUnicharset());
}
LMPainPoints pain_points(segsearch_max_pain_points, LMPainPoints pain_points(segsearch_max_pain_points,
segsearch_max_char_wh_ratio, segsearch_max_char_wh_ratio,
assume_fixed_pitch_char_segment, assume_fixed_pitch_char_segment,
&getDict(), segsearch_debug_level); &getDict(), segsearch_debug_level);
pain_points.GenerateInitial(word_res);
// Compute scaling factor that will help us recover blob outline length // Compute scaling factor that will help us recover blob outline length
// from classifier rating and certainty for the blob. // from classifier rating and certainty for the blob.
float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale; float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
language_model_->InitForWord(prev_word_best_choice_,
assume_fixed_pitch_char_segment,
segsearch_max_char_wh_ratio, rating_cert_scale);
// Initialize blamer-related information: map character boxes recorded in
// blamer_bundle->norm_truth_word to the corresponding i,j indices in the
// ratings matrix. We expect this step to succeed, since when running the
// chopper we checked that the correct chops are present.
if (blamer_bundle != NULL) {
blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
wordrec_debug_blamer);
}
MATRIX_COORD pain_point;
float pain_point_priority;
// pending[col] tells whether there is update work to do to combine
// best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
// As the language model state is updated, pending entries are modified to
// minimize duplication of work. It is important that during the update the
// children are considered in the non-decreasing order of their column, since
// this guarantees that all the parents would be up to date before an update
// of a child is done.
GenericVector<SegSearchPending> pending; GenericVector<SegSearchPending> pending;
pending.init_to_size(word_res->ratings->dimension(), SegSearchPending()); InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
blamer_bundle);
// Search the ratings matrix for the initial best path.
pending[0].SetColumnClassified();
UpdateSegSearchNodes(rating_cert_scale, 0, &pending, word_res,
&pain_points, best_choice_bundle, blamer_bundle);
if (!SegSearchDone(0)) { // find a better choice if (!SegSearchDone(0)) { // find a better choice
if (chop_enable && word_res->chopped_word != NULL) { if (chop_enable && word_res->chopped_word != NULL) {
@ -98,6 +62,9 @@ void Wordrec::SegSearch(WERD_RES* word_res,
} }
} }
// Keep trying to find a better path by fixing the "pain points". // Keep trying to find a better path by fixing the "pain points".
MATRIX_COORD pain_point;
float pain_point_priority;
int num_futile_classifications = 0; int num_futile_classifications = 0;
STRING blamer_debug; STRING blamer_debug;
while (wordrec_enable_assoc && while (wordrec_enable_assoc &&
@ -159,6 +126,72 @@ void Wordrec::SegSearch(WERD_RES* word_res,
} }
} }
// Runs only the initial segmentation search over the ratings matrix already
// held in word_res. No extra chopping or joining of blobs is attempted.
void Wordrec::WordSearch(WERD_RES* word_res) {
  LMPainPoints pain_points(segsearch_max_pain_points,
                           segsearch_max_char_wh_ratio,
                           assume_fixed_pitch_char_segment,
                           &getDict(), segsearch_debug_level);
  GenericVector<SegSearchPending> pending;
  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
  // The search is run without a blamer bundle.
  InitialSegSearch(word_res, &pain_points, &pending, &best_choice_bundle, NULL);
  if (segsearch_debug_level <= 0) return;
  // Debug output: dump the ratings matrix as it stands after the search.
  const char* assoc_note = wordrec_enable_assoc ? " (with assoc)" : "";
  tprintf("Ending ratings matrix%s:\n", assoc_note);
  word_res->ratings->print(getDict().getUnicharset());
}
// First pass of the segmentation search over an existing ratings matrix,
// with no additional chopping or joining. Factored out so that both
// SegSearch and WordSearch can share it.
// pain_points, pending and best_choice_bundle are filled in for the caller;
// blamer_bundle may be NULL.
void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
                               GenericVector<SegSearchPending>* pending,
                               BestChoiceBundle* best_choice_bundle,
                               BlamerBundle* blamer_bundle) {
  if (segsearch_debug_level > 0) {
    const char* assoc_note = wordrec_enable_assoc ? " (with assoc)" : "";
    tprintf("Starting SegSearch on ratings matrix%s:\n", assoc_note);
    word_res->ratings->print(getDict().getUnicharset());
  }

  pain_points->GenerateInitial(word_res);

  // Scale factor used to recover blob outline length from the classifier's
  // rating and certainty for a blob.
  const float rating_cert_scale =
      -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               segsearch_max_char_wh_ratio, rating_cert_scale);

  // Blamer setup: map the character boxes in blamer_bundle->norm_truth_word
  // onto i,j indices of the ratings matrix. This is expected to succeed,
  // because the chopper already checked that the correct chops are present.
  if (blamer_bundle != NULL) {
    blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
                                            wordrec_debug_blamer);
  }

  // pending[col] records whether best_choice_bundle->beam[col - 1] still has
  // to be combined with BLOB_CHOICEs in matrix[col, *]. Entries are adjusted
  // as the language model state is updated, to avoid repeated work. Children
  // must be visited in non-decreasing column order so that every parent is
  // up to date before any of its children is updated.
  const int num_columns = word_res->ratings->dimension();
  pending->init_to_size(num_columns, SegSearchPending());

  // Seed the search at column 0 and find the initial best path.
  SegSearchPending& first_column = (*pending)[0];
  first_column.SetColumnClassified();
  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
                       pain_points, best_choice_bundle, blamer_bundle);
}
void Wordrec::UpdateSegSearchNodes( void Wordrec::UpdateSegSearchNodes(
float rating_cert_scale, float rating_cert_scale,
int starting_col, int starting_col,

View File

@ -266,11 +266,22 @@ class Wordrec : public Classify {
// to combine blobs. Segmentation search will run only one "iteration" // to combine blobs. Segmentation search will run only one "iteration"
// on the classifications already recorded in chunks_record.ratings. // on the classifications already recorded in chunks_record.ratings.
// //
// Note: this function assumes that word, output_best_state, // Note: this function assumes that word_res, best_choice_bundle arguments
// best_char_choices and fixpt arguments are not NULL. // are not NULL.
void SegSearch(WERD_RES* word_res, void SegSearch(WERD_RES* word_res,
BestChoiceBundle* best_choice_bundle, BestChoiceBundle* best_choice_bundle,
BlamerBundle* blamer_bundle); BlamerBundle* blamer_bundle);
// Setup and run just the initial segsearch on an established matrix,
// without doing any additional chopping or joining.
void WordSearch(WERD_RES* word_res);
// Setup and run just the initial segsearch on an established matrix,
// without doing any additional chopping or joining.
// (Internal factored version that can be used as part of the main SegSearch.)
void InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
GenericVector<SegSearchPending>* pending,
BestChoiceBundle* best_choice_bundle,
BlamerBundle* blamer_bundle);
// Runs SegSearch() function (above) without needing a best_choice_bundle // Runs SegSearch() function (above) without needing a best_choice_bundle
// or blamer_bundle. Used for testing. // or blamer_bundle. Used for testing.