From 754e38d2b48184f8dbaeca7acd7119f64acc7bf7 Mon Sep 17 00:00:00 2001 From: Noah Metzger Date: Wed, 13 Feb 2019 11:13:00 +0100 Subject: [PATCH] Added the option to get the timesteps separated by the suggested segmentation Signed-off-by: Noah Metzger --- src/api/hocrrenderer.cpp | 37 +++++++++++++++++++++++++++- src/ccmain/resultiterator.cpp | 12 ++++++++- src/ccmain/resultiterator.h | 5 +++- src/ccmain/tesseractclass.cpp | 4 ++- src/ccmain/tesseractclass.h | 4 ++- src/ccstruct/pageres.h | 2 ++ src/lstm/recodebeam.cpp | 46 ++++++++++++++++++++++++++++------- src/lstm/recodebeam.h | 3 ++- 8 files changed, 98 insertions(+), 15 deletions(-) diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index 5d2de9a3..3a5cb986 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -130,7 +130,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) return nullptr; - int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1; + int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. bool para_is_ltr = true; // Default direction is LTR const char* paragraph_lang = nullptr; @@ -215,8 +215,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { // Now, process the word... std::vector>>* confidencemap = nullptr; + std::vector>>>* + symbolMap = nullptr; if (tesseract_->lstm_choice_mode) { confidencemap = res_it->GetBestLSTMSymbolChoices(); + symbolMap = res_it->GetBestSegmentedLSTMSymbolChoices(); } hocr_str << "\n " + << timesteps[0][0].first; + for (size_t i = 1; i < timesteps.size(); i++) { + hocr_str << "\n "; + std::vector> timestep = + timesteps[i]; + for (std::pair conf : timestep) { + hocr_str << "" + << conf.first << ""; + gcnt++; + } + hocr_str << ""; + tcnt++; + } + hocr_str << ""; + scnt++; + } } hocr_str << ""; tcnt = 1; diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp index 19c4687d..b5814190 100644 --- a/src/ccmain/resultiterator.cpp +++ b/src/ccmain/resultiterator.cpp @@ -605,7 +605,8 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const { return result; } -std::vector>>* ResultIterator::GetBestLSTMSymbolChoices() const { +std::vector>>* + ResultIterator::GetBestLSTMSymbolChoices() const { if (it_->word() != nullptr) { return &it_->word()->timesteps; } else { @@ -613,6 +614,15 @@ std::vector>>* ResultIterator::GetBest } } +std::vector>>>* + ResultIterator::GetBestSegmentedLSTMSymbolChoices() const { + if (it_->word() != nullptr) { + return &it_->word()->symbol_steps; + } else { + return nullptr; + } +} + void ResultIterator::AppendUTF8WordText(STRING *text) const { if (!it_->word()) return; ASSERT_HOST(it_->word()->best_choice != nullptr); diff --git a/src/ccmain/resultiterator.h b/src/ccmain/resultiterator.h index b658e5a0..de45bc12 100644 --- a/src/ccmain/resultiterator.h +++ b/src/ccmain/resultiterator.h @@ -100,7 +100,10 @@ class TESS_API ResultIterator : public LTRResultIterator { /** * Returns the LSTM choices for every LSTM timestep for the current word. */ - virtual std::vector>>* GetBestLSTMSymbolChoices() const; + virtual std::vector>>* + GetBestLSTMSymbolChoices() const; + virtual std::vector>>>* + GetBestSegmentedLSTMSymbolChoices() const; /** * Return whether the current paragraph's dominant reading direction diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp index 128114d3..5ee5ccb2 100644 --- a/src/ccmain/tesseractclass.cpp +++ b/src/ccmain/tesseractclass.cpp @@ -526,7 +526,9 @@ Tesseract::Tesseract() "Allows to include alternative symbols choices in the hOCR output. " "Valid input values are 0, 1 and 2. 0 is the default value. " "With 1 the alternative symbol choices per timestep are included. " - "With 2 the alternative symbol choices are accumulated per character.", + "With 2 the alternative symbol choices are accumulated per character." + "With 3 the alternative symbol choices per timestep are included and " + "separated by the suggested segmentation of Tesseract", this->params()), backup_config_file_(nullptr), diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index a9b1f8a8..f937cc69 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -1127,7 +1127,9 @@ class Tesseract : public Wordrec { "Allows to include alternative symbols choices in the hOCR output. " "Valid input values are 0, 1 and 2. 0 is the default value. " "With 1 the alternative symbol choices per timestep are included. " - "With 2 the alternative symbol choices are accumulated per character."); + "With 2 the alternative symbol choices are accumulated per character." + "With 3 the alternative symbol choices per timestep are included and " + "separated by the suggested segmentation of Tesseract"); //// ambigsrecog.cpp ///////////////////////////////////////////////////////// FILE *init_recog_training(const STRING &fname); diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h index df4e0f77..55997839 100644 --- a/src/ccstruct/pageres.h +++ b/src/ccstruct/pageres.h @@ -222,6 +222,8 @@ class WERD_RES : public ELIST_LINK { GenericVector blob_gaps; // Stores the lstm choices of every timestep std::vector>> timesteps; + std::vector>>> + symbol_steps; // Ratings matrix contains classifier choices for each classified combination // of blobs. The dimension is the same as the number of blobs in chopped_word // and the leading diagonal corresponds to classifier results of the blobs diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 7f5dbfda..06c24027 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -187,7 +188,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, GenericVector xcoords; GenericVector best_nodes; GenericVector second_nodes; - std::deque> best_choices; + std::deque> best_choices; ExtractBestPaths(&best_nodes, &second_nodes); if (debug) { DebugPath(unicharset, best_nodes); @@ -201,13 +202,14 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, int timestepEnd = 0; //if lstm choice mode is required in granularity level 2 it stores the x //Coordinates of every chosen character to match the alternative choices to it - if (lstm_choice_mode == 2) { + if (lstm_choice_mode == 2 || lstm_choice_mode == 3) { ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords, &best_choices); if (best_choices.size() > 0) { - current_char = best_choices.front().first; - timestepEnd = best_choices.front().second; - best_choices.pop_front(); + current_char = std::get<0>(best_choices.front()); + timestepEnd = std::get<1>(best_choices.front()); + if(lstm_choice_mode == 2) + best_choices.pop_front(); } } else { ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, @@ -258,7 +260,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, choice_pairs.push_back(choice); } } - if ((best_choices.size() > 0 && i == best_choices.front().second - 1) + if ((best_choices.size() > 0 && i == std::get<1>(best_choices.front()) - 1) || i == xcoords[word_end]-1) { std::map summed_propabilities; for (auto it = choice_pairs.begin(); it != choice_pairs.end(); ++it) { @@ -283,7 +285,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, it->second)); } if (best_choices.size() > 0) { - current_char = best_choices.front().first; + current_char = std::get<0>(best_choices.front()); best_choices.pop_front(); } choice_pairs.clear(); @@ -292,6 +294,25 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, } } timestepEnd = xcoords[word_end]; + } else if (lstm_choice_mode == 3) { + std::vector>> currentSymbol; + for (size_t i = timestepEnd; i < xcoords[word_end]; i++) { + if (i == std::get<1>(best_choices.front())) { + if (currentSymbol.size() > 0) { + word_res->symbol_steps.push_back(currentSymbol); + currentSymbol.clear(); + } + std::vector> choice_Header; + choice_Header.push_back(std::pair( + unicharset->id_to_unichar_ext(std::get<0>(best_choices.front())), + 2.0)); + currentSymbol.push_back(choice_Header); + if(best_choices.size()>1) best_choices.pop_front(); + } + currentSymbol.push_back(timesteps[i]); + } + word_res->symbol_steps.push_back(currentSymbol); + timestepEnd = xcoords[word_end]; } for (int i = word_start; i < word_end; ++i) { BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; @@ -366,7 +387,7 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds( const GenericVector& best_nodes, GenericVector* unichar_ids, GenericVector* certs, GenericVector* ratings, GenericVector* xcoords, - std::deque>* best_choices) { + std::deque>* best_choices) { unichar_ids->truncate(0); certs->truncate(0); ratings->truncate(0); @@ -375,6 +396,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds( int t = 0; int width = best_nodes.size(); while (t < width) { + int id; + int tposition; double certainty = 0.0; double rating = 0.0; while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) { @@ -396,7 +419,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds( unichar_ids->push_back(unichar_id); xcoords->push_back(t); if (best_choices != nullptr) { - best_choices->push_back(std::pair(unichar_id, t)); + tposition = t; + id = unichar_id; } do { double cert = best_nodes[t++]->certainty; @@ -414,6 +438,10 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds( if (certainty < certs->back()) certs->back() = certainty; ratings->back() += rating; } + if (best_choices != nullptr) { + best_choices->push_back( + std::tuple(id, tposition, rating)); + } } xcoords->push_back(width); } diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index 6bd44653..7aa0d7b3 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -29,6 +29,7 @@ #include "unicharcompress.h" #include #include +#include #include namespace tesseract { @@ -281,7 +282,7 @@ class RecodeBeamSearch { const GenericVector& best_nodes, GenericVector* unichar_ids, GenericVector* certs, GenericVector* ratings, GenericVector* xcoords, - std::deque>* best_choices = nullptr); + std::deque>* best_choices = nullptr); // Sets up a word with the ratings matrix and fake blobs with boxes in the // right places.