diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 296e2a98..93c02a0f 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -1606,12 +1606,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (italic) hocr_str += ""; if (bold) hocr_str += ""; // If glyph confidence is required it is added here - if (tesseract_->glyph_confidences && confidencemap != nullptr) { + if (tesseract_->glyph_confidences == 1 && confidencemap != nullptr) { for (size_t i = 0; i < confidencemap->size(); i++) { hocr_str += "\n > timestep = (*confidencemap)[i]; for (std::pair conf : timestep) { hocr_str += "glyph_confidences == 2 && confidencemap != nullptr) { + for (size_t i = 0; i < confidencemap->size(); i++) { + std::vector> timestep = (*confidencemap)[i]; + if (timestep.size() > 0) { + hocr_str += "\n params()), - BOOL_MEMBER(glyph_confidences, false, + INT_MEMBER(glyph_confidences, 0, "Allows to include glyph confidences in the hOCR output", this->params()), diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index c3fe7124..45fbc6c9 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -1114,7 +1114,8 @@ class Tesseract : public Wordrec { "Preserve multiple interword spaces"); STRING_VAR_H(page_separator, "\f", "Page separator (default is form feed control character)"); - BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output"); + INT_VAR_H(glyph_confidences, 0, + "Allows to include glyph confidences in the hOCR output"); //// ambigsrecog.cpp ///////////////////////////////////////////////////////// FILE *init_recog_training(const STRING &fname); diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp index 7ef79d24..62ca9900 100644 --- a/src/lstm/lstmrecognizer.cpp +++ b/src/lstm/lstmrecognizer.cpp @@ -172,7 +172,8 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) { void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, bool debug, double worst_dict_cert, const TBOX& line_box, - PointerVector* words, bool glyph_confidences) { + PointerVector* words, + int glyph_confidences) { NetworkIO outputs; float scale_factor; NetworkIO inputs; diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h index 0755db9a..dcfbc2b5 100644 --- a/src/lstm/lstmrecognizer.h +++ b/src/lstm/lstmrecognizer.h @@ -185,7 +185,7 @@ class LSTMRecognizer { void RecognizeLine(const ImageData& image_data, bool invert, bool debug, double worst_dict_cert, const TBOX& line_box, PointerVector* words, - bool glyph_confidences = false); + int glyph_confidences = 0); // Helper computes min and mean best results in the output. void OutputStats(const NetworkIO& outputs, diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 682484f1..ddad441c 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -22,6 +22,8 @@ #include "networkio.h" #include "pageres.h" #include "unicharcompress.h" +#include +#include #include #include @@ -79,7 +81,7 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder, // Decodes the set of network outputs, storing the lattice internally. void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio, double cert_offset, double worst_dict_cert, - const UNICHARSET* charset, bool glyph_confidence) { + const UNICHARSET* charset, int glyph_confidence) { beam_size_ = 0; int width = output.Width(); if (glyph_confidence) @@ -177,7 +179,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, float scale_factor, bool debug, const UNICHARSET* unicharset, PointerVector* words, - bool glyph_confidence) { + int glyph_confidence) { words->truncate(0); GenericVector unichar_ids; GenericVector certs; @@ -185,6 +187,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, GenericVector xcoords; GenericVector best_nodes; GenericVector second_nodes; + std::deque> best_glyphs; ExtractBestPaths(&best_nodes, &second_nodes); if (debug) { DebugPath(unicharset, best_nodes); @@ -194,7 +197,22 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, DebugUnicharPath(unicharset, second_nodes, unichar_ids, certs, ratings, xcoords); } - ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords); + int current_char; + int timestepEnd = 0; + //if glyph confidence is required in granularity level 2 it stores the x + //Coordinates of every chosen character to match the alternative glyphs to it + if (glyph_confidence == 2) { + ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, + &xcoords, &best_glyphs); + if (best_glyphs.size() > 0) { + current_char = best_glyphs.front().first; + timestepEnd = best_glyphs.front().second; + best_glyphs.pop_front(); + } + } else { + ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, + &xcoords); + } int num_ids = unichar_ids.size(); if (debug) { DebugUnicharPath(unicharset, best_nodes, unichar_ids, certs, ratings, @@ -202,7 +220,6 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, } // Convert labels to unichar-ids. int word_end = 0; - int timestepEnd = 0; float prev_space_cert = 0.0f; for (int word_start = 0; word_start < num_ids; word_start = word_end) { for (word_end = word_start + 1; word_end < num_ids; ++word_end) { @@ -226,11 +243,55 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, WERD_RES* word_res = InitializeWord( leading_space, line_box, word_start, word_end, std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor); - if (glyph_confidence) { + if (glyph_confidence == 1) { for (size_t i = timestepEnd; i < xcoords[word_end]; i++) { word_res->timesteps.push_back(timesteps[i]); } timestepEnd = xcoords[word_end]; + } else if (glyph_confidence == 2) { + float sum = 0; + std::vector> glyph_pairs; + for (size_t i = timestepEnd; i < xcoords[word_end]; i++) { + for (std::pair glyph : timesteps[i]) { + if (std::strcmp(glyph.first, "") != 0) { + sum += glyph.second; + glyph_pairs.push_back(glyph); + } + } + if (best_glyphs.size() > 0 && i == best_glyphs.front().second-1 + || i == xcoords[word_end]-1) { + std::map summed_propabilities; + for(auto it = glyph_pairs.begin(); it != glyph_pairs.end(); ++it) { + summed_propabilities[it->first] += it->second; + } + std::vector> accumulated_timestep; + accumulated_timestep.push_back(std::pair + (unicharset->id_to_unichar_ext + (current_char), 2.0)); + int pos; + for (auto it = summed_propabilities.begin(); + it != summed_propabilities.end(); ++it) { + if(sum == 0) break; + it->second/=sum; + pos = 0; + while (accumulated_timestep.size() > pos + && accumulated_timestep[pos].second > it->second) { + pos++; + } + accumulated_timestep.insert(accumulated_timestep.begin() + pos, + std::pair(it->first, + it->second)); + } + if (best_glyphs.size() > 0) { + current_char = best_glyphs.front().first; + best_glyphs.pop_front(); + } + glyph_pairs.clear(); + word_res->timesteps.push_back(accumulated_timestep); + sum = 0; + } + } + timestepEnd = xcoords[word_end]; } for (int i = word_start; i < word_end; ++i) { BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; @@ -304,7 +365,8 @@ void RecodeBeamSearch::DebugBeamPos(const UNICHARSET& unicharset, void RecodeBeamSearch::ExtractPathAsUnicharIds( const GenericVector& best_nodes, GenericVector* unichar_ids, GenericVector* certs, - GenericVector* ratings, GenericVector* xcoords) { + GenericVector* ratings, GenericVector* xcoords, + std::deque>* best_glyphs) { unichar_ids->truncate(0); certs->truncate(0); ratings->truncate(0); @@ -333,6 +395,9 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds( } unichar_ids->push_back(unichar_id); xcoords->push_back(t); + if(best_glyphs != nullptr) { + best_glyphs->push_back(std::pair(unichar_id,t)); + } do { double cert = best_nodes[t++]->certainty; // Special-case NO-PERM space to forget the certainty of the previous diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index c9970daa..3d5adca7 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -28,6 +28,7 @@ #include "networkio.h" #include "ratngs.h" #include "unicharcompress.h" +#include #include #include @@ -185,7 +186,7 @@ class RecodeBeamSearch { // If charset is not null, it enables detailed debugging of the beam search. void Decode(const NetworkIO& output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET* charset, - bool glyph_confidence = false); + int glyph_confidence = 0); void Decode(const GENERIC_2D_ARRAY& output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET* charset); @@ -204,12 +205,16 @@ class RecodeBeamSearch { // Returns the best path as a set of WERD_RES. void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor, bool debug, const UNICHARSET* unicharset, - PointerVector* words, bool glyph_confidence); + PointerVector* words, + int glyph_confidence = 0); // Generates debug output of the content of the beams after a Decode. void DebugBeams(const UNICHARSET& unicharset) const; - + + // Stores the alternative characters of every timestep together with their + // probability. std::vector< std::vector>> timesteps; + // Clipping value for certainty inside Tesseract. Reflects the minimum value // of certainty that will be returned by ExtractBestPathAsUnicharIds. // Supposedly on a uniform scale that can be compared across languages and @@ -276,7 +281,8 @@ class RecodeBeamSearch { static void ExtractPathAsUnicharIds( const GenericVector& best_nodes, GenericVector* unichar_ids, GenericVector* certs, - GenericVector* ratings, GenericVector* xcoords); + GenericVector* ratings, GenericVector* xcoords, + std::deque>* best_glyphs = nullptr); // Sets up a word with the ratings matrix and fake blobs with boxes in the // right places.