diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index c5c80a73..296e2a98 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -49,6 +49,8 @@ #include // for size_t #include // for std::cin #include // for std::unique_ptr +#include // for std::pair +#include // for std::vector #include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box... #include "blobclass.h" // for ExtractFontName #include "boxword.h" // for BoxWord @@ -398,6 +400,7 @@ int TessBaseAPI::Init(const char* data, int data_size, const char* language, return -1; } } + PERF_COUNT_SUB("update tesseract_") // Update datapath and language requested for the last valid initialization. if (datapath_ == nullptr) @@ -1389,6 +1392,17 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, *hocr_str += "'"; } +static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, + int num2, int num3) { + const size_t BUFSIZE = 64; + char id_buffer[BUFSIZE]; + snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d_%d", base.c_str(), num1, num2,num3); + id_buffer[BUFSIZE - 1] = '\0'; + *hocr_str += " id='"; + *hocr_str += id_buffer; + *hocr_str += "'"; +} + static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level, STRING* hocr_str) { int left, top, right, bottom; @@ -1449,7 +1463,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) return nullptr; - int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; + int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. bool para_is_ltr = true; // Default direction is LTR const char* paragraph_lang = nullptr; @@ -1529,7 +1543,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { } // Now, process the word... - hocr_str += ">>* confidencemap = nullptr; + if (tesseract_->glyph_confidences) { + confidencemap = res_it->GetGlyphConfidences(); + } + hocr_str += "\n Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); if (italic) hocr_str += ""; if (bold) hocr_str += ""; - hocr_str += " "; + // If glyph confidence is required it is added here + if (tesseract_->glyph_confidences && confidencemap != nullptr) { + for (size_t i = 0; i < confidencemap->size(); i++) { + hocr_str += "\n > timestep = (*confidencemap)[i]; + for (std::pair conf : timestep) { + hocr_str += "RecognizeLine(*im_data, true, classify_debug_level > 0, kWorstDictCertainty / kCertaintyScale, - word_box, words); + word_box, words, glyph_confidences); delete im_data; SearchWords(words); } diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp index 5c502cbd..66381b3f 100644 --- a/src/ccmain/resultiterator.cpp +++ b/src/ccmain/resultiterator.cpp @@ -27,6 +27,8 @@ #include "tesseractclass.h" #include "unicharset.h" #include "unicodes.h" +#include +#include namespace tesseract { @@ -602,6 +604,14 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const { return result; } +std::vector>>* ResultIterator::GetGlyphConfidences() const { + if (it_->word() != nullptr) { + return &it_->word()->timesteps; + } else { + return nullptr; + } +} + void ResultIterator::AppendUTF8WordText(STRING *text) const { if (!it_->word()) return; ASSERT_HOST(it_->word()->best_choice != nullptr); diff --git a/src/ccmain/resultiterator.h b/src/ccmain/resultiterator.h index 7bd48368..8526aed7 100644 --- a/src/ccmain/resultiterator.h +++ b/src/ccmain/resultiterator.h @@ -22,6 +22,8 @@ #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ +#include // for std::pair +#include // for std::vector #include "ltrresultiterator.h" // for LTRResultIterator #include "platform.h" // for TESS_API, TESS_LOCAL #include "publictypes.h" // for PageIteratorLevel @@ -95,6 +97,11 @@ class TESS_API ResultIterator : public LTRResultIterator { */ virtual char* GetUTF8Text(PageIteratorLevel level) const; + /** + * Returns the glyph confidences for every LSTM timestep for the current Word + */ + virtual std::vector>>* GetGlyphConfidences() const; + /** * Return whether the current paragraph's dominant reading direction * is left-to-right (as opposed to right-to-left). diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp index 7a6160f7..72138233 100644 --- a/src/ccmain/tesseractclass.cpp +++ b/src/ccmain/tesseractclass.cpp @@ -508,6 +508,9 @@ Tesseract::Tesseract() STRING_MEMBER(page_separator, "\f", "Page separator (default is form feed control character)", this->params()), + BOOL_MEMBER(glyph_confidences, false, + "Allows to include glyph confidences in the hOCR output", + this->params()), backup_config_file_(nullptr), pix_binary_(nullptr), diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index 2be0982e..c3fe7124 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -1114,6 +1114,7 @@ class Tesseract : public Wordrec { "Preserve multiple interword spaces"); STRING_VAR_H(page_separator, "\f", "Page separator (default is form feed control character)"); + BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output"); //// ambigsrecog.cpp ///////////////////////////////////////////////////////// FILE *init_recog_training(const STRING &fname); diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h index 31b2f8df..b405bf61 100644 --- a/src/ccstruct/pageres.h +++ b/src/ccstruct/pageres.h @@ -21,6 +21,8 @@ #define PAGERES_H #include // for int32_t, int16_t +#include // for std::pair +#include // for std::vector #include // for int8_t #include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS #include "clst.h" // for CLIST_ITERATOR, CLISTIZEH @@ -218,6 +220,8 @@ class WERD_RES : public ELIST_LINK { // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between // blob i and blob i+1. GenericVector blob_gaps; + // Stores the glyph confidences of every timestep of the lstm + std::vector>> timesteps; // Ratings matrix contains classifier choices for each classified combination // of blobs. The dimension is the same as the number of blobs in chopped_word // and the leading diagonal corresponds to classifier results of the blobs diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp index 1b3ecee3..7766476a 100644 --- a/src/lstm/lstmrecognizer.cpp +++ b/src/lstm/lstmrecognizer.cpp @@ -172,7 +172,7 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) { void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, bool debug, double worst_dict_cert, const TBOX& line_box, - PointerVector* words) { + PointerVector* words, bool glyph_confidences) { NetworkIO outputs; float scale_factor; NetworkIO inputs; @@ -183,9 +183,11 @@ void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, search_ = new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_); } - search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, nullptr); + search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, + &GetUnicharset(), glyph_confidences); search_->ExtractBestPathAsWords(line_box, scale_factor, debug, - &GetUnicharset(), words); + &GetUnicharset(), words, + glyph_confidences); } // Helper computes min and mean best results in the output. diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h index f1377740..0d1afbb4 100644 --- a/src/lstm/lstmrecognizer.h +++ b/src/lstm/lstmrecognizer.h @@ -184,7 +184,8 @@ class LSTMRecognizer { // will be used in a dictionary word. void RecognizeLine(const ImageData& image_data, bool invert, bool debug, double worst_dict_cert, const TBOX& line_box, - PointerVector* words); + PointerVector* words, + bool glyph_confidences = false); // Helper computes min and mean best results in the output. void OutputStats(const NetworkIO& outputs, diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp index 9119f28e..4625d26e 100644 --- a/src/lstm/recodebeam.cpp +++ b/src/lstm/recodebeam.cpp @@ -22,6 +22,8 @@ #include "networkio.h" #include "pageres.h" #include "unicharcompress.h" +#include +#include #include @@ -77,13 +79,18 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder, // Decodes the set of network outputs, storing the lattice internally. void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio, double cert_offset, double worst_dict_cert, - const UNICHARSET* charset) { + const UNICHARSET* charset, bool glyph_confidence) { beam_size_ = 0; int width = output.Width(); + if (glyph_confidence) + timesteps.clear(); for (int t = 0; t < width; ++t) { ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]); DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, charset); + if (glyph_confidence) { + SaveMostCertainGlyphs(output.f(t), output.NumFeatures(), charset, t); + } } } void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY& output, @@ -98,6 +105,35 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY& output, } } +void RecodeBeamSearch::SaveMostCertainGlyphs(const float* outputs, + int num_outputs, + const UNICHARSET* charset, + int xCoord) { + std::vector> glyphs; + int pos = 0; + for (int i = 0; i < num_outputs; ++i) { + if (outputs[i] >= 0.01f) { + const char* charakter; + if (i + 2 >= num_outputs) { + charakter = ""; + } else if (i > 0) { + charakter = charset->id_to_unichar_ext(i + 2); + } else { + charakter = charset->id_to_unichar_ext(i); + } + pos = 0; + //order the possible glyphs within one timestep + //beginning with the most likely + while (glyphs.size() > pos && glyphs[pos].second > outputs[i]) { + pos++; + } + glyphs.insert(glyphs.begin() + pos, + std::pair(charakter, outputs[i])); + } + } + timesteps.push_back(glyphs); +} + // Returns the best path as labels/scores/xcoords similar to simple CTC. void RecodeBeamSearch::ExtractBestPathAsLabels( GenericVector* labels, GenericVector* xcoords) const { @@ -140,7 +176,8 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds( void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, float scale_factor, bool debug, const UNICHARSET* unicharset, - PointerVector* words) { + PointerVector* words, + bool glyph_confidence) { words->truncate(0); GenericVector unichar_ids; GenericVector certs; @@ -165,6 +202,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, } // Convert labels to unichar-ids. int word_end = 0; + int timestepEnd = 0; float prev_space_cert = 0.0f; for (int word_start = 0; word_start < num_ids; word_start = word_end) { for (word_end = word_start + 1; word_end < num_ids; ++word_end) { @@ -188,6 +226,12 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, WERD_RES* word_res = InitializeWord( leading_space, line_box, word_start, word_end, std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor); + if (glyph_confidence) { + for (size_t i = timestepEnd; i < xcoords[word_end]; i++) { + word_res->timesteps.push_back(timesteps[i]); + } + timestepEnd = xcoords[word_end]; + } for (int i = word_start; i < word_end; ++i) { BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; BLOB_CHOICE_IT bc_it(choices); @@ -381,7 +425,7 @@ void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs, void RecodeBeamSearch::DecodeStep(const float* outputs, int t, double dict_ratio, double cert_offset, double worst_dict_cert, - const UNICHARSET* charset) { + const UNICHARSET* charset, bool debug) { if (t == beam_.size()) beam_.push_back(new RecodeBeam); RecodeBeam* step = beam_[t]; beam_size_ = t + 1; @@ -396,7 +440,7 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t, } } else { RecodeBeam* prev = beam_[t - 1]; - if (charset != nullptr) { + if (debug) { int beam_index = BeamIndex(true, NC_ANYTHING, 0); for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { GenericVector path; diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h index 61a2490f..85636581 100644 --- a/src/lstm/recodebeam.h +++ b/src/lstm/recodebeam.h @@ -28,6 +28,8 @@ #include "networkio.h" #include "ratngs.h" #include "unicharcompress.h" +#include +#include namespace tesseract { @@ -182,7 +184,8 @@ class RecodeBeamSearch { // Decodes the set of network outputs, storing the lattice internally. // If charset is not null, it enables detailed debugging of the beam search. void Decode(const NetworkIO& output, double dict_ratio, double cert_offset, - double worst_dict_cert, const UNICHARSET* charset); + double worst_dict_cert, const UNICHARSET* charset, + bool glyph_confidence = false); void Decode(const GENERIC_2D_ARRAY& output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET* charset); @@ -201,11 +204,12 @@ class RecodeBeamSearch { // Returns the best path as a set of WERD_RES. void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor, bool debug, const UNICHARSET* unicharset, - PointerVector* words); + PointerVector* words, bool glyph_confidence); // Generates debug output of the content of the beams after a Decode. void DebugBeams(const UNICHARSET& unicharset) const; - + + std::vector< std::vector>> timesteps; // Clipping value for certainty inside Tesseract. Reflects the minimum value // of certainty that will be returned by ExtractBestPathAsUnicharIds. // Supposedly on a uniform scale that can be compared across languages and @@ -291,7 +295,10 @@ class RecodeBeamSearch { // for the current timestep. void DecodeStep(const float* outputs, int t, double dict_ratio, double cert_offset, double worst_dict_cert, - const UNICHARSET* charset); + const UNICHARSET* charset, bool debug = false); + + //Saves the most certain glyphs for the current time-step + void SaveMostCertainGlyphs(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord); // Adds to the appropriate beams the legal (according to recoder) // continuations of context prev, which is from the given index to beams_,