From 91c7504a35310ed576d9522211373f81f49bd854 Mon Sep 17 00:00:00 2001
From: Noah Metzger <noah.metzger@bib.uni-mannheim.de>
Date: Wed, 25 Jul 2018 15:01:07 +0200
Subject: [PATCH] Added a feature to enrich the hOCR output with glyph
 confidences

By using the parameter -c glyph_confidences=true the user is able to enrich
the hOCR output with additional information. Tesseract then lists additionally
the timesteps with all glyphs that were considered with their confidence
for every timestep of the LSTM.

The format of the hOCR output is slightly changed: There is now a linebreak
after every word for better readability by humans.

Signed-off-by: Noah Metzger <noah.metzger@bib.uni-mannheim.de>
---
 src/api/baseapi.cpp           | 49 +++++++++++++++++++++++++++++++--
 src/ccmain/linerec.cpp        |  2 +-
 src/ccmain/resultiterator.cpp | 10 +++++++
 src/ccmain/resultiterator.h   |  7 +++++
 src/ccmain/tesseractclass.cpp |  3 ++
 src/ccmain/tesseractclass.h   |  1 +
 src/ccstruct/pageres.h        |  4 +++
 src/lstm/lstmrecognizer.cpp   |  8 ++++--
 src/lstm/lstmrecognizer.h     |  3 +-
 src/lstm/recodebeam.cpp       | 52 ++++++++++++++++++++++++++++++++---
 src/lstm/recodebeam.h         | 15 +++++++---
 11 files changed, 138 insertions(+), 16 deletions(-)
diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
index c5c80a73..296e2a98 100644
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@@ -49,6 +49,8 @@
 #include <fstream>             // for size_t
 #include <iostream>            // for std::cin
 #include <memory>              // for std::unique_ptr
+#include <set>                 // for std::pair
+#include <vector>              // for std::vector
 #include "allheaders.h"        // for pixDestroy, boxCreate, boxaAddBox, box...
 #include "blobclass.h"         // for ExtractFontName
 #include "boxword.h"           // for BoxWord
@@ -398,6 +400,7 @@ int TessBaseAPI::Init(const char* data, int data_size, const char* language,
       return -1;
     }
   }
+
   PERF_COUNT_SUB("update tesseract_")
   // Update datapath and language requested for the last valid initialization.
   if (datapath_ == nullptr)
@@ -1389,6 +1392,17 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
   *hocr_str += "'";
 }
 
+static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
+  int num2, int num3) {
+  const size_t BUFSIZE = 64;
+  char id_buffer[BUFSIZE];
+  snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d_%d", base.c_str(), num1, num2,num3);
+  id_buffer[BUFSIZE - 1] = '\0';
+  *hocr_str += " id='";
+  *hocr_str += id_buffer;
+  *hocr_str += "'";
+}
+
 static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
                          STRING* hocr_str) {
   int left, top, right, bottom;
@@ -1449,7 +1463,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
   if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
     return nullptr;
 
-  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
+  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
   int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
   bool para_is_ltr = true;        // Default direction is LTR
   const char* paragraph_lang = nullptr;
@@ -1529,7 +1543,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     }
 
     // Now, process the word...
-    hocr_str += "<span class='ocrx_word'";
+    std::vector<std::vector<std::pair<const char*, float>>>* confidencemap = nullptr;
+    if (tesseract_->glyph_confidences) {
+      confidencemap = res_it->GetGlyphConfidences();
+    }
+    hocr_str += "\n      <span class='ocrx_word'";
     AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
     int left, top, right, bottom;
     bool bold, italic, underlined, monospace, serif, smallcaps;
@@ -1587,7 +1605,32 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
     if (italic) hocr_str += "</em>";
     if (bold) hocr_str += "</strong>";
-    hocr_str += "</span> ";
+    // If glyph confidence is required it is added here
+    if (tesseract_->glyph_confidences && confidencemap != nullptr) {
+      for (size_t i = 0; i < confidencemap->size(); i++) {
+        hocr_str += "\n       <span class='ocrx_cinfo'";
+        AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
+        hocr_str += ">";
+        //*
+        std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
+        for (std::pair<const char*, float> conf : timestep) {
+          hocr_str += "<span class='ocr_glyph'";
+          AddIdTohOCR(&hocr_str, "glyph", page_id, wcnt, gcnt);
+          hocr_str.add_str_int(" title='x_confs ", int(conf.second * 100));
+          hocr_str += "'";
+          hocr_str += ">";
+          hocr_str += conf.first;
+          hocr_str += "</span>";
+          gcnt++;
+        }
+        //*/
+        hocr_str += "</span>";
+        tcnt++;
+      }
+    }
+    hocr_str += "</span>";
+    tcnt = 1;
+    gcnt = 1;
     wcnt++;
     // Close any ending block/paragraph/textline.
     if (last_word_in_line) {
diff --git a/src/ccmain/linerec.cpp b/src/ccmain/linerec.cpp
index 081bed08..fc4e53a5 100644
--- a/src/ccmain/linerec.cpp
+++ b/src/ccmain/linerec.cpp
@@ -239,7 +239,7 @@ void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
   if (im_data == nullptr) return;
   lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
                                   kWorstDictCertainty / kCertaintyScale,
-                                  word_box, words);
+                                  word_box, words, glyph_confidences);
   delete im_data;
   SearchWords(words);
 }
diff --git a/src/ccmain/resultiterator.cpp b/src/ccmain/resultiterator.cpp
index 5c502cbd..66381b3f 100644
--- a/src/ccmain/resultiterator.cpp
+++ b/src/ccmain/resultiterator.cpp
@@ -27,6 +27,8 @@
 #include "tesseractclass.h"
 #include "unicharset.h"
 #include "unicodes.h"
+#include <set>
+#include <vector>
 
 namespace tesseract {
 
@@ -602,6 +604,14 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
   return result;
 }
 
+std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetGlyphConfidences() const {
+  if (it_->word() != nullptr) {
+    return &it_->word()->timesteps;
+  } else {
+    return nullptr;
+  }
+}
+
 void ResultIterator::AppendUTF8WordText(STRING *text) const {
   if (!it_->word()) return;
   ASSERT_HOST(it_->word()->best_choice != nullptr);
diff --git a/src/ccmain/resultiterator.h b/src/ccmain/resultiterator.h
index 7bd48368..8526aed7 100644
--- a/src/ccmain/resultiterator.h
+++ b/src/ccmain/resultiterator.h
@@ -22,6 +22,8 @@
 #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
 #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
 
+#include <set>                  // for std::pair
+#include <vector>               // for std::vector
 #include "ltrresultiterator.h"  // for LTRResultIterator
 #include "platform.h"           // for TESS_API, TESS_LOCAL
 #include "publictypes.h"        // for PageIteratorLevel
@@ -95,6 +97,11 @@ class TESS_API ResultIterator : public LTRResultIterator {
   */
   virtual char* GetUTF8Text(PageIteratorLevel level) const;
 
+  /**
+   * Returns the glyph confidences for every LSTM timestep for the current Word
+  */
+  virtual std::vector<std::vector<std::pair<const char*, float>>>* GetGlyphConfidences() const;
+
   /**
    * Return whether the current paragraph's dominant reading direction
    * is left-to-right (as opposed to right-to-left).
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
index 7a6160f7..72138233 100644
--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
@@ -508,6 +508,9 @@ Tesseract::Tesseract()
       STRING_MEMBER(page_separator, "\f",
                     "Page separator (default is form feed control character)",
                     this->params()),
+      BOOL_MEMBER(glyph_confidences, false,
+                  "Allows to include glyph confidences in the hOCR output",
+                   this->params()),
 
       backup_config_file_(nullptr),
       pix_binary_(nullptr),
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index 2be0982e..c3fe7124 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -1114,6 +1114,7 @@ class Tesseract : public Wordrec {
              "Preserve multiple interword spaces");
   STRING_VAR_H(page_separator, "\f",
                "Page separator (default is form feed control character)");
+  BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output");
 
   //// ambigsrecog.cpp /////////////////////////////////////////////////////////
   FILE *init_recog_training(const STRING &fname);
diff --git a/src/ccstruct/pageres.h b/src/ccstruct/pageres.h
index 31b2f8df..b405bf61 100644
--- a/src/ccstruct/pageres.h
+++ b/src/ccstruct/pageres.h
@@ -21,6 +21,8 @@
 #define PAGERES_H
 
 #include <cstdint>             // for int32_t, int16_t
+#include <set>                 // for std::pair
+#include <vector>              // for std::vector
 #include <sys/types.h>         // for int8_t
 #include "blamer.h"            // for BlamerBundle (ptr only), IRR_NUM_REASONS
 #include "clst.h"              // for CLIST_ITERATOR, CLISTIZEH
@@ -218,6 +220,8 @@ class WERD_RES : public ELIST_LINK {
   // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
   // blob i and blob i+1.
   GenericVector<int> blob_gaps;
+  // Stores the glyph confidences of every timestep of the lstm
+  std::vector<std::vector<std::pair<const char*, float>>> timesteps;
   // Ratings matrix contains classifier choices for each classified combination
   // of blobs. The dimension is the same as the number of blobs in chopped_word
   // and the leading diagonal corresponds to classifier results of the blobs
diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp
index 1b3ecee3..7766476a 100644
--- a/src/lstm/lstmrecognizer.cpp
+++ b/src/lstm/lstmrecognizer.cpp
@@ -172,7 +172,7 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
 void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
                                    bool debug, double worst_dict_cert,
                                    const TBOX& line_box,
-                                   PointerVector<WERD_RES>* words) {
+                                   PointerVector<WERD_RES>* words, bool glyph_confidences) {
   NetworkIO outputs;
   float scale_factor;
   NetworkIO inputs;
@@ -183,9 +183,11 @@ void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
     search_ =
         new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_);
   }
-  search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, nullptr);
+  search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert,
+                  &GetUnicharset(), glyph_confidences);
   search_->ExtractBestPathAsWords(line_box, scale_factor, debug,
-                                  &GetUnicharset(), words);
+                                  &GetUnicharset(), words, 
+                                  glyph_confidences);
 }
 
 // Helper computes min and mean best results in the output.
diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h
index f1377740..0d1afbb4 100644
--- a/src/lstm/lstmrecognizer.h
+++ b/src/lstm/lstmrecognizer.h
@@ -184,7 +184,8 @@ class LSTMRecognizer {
   // will be used in a dictionary word.
   void RecognizeLine(const ImageData& image_data, bool invert, bool debug,
                      double worst_dict_cert, const TBOX& line_box,
-                     PointerVector<WERD_RES>* words);
+                     PointerVector<WERD_RES>* words, 
+                     bool glyph_confidences = false);
 
   // Helper computes min and mean best results in the output.
   void OutputStats(const NetworkIO& outputs,
diff --git a/src/lstm/recodebeam.cpp b/src/lstm/recodebeam.cpp
index 9119f28e..4625d26e 100644
--- a/src/lstm/recodebeam.cpp
+++ b/src/lstm/recodebeam.cpp
@@ -22,6 +22,8 @@
 #include "networkio.h"
 #include "pageres.h"
 #include "unicharcompress.h"
+#include <set>
+#include <vector>
 
 #include <algorithm>
 
@@ -77,13 +79,18 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder,
 // Decodes the set of network outputs, storing the lattice internally.
 void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
                               double cert_offset, double worst_dict_cert,
-                              const UNICHARSET* charset) {
+                              const UNICHARSET* charset, bool glyph_confidence) {
   beam_size_ = 0;
   int width = output.Width();
+  if (glyph_confidence) 
+    timesteps.clear();
   for (int t = 0; t < width; ++t) {
     ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
     DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
                charset);
+    if (glyph_confidence) {
+      SaveMostCertainGlyphs(output.f(t), output.NumFeatures(), charset, t);
+    }
   }
 }
 void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
@@ -98,6 +105,35 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
   }
 }
 
+void RecodeBeamSearch::SaveMostCertainGlyphs(const float* outputs,
+                                             int num_outputs,
+                                             const UNICHARSET* charset,
+                                             int xCoord) {
+  std::vector<std::pair<const char*, float>> glyphs;
+  int pos = 0;
+  for (int i = 0; i < num_outputs; ++i) {
+    if (outputs[i] >= 0.01f) {
+      const char* charakter;
+      if (i + 2 >= num_outputs) {
+        charakter = "";
+      } else if (i > 0) {
+        charakter = charset->id_to_unichar_ext(i + 2);
+      } else {
+        charakter = charset->id_to_unichar_ext(i);
+      }
+      pos = 0;
+      //order the possible glyphs within one timestep
+      //beginning with the most likely
+      while (glyphs.size() > pos && glyphs[pos].second > outputs[i]) {
+        pos++;
+      }
+      glyphs.insert(glyphs.begin() + pos,
+                    std::pair<const char*, float>(charakter, outputs[i]));      
+    }
+  }
+  timesteps.push_back(glyphs);
+}
+
 // Returns the best path as labels/scores/xcoords similar to simple CTC.
 void RecodeBeamSearch::ExtractBestPathAsLabels(
     GenericVector<int>* labels, GenericVector<int>* xcoords) const {
@@ -140,7 +176,8 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(
 void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
                                               float scale_factor, bool debug,
                                               const UNICHARSET* unicharset,
-                                              PointerVector<WERD_RES>* words) {
+                                              PointerVector<WERD_RES>* words,
+                                              bool glyph_confidence) {
   words->truncate(0);
   GenericVector<int> unichar_ids;
   GenericVector<float> certs;
@@ -165,6 +202,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
   }
   // Convert labels to unichar-ids.
   int word_end = 0;
+  int timestepEnd = 0;
   float prev_space_cert = 0.0f;
   for (int word_start = 0; word_start < num_ids; word_start = word_end) {
     for (word_end = word_start + 1; word_end < num_ids; ++word_end) {
@@ -188,6 +226,12 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
     WERD_RES* word_res = InitializeWord(
         leading_space, line_box, word_start, word_end,
         std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor);
+    if (glyph_confidence) {
+      for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
+        word_res->timesteps.push_back(timesteps[i]);
+      }
+      timestepEnd = xcoords[word_end];
+    }
     for (int i = word_start; i < word_end; ++i) {
       BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
       BLOB_CHOICE_IT bc_it(choices);
@@ -381,7 +425,7 @@ void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
 void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
                                   double dict_ratio, double cert_offset,
                                   double worst_dict_cert,
-                                  const UNICHARSET* charset) {
+                                  const UNICHARSET* charset, bool debug) {
   if (t == beam_.size()) beam_.push_back(new RecodeBeam);
   RecodeBeam* step = beam_[t];
   beam_size_ = t + 1;
@@ -396,7 +440,7 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
     }
   } else {
     RecodeBeam* prev = beam_[t - 1];
-    if (charset != nullptr) {
+    if (debug) {
       int beam_index = BeamIndex(true, NC_ANYTHING, 0);
       for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
         GenericVector<const RecodeNode*> path;
diff --git a/src/lstm/recodebeam.h b/src/lstm/recodebeam.h
index 61a2490f..85636581 100644
--- a/src/lstm/recodebeam.h
+++ b/src/lstm/recodebeam.h
@@ -28,6 +28,8 @@
 #include "networkio.h"
 #include "ratngs.h"
 #include "unicharcompress.h"
+#include <set>
+#include <vector>
 
 namespace tesseract {
 
@@ -182,7 +184,8 @@ class RecodeBeamSearch {
   // Decodes the set of network outputs, storing the lattice internally.
   // If charset is not null, it enables detailed debugging of the beam search.
   void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
-              double worst_dict_cert, const UNICHARSET* charset);
+              double worst_dict_cert, const UNICHARSET* charset,
+              bool glyph_confidence = false);
   void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
               double cert_offset, double worst_dict_cert,
               const UNICHARSET* charset);
@@ -201,11 +204,12 @@ class RecodeBeamSearch {
   // Returns the best path as a set of WERD_RES.
   void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
                               bool debug, const UNICHARSET* unicharset,
-                              PointerVector<WERD_RES>* words);
+                              PointerVector<WERD_RES>* words, bool glyph_confidence);
 
   // Generates debug output of the content of the beams after a Decode.
   void DebugBeams(const UNICHARSET& unicharset) const;
-
+  
+  std::vector< std::vector<std::pair<const char*, float>>> timesteps;
   // Clipping value for certainty inside Tesseract. Reflects the minimum value
   // of certainty that will be returned by ExtractBestPathAsUnicharIds.
   // Supposedly on a uniform scale that can be compared across languages and
@@ -291,7 +295,10 @@ class RecodeBeamSearch {
   // for the current timestep.
   void DecodeStep(const float* outputs, int t, double dict_ratio,
                   double cert_offset, double worst_dict_cert,
-                  const UNICHARSET* charset);
+                  const UNICHARSET* charset, bool debug = false);
+
+  //Saves the most certain glyphs for the current time-step
+  void SaveMostCertainGlyphs(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord);
 
   // Adds to the appropriate beams the legal (according to recoder)
   // continuations of context prev, which is from the given index to beams_,