Add lstm_choice_mode 3: output the LSTM timestep choices grouped by Tesseract's suggested symbol segmentation

Signed-off-by: Noah Metzger <noah.metzger@bib.uni-mannheim.de>
2024-12-18 03:19:15 +08:00 · 2019-02-13 11:13:00 +01:00 · 2019-02-13 11:13:00 +01:00 · 754e38d2b4
commit 754e38d2b4
parent d2c3309df9
8 changed files with 98 additions and 15 deletions
--- a/src/api/hocrrenderer.cpp
+++ b/src/api/hocrrenderer.cpp
@ -130,7 +130,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
    return nullptr;

-  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
+  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
  int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
  bool para_is_ltr = true;        // Default direction is LTR
  const char* paragraph_lang = nullptr;
@ -215,8 +215,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
    // Now, process the word...
    std::vector<std::vector<std::pair<const char*, float>>>* confidencemap =
        nullptr;
+    std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+        symbolMap = nullptr;
    if (tesseract_->lstm_choice_mode) {
      confidencemap = res_it->GetBestLSTMSymbolChoices();
+      symbolMap = res_it->GetBestSegmentedLSTMSymbolChoices();
    }
    hocr_str << "\n      <span class='ocrx_word'"
             << " id='"
@ -324,6 +327,38 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
          tcnt++;
        }
      }
+    } else if (tesseract_->lstm_choice_mode == 3 && symbolMap != nullptr) {
+      for (size_t j = 0; j < symbolMap->size(); j++) {
+        std::vector<std::vector<std::pair<const char*, float>>> timesteps =
+            (*symbolMap)[j];
+        hocr_str << "\n       <span class='ocr_symbol'"
+                 << " id='"
+                 << "symbolstep_" << page_id << "_" << wcnt << "_" << scnt
+                 << "'>"
+                 << timesteps[0][0].first;
+        for (size_t i = 1; i < timesteps.size(); i++) {
+          hocr_str << "\n        <span class='ocrx_cinfo'"
+                   << " id='"
+                   << "timestep_" << page_id << "_" << wcnt << "_" << tcnt
+                   << "'"
+                   << ">";
+          std::vector<std::pair<const char*, float>> timestep =
+              timesteps[i];
+          for (std::pair<const char*, float> conf : timestep) {
+            hocr_str << "<span class='ocr_glyph'"
+                     << " id='"
+                     << "choice_" << page_id << "_" << wcnt << "_" << gcnt
+                     << "'"
+                     << " title='x_confs " << int(conf.second * 100) << "'>"
+                     << conf.first << "</span>";
+            gcnt++;
+          }
+          hocr_str << "</span>";
+          tcnt++;
+        }
+        hocr_str << "</span>";
+        scnt++;
+      }
    }
    hocr_str << "</span>";
    tcnt = 1;
--- a/src/ccmain/resultiterator.cpp
+++ b/src/ccmain/resultiterator.cpp
@ -605,7 +605,8 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
  return result;
 }

-std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetBestLSTMSymbolChoices() const {
+std::vector<std::vector<std::pair<const char*, float>>>*
+  ResultIterator::GetBestLSTMSymbolChoices() const {
  if (it_->word() != nullptr) {
    return &it_->word()->timesteps;
  } else {
@ -613,6 +614,15 @@ std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetBest
  }
 }

+std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+  ResultIterator::GetBestSegmentedLSTMSymbolChoices() const {
+  // Returns the current word's symbol_steps container, or nullptr when the
+  // iterator has no current word.
+  if (it_->word() == nullptr) return nullptr;
+  // Address of the word's own symbol_steps member; no copy is made.
+  return &it_->word()->symbol_steps;
+}
+
 void ResultIterator::AppendUTF8WordText(STRING *text) const {
  if (!it_->word()) return;
  ASSERT_HOST(it_->word()->best_choice != nullptr);
--- a/src/ccmain/resultiterator.h
+++ b/src/ccmain/resultiterator.h
@ -100,7 +100,10 @@ class TESS_API ResultIterator : public LTRResultIterator {
  /**
   * Returns the LSTM choices for every LSTM timestep for the current word.
  */
-  virtual std::vector<std::vector<std::pair<const char*, float>>>* GetBestLSTMSymbolChoices() const;
+  virtual std::vector<std::vector<std::pair<const char*, float>>>*
+    GetBestLSTMSymbolChoices() const;
+  virtual std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
+    GetBestSegmentedLSTMSymbolChoices() const;

  /**
   * Return whether the current paragraph's dominant reading direction
--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
@ -526,7 +526,9 @@ Tesseract::Tesseract()
          "Allows to include alternative symbols choices in the hOCR output. "
          "Valid input values are 0, 1 and 2. 0 is the default value. "
          "With 1 the alternative symbol choices per timestep are included. "
-          "With 2 the alternative symbol choices are accumulated per character.",
+          "With 2 the alternative symbol choices are accumulated per character."
+          "With 3 the alternative symbol choices per timestep are included and "
+          "separated by the suggested segmentation of Tesseract",
          this->params()),

      backup_config_file_(nullptr),
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@ -1127,7 +1127,9 @@ class Tesseract : public Wordrec {
            "Allows to include alternative symbols choices in the hOCR output. "
            "Valid input values are 0, 1 and 2. 0 is the default value. "
            "With 1 the alternative symbol choices per timestep are included. "
-            "With 2 the alternative symbol choices are accumulated per character.");
+            "With 2 the alternative symbol choices are accumulated per character."
+            "With 3 the alternative symbol choices per timestep are included and "
+            "separated by the suggested segmentation of Tesseract");

  //// ambigsrecog.cpp /////////////////////////////////////////////////////////
  FILE *init_recog_training(const STRING &fname);
--- a/src/ccstruct/pageres.h
+++ b/src/ccstruct/pageres.h
@ -222,6 +222,8 @@ class WERD_RES : public ELIST_LINK {
  GenericVector<int> blob_gaps;
  // Stores the lstm choices of every timestep
  std::vector<std::vector<std::pair<const char*, float>>> timesteps;
+  std::vector<std::vector<std::vector<std::pair<const char*, float>>>>
+      symbol_steps;
  // Ratings matrix contains classifier choices for each classified combination
  // of blobs. The dimension is the same as the number of blobs in chopped_word
  // and the leading diagonal corresponds to classifier results of the blobs
--- a/src/lstm/recodebeam.cpp
+++ b/src/lstm/recodebeam.cpp
@ -25,6 +25,7 @@
 #include <deque>
 #include <map>
 #include <set>
+#include <tuple>
 #include <vector>

 #include <algorithm>
@ -187,7 +188,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
  GenericVector<int> xcoords;
  GenericVector<const RecodeNode*> best_nodes;
  GenericVector<const RecodeNode*> second_nodes;
-  std::deque<std::pair<int,int>> best_choices;
+  std::deque<std::tuple<int, int, double>> best_choices;
  ExtractBestPaths(&best_nodes, &second_nodes);
  if (debug) {
    DebugPath(unicharset, best_nodes);
@ -201,12 +202,13 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
  int timestepEnd = 0;
  //if lstm choice mode is required in granularity level 2 it stores the x
  //Coordinates of every chosen character to match the alternative choices to it
-  if (lstm_choice_mode == 2) {
+  if (lstm_choice_mode == 2 || lstm_choice_mode == 3) {
    ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings,
                            &xcoords, &best_choices);
    if (best_choices.size() > 0) {
-      current_char = best_choices.front().first;
-      timestepEnd = best_choices.front().second;
+      current_char = std::get<0>(best_choices.front());
+      timestepEnd = std::get<1>(best_choices.front());
+      if(lstm_choice_mode == 2)
        best_choices.pop_front();
    }
  } else {
@ -258,7 +260,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
            choice_pairs.push_back(choice);
          }
        }
-        if ((best_choices.size() > 0 && i == best_choices.front().second - 1)
+        if ((best_choices.size() > 0 && i == std::get<1>(best_choices.front()) - 1)
            || i == xcoords[word_end]-1) {
          std::map<const char*, float> summed_propabilities;
          for (auto it = choice_pairs.begin(); it != choice_pairs.end(); ++it) {
@ -283,7 +285,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
                                        it->second));
          }
          if (best_choices.size() > 0) {
-            current_char = best_choices.front().first;
+            current_char = std::get<0>(best_choices.front());
            best_choices.pop_front();
          }
          choice_pairs.clear();
@ -292,6 +294,25 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
        }
      }
      timestepEnd = xcoords[word_end];
+    } else if (lstm_choice_mode == 3) {
+      std::vector<std::vector<std::pair<const char*, float>>> currentSymbol;
+      for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
+        if (i == std::get<1>(best_choices.front())) {
+          if (currentSymbol.size() > 0) {
+            word_res->symbol_steps.push_back(currentSymbol);
+            currentSymbol.clear();
+          }
+          std::vector<std::pair<const char*, float>> choice_Header;
+          choice_Header.push_back(std::pair<const char*, float>(
+              unicharset->id_to_unichar_ext(std::get<0>(best_choices.front())),
+                                            2.0));
+          currentSymbol.push_back(choice_Header);
+          if(best_choices.size()>1) best_choices.pop_front();
+        }
+        currentSymbol.push_back(timesteps[i]);
+      }
+      word_res->symbol_steps.push_back(currentSymbol);
+      timestepEnd = xcoords[word_end];
    }
    for (int i = word_start; i < word_end; ++i) {
      BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
@ -366,7 +387,7 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
    const GenericVector<const RecodeNode*>& best_nodes,
    GenericVector<int>* unichar_ids, GenericVector<float>* certs,
    GenericVector<float>* ratings, GenericVector<int>* xcoords,
-    std::deque<std::pair<int, int>>* best_choices) {
+    std::deque<std::tuple<int, int, double>>* best_choices) {
  unichar_ids->truncate(0);
  certs->truncate(0);
  ratings->truncate(0);
@ -375,6 +396,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
  int t = 0;
  int width = best_nodes.size();
  while (t < width) {
+    int id;
+    int tposition;
    double certainty = 0.0;
    double rating = 0.0;
    while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) {
@ -396,7 +419,8 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
      unichar_ids->push_back(unichar_id);
      xcoords->push_back(t);
      if (best_choices != nullptr) {
-        best_choices->push_back(std::pair<int, int>(unichar_id, t));
+        tposition = t;
+        id = unichar_id;
      }
      do {
        double cert = best_nodes[t++]->certainty;
@ -414,6 +438,10 @@ void RecodeBeamSearch::ExtractPathAsUnicharIds(
      if (certainty < certs->back()) certs->back() = certainty;
      ratings->back() += rating;
    }
+    if (best_choices != nullptr) {
+      best_choices->push_back(
+          std::tuple<int, int, double>(id, tposition, rating));
+    }
  }
  xcoords->push_back(width);
 }
--- a/src/lstm/recodebeam.h
+++ b/src/lstm/recodebeam.h
@ -29,6 +29,7 @@
 #include "unicharcompress.h"
 #include <deque>
 #include <set>
+#include <tuple>
 #include <vector>

 namespace tesseract {
@ -281,7 +282,7 @@ class RecodeBeamSearch {
      const GenericVector<const RecodeNode*>& best_nodes,
      GenericVector<int>* unichar_ids, GenericVector<float>* certs,
      GenericVector<float>* ratings, GenericVector<int>* xcoords,
-      std::deque<std::pair<int,int>>* best_choices = nullptr);
+      std::deque<std::tuple<int,int,double>>* best_choices = nullptr);

  // Sets up a word with the ratings matrix and fake blobs with boxes in the
  // right places.