Added a feature to enrich the hOCR output with glyph confidences

Passing the parameter -c glyph_confidences=true enriches the hOCR output with
additional information: for every timestep of the LSTM, Tesseract additionally
lists all glyphs that were considered, together with their confidences.

The format of the hOCR output changes slightly: there is now a line break
after every word, which makes the output easier for humans to read.

Signed-off-by: Noah Metzger <noah.metzger@bib.uni-mannheim.de>
This commit is contained in:
Noah Metzger 2018-07-25 15:01:07 +02:00
parent 607e8fd85c
commit 91c7504a35
11 changed files with 138 additions and 16 deletions

View File

@ -49,6 +49,8 @@
#include <fstream> // for size_t #include <fstream> // for size_t
#include <iostream> // for std::cin #include <iostream> // for std::cin
#include <memory> // for std::unique_ptr #include <memory> // for std::unique_ptr
#include <set> // for std::pair
#include <vector> // for std::vector
#include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box... #include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box...
#include "blobclass.h" // for ExtractFontName #include "blobclass.h" // for ExtractFontName
#include "boxword.h" // for BoxWord #include "boxword.h" // for BoxWord
@ -398,6 +400,7 @@ int TessBaseAPI::Init(const char* data, int data_size, const char* language,
return -1; return -1;
} }
} }
PERF_COUNT_SUB("update tesseract_") PERF_COUNT_SUB("update tesseract_")
// Update datapath and language requested for the last valid initialization. // Update datapath and language requested for the last valid initialization.
if (datapath_ == nullptr) if (datapath_ == nullptr)
@ -1389,6 +1392,17 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
*hocr_str += "'"; *hocr_str += "'";
} }
// Appends an hOCR id attribute of the form " id='<base>_<num1>_<num2>_<num3>'"
// to *hocr_str. This is the three-number variant of AddIdTohOCR.
static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
                        int num2, int num3) {
  const size_t kIdCapacity = 64;
  char id_text[kIdCapacity];
  // snprintf is given kIdCapacity - 1 as its size, so overly long ids are
  // truncated rather than overflowing the buffer.
  snprintf(id_text, kIdCapacity - 1, "%s_%d_%d_%d", base.c_str(), num1, num2,
           num3);
  // Explicitly terminate the last byte of the buffer as well.
  id_text[kIdCapacity - 1] = '\0';
  STRING& out = *hocr_str;
  out += " id='";
  out += id_text;
  out += "'";
}
static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level, static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
STRING* hocr_str) { STRING* hocr_str) {
int left, top, right, bottom; int left, top, right, bottom;
@ -1449,7 +1463,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
return nullptr; return nullptr;
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers. int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool para_is_ltr = true; // Default direction is LTR bool para_is_ltr = true; // Default direction is LTR
const char* paragraph_lang = nullptr; const char* paragraph_lang = nullptr;
@ -1529,7 +1543,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
} }
// Now, process the word... // Now, process the word...
hocr_str += "<span class='ocrx_word'"; std::vector<std::vector<std::pair<const char*, float>>>* confidencemap = nullptr;
if (tesseract_->glyph_confidences) {
confidencemap = res_it->GetGlyphConfidences();
}
hocr_str += "\n <span class='ocrx_word'";
AddIdTohOCR(&hocr_str, "word", page_id, wcnt); AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
int left, top, right, bottom; int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps; bool bold, italic, underlined, monospace, serif, smallcaps;
@ -1587,7 +1605,32 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (italic) hocr_str += "</em>"; if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>"; if (bold) hocr_str += "</strong>";
// If glyph confidence is required it is added here
if (tesseract_->glyph_confidences && confidencemap != nullptr) {
for (size_t i = 0; i < confidencemap->size(); i++) {
hocr_str += "\n <span class='ocrx_cinfo'";
AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
hocr_str += ">";
//*
std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
for (std::pair<const char*, float> conf : timestep) {
hocr_str += "<span class='ocr_glyph'";
AddIdTohOCR(&hocr_str, "glyph", page_id, wcnt, gcnt);
hocr_str.add_str_int(" title='x_confs ", int(conf.second * 100));
hocr_str += "'";
hocr_str += ">";
hocr_str += conf.first;
hocr_str += "</span>"; hocr_str += "</span>";
gcnt++;
}
//*/
hocr_str += "</span>";
tcnt++;
}
}
hocr_str += "</span>";
tcnt = 1;
gcnt = 1;
wcnt++; wcnt++;
// Close any ending block/paragraph/textline. // Close any ending block/paragraph/textline.
if (last_word_in_line) { if (last_word_in_line) {

View File

@ -239,7 +239,7 @@ void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
if (im_data == nullptr) return; if (im_data == nullptr) return;
lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0, lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
kWorstDictCertainty / kCertaintyScale, kWorstDictCertainty / kCertaintyScale,
word_box, words); word_box, words, glyph_confidences);
delete im_data; delete im_data;
SearchWords(words); SearchWords(words);
} }

View File

@ -27,6 +27,8 @@
#include "tesseractclass.h" #include "tesseractclass.h"
#include "unicharset.h" #include "unicharset.h"
#include "unicodes.h" #include "unicodes.h"
#include <set>
#include <vector>
namespace tesseract { namespace tesseract {
@ -602,6 +604,14 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
return result; return result;
} }
// Returns a pointer to the per-timestep glyph confidences stored in the
// current word, or nullptr when the iterator has no current word.
// The pointer refers to the word's own `timesteps` member; nothing is copied.
std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetGlyphConfidences() const {
  if (it_->word() == nullptr) {
    return nullptr;
  }
  return &it_->word()->timesteps;
}
void ResultIterator::AppendUTF8WordText(STRING *text) const { void ResultIterator::AppendUTF8WordText(STRING *text) const {
if (!it_->word()) return; if (!it_->word()) return;
ASSERT_HOST(it_->word()->best_choice != nullptr); ASSERT_HOST(it_->word()->best_choice != nullptr);

View File

@ -22,6 +22,8 @@
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ #define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
#include <set> // for std::pair
#include <vector> // for std::vector
#include "ltrresultiterator.h" // for LTRResultIterator #include "ltrresultiterator.h" // for LTRResultIterator
#include "platform.h" // for TESS_API, TESS_LOCAL #include "platform.h" // for TESS_API, TESS_LOCAL
#include "publictypes.h" // for PageIteratorLevel #include "publictypes.h" // for PageIteratorLevel
@ -95,6 +97,11 @@ class TESS_API ResultIterator : public LTRResultIterator {
*/ */
virtual char* GetUTF8Text(PageIteratorLevel level) const; virtual char* GetUTF8Text(PageIteratorLevel level) const;
/**
* Returns the glyph confidences for every LSTM timestep of the current word.
* The outer vector holds one entry per timestep; each entry is a list of
* (glyph text, confidence) pairs.
* Returns nullptr if there is no current word. The returned pointer refers to
* data stored inside the word itself, so the caller must not delete it.
*/
virtual std::vector<std::vector<std::pair<const char*, float>>>* GetGlyphConfidences() const;
/** /**
* Return whether the current paragraph's dominant reading direction * Return whether the current paragraph's dominant reading direction
* is left-to-right (as opposed to right-to-left). * is left-to-right (as opposed to right-to-left).

View File

@ -508,6 +508,9 @@ Tesseract::Tesseract()
STRING_MEMBER(page_separator, "\f", STRING_MEMBER(page_separator, "\f",
"Page separator (default is form feed control character)", "Page separator (default is form feed control character)",
this->params()), this->params()),
BOOL_MEMBER(glyph_confidences, false,
"Allows to include glyph confidences in the hOCR output",
this->params()),
backup_config_file_(nullptr), backup_config_file_(nullptr),
pix_binary_(nullptr), pix_binary_(nullptr),

View File

@ -1114,6 +1114,7 @@ class Tesseract : public Wordrec {
"Preserve multiple interword spaces"); "Preserve multiple interword spaces");
STRING_VAR_H(page_separator, "\f", STRING_VAR_H(page_separator, "\f",
"Page separator (default is form feed control character)"); "Page separator (default is form feed control character)");
BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output");
//// ambigsrecog.cpp ///////////////////////////////////////////////////////// //// ambigsrecog.cpp /////////////////////////////////////////////////////////
FILE *init_recog_training(const STRING &fname); FILE *init_recog_training(const STRING &fname);

View File

@ -21,6 +21,8 @@
#define PAGERES_H #define PAGERES_H
#include <cstdint> // for int32_t, int16_t #include <cstdint> // for int32_t, int16_t
#include <set> // for std::pair
#include <vector> // for std::vector
#include <sys/types.h> // for int8_t #include <sys/types.h> // for int8_t
#include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS #include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS
#include "clst.h" // for CLIST_ITERATOR, CLISTIZEH #include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
@ -218,6 +220,8 @@ class WERD_RES : public ELIST_LINK {
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
// blob i and blob i+1. // blob i and blob i+1.
GenericVector<int> blob_gaps; GenericVector<int> blob_gaps;
// Stores the glyph confidences of every timestep of the lstm
std::vector<std::vector<std::pair<const char*, float>>> timesteps;
// Ratings matrix contains classifier choices for each classified combination // Ratings matrix contains classifier choices for each classified combination
// of blobs. The dimension is the same as the number of blobs in chopped_word // of blobs. The dimension is the same as the number of blobs in chopped_word
// and the leading diagonal corresponds to classifier results of the blobs // and the leading diagonal corresponds to classifier results of the blobs

View File

@ -172,7 +172,7 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
bool debug, double worst_dict_cert, bool debug, double worst_dict_cert,
const TBOX& line_box, const TBOX& line_box,
PointerVector<WERD_RES>* words) { PointerVector<WERD_RES>* words, bool glyph_confidences) {
NetworkIO outputs; NetworkIO outputs;
float scale_factor; float scale_factor;
NetworkIO inputs; NetworkIO inputs;
@ -183,9 +183,11 @@ void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
search_ = search_ =
new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_); new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_);
} }
search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, nullptr); search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert,
&GetUnicharset(), glyph_confidences);
search_->ExtractBestPathAsWords(line_box, scale_factor, debug, search_->ExtractBestPathAsWords(line_box, scale_factor, debug,
&GetUnicharset(), words); &GetUnicharset(), words,
glyph_confidences);
} }
// Helper computes min and mean best results in the output. // Helper computes min and mean best results in the output.

View File

@ -184,7 +184,8 @@ class LSTMRecognizer {
// will be used in a dictionary word. // will be used in a dictionary word.
void RecognizeLine(const ImageData& image_data, bool invert, bool debug, void RecognizeLine(const ImageData& image_data, bool invert, bool debug,
double worst_dict_cert, const TBOX& line_box, double worst_dict_cert, const TBOX& line_box,
PointerVector<WERD_RES>* words); PointerVector<WERD_RES>* words,
bool glyph_confidences = false);
// Helper computes min and mean best results in the output. // Helper computes min and mean best results in the output.
void OutputStats(const NetworkIO& outputs, void OutputStats(const NetworkIO& outputs,

View File

@ -22,6 +22,8 @@
#include "networkio.h" #include "networkio.h"
#include "pageres.h" #include "pageres.h"
#include "unicharcompress.h" #include "unicharcompress.h"
#include <set>
#include <vector>
#include <algorithm> #include <algorithm>
@ -77,13 +79,18 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder,
// Decodes the set of network outputs, storing the lattice internally. // Decodes the set of network outputs, storing the lattice internally.
void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio, void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
double cert_offset, double worst_dict_cert, double cert_offset, double worst_dict_cert,
const UNICHARSET* charset) { const UNICHARSET* charset, bool glyph_confidence) {
beam_size_ = 0; beam_size_ = 0;
int width = output.Width(); int width = output.Width();
if (glyph_confidence)
timesteps.clear();
for (int t = 0; t < width; ++t) { for (int t = 0; t < width; ++t) {
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]); ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
charset); charset);
if (glyph_confidence) {
SaveMostCertainGlyphs(output.f(t), output.NumFeatures(), charset, t);
}
} }
} }
void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output, void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
@ -98,6 +105,35 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
} }
} }
// Records the glyph candidates of one timestep and appends them to
// `timesteps`.
//
// outputs      network output values for this timestep, one per output index.
// num_outputs  number of entries in `outputs`.
// charset      used to translate an output index into its glyph text.
// xCoord       currently unused.
//
// Every output with a value of at least 0.01 is kept. The kept glyphs are
// ordered from the highest to the lowest value; of two equal values the one
// with the larger output index comes first. A (possibly empty) entry is
// appended to `timesteps` on every call.
void RecodeBeamSearch::SaveMostCertainGlyphs(const float* outputs,
                                             int num_outputs,
                                             const UNICHARSET* charset,
                                             int xCoord) {
  (void)xCoord;  // Kept for interface compatibility; not needed here.
  // Build the list directly inside `timesteps` so that the finished vector
  // does not have to be copied afterwards.
  timesteps.emplace_back();
  std::vector<std::pair<const char*, float>>& glyphs = timesteps.back();
  for (int i = 0; i < num_outputs; ++i) {
    const float score = outputs[i];
    if (score >= 0.01f) {
      // NOTE(review): this mapping treats the last two outputs as having no
      // printable glyph and shifts every index > 0 by two before the charset
      // lookup - presumably to skip special entries; confirm against the
      // model's recoder.
      // NOTE(review): the stored pointers are whatever id_to_unichar_ext
      // returns; their lifetime is presumably tied to `charset` - confirm.
      const char* glyph_text;
      if (i + 2 >= num_outputs) {
        glyph_text = "";
      } else if (i > 0) {
        glyph_text = charset->id_to_unichar_ext(i + 2);
      } else {
        glyph_text = charset->id_to_unichar_ext(i);
      }
      // Keep the glyphs of this timestep ordered, most likely first: skip
      // all entries that are strictly more likely than the new one.
      auto insert_at = glyphs.begin();
      while (insert_at != glyphs.end() && insert_at->second > score) {
        ++insert_at;
      }
      glyphs.emplace(insert_at, glyph_text, score);
    }
  }
}
// Returns the best path as labels/scores/xcoords similar to simple CTC. // Returns the best path as labels/scores/xcoords similar to simple CTC.
void RecodeBeamSearch::ExtractBestPathAsLabels( void RecodeBeamSearch::ExtractBestPathAsLabels(
GenericVector<int>* labels, GenericVector<int>* xcoords) const { GenericVector<int>* labels, GenericVector<int>* xcoords) const {
@ -140,7 +176,8 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(
void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
float scale_factor, bool debug, float scale_factor, bool debug,
const UNICHARSET* unicharset, const UNICHARSET* unicharset,
PointerVector<WERD_RES>* words) { PointerVector<WERD_RES>* words,
bool glyph_confidence) {
words->truncate(0); words->truncate(0);
GenericVector<int> unichar_ids; GenericVector<int> unichar_ids;
GenericVector<float> certs; GenericVector<float> certs;
@ -165,6 +202,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
} }
// Convert labels to unichar-ids. // Convert labels to unichar-ids.
int word_end = 0; int word_end = 0;
int timestepEnd = 0;
float prev_space_cert = 0.0f; float prev_space_cert = 0.0f;
for (int word_start = 0; word_start < num_ids; word_start = word_end) { for (int word_start = 0; word_start < num_ids; word_start = word_end) {
for (word_end = word_start + 1; word_end < num_ids; ++word_end) { for (word_end = word_start + 1; word_end < num_ids; ++word_end) {
@ -188,6 +226,12 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
WERD_RES* word_res = InitializeWord( WERD_RES* word_res = InitializeWord(
leading_space, line_box, word_start, word_end, leading_space, line_box, word_start, word_end,
std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor); std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor);
if (glyph_confidence) {
for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
word_res->timesteps.push_back(timesteps[i]);
}
timestepEnd = xcoords[word_end];
}
for (int i = word_start; i < word_end; ++i) { for (int i = word_start; i < word_end; ++i) {
BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
BLOB_CHOICE_IT bc_it(choices); BLOB_CHOICE_IT bc_it(choices);
@ -381,7 +425,7 @@ void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
void RecodeBeamSearch::DecodeStep(const float* outputs, int t, void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
double dict_ratio, double cert_offset, double dict_ratio, double cert_offset,
double worst_dict_cert, double worst_dict_cert,
const UNICHARSET* charset) { const UNICHARSET* charset, bool debug) {
if (t == beam_.size()) beam_.push_back(new RecodeBeam); if (t == beam_.size()) beam_.push_back(new RecodeBeam);
RecodeBeam* step = beam_[t]; RecodeBeam* step = beam_[t];
beam_size_ = t + 1; beam_size_ = t + 1;
@ -396,7 +440,7 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
} }
} else { } else {
RecodeBeam* prev = beam_[t - 1]; RecodeBeam* prev = beam_[t - 1];
if (charset != nullptr) { if (debug) {
int beam_index = BeamIndex(true, NC_ANYTHING, 0); int beam_index = BeamIndex(true, NC_ANYTHING, 0);
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
GenericVector<const RecodeNode*> path; GenericVector<const RecodeNode*> path;

View File

@ -28,6 +28,8 @@
#include "networkio.h" #include "networkio.h"
#include "ratngs.h" #include "ratngs.h"
#include "unicharcompress.h" #include "unicharcompress.h"
#include <set>
#include <vector>
namespace tesseract { namespace tesseract {
@ -182,7 +184,8 @@ class RecodeBeamSearch {
// Decodes the set of network outputs, storing the lattice internally. // Decodes the set of network outputs, storing the lattice internally.
// If charset is not null, it enables detailed debugging of the beam search. // If charset is not null, it enables detailed debugging of the beam search.
void Decode(const NetworkIO& output, double dict_ratio, double cert_offset, void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
double worst_dict_cert, const UNICHARSET* charset); double worst_dict_cert, const UNICHARSET* charset,
bool glyph_confidence = false);
void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio, void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
double cert_offset, double worst_dict_cert, double cert_offset, double worst_dict_cert,
const UNICHARSET* charset); const UNICHARSET* charset);
@ -201,11 +204,12 @@ class RecodeBeamSearch {
// Returns the best path as a set of WERD_RES. // Returns the best path as a set of WERD_RES.
void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor, void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
bool debug, const UNICHARSET* unicharset, bool debug, const UNICHARSET* unicharset,
PointerVector<WERD_RES>* words); PointerVector<WERD_RES>* words, bool glyph_confidence);
// Generates debug output of the content of the beams after a Decode. // Generates debug output of the content of the beams after a Decode.
void DebugBeams(const UNICHARSET& unicharset) const; void DebugBeams(const UNICHARSET& unicharset) const;
// Glyph candidates recorded per timestep as (glyph text, network output value)
// pairs. Cleared and refilled by Decode when glyph confidences are requested,
// and copied into the resulting words by ExtractBestPathAsWords.
std::vector< std::vector<std::pair<const char*, float>>> timesteps;
// Clipping value for certainty inside Tesseract. Reflects the minimum value // Clipping value for certainty inside Tesseract. Reflects the minimum value
// of certainty that will be returned by ExtractBestPathAsUnicharIds. // of certainty that will be returned by ExtractBestPathAsUnicharIds.
// Supposedly on a uniform scale that can be compared across languages and // Supposedly on a uniform scale that can be compared across languages and
@ -291,7 +295,10 @@ class RecodeBeamSearch {
// for the current timestep. // for the current timestep.
void DecodeStep(const float* outputs, int t, double dict_ratio, void DecodeStep(const float* outputs, int t, double dict_ratio,
double cert_offset, double worst_dict_cert, double cert_offset, double worst_dict_cert,
const UNICHARSET* charset); const UNICHARSET* charset, bool debug = false);
// Saves the most certain glyphs for the current time-step: every output with
// a value of at least 0.01 is stored together with its glyph text, ordered
// from most to least likely, and the resulting list is appended to timesteps.
// xCoord is currently not used by the implementation.
void SaveMostCertainGlyphs(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord);
// Adds to the appropriate beams the legal (according to recoder) // Adds to the appropriate beams the legal (according to recoder)
// continuations of context prev, which is from the given index to beams_, // continuations of context prev, which is from the given index to beams_,