This commit is contained in:
Pablo Duboue 2025-05-26 10:14:18 +02:00 committed by GitHub
commit ba6bcbe2d9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 68 additions and 15 deletions

View File

@ -574,6 +574,14 @@ public:
*/
char *GetTSVText(int page_number);
/**
* Make a TSV-formatted string from the internal data structures.
* When lang_info is true, every row carries one extra column, placed between
* the confidence and text columns, for the detected recognition language.
* Paragraph and word rows fill that column in; page, block and line rows
* leave it empty.
* When lang_info is false the output is the same as GetTSVText(page_number).
* page_number is 0-based but will appear in the output as 1-based.
* Returns nullptr if no engine is available or if recognition fails.
* Returned string must be freed with the delete [] operator.
*/
char *GetTSVText(int page_number, bool lang_info);
/**
* Make a box file for LSTM training from the internal data structures.
* Constructs coordinates in the original image - not just the rectangle.

View File

@ -220,7 +220,7 @@ private:
*/
class TESS_API TessTsvRenderer : public TessResultRenderer {
public:
explicit TessTsvRenderer(const char *outputbase, bool font_info);
explicit TessTsvRenderer(const char *outputbase, bool lang_info);
explicit TessTsvRenderer(const char *outputbase);
protected:
@ -229,7 +229,7 @@ protected:
bool EndDocumentHandler() override;
private:
bool font_info_; // whether to print font information
bool lang_info_; // whether to print language information
};
/**

View File

@ -1347,6 +1347,16 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st
* Returned string must be freed with the delete [] operator.
*/
char *TessBaseAPI::GetTSVText(int page_number) {
  // Classic TSV layout: forward to the two-argument overload with the
  // optional language column switched off.
  const bool include_lang_column = false;
  char *tsv_text = GetTSVText(page_number, include_lang_column);
  return tsv_text;
}
/**
* Make a TSV-formatted string from the internal data structures.
* When lang_info is true, an additional column with the detected language
* is emitted between the confidence and text columns.
* page_number is 0-based but will appear in the output as 1-based.
* Returned string must be freed with the delete [] operator.
*/
char *TessBaseAPI::GetTSVText(int page_number, bool lang_info) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
return nullptr;
}
@ -1361,6 +1371,7 @@ char *TessBaseAPI::GetTSVText(int page_number) {
int par_num = 0;
int line_num = 0;
int word_num = 0;
std::string lang;
std::string tsv_str;
tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
@ -1372,7 +1383,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(rect_top_);
tsv_str += "\t" + std::to_string(rect_width_);
tsv_str += "\t" + std::to_string(rect_height_);
tsv_str += "\t-1\t\n";
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t" + lang;
}
tsv_str += "\t\n";
const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
@ -1393,9 +1408,16 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for block
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t";
}
tsv_str += "\t\n"; // end of row for block
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
if (lang_info) {
lang = res_it->WordRecognitionLanguage();
}
par_num++;
line_num = 0;
word_num = 0;
@ -1405,7 +1427,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for para
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t" + lang;
}
tsv_str += "\t\n"; // end of row for para
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
line_num++;
@ -1416,7 +1442,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(line_num);
tsv_str += "\t" + std::to_string(word_num);
AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
tsv_str += "\t-1\t\n"; // end of row for line
tsv_str += "\t-1";
if (lang_info) {
tsv_str += "\t";
}
tsv_str += "\t\n"; // end of row for line
}
// Now, process the word...
@ -1433,10 +1463,18 @@ char *TessBaseAPI::GetTSVText(int page_number) {
tsv_str += "\t" + std::to_string(right - left);
tsv_str += "\t" + std::to_string(bottom - top);
tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
tsv_str += "\t";
if (lang_info) {
const char *word_lang = res_it->WordRecognitionLanguage();
tsv_str += "\t";
if (word_lang) {
tsv_str += word_lang;
}
}
#if !defined(NDEBUG)
// Increment counts if at end of block/paragraph/textline.
tsv_str += "\t";
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
lcnt++;
}

View File

@ -156,19 +156,23 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
* TSV Text Renderer interface implementation
**********************************************************************/
TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") {
font_info_ = false;
lang_info_ = false;
}
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool lang_info)
: TessResultRenderer(outputbase, "tsv") {
font_info_ = font_info;
lang_info_ = lang_info;
}
bool TessTsvRenderer::BeginDocumentHandler() {
// Output TSV column headings
AppendString(
"level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
"num\tleft\ttop\twidth\theight\tconf\ttext\n");
"num\tleft\ttop\twidth\theight\tconf\t");
if (lang_info_) {
AppendString("lang\t");
}
AppendString("text\n");
return true;
}
@ -177,7 +181,7 @@ bool TessTsvRenderer::EndDocumentHandler() {
}
bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum()));
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum(), lang_info_));
if (tsv == nullptr) {
return false;
}

View File

@ -402,6 +402,7 @@ Tesseract::Tesseract()
this->params())
, BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
, BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
, BOOL_MEMBER(tsv_lang_info, false, "Include language info in the .tsv output file", this->params())
, BOOL_MEMBER(poly_allow_detailed_fx, false,
"Allow feature extractors to see the original outline", this->params())
, BOOL_INIT_MEMBER(tessedit_init_config_only, false,

View File

@ -926,6 +926,7 @@ public:
BOOL_VAR_H(tessedit_flip_0O);
double_VAR_H(tessedit_lower_flip_hyphen);
double_VAR_H(tessedit_upper_flip_hyphen);
BOOL_VAR_H(tsv_lang_info);
BOOL_VAR_H(rej_trust_doc_dawg);
BOOL_VAR_H(rej_1Il_use_dict_word);
BOOL_VAR_H(rej_1Il_trust_permuter_type);

View File

@ -548,9 +548,9 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
api.GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;
api.GetBoolVariable("hocr_font_info", &font_info);
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, font_info);
bool lang_info;
api.GetBoolVariable("tsv_lang_info", &lang_info);
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, lang_info);
if (renderer->happy()) {
renderers.push_back(std::move(renderer));
} else {

View File

@ -1 +1,2 @@
tessedit_create_tsv 1
tsv_lang_info 0