mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 09:52:40 +08:00
Merge efb267097d
into 3b7c70e34d
This commit is contained in:
commit
ba6bcbe2d9
@ -574,6 +574,14 @@ public:
|
||||
*/
|
||||
char *GetTSVText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* If lang_info is true, each row gets an additional column for the
* recognition language (left empty on rows that have none).
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *GetTSVText(int page_number, bool lang_info);
|
||||
|
||||
/**
|
||||
* Make a box file for LSTM training from the internal data structures.
|
||||
* Constructs coordinates in the original image - not just the rectangle.
|
||||
|
@ -220,7 +220,7 @@ private:
|
||||
*/
|
||||
class TESS_API TessTsvRenderer : public TessResultRenderer {
|
||||
public:
|
||||
explicit TessTsvRenderer(const char *outputbase, bool font_info);
|
||||
explicit TessTsvRenderer(const char *outputbase, bool lang_info);
|
||||
explicit TessTsvRenderer(const char *outputbase);
|
||||
|
||||
protected:
|
||||
@ -229,7 +229,7 @@ protected:
|
||||
bool EndDocumentHandler() override;
|
||||
|
||||
private:
|
||||
bool font_info_; // whether to print font information
|
||||
bool lang_info_; // whether to print language information
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -1347,6 +1347,16 @@ static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::st
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *TessBaseAPI::GetTSVText(int page_number) {
|
||||
// Delegates to the two-argument overload with lang_info disabled, so no
// language column is added to the output.
return GetTSVText(page_number, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* If lang_info is true, each row gets an additional column for the
* recognition language (left empty on rows that have none).
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char *TessBaseAPI::GetTSVText(int page_number, bool lang_info) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
|
||||
return nullptr;
|
||||
}
|
||||
@ -1361,6 +1371,7 @@ char *TessBaseAPI::GetTSVText(int page_number) {
|
||||
int par_num = 0;
|
||||
int line_num = 0;
|
||||
int word_num = 0;
|
||||
std::string lang;
|
||||
|
||||
std::string tsv_str;
|
||||
tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
|
||||
@ -1372,7 +1383,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
|
||||
tsv_str += "\t" + std::to_string(rect_top_);
|
||||
tsv_str += "\t" + std::to_string(rect_width_);
|
||||
tsv_str += "\t" + std::to_string(rect_height_);
|
||||
tsv_str += "\t-1\t\n";
|
||||
tsv_str += "\t-1";
|
||||
if (lang_info) {
|
||||
tsv_str += "\t" + lang;
|
||||
}
|
||||
tsv_str += "\t\n";
|
||||
|
||||
const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
@ -1393,9 +1408,16 @@ char *TessBaseAPI::GetTSVText(int page_number) {
|
||||
tsv_str += "\t" + std::to_string(line_num);
|
||||
tsv_str += "\t" + std::to_string(word_num);
|
||||
AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
|
||||
tsv_str += "\t-1\t\n"; // end of row for block
|
||||
tsv_str += "\t-1";
|
||||
if (lang_info) {
|
||||
tsv_str += "\t";
|
||||
}
|
||||
tsv_str += "\t\n"; // end of row for block
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
if (lang_info) {
|
||||
lang = res_it->WordRecognitionLanguage();
|
||||
}
|
||||
par_num++;
|
||||
line_num = 0;
|
||||
word_num = 0;
|
||||
@ -1405,7 +1427,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
|
||||
tsv_str += "\t" + std::to_string(line_num);
|
||||
tsv_str += "\t" + std::to_string(word_num);
|
||||
AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
|
||||
tsv_str += "\t-1\t\n"; // end of row for para
|
||||
tsv_str += "\t-1";
|
||||
if (lang_info) {
|
||||
tsv_str += "\t" + lang;
|
||||
}
|
||||
tsv_str += "\t\n"; // end of row for para
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
line_num++;
|
||||
@ -1416,7 +1442,11 @@ char *TessBaseAPI::GetTSVText(int page_number) {
|
||||
tsv_str += "\t" + std::to_string(line_num);
|
||||
tsv_str += "\t" + std::to_string(word_num);
|
||||
AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
|
||||
tsv_str += "\t-1\t\n"; // end of row for line
|
||||
tsv_str += "\t-1";
|
||||
if (lang_info) {
|
||||
tsv_str += "\t";
|
||||
}
|
||||
tsv_str += "\t\n"; // end of row for line
|
||||
}
|
||||
|
||||
// Now, process the word...
|
||||
@ -1433,10 +1463,18 @@ char *TessBaseAPI::GetTSVText(int page_number) {
|
||||
tsv_str += "\t" + std::to_string(right - left);
|
||||
tsv_str += "\t" + std::to_string(bottom - top);
|
||||
tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
|
||||
tsv_str += "\t";
|
||||
|
||||
if (lang_info) {
|
||||
const char *word_lang = res_it->WordRecognitionLanguage();
|
||||
tsv_str += "\t";
|
||||
if (word_lang) {
|
||||
tsv_str += word_lang;
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(NDEBUG)
|
||||
// Increment counts if at end of block/paragraph/textline.
|
||||
tsv_str += "\t";
|
||||
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
|
||||
lcnt++;
|
||||
}
|
||||
|
@ -156,19 +156,23 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI *api) {
|
||||
* TSV Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
TessTsvRenderer::TessTsvRenderer(const char *outputbase) : TessResultRenderer(outputbase, "tsv") {
|
||||
font_info_ = false;
|
||||
lang_info_ = false;
|
||||
}
|
||||
|
||||
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
|
||||
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool lang_info)
|
||||
: TessResultRenderer(outputbase, "tsv") {
|
||||
font_info_ = font_info;
|
||||
lang_info_ = lang_info;
|
||||
}
|
||||
|
||||
bool TessTsvRenderer::BeginDocumentHandler() {
|
||||
// Output TSV column headings
|
||||
AppendString(
|
||||
"level\tpage_num\tblock_num\tpar_num\tline_num\tword_"
|
||||
"num\tleft\ttop\twidth\theight\tconf\ttext\n");
|
||||
"num\tleft\ttop\twidth\theight\tconf\t");
|
||||
if (lang_info_) {
|
||||
AppendString("lang\t");
|
||||
}
|
||||
AppendString("text\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -177,7 +181,7 @@ bool TessTsvRenderer::EndDocumentHandler() {
|
||||
}
|
||||
|
||||
bool TessTsvRenderer::AddImageHandler(TessBaseAPI *api) {
|
||||
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum()));
|
||||
const std::unique_ptr<const char[]> tsv(api->GetTSVText(imagenum(), lang_info_));
|
||||
if (tsv == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
@ -402,6 +402,7 @@ Tesseract::Tesseract()
|
||||
this->params())
|
||||
, BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params())
|
||||
, BOOL_MEMBER(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model", this->params())
|
||||
, BOOL_MEMBER(tsv_lang_info, false, "Include language info in the .tsv output file", this->params())
|
||||
, BOOL_MEMBER(poly_allow_detailed_fx, false,
|
||||
"Allow feature extractors to see the original outline", this->params())
|
||||
, BOOL_INIT_MEMBER(tessedit_init_config_only, false,
|
||||
|
@ -926,6 +926,7 @@ public:
|
||||
BOOL_VAR_H(tessedit_flip_0O);
|
||||
double_VAR_H(tessedit_lower_flip_hyphen);
|
||||
double_VAR_H(tessedit_upper_flip_hyphen);
|
||||
BOOL_VAR_H(tsv_lang_info);
|
||||
BOOL_VAR_H(rej_trust_doc_dawg);
|
||||
BOOL_VAR_H(rej_1Il_use_dict_word);
|
||||
BOOL_VAR_H(rej_1Il_trust_permuter_type);
|
||||
|
@ -548,9 +548,9 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
|
||||
|
||||
api.GetBoolVariable("tessedit_create_tsv", &b);
|
||||
if (b) {
|
||||
bool font_info;
|
||||
api.GetBoolVariable("hocr_font_info", &font_info);
|
||||
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, font_info);
|
||||
bool lang_info;
|
||||
api.GetBoolVariable("tsv_lang_info", &lang_info);
|
||||
auto renderer = std::make_unique<tesseract::TessTsvRenderer>(outputbase, lang_info);
|
||||
if (renderer->happy()) {
|
||||
renderers.push_back(std::move(renderer));
|
||||
} else {
|
||||
|
@ -1 +1,2 @@
|
||||
tessedit_create_tsv 1
|
||||
tsv_lang_info 0
|
||||
|
Loading…
Reference in New Issue
Block a user