mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 12:49:35 +08:00
Cleanup TSV renderer
Remove all references to hocr, hocr.tsv, etc. Remove dead code for font info, input filename, and HTML escapes. Improve comments. Fix indentation.
This commit is contained in:
parent
858f4b75ce
commit
6700edd8bc
159
api/baseapi.cpp
159
api/baseapi.cpp
@ -1417,7 +1417,7 @@ static void AddBoxTohOCR(const ResultIterator *it,
|
||||
*hocr_str += "\">";
|
||||
}
|
||||
|
||||
static void AddBoxTohOCRTSV(const PageIterator *it,
|
||||
static void AddBoxToTSV(const PageIterator *it,
|
||||
PageIteratorLevel level,
|
||||
STRING* hocr_str) {
|
||||
int left, top, right, bottom;
|
||||
@ -1615,57 +1615,31 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a TSV-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Image name/input_file_ can be set by SetInputName before calling
|
||||
* GetHOCRText
|
||||
* STL removed from original patch submission and refactored by rays.
|
||||
*/
|
||||
char* TessBaseAPI::GetHOCRTSVText(int page_number) {
|
||||
char* TessBaseAPI::GetTSVText(int page_number) {
|
||||
if (tesseract_ == NULL ||
|
||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||
return NULL;
|
||||
|
||||
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
|
||||
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
||||
bool font_info = false;
|
||||
GetBoolVariable("hocr_font_info", &font_info);
|
||||
int page_id = page_number + 1; // we use 1-based page numbers.
|
||||
|
||||
STRING hocr_str("");
|
||||
|
||||
if (input_file_ == NULL)
|
||||
SetInputName(NULL);
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
|
||||
NULL, NULL);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
|
||||
uni16_str, str16_len);
|
||||
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
|
||||
NULL, NULL, NULL);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
|
||||
utf8_len, NULL, NULL);
|
||||
*input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
STRING tsv_str("");
|
||||
|
||||
int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;
|
||||
|
||||
hocr_str.add_str_int("1\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
hocr_str.add_str_int("\t", rect_left_);
|
||||
hocr_str.add_str_int("\t", rect_top_);
|
||||
hocr_str.add_str_int("\t", rect_width_);
|
||||
hocr_str.add_str_int("\t", rect_height_);
|
||||
hocr_str += "\t-1\t\n";
|
||||
tsv_str.add_str_int("1\t", page_num); // level 1 - page
|
||||
tsv_str.add_str_int("\t", block_num);
|
||||
tsv_str.add_str_int("\t", par_num);
|
||||
tsv_str.add_str_int("\t", line_num);
|
||||
tsv_str.add_str_int("\t", word_num);
|
||||
tsv_str.add_str_int("\t", rect_left_);
|
||||
tsv_str.add_str_int("\t", rect_top_);
|
||||
tsv_str.add_str_int("\t", rect_width_);
|
||||
tsv_str.add_str_int("\t", rect_height_);
|
||||
tsv_str += "\t-1\t\n";
|
||||
|
||||
ResultIterator *res_it = GetIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
@ -1674,36 +1648,36 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Open any new block/paragraph/textline.
|
||||
// Add rows for any new block/paragraph/textline.
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
block_num++, par_num = 0, line_num = 0, word_num = 0;
|
||||
hocr_str.add_str_int("2\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
|
||||
hocr_str += "\t-1\t\n";
|
||||
tsv_str.add_str_int("2\t", page_num); // level 2 - block
|
||||
tsv_str.add_str_int("\t", block_num);
|
||||
tsv_str.add_str_int("\t", par_num);
|
||||
tsv_str.add_str_int("\t", line_num);
|
||||
tsv_str.add_str_int("\t", word_num);
|
||||
AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
|
||||
tsv_str += "\t-1\t\n"; // end of row for block
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
par_num++, line_num = 0, word_num = 0;
|
||||
hocr_str.add_str_int("3\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
|
||||
hocr_str += "\t-1\t\n";
|
||||
tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
|
||||
tsv_str.add_str_int("\t", block_num);
|
||||
tsv_str.add_str_int("\t", par_num);
|
||||
tsv_str.add_str_int("\t", line_num);
|
||||
tsv_str.add_str_int("\t", word_num);
|
||||
AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
|
||||
tsv_str += "\t-1\t\n"; // end of row for para
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
line_num++, word_num = 0;
|
||||
hocr_str.add_str_int("4\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
|
||||
hocr_str += "\t-1\t\n";
|
||||
tsv_str.add_str_int("4\t", page_num); // level 4 - line
|
||||
tsv_str.add_str_int("\t", block_num);
|
||||
tsv_str.add_str_int("\t", par_num);
|
||||
tsv_str.add_str_int("\t", line_num);
|
||||
tsv_str.add_str_int("\t", word_num);
|
||||
AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
|
||||
tsv_str += "\t-1\t\n"; // end of row for line
|
||||
}
|
||||
|
||||
// Now, process the word...
|
||||
@ -1715,49 +1689,34 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
|
||||
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
|
||||
&monospace, &serif, &smallcaps,
|
||||
&pointsize, &font_id);
|
||||
word_num++;
|
||||
hocr_str.add_str_int("5\t", page_num);
|
||||
hocr_str.add_str_int("\t", block_num);
|
||||
hocr_str.add_str_int("\t", par_num);
|
||||
hocr_str.add_str_int("\t", line_num);
|
||||
hocr_str.add_str_int("\t", word_num);
|
||||
hocr_str.add_str_int("\t", left);
|
||||
hocr_str.add_str_int("\t", top);
|
||||
hocr_str.add_str_int("\t", right - left + 1);
|
||||
hocr_str.add_str_int("\t", bottom - top + 1);
|
||||
hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
hocr_str += "\t";
|
||||
word_num++;
|
||||
tsv_str.add_str_int("5\t", page_num); // level 5 - word
|
||||
tsv_str.add_str_int("\t", block_num);
|
||||
tsv_str.add_str_int("\t", par_num);
|
||||
tsv_str.add_str_int("\t", line_num);
|
||||
tsv_str.add_str_int("\t", word_num);
|
||||
tsv_str.add_str_int("\t", left);
|
||||
tsv_str.add_str_int("\t", top);
|
||||
tsv_str.add_str_int("\t", right - left + 1);
|
||||
tsv_str.add_str_int("\t", bottom - top + 1);
|
||||
tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
|
||||
tsv_str += "\t";
|
||||
|
||||
// Increment counts if at end of block/paragraph/textline.
|
||||
if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
|
||||
if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
|
||||
if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
|
||||
|
||||
do {
|
||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||
// if (grapheme && grapheme[0] != 0) {
|
||||
// if (grapheme[1] == 0) {
|
||||
// hocr_str += HOcrEscape(grapheme);
|
||||
// } else {
|
||||
hocr_str += grapheme;
|
||||
// }
|
||||
// }
|
||||
delete []grapheme;
|
||||
tsv_str += res_it->GetUTF8Text(RIL_SYMBOL);
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
hocr_str += "\n";
|
||||
tsv_str += "\n"; // end of row
|
||||
wcnt++;
|
||||
// Close any ending block/paragraph/textline.
|
||||
if (last_word_in_line) {
|
||||
lcnt++;
|
||||
}
|
||||
if (last_word_in_para) {
|
||||
pcnt++;
|
||||
}
|
||||
if (last_word_in_block) {
|
||||
bcnt++;
|
||||
}
|
||||
}
|
||||
|
||||
char *ret = new char[hocr_str.length() + 1];
|
||||
strcpy(ret, hocr_str.string());
|
||||
char *ret = new char[tsv_str.length() + 1];
|
||||
strcpy(ret, tsv_str.string());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
@ -603,12 +603,10 @@ class TESS_API TessBaseAPI {
|
||||
char* GetHOCRText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a TSV-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
*/
|
||||
char* GetHOCRTSVText(int page_number);
|
||||
|
||||
char* GetTSVText(int page_number);
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded in the same
|
||||
|
@ -182,31 +182,32 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
|
||||
/**********************************************************************
|
||||
* TSV Text Renderer interface implementation
|
||||
**********************************************************************/
|
||||
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase)
|
||||
: TessResultRenderer(outputbase, "hocr.tsv") {
|
||||
TessTsvRenderer::TessTsvRenderer(const char *outputbase)
|
||||
: TessResultRenderer(outputbase, "tsv") {
|
||||
font_info_ = false;
|
||||
}
|
||||
|
||||
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
|
||||
: TessResultRenderer(outputbase, "hocr.tsv") {
|
||||
TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info)
|
||||
: TessResultRenderer(outputbase, "tsv") {
|
||||
font_info_ = font_info;
|
||||
}
|
||||
|
||||
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
|
||||
bool TessTsvRenderer::BeginDocumentHandler() {
|
||||
// Output TSV column headings
|
||||
AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TessHOcrTsvRenderer::EndDocumentHandler() {
|
||||
bool TessTsvRenderer::EndDocumentHandler() {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
|
||||
char* hocrtsv = api->GetHOCRTSVText(imagenum());
|
||||
if (hocrtsv == NULL) return false;
|
||||
bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) {
|
||||
char* tsv = api->GetTSVText(imagenum());
|
||||
if (tsv == NULL) return false;
|
||||
|
||||
AppendString(hocrtsv);
|
||||
delete[] hocrtsv;
|
||||
AppendString(tsv);
|
||||
delete[] tsv;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -163,12 +163,12 @@ private:
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into an hocr tsv string
|
||||
* Renders Tesseract output into a TSV string
|
||||
*/
|
||||
class TESS_API TessHOcrTsvRenderer : public TessResultRenderer {
|
||||
class TESS_API TessTsvRenderer : public TessResultRenderer {
|
||||
public:
|
||||
explicit TessHOcrTsvRenderer(const char *outputbase, bool font_info);
|
||||
explicit TessHOcrTsvRenderer(const char *outputbase);
|
||||
explicit TessTsvRenderer(const char *outputbase, bool font_info);
|
||||
explicit TessTsvRenderer(const char *outputbase);
|
||||
|
||||
protected:
|
||||
virtual bool BeginDocumentHandler();
|
||||
|
@ -299,12 +299,12 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
|
||||
new tesseract::TessHOcrRenderer(outputbase, font_info));
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_hocrtsv", &b);
|
||||
api->GetBoolVariable("tessedit_create_tsv", &b);
|
||||
if (b) {
|
||||
bool font_info;
|
||||
api->GetBoolVariable("hocr_font_info", &font_info);
|
||||
renderers->push_back(
|
||||
new tesseract::TessHOcrTsvRenderer(outputbase, font_info));
|
||||
new tesseract::TessTsvRenderer(outputbase, font_info));
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_pdf", &b);
|
||||
|
@ -385,7 +385,7 @@ Tesseract::Tesseract()
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_hocrtsv, false, "Write .hocr.tsv TSV output file",
|
||||
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
|
||||
this->params()),
|
||||
|
@ -1003,7 +1003,7 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
|
||||
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
|
||||
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
|
||||
BOOL_VAR_H(tessedit_create_hocrtsv, false, "Write .hocr.tsv hOCR-tsv output file");
|
||||
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
|
||||
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
||||
STRING_VAR_H(unrecognised_char, "|",
|
||||
"Output char for unidentified blobs");
|
||||
|
@ -1,2 +0,0 @@
|
||||
tessedit_create_hocrtsv 1
|
||||
tessedit_pageseg_mode 1
|
2
tessdata/configs/tsv
Normal file
2
tessdata/configs/tsv
Normal file
@ -0,0 +1,2 @@
|
||||
tessedit_create_tsv 1
|
||||
tessedit_pageseg_mode 1
|
Loading…
Reference in New Issue
Block a user