Add PAGE XML renderer / export (#4214)

Add PAGE XML export and documentation.
To generate PAGE XML output just add 'page' to the tesseract command.

The output is outputname + '.page.xml' to avoid conflicts with ALTO export.

The output can be customized with the flags:
page_xml_polygon and page_xml_level.

Co-authored-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Jan Kamlah 2024-04-19 21:09:47 +02:00 committed by Stefan Weil
parent bae520ea00
commit 577e8a8b93
13 changed files with 1220 additions and 2 deletions

View File

@ -740,6 +740,7 @@ set(TESSERACT_SRC
src/api/capi.cpp
src/api/renderer.cpp
src/api/altorenderer.cpp
src/api/pagerenderer.cpp
src/api/hocrrenderer.cpp
src/api/lstmboxrenderer.cpp
src/api/pdfrenderer.cpp
@ -764,6 +765,7 @@ set(TESSERACT_CONFIGS
tessdata/configs/lstmbox
tessdata/configs/lstmdebug
tessdata/configs/makebox
tessdata/configs/page
tessdata/configs/pdf
tessdata/configs/quiet
tessdata/configs/rebox

View File

@ -113,6 +113,7 @@ libtesseract_la_LDFLAGS += -version-info $(GENERIC_LIBRARY_VERSION)
libtesseract_la_SOURCES = src/api/baseapi.cpp
libtesseract_la_SOURCES += src/api/altorenderer.cpp
libtesseract_la_SOURCES += src/api/pagerenderer.cpp
libtesseract_la_SOURCES += src/api/capi.cpp
libtesseract_la_SOURCES += src/api/hocrrenderer.cpp
libtesseract_la_SOURCES += src/api/lstmboxrenderer.cpp

View File

@ -36,7 +36,7 @@ Tesseract has **unicode (UTF-8) support**, and can **recognize [more than 100 la
Tesseract supports **[various image formats](https://tesseract-ocr.github.io/tessdoc/InputFormats)** including PNG, JPEG and TIFF.
Tesseract supports **various output formats**: plain text, hOCR (HTML), PDF, invisible-text-only PDF, TSV and ALTO.
Tesseract supports **various output formats**: plain text, hOCR (HTML), PDF, invisible-text-only PDF, TSV, ALTO and PAGE.
You should note that in many cases, in order to get better OCR results, you'll need to **[improve the quality](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html) of the image** you are giving Tesseract.

View File

@ -104,6 +104,10 @@ OPTIONS
* *alto* -- Output in ALTO format ('OUTPUTBASE'`.xml`).
* *hocr* -- Output in hOCR format ('OUTPUTBASE'`.hocr`).
* *page* -- Output in PAGE format ('OUTPUTBASE'`.page.xml`).
The output can be customized with the flags:
page_xml_polygon -- Create polygons instead of bounding boxes (default: true)
page_xml_level -- Create the PAGE file at line level (0) or word level (1) (default: 0)
* *pdf* -- Output PDF ('OUTPUTBASE'`.pdf`).
* *tsv* -- Output TSV ('OUTPUTBASE'`.tsv`).
* *txt* -- Output plain text ('OUTPUTBASE'`.txt`).

View File

@ -550,6 +550,18 @@ public:
*/
char *GetAltoText(int page_number);
/**
* Make an XML-formatted string with PAGE markup from the internal
* data structures.
*
* Overload taking an ETEXT_DESC monitor in addition to the page number.
* NOTE(review): presumably the monitor is used for progress reporting /
* cancellation and page_number is 0-based as for the TSV output --
* confirm against the implementation.
*/
char *GetPAGEText(ETEXT_DESC *monitor, int page_number);
/**
* Make an XML-formatted string with PAGE markup from the internal
* data structures.
*
* Overload without a monitor; this is the one the C API wrapper
* TessBaseAPIGetPAGEText calls.
* NOTE(review): ownership of the returned buffer is not visible here --
* confirm how callers are expected to release it.
*/
char *GetPAGEText(int page_number);
/**
* Make a TSV-formatted string from the internal data structures.
* page_number is 0-based but will appear in the output as 1-based.

View File

@ -198,6 +198,23 @@ private:
bool begin_document;
};
/**
* Renders Tesseract output into a PAGE XML text string
*
* Derives from TessResultRenderer and overrides its begin-document,
* add-image and end-document handlers.
*/
class TESS_API TessPAGERenderer : public TessResultRenderer {
public:
// outputbase: base name of the output; per the tesseract manual the PAGE
// output goes to 'OUTPUTBASE'.page.xml.
explicit TessPAGERenderer(const char *outputbase);
protected:
bool BeginDocumentHandler() override;
bool AddImageHandler(TessBaseAPI *api) override;
bool EndDocumentHandler() override;
private:
// NOTE(review): presumably tracks whether the document header still has to
// be emitted -- confirm against the handler implementations.
bool begin_document;
};
/**
* Renders Tesseract output into a TSV string
*/

View File

@ -68,6 +68,10 @@ TessResultRenderer *TessAltoRendererCreate(const char *outputbase) {
return new tesseract::TessAltoRenderer(outputbase);
}
// C API factory: allocates a PAGE XML renderer for the given output base name
// and hands it back through the generic TessResultRenderer pointer.
TessResultRenderer *TessPAGERendererCreate(const char *outputbase) {
  auto *page_renderer = new tesseract::TessPAGERenderer(outputbase);
  return page_renderer;
}
// C API factory: allocates a TSV renderer for the given output base name
// and hands it back through the generic TessResultRenderer pointer.
TessResultRenderer *TessTsvRendererCreate(const char *outputbase) {
  auto *tsv_renderer = new tesseract::TessTsvRenderer(outputbase);
  return tsv_renderer;
}
@ -420,6 +424,10 @@ char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) {
return handle->GetAltoText(page_number);
}
// C API wrapper: forwards to TessBaseAPI::GetPAGEText(page_number) and returns
// the resulting PAGE XML string unchanged.
char *TessBaseAPIGetPAGEText(TessBaseAPI *handle, int page_number) {
  char *page_xml = handle->GetPAGEText(page_number);
  return page_xml;
}
// C API wrapper: forwards to TessBaseAPI::GetTSVText(page_number) and returns
// the resulting TSV string unchanged.
char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {
  char *tsv_text = handle->GetTSVText(page_number);
  return tsv_text;
}

1154
src/api/pagerenderer.cpp Normal file

File diff suppressed because it is too large Load Diff

View File

@ -340,6 +340,9 @@ Tesseract::Tesseract()
, BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
, BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
, BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
, BOOL_MEMBER(tessedit_create_page_xml, false, "Write .page.xml PAGE file", this->params())
, BOOL_MEMBER(page_xml_polygon, true, "Create the PAGE file with polygons instead of box values", this->params())
, INT_MEMBER(page_xml_level, 0, "Create the PAGE file on 0=line or 1=word level.", this->params())
, BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
this->params())
, BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())

View File

@ -897,6 +897,9 @@ public:
BOOL_VAR_H(tessedit_create_txt);
BOOL_VAR_H(tessedit_create_hocr);
BOOL_VAR_H(tessedit_create_alto);
BOOL_VAR_H(tessedit_create_page_xml);
BOOL_VAR_H(page_xml_polygon);
INT_VAR_H(page_xml_level);
BOOL_VAR_H(tessedit_create_lstmbox);
BOOL_VAR_H(tessedit_create_tsv);
BOOL_VAR_H(tessedit_create_wordstrbox);

View File

@ -500,6 +500,17 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
}
}
api.GetBoolVariable("tessedit_create_page_xml", &b);
if (b) {
auto renderer = std::make_unique<tesseract::TessPAGERenderer>(outputbase);
if (renderer->happy()) {
renderers.push_back(std::move(renderer));
} else {
tprintf("Error, could not create PAGE output file: %s\n", strerror(errno));
error = true;
}
}
api.GetBoolVariable("tessedit_create_tsv", &b);
if (b) {
bool font_info;

View File

@ -3,6 +3,6 @@ data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug
data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images
data_DATA += lstmbox wordstrbox
# Configurations for OCR output.
data_DATA += alto hocr pdf tsv txt
data_DATA += alto hocr page pdf tsv txt
data_DATA += linebox rebox strokewidth bigram
EXTRA_DIST = $(data_DATA)

3
tessdata/configs/page Normal file
View File

@ -0,0 +1,3 @@
tessedit_create_page_xml 1
# page_xml_polygon 1
# page_xml_level 0