mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Add PAGE XML renderer / export (#4214)
Add PAGE XML export and documentation. To generate PAGE XML output, just add 'page' to the tesseract command. The output is written to outputname + '.page.xml' to avoid conflicts with the ALTO export. The output can be customized with the flags page_xml_polygon and page_xml_level. Co-authored-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
bae520ea00
commit
577e8a8b93
@ -740,6 +740,7 @@ set(TESSERACT_SRC
|
||||
src/api/capi.cpp
|
||||
src/api/renderer.cpp
|
||||
src/api/altorenderer.cpp
|
||||
src/api/pagerenderer.cpp
|
||||
src/api/hocrrenderer.cpp
|
||||
src/api/lstmboxrenderer.cpp
|
||||
src/api/pdfrenderer.cpp
|
||||
@ -764,6 +765,7 @@ set(TESSERACT_CONFIGS
|
||||
tessdata/configs/lstmbox
|
||||
tessdata/configs/lstmdebug
|
||||
tessdata/configs/makebox
|
||||
tessdata/configs/page
|
||||
tessdata/configs/pdf
|
||||
tessdata/configs/quiet
|
||||
tessdata/configs/rebox
|
||||
|
@ -113,6 +113,7 @@ libtesseract_la_LDFLAGS += -version-info $(GENERIC_LIBRARY_VERSION)
|
||||
|
||||
libtesseract_la_SOURCES = src/api/baseapi.cpp
|
||||
libtesseract_la_SOURCES += src/api/altorenderer.cpp
|
||||
libtesseract_la_SOURCES += src/api/pagerenderer.cpp
|
||||
libtesseract_la_SOURCES += src/api/capi.cpp
|
||||
libtesseract_la_SOURCES += src/api/hocrrenderer.cpp
|
||||
libtesseract_la_SOURCES += src/api/lstmboxrenderer.cpp
|
||||
|
@ -36,7 +36,7 @@ Tesseract has **unicode (UTF-8) support**, and can **recognize [more than 100 la
|
||||
|
||||
Tesseract supports **[various image formats](https://tesseract-ocr.github.io/tessdoc/InputFormats)** including PNG, JPEG and TIFF.
|
||||
|
||||
Tesseract supports **various output formats**: plain text, hOCR (HTML), PDF, invisible-text-only PDF, TSV and ALTO.
|
||||
Tesseract supports **various output formats**: plain text, hOCR (HTML), PDF, invisible-text-only PDF, TSV, ALTO and PAGE.
|
||||
|
||||
You should note that in many cases, in order to get better OCR results, you'll need to **[improve the quality](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html) of the image** you are giving Tesseract.
|
||||
|
||||
|
@ -104,6 +104,10 @@ OPTIONS
|
||||
|
||||
* *alto* -- Output in ALTO format ('OUTPUTBASE'`.xml`).
|
||||
* *hocr* -- Output in hOCR format ('OUTPUTBASE'`.hocr`).
|
||||
* *page* -- Output in PAGE format ('OUTPUTBASE'`.page.xml`).
|
||||
The output can be customized with the flags:
|
||||
page_xml_polygon -- Create polygons instead of bounding boxes (default: true)
|
||||
page_xml_level -- Create the PAGE file at 0=line level or 1=word level (default: 0)
|
||||
* *pdf* -- Output PDF ('OUTPUTBASE'`.pdf`).
|
||||
* *tsv* -- Output TSV ('OUTPUTBASE'`.tsv`).
|
||||
* *txt* -- Output plain text ('OUTPUTBASE'`.txt`).
|
||||
|
@ -550,6 +550,18 @@ public:
|
||||
*/
|
||||
char *GetAltoText(int page_number);
|
||||
|
||||
/**
|
||||
* Make an XML-formatted string with PAGE markup from the internal
|
||||
* data structures.
* This overload additionally takes an ETEXT_DESC pointer (monitor).
* NOTE(review): monitor is presumably used for progress reporting or
* cancellation; its use is not visible in this header -- confirm.
* NOTE(review): page_number is presumably 0-based like the TSV variant
* declared below -- confirm.
* NOTE(review): ownership of the returned char* is not stated here;
* confirm how callers are expected to release it.
|
||||
*/
|
||||
char *GetPAGEText(ETEXT_DESC *monitor, int page_number);
|
||||
|
||||
/**
|
||||
* Make an XML-formatted string with PAGE markup from the internal
|
||||
* data structures.
* Overload without a monitor argument; this is the overload that the
* C API wrapper TessBaseAPIGetPAGEText calls.
* NOTE(review): page_number is presumably 0-based like the TSV variant
* declared below -- confirm.
* NOTE(review): ownership of the returned char* is not stated here;
* confirm how callers are expected to release it.
|
||||
*/
|
||||
char *GetPAGEText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
|
@ -198,6 +198,23 @@ private:
|
||||
bool begin_document;
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders Tesseract output into a PAGE XML text string
* Derives from TessResultRenderer and overrides its document/image
* handler hooks. According to the user documentation the result is
* written to OUTPUTBASE.page.xml.
|
||||
*/
|
||||
class TESS_API TessPAGERenderer : public TessResultRenderer {
|
||||
public:
|
||||
// Constructs the renderer for the given output base name.
// NOTE(review): the file extension is presumably appended by the
// implementation in pagerenderer.cpp -- confirm.
explicit TessPAGERenderer(const char *outputbase);
|
||||
|
||||
protected:
|
||||
// Overrides of the TessResultRenderer hooks; each returns a bool.
// NOTE(review): presumably true means success -- confirm against the
// base class contract.
bool BeginDocumentHandler() override;
|
||||
bool AddImageHandler(TessBaseAPI *api) override;
|
||||
bool EndDocumentHandler() override;
|
||||
|
||||
private:
|
||||
// NOTE(review): presumably tracks whether the document header still has
// to be emitted; its use is not visible in this header -- confirm.
bool begin_document;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Renders Tesseract output into a TSV string
|
||||
*/
|
||||
|
@ -68,6 +68,10 @@ TessResultRenderer *TessAltoRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessAltoRenderer(outputbase);
|
||||
}
|
||||
|
||||
/**
 * C API factory for the PAGE XML renderer: allocates a
 * tesseract::TessPAGERenderer for outputbase with `new` and returns it
 * as a TessResultRenderer pointer.
 * NOTE(review): the caller presumably has to release the renderer via the
 * C API's renderer delete function -- confirm.
 */
TessResultRenderer *TessPAGERendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessPAGERenderer(outputbase);
|
||||
}
|
||||
|
||||
/**
 * C API factory for the TSV renderer: allocates a
 * tesseract::TessTsvRenderer for outputbase with `new` and returns it
 * as a TessResultRenderer pointer.
 * NOTE(review): the caller presumably has to release the renderer via the
 * C API's renderer delete function -- confirm.
 */
TessResultRenderer *TessTsvRendererCreate(const char *outputbase) {
|
||||
return new tesseract::TessTsvRenderer(outputbase);
|
||||
}
|
||||
@ -420,6 +424,10 @@ char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) {
|
||||
return handle->GetAltoText(page_number);
|
||||
}
|
||||
|
||||
/**
 * C API wrapper: forwards to handle->GetPAGEText(page_number) and returns
 * its result unchanged. handle is dereferenced without a null check.
 * NOTE(review): the returned buffer presumably has to be freed by the
 * caller through the C API's text delete function -- confirm.
 */
char *TessBaseAPIGetPAGEText(TessBaseAPI *handle, int page_number) {
|
||||
return handle->GetPAGEText(page_number);
|
||||
}
|
||||
|
||||
/**
 * C API wrapper: forwards to handle->GetTSVText(page_number) and returns
 * its result unchanged. handle is dereferenced without a null check.
 * NOTE(review): the returned buffer presumably has to be freed by the
 * caller through the C API's text delete function -- confirm.
 */
char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {
|
||||
return handle->GetTSVText(page_number);
|
||||
}
|
||||
|
1154
src/api/pagerenderer.cpp
Normal file
1154
src/api/pagerenderer.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -340,6 +340,9 @@ Tesseract::Tesseract()
|
||||
, BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_page_xml, false, "Write .page.xml PAGE file", this->params())
|
||||
, BOOL_MEMBER(page_xml_polygon, true, "Create the PAGE file with polygons instead of box values", this->params())
|
||||
, INT_MEMBER(page_xml_level, 0, "Create the PAGE file on 0=line or 1=word level.", this->params())
|
||||
, BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
|
||||
this->params())
|
||||
, BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
|
||||
|
@ -897,6 +897,9 @@ public:
|
||||
BOOL_VAR_H(tessedit_create_txt);
|
||||
BOOL_VAR_H(tessedit_create_hocr);
|
||||
BOOL_VAR_H(tessedit_create_alto);
|
||||
BOOL_VAR_H(tessedit_create_page_xml);
|
||||
BOOL_VAR_H(page_xml_polygon);
|
||||
INT_VAR_H(page_xml_level);
|
||||
BOOL_VAR_H(tessedit_create_lstmbox);
|
||||
BOOL_VAR_H(tessedit_create_tsv);
|
||||
BOOL_VAR_H(tessedit_create_wordstrbox);
|
||||
|
@ -500,6 +500,17 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
|
||||
}
|
||||
}
|
||||
|
||||
api.GetBoolVariable("tessedit_create_page_xml", &b);
|
||||
if (b) {
|
||||
auto renderer = std::make_unique<tesseract::TessPAGERenderer>(outputbase);
|
||||
if (renderer->happy()) {
|
||||
renderers.push_back(std::move(renderer));
|
||||
} else {
|
||||
tprintf("Error, could not create PAGE output file: %s\n", strerror(errno));
|
||||
error = true;
|
||||
}
|
||||
}
|
||||
|
||||
api.GetBoolVariable("tessedit_create_tsv", &b);
|
||||
if (b) {
|
||||
bool font_info;
|
||||
|
@ -3,6 +3,6 @@ data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug
|
||||
data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images
|
||||
data_DATA += lstmbox wordstrbox
|
||||
# Configurations for OCR output.
|
||||
data_DATA += alto hocr pdf tsv txt
|
||||
data_DATA += alto hocr page pdf tsv txt
|
||||
data_DATA += linebox rebox strokewidth bigram
|
||||
EXTRA_DIST = $(data_DATA)
|
||||
|
3
tessdata/configs/page
Normal file
3
tessdata/configs/page
Normal file
@ -0,0 +1,3 @@
|
||||
tessedit_create_page_xml 1
|
||||
# page_xml_polygon 1
|
||||
# page_xml_level 0
|
Loading…
Reference in New Issue
Block a user