mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
Add PAGE XML renderer / export (#4214)
Add PAGE XML export and documentation. To generate PAGE XML output just add 'page' to the tesseract command. The output is outputname + '.page.xml' to avoid conflicts with ALTO export. The output can be customized with the flags: page_xml_polygon and page_xml_level. Co-authored-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
bae520ea00
commit
577e8a8b93
@ -740,6 +740,7 @@ set(TESSERACT_SRC
|
|||||||
src/api/capi.cpp
|
src/api/capi.cpp
|
||||||
src/api/renderer.cpp
|
src/api/renderer.cpp
|
||||||
src/api/altorenderer.cpp
|
src/api/altorenderer.cpp
|
||||||
|
src/api/pagerenderer.cpp
|
||||||
src/api/hocrrenderer.cpp
|
src/api/hocrrenderer.cpp
|
||||||
src/api/lstmboxrenderer.cpp
|
src/api/lstmboxrenderer.cpp
|
||||||
src/api/pdfrenderer.cpp
|
src/api/pdfrenderer.cpp
|
||||||
@ -764,6 +765,7 @@ set(TESSERACT_CONFIGS
|
|||||||
tessdata/configs/lstmbox
|
tessdata/configs/lstmbox
|
||||||
tessdata/configs/lstmdebug
|
tessdata/configs/lstmdebug
|
||||||
tessdata/configs/makebox
|
tessdata/configs/makebox
|
||||||
|
tessdata/configs/page
|
||||||
tessdata/configs/pdf
|
tessdata/configs/pdf
|
||||||
tessdata/configs/quiet
|
tessdata/configs/quiet
|
||||||
tessdata/configs/rebox
|
tessdata/configs/rebox
|
||||||
|
@ -113,6 +113,7 @@ libtesseract_la_LDFLAGS += -version-info $(GENERIC_LIBRARY_VERSION)
|
|||||||
|
|
||||||
libtesseract_la_SOURCES = src/api/baseapi.cpp
|
libtesseract_la_SOURCES = src/api/baseapi.cpp
|
||||||
libtesseract_la_SOURCES += src/api/altorenderer.cpp
|
libtesseract_la_SOURCES += src/api/altorenderer.cpp
|
||||||
|
libtesseract_la_SOURCES += src/api/pagerenderer.cpp
|
||||||
libtesseract_la_SOURCES += src/api/capi.cpp
|
libtesseract_la_SOURCES += src/api/capi.cpp
|
||||||
libtesseract_la_SOURCES += src/api/hocrrenderer.cpp
|
libtesseract_la_SOURCES += src/api/hocrrenderer.cpp
|
||||||
libtesseract_la_SOURCES += src/api/lstmboxrenderer.cpp
|
libtesseract_la_SOURCES += src/api/lstmboxrenderer.cpp
|
||||||
|
@ -36,7 +36,7 @@ Tesseract has **unicode (UTF-8) support**, and can **recognize [more than 100 la
|
|||||||
|
|
||||||
Tesseract supports **[various image formats](https://tesseract-ocr.github.io/tessdoc/InputFormats)** including PNG, JPEG and TIFF.
|
Tesseract supports **[various image formats](https://tesseract-ocr.github.io/tessdoc/InputFormats)** including PNG, JPEG and TIFF.
|
||||||
|
|
||||||
Tesseract supports **various output formats**: plain text, hOCR (HTML), PDF, invisible-text-only PDF, TSV and ALTO.
|
Tesseract supports **various output formats**: plain text, hOCR (HTML), PDF, invisible-text-only PDF, TSV, ALTO and PAGE.
|
||||||
|
|
||||||
You should note that in many cases, in order to get better OCR results, you'll need to **[improve the quality](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html) of the image** you are giving Tesseract.
|
You should note that in many cases, in order to get better OCR results, you'll need to **[improve the quality](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html) of the image** you are giving Tesseract.
|
||||||
|
|
||||||
|
@ -104,6 +104,10 @@ OPTIONS
|
|||||||
|
|
||||||
* *alto* -- Output in ALTO format ('OUTPUTBASE'`.xml`).
|
* *alto* -- Output in ALTO format ('OUTPUTBASE'`.xml`).
|
||||||
* *hocr* -- Output in hOCR format ('OUTPUTBASE'`.hocr`).
|
* *hocr* -- Output in hOCR format ('OUTPUTBASE'`.hocr`).
|
||||||
|
* *page* -- Output in PAGE format ('OUTPUTBASE'`.page.xml`).
|
||||||
|
The output can be customized with the flags:
|
||||||
|
page_xml_polygon -- Create polygons instead of bounding boxes (default: true)
|
||||||
|
page_xml_level -- Create the PAGE file on 0=line level or 1=word level (default: 0)
|
||||||
* *pdf* -- Output PDF ('OUTPUTBASE'`.pdf`).
|
* *pdf* -- Output PDF ('OUTPUTBASE'`.pdf`).
|
||||||
* *tsv* -- Output TSV ('OUTPUTBASE'`.tsv`).
|
* *tsv* -- Output TSV ('OUTPUTBASE'`.tsv`).
|
||||||
* *txt* -- Output plain text ('OUTPUTBASE'`.txt`).
|
* *txt* -- Output plain text ('OUTPUTBASE'`.txt`).
|
||||||
|
@ -550,6 +550,18 @@ public:
|
|||||||
*/
|
*/
|
||||||
char *GetAltoText(int page_number);
|
char *GetAltoText(int page_number);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Make an XML-formatted string with PAGE markup from the internal
|
||||||
|
* data structures.
|
||||||
|
*/
|
||||||
|
char *GetPAGEText(ETEXT_DESC *monitor, int page_number);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Make an XML-formatted string with PAGE markup from the internal
|
||||||
|
* data structures.
|
||||||
|
*/
|
||||||
|
char *GetPAGEText(int page_number);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Make a TSV-formatted string from the internal data structures.
|
* Make a TSV-formatted string from the internal data structures.
|
||||||
* page_number is 0-based but will appear in the output as 1-based.
|
* page_number is 0-based but will appear in the output as 1-based.
|
||||||
|
@ -198,6 +198,23 @@ private:
|
|||||||
bool begin_document;
|
bool begin_document;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Renders Tesseract output into a PAGE XML text string
|
||||||
|
*/
|
||||||
|
class TESS_API TessPAGERenderer : public TessResultRenderer {
|
||||||
|
public:
|
||||||
|
explicit TessPAGERenderer(const char *outputbase);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
bool BeginDocumentHandler() override;
|
||||||
|
bool AddImageHandler(TessBaseAPI *api) override;
|
||||||
|
bool EndDocumentHandler() override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool begin_document;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Renders Tesseract output into a TSV string
|
* Renders Tesseract output into a TSV string
|
||||||
*/
|
*/
|
||||||
|
@ -68,6 +68,10 @@ TessResultRenderer *TessAltoRendererCreate(const char *outputbase) {
|
|||||||
return new tesseract::TessAltoRenderer(outputbase);
|
return new tesseract::TessAltoRenderer(outputbase);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TessResultRenderer *TessPAGERendererCreate(const char *outputbase) {
|
||||||
|
return new tesseract::TessPAGERenderer(outputbase);
|
||||||
|
}
|
||||||
|
|
||||||
TessResultRenderer *TessTsvRendererCreate(const char *outputbase) {
|
TessResultRenderer *TessTsvRendererCreate(const char *outputbase) {
|
||||||
return new tesseract::TessTsvRenderer(outputbase);
|
return new tesseract::TessTsvRenderer(outputbase);
|
||||||
}
|
}
|
||||||
@ -420,6 +424,10 @@ char *TessBaseAPIGetAltoText(TessBaseAPI *handle, int page_number) {
|
|||||||
return handle->GetAltoText(page_number);
|
return handle->GetAltoText(page_number);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char *TessBaseAPIGetPAGEText(TessBaseAPI *handle, int page_number) {
|
||||||
|
return handle->GetPAGEText(page_number);
|
||||||
|
}
|
||||||
|
|
||||||
char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {
|
char *TessBaseAPIGetTsvText(TessBaseAPI *handle, int page_number) {
|
||||||
return handle->GetTSVText(page_number);
|
return handle->GetTSVText(page_number);
|
||||||
}
|
}
|
||||||
|
1154
src/api/pagerenderer.cpp
Normal file
1154
src/api/pagerenderer.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -340,6 +340,9 @@ Tesseract::Tesseract()
|
|||||||
, BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
|
, BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file", this->params())
|
||||||
, BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
|
, BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params())
|
||||||
, BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
|
, BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params())
|
||||||
|
, BOOL_MEMBER(tessedit_create_page_xml, false, "Write .page.xml PAGE file", this->params())
|
||||||
|
, BOOL_MEMBER(page_xml_polygon, true, "Create the PAGE file with polygons instead of box values", this->params())
|
||||||
|
, INT_MEMBER(page_xml_level, 0, "Create the PAGE file on 0=line or 1=word level.", this->params())
|
||||||
, BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
|
, BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
|
||||||
this->params())
|
this->params())
|
||||||
, BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
|
, BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params())
|
||||||
|
@ -897,6 +897,9 @@ public:
|
|||||||
BOOL_VAR_H(tessedit_create_txt);
|
BOOL_VAR_H(tessedit_create_txt);
|
||||||
BOOL_VAR_H(tessedit_create_hocr);
|
BOOL_VAR_H(tessedit_create_hocr);
|
||||||
BOOL_VAR_H(tessedit_create_alto);
|
BOOL_VAR_H(tessedit_create_alto);
|
||||||
|
BOOL_VAR_H(tessedit_create_page_xml);
|
||||||
|
BOOL_VAR_H(page_xml_polygon);
|
||||||
|
INT_VAR_H(page_xml_level);
|
||||||
BOOL_VAR_H(tessedit_create_lstmbox);
|
BOOL_VAR_H(tessedit_create_lstmbox);
|
||||||
BOOL_VAR_H(tessedit_create_tsv);
|
BOOL_VAR_H(tessedit_create_tsv);
|
||||||
BOOL_VAR_H(tessedit_create_wordstrbox);
|
BOOL_VAR_H(tessedit_create_wordstrbox);
|
||||||
|
@ -500,6 +500,17 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
api.GetBoolVariable("tessedit_create_page_xml", &b);
|
||||||
|
if (b) {
|
||||||
|
auto renderer = std::make_unique<tesseract::TessPAGERenderer>(outputbase);
|
||||||
|
if (renderer->happy()) {
|
||||||
|
renderers.push_back(std::move(renderer));
|
||||||
|
} else {
|
||||||
|
tprintf("Error, could not create PAGE output file: %s\n", strerror(errno));
|
||||||
|
error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
api.GetBoolVariable("tessedit_create_tsv", &b);
|
api.GetBoolVariable("tessedit_create_tsv", &b);
|
||||||
if (b) {
|
if (b) {
|
||||||
bool font_info;
|
bool font_info;
|
||||||
|
@ -3,6 +3,6 @@ data_DATA = inter makebox box.train unlv ambigs.train lstm.train lstmdebug
|
|||||||
data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images
|
data_DATA += api_config kannada box.train.stderr quiet logfile digits get.images
|
||||||
data_DATA += lstmbox wordstrbox
|
data_DATA += lstmbox wordstrbox
|
||||||
# Configurations for OCR output.
|
# Configurations for OCR output.
|
||||||
data_DATA += alto hocr pdf tsv txt
|
data_DATA += alto hocr page pdf tsv txt
|
||||||
data_DATA += linebox rebox strokewidth bigram
|
data_DATA += linebox rebox strokewidth bigram
|
||||||
EXTRA_DIST = $(data_DATA)
|
EXTRA_DIST = $(data_DATA)
|
||||||
|
3
tessdata/configs/page
Normal file
3
tessdata/configs/page
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
tessedit_create_page_xml 1
|
||||||
|
# page_xml_polygon 1
|
||||||
|
# page_xml_level 0
|
Loading…
Reference in New Issue
Block a user