mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
Add support for ALTO output
This commit is contained in:
parent
685b136d89
commit
d7cee03a94
@ -215,6 +215,7 @@ set(tesseract_src ${tesseract_src}
|
||||
src/api/capi.cpp
|
||||
src/api/renderer.cpp
|
||||
src/api/pdfrenderer.cpp
|
||||
src/api/altorenderer.cpp
|
||||
)
|
||||
|
||||
if (WIN32)
|
||||
@ -223,7 +224,7 @@ if (WIN32)
|
||||
set(tesseract_hdr
|
||||
${tesseract_hdr}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/vs2010/tesseract/resource.h)
|
||||
set(tesseract_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc)
|
||||
set(tesseract_rsc ${CMAKE_CURRENT_BINARY_DIR}/vs2010/tesseract/libtesseract.rc src/api/altorenderer.cpp)
|
||||
set_source_files_properties(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/arch/dotproductsse.cpp
|
||||
PROPERTIES COMPILE_DEFINITIONS __SSE4_1__)
|
||||
|
@ -31,6 +31,7 @@ LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../
|
||||
|
||||
EXPLICIT_SRC_EXCLUDES := \
|
||||
$(LOCAL_PATH)/../../api/pdfrenderer.cpp \
|
||||
$(LOCAL_PATH)/../../api/altorenderer.cpp \
|
||||
$(LOCAL_PATH)/../../api/tesseractmain.cpp \
|
||||
|
||||
LOCAL_SRC_FILES := $(filter-out $(EXPLICIT_SRC_EXCLUDES), $(LOCAL_SRC_FILES))
|
||||
|
@ -32,7 +32,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
|
||||
if VISIBILITY
|
||||
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
|
||||
endif
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp altorenderer.cpp
|
||||
|
||||
lib_LTLIBRARIES += libtesseract.la
|
||||
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS)
|
||||
|
252
src/api/altorenderer.cpp
Normal file
252
src/api/altorenderer.cpp
Normal file
@ -0,0 +1,252 @@
|
||||
// File: altorenderer.cpp
|
||||
// Description: ALTO rendering interface
|
||||
// Author: Jake Sebright
|
||||
|
||||
// (C) Copyright 2018
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "baseapi.h"

#include <cstdio>   // snprintf
#include <cstring>  // strcpy
#include <memory>
#include <string>

#include "renderer.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
///
|
||||
/// Add coordinates to specified TextBlock, TextLine, or String bounding box
|
||||
/// Add word confidence if adding to a String bounding box
|
||||
///
|
||||
static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
|
||||
STRING *alto_str) {
|
||||
int left, top, right, bottom;
|
||||
it->BoundingBox(level, &left, &top, &right, &bottom);
|
||||
|
||||
int hpos = left;
|
||||
int vpos = top;
|
||||
int height = bottom - top;
|
||||
int width = right - left;
|
||||
|
||||
*alto_str += " HPOS=\"";
|
||||
alto_str->add_str_int("", hpos);
|
||||
*alto_str += "\"";
|
||||
*alto_str += " VPOS=\"";
|
||||
alto_str->add_str_int("", vpos);
|
||||
*alto_str += "\"";
|
||||
*alto_str += " WIDTH=\"";
|
||||
alto_str->add_str_int("", width);
|
||||
*alto_str += "\"";
|
||||
*alto_str += " HEIGHT=\"";
|
||||
alto_str->add_str_int("", height);
|
||||
*alto_str += "\"";
|
||||
|
||||
if (level == RIL_WORD) {
|
||||
int wc = it->Confidence(RIL_WORD);
|
||||
*alto_str += " WC=\"0.";
|
||||
alto_str->add_str_int("", wc);
|
||||
*alto_str += "\"";
|
||||
}
|
||||
if (level != RIL_WORD) {
|
||||
|
||||
*alto_str += ">";
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Add a unique ID to an ALTO element
|
||||
///
|
||||
static void AddIdToAlto(STRING *alto_str, const std::string base, int num1) {
|
||||
const size_t BUFSIZE = 64;
|
||||
char id_buffer[BUFSIZE];
|
||||
snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
|
||||
id_buffer[BUFSIZE - 1] = '\0';
|
||||
*alto_str += " ID=\"";
|
||||
*alto_str += id_buffer;
|
||||
*alto_str += "\"";
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the beginning of the document
|
||||
///
|
||||
bool TessAltoRenderer::BeginDocumentHandler() {
|
||||
AppendString(
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
||||
"<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
|
||||
"\t<Description>\n"
|
||||
"\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
|
||||
"\t\t<sourceImageInformation>\n"
|
||||
"\t\t\t<fileName>");
|
||||
|
||||
AppendString(title());
|
||||
|
||||
AppendString("\t\t\t</fileName>\n"
|
||||
"\t\t</sourceImageInformation>\n"
|
||||
"\t\t<OCRProcessing ID=\"OCR_0\">\n"
|
||||
"\t\t\t<ocrProcessingStep>\n"
|
||||
"\t\t\t\t<processingSoftware>\n"
|
||||
"\t\t\t\t\t<softwareName>tesseract ");
|
||||
AppendString(TessBaseAPI::Version());
|
||||
AppendString("</softwareName>\n"
|
||||
"\t\t\t\t</processingSoftware>\n"
|
||||
"\t\t\t</ocrProcessingStep>\n"
|
||||
"\t\t</OCRProcessing>\n"
|
||||
"\t</Description>\n"
|
||||
"\t<Layout>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the layout of the image
|
||||
///
|
||||
bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) {
|
||||
const std::unique_ptr<const char[]> hocr(api->GetAltoText(imagenum()));
|
||||
if (hocr == nullptr) return false;
|
||||
|
||||
AppendString(hocr.get());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///
|
||||
/// Append the ALTO XML for the end of the document
|
||||
///
|
||||
bool TessAltoRenderer::EndDocumentHandler() {
|
||||
AppendString("\t</Layout>\n</alto>\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Construct an ALTO renderer; forwards `outputbase` and the "xml" extension
/// to the TessResultRenderer base class.
TessAltoRenderer::TessAltoRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "xml") {}
|
||||
|
||||
///
|
||||
/// Make an XML-formatted string with ALTO markup from the internal
|
||||
/// data structures.
|
||||
///
|
||||
char *TessBaseAPI::GetAltoText(int page_number) {
|
||||
return GetAltoText(nullptr, page_number);
|
||||
}
|
||||
|
||||
///
|
||||
/// Make an XML-formatted string with ALTO markup from the internal
|
||||
/// data structures.
|
||||
///
|
||||
char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
|
||||
return nullptr;
|
||||
|
||||
int lcnt = 0, bcnt = 0, wcnt = 0;
|
||||
int page_id = page_number;
|
||||
|
||||
STRING alto_str("");
|
||||
|
||||
if (input_file_ == nullptr)
|
||||
SetInputName(nullptr);
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len =
|
||||
MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
|
||||
uni16_str, str16_len);
|
||||
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0,
|
||||
nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
|
||||
utf8_len, nullptr, nullptr);
|
||||
*input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
alto_str += "\t\t<Page WIDTH=\"";
|
||||
alto_str.add_str_int("", rect_width_);
|
||||
alto_str += "\" HEIGHT=\"";
|
||||
alto_str.add_str_int("", rect_height_);
|
||||
alto_str += "\" PHYSICAL_IMG_NR=\"";
|
||||
alto_str.add_str_int("", rect_height_);
|
||||
alto_str += "\"";
|
||||
AddIdToAlto(&alto_str, "page", page_id);
|
||||
alto_str += ">\n";
|
||||
alto_str += ("\t\t\t<PrintSpace HPOS=\"0\" "
|
||||
"VPOS=\"0\""
|
||||
" WIDTH=\"");
|
||||
alto_str.add_str_int("", rect_width_);
|
||||
alto_str += "\" HEIGHT=\"";
|
||||
alto_str.add_str_int("", rect_height_);
|
||||
alto_str += "\">\n";
|
||||
|
||||
ResultIterator *res_it = GetIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
alto_str += "\t\t\t\t<TextBlock ";
|
||||
AddIdToAlto(&alto_str, "block", bcnt);
|
||||
AddBoxToAlto(res_it, RIL_BLOCK, &alto_str);
|
||||
alto_str += "\n";
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
|
||||
alto_str += "\t\t\t\t\t<TextLine ";
|
||||
AddIdToAlto(&alto_str, "line", lcnt);
|
||||
AddBoxToAlto(res_it, RIL_TEXTLINE, &alto_str);
|
||||
alto_str += "\n";
|
||||
}
|
||||
|
||||
alto_str += "\t\t\t\t\t\t<String ";
|
||||
AddIdToAlto(&alto_str, "string", wcnt);
|
||||
AddBoxToAlto(res_it, RIL_WORD, &alto_str);
|
||||
alto_str += " CONTENT=\"";
|
||||
|
||||
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
||||
|
||||
do {
|
||||
const std::unique_ptr<const char[]> grapheme(
|
||||
res_it->GetUTF8Text(RIL_SYMBOL));
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
alto_str += HOcrEscape(grapheme.get());
|
||||
}
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||
|
||||
alto_str += "\"/>\n";
|
||||
|
||||
wcnt++;
|
||||
|
||||
if (last_word_in_line) {
|
||||
alto_str += "\t\t\t\t\t</TextLine>\n";
|
||||
lcnt++;
|
||||
}
|
||||
|
||||
if (last_word_in_block) {
|
||||
alto_str += "\t\t\t\t</TextBlock>\n";
|
||||
bcnt++;
|
||||
}
|
||||
}
|
||||
|
||||
alto_str += "\t\t\t</PrintSpace>\n";
|
||||
alto_str += "\t\t</Page>\n";
|
||||
|
||||
char *ret = new char[alto_str.length() + 1];
|
||||
strcpy(ret, alto_str.string());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
@ -594,6 +594,19 @@ class TESS_API TessBaseAPI {
|
||||
*/
|
||||
char* GetHOCRText(int page_number);
|
||||
|
||||
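/**
* Make an XML-formatted string with ALTO markup from the internal
* data structures, running recognition first (with the given monitor)
* if it has not been run yet.
* Returns nullptr on failure. The returned string is allocated with
* new[] and must be freed by the caller with delete[].
*/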
|
||||
char* GetAltoText(ETEXT_DESC* monitor, int page_number);
|
||||
|
||||
|
||||
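/**
* Make an XML-formatted string with ALTO markup from the internal
* data structures. Same as GetAltoText(nullptr, page_number), i.e.
* without a progress monitor.
* Returns nullptr on failure. The returned string is allocated with
* new[] and must be freed by the caller with delete[].
*/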
|
||||
char* GetAltoText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a TSV-formatted string from the internal data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
|
@ -66,6 +66,11 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu
|
||||
return new TessHOcrRenderer(outputbase, font_info);
|
||||
}
|
||||
|
||||
// C API: create a new ALTO renderer for `outputbase` and return it through
// the generic TessResultRenderer handle type.
TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase)
{
    TessResultRenderer* const alto_renderer = new TessAltoRenderer(outputbase);
    return alto_renderer;
}
|
||||
|
||||
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
|
||||
BOOL textonly)
|
||||
{
|
||||
|
@ -56,6 +56,7 @@ extern "C" {
|
||||
typedef tesseract::TessResultRenderer TessResultRenderer;
|
||||
typedef tesseract::TessTextRenderer TessTextRenderer;
|
||||
typedef tesseract::TessHOcrRenderer TessHOcrRenderer;
|
||||
typedef tesseract::TessAltoRenderer TessAltoRenderer;
|
||||
typedef tesseract::TessPDFRenderer TessPDFRenderer;
|
||||
typedef tesseract::TessUnlvRenderer TessUnlvRenderer;
|
||||
typedef tesseract::TessBoxTextRenderer TessBoxTextRenderer;
|
||||
@ -126,6 +127,7 @@ TESS_API void TESS_CALL TessDeleteIntArray(int* arr);
|
||||
TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase);
|
||||
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase);
|
||||
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info);
|
||||
TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase);
|
||||
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir,
|
||||
BOOL textonly);
|
||||
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase);
|
||||
@ -277,6 +279,8 @@ TESS_API TessMutableIterator*
|
||||
TESS_API char* TESS_CALL TessBaseAPIGetUTF8Text(TessBaseAPI* handle);
|
||||
TESS_API char* TESS_CALL TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number);
|
||||
|
||||
TESS_API char* TESS_CALL TessBaseAPIGetAltoText(TessBaseAPI* handle, int page_number);
|
||||
|
||||
TESS_API char* TESS_CALL TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number);
|
||||
|
||||
TESS_API char* TESS_CALL TessBaseAPIGetUNLVText(TessBaseAPI* handle);
|
||||
|
@ -166,6 +166,20 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
|
||||
bool font_info_; // whether to print font information
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into an alto text string
|
||||
*/
|
||||
class TESS_API TessAltoRenderer : public TessResultRenderer {
|
||||
public:
|
||||
explicit TessAltoRenderer(const char *outputbase);
|
||||
|
||||
protected:
|
||||
virtual bool BeginDocumentHandler();
|
||||
virtual bool AddImageHandler(TessBaseAPI* api);
|
||||
virtual bool EndDocumentHandler();
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders Tesseract output into a TSV string
|
||||
*/
|
||||
|
@ -419,6 +419,19 @@ static void PreloadRenderers(
|
||||
}
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_alto", &b);
|
||||
if (b) {
|
||||
tesseract::TessAltoRenderer* renderer =
|
||||
new tesseract::TessAltoRenderer(outputbase);
|
||||
if (renderer->happy()) {
|
||||
renderers->push_back(renderer);
|
||||
} else {
|
||||
delete renderer;
|
||||
tprintf("Error, could not create ALTO output file: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_tsv", &b);
|
||||
if (b) {
|
||||
bool font_info;
|
||||
|
@ -387,6 +387,8 @@ Tesseract::Tesseract()
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
|
||||
|
@ -1037,6 +1037,7 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
|
||||
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
|
||||
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
|
||||
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
|
||||
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
|
||||
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
||||
BOOL_VAR_H(textonly_pdf, false,
|
||||
|
1
tessdata/configs/alto
Normal file
1
tessdata/configs/alto
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_alto 1
|
Loading…
Reference in New Issue
Block a user