From d7cee03a94656878e6e0a8377f548e8ed1f90ca3 Mon Sep 17 00:00:00 2001 From: Jake Sebright Date: Fri, 30 Nov 2018 00:09:36 -0500 Subject: [PATCH] Add support for ALTO output --- CMakeLists.txt | 1 + android/jni/Android.mk | 1 + src/api/Makefile.am | 2 +- src/api/altorenderer.cpp | 252 ++++++++++++++++++++++++++++++++++ src/api/baseapi.h | 13 ++ src/api/capi.cpp | 5 + src/api/capi.h | 4 + src/api/renderer.h | 14 ++ src/api/tesseractmain.cpp | 13 ++ src/ccmain/tesseractclass.cpp | 2 + src/ccmain/tesseractclass.h | 1 + tessdata/configs/alto | 1 + 12 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 src/api/altorenderer.cpp create mode 100644 tessdata/configs/alto diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cf2cecc..3076140e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,6 +215,7 @@ set(tesseract_src ${tesseract_src} src/api/capi.cpp src/api/renderer.cpp src/api/pdfrenderer.cpp + src/api/altorenderer.cpp ) if (WIN32) diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 170bb513..225a2f4c 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -31,6 +31,7 @@ LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../ EXPLICIT_SRC_EXCLUDES := \ $(LOCAL_PATH)/../../api/pdfrenderer.cpp \ + $(LOCAL_PATH)/../../api/altorenderer.cpp \ $(LOCAL_PATH)/../../api/tesseractmain.cpp \ LOCAL_SRC_FILES := $(filter-out $(EXPLICIT_SRC_EXCLUDES), $(LOCAL_SRC_FILES)) diff --git a/src/api/Makefile.am b/src/api/Makefile.am index bdecc6a7..3fdf17fb 100644 --- 
a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -32,7 +32,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS) if VISIBILITY libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS endif -libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp +libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp altorenderer.cpp lib_LTLIBRARIES += libtesseract.la libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp new file mode 100644 index 00000000..11af90f6 --- /dev/null +++ b/src/api/altorenderer.cpp @@ -0,0 +1,252 @@ +// File: altorenderer.cpp +// Description: ALTO rendering interface +// Author: Jake Sebright + +// (C) Copyright 2018 +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "baseapi.h" +#include +#include "renderer.h" + +namespace tesseract { + + /// Append HPOS, VPOS, WIDTH and HEIGHT attributes for the bounding box of + /// the iterator's current element at the given level. For RIL_WORD also append + /// WC (the word confidence as a fraction, 0.00 to 1.00) and leave the tag open; + /// for every other level close the start tag. + static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, + STRING *alto_str) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + + int hpos = left; + int vpos = top; + int height = bottom - top; + int width = right - left; + + *alto_str += " HPOS=\""; + alto_str->add_str_int("", hpos); + *alto_str += "\""; + *alto_str += " VPOS=\""; + alto_str->add_str_int("", vpos); + *alto_str += "\""; + *alto_str += " WIDTH=\""; + alto_str->add_str_int("", width); + *alto_str += "\""; + *alto_str += " HEIGHT=\""; + alto_str->add_str_int("", height); + *alto_str += "\""; + + if (level == RIL_WORD) { + const int wc = it->Confidence(RIL_WORD); // percentage, truncated to an int + char wc_buffer[32]; // ample for the 10-character attribute + snprintf(wc_buffer, sizeof(wc_buffer), " WC=\"%d.%02d\"", wc / 100, wc % 100); // 5 gives 0.05, 100 gives 1.00 + *alto_str += wc_buffer; + } + if (level != RIL_WORD) { + + *alto_str += ">"; + } + } + + /// Append an ID attribute whose value is base, an underscore and num1 + /// (e.g. base "line" and num1 3 give ID="line_3") to alto_str. + /// The value is formatted into a 64-byte buffer, so longer ids are truncated. + static void AddIdToAlto(STRING *alto_str, const std::string &base, int num1) { + const size_t BUFSIZE = 64; + char id_buffer[BUFSIZE]; + snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1); + id_buffer[BUFSIZE - 1] = '\0'; + *alto_str += " ID=\""; + *alto_str += id_buffer; + *alto_str += "\""; + } + + /// Append the ALTO XML for the beginning of the document: fixed header text + /// with title() and TessBaseAPI::Version() spliced in. + /// Always returns true. + bool TessAltoRenderer::BeginDocumentHandler() { + AppendString( + "\n" + "\n" + "\t\n" + "\t\tpixel\n" + "\t\t\n" + "\t\t\t"); + + AppendString(title()); + + AppendString("\t\t\t\n" + "\t\t\n" + "\t\t\n" + "\t\t\t\n" + "\t\t\t\t\n" + "\t\t\t\t\ttesseract "); + AppendString(TessBaseAPI::Version()); + AppendString("\n" + "\t\t\t\t\n" + "\t\t\t\n" + "\t\t\n" + "\t\n" + "\t\n"); + + return true; + } + + /// + /// Append the ALTO XML for the layout of the image + /// + bool 
TessAltoRenderer::AddImageHandler(TessBaseAPI* api) { + const std::unique_ptr hocr(api->GetAltoText(imagenum())); // despite its name, hocr holds the ALTO text returned for this image + if (hocr == nullptr) return false; + + AppendString(hocr.get()); + + return true; + } + + /// Append the ALTO XML for the end of the document. + /// Appends one fixed closing string and always returns true. + /// + bool TessAltoRenderer::EndDocumentHandler() { + AppendString("\t\n\n"); + + return true; + } + + TessAltoRenderer::TessAltoRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "xml") { // "xml" is presumably the output file extension - confirm against TessResultRenderer + } + + /// Make an XML-formatted string with ALTO markup from the internal + /// data structures. Convenience overload: forwards to + /// GetAltoText(nullptr, page_number), i.e. with a null monitor. + /// + char *TessBaseAPI::GetAltoText(int page_number) { + return GetAltoText(nullptr, page_number); + } + + /// Make an XML-formatted string with ALTO markup from the internal + /// data structures, calling Recognize(monitor) first when page_res_ is null. + /// Returns nullptr if tesseract_ is null or Recognize fails; otherwise a + /// NUL-terminated buffer allocated with new[] (release it with delete[]). + char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { + if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) + return nullptr; + + int lcnt = 0, bcnt = 0, wcnt = 0; // line / block / word counters, bumped as each element is finished + int page_id = page_number; + + STRING alto_str(""); + + if (input_file_ == nullptr) + SetInputName(nullptr); + + #ifdef _WIN32 + // convert input name from ANSI encoding to utf-8. NOTE(review): the result is stored back into *input_file_ on every call, so a later call would convert the already-converted name again - confirm this is intended + int str16_len = + MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0); + wchar_t *uni16_str = new WCHAR[str16_len]; + str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, + uni16_str, str16_len); + int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, + nullptr, nullptr); + char *utf8_str = new char[utf8_len]; + WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, + utf8_len, nullptr, nullptr); + *input_file_ = utf8_str; + delete[] uni16_str; + delete[] utf8_str; + #endif + + alto_str += "\t\t\n"; + + ResultIterator *res_it = GetIterator(); + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_WORD)) { + res_it->Next(RIL_WORD); + continue; + } + + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { // NOTE(review): the literal that follows is cut off in this copy of the patch (markup text appears to be missing) - restore it from the original commit + alto_str += 
"\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { + + alto_str += "\t\t\t\t\tIsAtFinalElement(RIL_TEXTLINE, RIL_WORD); + bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); + + do { // append the word's text one symbol at a time, escaped with HOcrEscape + const std::unique_ptr grapheme( + res_it->GetUTF8Text(RIL_SYMBOL)); + if (grapheme && grapheme[0] != 0) { + alto_str += HOcrEscape(grapheme.get()); + } + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); + + alto_str += "\"/>\n"; + + wcnt++; + + if (last_word_in_line) { + alto_str += "\t\t\t\t\t\n"; + lcnt++; + } + + if (last_word_in_block) { + alto_str += "\t\t\t\t\n"; + bcnt++; + } + } + + alto_str += "\t\t\t\n"; + alto_str += "\t\t\n"; + + char *ret = new char[alto_str.length() + 1]; // +1 for the terminating NUL copied by strcpy + strcpy(ret, alto_str.string()); + delete res_it; // release the iterator obtained from GetIterator() + return ret; + } + +} diff --git a/src/api/baseapi.h b/src/api/baseapi.h index da12d647..efa97ecd 100644 --- a/src/api/baseapi.h +++ b/src/api/baseapi.h @@ -594,6 +594,19 @@ class TESS_API TessBaseAPI { */ char* GetHOCRText(int page_number); + /** + * Make an XML-formatted string with ALTO markup from the internal + * data structures, running recognition with the given monitor if needed. + */ + char* GetAltoText(ETEXT_DESC* monitor, int page_number); + + + /** + * Make an XML-formatted string with ALTO markup from the internal + * data structures (same as the overload above with a null monitor). + */ + char* GetAltoText(int page_number); + /** * Make a TSV-formatted string from the internal data structures. * page_number is 0-based but will appear in the output as 1-based. 
diff --git a/src/api/capi.cpp b/src/api/capi.cpp index 2146e8c8..cffdf613 100644 --- a/src/api/capi.cpp +++ b/src/api/capi.cpp @@ -66,6 +66,18 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu return new TessHOcrRenderer(outputbase, font_info); } +// Returns a heap-allocated TessAltoRenderer constructed on outputbase. +TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase) +{ + return new TessAltoRenderer(outputbase); +} + +// C binding for TessBaseAPI::GetAltoText(int); capi.h declares it, so it must be defined here. +TESS_API char* TESS_CALL TessBaseAPIGetAltoText(TessBaseAPI* handle, int page_number) +{ + return handle->GetAltoText(page_number); +} + TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir, BOOL textonly) { diff --git a/src/api/capi.h b/src/api/capi.h index ba4445b5..5101fdcf 100644 --- a/src/api/capi.h +++ b/src/api/capi.h @@ -56,6 +56,7 @@ extern "C" { typedef tesseract::TessResultRenderer TessResultRenderer; typedef tesseract::TessTextRenderer TessTextRenderer; typedef tesseract::TessHOcrRenderer TessHOcrRenderer; +typedef tesseract::TessAltoRenderer TessAltoRenderer; typedef tesseract::TessPDFRenderer TessPDFRenderer; typedef tesseract::TessUnlvRenderer TessUnlvRenderer; typedef tesseract::TessBoxTextRenderer TessBoxTextRenderer; @@ -126,6 +127,7 @@ TESS_API void TESS_CALL TessDeleteIntArray(int* arr); TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info); +TESS_API TessResultRenderer* TESS_CALL TessAltoRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir, BOOL textonly); TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase); @@ -277,6 +279,8 @@ TESS_API TessMutableIterator* TESS_API char* TESS_CALL TessBaseAPIGetUTF8Text(TessBaseAPI* handle); TESS_API char* TESS_CALL TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number); +TESS_API char* TESS_CALL TessBaseAPIGetAltoText(TessBaseAPI* 
handle, int page_number); + TESS_API char* TESS_CALL TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number); TESS_API char* TESS_CALL TessBaseAPIGetUNLVText(TessBaseAPI* handle); diff --git a/src/api/renderer.h b/src/api/renderer.h index 6c753403..5c31e4b8 100644 --- a/src/api/renderer.h +++ b/src/api/renderer.h @@ -166,6 +166,20 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer { bool font_info_; // whether to print font information }; +/** + * Renders Tesseract output into an ALTO XML string + */ + class TESS_API TessAltoRenderer : public TessResultRenderer { + public: + explicit TessAltoRenderer(const char *outputbase); + + protected: + virtual bool BeginDocumentHandler(); + virtual bool AddImageHandler(TessBaseAPI* api); + virtual bool EndDocumentHandler(); + + }; + /** * Renders Tesseract output into a TSV string */ diff --git a/src/api/tesseractmain.cpp b/src/api/tesseractmain.cpp index 7b487a50..93584231 100644 --- a/src/api/tesseractmain.cpp +++ b/src/api/tesseractmain.cpp @@ -419,6 +419,19 @@ static void PreloadRenderers( } } + api->GetBoolVariable("tessedit_create_alto", &b); + if (b) { + tesseract::TessAltoRenderer* renderer = + new tesseract::TessAltoRenderer(outputbase); + if (renderer->happy()) { + renderers->push_back(renderer); + } else { + delete renderer; + tprintf("Error, could not create ALTO output file: %s\n", + strerror(errno)); + } + } + api->GetBoolVariable("tessedit_create_tsv", &b); if (b) { bool font_info; diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp index c6aa8337..997e6a42 100644 --- a/src/ccmain/tesseractclass.cpp +++ b/src/ccmain/tesseractclass.cpp @@ -387,6 +387,8 @@ Tesseract::Tesseract() this->params()), BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params()), + BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO output file", + this->params()), BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params()), 
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index 8b821f86..61ee7276 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -1037,6 +1037,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file"); BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); + BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file"); BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); BOOL_VAR_H(textonly_pdf, false, diff --git a/tessdata/configs/alto b/tessdata/configs/alto new file mode 100644 index 00000000..0dd12a7a --- /dev/null +++ b/tessdata/configs/alto @@ -0,0 +1 @@ +tessedit_create_alto 1