2018-11-30 13:09:36 +08:00
|
|
|
// File: altorenderer.cpp
|
|
|
|
// Description: ALTO rendering interface
|
|
|
|
// Author: Jake Sebright
|
|
|
|
|
|
|
|
// (C) Copyright 2018
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
#include <memory>
|
2018-12-13 00:39:54 +08:00
|
|
|
#include <sstream> // for std::stringstream
|
2018-11-30 13:37:25 +08:00
|
|
|
#include "baseapi.h"
|
2018-11-30 13:09:36 +08:00
|
|
|
#include "renderer.h"
|
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
/// Add coordinates to specified TextBlock, TextLine or String bounding box.
|
|
|
|
/// Add word confidence if adding to a String bounding box.
|
2018-11-30 13:37:25 +08:00
|
|
|
///
|
|
|
|
static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level,
|
2018-12-13 00:39:54 +08:00
|
|
|
std::stringstream& alto_str) {
|
2018-11-30 13:37:25 +08:00
|
|
|
int left, top, right, bottom;
|
|
|
|
it->BoundingBox(level, &left, &top, &right, &bottom);
|
|
|
|
|
|
|
|
int hpos = left;
|
|
|
|
int vpos = top;
|
|
|
|
int height = bottom - top;
|
|
|
|
int width = right - left;
|
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << " HPOS=\"" << hpos << "\"";
|
|
|
|
alto_str << " VPOS=\"" << vpos << "\"";
|
|
|
|
alto_str << " WIDTH=\"" << width << "\"";
|
|
|
|
alto_str << " HEIGHT=\"" << height << "\"";
|
2018-11-30 13:37:25 +08:00
|
|
|
|
|
|
|
if (level == RIL_WORD) {
|
|
|
|
int wc = it->Confidence(RIL_WORD);
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << " WC=\"0." << wc << "\"";
|
|
|
|
} else {
|
|
|
|
alto_str << ">";
|
2018-11-30 13:37:25 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
///
|
|
|
|
/// Append the ALTO XML for the beginning of the document
|
|
|
|
///
|
|
|
|
bool TessAltoRenderer::BeginDocumentHandler() {
|
|
|
|
AppendString(
|
|
|
|
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
|
|
|
"<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
|
|
|
|
"xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
|
|
|
|
"xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
|
|
|
|
"xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
|
|
|
|
"http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
|
|
|
|
"\t<Description>\n"
|
|
|
|
"\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
|
|
|
|
"\t\t<sourceImageInformation>\n"
|
|
|
|
"\t\t\t<fileName>");
|
|
|
|
|
|
|
|
AppendString(title());
|
|
|
|
|
|
|
|
AppendString(
|
|
|
|
"\t\t\t</fileName>\n"
|
|
|
|
"\t\t</sourceImageInformation>\n"
|
|
|
|
"\t\t<OCRProcessing ID=\"OCR_0\">\n"
|
|
|
|
"\t\t\t<ocrProcessingStep>\n"
|
|
|
|
"\t\t\t\t<processingSoftware>\n"
|
|
|
|
"\t\t\t\t\t<softwareName>tesseract ");
|
|
|
|
AppendString(TessBaseAPI::Version());
|
|
|
|
AppendString(
|
|
|
|
"</softwareName>\n"
|
|
|
|
"\t\t\t\t</processingSoftware>\n"
|
|
|
|
"\t\t\t</ocrProcessingStep>\n"
|
|
|
|
"\t\t</OCRProcessing>\n"
|
|
|
|
"\t</Description>\n"
|
|
|
|
"\t<Layout>\n");
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
///
|
|
|
|
/// Append the ALTO XML for the layout of the image
|
|
|
|
///
|
|
|
|
bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) {
|
2018-12-13 00:39:54 +08:00
|
|
|
const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
|
|
|
|
if (text == nullptr) return false;
|
2018-11-30 13:37:25 +08:00
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
AppendString(text.get());
|
2018-11-30 13:37:25 +08:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
///
|
|
|
|
/// Append the ALTO XML for the end of the document
|
|
|
|
///
|
|
|
|
bool TessAltoRenderer::EndDocumentHandler() {
|
|
|
|
AppendString("\t</Layout>\n</alto>\n");
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
///
/// Construct an ALTO renderer; forwards the output base name to the base
/// renderer together with the "xml" extension.
///
TessAltoRenderer::TessAltoRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "xml") {
}
|
|
|
|
|
|
|
|
///
|
|
|
|
/// Make an XML-formatted string with ALTO markup from the internal
|
|
|
|
/// data structures.
|
|
|
|
///
|
|
|
|
char* TessBaseAPI::GetAltoText(int page_number) {
|
|
|
|
return GetAltoText(nullptr, page_number);
|
|
|
|
}
|
|
|
|
|
|
|
|
///
|
|
|
|
/// Make an XML-formatted string with ALTO markup from the internal
|
|
|
|
/// data structures.
|
|
|
|
///
|
|
|
|
char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) {
|
|
|
|
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
int lcnt = 0, bcnt = 0, wcnt = 0;
|
|
|
|
int page_id = page_number;
|
|
|
|
|
|
|
|
if (input_file_ == nullptr) SetInputName(nullptr);
|
|
|
|
|
|
|
|
#ifdef _WIN32
|
|
|
|
// convert input name from ANSI encoding to utf-8
|
|
|
|
int str16_len =
|
|
|
|
MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
|
|
|
|
wchar_t* uni16_str = new WCHAR[str16_len];
|
|
|
|
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
|
|
|
|
uni16_str, str16_len);
|
|
|
|
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
|
|
|
|
0, nullptr, nullptr);
|
|
|
|
char* utf8_str = new char[utf8_len];
|
|
|
|
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
|
|
|
|
nullptr, nullptr);
|
|
|
|
*input_file_ = utf8_str;
|
|
|
|
delete[] uni16_str;
|
|
|
|
delete[] utf8_str;
|
|
|
|
#endif
|
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
std::stringstream alto_str;
|
|
|
|
alto_str
|
|
|
|
<< "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\""
|
|
|
|
<< rect_height_
|
|
|
|
// TODO: next line is buggy because rect_height is not an image number.
|
|
|
|
<< "\" PHYSICAL_IMG_NR=\"" << rect_height_ << "\""
|
|
|
|
<< " ID=\"page_" << page_id << "\">\n"
|
|
|
|
<< "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
|
|
|
|
<< " WIDTH=\"" << rect_width_ << "\""
|
|
|
|
<< " HEIGHT=\"" << rect_height_ << "\">\n";
|
2018-11-30 13:37:25 +08:00
|
|
|
|
|
|
|
ResultIterator* res_it = GetIterator();
|
|
|
|
while (!res_it->Empty(RIL_BLOCK)) {
|
|
|
|
if (res_it->Empty(RIL_WORD)) {
|
|
|
|
res_it->Next(RIL_WORD);
|
|
|
|
continue;
|
2018-11-30 13:09:36 +08:00
|
|
|
}
|
|
|
|
|
2018-11-30 13:37:25 +08:00
|
|
|
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << "\t\t\t\t<TextBlock ID=\"block_" << bcnt << "\"";
|
|
|
|
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
|
|
|
|
alto_str << "\n";
|
2018-11-30 13:09:36 +08:00
|
|
|
}
|
|
|
|
|
2018-11-30 13:37:25 +08:00
|
|
|
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << "\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
|
|
|
|
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
|
|
|
|
alto_str << "\n";
|
2018-11-30 13:09:36 +08:00
|
|
|
}
|
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << "\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
|
|
|
|
AddBoxToAlto(res_it, RIL_WORD, alto_str);
|
|
|
|
alto_str << " CONTENT=\"";
|
2018-11-30 13:09:36 +08:00
|
|
|
|
2018-11-30 13:37:25 +08:00
|
|
|
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
|
|
|
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
2018-11-30 13:09:36 +08:00
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
int left, top, right, bottom;
|
|
|
|
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
|
|
|
|
2018-11-30 13:37:25 +08:00
|
|
|
do {
|
|
|
|
const std::unique_ptr<const char[]> grapheme(
|
|
|
|
res_it->GetUTF8Text(RIL_SYMBOL));
|
|
|
|
if (grapheme && grapheme[0] != 0) {
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << HOcrEscape(grapheme.get()).c_str();
|
2018-11-30 13:37:25 +08:00
|
|
|
}
|
|
|
|
res_it->Next(RIL_SYMBOL);
|
|
|
|
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
2018-11-30 13:09:36 +08:00
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << "\"/>";
|
2018-11-30 13:09:36 +08:00
|
|
|
|
2018-11-30 13:37:25 +08:00
|
|
|
wcnt++;
|
2018-11-30 13:09:36 +08:00
|
|
|
|
2018-11-30 13:37:25 +08:00
|
|
|
if (last_word_in_line) {
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << "\n\t\t\t\t\t</TextLine>\n";
|
2018-11-30 13:37:25 +08:00
|
|
|
lcnt++;
|
2018-12-13 00:39:54 +08:00
|
|
|
} else {
|
|
|
|
int hpos = right;
|
|
|
|
int vpos = top;
|
|
|
|
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
|
|
|
int width = left - hpos;
|
|
|
|
alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos
|
|
|
|
<< "\" HPOS=\"" << hpos << "\"/>\n";
|
2018-11-30 13:09:36 +08:00
|
|
|
}
|
|
|
|
|
2018-11-30 13:37:25 +08:00
|
|
|
if (last_word_in_block) {
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << "\t\t\t\t</TextBlock>\n";
|
2018-11-30 13:37:25 +08:00
|
|
|
bcnt++;
|
2018-11-30 13:09:36 +08:00
|
|
|
}
|
2018-11-30 13:37:25 +08:00
|
|
|
}
|
2018-11-30 13:09:36 +08:00
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
alto_str << "\t\t\t</PrintSpace>\n"
|
|
|
|
<< "\t\t</Page>\n";
|
|
|
|
const std::string& text = alto_str.str();
|
2018-11-30 13:09:36 +08:00
|
|
|
|
2018-12-13 00:39:54 +08:00
|
|
|
char* result = new char[text.length() + 1];
|
|
|
|
strcpy(result, text.c_str());
|
2018-11-30 13:37:25 +08:00
|
|
|
delete res_it;
|
2018-12-13 00:39:54 +08:00
|
|
|
return result;
|
2018-11-30 13:09:36 +08:00
|
|
|
}
|
2018-11-30 13:37:25 +08:00
|
|
|
|
|
|
|
} // namespace tesseract
|