// File: altorenderer.cpp // Description: ALTO rendering interface // Author: Jake Sebright // (C) Copyright 2018 // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include // for std::stringstream #include "baseapi.h" #include "renderer.h" namespace tesseract { /// Add coordinates to specified TextBlock, TextLine or String bounding box. /// Add word confidence if adding to a String bounding box. /// static void AddBoxToAlto(const ResultIterator* it, PageIteratorLevel level, std::stringstream& alto_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); int hpos = left; int vpos = top; int height = bottom - top; int width = right - left; alto_str << " HPOS=\"" << hpos << "\""; alto_str << " VPOS=\"" << vpos << "\""; alto_str << " WIDTH=\"" << width << "\""; alto_str << " HEIGHT=\"" << height << "\""; if (level == RIL_WORD) { int wc = it->Confidence(RIL_WORD); alto_str << " WC=\"0." << wc << "\""; } else { alto_str << ">"; } } /// /// Append the ALTO XML for the beginning of the document /// bool TessAltoRenderer::BeginDocumentHandler() { AppendString( "\n" "\n" "\t\n" "\t\tpixel\n" "\t\t\n" "\t\t\t"); AppendString(title()); AppendString( "\t\t\t\n" "\t\t\n" "\t\t\n" "\t\t\t\n" "\t\t\t\t\n" "\t\t\t\t\ttesseract "); AppendString(TessBaseAPI::Version()); AppendString( "\n" "\t\t\t\t\n" "\t\t\t\n" "\t\t\n" "\t\n" "\t\n"); return true; } /// /// Append the ALTO XML for the layout of the image /// bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) { const std::unique_ptr text(api->GetAltoText(imagenum())); if (text == nullptr) return false; AppendString(text.get()); return true; } /// /// Append the ALTO XML for the end of the document /// bool TessAltoRenderer::EndDocumentHandler() { AppendString("\t\n\n"); return true; } TessAltoRenderer::TessAltoRenderer(const char* outputbase) : TessResultRenderer(outputbase, "xml") {} /// /// Make an XML-formatted string with ALTO markup from the internal /// data structures. /// char* TessBaseAPI::GetAltoText(int page_number) { return GetAltoText(nullptr, page_number); } /// /// Make an XML-formatted string with ALTO markup from the internal /// data structures. /// char* TessBaseAPI::GetAltoText(ETEXT_DESC* monitor, int page_number) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) return nullptr; int lcnt = 0, bcnt = 0, wcnt = 0; int page_id = page_number; if (input_file_ == nullptr) SetInputName(nullptr); #ifdef _WIN32 // convert input name from ANSI encoding to utf-8 int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0); wchar_t* uni16_str = new WCHAR[str16_len]; str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, uni16_str, str16_len); int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr); char* utf8_str = new char[utf8_len]; WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr); *input_file_ = utf8_str; delete[] uni16_str; delete[] utf8_str; #endif std::stringstream alto_str; alto_str << "\t\t\n" << "\t\t\t\n"; ResultIterator* res_it = GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; } if (res_it->IsAtBeginningOf(RIL_BLOCK)) { alto_str << "\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { alto_str << "\t\t\t\t\tIsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); int left, top, right, bottom; res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); do { const std::unique_ptr grapheme( res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != 0) { alto_str << HOcrEscape(grapheme.get()).c_str(); } res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); alto_str << "\"/>"; wcnt++; if (last_word_in_line) { alto_str << "\n\t\t\t\t\t\n"; lcnt++; } else { int hpos = right; int vpos = top; res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); int width = left - hpos; alto_str << "\n"; } if (last_word_in_block) { alto_str << "\t\t\t\t\n"; bcnt++; } } alto_str << "\t\t\t\n" << "\t\t\n"; const std::string& text = alto_str.str(); char* result = new char[text.length() + 1]; strcpy(result, text.c_str()); delete res_it; return result; } } // namespace tesseract