/********************************************************************** * File: hocrrenderer.cpp * Description: Simple API for calling tesseract. * Author: Ray Smith (original code from baseapi.cpp) * Author: Stefan Weil (moved to separate file and cleaned code) * * (C) Copyright 2006, Google Inc. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #include // for std::locale::classic #include // for std::unique_ptr #include // for std::stringstream #include "baseapi.h" // for TessBaseAPI #ifdef _WIN32 # include "host.h" // windows.h for MultiByteToWideChar, ... #endif #include "renderer.h" #include "tesseractclass.h" // for Tesseract namespace tesseract { /** * Gets the block orientation at the current iterator position. */ static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) { tesseract::Orientation orientation; tesseract::WritingDirection writing_direction; tesseract::TextlineOrder textline_order; float deskew_angle; it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle); return orientation; } /** * Fits a line to the baseline at the given level, and appends its coefficients * to the hOCR string. * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for * rotated textlines. For this reason, on textlines that are not upright, this * method currently only inserts a 'textangle' property to indicate the rotation * direction and does not add any baseline information to the hocr string. */ static void AddBaselineCoordsTohOCR(const PageIterator* it, PageIteratorLevel level, std::stringstream& hocr_str) { tesseract::Orientation orientation = GetBlockTextOrientation(it); if (orientation != ORIENTATION_PAGE_UP) { hocr_str << "; textangle " << 360 - orientation * 90; return; } int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); // Try to get the baseline coordinates at this level. int x1, y1, x2, y2; if (!it->Baseline(level, &x1, &y1, &x2, &y2)) return; // Following the description of this field of the hOCR spec, we convert the // baseline coordinates so that "the bottom left of the bounding box is the // origin". x1 -= left; x2 -= left; y1 -= bottom; y2 -= bottom; // Now fit a line through the points so we can extract coefficients for the // equation: y = p1 x + p0 if (x1 == x2) { // Problem computing the polynomial coefficients. return; } double p1 = (y2 - y1) / static_cast(x2 - x1); double p0 = y1 - p1 * x1; hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " << round(p0 * 1000.0) / 1000.0; } static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level, std::stringstream& hocr_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); // This is the only place we use double quotes instead of single quotes, // but it may too late to change for consistency hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " << bottom; // Add baseline coordinates & heights for textlines only. if (level == RIL_TEXTLINE) { AddBaselineCoordsTohOCR(it, level, hocr_str); // add custom height measures float row_height, descenders, ascenders; // row attributes it->RowAttributes(&row_height, &descenders, &ascenders); // TODO(rays): Do we want to limit these to a single decimal place? hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders << "; x_ascenders " << ascenders; } hocr_str << "\">"; } /** * Make a HTML-formatted string with hOCR markup from the internal * data structures. * page_number is 0-based but will appear in the output as 1-based. * Image name/input_file_ can be set by SetInputName before calling * GetHOCRText * STL removed from original patch submission and refactored by rays. * Returned string must be freed with the delete [] operator. */ char* TessBaseAPI::GetHOCRText(int page_number) { return GetHOCRText(nullptr, page_number); } /** * Make a HTML-formatted string with hOCR markup from the internal * data structures. * page_number is 0-based but will appear in the output as 1-based. * Image name/input_file_ can be set by SetInputName before calling * GetHOCRText * STL removed from original patch submission and refactored by rays. * Returned string must be freed with the delete [] operator. */ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) return nullptr; int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. bool para_is_ltr = true; // Default direction is LTR const char* paragraph_lang = nullptr; bool font_info = false; bool hocr_boxes = false; GetBoolVariable("hocr_font_info", &font_info); GetBoolVariable("hocr_char_boxes", &hocr_boxes); if (input_file_ == nullptr) SetInputName(nullptr); #ifdef _WIN32 // convert input name from ANSI encoding to utf-8 int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0); wchar_t* uni16_str = new WCHAR[str16_len]; str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, uni16_str, str16_len); int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr); char* utf8_str = new char[utf8_len]; WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr); *input_file_ = utf8_str; delete[] uni16_str; delete[] utf8_str; #endif std::stringstream hocr_str; // Use "C" locale (needed for double values x_size and x_descenders). hocr_str.imbue(std::locale::classic()); // Use 8 digits for double values. hocr_str.precision(8); hocr_str << "
\n"; std::unique_ptr res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; } // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { para_is_ltr = true; // reset to default direction hocr_str << "
IsAtBeginningOf(RIL_PARA)) { hocr_str << "\n

ParagraphIsLtr(); if (!para_is_ltr) { hocr_str << " dir='rtl'"; } hocr_str << " id='" << "par_" << page_id << "_" << pcnt << "'"; paragraph_lang = res_it->WordRecognitionLanguage(); if (paragraph_lang) { hocr_str << " lang='" << paragraph_lang << "'"; } AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str); } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { hocr_str << "\n >>* rawTimestepMap = nullptr; std::vector>>* choiceMap = nullptr; std::vector>>>* symbolMap = nullptr; if (tesseract_->lstm_choice_mode) { choiceMap = res_it->GetBestLSTMSymbolChoices(); symbolMap = res_it->GetSegmentedLSTMTimesteps(); rawTimestepMap = res_it->GetRawLSTMTimesteps(); } hocr_str << "\n BoundingBox(RIL_WORD, &left, &top, &right, &bottom); font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); hocr_str << " title='bbox " << left << " " << top << " " << right << " " << bottom << "; x_wconf " << static_cast(res_it->Confidence(RIL_WORD)); if (font_info) { if (font_name) { hocr_str << "; x_font " << HOcrEscape(font_name).c_str(); } hocr_str << "; x_fsize " << pointsize; } hocr_str << "'"; const char* lang = res_it->WordRecognitionLanguage(); if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { hocr_str << " lang='" << lang << "'"; } switch (res_it->WordDirection()) { // Only emit direction if different from current paragraph direction case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str << " dir='ltr'"; break; case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str << " dir='rtl'"; break; case DIR_MIX: case DIR_NEUTRAL: default: // Do nothing. break; } hocr_str << ">"; bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); if (bold) hocr_str << ""; if (italic) hocr_str << ""; do { const std::unique_ptr grapheme( res_it->GetUTF8Text(RIL_SYMBOL)); if (grapheme && grapheme[0] != 0) { if (hocr_boxes) { res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); hocr_str << "\n "; } hocr_str << HOcrEscape(grapheme.get()).c_str(); if (hocr_boxes) { hocr_str << ""; } } res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); if (italic) hocr_str << ""; if (bold) hocr_str << ""; // If the lstm choice mode is required it is added here if (tesseract_->lstm_choice_mode == 1 && rawTimestepMap != nullptr) { for (auto timestep : *rawTimestepMap) { hocr_str << "\n "; for (std::pair conf : timestep) { hocr_str << "" << conf.first << ""; gcnt++; } hocr_str << ""; tcnt++; } } else if (tesseract_->lstm_choice_mode == 2 && choiceMap != nullptr) { for (auto timestep : *choiceMap) { if (timestep.size() > 0) { hocr_str << "\n "; for (auto & j : timestep) { hocr_str << "" << j.first << ""; gcnt++; } hocr_str << ""; tcnt++; } } } else if (tesseract_->lstm_choice_mode == 3 && symbolMap != nullptr) { for (auto timesteps : *symbolMap) { hocr_str << "\n "; for (auto timestep : timesteps) { hocr_str << "\n "; for (std::pair conf : timestep) { hocr_str << "" << conf.first << ""; gcnt++; } hocr_str << ""; tcnt++; } hocr_str << ""; scnt++; } } hocr_str << ""; tcnt = 1; gcnt = 1; wcnt++; // Close any ending block/paragraph/textline. if (last_word_in_line) { hocr_str << "\n "; lcnt++; } if (last_word_in_para) { hocr_str << "\n

\n"; pcnt++; para_is_ltr = true; // back to default direction } if (last_word_in_block) { hocr_str << "
\n"; bcnt++; } } hocr_str << "
\n"; const std::string& text = hocr_str.str(); char* result = new char[text.length() + 1]; strcpy(result, text.c_str()); return result; } /********************************************************************** * HOcr Text Renderer interface implementation **********************************************************************/ TessHOcrRenderer::TessHOcrRenderer(const char* outputbase) : TessResultRenderer(outputbase, "hocr") { font_info_ = false; } TessHOcrRenderer::TessHOcrRenderer(const char* outputbase, bool font_info) : TessResultRenderer(outputbase, "hocr") { font_info_ = font_info; } bool TessHOcrRenderer::BeginDocumentHandler() { AppendString( "\n" "\n" "\n \n "); AppendString(title()); AppendString( "\n" " \n" " \n" " \n" " \n" " \n"); return true; } bool TessHOcrRenderer::EndDocumentHandler() { AppendString(" \n\n"); return true; } bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) { const std::unique_ptr hocr(api->GetHOCRText(imagenum())); if (hocr == nullptr) return false; AppendString(hocr.get()); return true; } } // namespace tesseract