mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-27 10:34:12 +08:00
068eb4c35d
These classes are taken from the hOCR specification, and seem to map well onto the BlockType types. There are probably more that could be added.
454 lines
16 KiB
C++
454 lines
16 KiB
C++
/**********************************************************************
 * File:        hocrrenderer.cpp
 * Description: Simple API for calling tesseract.
 * Author:      Ray Smith (original code from baseapi.cpp)
 * Author:      Stefan Weil (moved to separate file and cleaned code)
 *
 * (C) Copyright 2006, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
#include <locale> // for std::locale::classic
|
|
#include <memory> // for std::unique_ptr
|
|
#include <sstream> // for std::stringstream
|
|
#include "baseapi.h" // for TessBaseAPI
|
|
#ifdef _WIN32
|
|
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
|
#endif
|
|
#include "renderer.h"
|
|
#include "tesseractclass.h" // for Tesseract
|
|
|
|
namespace tesseract {
|
|
|
|
/**
|
|
* Gets the block orientation at the current iterator position.
|
|
*/
|
|
static tesseract::Orientation GetBlockTextOrientation(const PageIterator* it) {
|
|
tesseract::Orientation orientation;
|
|
tesseract::WritingDirection writing_direction;
|
|
tesseract::TextlineOrder textline_order;
|
|
float deskew_angle;
|
|
it->Orientation(&orientation, &writing_direction, &textline_order,
|
|
&deskew_angle);
|
|
return orientation;
|
|
}
|
|
|
|
/**
|
|
* Fits a line to the baseline at the given level, and appends its coefficients
|
|
* to the hOCR string.
|
|
* NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
|
|
* rotated textlines. For this reason, on textlines that are not upright, this
|
|
* method currently only inserts a 'textangle' property to indicate the rotation
|
|
* direction and does not add any baseline information to the hocr string.
|
|
*/
|
|
static void AddBaselineCoordsTohOCR(const PageIterator* it,
|
|
PageIteratorLevel level,
|
|
std::stringstream& hocr_str) {
|
|
tesseract::Orientation orientation = GetBlockTextOrientation(it);
|
|
if (orientation != ORIENTATION_PAGE_UP) {
|
|
hocr_str << "; textangle " << 360 - orientation * 90;
|
|
return;
|
|
}
|
|
|
|
int left, top, right, bottom;
|
|
it->BoundingBox(level, &left, &top, &right, &bottom);
|
|
|
|
// Try to get the baseline coordinates at this level.
|
|
int x1, y1, x2, y2;
|
|
if (!it->Baseline(level, &x1, &y1, &x2, &y2)) return;
|
|
// Following the description of this field of the hOCR spec, we convert the
|
|
// baseline coordinates so that "the bottom left of the bounding box is the
|
|
// origin".
|
|
x1 -= left;
|
|
x2 -= left;
|
|
y1 -= bottom;
|
|
y2 -= bottom;
|
|
|
|
// Now fit a line through the points so we can extract coefficients for the
|
|
// equation: y = p1 x + p0
|
|
if (x1 == x2) {
|
|
// Problem computing the polynomial coefficients.
|
|
return;
|
|
}
|
|
double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
|
|
double p0 = y1 - p1 * x1;
|
|
|
|
hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
|
|
<< round(p0 * 1000.0) / 1000.0;
|
|
}
|
|
|
|
static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
|
|
std::stringstream& hocr_str) {
|
|
int left, top, right, bottom;
|
|
it->BoundingBox(level, &left, &top, &right, &bottom);
|
|
// This is the only place we use double quotes instead of single quotes,
|
|
// but it may too late to change for consistency
|
|
hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
|
|
<< bottom;
|
|
// Add baseline coordinates & heights for textlines only.
|
|
if (level == RIL_TEXTLINE) {
|
|
AddBaselineCoordsTohOCR(it, level, hocr_str);
|
|
// add custom height measures
|
|
float row_height, descenders, ascenders; // row attributes
|
|
it->RowAttributes(&row_height, &descenders, &ascenders);
|
|
// TODO(rays): Do we want to limit these to a single decimal place?
|
|
hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
|
|
<< "; x_ascenders " << ascenders;
|
|
}
|
|
hocr_str << "\">";
|
|
}
|
|
|
|
/**
|
|
* Make a HTML-formatted string with hOCR markup from the internal
|
|
* data structures.
|
|
* page_number is 0-based but will appear in the output as 1-based.
|
|
* Image name/input_file_ can be set by SetInputName before calling
|
|
* GetHOCRText
|
|
* STL removed from original patch submission and refactored by rays.
|
|
* Returned string must be freed with the delete [] operator.
|
|
*/
|
|
char* TessBaseAPI::GetHOCRText(int page_number) {
|
|
return GetHOCRText(nullptr, page_number);
|
|
}
|
|
|
|
/**
|
|
* Make a HTML-formatted string with hOCR markup from the internal
|
|
* data structures.
|
|
* page_number is 0-based but will appear in the output as 1-based.
|
|
* Image name/input_file_ can be set by SetInputName before calling
|
|
* GetHOCRText
|
|
* STL removed from original patch submission and refactored by rays.
|
|
* Returned string must be freed with the delete [] operator.
|
|
*/
|
|
char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
|
|
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
|
|
return nullptr;
|
|
|
|
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, gcnt = 1;
|
|
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
|
bool para_is_ltr = true; // Default direction is LTR
|
|
const char* paragraph_lang = nullptr;
|
|
bool font_info = false;
|
|
bool hocr_boxes = false;
|
|
GetBoolVariable("hocr_font_info", &font_info);
|
|
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
|
|
|
|
if (input_file_ == nullptr) SetInputName(nullptr);
|
|
|
|
#ifdef _WIN32
|
|
// convert input name from ANSI encoding to utf-8
|
|
int str16_len =
|
|
MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0);
|
|
wchar_t* uni16_str = new WCHAR[str16_len];
|
|
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
|
|
uni16_str, str16_len);
|
|
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
|
|
0, nullptr, nullptr);
|
|
char* utf8_str = new char[utf8_len];
|
|
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
|
|
nullptr, nullptr);
|
|
*input_file_ = utf8_str;
|
|
delete[] uni16_str;
|
|
delete[] utf8_str;
|
|
#endif
|
|
|
|
std::stringstream hocr_str;
|
|
// Use "C" locale (needed for double values x_size and x_descenders).
|
|
hocr_str.imbue(std::locale::classic());
|
|
// Use 8 digits for double values.
|
|
hocr_str.precision(8);
|
|
hocr_str << " <div class='ocr_page'";
|
|
hocr_str << " id='"
|
|
<< "page_" << page_id << "'";
|
|
hocr_str << " title='image \"";
|
|
if (input_file_) {
|
|
hocr_str << HOcrEscape(input_file_->string()).c_str();
|
|
} else {
|
|
hocr_str << "unknown";
|
|
}
|
|
hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
|
|
<< rect_width_ << " " << rect_height_ << "; ppageno " << page_number
|
|
<< "'>\n";
|
|
|
|
std::unique_ptr<ResultIterator> res_it(GetIterator());
|
|
while (!res_it->Empty(RIL_BLOCK)) {
|
|
if (res_it->Empty(RIL_WORD)) {
|
|
res_it->Next(RIL_WORD);
|
|
continue;
|
|
}
|
|
|
|
// Open any new block/paragraph/textline.
|
|
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
|
para_is_ltr = true; // reset to default direction
|
|
hocr_str << " <div class='ocr_carea'"
|
|
<< " id='"
|
|
<< "block_" << page_id << "_" << bcnt << "'";
|
|
AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
|
|
}
|
|
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
|
hocr_str << "\n <p class='ocr_par'";
|
|
para_is_ltr = res_it->ParagraphIsLtr();
|
|
if (!para_is_ltr) {
|
|
hocr_str << " dir='rtl'";
|
|
}
|
|
hocr_str << " id='"
|
|
<< "par_" << page_id << "_" << pcnt << "'";
|
|
paragraph_lang = res_it->WordRecognitionLanguage();
|
|
if (paragraph_lang) {
|
|
hocr_str << " lang='" << paragraph_lang << "'";
|
|
}
|
|
AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
|
|
}
|
|
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
|
hocr_str << "\n <span class='";
|
|
switch (res_it->BlockType()) {
|
|
case PT_HEADING_TEXT:
|
|
hocr_str << "ocr_header";
|
|
break;
|
|
case PT_PULLOUT_TEXT:
|
|
hocr_str << "ocr_textfloat";
|
|
break;
|
|
case PT_CAPTION_TEXT:
|
|
hocr_str << "ocr_caption";
|
|
break;
|
|
default:
|
|
hocr_str << "ocr_line";
|
|
}
|
|
hocr_str << "' id='"
|
|
<< "line_" << page_id << "_" << lcnt << "'";
|
|
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
|
|
}
|
|
|
|
// Now, process the word...
|
|
std::vector<std::vector<std::pair<const char*, float>>>* rawTimestepMap =
|
|
nullptr;
|
|
std::vector<std::vector<std::pair<const char*, float>>>* choiceMap =
|
|
nullptr;
|
|
std::vector<std::vector<std::vector<std::pair<const char*, float>>>>*
|
|
symbolMap = nullptr;
|
|
if (tesseract_->lstm_choice_mode) {
|
|
|
|
choiceMap = res_it->GetBestLSTMSymbolChoices();
|
|
symbolMap = res_it->GetSegmentedLSTMTimesteps();
|
|
rawTimestepMap = res_it->GetRawLSTMTimesteps();
|
|
}
|
|
hocr_str << "\n <span class='ocrx_word'"
|
|
<< " id='"
|
|
<< "word_" << page_id << "_" << wcnt << "'";
|
|
int left, top, right, bottom;
|
|
bool bold, italic, underlined, monospace, serif, smallcaps;
|
|
int pointsize, font_id;
|
|
const char* font_name;
|
|
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
|
|
font_name =
|
|
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
|
|
&serif, &smallcaps, &pointsize, &font_id);
|
|
hocr_str << " title='bbox " << left << " " << top << " " << right << " "
|
|
<< bottom << "; x_wconf "
|
|
<< static_cast<int>(res_it->Confidence(RIL_WORD));
|
|
if (font_info) {
|
|
if (font_name) {
|
|
hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
|
|
}
|
|
hocr_str << "; x_fsize " << pointsize;
|
|
}
|
|
hocr_str << "'";
|
|
const char* lang = res_it->WordRecognitionLanguage();
|
|
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
|
|
hocr_str << " lang='" << lang << "'";
|
|
}
|
|
switch (res_it->WordDirection()) {
|
|
// Only emit direction if different from current paragraph direction
|
|
case DIR_LEFT_TO_RIGHT:
|
|
if (!para_is_ltr) hocr_str << " dir='ltr'";
|
|
break;
|
|
case DIR_RIGHT_TO_LEFT:
|
|
if (para_is_ltr) hocr_str << " dir='rtl'";
|
|
break;
|
|
case DIR_MIX:
|
|
case DIR_NEUTRAL:
|
|
default: // Do nothing.
|
|
break;
|
|
}
|
|
hocr_str << ">";
|
|
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
|
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
|
|
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
|
|
if (bold) hocr_str << "<strong>";
|
|
if (italic) hocr_str << "<em>";
|
|
do {
|
|
const std::unique_ptr<const char[]> grapheme(
|
|
res_it->GetUTF8Text(RIL_SYMBOL));
|
|
if (grapheme && grapheme[0] != 0) {
|
|
if (hocr_boxes) {
|
|
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
|
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
|
|
<< left << " " << top << " " << right << " " << bottom
|
|
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
|
|
}
|
|
hocr_str << HOcrEscape(grapheme.get()).c_str();
|
|
if (hocr_boxes) {
|
|
hocr_str << "</span>";
|
|
}
|
|
}
|
|
res_it->Next(RIL_SYMBOL);
|
|
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
|
if (italic) hocr_str << "</em>";
|
|
if (bold) hocr_str << "</strong>";
|
|
// If the lstm choice mode is required it is added here
|
|
if (tesseract_->lstm_choice_mode == 1 && rawTimestepMap != nullptr) {
|
|
for (auto timestep : *rawTimestepMap) {
|
|
hocr_str << "\n <span class='ocrx_cinfo'"
|
|
<< " id='"
|
|
<< "timestep_" << page_id << "_" << wcnt << "_" << tcnt << "'"
|
|
<< ">";
|
|
for (std::pair<const char*, float> conf : timestep) {
|
|
hocr_str << "<span class='ocr_glyph'"
|
|
<< " id='"
|
|
<< "choice_" << page_id << "_" << wcnt << "_" << gcnt << "'"
|
|
<< " title='x_confs " << int(conf.second * 100) << "'>"
|
|
<< conf.first << "</span>";
|
|
gcnt++;
|
|
}
|
|
hocr_str << "</span>";
|
|
tcnt++;
|
|
}
|
|
} else if (tesseract_->lstm_choice_mode == 2 && choiceMap != nullptr) {
|
|
for (auto timestep : *choiceMap) {
|
|
if (timestep.size() > 0) {
|
|
hocr_str << "\n <span class='ocrx_cinfo'"
|
|
<< " id='"
|
|
<< "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
|
|
<< "'>";
|
|
for (auto & j : timestep) {
|
|
hocr_str << "<span class='ocr_glyph'"
|
|
<< " id='"
|
|
<< "choice_" << page_id << "_" << wcnt << "_" << gcnt
|
|
<< "'"
|
|
<< " title='x_confs " << int(j.second * 100)
|
|
<< "'>" << j.first << "</span>";
|
|
gcnt++;
|
|
}
|
|
hocr_str << "</span>";
|
|
tcnt++;
|
|
}
|
|
}
|
|
} else if (tesseract_->lstm_choice_mode == 3 && symbolMap != nullptr) {
|
|
for (auto timesteps : *symbolMap) {
|
|
hocr_str << "\n <span class='ocr_symbol'"
|
|
<< " id='"
|
|
<< "symbol_" << page_id << "_" << wcnt << "_" << scnt
|
|
<< "'>";
|
|
for (auto timestep : timesteps) {
|
|
hocr_str << "\n <span class='ocrx_cinfo'"
|
|
<< " id='"
|
|
<< "timestep_" << page_id << "_" << wcnt << "_" << tcnt
|
|
<< "'"
|
|
<< ">";
|
|
for (std::pair<const char*, float> conf : timestep) {
|
|
hocr_str << "<span class='ocr_glyph'"
|
|
<< " id='"
|
|
<< "choice_" << page_id << "_" << wcnt << "_" << gcnt
|
|
<< "'"
|
|
<< " title='x_confs " << int(conf.second * 100) << "'>"
|
|
<< conf.first << "</span>";
|
|
gcnt++;
|
|
}
|
|
hocr_str << "</span>";
|
|
tcnt++;
|
|
}
|
|
hocr_str << "</span>";
|
|
scnt++;
|
|
}
|
|
}
|
|
hocr_str << "</span>";
|
|
tcnt = 1;
|
|
gcnt = 1;
|
|
wcnt++;
|
|
// Close any ending block/paragraph/textline.
|
|
if (last_word_in_line) {
|
|
hocr_str << "\n </span>";
|
|
lcnt++;
|
|
}
|
|
if (last_word_in_para) {
|
|
hocr_str << "\n </p>\n";
|
|
pcnt++;
|
|
para_is_ltr = true; // back to default direction
|
|
}
|
|
if (last_word_in_block) {
|
|
hocr_str << " </div>\n";
|
|
bcnt++;
|
|
}
|
|
}
|
|
hocr_str << " </div>\n";
|
|
|
|
const std::string& text = hocr_str.str();
|
|
char* result = new char[text.length() + 1];
|
|
strcpy(result, text.c_str());
|
|
return result;
|
|
}
|
|
|
|
/**********************************************************************
 * HOcr Text Renderer interface implementation
 **********************************************************************/
|
|
/**
 * Creates an hOCR renderer for the given output base name, using the "hocr"
 * file extension, with font information disabled.
 */
TessHOcrRenderer::TessHOcrRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "hocr"), font_info_(false) {}
|
|
|
|
/**
 * Creates an hOCR renderer for the given output base name, using the "hocr"
 * file extension. font_info is stored and later decides whether the document
 * header advertises the language/direction/font capabilities.
 */
TessHOcrRenderer::TessHOcrRenderer(const char* outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr"), font_info_(font_info) {}
|
|
|
|
bool TessHOcrRenderer::BeginDocumentHandler() {
|
|
AppendString(
|
|
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
|
|
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
|
|
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
|
|
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
|
|
"lang=\"en\">\n <head>\n <title>");
|
|
AppendString(title());
|
|
AppendString(
|
|
"</title>\n"
|
|
" <meta http-equiv=\"Content-Type\" content=\"text/html;"
|
|
"charset=utf-8\"/>\n"
|
|
" <meta name='ocr-system' content='tesseract " PACKAGE_VERSION
|
|
"' />\n"
|
|
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
|
|
" ocr_line ocrx_word ocrp_wconf");
|
|
if (font_info_) AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
|
|
AppendString(
|
|
"'/>\n"
|
|
" </head>\n"
|
|
" <body>\n");
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
 * Writes the closing </body> and </html> tags that finish the hOCR document.
 * Always returns true.
 */
bool TessHOcrRenderer::EndDocumentHandler() {
  static const char kDocumentEnd[] = " </body>\n</html>\n";
  AppendString(kDocumentEnd);

  return true;
}
|
|
|
|
/**
 * Appends the hOCR markup of the current image to the output.
 * The markup is requested from the API for the page number given by
 * imagenum(). Returns false if the API produced no markup, true otherwise.
 */
bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
  // GetHOCRText returns a new[]-allocated string; the smart pointer frees it.
  const std::unique_ptr<const char[]> page_markup(
      api->GetHOCRText(imagenum()));
  if (!page_markup) {
    return false;
  }

  AppendString(page_markup.get());
  return true;
}
|
|
|
|
} // namespace tesseract
|