Modernize code for renderers and remove filename conversion for Windows (#4330)

Commit db52047420 added the filename conversion for the hOCR renderer,
but it was removed later for TSV in commit 6700edd8bc.

Tesseract does not convert filenames anywhere else, so remove the
conversion from the remaining renderers (ALTO, hOCR and PAGE XML), too.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2024-10-23 07:34:06 +02:00 committed by GitHub
parent 3020c14a60
commit 638868ed38
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 18 additions and 77 deletions

View File

@ -15,9 +15,6 @@
#include "errcode.h" // for ASSERT_HOST
#include "helpers.h" // for copy_string
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include "tprintf.h" // for tprintf
#include <tesseract/baseapi.h>
@ -145,20 +142,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}
#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
int utf8_len =
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif
std::stringstream alto_str;
// Use "C" locale (needed for int values larger than 999).
alto_str.imbue(std::locale::classic());
@ -169,7 +152,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
<< " WIDTH=\"" << rect_width_ << "\""
<< " HEIGHT=\"" << rect_height_ << "\">\n";
ResultIterator *res_it = GetIterator();
std::unique_ptr<ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
@ -186,7 +169,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
// Handle all kinds of images.
// TODO: optionally add TYPE, for example TYPE="photo".
alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</Illustration>\n";
res_it->Next(RIL_BLOCK);
continue;
@ -195,7 +178,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
case PT_VERT_LINE:
// Handle horizontal and vertical lines.
alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</GraphicalElement >\n";
res_it->Next(RIL_BLOCK);
continue;
@ -208,24 +191,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "\n";
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
AddBoxToAlto(res_it, RIL_PARA, alto_str);
AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
alto_str << "\n";
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
alto_str << "\n";
}
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
AddBoxToAlto(res_it, RIL_WORD, alto_str);
AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
alto_str << " CONTENT=\"";
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
@ -272,7 +255,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
alto_str << "\t\t\t</PrintSpace>\n"
<< "\t\t</Page>\n";
delete res_it;
return copy_string(alto_str.str());
}

View File

@ -21,9 +21,6 @@
#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include <tesseract/renderer.h>
#include "helpers.h" // for copy_string
#include "tesseractclass.h" // for Tesseract
@ -151,23 +148,6 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}
#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len =
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif
std::stringstream hocr_str;
// Use "C" locale (needed for double values x_size and x_descenders).
hocr_str.imbue(std::locale::classic());

View File

@ -2,7 +2,7 @@
// Description: PAGE XML rendering interface
// Author: Jan Kamlah
// (C) Copyright 2021
// (C) Copyright 2024
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
@ -15,9 +15,6 @@
#include "errcode.h" // for ASSERT_HOST
#include "helpers.h" // for copy_string
#ifdef _WIN32
# include "host.h" // windows.h for MultiByteToWideChar, ...
#endif
#include "tprintf.h" // for tprintf
#include <tesseract/baseapi.h>
@ -717,23 +714,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
SetInputName(nullptr);
}
#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len =
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
0, nullptr, nullptr);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr, nullptr);
input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif
// Used variables
std::stringstream reading_order_str;
@ -788,7 +768,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
<< "\t\t\t<OrderedGroup id=\"ro" << ro_id
<< "\" caption=\"Regions reading order\">\n";
ResultIterator *res_it = GetIterator();
std::unique_ptr<ResultIterator> res_it(GetIterator());
float block_conf = 0;
float line_conf = 0;
@ -808,7 +788,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
// Handle all kinds of images.
page_str << "\t\t<GraphicRegion id=\"r" << rcnt++ << "\">\n";
page_str << "\t\t\t";
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
page_str << "\t\t</GraphicRegion>\n";
res_it->Next(RIL_BLOCK);
continue;
@ -818,7 +798,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
// Handle horizontal and vertical lines.
page_str << "\t\t<SeparatorRegion id=\"r" << rcnt++ << "\">\n";
page_str << "\t\t\t";
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
page_str << "\t\t</SeparatorRegion>\n";
res_it->Next(RIL_BLOCK);
continue;
@ -849,7 +829,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP &&
orientation_block != ORIENTATION_PAGE_DOWN)) &&
LEVELFLAG == 0) {
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
}
}
@ -892,9 +872,9 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
line_str << "custom=\"" << "readingOrder {index:" << lcnt << ";}\">\n";
// If level is linebased, get the line polygon and baseline
if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) {
AddPointToWordPolygon(res_it, RIL_TEXTLINE, line_top_ltr_pts,
AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts,
line_bottom_ltr_pts, writing_direction);
AddBaselineToPTA(res_it, RIL_TEXTLINE, line_baseline_pts);
AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts);
if (ttb_flag) {
line_baseline_pts = TransposePolygonline(line_baseline_pts);
}
@ -914,18 +894,18 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
<< WritingDirectionToStr(writing_direction) << "\" "
<< "custom=\"" << "readingOrder {index:" << wcnt << ";}\">\n";
if ((!POLYGONFLAG || skewed_flag) || ttb_flag) {
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
writing_direction);
}
}
if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) {
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
writing_direction);
}
// Get the word baseline information
AddBaselineToPTA(res_it, RIL_WORD, word_baseline_pts);
AddBaselineToPTA(res_it.get(), RIL_WORD, word_baseline_pts);
// Get the word text content and polygon
do {
@ -934,7 +914,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
if (grapheme && grapheme[0] != 0) {
word_content << HOcrEscape(grapheme.get()).c_str();
if (POLYGONFLAG && !skewed_flag && !ttb_flag) {
AddPointToWordPolygon(res_it, RIL_SYMBOL, word_top_pts,
AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts,
word_bottom_pts, writing_direction);
}
}
@ -1146,7 +1126,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
const std::string &text = reading_order_str.str();
reading_order_str.str("");
delete res_it;
return copy_string(text);
}