mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
Modernize code for renderers and remove filename conversion for Windows (#4330)
Commit db52047420
added the filename conversion for the hOCR renderer, but it was removed later for TSV in commit 6700edd8bc.
Tesseract does not use a filename conversion anywhere else, so remove it for the other renderers, too. Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
3020c14a60
commit
638868ed38
@ -15,9 +15,6 @@
|
||||
|
||||
#include "errcode.h" // for ASSERT_HOST
|
||||
#include "helpers.h" // for copy_string
|
||||
#ifdef _WIN32
|
||||
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
||||
#endif
|
||||
#include "tprintf.h" // for tprintf
|
||||
|
||||
#include <tesseract/baseapi.h>
|
||||
@ -145,20 +142,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
SetInputName(nullptr);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
|
||||
int utf8_len =
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
|
||||
input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
std::stringstream alto_str;
|
||||
// Use "C" locale (needed for int values larger than 999).
|
||||
alto_str.imbue(std::locale::classic());
|
||||
@ -169,7 +152,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
<< " WIDTH=\"" << rect_width_ << "\""
|
||||
<< " HEIGHT=\"" << rect_height_ << "\">\n";
|
||||
|
||||
ResultIterator *res_it = GetIterator();
|
||||
std::unique_ptr<ResultIterator> res_it(GetIterator());
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_WORD)) {
|
||||
res_it->Next(RIL_WORD);
|
||||
@ -186,7 +169,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
// Handle all kinds of images.
|
||||
// TODO: optionally add TYPE, for example TYPE="photo".
|
||||
alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
|
||||
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
|
||||
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
|
||||
alto_str << "</Illustration>\n";
|
||||
res_it->Next(RIL_BLOCK);
|
||||
continue;
|
||||
@ -195,7 +178,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
case PT_VERT_LINE:
|
||||
// Handle horizontal and vertical lines.
|
||||
alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
|
||||
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
|
||||
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
|
||||
alto_str << "</GraphicalElement >\n";
|
||||
res_it->Next(RIL_BLOCK);
|
||||
continue;
|
||||
@ -208,24 +191,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
|
||||
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_PARA)) {
|
||||
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_PARA, alto_str);
|
||||
AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
|
||||
AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
|
||||
alto_str << "\n";
|
||||
}
|
||||
|
||||
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
|
||||
AddBoxToAlto(res_it, RIL_WORD, alto_str);
|
||||
AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
|
||||
alto_str << " CONTENT=\"";
|
||||
|
||||
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
|
||||
@ -272,7 +255,6 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
|
||||
alto_str << "\t\t\t</PrintSpace>\n"
|
||||
<< "\t\t</Page>\n";
|
||||
|
||||
delete res_it;
|
||||
return copy_string(alto_str.str());
|
||||
}
|
||||
|
||||
|
@ -21,9 +21,6 @@
|
||||
#include <locale> // for std::locale::classic
|
||||
#include <memory> // for std::unique_ptr
|
||||
#include <sstream> // for std::stringstream
|
||||
#ifdef _WIN32
|
||||
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
||||
#endif
|
||||
#include <tesseract/renderer.h>
|
||||
#include "helpers.h" // for copy_string
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
@ -151,23 +148,6 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
|
||||
SetInputName(nullptr);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len =
|
||||
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
|
||||
str16_len);
|
||||
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
|
||||
0, nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
|
||||
nullptr, nullptr);
|
||||
input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
std::stringstream hocr_str;
|
||||
// Use "C" locale (needed for double values x_size and x_descenders).
|
||||
hocr_str.imbue(std::locale::classic());
|
||||
|
@ -2,7 +2,7 @@
|
||||
// Description: PAGE XML rendering interface
|
||||
// Author: Jan Kamlah
|
||||
|
||||
// (C) Copyright 2021
|
||||
// (C) Copyright 2024
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
@ -15,9 +15,6 @@
|
||||
|
||||
#include "errcode.h" // for ASSERT_HOST
|
||||
#include "helpers.h" // for copy_string
|
||||
#ifdef _WIN32
|
||||
# include "host.h" // windows.h for MultiByteToWideChar, ...
|
||||
#endif
|
||||
#include "tprintf.h" // for tprintf
|
||||
|
||||
#include <tesseract/baseapi.h>
|
||||
@ -717,23 +714,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
SetInputName(nullptr);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// convert input name from ANSI encoding to utf-8
|
||||
int str16_len =
|
||||
MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
|
||||
wchar_t *uni16_str = new WCHAR[str16_len];
|
||||
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
|
||||
str16_len);
|
||||
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
|
||||
0, nullptr, nullptr);
|
||||
char *utf8_str = new char[utf8_len];
|
||||
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
|
||||
nullptr, nullptr);
|
||||
input_file_ = utf8_str;
|
||||
delete[] uni16_str;
|
||||
delete[] utf8_str;
|
||||
#endif
|
||||
|
||||
// Used variables
|
||||
|
||||
std::stringstream reading_order_str;
|
||||
@ -788,7 +768,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
<< "\t\t\t<OrderedGroup id=\"ro" << ro_id
|
||||
<< "\" caption=\"Regions reading order\">\n";
|
||||
|
||||
ResultIterator *res_it = GetIterator();
|
||||
std::unique_ptr<ResultIterator> res_it(GetIterator());
|
||||
|
||||
float block_conf = 0;
|
||||
float line_conf = 0;
|
||||
@ -808,7 +788,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
// Handle all kinds of images.
|
||||
page_str << "\t\t<GraphicRegion id=\"r" << rcnt++ << "\">\n";
|
||||
page_str << "\t\t\t";
|
||||
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
|
||||
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
|
||||
page_str << "\t\t</GraphicRegion>\n";
|
||||
res_it->Next(RIL_BLOCK);
|
||||
continue;
|
||||
@ -818,7 +798,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
// Handle horizontal and vertical lines.
|
||||
page_str << "\t\t<SeparatorRegion id=\"r" << rcnt++ << "\">\n";
|
||||
page_str << "\t\t\t";
|
||||
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
|
||||
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
|
||||
page_str << "\t\t</SeparatorRegion>\n";
|
||||
res_it->Next(RIL_BLOCK);
|
||||
continue;
|
||||
@ -849,7 +829,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP &&
|
||||
orientation_block != ORIENTATION_PAGE_DOWN)) &&
|
||||
LEVELFLAG == 0) {
|
||||
AddBoxToPAGE(res_it, RIL_BLOCK, page_str);
|
||||
AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
|
||||
}
|
||||
}
|
||||
|
||||
@ -892,9 +872,9 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
line_str << "custom=\"" << "readingOrder {index:" << lcnt << ";}\">\n";
|
||||
// If level is linebased, get the line polygon and baseline
|
||||
if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) {
|
||||
AddPointToWordPolygon(res_it, RIL_TEXTLINE, line_top_ltr_pts,
|
||||
AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts,
|
||||
line_bottom_ltr_pts, writing_direction);
|
||||
AddBaselineToPTA(res_it, RIL_TEXTLINE, line_baseline_pts);
|
||||
AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts);
|
||||
if (ttb_flag) {
|
||||
line_baseline_pts = TransposePolygonline(line_baseline_pts);
|
||||
}
|
||||
@ -914,18 +894,18 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
<< WritingDirectionToStr(writing_direction) << "\" "
|
||||
<< "custom=\"" << "readingOrder {index:" << wcnt << ";}\">\n";
|
||||
if ((!POLYGONFLAG || skewed_flag) || ttb_flag) {
|
||||
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
|
||||
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
|
||||
writing_direction);
|
||||
}
|
||||
}
|
||||
|
||||
if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) {
|
||||
AddPointToWordPolygon(res_it, RIL_WORD, word_top_pts, word_bottom_pts,
|
||||
AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
|
||||
writing_direction);
|
||||
}
|
||||
|
||||
// Get the word baseline information
|
||||
AddBaselineToPTA(res_it, RIL_WORD, word_baseline_pts);
|
||||
AddBaselineToPTA(res_it.get(), RIL_WORD, word_baseline_pts);
|
||||
|
||||
// Get the word text content and polygon
|
||||
do {
|
||||
@ -934,7 +914,7 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
word_content << HOcrEscape(grapheme.get()).c_str();
|
||||
if (POLYGONFLAG && !skewed_flag && !ttb_flag) {
|
||||
AddPointToWordPolygon(res_it, RIL_SYMBOL, word_top_pts,
|
||||
AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts,
|
||||
word_bottom_pts, writing_direction);
|
||||
}
|
||||
}
|
||||
@ -1146,7 +1126,6 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
|
||||
const std::string &text = reading_order_str.str();
|
||||
reading_order_str.str("");
|
||||
|
||||
delete res_it;
|
||||
return copy_string(text);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user