Merge branch 'tesseract-ocr:main' into modernize_datadir

This commit is contained in:
zdenop 2025-05-03 12:40:48 +02:00 committed by GitHub
commit 391972fbaf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
26 changed files with 182 additions and 62 deletions

View File

@ -13,7 +13,7 @@ jobs:
fail-fast: false
matrix:
config:
- { name: 20.04-openmp, os: ubuntu-20.04 }
- { name: 24.04-openmp, os: ubuntu-24.04 }
- { name: 22.04-openmp, os: ubuntu-22.04 }
steps:

View File

@ -15,10 +15,9 @@ jobs:
config:
- { name: ubuntu-22.04-clang-15-autotools, os: ubuntu-22.04, cxx: clang++-15 } #installed
- { name: ubuntu-24.04-gcc-14-autotools, os: ubuntu-24.04, cxx: g++-14 } #installed
- { name: ubuntu-22.04-gcc-12-autotools, os: ubuntu-22.04, cxx: g++-12 } #installed
- { name: ubuntu-22.04-gcc-11-autotools, os: ubuntu-22.04, cxx: g++-11 } #installed
- { name: ubuntu-20.04-gcc-10-autotools, os: ubuntu-20.04, cxx: g++-10 } #installed
- { name: ubuntu-20.04-gcc-9-autotools, os: ubuntu-20.04, cxx: g++-9 } #installed
steps:
- uses: actions/checkout@v4

View File

@ -19,11 +19,8 @@ jobs:
- { name: macos-15-clang-cmake, os: macos-15, cxx: clang++ } # default
- { name: ubuntu-22.04-clang-15-cmake, os: ubuntu-22.04, cxx: clang++-15 } #installed
- { name: ubuntu-24.04-gcc-14-cmake, os: ubuntu-24.04, cxx: g++-14 } #installed
- { name: ubuntu-22.04-gcc-12-cmake, os: ubuntu-22.04, cxx: g++-12 } #installed
- { name: ubuntu-22.04-gcc-11-cmake, os: ubuntu-22.04, cxx: g++-11 } #installed
- { name: ubuntu-20.04-gcc-10-cmake, os: ubuntu-20.04, cxx: g++-10 } #installed
- { name: ubuntu-20.04-gcc-9-cmake, os: ubuntu-20.04, cxx: g++-9 } #installed
steps:
- name: Install compilers on Linux

View File

@ -14,8 +14,8 @@ jobs:
strategy:
fail-fast: false
matrix:
compiler: [ g++, clang++-15 ]
os: [ ubuntu-22.04 ]
compiler: [ g++, clang++-18 ]
os: [ ubuntu-24.04 ]
steps:
- uses: actions/checkout@v4

View File

@ -23,7 +23,7 @@ jobs:
fail-fast: false
matrix:
config:
- { name: ubuntu-20.04-gcc-unittest, os: ubuntu-20.04, cxx: g++, cxxflags: '-g -O2 -fsanitize=address,undefined' }
- { name: ubuntu-24.04-gcc-unittest, os: ubuntu-24.04, cxx: g++, cxxflags: '-g -O2 -fsanitize=address,undefined' }
- { name: ubuntu-22.04-clang-unittest, os: ubuntu-22.04, cxx: clang++, cxxflags: '-g -O2 -fsanitize=address,undefined -stdlib=libc++' }
steps:
- uses: actions/checkout@v4

View File

@ -282,7 +282,7 @@ endif()
# Compiler specific environment
if(CMAKE_COMPILER_IS_GNUCXX OR MINGW)
set(CMAKE_CXX_FLAGS_DEBUG
"${CMAKE_CXX_FLAGS_DEBUG} -Wall -DDEBUG -pedantic -Og")
"${CMAKE_CXX_FLAGS_DEBUG} -Wall -DDEBUG -pedantic -Og -Wno-unknown-pragmas")
elseif(MSVC)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
add_definitions(-D_CRT_NONSTDC_NO_DEPRECATE) # strdup
@ -306,6 +306,10 @@ elseif(MSVC)
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
message(STATUS "Building with static CRT.")
endif()
# Workaround: When building on VS 2022 17.10 or newer, but using an older runtime,
# mutexes can crash
# https://stackoverflow.com/questions/78598141/first-stdmutexlock-crashes-in-application-built-with-latest-visual-studio
add_definitions(-D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
endif()
if(CLANG) # clang all platforms
set(CMAKE_CXX_FLAGS_RELEASE
@ -522,6 +526,7 @@ message(STATUS "General configuration for Tesseract ${PACKAGE_VERSION}")
message(STATUS "--------------------------------------------------------")
message(STATUS "Build type: ${CMAKE_BUILD_TYPE} ${BUILD_ARCH}")
message(STATUS "Compiler: ${CMAKE_CXX_COMPILER_ID}")
message(STATUS "Compiler version: ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "Used standard: C++${CMAKE_CXX_STANDARD}")
message(STATUS "CXX compiler options: ${COMPILER_FLAGS}")
get_directory_property(DirCompDefs COMPILE_DEFINITIONS)
@ -894,7 +899,9 @@ if(BUILD_TESTS
AND EXISTS
${CMAKE_CURRENT_SOURCE_DIR}/unittest/third_party/googletest/CMakeLists.txt
)
enable_testing()
add_subdirectory(unittest/third_party/googletest)
add_subdirectory(unittest)
endif()
if(BUILD_TRAINING_TOOLS)

View File

@ -221,9 +221,9 @@ fi
# additional checks for RVV targets
if test x$check_for_rvv = x1; then
AC_MSG_NOTICE([checking how to detect RVV availability])
AC_CHECK_FUNCS([getauxval])
AC_CHECK_FUNCS([getauxval elf_aux_info])
if test $ac_cv_func_getauxval = no; then
if test $ac_cv_func_getauxval = no && test $ac_cv_func_elf_aux_info = no; then
AC_MSG_WARN([RVV is available, but we don't know how to check for it. Will not be able to use RVV.])
fi
fi

View File

@ -51,6 +51,20 @@ static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
}
}
/// Build the ID attribute value for an ALTO element.
///
/// The result is "<prefix>_<counter>" when page_number is 0 and
/// "<prefix>_<page_number>_<counter>" otherwise. Keeping the page number
/// out of the ID when it is 0 leaves those IDs in the older counter-only
/// form; a non-zero page number is embedded so that the IDs stay unique.
static std::string GetID(const char *prefix, int page_number, int counter) {
  std::stringstream id;
  id << prefix << "_";
  if (page_number != 0) {
    id << page_number << "_";
  }
  id << counter;
  return id.str();
}
///
/// Append the ALTO XML for the beginning of the document
///
@ -168,7 +182,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
case PT_PULLOUT_IMAGE: {
// Handle all kinds of images.
// TODO: optionally add TYPE, for example TYPE="photo".
alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
alto_str << "\t\t\t\t<Illustration ID=\"" << GetID("cblock", page_number, bcnt++) << "\"";
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</Illustration>\n";
res_it->Next(RIL_BLOCK);
@ -177,7 +191,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
case PT_HORZ_LINE:
case PT_VERT_LINE:
// Handle horizontal and vertical lines.
alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
alto_str << "\t\t\t\t<GraphicalElement ID=\"" << GetID("cblock", page_number, bcnt++) << "\"";
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "</GraphicalElement >\n";
res_it->Next(RIL_BLOCK);
@ -190,24 +204,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
}
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
alto_str << "\t\t\t\t<ComposedBlock ID=\"" << GetID("cblock", page_number, bcnt) << "\"";
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
alto_str << "\n";
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
alto_str << "\t\t\t\t\t<TextBlock ID=\"" << GetID("block", page_number, tcnt) << "\"";
AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
alto_str << "\n";
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
alto_str << "\t\t\t\t\t\t<TextLine ID=\"" << GetID("line", page_number, lcnt) << "\"";
AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
alto_str << "\n";
}
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
alto_str << "\t\t\t\t\t\t\t<String ID=\"" << GetID("string", page_number, wcnt) << "\"";
AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
alto_str << " CONTENT=\"";

View File

@ -150,10 +150,9 @@ static void addAvailableLanguages(const std::string &datadir,
std::filesystem::recursive_directory_iterator(datadir,
std::filesystem::directory_options::follow_directory_symlink |
std::filesystem::directory_options::skip_permission_denied)) {
auto path = entry.path().lexically_relative(datadir).string();
auto extPos = path.rfind(".traineddata");
if (extPos != std::string::npos) {
langs->push_back(path.substr(0, extPos));
auto path = entry.path().lexically_relative(datadir);
if (path.extension() == ".traineddata") {
langs->push_back(path.replace_extension("").string());
}
}
}

View File

@ -61,12 +61,11 @@
# include <sys/auxv.h>
# elif defined(HAVE_ELF_AUX_INFO)
# include <sys/auxv.h>
# include <sys/elf.h>
# endif
#endif
#if defined(HAVE_RVV)
# if defined(HAVE_GETAUXVAL)
# if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
# include <sys/auxv.h>
# define HWCAP_RV(letter) (1ul << ((letter) - 'A'))
# endif
@ -244,6 +243,10 @@ SIMDDetect::SIMDDetect() {
# if defined(HAVE_GETAUXVAL)
const unsigned long hwcap = getauxval(AT_HWCAP);
rvv_available_ = hwcap & HWCAP_RV('V');
# elif defined(HAVE_ELF_AUX_INFO)
unsigned long hwcap = 0;
elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
rvv_available_ = hwcap & HWCAP_RV('V');
# endif
#endif

View File

@ -207,7 +207,10 @@ std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
tprintf("\nimage width: %d height: %d ppi: %d\n", pix_w, pix_h, yres_);
}
if (method == ThresholdMethod::Sauvola) {
if (method == ThresholdMethod::Sauvola && pix_w > 6 && pix_h > 6) {
// pixSauvolaBinarizeTiled requires half_window_size >= 2.
// Therefore window_size must be at least 4 which requires
// pix_w and pix_h to be at least 7.
int window_size;
double window_size_factor;
api->GetDoubleVariable("thresholding_window_size", &window_size_factor);

View File

@ -370,7 +370,7 @@ void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const {
while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
(*start)++;
}
while (*end > 0 && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
while (*end > *start && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
(*end)--;
}
}

View File

@ -70,15 +70,6 @@ FILE *get_debugfp() {
return debugfp;
}
// Trace printf.
void tprintf(const char *format, ...) {
FILE *f = get_debugfp();
va_list args; // variable args
va_start(args, format); // variable list
vfprintf(f, format, args);
va_end(args);
}
TessErrStream tesserr;
} // namespace tesseract

View File

@ -21,26 +21,25 @@
#include "params.h" // for INT_VAR_H
#include <tesseract/export.h> // for TESS_API
#include <cstdarg>
#include <utility> // for std::forward
namespace tesseract {
#if !defined(__GNUC__) && !defined(__attribute__)
# define __attribute__(attr) // compiler without support for __attribute__
#endif
// Disable some log messages by setting log_level > 0.
extern TESS_API INT_VAR_H(log_level);
// Main logging function.
extern TESS_API void tprintf( // Trace printf
const char *format, ...) // Message
__attribute__((format(printf, 1, 2)));
// Get file for debug output.
FILE *get_debugfp();
TESS_API FILE *get_debugfp();
// Main logging function (trace printf).
// Formats the printf-style arguments according to `format` and writes the
// result to the stream returned by get_debugfp().
inline void tprintf(const char *format, ...) {
  FILE *out = get_debugfp();
  va_list ap;
  va_start(ap, format);
  vfprintf(out, format, ap);
  va_end(ap);
}
} // namespace tesseract
#undef __attribute__
#endif // define TESSERACT_CCUTIL_TPRINTF_H

View File

@ -25,7 +25,6 @@
#include "classify.h"
#include "intproto.h"
#include "params.h"
#include "tprintf.h"
#include <cmath> // for M_PI
#include <cstdio>

View File

@ -110,7 +110,7 @@ static const char kWildcard[] = "*";
class TESS_API Dawg {
public:
/// Magic number to determine endianness when reading the Dawg from file.
static const int16_t kDawgMagicNumber = 42;
static constexpr int16_t kDawgMagicNumber = 42;
/// A special unichar id that indicates that any appropriate pattern
/// (e.g.dictionary word, 0-9 digit, etc) can be inserted instead
/// Used for expressing patterns in punctuation and number Dawgs.

View File

@ -23,7 +23,6 @@
#include "networkio.h"
#include "serialis.h"
#include "static_shape.h"
#include "tprintf.h"
#include <cmath>
#include <cstdio>

View File

@ -418,7 +418,7 @@ static bool ParseArgs(int argc, char **argv, const char **lang, const char **ima
try {
auto loglevel = loglevels.at(loglevel_string);
log_level = loglevel;
} catch (const std::out_of_range &e) {
} catch (const std::out_of_range &) {
// TODO: Allow numeric argument?
tprintf("Error, unsupported --loglevel %s\n", loglevel_string.c_str());
return false;
@ -648,7 +648,7 @@ static void PreloadRenderers(tesseract::TessBaseAPI &api,
*
**********************************************************************/
int main1(int argc, char **argv) {
static int main1(int argc, char **argv) {
#if defined(__USE_GNU) && defined(HAVE_FEENABLEEXCEPT)
// Raise SIGFPE.
# if defined(__clang__)

View File

@ -230,7 +230,7 @@ bool PangoFontInfo::CoversUTF8Text(const char *utf8_text, int byte_length) const
int len = it.get_utf8(tmp);
tmp[len] = '\0';
tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
#if PANGO_VERSION_CHECK(1, 52, 0)
#if PANGO_VERSION_CHECK(1, 50, 4)
g_object_unref(coverage);
#else
pango_coverage_unref(coverage);
@ -239,7 +239,7 @@ bool PangoFontInfo::CoversUTF8Text(const char *utf8_text, int byte_length) const
return false;
}
}
#if PANGO_VERSION_CHECK(1, 52, 0)
#if PANGO_VERSION_CHECK(1, 50, 4)
g_object_unref(coverage);
#else
pango_coverage_unref(coverage);
@ -311,7 +311,7 @@ int PangoFontInfo::DropUncoveredChars(std::string *utf8_text) const {
my_strnmove(out, utf8_char, utf8_len);
out += utf8_len;
}
#if PANGO_VERSION_CHECK(1, 52, 0)
#if PANGO_VERSION_CHECK(1, 50, 4)
g_object_unref(coverage);
#else
pango_coverage_unref(coverage);
@ -615,7 +615,7 @@ int FontUtils::FontScore(const std::unordered_map<char32, int64_t> &ch_map,
ch_flags->push_back(covered);
}
}
#if PANGO_VERSION_CHECK(1, 52, 0)
#if PANGO_VERSION_CHECK(1, 50, 4)
g_object_unref(coverage);
#else
pango_coverage_unref(coverage);

2
test

@ -1 +1 @@
Subproject commit 2761899921c08014cf9dbf3b63592237fb9e6ecb
Subproject commit 232ff181c66516116ec0e84c4963f70de15050fd

110
unittest/CMakeLists.txt Normal file
View File

@ -0,0 +1,110 @@
# Common configuration shared by all unit test executables.
# find_package(GTest REQUIRED) is currently disabled; the GTest::* targets used
# below have to be provided by the including project.
include(GoogleTest) # TODO: install GoogleTest?

# Include directories used by every test. The first entry lives in the build
# tree, the remaining ones in the source tree.
set(COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/../src/training)
foreach(tess_subdir IN ITEMS
        src/ccutil
        src/ccstruct
        src/viewer
        include
        src/training/unicharset
        src/training/common)
  list(APPEND COMMON_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/../${tess_subdir})
endforeach()
list(APPEND COMMON_INCLUDE_DIRS
     ${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/googlemock/include)

# Value passed to the tests as TESSBIN_DIR. With MSVC a per-configuration
# subdirectory ($<CONFIG>) is appended to EXECUTABLE_OUTPUT_PATH.
set(TESSBIN_DIR ${EXECUTABLE_OUTPUT_PATH})
if(MSVC)
  set(TESSBIN_DIR ${TESSBIN_DIR}/$<CONFIG>)
endif()

# Compile definitions that tell the tests where their data directories are.
set(COMMON_COMPILE_DEFINITIONS
    "-DTESTING_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../test/testing\""
    "-DTESSDATA_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../tessdata\""
    "-DTESSBIN_DIR=\"${TESSBIN_DIR}\""
    "-DTESTDATA_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../test/testdata\""
    "-DLANGDATA_DIR=\"${CMAKE_CURRENT_SOURCE_DIR}/../langdata_lstm\"")

# Every *.cc file in this directory is a test candidate; the names are stored
# relative to this directory.
file(GLOB TEST_SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")

# Libraries linked into every test.
set(COMMON_LINK_LIBS
    libtesseract
    GTest::gtest_main
    common_training
    unicharset_training)
# Tests that are removed from TEST_SOURCES when BUILD_TRAINING_TOOLS is off.
# Each file is listed exactly once.
set(TRAINING_TESTS
    commandlineflags_test.cc
    dawg_test.cc
    lstm_recode_test.cc
    lstm_squashed_test.cc
    lstm_test.cc
    normstrngs_test.cc
    unichar_test.cc
    unicharcompress_test.cc
    unicharset_test.cc
    validate_grapheme_test.cc
    validate_indic_test.cc
    validate_khmer_test.cc
    validate_myanmar_test.cc
    validator_test.cc)

# Tests that are only kept when the training tools are built and Pango was
# found; they are additionally linked against pango_training.
set(PANGO_TESTS
    ligature_table_test.cc
    pango_font_info_test.cc
    stringrenderer_test.cc)

# Tests that are removed from TEST_SOURCES when DISABLED_LEGACY_ENGINE is set.
set(LEGACY_TESTS
    applybox_test.cc
    bitvector_test.cc
    equationdetect_test.cc
    indexmapbidi_test.cc
    intfeaturemap_test.cc
    mastertrainer_test.cc
    osd_test.cc
    params_model_test.cc
    shapetable_test.cc)
# Drop the test groups that cannot be built with the current configuration.
if(NOT BUILD_TRAINING_TOOLS)
  list(REMOVE_ITEM TEST_SOURCES ${TRAINING_TESTS})
endif()

if(DISABLED_LEGACY_ENGINE)
  list(REMOVE_ITEM TEST_SOURCES ${LEGACY_TESTS})
endif()

# The Pango tests need both the training tools and Pango itself. When they
# stay enabled, the Pango headers are added to the common include paths.
if(NOT (BUILD_TRAINING_TOOLS AND PANGO_FOUND))
  list(REMOVE_ITEM TEST_SOURCES ${PANGO_TESTS})
else()
  list(APPEND COMMON_INCLUDE_DIRS
       ${CMAKE_CURRENT_SOURCE_DIR}/../src/training/pango
       ${PANGO_INCLUDE_DIRS})
endif()

# Extra sources compiled into tatweel_test in addition to its own file.
set(TATWEEL_TEST_EXTRA_SRC
    util/utf8/unilib.cc
    util/utf8/unicodetext.cc
    third_party/utf/rune.c)

message(STATUS "Enabled tests: ${TEST_SOURCES}")
# Create one executable and one CTest entry per enabled test source.
#
# Test-specific extras (Pango libraries, the additional tatweel sources and
# include paths) are collected in variables that are reset on every iteration.
# Appending them to COMMON_LINK_LIBS / COMMON_INCLUDE_DIRS instead would leak
# them into every test handled later in the loop and would add the Pango
# libraries once more for each Pango test.
foreach(test_source IN LISTS TEST_SOURCES)
  get_filename_component(test_name ${test_source} NAME_WE)

  # Start each test from the shared defaults.
  set(test_files ${test_source})
  set(test_include_dirs ${COMMON_INCLUDE_DIRS})
  set(test_link_libs ${COMMON_LINK_LIBS})

  if(test_source IN_LIST PANGO_TESTS)
    list(APPEND test_link_libs pango_training ${PANGO_LIBRARIES})
  endif()

  if(test_name MATCHES "tatweel_test")
    list(APPEND test_files ${TATWEEL_TEST_EXTRA_SRC})
    list(APPEND test_include_dirs
         ${CMAKE_CURRENT_SOURCE_DIR}
         ${CMAKE_CURRENT_SOURCE_DIR}/util/utf8)
  endif()

  add_executable(${test_name} ${test_files})

  # progress_test additionally links GoogleMock.
  if(test_name MATCHES "progress_test")
    target_link_libraries(${test_name} PRIVATE GTest::gmock)
  endif()

  target_compile_definitions(${test_name} PRIVATE ${COMMON_COMPILE_DEFINITIONS})
  target_include_directories(${test_name} PRIVATE ${test_include_dirs})
  target_link_libraries(${test_name} PRIVATE ${test_link_libs})

  add_test(NAME ${test_name} COMMAND ${test_name})
endforeach()
# TODO (disabled): discover the individual test cases instead of one CTest
# entry per binary.
# gtest_discover_tests(apiexample_test baseapi_test baseapi_thread_test)
# add_test(baseapi_gtests baseapi_test.cc)

View File

@ -82,6 +82,9 @@ To run the tests, do the following in tesseract folder
```
autoreconf -fiv
git submodule update --init
git clone https://github.com/egorpugin/tessdata tessdata_unittest --depth 1
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
export TESSDATA_PREFIX=/prefix/to/path/to/tessdata
make check
```

View File

@ -22,7 +22,6 @@
#include "include_gunit.h"
#include "matrix.h"
#include "simddetect.h"
#include "tprintf.h"
namespace tesseract {

View File

@ -19,7 +19,6 @@
#include "include_gunit.h"
#include "helpers.h"
#include "tprintf.h"
#include "functions.h"
#include "lang_model_helpers.h"

@ -1 +1 @@
Subproject commit b514bdc898e2951020cbdca1304b75f5950d1f59
Subproject commit 7d76a231b0e29caf86e68d1df858308cd53b2a66

View File

@ -16,7 +16,6 @@
#include "include_gunit.h"
#include "log.h" // for LOG
#include "serialis.h"
#include "tprintf.h"
#include "unicharcompress.h"
namespace tesseract {