From 280db06bbf460ded786508b1d47f272845d688c3 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 8 Jun 2018 15:27:36 +0200 Subject: [PATCH 1/2] scanutils: Fix illegal memory access Format strings which contain "%*s" show this error in Valgrind: ==32503== Conditional jump or move depends on uninitialised value(s) ==32503== at 0x2B8BB0: tvfscanf(_IO_FILE*, char const*, __va_list_tag*) (scanutils.cpp:486) ==32503== by 0x2B825A: tfscanf(_IO_FILE*, char const*, ...) (scanutils.cpp:234) ==32503== by 0x272B01: read_unlv_file(STRING, int, int, BLOCK_LIST*) (blread.cpp:54) ==32503== by 0x1753CD: tesseract::Tesseract::SegmentPage(STRING const*, BLOCK_LIST*, tesseract::Tesseract*, OSResults*) (pagesegmain.cpp:115) ==32503== by 0x1363CD: tesseract::TessBaseAPI::FindLines() (baseapi.cpp:2291) ==32503== by 0x130CF1: tesseract::TessBaseAPI::Recognize(ETEXT_DESC*) (baseapi.cpp:802) ==32503== by 0x1322D3: tesseract::TessBaseAPI::ProcessPage(Pix*, int, char const*, char const*, int, tesseract::TessResultRenderer*) (baseapi.cpp:1176) ==32503== by 0x131A84: tesseract::TessBaseAPI::ProcessPagesMultipageTiff(unsigned char const*, unsigned long, char const*, char const*, int, tesseract::TessResultRenderer*, int) (baseapi.cpp:1013) ==32503== by 0x132052: tesseract::TessBaseAPI::ProcessPagesInternal(char const*, char const*, int, tesseract::TessResultRenderer*) (baseapi.cpp:1129) ==32503== by 0x131B1E: tesseract::TessBaseAPI::ProcessPages(char const*, char const*, int, tesseract::TessResultRenderer*) (baseapi.cpp:1032) ==32503== by 0x12E00C: main (tesseractmain.cpp:537) ==32503== Uninitialised value was created by a stack allocation ==32503== at 0x272A60: read_unlv_file(STRING, int, int, BLOCK_LIST*) (blread.cpp:41) Signed-off-by: Stefan Weil --- src/ccutil/scanutils.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/ccutil/scanutils.cpp b/src/ccutil/scanutils.cpp index f47546ae..aeb16707 100644 --- a/src/ccutil/scanutils.cpp +++ b/src/ccutil/scanutils.cpp 
@@ -472,8 +472,10 @@ static int tvfscanf(FILE* stream, const char *format, va_list ap) { case 's': // String { - char *sp; - sp = sarg = va_arg(ap, char *); + if (!(flags & FL_SPLAT)) { + sarg = va_arg(ap, char *); + } + char *sp = sarg; while (width--) { q = fgetc(stream); if (isspace(static_cast(q)) || q <= 0) { @@ -488,7 +490,6 @@ static int tvfscanf(FILE* stream, const char *format, va_list ap) { } else if (!(flags & FL_SPLAT)) { *sp = '\0'; // Terminate output converted++; - } else { } } break; From 3292484f67af8bdda23aa5e510918d0115785291 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Fri, 8 Jun 2018 17:38:18 +0200 Subject: [PATCH 2/2] Test for correct locale settings Normal C++ programs, like those built for tesseract, automatically set the locale "C". The locale settings can be different if the tesseract library is used in other software. A wrong locale can cause wrong results from sscanf, which is used in several places in the tesseract code, so make sure that we have the right locale settings and fail if that is not the case. Signed-off-by: Stefan Weil --- src/api/baseapi.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 82e3e33e..b17fe2d2 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -43,6 +43,7 @@ #endif // _WIN32 #include +#include #include #include #include @@ -185,7 +186,15 @@ TessBaseAPI::TessBaseAPI() rect_width_(0), rect_height_(0), image_width_(0), - image_height_(0) {} + image_height_(0) { + const char *locale; + locale = std::setlocale(LC_ALL, nullptr); + ASSERT_HOST(!strcmp(locale, "C")); + locale = std::setlocale(LC_CTYPE, nullptr); + ASSERT_HOST(!strcmp(locale, "C")); + locale = std::setlocale(LC_NUMERIC, nullptr); + ASSERT_HOST(!strcmp(locale, "C")); +} TessBaseAPI::~TessBaseAPI() { End();