From 286d8275c783062057d09bb8e5e6607a8917abd9 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Thu, 27 Dec 2018 10:15:59 +0100 Subject: [PATCH] Add support for image or image list by URL This allows OCR of images from the internet without downloading them first: tesseract http://IMAGE_URL OUTPUT ... It uses libcurl. Signed-off-by: Stefan Weil --- configure.ac | 8 ++++++++ src/api/Makefile.am | 5 +++++ src/api/baseapi.cpp | 43 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 9d12c60e..236a3224 100644 --- a/configure.ac +++ b/configure.ac @@ -428,6 +428,14 @@ AC_CHECK_TYPES([mbstate_t],,, [#include "wchar.h"]) # Test auxiliary packages # ---------------------------------------- +AM_CONDITIONAL([HAVE_LIBCURL], false) +PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false]) +if $have_libcurl; then + AM_CONDITIONAL([HAVE_LIBCURL], true) +else + AM_CONDITIONAL([HAVE_LIBCURL], false) +fi + PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false]) if $have_lept; then CPPFLAGS="$CPPFLAGS $LEPTONICA_CFLAGS" diff --git a/src/api/Makefile.am b/src/api/Makefile.am index cb1803de..2d4a79b0 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -32,6 +32,9 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS) if VISIBILITY libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS endif +if HAVE_LIBCURL +libtesseract_api_la_CPPFLAGS += $(libcurl_CFLAGS) -DHAVE_LIBCURL +endif libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp libtesseract_api_la_SOURCES += altorenderer.cpp libtesseract_api_la_SOURCES += hocrrenderer.cpp @@ -42,6 +45,7 @@ libtesseract_api_la_SOURCES += renderer.cpp lib_LTLIBRARIES += libtesseract.la libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS) +libtesseract_la_LDFLAGS += $(libcurl_LIBS) libtesseract_la_LDFLAGS += $(TENSORFLOW_LIBS) libtesseract_la_SOURCES = # Dummy C++ source to cause C++ linking. 
@@ -94,6 +98,7 @@ tesseract_LDADD += $(LEPTONICA_LIBS) tesseract_LDADD += $(OPENMP_CXXFLAGS) tesseract_LDADD += $(TENSORFLOW_LIBS) tesseract_LDADD += $(libarchive_LIBS) +tesseract_LDADD += $(libcurl_LIBS) if T_WIN tesseract_LDADD += -ltiff diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 3e16fe4c..4a316464 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -49,6 +49,9 @@ #include <utility> // for std::pair #include <sstream> // for std::stringstream #include <vector> // for std::vector +#ifdef HAVE_LIBCURL +#include <curl/curl.h> +#endif #include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box... #ifndef DISABLED_LEGACY_ENGINE #include "blobclass.h" // for ExtractFontName @@ -1081,6 +1084,15 @@ bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config, return result; } +static size_t +WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) +{ + size = size * nmemb; + std::string* buf = reinterpret_cast<std::string*>(userp); + buf->append(reinterpret_cast<const char*>(contents), size); + return size; +} + // In the ideal scenario, Tesseract will start working on data as soon // as it can. For example, if you stream a filelist through stdin, we // should start the OCR process as soon as the first filename is @@ -1119,6 +1131,31 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>())); data = reinterpret_cast<const l_uint8 *>(buf.data()); + } else if (strncmp(filename, "http:", 5) == 0 || + strncmp(filename, "https:", 6) == 0 ) { + // Get image or image list by URL. 
+#ifdef HAVE_LIBCURL + CURL* curl = curl_easy_init(); + if (curl == nullptr) { + fprintf(stderr, "Error, curl_easy_init failed\n"); + return false; + } else { + CURLcode curlcode; + curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename); + ASSERT_HOST(curlcode == CURLE_OK); + curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); + ASSERT_HOST(curlcode == CURLE_OK); + curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf); + ASSERT_HOST(curlcode == CURLE_OK); + curlcode = curl_easy_perform(curl); + ASSERT_HOST(curlcode == CURLE_OK); + curl_easy_cleanup(curl); + data = reinterpret_cast<const l_uint8 *>(buf.data()); + } +#else + fprintf(stderr, "Error, this tesseract has no URL support\n"); + return false; +#endif } else { // Check whether the input file can be read. if (FILE* file = fopen(filename, "rb")) { @@ -1132,14 +1169,14 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, // Here is our autodetection int format; - int r = (stdInput) ? + int r = (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format); // Maybe we have a filelist if (r != 0 || format == IFF_UNKNOWN) { STRING s; - if (stdInput) { + if (data != nullptr) { s = buf.c_str(); } else { std::ifstream t(filename); @@ -1164,7 +1201,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, // Fail early if we can, before producing any output Pix *pix = nullptr; if (!tiff) { - pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename); + pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename); if (pix == nullptr) { return false; }