Add support for image or image list by URL

This allows OCR of images from the internet without downloading them first:

    tesseract http://IMAGE_URL OUTPUT ...

It uses libcurl.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
2024-11-23 18:49:08 +08:00 · 2018-12-27 10:15:59 +01:00 · 2018-12-27 10:15:59 +01:00 · 286d8275c7
commit 286d8275c7
parent da0fa73e77
3 changed files with 53 additions and 3 deletions
--- a/configure.ac
+++ b/configure.ac
@ -428,6 +428,14 @@ AC_CHECK_TYPES([mbstate_t],,, [#include "wchar.h"])
 # Test auxiliary packages
 # ----------------------------------------

+AM_CONDITIONAL([HAVE_LIBCURL], false)  # Start with HAVE_LIBCURL defined as false.
+PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false])  # Look for the libcurl module; have_libcurl records the result. Presumably this also provides the libcurl_CFLAGS and libcurl_LIBS used in src/api/Makefile.am - verify.
+if $have_libcurl; then  # have_libcurl holds the word true or false, so it is run directly as the test command.
+  AM_CONDITIONAL([HAVE_LIBCURL], true)  # libcurl found: src/api/Makefile.am adds -DHAVE_LIBCURL under this conditional.
+else
+  AM_CONDITIONAL([HAVE_LIBCURL], false)  # libcurl missing: the build continues without URL support.
+fi  # NOTE(review): the three AM_CONDITIONAL calls look reducible to one AM_CONDITIONAL([HAVE_LIBCURL], [$have_libcurl]) - confirm against the Automake manual before changing.
+
 PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false])
 if $have_lept; then
  CPPFLAGS="$CPPFLAGS $LEPTONICA_CFLAGS"
--- a/src/api/Makefile.am
+++ b/src/api/Makefile.am
@ -32,6 +32,9 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
 if VISIBILITY
 libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
 endif
+if HAVE_LIBCURL
+libtesseract_api_la_CPPFLAGS += $(libcurl_CFLAGS) -DHAVE_LIBCURL
+endif
 libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
 libtesseract_api_la_SOURCES += altorenderer.cpp
 libtesseract_api_la_SOURCES += hocrrenderer.cpp
@ -42,6 +45,7 @@ libtesseract_api_la_SOURCES += renderer.cpp

 lib_LTLIBRARIES += libtesseract.la
 libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
+libtesseract_la_LDFLAGS += $(libcurl_LIBS)
 libtesseract_la_LDFLAGS += $(TENSORFLOW_LIBS)
 libtesseract_la_SOURCES =
 # Dummy C++ source to cause C++ linking.
@ -94,6 +98,7 @@ tesseract_LDADD += $(LEPTONICA_LIBS)
 tesseract_LDADD += $(OPENMP_CXXFLAGS)
 tesseract_LDADD += $(TENSORFLOW_LIBS)
 tesseract_LDADD += $(libarchive_LIBS)
+tesseract_LDADD += $(libcurl_LIBS)

 if T_WIN
 tesseract_LDADD += -ltiff
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@ -49,6 +49,9 @@
 #include <set>                 // for std::pair
 #include <sstream>             // for std::stringstream
 #include <vector>              // for std::vector
+#ifdef HAVE_LIBCURL
+#include <curl/curl.h>
+#endif
 #include "allheaders.h"        // for pixDestroy, boxCreate, boxaAddBox, box...
 #ifndef DISABLED_LEGACY_ENGINE
 #include "blobclass.h"         // for ExtractFontName
@ -1081,6 +1084,15 @@ bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
  return result;
 }

+static size_t  // Write callback registered with libcurl through CURLOPT_WRITEFUNCTION: collects downloaded bytes in a std::string.
+WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)  // contents: chunk to store; size * nmemb: its length; userp: the std::string* registered as CURLOPT_WRITEDATA.
+{
+  size = size * nmemb;  // Reuse `size` as the total length of this chunk, as passed to append() below.
+  std::string* buf = reinterpret_cast<std::string*>(userp);  // Recover the destination buffer from the opaque user pointer.
+  buf->append(reinterpret_cast<const char*>(contents), size);  // Explicit length, so embedded NUL bytes in image data are kept. NOTE(review): append may throw (e.g. std::bad_alloc) while called from libcurl's C code - confirm this is acceptable.
+  return size;  // Return the number of bytes appended, i.e. the whole chunk.
+}
+
 // In the ideal scenario, Tesseract will start working on data as soon
 // as it can. For example, if you stream a filelist through stdin, we
 // should start the OCR process as soon as the first filename is
@ -1119,6 +1131,31 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
    buf.assign((std::istreambuf_iterator<char>(std::cin)),
               (std::istreambuf_iterator<char>()));
    data = reinterpret_cast<const l_uint8 *>(buf.data());
+  } else if (strncmp(filename, "http:", 5) == 0 ||
+             strncmp(filename, "https:", 6) == 0 ) {
+    // Get image or image list by URL.
+#ifdef HAVE_LIBCURL
+    CURL* curl = curl_easy_init();
+    if (curl ==  nullptr) {
+      fprintf(stderr, "Error, curl_easy_init failed\n");
+      return false;
+    } else {
+      CURLcode curlcode;
+      curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
+      ASSERT_HOST(curlcode == CURLE_OK);
+      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+      ASSERT_HOST(curlcode == CURLE_OK);
+      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
+      ASSERT_HOST(curlcode == CURLE_OK);
+      curlcode = curl_easy_perform(curl);
+      ASSERT_HOST(curlcode == CURLE_OK);
+      curl_easy_cleanup(curl);
+      data = reinterpret_cast<const l_uint8 *>(buf.data());
+    }
+#else
+    fprintf(stderr, "Error, this tesseract has no URL support\n");
+    return false;
+#endif
  } else {
    // Check whether the input file can be read.
    if (FILE* file = fopen(filename, "rb")) {
@ -1132,14 +1169,14 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,

  // Here is our autodetection
  int format;
-  int r = (stdInput) ?
+  int r = (data != nullptr) ?
      findFileFormatBuffer(data, &format) :
      findFileFormat(filename, &format);

  // Maybe we have a filelist
  if (r != 0 || format == IFF_UNKNOWN) {
    STRING s;
-    if (stdInput) {
+    if (data != nullptr) {
      s = buf.c_str();
    } else {
      std::ifstream t(filename);
@ -1164,7 +1201,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
  // Fail early if we can, before producing any output
  Pix *pix = nullptr;
  if (!tiff) {
-    pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
+    pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
    if (pix == nullptr) {
      return false;
    }