Add support for image or image list by URL

This allows OCR of images from the internet without downloading them first:

    tesseract http://IMAGE_URL OUTPUT ...

It uses libcurl.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2018-12-27 10:15:59 +01:00 committed by zdenop
parent 190536bbd7
commit ca172592da
3 changed files with 53 additions and 3 deletions

View File

@ -428,6 +428,14 @@ AC_CHECK_TYPES([mbstate_t],,, [#include "wchar.h"])
# Test auxiliary packages
# ----------------------------------------
# libcurl is optional; it is only needed for reading an image or an image
# list from a URL. AM_CONDITIONAL must be invoked unconditionally on every
# configure run (never inside a shell "if"), so the result of the pkg-config
# check is passed to a single invocation as its shell condition.
PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false])
AM_CONDITIONAL([HAVE_LIBCURL], [$have_libcurl])
PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false])
if $have_lept; then
CPPFLAGS="$CPPFLAGS $LEPTONICA_CFLAGS"

View File

@ -32,6 +32,9 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
endif
# When configure found libcurl, compile the API library with libcurl's
# compiler flags and define HAVE_LIBCURL so that the sources can guard
# their libcurl-dependent code with #ifdef HAVE_LIBCURL.
if HAVE_LIBCURL
libtesseract_api_la_CPPFLAGS += $(libcurl_CFLAGS) -DHAVE_LIBCURL
endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES += altorenderer.cpp
libtesseract_api_la_SOURCES += hocrrenderer.cpp
@ -42,6 +45,7 @@ libtesseract_api_la_SOURCES += renderer.cpp
lib_LTLIBRARIES += libtesseract.la
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
libtesseract_la_LDFLAGS += $(libcurl_LIBS)
libtesseract_la_LDFLAGS += $(TENSORFLOW_LIBS)
libtesseract_la_SOURCES =
# Dummy C++ source to cause C++ linking.
@ -94,6 +98,7 @@ tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
tesseract_LDADD += $(TENSORFLOW_LIBS)
tesseract_LDADD += $(libarchive_LIBS)
tesseract_LDADD += $(libcurl_LIBS)
if T_WIN
tesseract_LDADD += -ltiff

View File

@ -49,6 +49,9 @@
#include <set> // for std::pair
#include <sstream> // for std::stringstream
#include <vector> // for std::vector
#ifdef HAVE_LIBCURL
#include <curl/curl.h>
#endif
#include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box...
#ifndef DISABLED_LEGACY_ENGINE
#include "blobclass.h" // for ExtractFontName
@ -1084,6 +1087,15 @@ bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
return result;
}
// Write callback for libcurl (CURLOPT_WRITEFUNCTION) which appends each
// received chunk to a std::string.
//
// contents: start of the received chunk (binary data, not NUL terminated).
// size, nmemb: the chunk is size * nmemb bytes long.
// userp: the pointer registered with CURLOPT_WRITEDATA; it must point to
//        the std::string which collects the data.
//
// Returns the number of bytes stored. libcurl treats any value which differs
// from the delivered byte count as a write error and aborts the transfer.
static size_t
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
  const size_t length = size * nmemb;
  if (length == 0) {
    // Nothing was delivered, so nothing has to be stored.
    return 0;
  }
  std::string* buf = static_cast<std::string*>(userp);
  try {
    buf->append(static_cast<const char*>(contents), length);
  } catch (...) {
    // This function is called from libcurl, which is C code. A C++ exception
    // (for example std::bad_alloc for a huge download) must not propagate
    // through its stack frames, so report a short write instead.
    return 0;
  }
  return length;
}
// In the ideal scenario, Tesseract will start working on data as soon
// as it can. For example, if you stream a filelist through stdin, we
// should start the OCR process as soon as the first filename is
@ -1122,6 +1134,31 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
buf.assign((std::istreambuf_iterator<char>(std::cin)),
(std::istreambuf_iterator<char>()));
data = reinterpret_cast<const l_uint8 *>(buf.data());
} else if (strncmp(filename, "http:", 5) == 0 ||
strncmp(filename, "https:", 6) == 0 ) {
// Get image or image list by URL.
#ifdef HAVE_LIBCURL
CURL* curl = curl_easy_init();
if (curl == nullptr) {
fprintf(stderr, "Error, curl_easy_init failed\n");
return false;
} else {
CURLcode curlcode;
curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
ASSERT_HOST(curlcode == CURLE_OK);
curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
ASSERT_HOST(curlcode == CURLE_OK);
curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
ASSERT_HOST(curlcode == CURLE_OK);
curlcode = curl_easy_perform(curl);
ASSERT_HOST(curlcode == CURLE_OK);
curl_easy_cleanup(curl);
data = reinterpret_cast<const l_uint8 *>(buf.data());
}
#else
fprintf(stderr, "Error, this tesseract has no URL support\n");
return false;
#endif
} else {
// Check whether the input file can be read.
if (FILE* file = fopen(filename, "rb")) {
@ -1135,14 +1172,14 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
// Here is our autodetection
int format;
int r = (stdInput) ?
int r = (data != nullptr) ?
findFileFormatBuffer(data, &format) :
findFileFormat(filename, &format);
// Maybe we have a filelist
if (r != 0 || format == IFF_UNKNOWN) {
STRING s;
if (stdInput) {
if (data != nullptr) {
s = buf.c_str();
} else {
std::ifstream t(filename);
@ -1167,7 +1204,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
// Fail early if we can, before producing any output
Pix *pix = nullptr;
if (!tiff) {
pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
if (pix == nullptr) {
return false;
}