mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 15:03:45 +08:00
Add support for image or image list by URL
This allows OCR of images from the internet without downloading them first: `tesseract http://IMAGE_URL OUTPUT ...`. It uses libcurl. Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
190536bbd7
commit
ca172592da
@ -428,6 +428,14 @@ AC_CHECK_TYPES([mbstate_t],,, [#include "wchar.h"])
|
||||
# Test auxiliary packages
|
||||
# ----------------------------------------
|
||||
|
||||
# Optional libcurl support (used to fetch an image or image list by URL).
# AM_CONDITIONAL has to be run unconditionally on every configure run, so it
# is invoked exactly once, outside of any shell `if`, with the outcome of the
# pkg-config check as its condition.
PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false])
AM_CONDITIONAL([HAVE_LIBCURL], [$have_libcurl])
|
||||
|
||||
PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false])
|
||||
if $have_lept; then
|
||||
CPPFLAGS="$CPPFLAGS $LEPTONICA_CFLAGS"
|
||||
|
@ -32,6 +32,9 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
|
||||
if VISIBILITY
|
||||
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
|
||||
endif
|
||||
if HAVE_LIBCURL
|
||||
libtesseract_api_la_CPPFLAGS += $(libcurl_CFLAGS) -DHAVE_LIBCURL
|
||||
endif
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
|
||||
libtesseract_api_la_SOURCES += altorenderer.cpp
|
||||
libtesseract_api_la_SOURCES += hocrrenderer.cpp
|
||||
@ -42,6 +45,7 @@ libtesseract_api_la_SOURCES += renderer.cpp
|
||||
|
||||
lib_LTLIBRARIES += libtesseract.la
|
||||
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
|
||||
libtesseract_la_LDFLAGS += $(libcurl_LIBS)
|
||||
libtesseract_la_LDFLAGS += $(TENSORFLOW_LIBS)
|
||||
libtesseract_la_SOURCES =
|
||||
# Dummy C++ source to cause C++ linking.
|
||||
@ -94,6 +98,7 @@ tesseract_LDADD += $(LEPTONICA_LIBS)
|
||||
tesseract_LDADD += $(OPENMP_CXXFLAGS)
|
||||
tesseract_LDADD += $(TENSORFLOW_LIBS)
|
||||
tesseract_LDADD += $(libarchive_LIBS)
|
||||
tesseract_LDADD += $(libcurl_LIBS)
|
||||
|
||||
if T_WIN
|
||||
tesseract_LDADD += -ltiff
|
||||
|
@ -49,6 +49,9 @@
|
||||
#include <set> // for std::pair
|
||||
#include <sstream> // for std::stringstream
|
||||
#include <vector> // for std::vector
|
||||
#ifdef HAVE_LIBCURL
|
||||
#include <curl/curl.h>
|
||||
#endif
|
||||
#include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box...
|
||||
#ifndef DISABLED_LEGACY_ENGINE
|
||||
#include "blobclass.h" // for ExtractFontName
|
||||
@ -1084,6 +1087,15 @@ bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
|
||||
return result;
|
||||
}
|
||||
|
||||
// Write callback for libcurl (installed with CURLOPT_WRITEFUNCTION).
//
// contents: start of the chunk of data that was just received.
// size, nmemb: the chunk is size * nmemb bytes long.
// userp: pointer registered with CURLOPT_WRITEDATA; it must point to the
//        std::string which collects the downloaded data.
//
// Returns the number of bytes appended to the string. libcurl treats any
// other value than size * nmemb as a write error and aborts the transfer,
// so 0 is returned when the string could not be extended.
static size_t
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
  const size_t chunk_size = size * nmemb;
  std::string* buf = static_cast<std::string*>(userp);
  try {
    buf->append(static_cast<const char*>(contents), chunk_size);
  } catch (...) {
    // This function is called from C code inside libcurl. A C++ exception
    // (e.g. std::bad_alloc) must not propagate through those frames, so
    // report the failure through the return value instead.
    return 0;
  }
  return chunk_size;
}
|
||||
|
||||
// In the ideal scenario, Tesseract will start working on data as soon
|
||||
// as it can. For example, if you stream a filelist through stdin, we
|
||||
// should start the OCR process as soon as the first filename is
|
||||
@ -1122,6 +1134,31 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
|
||||
buf.assign((std::istreambuf_iterator<char>(std::cin)),
|
||||
(std::istreambuf_iterator<char>()));
|
||||
data = reinterpret_cast<const l_uint8 *>(buf.data());
|
||||
} else if (strncmp(filename, "http:", 5) == 0 ||
|
||||
strncmp(filename, "https:", 6) == 0 ) {
|
||||
// Get image or image list by URL.
|
||||
#ifdef HAVE_LIBCURL
|
||||
CURL* curl = curl_easy_init();
|
||||
if (curl == nullptr) {
|
||||
fprintf(stderr, "Error, curl_easy_init failed\n");
|
||||
return false;
|
||||
} else {
|
||||
CURLcode curlcode;
|
||||
curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
|
||||
ASSERT_HOST(curlcode == CURLE_OK);
|
||||
curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
|
||||
ASSERT_HOST(curlcode == CURLE_OK);
|
||||
curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
|
||||
ASSERT_HOST(curlcode == CURLE_OK);
|
||||
curlcode = curl_easy_perform(curl);
|
||||
ASSERT_HOST(curlcode == CURLE_OK);
|
||||
curl_easy_cleanup(curl);
|
||||
data = reinterpret_cast<const l_uint8 *>(buf.data());
|
||||
}
|
||||
#else
|
||||
fprintf(stderr, "Error, this tesseract has no URL support\n");
|
||||
return false;
|
||||
#endif
|
||||
} else {
|
||||
// Check whether the input file can be read.
|
||||
if (FILE* file = fopen(filename, "rb")) {
|
||||
@ -1135,14 +1172,14 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
|
||||
|
||||
// Here is our autodetection
|
||||
int format;
|
||||
int r = (stdInput) ?
|
||||
int r = (data != nullptr) ?
|
||||
findFileFormatBuffer(data, &format) :
|
||||
findFileFormat(filename, &format);
|
||||
|
||||
// Maybe we have a filelist
|
||||
if (r != 0 || format == IFF_UNKNOWN) {
|
||||
STRING s;
|
||||
if (stdInput) {
|
||||
if (data != nullptr) {
|
||||
s = buf.c_str();
|
||||
} else {
|
||||
std::ifstream t(filename);
|
||||
@ -1167,7 +1204,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
|
||||
// Fail early if we can, before producing any output
|
||||
Pix *pix = nullptr;
|
||||
if (!tiff) {
|
||||
pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
|
||||
pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
|
||||
if (pix == nullptr) {
|
||||
return false;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user