Enabled streaming input and output of multi-page documents

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1105 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2014-05-21 15:46:21 +00:00
parent 30e5220f2e
commit 25a8c7b720
6 changed files with 359 additions and 447 deletions

View File

@ -41,6 +41,11 @@
#include <string.h>
#endif // _WIN32
#include <iostream>
#include <string>
#include <iterator>
#include <fstream>
#if !defined(VERSION)
#include "version.h"
#endif
@ -73,6 +78,8 @@
#include "strngs.h"
#include "openclwrapper.h"
BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin");
namespace tesseract {
/** Minimum sensible image size to be worth running tesseract. */
@ -536,9 +543,10 @@ void TessBaseAPI::SetSourceResolution(int ppi) {
* Because of that, an implementation that sources and targets Pix may end up
* with fewer copies than an implementation that does not.
*/
void TessBaseAPI::SetImage(const Pix* pix) {
void TessBaseAPI::SetImage(Pix* pix) {
if (InternalSetImage())
thresholder_->SetImage(pix);
SetInputImage(pix);
}
/**
@ -681,7 +689,8 @@ Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level,
if (pixa != NULL) {
Pix* pix = NULL;
if (raw_image) {
pix = page_it->GetImage(level, raw_padding, &left, &top);
pix = page_it->GetImage(level, raw_padding, input_image_,
&left, &top);
} else {
pix = page_it->GetBinaryImage(level);
}
@ -907,50 +916,12 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
return 0;
}
/**
* Recognizes all the pages in the named file, as a multi-page tiff or
* list of filenames, or single image, and gets the appropriate kind of text
* according to parameters: tessedit_create_boxfile,
* tessedit_make_boxes_from_boxes, tessedit_write_unlv, tessedit_create_hocr.
* Calls ProcessPage on each page in the input file, which may be a
* multi-page tiff, single-page other file format, or a plain text list of
* images to read. If tessedit_page_number is non-negative, processing begins
* at that page of a multi-page tiff file, or filelist.
* The text is returned in text_out. Returns false on error.
* If non-zero timeout_millisec terminates processing after the timeout on
* a single page.
* If non-NULL and non-empty, and some page fails for some reason,
* the page is reprocessed with the retry_config config file. Useful
* for interactively debugging a bad page.
*/
bool TessBaseAPI::ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
STRING* text_out) {
TessResultRenderer* renderer = NewRenderer();
if (!ProcessPages(filename, retry_config, timeout_millisec, renderer)) {
delete renderer;
return false;
}
const char* out_data;
inT32 out_len;
bool success = renderer->GetOutput(&out_data, &out_len);
if (success) {
// TODO(ewiseblatt): 20111103
// if text_out->size() != out_len then we have binary data which STRING wont
// support so this should fail. Really want to eliminate this interface
// alltogether so not worrying about at this time.
text_out->assign(out_data, out_len);
}
delete renderer;
return success;
}
void TessBaseAPI::SetInputImage(Pix *pix) {
if (input_image_)
pixDestroy(&input_image_);
input_image_ = pixClone(pix);
input_image_ = NULL;
if (pix)
input_image_ = pixClone(pix);
}
Pix* TessBaseAPI::GetInputImage() {
@ -971,168 +942,213 @@ int TessBaseAPI::GetSourceYResolution() {
return thresholder_->GetSourceYResolution();
}
// If flist exists, get data from there. Otherwise get data from buf.
// Seems convoluted, but is the easiest way I know of to meet multiple
// goals. Support streaming from stdin, and also work on platforms
// lacking fmemopen.
bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
                                       STRING *buf,
                                       const char* retry_config,
                                       int timeout_millisec,
                                       TessResultRenderer* renderer,
                                       int tessedit_page_number) {
  // Filenames come either from the stream flist (one per line) or, when flist
  // is NULL, from the newline-separated text in buf. At least one is required.
  if (flist == NULL && buf == NULL) return false;

  // A non-negative tessedit_page_number selects exactly one entry of the list.
  const bool single_page = tessedit_page_number >= 0;
  int page = single_page ? tessedit_page_number : 0;
  char pagename[MAX_PATH];

  GenericVector<STRING> lines;
  if (flist == NULL) {
    buf->split('\n', &lines);
    if (lines.empty()) return false;
  }

  // Skip to the requested page number. Only the stream has to be consumed;
  // the in-memory list is simply indexed by page further down.
  if (flist != NULL) {
    for (int skipped = 0; skipped < page; ++skipped) {
      if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
    }
  }

  // Begin producing output
  const char* kUnknownTitle = "";
  if (renderer != NULL && !renderer->BeginDocument(kUnknownTitle)) {
    return false;
  }

  // Loop over all pages - or just the requested one
  for (;;) {
    if (flist != NULL) {
      if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
    } else {
      if (page >= lines.size()) break;
      snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
    }
    chomp_string(pagename);

    Pix *pix = pixRead(pagename);
    if (pix == NULL) {
      tprintf("Image file %s cannot be read!\n", pagename);
      return false;
    }
    tprintf("Page %d : %s\n", page, pagename);
    const bool page_ok = ProcessPage(pix, page, pagename, retry_config,
                                     timeout_millisec, renderer);
    pixDestroy(&pix);
    if (!page_ok) return false;
    if (single_page) break;
    ++page;
  }

  // Finish producing output
  return renderer == NULL || renderer->EndDocument();
}
bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
                                            size_t size,
                                            const char* filename,
                                            const char* retry_config,
                                            int timeout_millisec,
                                            TessResultRenderer* renderer,
                                            int tessedit_page_number) {
  // Decodes pages one at a time from the in-memory TIFF in data/size and runs
  // ProcessPage on each. A non-negative tessedit_page_number restricts the
  // run to that single page. Returns false as soon as a page fails.
  const bool single_page = tessedit_page_number >= 0;
#ifdef USE_OPENCL
  OpenclDevice od;
#endif
  int page = single_page ? tessedit_page_number : 0;
  while (true) {
    Pix *pix = NULL;
#ifdef USE_OPENCL
    if (od.selectedDeviceIsOpenCL()) {
      // FIXME(jbreiden) Not implemented.
      pix = od.pixReadMemTiffCl(data, size, page);
    } else {
      pix = pixReadMemTiff(data, size, page);
    }
#else
    pix = pixReadMemTiff(data, size, page);
#endif
    // A NULL image ends the loop; it is not reported as an error here.
    if (pix == NULL) break;

    tprintf("Page %d\n", page + 1);
    char page_str[kMaxIntSize];
    snprintf(page_str, kMaxIntSize - 1, "%d", page);
    SetVariable("applybox_page", page_str);

    const bool page_ok = ProcessPage(pix, page, filename, retry_config,
                                     timeout_millisec, renderer);
    pixDestroy(&pix);
    if (!page_ok) return false;
    if (single_page) break;
    ++page;
  }
  return true;
}
// In the ideal scenario, Tesseract will start working on data as soon
// as it can. For example, if you stream a filelist through stdin, we
// should start the OCR process as soon as the first filename is
// available. This is particularly useful when hooking Tesseract up to
// slow hardware such as a book scanning machine.
//
// Unfortunately there are tradeoffs. You can't seek on stdin. That
// makes automatic detection of datatype (TIFF? filelist? PNG?)
// impractical. So we support a command line flag to explicitly
// identify the scenario that really matters: filelists on
// stdin. We'll still do our best if the user likes pipes. That means
// piling up any data coming into stdin into a memory buffer.
bool TessBaseAPI::ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer) {
PERF_COUNT_START("ProcessPages")
int page = tesseract_->tessedit_page_number;
if (page < 0)
page = 0;
FILE* fp = fopen(filename, "rb");
if (fp == NULL) {
tprintf("Image file %s cannot be opened!\n", filename);
return false;
bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
if (stdInput) {
#ifdef WIN32
if (_setmode(_fileno(stdin), _O_BINARY) == -1)
tprintf("ERROR: cin to binary: %s", strerror(errno));
#endif // WIN32
}
// Find the number of pages if a tiff file, or zero otherwise.
int npages = 0;
int format;
Pix *pix;
pix = pixRead(filename);
format = pixGetInputFormat(pix);
if (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
format == IFF_TIFF_ZIP)
tiffGetCount(fp, &npages);
fclose(fp);
bool success = true;
if (stream_filelist) {
return ProcessPagesFileList(stdin, NULL, retry_config,
timeout_millisec, renderer,
tesseract_->tessedit_page_number);
}
// At this point we are officially in autodetection territory.
// That means we are going to buffer stdin so that it is
// seekable. To keep code simple we will also buffer data
// coming from a file.
std::string buf;
if (stdInput) {
buf.assign((std::istreambuf_iterator<char>(std::cin)),
(std::istreambuf_iterator<char>()));
} else {
std::ifstream ifs(filename);
buf.assign((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
}
// Here is our autodetection
int format;
const l_uint8 * data = reinterpret_cast<const l_uint8 *>(buf.c_str());
findFileFormatBuffer(data, &format);
// Maybe we have a filelist
if (format == IFF_UNKNOWN) {
STRING s(buf.c_str());
return ProcessPagesFileList(NULL, &s, retry_config,
timeout_millisec, renderer,
tesseract_->tessedit_page_number);
}
// Maybe we have a TIFF which is potentially multipage
bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
format == IFF_TIFF_ZIP);
// Fail early if we can, before producing any output
Pix *pix = NULL;
if (!tiff) {
pix = pixReadMem(data, buf.size());
if (pix == NULL) {
return false;
}
}
// Begin the output
const char* kUnknownTitle = "";
if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
success = false;
}
#ifdef USE_OPENCL
OpenclDevice od;
#endif
if (npages > 0) {
pixDestroy(&pix);
for (; page < npages; ++page) {
// only use opencl if compiled w/ OpenCL and selected device is opencl
#ifdef USE_OPENCL
if ( od.selectedDeviceIsOpenCL() ) {
pix = od.pixReadTiffCl(filename, page);
} else {
#endif
pix = pixReadTiff(filename, page);
#ifdef USE_OPENCL
}
#endif
if (pix == NULL) break;
if ((page >= 0) && (npages > 1))
tprintf("Page %d of %d\n", page + 1, npages);
char page_str[kMaxIntSize];
snprintf(page_str, kMaxIntSize - 1, "%d", page);
SetVariable("applybox_page", page_str);
success &= ProcessPage(pix, page, filename, retry_config,
timeout_millisec, renderer);
pixDestroy(&pix);
if (tesseract_->tessedit_page_number >= 0 || npages == 1) {
break;
}
}
} else {
// The file is not a tiff file.
if (pix != NULL) {
success &= ProcessPage(pix, 0, filename, retry_config,
timeout_millisec, renderer);
pixDestroy(&pix);
} else {
// The file is not an image file, so try it as a list of filenames.
FILE* fimg = fopen(filename, "rb");
if (fimg == NULL) {
tprintf("File %s cannot be opened!\n", filename);
return false;
}
tprintf("Reading %s as a list of filenames...\n", filename);
char pagename[MAX_PATH];
// Skip to the requested page number.
for (int i = 0; i < page &&
fgets(pagename, sizeof(pagename), fimg) != NULL;
++i);
while (fgets(pagename, sizeof(pagename), fimg) != NULL) {
chomp_string(pagename);
pix = pixRead(pagename);
if (pix == NULL) {
tprintf("Image file %s cannot be read!\n", pagename);
fclose(fimg);
return false;
}
tprintf("Page %d : %s\n", page, pagename);
success &= ProcessPage(pix, page, pagename, retry_config,
timeout_millisec, renderer);
pixDestroy(&pix);
++page;
}
fclose(fimg);
}
return false;
}
bool all_ok = success;
if (renderer && !renderer->EndDocument()) {
all_ok = false;
// Produce output
bool r = false;
if (tiff) {
r = ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
timeout_millisec, renderer,
tesseract_->tessedit_page_number);
} else {
r = ProcessPage(pix, 0, filename, retry_config,
timeout_millisec, renderer);
pixDestroy(&pix);
}
// End the output
if (!r || (renderer && !renderer->EndDocument())) {
return false;
}
PERF_COUNT_END
return all_ok;
}
/**
* Recognizes a single page for ProcessPages, appending the text to text_out.
* The pix is the image processed - filename and page_index are metadata
* used by side-effect processes, such as reading a box file or formatting
* as hOCR.
* If non-zero timeout_millisec terminates processing after the timeout.
* If non-NULL and non-empty, and some page fails for some reason,
* the page is reprocessed with the retry_config config file. Useful
* for interactively debugging a bad page.
* The text is returned in text_out. Returns false on error.
*/
bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
                              const char* retry_config, int timeout_millisec,
                              STRING* text_out) {
  // Legacy string-returning overload: recognizes one page through a temporary
  // renderer from NewRenderer() and stores the rendered bytes in text_out.
  // Returns false if recognition fails or the renderer has no output.
  TessResultRenderer* renderer = NewRenderer();
  bool success = ProcessPage(pix, page_index, filename, retry_config,
                             timeout_millisec, renderer);
  if (success) {
    const char* out_data;
    inT32 out_len;
    success = renderer->GetOutput(&out_data, &out_len);
    if (success) {
      // TODO(ewiseblatt): 20111103
      // If text_out->size() != out_len then the output is binary data, which
      // STRING won't support, so this should fail. The real goal is to
      // eliminate this interface altogether, so it is not handled for now.
      text_out->assign(out_data, out_len);
    }
  }
  // The renderer belongs to this function and must be released on every
  // path; out_data points into it, so the copy above has to happen first.
  delete renderer;
  return success;
}
/**
* Recognizes a single page for ProcessPages, appending the text to text_out.
* The pix is the image processed - filename and page_index are metadata
* used by side-effect processes, such as reading a box file or formatting
* as hOCR.
* If non-zero timeout_millisec terminates processing after the timeout.
* If non-NULL and non-empty, and some page fails for some reason,
* the page is reprocessed with the retry_config config file. Useful
* for interactively debugging a bad page.
* The text is returned in renderer. Returns false on error.
*/
bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer) {
PERF_COUNT_START("ProcessPage")
SetInputName(filename);
SetImage(pix);
SetInputImage(pix);
bool failed = false;
if (timeout_millisec > 0) {
// Running with a timeout.
@ -1174,12 +1190,8 @@ bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
ReadConfigFile(kOldVarsFile);
}
if (renderer) {
if (failed) {
renderer->AddError(this);
} else {
failed = !renderer->AddImage(this);
}
if (renderer && !failed) {
failed = !renderer->AddImage(this);
}
PERF_COUNT_END
return !failed;
@ -1782,6 +1794,7 @@ void TessBaseAPI::Clear() {
if (thresholder_ != NULL)
thresholder_->Clear();
ClearResults();
SetInputImage(NULL);
}
/**
@ -1826,6 +1839,10 @@ void TessBaseAPI::End() {
delete input_file_;
input_file_ = NULL;
}
if (input_image_ != NULL) {
pixDestroy(&input_image_);
input_image_ = NULL;
}
if (output_file_ != NULL) {
delete output_file_;
output_file_ = NULL;
@ -2554,22 +2571,6 @@ CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const {
return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext();
}
// Picks the renderer matching the legacy output variables and returns a newly
// allocated instance, which the caller must delete. Precedence is box text,
// then hOCR, then PDF, then UNLV, with plain text as the fallback.
TessResultRenderer* TessBaseAPI::NewRenderer() {
  if (tesseract_->tessedit_create_boxfile ||
      tesseract_->tessedit_make_boxes_from_boxes) {
    return new TessBoxTextRenderer();
  }
  if (tesseract_->tessedit_create_hocr) {
    return new TessHOcrRenderer();
  }
  if (tesseract_->tessedit_create_pdf) {
    return new TessPDFRenderer(tesseract_->datadir.c_str());
  }
  if (tesseract_->tessedit_write_unlv) {
    return new TessUnlvRenderer();
  }
  // The former second tessedit_create_boxfile test was unreachable, since the
  // first condition already covers that variable; it has been dropped.
  return new TessTextRenderer();
}
/** Escape a char string - remove <>&"' with HTML codes. */
void HOcrEscape(const char* text, STRING& ret) {

View File

@ -346,7 +346,7 @@ class TESS_API TessBaseAPI {
* Because of that, an implementation that sources and targets Pix may end up
* with fewer copies than an implementation that does not.
*/
void SetImage(const Pix* pix);
void SetImage(Pix* pix);
/**
* Set the resolution of the source image in pixels per inch so font size
@ -505,44 +505,40 @@ class TESS_API TessBaseAPI {
int RecognizeForChopTest(ETEXT_DESC* monitor);
/**
* Recognizes all the pages in the named file, as a multi-page tiff or
* list of filenames, or single image, and gets the appropriate kind of text
* according to parameters: tessedit_create_boxfile,
* tessedit_make_boxes_from_boxes, tessedit_write_unlv, tessedit_create_hocr.
* Calls ProcessPage on each page in the input file, which may be a
* multi-page tiff, single-page other file format, or a plain text list of
* images to read. If tessedit_page_number is non-negative, processing begins
* at that page of a multi-page tiff file, or filelist.
* The text is returned in text_out. Returns false on error.
* If non-zero timeout_millisec terminates processing after the timeout on
* a single page.
* If non-NULL and non-empty, and some page fails for some reason,
* the page is reprocessed with the retry_config config file. Useful
* for interactively debugging a bad page.
* Turns images into symbolic text.
*
* filename can point to a single image, a multi-page TIFF,
* or a plain text list of image filenames.
*
* retry_config is useful for debugging. If not NULL, you can fall
* back to an alternate configuration if a page fails for some
* reason.
*
* timeout_millisec terminates processing if any single page
* takes too long. Set to 0 for unlimited time.
*
* renderer is responsible for creating the output. For example,
* use the TessTextRenderer if you want plaintext output, or
* the TessPDFRenderer to produce searchable PDF.
*
* If tessedit_page_number is non-negative, will only process that
* single page. Works for multi-page tiff file, or filelist.
*
* Returns true if successful, false on error.
*/
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
STRING* text_out);
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
/**
* Recognizes a single page for ProcessPages, appending the text to text_out.
* The pix is the image processed - filename and page_index are metadata
* used by side-effect processes, such as reading a box file or formatting
* as hOCR.
* If non-zero timeout_millisec terminates processing after the timeout.
* If non-NULL and non-empty, and some page fails for some reason,
* the page is reprocessed with the retry_config config file. Useful
* for interactively debugging a bad page.
* The text is returned in text_out. Returns false on error.
* Turn a single image into symbolic text.
*
* The pix is the image processed. filename and page_index are
* metadata used by side-effect processes, such as reading a box
* file or formatting as hOCR.
*
* See ProcessPages for descriptions of other parameters.
*/
bool ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
STRING* text_out);
bool ProcessPage(Pix* pix, int page_index, const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
@ -852,16 +848,20 @@ class TESS_API TessBaseAPI {
/* @} */
private:
/**
* DEPRECATED
* Returns new renderer instance based on how tesseract was configured to
* render results using old API. This should be removed along with those
* attributes so that the renderer is just passed in rather than the
* old methods taking output strings.
*
* Caller must destroy result.
*/
TessResultRenderer* NewRenderer();
// A list of image filenames gets special consideration
bool ProcessPagesFileList(FILE *fp,
STRING *buf,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer,
int tessedit_page_number);
// TIFF supports multipage so gets special consideration
bool ProcessPagesMultipageTiff(const unsigned char *data,
size_t size,
const char* filename,
const char* retry_config,
int timeout_millisec,
TessResultRenderer* renderer,
int tessedit_page_number);
}; // class TessBaseAPI.
/** Escape a char string - remove &<>"' with HTML codes. */

View File

@ -32,8 +32,8 @@ const int kCharWidth = 2;
* PDF Renderer interface implementation
**********************************************************************/
TessPDFRenderer::TessPDFRenderer(const char *datadir)
: TessResultRenderer("PDF", "pdf") {
TessPDFRenderer::TessPDFRenderer(const char* outputbase, const char *datadir)
: TessResultRenderer(outputbase, "pdf") {
obj_ = 0;
datadir_ = datadir;
offsets_.push_back(0);
@ -440,20 +440,26 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum,
FILE *fp = fopen(filename, "rb");
if (!fp)
return false;
int format;
const char *filter;
int spp, w, h;
int cmyk = false;
int format;
findFileFormatStream(fp, &format);
if (format != IFF_JFIF_JPEG) {
fclose(fp);
return false;
switch(format) {
case IFF_JFIF_JPEG:
freadHeaderJpeg(fp, &w, &h, &spp, NULL, &cmyk);
filter = "/DCTDecode";
break;
case IFF_JP2:
freadHeaderJp2k(fp, &w, &h, &spp);
filter = "/JPXDecode";
break;
default:
fclose(fp);
return false;
}
fseek(fp, 0, SEEK_END);
long int jpeg_size = ftell(fp);
fseek(fp, 0, SEEK_SET);
int spp, cmyk, w, h;
freadHeaderJpeg(fp, &w, &h, &spp, NULL, &cmyk);
const char *colorspace;
switch (spp) {
case 1:
@ -472,6 +478,10 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum,
return false;
}
fseek(fp, 0, SEEK_END);
long int file_size = ftell(fp);
fseek(fp, 0, SEEK_SET);
// IMAGE
snprintf(b1, sizeof(b1),
"%ld 0 obj\n"
@ -482,10 +492,10 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum,
" /Width %d\n"
" /Height %d\n"
" /BitsPerComponent 8\n"
" /Filter /DCTDecode\n"
" /Filter %s\n"
">>\n"
"stream\n", objnum, jpeg_size,
colorspace, w, h);
"stream\n", objnum, file_size,
colorspace, w, h, filter);
size_t b1_len = strlen(b1);
snprintf(b2, sizeof(b2),
@ -494,17 +504,17 @@ bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum,
"endobj\n");
size_t b2_len = strlen(b2);
*pdf_object_size = b1_len + jpeg_size + b2_len;
*pdf_object_size = b1_len + file_size + b2_len;
*pdf_object = new char[*pdf_object_size];
if (!pdf_object)
return false;
memcpy(*pdf_object, b1, b1_len);
if (static_cast<int>(fread(*pdf_object + b1_len, 1, jpeg_size, fp)) !=
jpeg_size) {
if (static_cast<int>(fread(*pdf_object + b1_len, 1, file_size, fp)) !=
file_size) {
delete[] pdf_object;
return false;
}
memcpy(*pdf_object + b1_len + jpeg_size, b2, b2_len);
memcpy(*pdf_object + b1_len + file_size, b2, b2_len);
fclose(fp);
return true;
}

View File

@ -14,23 +14,30 @@
namespace tesseract {
// Start with a 4K output buffer which should be pretty big for a page of text
// though might need to grow for other formats or multi-page documents.
static const int kInitialAlloc = 1 << 12;
/**********************************************************************
* Base Renderer interface implementation
**********************************************************************/
TessResultRenderer::TessResultRenderer(const char* type, const char* extension)
: full_typename_(type), file_extension_(extension),
TessResultRenderer::TessResultRenderer(const char *outputbase,
const char* extension)
: file_extension_(extension),
title_(""), imagenum_(-1),
output_data_(NULL),
next_(NULL) {
ResetData();
fout_(stdout),
next_(NULL),
happy_(true) {
if (strcmp(outputbase, "-") && strcmp(outputbase, "stdout")) {
STRING outfile = STRING(outputbase) + STRING(".") + STRING(file_extension_);
fout_ = fopen(outfile.string(), "wb");
if (fout_ == NULL) {
happy_ = false;
}
}
}
TessResultRenderer::~TessResultRenderer() {
delete[] output_data_;
if (fout_ != stdout)
fclose(fout_);
else
clearerr(fout_);
delete next_;
}
@ -48,8 +55,7 @@ void TessResultRenderer::insert(TessResultRenderer* next) {
}
bool TessResultRenderer::BeginDocument(const char* title) {
ResetData();
if (!happy_) return false;
title_ = title;
imagenum_ = -1;
bool ok = BeginDocumentHandler();
@ -60,6 +66,7 @@ bool TessResultRenderer::BeginDocument(const char* title) {
}
bool TessResultRenderer::AddImage(TessBaseAPI* api) {
if (!happy_) return false;
++imagenum_;
bool ok = AddImageHandler(api);
if (next_) {
@ -68,16 +75,8 @@ bool TessResultRenderer::AddImage(TessBaseAPI* api) {
return ok;
}
// Records a failed image: advances the image counter, runs this renderer's
// error hook, then forwards to the next chained renderer if there is one.
// Returns true only when every renderer in the chain reported success.
bool TessResultRenderer::AddError(TessBaseAPI* api) {
  ++imagenum_;
  const bool handled = AddErrorHandler(api);
  if (next_ == NULL) {
    return handled;
  }
  // The chained renderer is always notified, even if this one failed.
  const bool chained = next_->AddError(api);
  return chained && handled;
}
bool TessResultRenderer::EndDocument() {
if (!happy_) return false;
bool ok = EndDocumentHandler();
if (next_) {
ok = next_->EndDocument() && ok;
@ -85,62 +84,29 @@ bool TessResultRenderer::EndDocument() {
return ok;
}
// Exposes the renderer's internal buffer without copying it: *data receives
// the buffer pointer and *data_len the number of bytes used. Always succeeds.
bool TessResultRenderer::GetOutput(const char** data, int* data_len) const {
  *data_len = output_len_;
  *data = output_data_;
  return true;
}
// Discards any accumulated output and starts over with a fresh, empty
// buffer of kInitialAlloc bytes.
void TessResultRenderer::ResetData() {
  delete[] output_data_;
  output_data_ = new char[kInitialAlloc];
  output_len_ = 0;
  output_alloc_ = kInitialAlloc;
}
// Ensures the output buffer can hold relative_len more bytes beyond what is
// already used. When growth is needed the capacity at least doubles, and the
// existing output_len_ bytes are carried over into the new buffer.
void TessResultRenderer::ReserveAdditionalData(int relative_len) {
  int total = relative_len + output_len_;
  if (total <= output_alloc_)
    return;

  // Grow geometrically so that repeated small appends stay cheap.
  if (total < 2 * output_alloc_) {
    total = 2 * output_alloc_;
  }

  char* new_data = new char[total];
  memcpy(new_data, output_data_, output_len_);
  delete[] output_data_;
  output_data_ = new_data;
  // Record the new capacity. Without this the stale output_alloc_ makes later
  // calls believe the buffer is still small and reallocate and copy again.
  output_alloc_ = total;
}
// Appends a '\0'-terminated string to the output; the terminator itself is
// not written.
void TessResultRenderer::AppendString(const char* s) {
  const int len = strlen(s);
  AppendData(s, len);
}
void TessResultRenderer::AppendData(const char* s, int len) {
ReserveAdditionalData(len);
memcpy(output_data_ + output_len_, s, len);
output_len_ += len;
int n = fwrite(s, 1, len, fout_);
if (n != len) happy_ = false;
}
bool TessResultRenderer::BeginDocumentHandler() {
return true;
}
bool TessResultRenderer::AddErrorHandler(TessBaseAPI* api) {
return true;
return happy_;
}
bool TessResultRenderer::EndDocumentHandler() {
return true;
return happy_;
}
/**********************************************************************
* UTF8 Text Renderer interface implementation
**********************************************************************/
TessTextRenderer::TessTextRenderer()
: TessResultRenderer("Text", "txt") {
TessTextRenderer::TessTextRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "txt") {
}
bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
@ -158,8 +124,8 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
/**********************************************************************
* HOcr Text Renderer interface implementation
**********************************************************************/
TessHOcrRenderer::TessHOcrRenderer()
: TessResultRenderer("HOcr", "hocr") {
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "hocr") {
}
bool TessHOcrRenderer::BeginDocumentHandler() {
@ -201,8 +167,8 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
/**********************************************************************
* UNLV Text Renderer interface implementation
**********************************************************************/
TessUnlvRenderer::TessUnlvRenderer()
: TessResultRenderer("UNLV", "unlv") {
TessUnlvRenderer::TessUnlvRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "unlv") {
}
bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) {
@ -218,8 +184,8 @@ bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) {
/**********************************************************************
* BoxText Renderer interface implementation
**********************************************************************/
TessBoxTextRenderer::TessBoxTextRenderer()
: TessResultRenderer("Box Text", "box") {
TessBoxTextRenderer::TessBoxTextRenderer(const char *outputbase)
: TessResultRenderer(outputbase, "box") {
}
bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) {

View File

@ -47,7 +47,7 @@ class TESS_API TessResultRenderer {
virtual ~TessResultRenderer();
// Takes ownership of pointer so must be new'd instance.
// Renderers arent ordered, but appends the sequences of next parameter
// Renderers aren't ordered, but appends the sequences of next parameter
// and existing next(). The renderers should be unique across both lists.
void insert(TessResultRenderer* next);
@ -70,23 +70,17 @@ class TESS_API TessResultRenderer {
*/
bool AddImage(TessBaseAPI* api);
/**
* Called to inform the renderer when tesseract failed on an image.
*/
bool AddError(TessBaseAPI* api);
/**
* Finishes the document and finalizes the output data
* Invalid if BeginDocument not yet called.
*/
bool EndDocument();
const char* full_typename() const { return full_typename_; }
const char* file_extension() const { return file_extension_; }
const char* title() const { return title_; }
/**
* Returns the index of the last image given to AddImage or AddError
* Returns the index of the last image given to AddImage
* (i.e. images are incremented whether the image succeeded or not)
*
* This is always defined. It means either the number of the
@ -96,20 +90,19 @@ class TESS_API TessResultRenderer {
*/
int imagenum() const { return imagenum_; }
/**
* The results are not defined if EndDocument has not yet been called.
* Returns the current output from the renderer. The data is owned by
* the renderer and only valid until the next call into the renderer
* that may modify document state (such as Begin/End Document
* or AddImage).
*/
virtual bool GetOutput(const char** data, int* data_len) const;
protected:
/**
* Called by concrete classes
* Called by concrete classes.
*
* outputbase is the name of the output file excluding
* extension. For example, "/path/to/chocolate-chip-cookie-recipe"
*
* extension indicates the file extension to be used for output
* files. For example "pdf" will produce a .pdf file, and "hocr"
* will produce .hocr files.
*/
TessResultRenderer(const char* type, const char* extension);
TessResultRenderer(const char *outputbase,
const char* extension);
// Hook for specialized handling in BeginDocument()
virtual bool BeginDocumentHandler();
@ -117,22 +110,9 @@ class TESS_API TessResultRenderer {
// This must be overridden to render the OCR'd results
virtual bool AddImageHandler(TessBaseAPI* api) = 0;
// The default handler ignores the error and just returns true
virtual bool AddErrorHandler(TessBaseAPI* api);
// Hook for specialized handling in EndDocument()
virtual bool EndDocumentHandler();
// Clear output data.
void ResetData();
// Renderers can call this method to allocate data storage in advance,
// which can cut down on allocations and copying. This isnt required,
// and if used can still request less than will ultimately be used without
// worrying about data corruption. It's purely performance.
// Note that relative_len is in addition to what is already being used.
void ReserveAdditionalData(int relative_len);
// Renderers can call this to append '\0' terminated strings into
// the output string returned by GetOutput.
// This method will grow the output buffer if needed.
@ -145,15 +125,13 @@ class TESS_API TessResultRenderer {
void AppendData(const char* s, int len);
private:
const char* full_typename_; // name of renderer
const char* file_extension_; // standard extension for generated output
const char* title_; // title of document being renderered
int imagenum_; // index of last image added
char* output_data_; // output bytes
int output_alloc_; // bytes allocated
int output_len_; // bytes actually used
TessResultRenderer* next_; // Can link multiple renderers together.
FILE* fout_; // output file pointer
TessResultRenderer* next_; // Can link multiple renderers together
bool happy_; // I get grumpy when the disk fills up, etc.
};
/**
@ -161,7 +139,7 @@ class TESS_API TessResultRenderer {
*/
class TESS_API TessTextRenderer : public TessResultRenderer {
public:
TessTextRenderer();
explicit TessTextRenderer(const char *outputbase);
protected:
virtual bool AddImageHandler(TessBaseAPI* api);
@ -172,7 +150,7 @@ class TESS_API TessTextRenderer : public TessResultRenderer {
*/
class TESS_API TessHOcrRenderer : public TessResultRenderer {
public:
TessHOcrRenderer();
explicit TessHOcrRenderer(const char *outputbase);
protected:
virtual bool BeginDocumentHandler();
@ -185,7 +163,9 @@ protected:
*/
class TESS_API TessPDFRenderer : public TessResultRenderer {
public:
TessPDFRenderer(const char *datadir);
// datadir is the location of the TESSDATA. We need it because
// we load a custom PDF font from this location.
TessPDFRenderer(const char *outputbase, const char *datadir);
protected:
virtual bool BeginDocumentHandler();
@ -224,7 +204,7 @@ private:
*/
class TESS_API TessUnlvRenderer : public TessResultRenderer {
public:
TessUnlvRenderer();
explicit TessUnlvRenderer(const char *outputbase);
protected:
virtual bool AddImageHandler(TessBaseAPI* api);
@ -235,7 +215,7 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
*/
class TESS_API TessBoxTextRenderer : public TessResultRenderer {
public:
TessBoxTextRenderer();
explicit TessBoxTextRenderer(const char *outputbase);
protected:
virtual bool AddImageHandler(TessBaseAPI* api);

View File

@ -85,7 +85,7 @@ int main(int argc, char **argv) {
// Make the order of args a bit more forgiving than it used to be.
const char* lang = "eng";
const char* image = NULL;
const char* output = NULL;
const char* outputbase = NULL;
const char* datapath = NULL;
bool noocr = false;
bool list_langs = false;
@ -94,7 +94,7 @@ int main(int argc, char **argv) {
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
int arg = 1;
while (arg < argc && (output == NULL || argv[arg][0] == '-')) {
while (arg < argc && (outputbase == NULL || argv[arg][0] == '-')) {
if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) {
lang = argv[arg + 1];
++arg;
@ -123,8 +123,8 @@ int main(int argc, char **argv) {
++arg;
} else if (image == NULL) {
image = argv[arg];
} else if (output == NULL) {
output = argv[arg];
} else if (outputbase == NULL) {
outputbase = argv[arg];
}
++arg;
}
@ -134,7 +134,7 @@ int main(int argc, char **argv) {
noocr = true;
}
if (output == NULL && noocr == false) {
if (outputbase == NULL && noocr == false) {
fprintf(stderr, "Usage:\n %s imagename|stdin outputbase|stdout "
"[options...] [configfile...]\n\n", argv[0]);
@ -172,14 +172,15 @@ int main(int argc, char **argv) {
exit(1);
}
if (output != NULL && strcmp(output, "-") && strcmp(output, "stdout")) {
if (outputbase != NULL && strcmp(outputbase, "-") &&
strcmp(outputbase, "stdout")) {
tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n",
tesseract::TessBaseAPI::Version());
}
PERF_COUNT_START("Tesseract:main")
tesseract::TessBaseAPI api;
api.SetOutputName(output);
api.SetOutputName(outputbase);
int rc = api.Init(datapath, lang, tesseract::OEM_DEFAULT,
&(argv[arg]), argc - arg, &vars_vec, &vars_values, false);
@ -192,7 +193,12 @@ int main(int argc, char **argv) {
for (arg = 0; arg < argc; arg++) {
if (strcmp(argv[arg], "-c") == 0 && arg + 1 < argc) {
strncpy(opt1, argv[arg + 1], 255);
*(strchr(opt1, '=')) = 0;
char *p = strchr(opt1, '=');
if (!p) {
fprintf(stderr, "Missing = in configvar assignment\n");
exit(1);
}
*p = 0;
strncpy(opt2, strchr(argv[arg + 1], '=') + 1, 255);
opt2[254] = 0;
++arg;
@ -239,32 +245,11 @@ int main(int argc, char **argv) {
if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
api.SetPageSegMode(pagesegmode);
bool stdInput = !strcmp(image, "stdin") || !strcmp(image, "-");
Pix* pixs = NULL;
if (stdInput) {
char byt;
GenericVector<l_uint8> ch_data;
std::istream file(std::cin.rdbuf());
#ifdef WIN32
if (_setmode(_fileno(stdin), _O_BINARY) == -1)
tprintf("ERROR: cin to binary: %s", strerror(errno));
#endif // WIN32
while (file.get(byt)) {
ch_data.push_back(byt);
}
std::cin.ignore(std::cin.rdbuf()->in_avail() + 1);
pixs = pixReadMem(&ch_data[0], ch_data.size());
}
if (pagesegmode == tesseract::PSM_AUTO_ONLY ||
pagesegmode == tesseract::PSM_OSD_ONLY) {
int ret_val = 0;
if (!pixs)
pixs = pixRead(image);
Pix* pixs = pixRead(image);
if (!pixs) {
fprintf(stderr, "Cannot open input file: %s\n", image);
exit(2);
@ -296,7 +281,7 @@ int main(int argc, char **argv) {
it->Orientation(&orientation, &direction, &order, &deskew_angle);
tprintf("Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" \
"Deskew angle: %.4f\n",
orientation, direction, order, deskew_angle);
orientation, direction, order, deskew_angle);
} else {
ret_val = 1;
}
@ -309,59 +294,29 @@ int main(int argc, char **argv) {
tesseract::TessResultRenderer* renderer = NULL;
bool b;
api.GetBoolVariable("tessedit_create_hocr", &b);
if (b && renderer == NULL) renderer = new tesseract::TessHOcrRenderer();
if (b)
renderer = new tesseract::TessHOcrRenderer(outputbase);
api.GetBoolVariable("tessedit_create_pdf", &b);
if (b && renderer == NULL)
renderer = new tesseract::TessPDFRenderer(api.GetDatapath());
renderer = new tesseract::TessPDFRenderer(outputbase, api.GetDatapath());
api.GetBoolVariable("tessedit_write_unlv", &b);
if (b && renderer == NULL) renderer = new tesseract::TessUnlvRenderer();
if (b && renderer == NULL)
renderer = new tesseract::TessUnlvRenderer(outputbase);
api.GetBoolVariable("tessedit_create_boxfile", &b);
if (b && renderer == NULL) renderer = new tesseract::TessBoxTextRenderer();
if (b && renderer == NULL)
renderer = new tesseract::TessBoxTextRenderer(outputbase);
if (renderer == NULL) renderer = new tesseract::TessTextRenderer();
if (renderer == NULL)
renderer = new tesseract::TessTextRenderer(outputbase);
if (pixs) {
if (renderer) renderer->BeginDocument("");
api.ProcessPage(pixs, 0, NULL, NULL, 0, renderer);
if (renderer) renderer->EndDocument();
pixDestroy(&pixs);
} else {
FILE* fin = fopen(image, "rb");
if (fin == NULL) {
fprintf(stderr, "Cannot open input file: %s\n", image);
exit(2);
}
fclose(fin);
if (!api.ProcessPages(image, NULL, 0, renderer)) {
fprintf(stderr, "Error during processing.\n");
exit(1);
}
if (!api.ProcessPages(image, NULL, 0, renderer)) {
fprintf(stderr, "Error during processing.\n");
exit(1);
}
FILE* fout = stdout;
if (strcmp(output, "-") && strcmp(output, "stdout")) {
STRING outfile = STRING(output)
+ STRING(".")
+ STRING(renderer->file_extension());
fout = fopen(outfile.string(), "wb");
if (fout == NULL) {
fprintf(stderr, "Cannot create output file %s\n", outfile.string());
exit(1);
}
}
const char* data;
inT32 data_len;
if (renderer->GetOutput(&data, &data_len)) {
fwrite(data, 1, data_len, fout);
if (fout != stdout)
fclose(fout);
else
clearerr(fout);
}
PERF_COUNT_END
return 0; // Normal exit
}