mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
d745f3fd7d
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@285 d0cd1f9f-072b-0410-8dd7-cf729c803f20
1578 lines
54 KiB
C++
1578 lines
54 KiB
C++
/**********************************************************************
|
|
* File: baseapi.cpp
|
|
* Description: Simple API for calling tesseract.
|
|
* Author: Ray Smith
|
|
* Created: Fri Oct 06 15:35:01 PDT 2006
|
|
*
|
|
* (C) Copyright 2006, Google Inc.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
|
|
#ifdef HAVE_LIBLEPT
|
|
// Include leptonica library only if autoconf (or makefile etc) tell us to.
|
|
#include "allheaders.h"
|
|
#endif
|
|
|
|
#include "baseapi.h"
|
|
|
|
#include "thresholder.h"
|
|
#include "tesseractmain.h"
|
|
#include "tesseractclass.h"
|
|
#include "tessedit.h"
|
|
#include "ocrclass.h"
|
|
#include "pageres.h"
|
|
#include "tessvars.h"
|
|
#include "control.h"
|
|
#include "applybox.h"
|
|
#include "pgedit.h"
|
|
#include "varabled.h"
|
|
#include "output.h"
|
|
#include "mainblk.h"
|
|
#include "globals.h"
|
|
#include "adaptmatch.h"
|
|
#include "edgblob.h"
|
|
#include "tessbox.h"
|
|
#include "tordvars.h"
|
|
#include "imgs.h"
|
|
#include "makerow.h"
|
|
#include "tstruct.h"
|
|
#include "tessout.h"
|
|
#include "tface.h"
|
|
#include "permute.h"
|
|
#include "otsuthr.h"
|
|
#include "osdetect.h"
|
|
#include "chopper.h"
|
|
#include "matchtab.h"
|
|
|
|
namespace tesseract {
|
|
|
|
// Minimum sensible image size to be worth running tesseract.
|

// TesseractRect returns NULL when the requested width or height is below this.
const int kMinRectSize = 10;
|

// Character returned when Tesseract couldn't recognize as anything.
|

// ConvertWordToBoxText writes it in place of a space, and GetUNLVText maps it
// (like a space) to kUNLVReject.
const char kTesseractReject = '~';
|

// Character used by UNLV error counter as a reject.
|

const char kUNLVReject = '~';
|

// Character used by UNLV as a suspect marker.
|

// GetUNLVText writes it in front of a character whose reject_map entry is
// rejected.
const char kUNLVSuspect = '^';
|

// Filename used for input image file, from which to derive a name to search
|

// for a possible UNLV zone file, if none is specified by SetInputName.
|

const char* kInputFile = "noname.tif";
|
|
|
|
// Builds an API object that owns nothing yet: every pointer member starts
// out NULL and every geometry field starts out zero.
TessBaseAPI::TessBaseAPI()
    : tesseract_(NULL),
      // No thresholder exists at construction time. One is supplied later by
      // a derived API's constructor, by SetThresholder(), or implicitly when
      // InternalSetImage needs it.
      thresholder_(NULL),
      threshold_done_(false),
      block_list_(NULL),
      page_res_(NULL),
      input_file_(NULL),
      output_file_(NULL),
      datapath_(NULL),
      language_(NULL),
      rect_left_(0),
      rect_top_(0),
      rect_width_(0),
      rect_height_(0),
      image_width_(0),
      image_height_(0) {
}
|
|
|
|
// Destruction is delegated entirely to End().
TessBaseAPI::~TessBaseAPI() {
  this->End();
}
|
|
|
|
// Set the name of the input file. Needed only for training and
|
|
// loading a UNLV zone file.
|
|
void TessBaseAPI::SetInputName(const char* name) {
|
|
if (input_file_ == NULL)
|
|
input_file_ = new STRING(name);
|
|
else
|
|
*input_file_ = name;
|
|
}
|
|
|
|
// Set the name of the output files. Needed only for debugging.
|
|
void TessBaseAPI::SetOutputName(const char* name) {
|
|
if (output_file_ == NULL)
|
|
output_file_ = new STRING(name);
|
|
else
|
|
*output_file_ = name;
|
|
}
|
|
|
|
// Set the value of an internal "variable" (of either old or new types).
|
|
// Supply the name of the variable and the value as a string, just as
|
|
// you would in a config file.
|
|
// Returns false if the name lookup failed.
|
|
// SetVariable may be used before Init, to set things that control
|
|
// initialization, but note that on End all settings are lost and
|
|
// the next Init will use the defaults unless SetVariable is used again.
|
|
bool TessBaseAPI::SetVariable(const char* variable, const char* value) {
|
|
if (tesseract_ == NULL)
|
|
tesseract_ = new Tesseract;
|
|
return set_variable(variable, value);
|
|
}
|
|
|
|
// The datapath must be the name of the data directory (no ending /) or
|
|
// some other file in which the data directory resides (for instance argv[0].)
|
|
// The language is (usually) an ISO 639-3 string or NULL will default to eng.
|
|
// If numeric_mode is true, then only digits and Roman numerals will
|
|
// be returned.
|
|
// Returns 0 on success and -1 on initialization failure.
|
|
int TessBaseAPI::Init(const char* datapath, const char* language,
|
|
char **configs, int configs_size,
|
|
bool configs_global_only) {
|
|
// If the datapath or the language have changed, then start again.
|
|
// Note that the language_ field stores the last requested language that was
|
|
// initialized successfully, while tesseract_->lang stores the language
|
|
// actually used. They differ only if the requested language was NULL, in
|
|
// which case tesseract_->lang is set to the Tesseract default ("eng").
|
|
if (tesseract_ != NULL &&
|
|
(datapath_ == NULL || language_ == NULL || *datapath_ != datapath
|
|
|| (*language_ != language && tesseract_->lang != language))) {
|
|
tesseract_->end_tesseract();
|
|
delete tesseract_;
|
|
tesseract_ = NULL;
|
|
}
|
|
|
|
bool reset_classifier = true;
|
|
if (tesseract_ == NULL) {
|
|
reset_classifier = false;
|
|
tesseract_ = new Tesseract;
|
|
if (tesseract_->init_tesseract(
|
|
datapath, output_file_ != NULL ? output_file_->string() : NULL,
|
|
language, configs, configs_size, configs_global_only) != 0) {
|
|
return -1;
|
|
}
|
|
}
|
|
// Update datapath and language requested for the last valid initialization.
|
|
if (datapath_ == NULL)
|
|
datapath_ = new STRING(datapath);
|
|
else
|
|
*datapath_ = datapath;
|
|
if (language_ == NULL)
|
|
language_ = new STRING(language);
|
|
else
|
|
*language_ = language;
|
|
|
|
// For same language and datapath, just reset the adaptive classifier.
|
|
if (reset_classifier) tesseract_->ResetAdaptiveClassifier();
|
|
|
|
return 0;
|
|
}
|
|
|
|
// Init only the lang model component of Tesseract. The only functions
|
|
// that work after this init are SetVariable and IsValidWord.
|
|
// WARNING: temporary! This function will be removed from here and placed
|
|
// in a separate API at some future time.
|
|
int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
|
|
if (tesseract_ == NULL)
|
|
tesseract_ = new Tesseract;
|
|
return tesseract_->init_tesseract_lm(datapath, NULL, language);
|
|
}
|
|
|
|
// Init only the classifer component of Tesseract. Used to initialize the
|
|
// specified language when no dawg models are available.
|
|
int TessBaseAPI::InitWithoutLangModel(const char* datapath,
|
|
const char* language) {
|
|
// If the datapath or the language have changed, then start again.
|
|
if (tesseract_ != NULL &&
|
|
(datapath_ == NULL || language_ == NULL ||
|
|
*datapath_ != datapath || *language_ != language)) {
|
|
tesseract_->end_tesseract();
|
|
delete tesseract_;
|
|
tesseract_ = NULL;
|
|
}
|
|
if (datapath_ == NULL)
|
|
datapath_ = new STRING(datapath);
|
|
else
|
|
*datapath_ = datapath;
|
|
if (language_ == NULL)
|
|
language_ = new STRING(language);
|
|
else
|
|
*language_ = language;
|
|
if (tesseract_ == NULL) {
|
|
tesseract_ = new Tesseract;
|
|
return tesseract_->init_tesseract_classifier(
|
|
datapath, output_file_ != NULL ? output_file_->string() : NULL,
|
|
language, NULL, 0, false);
|
|
}
|
|
// For same language and datapath, just reset the adaptive classifier.
|
|
tesseract_->ResetAdaptiveClassifier();
|
|
return 0;
|
|
}
|
|
|
|
// Read a "config" file containing a set of variable, value pairs.
|
|
// Searches the standard places: tessdata/configs, tessdata/tessconfigs
|
|
// and also accepts a relative or absolute path name.
|
|
void TessBaseAPI::ReadConfigFile(const char* filename, bool global_only) {
|
|
tesseract_->read_config_file(filename, global_only);
|
|
}
|
|
|
|
// Set the current page segmentation mode. Defaults to PSM_AUTO.
|
|
// The mode is stored as an INT_VARIABLE so it can also be modified by
|
|
// ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
|
|
void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
|
|
if (tesseract_ == NULL)
|
|
tesseract_ = new Tesseract;
|
|
tesseract_->tessedit_pageseg_mode.set_value(mode);
|
|
}
|
|
|
|
// Return the current page segmentation mode.
|
|
PageSegMode TessBaseAPI::GetPageSegMode() const {
|
|
if (tesseract_ == NULL)
|
|
return PSM_SINGLE_BLOCK;
|
|
return static_cast<PageSegMode>(
|
|
static_cast<int>(tesseract_->tessedit_pageseg_mode));
|
|
}
|
|
|
|
// Set the hint for trading accuracy against speed.
|
|
// Default is AVS_FASTEST, which is the old behaviour.
|
|
// Note that this is only a hint. Depending on the language and/or
|
|
// build configuration, speed and accuracy may not be tradeable.
|
|
// Also note that despite being an enum, any value in the range
|
|
// AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not
|
|
// have an effect, depending on the implementation.
|
|
// The mode is stored as an INT_VARIABLE so it can also be modified by
|
|
// ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string).
|
|
void TessBaseAPI::SetAccuracyVSpeed(AccuracyVSpeed mode) {
|
|
if (tesseract_ == NULL)
|
|
tesseract_ = new Tesseract;
|
|
tesseract_->tessedit_accuracyvspeed.set_value(mode);
|
|
}
|
|
|
|
// Recognize a rectangle from an image and return the result as a string.
|
|
// May be called many times for a single Init.
|
|
// Currently has no error checking.
|
|
// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
|
|
// Palette color images will not work properly and must be converted to
|
|
// 24 bit.
|
|
// Binary images of 1 bit per pixel may also be given but they must be
|
|
// byte packed with the MSB of the first byte being the first pixel, and a
|
|
// one pixel is WHITE. For binary images set bytes_per_pixel=0.
|
|
// The recognized text is returned as a char* which is coded
|
|
// as UTF8 and must be freed with the delete [] operator.
|
|
char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
|
|
int bytes_per_pixel,
|
|
int bytes_per_line,
|
|
int left, int top,
|
|
int width, int height) {
|
|
if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
|
|
return NULL; // Nothing worth doing.
|
|
|
|
// Since this original api didn't give the exact size of the image,
|
|
// we have to invent a reasonable value.
|
|
int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
|
|
SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height,
|
|
bytes_per_pixel, bytes_per_line);
|
|
SetRectangle(left, top, width, height);
|
|
|
|
return GetUTF8Text();
|
|
}
|
|
|
|
// Call between pages or documents etc to free up memory and forget
|
|
// adaptive data.
|
|
void TessBaseAPI::ClearAdaptiveClassifier() {
|
|
if (tesseract_ == NULL)
|
|
return;
|
|
tesseract_->ResetAdaptiveClassifier();
|
|
}
|
|
|
|
// Provide an image for Tesseract to recognize. Format is as
|
|
// TesseractRect above. Does not copy the image buffer, or take
|
|
// ownership. The source image may be destroyed after Recognize is called,
|
|
// either explicitly or implicitly via one of the Get*Text functions.
|
|
// SetImage clears all recognition results, and sets the rectangle to the
|
|
// full image, so it may be followed immediately by a GetUTF8Text, and it
|
|
// will automatically perform recognition.
|
|
void TessBaseAPI::SetImage(const unsigned char* imagedata,
|
|
int width, int height,
|
|
int bytes_per_pixel, int bytes_per_line) {
|
|
if (InternalSetImage())
|
|
thresholder_->SetImage(imagedata, width, height,
|
|
bytes_per_pixel, bytes_per_line);
|
|
}
|
|
|
|
// Provide an image for Tesseract to recognize. As with SetImage above,
|
|
// Tesseract doesn't take a copy or ownership or pixDestroy the image, so
|
|
// it must persist until after Recognize.
|
|
// Pix vs raw, which to use?
|
|
// Use Pix where possible. A future version of Tesseract may choose to use Pix
|
|
// as its internal representation and discard IMAGE altogether.
|
|
// Because of that, an implementation that sources and targets Pix may end up
|
|
// with less copies than an implementation that does not.
|
|
void TessBaseAPI::SetImage(const Pix* pix) {
|
|
#ifdef HAVE_LIBLEPT
|
|
if (InternalSetImage())
|
|
thresholder_->SetImage(pix);
|
|
#endif
|
|
}
|
|
|
|
// Restrict recognition to a sub-rectangle of the image. Call after SetImage.
|
|
// Each SetRectangle clears the recogntion results so multiple rectangles
|
|
// can be recognized with the same image.
|
|
void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
|
|
if (thresholder_ == NULL)
|
|
return;
|
|
thresholder_->SetRectangle(left, top, width, height);
|
|
ClearResults();
|
|
}
|
|
|
|
// ONLY available if you have Leptonica installed.
|
|
// Get a copy of the internal thresholded image from Tesseract.
|
|
Pix* TessBaseAPI::GetThresholdedImage() {
|
|
#ifdef HAVE_LIBLEPT
|
|
if (tesseract_ == NULL)
|
|
return NULL;
|
|
if (tesseract_->pix_binary() == NULL)
|
|
Threshold(tesseract_->mutable_pix_binary());
|
|
return pixClone(tesseract_->pix_binary());
|
|
#endif
|
|
}
|
|
|
|
// Get the result of page layout analysis as a leptonica-style
|
|
// Boxa, Pixa pair, in reading order.
|
|
// Can be called before or after Recognize.
|
|
// For now only gets text regions.
|
|
Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
|
|
#ifdef HAVE_LIBLEPT
|
|
if (block_list_ == NULL || block_list_->empty()) {
|
|
FindLines();
|
|
}
|
|
int im_height = pixGetHeight(tesseract_->pix_binary());
|
|
Boxa* boxa = boxaCreate(block_list_->length());
|
|
if (pixa != NULL) {
|
|
*pixa = pixaCreate(boxaGetCount(boxa));
|
|
}
|
|
BLOCK_IT it(block_list_);
|
|
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
|
BLOCK* block = it.data();
|
|
POLY_BLOCK* poly = block->poly_block();
|
|
TBOX box;
|
|
if (poly != NULL) {
|
|
if (!poly->IsText())
|
|
continue; // Use only text blocks.
|
|
POLY_BLOCK image_block(poly->points(), poly->isA());
|
|
image_block.rotate(block->re_rotation());
|
|
box = *image_block.bounding_box();
|
|
if (pixa != NULL) {
|
|
Pix* pix = pixCreate(box.width(), box.height(), 1);
|
|
PB_LINE_IT *lines;
|
|
// Block outline is a polygon, so use a PC_LINE_IT to get the
|
|
// rasterized interior. (Runs of interior pixels on a line.)
|
|
lines = new PB_LINE_IT(&image_block);
|
|
for (int y = box.bottom(); y < box.top(); ++y) {
|
|
ICOORDELT_LIST* segments = lines->get_line(y);
|
|
if (!segments->empty()) {
|
|
ICOORDELT_IT s_it(segments);
|
|
// Each element of segments is a start x and x size of the
|
|
// run of interior pixels.
|
|
for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) {
|
|
int start = s_it.data()->x();
|
|
int xext = s_it.data()->y();
|
|
// Copy the run from the source image to the block image.
|
|
pixRasterop(pix, start - box.left(),
|
|
box.height() - 1 - (y - box.bottom()),
|
|
xext, 1, PIX_SRC, tesseract_->pix_binary(),
|
|
start, im_height - 1 - y);
|
|
}
|
|
}
|
|
delete segments;
|
|
}
|
|
delete lines;
|
|
pixaAddPix(*pixa, pix, L_INSERT);
|
|
}
|
|
} else {
|
|
if (!block_list_->singleton())
|
|
continue; // A null poly block can only be used if it is the only block.
|
|
box = block->bounding_box();
|
|
if (pixa != NULL) {
|
|
Pix* pix = pixCreate(box.width(), box.height(), 1);
|
|
// Just copy the whole block as there is only a bounding box.
|
|
pixRasterop(pix, 0, 0, box.width(), box.height(),
|
|
PIX_SRC, tesseract_->pix_binary(),
|
|
box.left(), im_height - box.top());
|
|
pixaAddPix(*pixa, pix, L_INSERT);
|
|
}
|
|
}
|
|
Box* lbox = boxCreate(box.left(), im_height - box.top(),
|
|
box.width(), box.height());
|
|
boxaAddBox(boxa, lbox, L_INSERT);
|
|
}
|
|
return boxa;
|
|
#endif
|
|
}
|
|
|
|
// Get the textlines as a leptonica-style
|
|
// Boxa, Pixa pair, in reading order.
|
|
// Can be called before or after Recognize.
|
|
// If blockids is not NULL, the block-id of each line is also returned as an
|
|
// array of one element per line. delete [] after use.
|
|
Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) {
|
|
#ifdef HAVE_LIBLEPT
|
|
if (block_list_ == NULL || block_list_->empty()) {
|
|
FindLines();
|
|
}
|
|
// A local PAGE_RES prevents the clear if Recognize is called after.
|
|
PAGE_RES page_res(block_list_);
|
|
PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res);
|
|
// Count the lines to get a size for the arrays.
|
|
int line_count = 0;
|
|
for (page_res_it.restart_page(); page_res_it.word() != NULL;
|
|
page_res_it.forward()) {
|
|
if (page_res_it.row() != page_res_it.next_row()) {
|
|
++line_count;
|
|
}
|
|
}
|
|
|
|
int im_height = pixGetHeight(tesseract_->pix_binary());
|
|
Boxa* boxa = boxaCreate(line_count);
|
|
if (pixa != NULL)
|
|
*pixa = pixaCreate(line_count);
|
|
if (blockids != NULL)
|
|
*blockids = new int[line_count];
|
|
int blockid = 0;
|
|
int lineindex = 0;
|
|
for (page_res_it.restart_page(); page_res_it.word() != NULL;
|
|
page_res_it.forward(), ++lineindex) {
|
|
WERD_RES *word = page_res_it.word();
|
|
BLOCK* block = page_res_it.block()->block;
|
|
// Get the line bounding box.
|
|
PAGE_RES_IT word_it(page_res_it); // Save start of line.
|
|
TBOX line_box = word->word->bounding_box();
|
|
while (page_res_it.next_row() == page_res_it.row()) {
|
|
page_res_it.forward();
|
|
word = page_res_it.word();
|
|
TBOX word_box = word->word->bounding_box();
|
|
word_box.rotate(block->re_rotation());
|
|
line_box += word_box;
|
|
}
|
|
Box* lbox = boxCreate(line_box.left(), im_height - line_box.top(),
|
|
line_box.width(), line_box.height());
|
|
boxaAddBox(boxa, lbox, L_INSERT);
|
|
if (pixa != NULL) {
|
|
Pix* pix = pixCreate(line_box.width(), line_box.height(), 1);
|
|
// Copy all the words to the output pix.
|
|
while (word_it.row() == page_res_it.row()) {
|
|
word = word_it.word();
|
|
TBOX word_box = word->word->bounding_box();
|
|
word_box.rotate(block->re_rotation());
|
|
pixRasterop(pix, word_box.left() - line_box.left(),
|
|
line_box.top() - word_box.top(),
|
|
word_box.width(), word_box.height(),
|
|
PIX_SRC, tesseract_->pix_binary(),
|
|
word_box.left(), im_height - word_box.top());
|
|
word_it.forward();
|
|
}
|
|
pixaAddPix(*pixa, pix, L_INSERT);
|
|
pixaAddBox(*pixa, lbox, L_CLONE);
|
|
}
|
|
if (blockids != NULL) {
|
|
(*blockids)[lineindex] = blockid;
|
|
if (page_res_it.block() != page_res_it.next_block())
|
|
++blockid;
|
|
}
|
|
}
|
|
return boxa;
|
|
#endif
|
|
}
|
|
|
|
// Get the words as a leptonica-style
|
|
// Boxa, Pixa pair, in reading order.
|
|
// Can be called before or after Recognize.
|
|
Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
|
|
#ifdef HAVE_LIBLEPT
|
|
if (block_list_ == NULL || block_list_->empty()) {
|
|
FindLines();
|
|
}
|
|
// A local PAGE_RES prevents the clear if Recognize is called after.
|
|
PAGE_RES page_res(block_list_);
|
|
PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res);
|
|
// Count the words to get a size for the arrays.
|
|
int word_count = 0;
|
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
page_res_it.forward())
|
|
++word_count;
|
|
|
|
int im_height = pixGetHeight(tesseract_->pix_binary());
|
|
Boxa* boxa = boxaCreate(word_count);
|
|
if (pixa != NULL) {
|
|
*pixa = pixaCreate(word_count);
|
|
}
|
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
page_res_it.forward()) {
|
|
WERD_RES *word = page_res_it.word();
|
|
BLOCK* block = page_res_it.block()->block;
|
|
TBOX box = word->word->bounding_box();
|
|
box.rotate(block->re_rotation());
|
|
Box* lbox = boxCreate(box.left(), im_height - box.top(),
|
|
box.width(), box.height());
|
|
boxaAddBox(boxa, lbox, L_INSERT);
|
|
if (pixa != NULL) {
|
|
Pix* pix = pixCreate(box.width(), box.height(), 1);
|
|
// Copy the whole word bounding box to the output pix.
|
|
pixRasterop(pix, 0, 0, box.width(), box.height(),
|
|
PIX_SRC, tesseract_->pix_binary(),
|
|
box.left(), im_height - box.top());
|
|
pixaAddPix(*pixa, pix, L_INSERT);
|
|
pixaAddBox(*pixa, lbox, L_CLONE);
|
|
}
|
|
}
|
|
return boxa;
|
|
#endif // HAVE_LIBLEPT
|
|
}
|
|
|
|
// Dump the internal binary image to a PGM file.
|
|
void TessBaseAPI::DumpPGM(const char* filename) {
|
|
if (tesseract_ == NULL)
|
|
return;
|
|
IMAGELINE line;
|
|
line.init(page_image.get_xsize());
|
|
FILE *fp = fopen(filename, "w");
|
|
fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n",
|
|
page_image.get_xsize(), page_image.get_ysize());
|
|
for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
|
|
page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
|
|
for (int i = 0; i < page_image.get_xsize(); ++i) {
|
|
uinT8 b = line.pixels[i] ? 255 : 0;
|
|
fwrite(&b, 1, 1, fp);
|
|
}
|
|
}
|
|
fclose(fp);
|
|
}
|
|
|
|
// Recognize the tesseract global image and return the result as Tesseract
|
|
// internal structures.
|
|
int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) {
|
|
if (tesseract_ == NULL)
|
|
return -1;
|
|
if (thresholder_ == NULL || thresholder_->IsEmpty()) {
|
|
tprintf("Please call SetImage before attempting recognition.");
|
|
return -1;
|
|
}
|
|
if (page_res_ != NULL)
|
|
ClearResults();
|
|
if (FindLines() != 0)
|
|
return -1;
|
|
if (tesseract_->tessedit_resegment_from_boxes)
|
|
tesseract_->apply_boxes(*input_file_, block_list_);
|
|
tesseract_->SetBlackAndWhitelist();
|
|
|
|
page_res_ = new PAGE_RES(block_list_);
|
|
int result = 0;
|
|
if (interactive_mode) {
|
|
tesseract_->pgeditor_main(block_list_);
|
|
// The page_res is invalid after an interactive session, so cleanup
|
|
// in a way that lets us continue to the next page without crashing.
|
|
delete page_res_;
|
|
page_res_ = NULL;
|
|
return -1;
|
|
} else if (tesseract_->tessedit_train_from_boxes) {
|
|
apply_box_training(*output_file_, block_list_);
|
|
} else if (tesseract_->global_tessedit_ambigs_training) {
|
|
FILE *ambigs_output_file = tesseract_->init_ambigs_training(*input_file_);
|
|
// OCR the page segmented into words by tesseract.
|
|
tesseract_->ambigs_training_segmented(
|
|
*input_file_, page_res_, monitor, ambigs_output_file);
|
|
fclose(ambigs_output_file);
|
|
} else {
|
|
// Now run the main recognition.
|
|
// Running base tesseract if the inttemp for the current language loaded.
|
|
if (tesseract_->inttemp_loaded_) {
|
|
tesseract_->recog_all_words(page_res_, monitor);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Tests the chopper by exhaustively running chop_one_blob.
|
|
int TessBaseAPI::RecognizeForChopTest(struct ETEXT_STRUCT* monitor) {
|
|
if (tesseract_ == NULL)
|
|
return -1;
|
|
if (thresholder_ == NULL || thresholder_->IsEmpty()) {
|
|
tprintf("Please call SetImage before attempting recognition.");
|
|
return -1;
|
|
}
|
|
if (page_res_ != NULL)
|
|
ClearResults();
|
|
if (FindLines() != 0)
|
|
return -1;
|
|
// Additional conditions under which chopper test cannot be run
|
|
if (tesseract_->tessedit_train_from_boxes_word_level || interactive_mode)
|
|
return -1;
|
|
ASSERT_HOST(tesseract_->inttemp_loaded_);
|
|
|
|
page_res_ = new PAGE_RES(block_list_);
|
|
|
|
PAGE_RES_IT page_res_it(page_res_);
|
|
|
|
tesseract_->tess_matcher = &Tesseract::tess_default_matcher;
|
|
tesseract_->tess_tester = NULL;
|
|
tesseract_->tess_trainer = NULL;
|
|
|
|
while (page_res_it.word() != NULL) {
|
|
WERD_RES *word_res = page_res_it.word();
|
|
WERD *word = word_res->word;
|
|
if (word->cblob_list()->empty()) {
|
|
page_res_it.forward();
|
|
continue;
|
|
}
|
|
WERD *bln_word = make_bln_copy(word, page_res_it.row()->row,
|
|
page_res_it.block()->block,
|
|
word_res->x_height, &word_res->denorm);
|
|
ASSERT_HOST(!bln_word->blob_list()->empty());
|
|
TWERD *tessword = make_tess_word(bln_word, NULL);
|
|
if (tessword->blobs == NULL) {
|
|
make_tess_word(bln_word, NULL);
|
|
}
|
|
TBLOB *pblob;
|
|
TBLOB *blob;
|
|
init_match_table();
|
|
BLOB_CHOICE_LIST *match_result;
|
|
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
|
|
tesseract_->tess_denorm = &word_res->denorm;
|
|
tesseract_->tess_word = bln_word;
|
|
ASSERT_HOST(tessword->blobs != NULL);
|
|
for (blob = tessword->blobs, pblob = NULL;
|
|
blob != NULL; blob = blob->next) {
|
|
match_result = tesseract_->classify_blob(pblob, blob, blob->next, NULL,
|
|
"chop_word:", Green);
|
|
if (match_result == NULL)
|
|
tprintf("Null classifier output!\n");
|
|
tesseract_->modify_blob_choice(match_result, 0);
|
|
ASSERT_HOST(!match_result->empty());
|
|
*char_choices += match_result;
|
|
pblob = blob;
|
|
}
|
|
inT32 blob_number;
|
|
SEAMS seam_list = start_seam_list(tessword->blobs);
|
|
int right_chop_index = 0;
|
|
while (tesseract_->chop_one_blob(tessword, char_choices,
|
|
&blob_number, &seam_list,
|
|
&right_chop_index)) {
|
|
}
|
|
|
|
word_res->best_choice = new WERD_CHOICE();
|
|
word_res->raw_choice = new WERD_CHOICE();
|
|
word_res->best_choice->make_bad();
|
|
word_res->raw_choice->make_bad();
|
|
tesseract_->getDict().permute_characters(*char_choices, 1000.0,
|
|
word_res->best_choice,
|
|
word_res->raw_choice);
|
|
|
|
word_res->outword = make_ed_word(tessword, bln_word);
|
|
page_res_it.forward();
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Make a text string from the internal data structures.
|
|
char* TessBaseAPI::GetUTF8Text() {
|
|
if (tesseract_ == NULL ||
|
|
(page_res_ == NULL && Recognize(NULL) < 0))
|
|
return NULL;
|
|
int total_length = TextLength(NULL);
|
|
PAGE_RES_IT page_res_it(page_res_);
|
|
char* result = new char[total_length];
|
|
char* ptr = result;
|
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
page_res_it.forward()) {
|
|
WERD_RES *word = page_res_it.word();
|
|
WERD_CHOICE* choice = word->best_choice;
|
|
if (choice != NULL) {
|
|
strcpy(ptr, choice->unichar_string().string());
|
|
ptr += choice->unichar_string().length();
|
|
if (word->word->flag(W_EOL))
|
|
*ptr++ = '\n';
|
|
else
|
|
*ptr++ = ' ';
|
|
}
|
|
}
|
|
*ptr++ = '\n';
|
|
*ptr = '\0';
|
|
return result;
|
|
}
|
|
|
|
// Appends box-file lines for one word to word_str and returns the number of
// characters written (not counting the terminating nul left by sprintf).
// For every blob of the denormalized output word one line is produced: the
// UTF-8 bytes of the corresponding classification unit followed by
// " left bottom right top\n", with left/bottom added to the box coordinates.
// row is not used by this function.
static int ConvertWordToBoxText(WERD_RES *word,
                                ROW_RES* row,
                                int left,
                                int bottom,
                                char* word_str) {
  // Work on a copy of the output word, denormalized back to image coords.
  WERD copy_outword;
  copy_outword = *(word->outword);
  copy_outword.baseline_denormalise(&word->denorm);
  PBLOB_IT blob_it;
  blob_it.set_to_list(copy_outword.blob_list());
  const int blob_total = copy_outword.blob_list()->length();

  int written = 0;
  int index = 0;
  int offset = 0;
  while (index < blob_total) {
    PBLOB* blob = blob_it.data();
    TBOX blob_box = blob->bounding_box();
    const bool box_unusable =
        word->tess_failed ||
        blob_box.left() < 0 ||
        blob_box.right() > page_image.get_xsize() ||
        blob_box.bottom() < 0 ||
        blob_box.top() > page_image.get_ysize();
    if (box_unusable) {
      // When tess fails on a word its blob boxes can be illegal, so fall
      // back to the box of the original word.
      blob_box = word->word->bounding_box();
      tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
              blob_box.left(), blob_box.bottom(),
              blob_box.right(), blob_box.top());
    }

    // One classification unit may consist of several UTF-8 bytes; emit them
    // all.
    const int unit_len = word->best_choice->unichar_lengths()[index];
    for (int sub = 0; sub < unit_len; ++sub) {
      char ch = word->best_choice->unichar_string()[offset + sub];
      // Tesseract marks a recognition failure with a space. Replace it by
      // kTesseractReject so that no illegal box file is produced.
      if (ch == ' ')
        ch = kTesseractReject;
      word_str[written] = ch;
      ++written;
    }
    const int appended =
        sprintf(word_str + written, " %d %d %d %d\n",
                blob_box.left() + left, blob_box.bottom() + bottom,
                blob_box.right() + left, blob_box.top() + bottom);
    written += appended;

    // Step to the next unit and the next blob.
    offset += word->best_choice->unichar_lengths()[index];
    ++index;
    blob_it.forward();
  }
  return written;
}
|
|
|
|
// Multiplier for max expected textlength assumes typically 4 numbers @
|

// (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
|

// original UTF8 characters, and one kMaxCharsPerChar.
|

// GetBoxText sizes its buffer as
// blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar.
const int kCharsPerChar = 25;
|

// A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
|

// space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
|

// Test against this on each iteration for safety.
|

// GetBoxText stops adding words once fewer than this many bytes remain.
const int kMaxCharsPerChar = 85 + UNICHAR_LEN;
|
|
|
|
// The recognized text is returned as a char* which is coded
|
|
// as a UTF8 box file and must be freed with the delete [] operator.
|
|
char* TessBaseAPI::GetBoxText() {
|
|
int bottom = image_height_ - (rect_top_ + rect_height_);
|
|
if (tesseract_ == NULL ||
|
|
(page_res_ == NULL && Recognize(NULL) < 0))
|
|
return NULL;
|
|
int blob_count;
|
|
int utf8_length = TextLength(&blob_count);
|
|
int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar;
|
|
PAGE_RES_IT page_res_it(page_res_);
|
|
char* result = new char[total_length];
|
|
char* ptr = result;
|
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
page_res_it.forward()) {
|
|
WERD_RES *word = page_res_it.word();
|
|
ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
|
|
ptr);
|
|
// Just in case...
|
|
if (ptr - result + kMaxCharsPerChar > total_length)
|
|
break;
|
|
}
|
|
*ptr = '\0';
|
|
return result;
|
|
}
|
|
|
|
// Conversion table for non-latin characters.
|

// Maps characters out of the latin set into the latin set.
|

// TODO(rays) incorporate this translation into unicharset.
|

// Zero-terminated; GetUNLVText scans it until the 0 entry and uses the match
// index to look up the replacement in kLatinChs.
// Entries: euro sign, left/right double quotation marks, left/right single
// quotation marks, bullet, em dash.
const int kUniChs[] = {
|

0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
|

};
|

// Latin chars corresponding to the unicode chars above.
|

// Same order and length as kUniChs: cent sign, '"', '"', '\'', '\'',
// middle dot, '-'.
const int kLatinChs[] = {
|

0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
|

};
|
|
|
|
// The recognized text is returned as a char* which is coded
|
|
// as UNLV format Latin-1 with specific reject and suspect codes
|
|
// and must be freed with the delete [] operator.
|
|
char* TessBaseAPI::GetUNLVText() {
|
|
if (tesseract_ == NULL ||
|
|
(page_res_ == NULL && Recognize(NULL) < 0))
|
|
return NULL;
|
|
bool tilde_crunch_written = false;
|
|
bool last_char_was_newline = true;
|
|
bool last_char_was_tilde = false;
|
|
|
|
int total_length = TextLength(NULL);
|
|
PAGE_RES_IT page_res_it(page_res_);
|
|
char* result = new char[total_length];
|
|
char* ptr = result;
|
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
page_res_it.forward()) {
|
|
WERD_RES *word = page_res_it.word();
|
|
// Process the current word.
|
|
if (word->unlv_crunch_mode != CR_NONE) {
|
|
if (word->unlv_crunch_mode != CR_DELETE &&
|
|
(!tilde_crunch_written ||
|
|
(word->unlv_crunch_mode == CR_KEEP_SPACE &&
|
|
word->word->space() > 0 &&
|
|
!word->word->flag(W_FUZZY_NON) &&
|
|
!word->word->flag(W_FUZZY_SP)))) {
|
|
if (!word->word->flag(W_BOL) &&
|
|
word->word->space() > 0 &&
|
|
!word->word->flag(W_FUZZY_NON) &&
|
|
!word->word->flag(W_FUZZY_SP)) {
|
|
/* Write a space to separate from preceeding good text */
|
|
*ptr++ = ' ';
|
|
last_char_was_tilde = false;
|
|
}
|
|
if (!last_char_was_tilde) {
|
|
// Write a reject char.
|
|
last_char_was_tilde = true;
|
|
*ptr++ = kUNLVReject;
|
|
tilde_crunch_written = true;
|
|
last_char_was_newline = false;
|
|
}
|
|
}
|
|
} else {
|
|
// NORMAL PROCESSING of non tilde crunched words.
|
|
tilde_crunch_written = false;
|
|
|
|
if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
|
|
ensure_rep_chars_are_consistent(word);
|
|
|
|
tesseract_->set_unlv_suspects(word);
|
|
const char* wordstr = word->best_choice->unichar_string().string();
|
|
const STRING& lengths = word->best_choice->unichar_lengths();
|
|
int length = lengths.length();
|
|
int i = 0;
|
|
int offset = 0;
|
|
|
|
if (last_char_was_tilde &&
|
|
word->word->space() == 0 && wordstr[offset] == ' ') {
|
|
// Prevent adjacent tilde across words - we know that adjacent tildes
|
|
// within words have been removed.
|
|
// Skip the first character.
|
|
offset = lengths[i++];
|
|
}
|
|
if (i < length && wordstr[offset] != 0) {
|
|
if (!last_char_was_newline)
|
|
*ptr++ = ' ';
|
|
else
|
|
last_char_was_newline = false;
|
|
for (; i < length; offset += lengths[i++]) {
|
|
if (wordstr[offset] == ' ' ||
|
|
wordstr[offset] == kTesseractReject) {
|
|
*ptr++ = kUNLVReject;
|
|
last_char_was_tilde = true;
|
|
} else {
|
|
if (word->reject_map[i].rejected())
|
|
*ptr++ = kUNLVSuspect;
|
|
UNICHAR ch(wordstr + offset, lengths[i]);
|
|
int uni_ch = ch.first_uni();
|
|
for (int j = 0; kUniChs[j] != 0; ++j) {
|
|
if (kUniChs[j] == uni_ch) {
|
|
uni_ch = kLatinChs[j];
|
|
break;
|
|
}
|
|
}
|
|
if (uni_ch <= 0xff) {
|
|
*ptr++ = static_cast<char>(uni_ch);
|
|
last_char_was_tilde = false;
|
|
} else {
|
|
*ptr++ = kUNLVReject;
|
|
last_char_was_tilde = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (word->word->flag(W_EOL) && !last_char_was_newline) {
|
|
/* Add a new line output */
|
|
*ptr++ = '\n';
|
|
tilde_crunch_written = false;
|
|
last_char_was_newline = true;
|
|
last_char_was_tilde = false;
|
|
}
|
|
}
|
|
*ptr++ = '\n';
|
|
*ptr = '\0';
|
|
return result;
|
|
}
|
|
|
|
// Returns the average word confidence for Tesseract page result.
|
|
int TessBaseAPI::MeanTextConf() {
|
|
int* conf = AllWordConfidences();
|
|
if (!conf) return 0;
|
|
int sum = 0;
|
|
int *pt = conf;
|
|
while (*pt >= 0) sum += *pt++;
|
|
if (pt != conf) sum /= pt - conf;
|
|
delete [] conf;
|
|
return sum;
|
|
}
|
|
|
|
// Returns an array of all word confidences, terminated by -1.
|
|
int* TessBaseAPI::AllWordConfidences() {
|
|
if (tesseract_ == NULL ||
|
|
(page_res_ == NULL && Recognize(NULL) < 0))
|
|
return NULL;
|
|
int n_word = 0;
|
|
PAGE_RES_IT res_it(page_res_);
|
|
for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
|
|
n_word++;
|
|
|
|
int* conf = new int[n_word+1];
|
|
n_word = 0;
|
|
for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
|
|
WERD_RES *word = res_it.word();
|
|
WERD_CHOICE* choice = word->best_choice;
|
|
int w_conf = static_cast<int>(100 + 5 * choice->certainty());
|
|
// This is the eq for converting Tesseract confidence to 1..100
|
|
if (w_conf < 0) w_conf = 0;
|
|
if (w_conf > 100) w_conf = 100;
|
|
conf[n_word++] = w_conf;
|
|
}
|
|
conf[n_word] = -1;
|
|
return conf;
|
|
}
|
|
|
|
// Free up recognition results and any stored image data, without actually
|
|
// freeing any recognition data that would be time-consuming to reload.
|
|
// Afterwards, you must call SetImage or TesseractRect before doing
|
|
// any Recognize or Get* operation.
|
|
void TessBaseAPI::Clear() {
|
|
if (thresholder_ != NULL)
|
|
thresholder_->Clear();
|
|
ClearResults();
|
|
page_image.destroy();
|
|
}
|
|
|
|
// Close down tesseract and free up all memory. End() is equivalent to
|
|
// destructing and reconstructing your TessBaseAPI.
|
|
// Once End() has been used, none of the other API functions may be used
|
|
// other than Init and anything declared above it in the class definition.
|
|
void TessBaseAPI::End() {
|
|
if (thresholder_ != NULL) {
|
|
delete thresholder_;
|
|
thresholder_ = NULL;
|
|
}
|
|
if (page_res_ != NULL) {
|
|
delete page_res_;
|
|
page_res_ = NULL;
|
|
}
|
|
if (block_list_ != NULL) {
|
|
delete block_list_;
|
|
block_list_ = NULL;
|
|
}
|
|
if (tesseract_ != NULL) {
|
|
tesseract_->end_tesseract();
|
|
delete tesseract_;
|
|
tesseract_ = NULL;
|
|
}
|
|
if (input_file_ != NULL) {
|
|
delete input_file_;
|
|
input_file_ = NULL;
|
|
}
|
|
if (output_file_ != NULL) {
|
|
delete output_file_;
|
|
output_file_ = NULL;
|
|
}
|
|
if (datapath_ != NULL) {
|
|
delete datapath_;
|
|
datapath_ = NULL;
|
|
}
|
|
if (language_ != NULL) {
|
|
delete language_;
|
|
language_ = NULL;
|
|
}
|
|
}
|
|
|
|
// Check whether a word is valid according to Tesseract's language model
|
|
// returns 0 if the word is invalid, non-zero if valid
|
|
int TessBaseAPI::IsValidWord(const char *word) {
|
|
return tesseract_->getDict().valid_word(word);
|
|
}
|
|
|
|
|
|
bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
|
|
if (page_res_ == NULL)
|
|
FindLines();
|
|
if (block_list_->length() < 1) {
|
|
return false;
|
|
}
|
|
|
|
// Get first block
|
|
BLOCK_IT block_it(block_list_);
|
|
block_it.move_to_first();
|
|
ROW_LIST* rows = block_it.data()->row_list();
|
|
if (rows->length() != 1) {
|
|
return false;
|
|
}
|
|
|
|
// Get first line of block
|
|
ROW_IT row_it(rows);
|
|
row_it.move_to_first();
|
|
ROW* row = row_it.data();
|
|
|
|
// Calculate offset and slope (NOTE: Kind of ugly)
|
|
*out_offset = static_cast<int>(row->base_line(0.0));
|
|
*out_slope = row->base_line(1.0) - row->base_line(0.0);
|
|
|
|
return true;
|
|
}
|
|
|
|
// Set the letter_is_okay function to point somewhere else.
|
|
void TessBaseAPI::SetDictFunc(DictFunc f) {
|
|
if (tesseract_ != NULL) {
|
|
tesseract_->getDict().letter_is_okay_ = f;
|
|
}
|
|
}
|
|
|
|
// Common code for setting the image.
|
|
bool TessBaseAPI::InternalSetImage() {
|
|
if (tesseract_ == NULL) {
|
|
tprintf("Please call Init before attempting to send an image.");
|
|
return false;
|
|
}
|
|
if (thresholder_ == NULL)
|
|
thresholder_ = new ImageThresholder;
|
|
ClearResults();
|
|
return true;
|
|
}
|
|
|
|
// Run the thresholder to make the thresholded image. If pix is not NULL,
|
|
// the source is thresholded to pix instead of the internal IMAGE.
|
|
void TessBaseAPI::Threshold(Pix** pix) {
|
|
#ifdef HAVE_LIBLEPT
|
|
if (pix != NULL)
|
|
thresholder_->ThresholdToPix(pix);
|
|
else
|
|
thresholder_->ThresholdToIMAGE(&page_image);
|
|
#else
|
|
thresholder_->ThresholdToIMAGE(&page_image);
|
|
#endif
|
|
thresholder_->GetImageSizes(&rect_left_, &rect_top_,
|
|
&rect_width_, &rect_height_,
|
|
&image_width_, &image_height_);
|
|
threshold_done_ = true;
|
|
}
|
|
|
|
// Find lines from the image making the BLOCK_LIST.
|
|
int TessBaseAPI::FindLines() {
|
|
if (!block_list_->empty()) {
|
|
return 0;
|
|
}
|
|
if (tesseract_ == NULL) {
|
|
tesseract_ = new Tesseract;
|
|
tesseract_->InitAdaptiveClassifier();
|
|
}
|
|
#ifdef HAVE_LIBLEPT
|
|
if (tesseract_->pix_binary() == NULL)
|
|
Threshold(tesseract_->mutable_pix_binary());
|
|
#endif
|
|
if (!threshold_done_)
|
|
Threshold(NULL);
|
|
|
|
if (tesseract_->SegmentPage(input_file_, &page_image, block_list_) < 0)
|
|
return -1;
|
|
ASSERT_HOST(page_image.get_xsize() == rect_width_ ||
|
|
page_image.get_xsize() == rect_width_ - 1);
|
|
ASSERT_HOST(page_image.get_ysize() == rect_height_ ||
|
|
page_image.get_ysize() == rect_height_ - 1);
|
|
return 0;
|
|
}
|
|
|
|
// Delete the pageres and clear the block list ready for a new page.
|
|
void TessBaseAPI::ClearResults() {
|
|
threshold_done_ = false;
|
|
if (tesseract_ != NULL)
|
|
tesseract_->Clear();
|
|
if (page_res_ != NULL) {
|
|
delete page_res_;
|
|
page_res_ = NULL;
|
|
}
|
|
if (block_list_ == NULL)
|
|
block_list_ = new BLOCK_LIST;
|
|
else
|
|
block_list_->clear();
|
|
}
|
|
|
|
// Return the length of the output text string, as UTF8, assuming
|
|
// one newline per line and one per block, with a terminator,
|
|
// and assuming a single character reject marker for each rejected character.
|
|
// Also return the number of recognized blobs in blob_count.
|
|
int TessBaseAPI::TextLength(int* blob_count) {
|
|
if (tesseract_ == NULL || page_res_ == NULL)
|
|
return 0;
|
|
|
|
PAGE_RES_IT page_res_it(page_res_);
|
|
int total_length = 2;
|
|
int total_blobs = 0;
|
|
// Iterate over the data structures to extract the recognition result.
|
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
page_res_it.forward()) {
|
|
WERD_RES *word = page_res_it.word();
|
|
WERD_CHOICE* choice = word->best_choice;
|
|
if (choice != NULL) {
|
|
total_blobs += choice->length() + 1;
|
|
total_length += choice->unichar_string().length() + 1;
|
|
for (int i = 0; i < word->reject_map.length(); ++i) {
|
|
if (word->reject_map[i].rejected())
|
|
++total_length;
|
|
}
|
|
}
|
|
}
|
|
if (blob_count != NULL)
|
|
*blob_count = total_blobs;
|
|
return total_length;
|
|
}
|
|
|
|
// Estimates the Orientation And Script of the image.
|
|
// Returns true if the image was processed successfully.
|
|
bool TessBaseAPI::DetectOS(OSResults* osr) {
|
|
if (tesseract_ == NULL)
|
|
return false;
|
|
ClearResults();
|
|
Threshold(NULL);
|
|
if (input_file_ == NULL)
|
|
input_file_ = new STRING(kInputFile);
|
|
return orientation_and_script_detection(*input_file_, osr, tesseract_);
|
|
}
|
|
|
|
// ____________________________________________________________________________
|
|
// Ocropus add-ons.
|
|
|
|
// Find lines from the image making the BLOCK_LIST.
|
|
BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
|
|
FindLines();
|
|
BLOCK_LIST* result = block_list_;
|
|
block_list_ = NULL;
|
|
return result;
|
|
}
|
|
|
|
// Delete a block list.
|
|
// This is to keep BLOCK_LIST pointer opaque
|
|
// and let go of including the other headers.
|
|
void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
|
|
delete block_list;
|
|
}
|
|
|
|
|
|
// Builds a new ROW with a single baseline segment whose quadratic
// coefficients are {0, 0, baseline}. The ascender rise is passed as
// ascender - (baseline + xheight) and the descender drop as
// descender - baseline. The caller owns the returned ROW.
static ROW *make_tess_ocrrow(float baseline,
                             float xheight,
                             float descender,
                             float ascender) {
  inT32 xstarts[] = {-32000};
  double quad_coeffs[] = {0, 0, baseline};
  const float ascrise = ascender - (baseline + xheight);
  const float descdrop = descender - baseline;
  ROW* row = new ROW(1, xstarts, quad_coeffs, xheight, ascrise, descdrop, 0, 0);
  return row;
}
|
|
|
|
// Almost a copy of make_tess_row() from ccmain/tstruct.cpp.
|
|
static void fill_dummy_row(float baseline, float xheight,
|
|
float descender, float ascender,
|
|
TEXTROW* tessrow) {
|
|
tessrow->baseline.segments = 1;
|
|
tessrow->baseline.xstarts[0] = -32767;
|
|
tessrow->baseline.xstarts[1] = 32767;
|
|
tessrow->baseline.quads[0].a = 0;
|
|
tessrow->baseline.quads[0].b = 0;
|
|
tessrow->baseline.quads[0].c = bln_baseline_offset;
|
|
tessrow->xheight.segments = 1;
|
|
tessrow->xheight.xstarts[0] = -32767;
|
|
tessrow->xheight.xstarts[1] = 32767;
|
|
tessrow->xheight.quads[0].a = 0;
|
|
tessrow->xheight.quads[0].b = 0;
|
|
tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height;
|
|
tessrow->lineheight = bln_x_height;
|
|
tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight;
|
|
tessrow->descdrop = bln_x_height * (descender - baseline) / xheight;
|
|
}
|
|
|
|
|
|
// Return a TBLOB * from the whole page_image.
|
|
// To be freed later with free_blob().
|
|
TBLOB *make_tesseract_blob(float baseline, float xheight,
|
|
float descender, float ascender) {
|
|
BLOCK *block = new BLOCK("a character",
|
|
TRUE,
|
|
0, 0,
|
|
0, 0,
|
|
page_image.get_xsize(),
|
|
page_image.get_ysize());
|
|
|
|
// Create C_BLOBs from the page
|
|
extract_edges(NULL, &page_image, &page_image,
|
|
ICOORD(page_image.get_xsize(), page_image.get_ysize()),
|
|
block);
|
|
|
|
// Create one PBLOB from all C_BLOBs
|
|
C_BLOB_LIST *list = block->blob_list();
|
|
C_BLOB_IT c_blob_it(list);
|
|
PBLOB *pblob = new PBLOB; // will be (hopefully) deleted by the pblob_list
|
|
for (c_blob_it.mark_cycle_pt();
|
|
!c_blob_it.cycled_list();
|
|
c_blob_it.forward()) {
|
|
C_BLOB *c_blob = c_blob_it.data();
|
|
PBLOB c_as_p(c_blob, baseline + xheight);
|
|
merge_blobs(pblob, &c_as_p);
|
|
}
|
|
PBLOB_LIST *pblob_list = new PBLOB_LIST; // will be deleted by the word
|
|
PBLOB_IT pblob_it(pblob_list);
|
|
pblob_it.add_after_then_move(pblob);
|
|
|
|
// Normalize PBLOB
|
|
WERD word(pblob_list, 0, " ");
|
|
ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender);
|
|
word.baseline_normalise(row);
|
|
delete row;
|
|
|
|
// Create a TBLOB from PBLOB
|
|
return make_tess_blob(pblob, /* flatten: */ TRUE);
|
|
}
|
|
|
|
|
|
// Adapt to recognize the current image as the given character.
|
|
// The image must be preloaded and be just an image of a single character.
|
|
void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
|
|
int length,
|
|
float baseline,
|
|
float xheight,
|
|
float descender,
|
|
float ascender) {
|
|
UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
|
|
LINE_STATS LineStats;
|
|
TEXTROW row;
|
|
fill_dummy_row(baseline, xheight, descender, ascender, &row);
|
|
GetLineStatsFromRow(&row, &LineStats);
|
|
|
|
TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender);
|
|
float threshold;
|
|
UNICHAR_ID best_class = 0;
|
|
float best_rating = -100;
|
|
|
|
|
|
// Classify to get a raw choice.
|
|
BLOB_CHOICE_LIST choices;
|
|
tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices, NULL);
|
|
BLOB_CHOICE_IT choice_it;
|
|
choice_it.set_to_list(&choices);
|
|
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
|
|
choice_it.forward()) {
|
|
if (choice_it.data()->rating() > best_rating) {
|
|
best_rating = choice_it.data()->rating();
|
|
best_class = choice_it.data()->unichar_id();
|
|
}
|
|
}
|
|
|
|
if (id == best_class) {
|
|
threshold = matcher_good_threshold;
|
|
} else {
|
|
/* the blob was incorrectly classified - find the rating threshold
|
|
needed to create a template which will correct the error with
|
|
some margin. However, don't waste time trying to make
|
|
templates which are too tight. */
|
|
threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id);
|
|
threshold *= .9;
|
|
const float max_threshold = .125;
|
|
const float min_threshold = .02;
|
|
|
|
if (threshold > max_threshold)
|
|
threshold = max_threshold;
|
|
|
|
// I have cuddled the following line to set it out of the strike
|
|
// of the coverage testing tool. I have no idea how to trigger
|
|
// this situation nor I have any necessity to do it. --mezhirov
|
|
if (threshold < min_threshold) threshold = min_threshold;
|
|
}
|
|
|
|
if (blob->outlines)
|
|
tesseract_->AdaptToChar(blob, &LineStats, id, threshold);
|
|
free_blob(blob);
|
|
}
|
|
|
|
|
|
// Creates a new PAGE_RES for block_list, runs recog_all_words() on it with
// 1 as the last argument, and returns it. The caller owns the result.
PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
  PAGE_RES* pass1_result = new PAGE_RES(block_list);
  tesseract_->recog_all_words(pass1_result, NULL, NULL, 1);
  return pass1_result;
}
|
|
|
|
// Runs recog_all_words() with 2 as the last argument on pass1_result and
// returns it. If pass1_result is NULL, a new PAGE_RES is created from
// block_list and used instead.
PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
                                        PAGE_RES* pass1_result) {
  PAGE_RES* page_res =
      (pass1_result != NULL) ? pass1_result : new PAGE_RES(block_list);
  tesseract_->recog_all_words(page_res, NULL, NULL, 2);
  return page_res;
}
|
|
|
|
struct TESS_CHAR : ELIST_LINK {
|
|
char *unicode_repr;
|
|
int length; // of unicode_repr
|
|
float cost;
|
|
TBOX box;
|
|
|
|
TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
|
|
length = (len == -1 ? strlen(repr) : len);
|
|
unicode_repr = new char[length + 1];
|
|
strncpy(unicode_repr, repr, length);
|
|
}
|
|
|
|
TESS_CHAR() { // Satisfies ELISTIZE.
|
|
}
|
|
~TESS_CHAR() {
|
|
delete [] unicode_repr;
|
|
}
|
|
};
|
|
|
|
ELISTIZEH(TESS_CHAR)
|
|
ELISTIZE(TESS_CHAR)
|
|
|
|
// Appends a zero-cost TESS_CHAR holding a single space after the iterator's
// current position and moves the iterator onto it.
static void add_space(TESS_CHAR_IT* it) {
  it->add_after_then_move(new TESS_CHAR(0, " "));
}
|
|
|
|
|
|
// Maps a rating to a cost of 100 + 5 * rating, with negative results
// replaced by 0.
// (The original author never saw ratings worse than -100, but the check
// does not hurt.)
static float rating_to_cost(float rating) {
  const float cost = 100 + 5 * rating;
  return cost < 0 ? 0 : cost;
}
|
|
|
|
|
|
// Extract the OCR results, costs (penalty points for uncertainty),
|
|
// and the bounding boxes of the characters.
|
|
static void extract_result(TESS_CHAR_IT* out,
|
|
PAGE_RES* page_res) {
|
|
PAGE_RES_IT page_res_it(page_res);
|
|
int word_count = 0;
|
|
while (page_res_it.word() != NULL) {
|
|
WERD_RES *word = page_res_it.word();
|
|
const char *str = word->best_choice->unichar_string().string();
|
|
const char *len = word->best_choice->unichar_lengths().string();
|
|
|
|
if (word_count)
|
|
add_space(out);
|
|
TBOX bln_rect;
|
|
PBLOB_LIST *blobs = word->outword->blob_list();
|
|
PBLOB_IT it(blobs);
|
|
int n = strlen(len);
|
|
TBOX** boxes_to_fix = new TBOX*[n];
|
|
for (int i = 0; i < n; i++) {
|
|
PBLOB *blob = it.data();
|
|
TBOX current = blob->bounding_box();
|
|
bln_rect = bln_rect.bounding_union(current);
|
|
TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->certainty()),
|
|
str, *len);
|
|
tc->box = current;
|
|
boxes_to_fix[i] = &tc->box;
|
|
|
|
out->add_after_then_move(tc);
|
|
it.forward();
|
|
str += *len;
|
|
len++;
|
|
}
|
|
|
|
// Find the word bbox before normalization.
|
|
// Here we can't use the C_BLOB bboxes directly,
|
|
// since connected letters are not yet cut.
|
|
TBOX real_rect = word->word->bounding_box();
|
|
|
|
// Denormalize boxes by transforming the bbox of the whole bln word
|
|
// into the denorm bbox (`real_rect') of the whole word.
|
|
double x_stretch = static_cast<double>(real_rect.width())
|
|
/ bln_rect.width();
|
|
double y_stretch = static_cast<double>(real_rect.height())
|
|
/ bln_rect.height();
|
|
for (int j = 0; j < n; j++) {
|
|
TBOX *box = boxes_to_fix[j];
|
|
int x0 = static_cast<int>(real_rect.left() +
|
|
x_stretch * (box->left() - bln_rect.left()) + 0.5);
|
|
int x1 = static_cast<int>(real_rect.left() +
|
|
x_stretch * (box->right() - bln_rect.left()) + 0.5);
|
|
int y0 = static_cast<int>(real_rect.bottom() +
|
|
y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5);
|
|
int y1 = static_cast<int>(real_rect.bottom() +
|
|
y_stretch * (box->top() - bln_rect.bottom()) + 0.5);
|
|
*box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1));
|
|
}
|
|
delete [] boxes_to_fix;
|
|
|
|
page_res_it.forward();
|
|
word_count++;
|
|
}
|
|
}
|
|
|
|
|
|
// Extract the OCR results, costs (penalty points for uncertainty),
|
|
// and the bounding boxes of the characters.
|
|
int TessBaseAPI::TesseractExtractResult(char** text,
|
|
int** lengths,
|
|
float** costs,
|
|
int** x0,
|
|
int** y0,
|
|
int** x1,
|
|
int** y1,
|
|
PAGE_RES* page_res) {
|
|
TESS_CHAR_LIST tess_chars;
|
|
TESS_CHAR_IT tess_chars_it(&tess_chars);
|
|
extract_result(&tess_chars_it, page_res);
|
|
tess_chars_it.move_to_first();
|
|
int n = tess_chars.length();
|
|
int text_len = 0;
|
|
*lengths = new int[n];
|
|
*costs = new float[n];
|
|
*x0 = new int[n];
|
|
*y0 = new int[n];
|
|
*x1 = new int[n];
|
|
*y1 = new int[n];
|
|
int i = 0;
|
|
for (tess_chars_it.mark_cycle_pt();
|
|
!tess_chars_it.cycled_list();
|
|
tess_chars_it.forward(), i++) {
|
|
TESS_CHAR *tc = tess_chars_it.data();
|
|
text_len += (*lengths)[i] = tc->length;
|
|
(*costs)[i] = tc->cost;
|
|
(*x0)[i] = tc->box.left();
|
|
(*y0)[i] = tc->box.bottom();
|
|
(*x1)[i] = tc->box.right();
|
|
(*y1)[i] = tc->box.top();
|
|
}
|
|
char *p = *text = new char[text_len];
|
|
|
|
tess_chars_it.move_to_first();
|
|
for (tess_chars_it.mark_cycle_pt();
|
|
!tess_chars_it.cycled_list();
|
|
tess_chars_it.forward()) {
|
|
TESS_CHAR *tc = tess_chars_it.data();
|
|
strncpy(p, tc->unicode_repr, tc->length);
|
|
p += tc->length;
|
|
}
|
|
return n;
|
|
}
|
|
|
|
// This method returns the features associated with the current image.
|
|
// Make sure setimage has been called before calling this method.
|
|
void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features,
|
|
int* num_features) {
|
|
if (page_res_ != NULL)
|
|
ClearResults();
|
|
if (!threshold_done_)
|
|
Threshold(NULL);
|
|
// We have only one block, which is of the size of the page.
|
|
BLOCK_LIST* blocks = new BLOCK_LIST;
|
|
BLOCK *block = new BLOCK("", // filename.
|
|
TRUE, // proportional.
|
|
0, // kerning.
|
|
0, // spacing.
|
|
0, // Left.
|
|
0, // Bottom.
|
|
page_image.get_xsize(), // Right.
|
|
page_image.get_ysize()); // Top.
|
|
ICOORD bleft, tright;
|
|
block->bounding_box (bleft, tright);
|
|
|
|
BLOCK_IT block_it_add = blocks;
|
|
block_it_add.add_to_end(block);
|
|
|
|
ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize());
|
|
TEXTROW tessrow;
|
|
make_tess_row(NULL, // Denormalizer.
|
|
&tessrow); // Output row.
|
|
LINE_STATS line_stats;
|
|
GetLineStatsFromRow(&tessrow, &line_stats);
|
|
|
|
// Perform a CC analysis to detect the blobs.
|
|
BLOCK_IT block_it = blocks;
|
|
for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
|
|
block_it.forward ()) {
|
|
BLOCK* block = block_it.data();
|
|
#ifndef GRAPHICS_DISABLED
|
|
extract_edges(NULL, // Scrollview window.
|
|
&page_image, // Image.
|
|
&page_image, // Thresholded image.
|
|
page_tr, // corner of page.
|
|
block); // block.
|
|
#else
|
|
extract_edges(&page_image, // Image.
|
|
&page_image, // Thresholded image.
|
|
page_tr, // corner of page.
|
|
block); // block.
|
|
#endif
|
|
C_BLOB_IT blob_it = block->blob_list();
|
|
PBLOB *pblob = new PBLOB;
|
|
// Iterate over all blobs found and get their features.
|
|
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
|
|
blob_it.forward()) {
|
|
C_BLOB* blob = blob_it.data();
|
|
blob = blob;
|
|
PBLOB c_as_p(blob, page_image.get_ysize());
|
|
merge_blobs(pblob, &c_as_p);
|
|
}
|
|
|
|
PBLOB_LIST *pblob_list = new PBLOB_LIST;
|
|
PBLOB_IT pblob_it(pblob_list);
|
|
pblob_it.add_after_then_move(pblob);
|
|
WERD word(pblob_list, // Blob list.
|
|
0, // Blanks in front.
|
|
" "); // Correct text.
|
|
ROW *row = make_tess_ocrrow(0, // baseline.
|
|
page_image.get_ysize(), // xheight.
|
|
0, // ascent.
|
|
0); // descent.
|
|
word.baseline_normalise(row);
|
|
delete row;
|
|
if (pblob->out_list () == NULL) {
|
|
tprintf("Blob list is empty");
|
|
}
|
|
TBLOB* tblob = make_tess_blob(pblob, // Blob.
|
|
TRUE); // Flatten.
|
|
|
|
CLASS_NORMALIZATION_ARRAY norm_array;
|
|
inT32 len;
|
|
*num_features = tesseract_->GetCharNormFeatures(
|
|
tblob, &line_stats,
|
|
tesseract_->PreTrainedTemplates,
|
|
int_features, norm_array, &len);
|
|
}
|
|
delete blocks;
|
|
}
|
|
|
|
// Return the pointer to the i-th dawg loaded into tesseract_ object.
|
|
const Dawg *TessBaseAPI::GetDawg(int i) const {
|
|
if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
|
|
return tesseract_->getDict().GetDawg(i);
|
|
}
|
|
|
|
// Return the number of dawgs loaded into tesseract_ object.
|
|
int TessBaseAPI::NumDawgs() const {
|
|
return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
|
|
}
|
|
|
|
// Return the language used in the last valid initialization.
|
|
const char* TessBaseAPI::GetLastInitLanguage() const {
|
|
return (tesseract_ == NULL || tesseract_->lang.string() == NULL) ?
|
|
"" : tesseract_->lang.string();
|
|
}
|
|
} // namespace tesseract.
|