/********************************************************************** * File: baseapi.cpp * Description: Simple API for calling tesseract. * Author: Ray Smith * Created: Fri Oct 06 15:35:01 PDT 2006 * * (C) Copyright 2006, Google Inc. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif #ifdef HAVE_LIBLEPT // Include leptonica library only if autoconf (or makefile etc) tell us to. #include "allheaders.h" #endif #include "baseapi.h" #include "thresholder.h" #include "tesseractmain.h" #include "tesseractclass.h" #include "tessedit.h" #include "ocrclass.h" #include "pageres.h" #include "tessvars.h" #include "control.h" #include "applybox.h" #include "pgedit.h" #include "varabled.h" #include "output.h" #include "mainblk.h" #include "globals.h" #include "adaptmatch.h" #include "edgblob.h" #include "tessbox.h" #include "tordvars.h" #include "imgs.h" #include "makerow.h" #include "tstruct.h" #include "tessout.h" #include "tface.h" #include "permute.h" #include "otsuthr.h" #include "osdetect.h" #include "chopper.h" #include "matchtab.h" namespace tesseract { // Minimum sensible image size to be worth running tesseract. const int kMinRectSize = 10; // Character returned when Tesseract couldn't recognize as anything. 
const char kTesseractReject = '~'; // Character used by UNLV error counter as a reject. const char kUNLVReject = '~'; // Character used by UNLV as a suspect marker. const char kUNLVSuspect = '^'; // Filename used for input image file, from which to derive a name to search // for a possible UNLV zone file, if none is specified by SetInputName. const char* kInputFile = "noname.tif"; TessBaseAPI::TessBaseAPI() : tesseract_(NULL), // Thresholder is initialized to NULL here, but will be set before use by: // A constructor of a derived API, SetThresholder(), or // created implicitly when used in InternalSetImage. thresholder_(NULL), threshold_done_(false), block_list_(NULL), page_res_(NULL), input_file_(NULL), output_file_(NULL), datapath_(NULL), language_(NULL), rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0), image_width_(0), image_height_(0) { } TessBaseAPI::~TessBaseAPI() { End(); } // Set the name of the input file. Needed only for training and // loading a UNLV zone file. void TessBaseAPI::SetInputName(const char* name) { if (input_file_ == NULL) input_file_ = new STRING(name); else *input_file_ = name; } // Set the name of the output files. Needed only for debugging. void TessBaseAPI::SetOutputName(const char* name) { if (output_file_ == NULL) output_file_ = new STRING(name); else *output_file_ = name; } // Set the value of an internal "variable" (of either old or new types). // Supply the name of the variable and the value as a string, just as // you would in a config file. // Returns false if the name lookup failed. // SetVariable may be used before Init, to set things that control // initialization, but note that on End all settings are lost and // the next Init will use the defaults unless SetVariable is used again. 
bool TessBaseAPI::SetVariable(const char* variable, const char* value) { if (tesseract_ == NULL) tesseract_ = new Tesseract; return set_variable(variable, value); } // The datapath must be the name of the data directory (no ending /) or // some other file in which the data directory resides (for instance argv[0].) // The language is (usually) an ISO 639-3 string or NULL will default to eng. // If numeric_mode is true, then only digits and Roman numerals will // be returned. // Returns 0 on success and -1 on initialization failure. int TessBaseAPI::Init(const char* datapath, const char* language, char **configs, int configs_size, bool configs_global_only) { // If the datapath or the language have changed, then start again. // Note that the language_ field stores the last requested language that was // initialized successfully, while tesseract_->lang stores the language // actually used. They differ only if the requested language was NULL, in // which case tesseract_->lang is set to the Tesseract default ("eng"). if (tesseract_ != NULL && (datapath_ == NULL || language_ == NULL || *datapath_ != datapath || (*language_ != language && tesseract_->lang != language))) { tesseract_->end_tesseract(); delete tesseract_; tesseract_ = NULL; } bool reset_classifier = true; if (tesseract_ == NULL) { reset_classifier = false; tesseract_ = new Tesseract; if (tesseract_->init_tesseract( datapath, output_file_ != NULL ? output_file_->string() : NULL, language, configs, configs_size, configs_global_only) != 0) { return -1; } } // Update datapath and language requested for the last valid initialization. if (datapath_ == NULL) datapath_ = new STRING(datapath); else *datapath_ = datapath; if (language_ == NULL) language_ = new STRING(language); else *language_ = language; // For same language and datapath, just reset the adaptive classifier. if (reset_classifier) tesseract_->ResetAdaptiveClassifier(); return 0; } // Init only the lang model component of Tesseract. 
The only functions // that work after this init are SetVariable and IsValidWord. // WARNING: temporary! This function will be removed from here and placed // in a separate API at some future time. int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { if (tesseract_ == NULL) tesseract_ = new Tesseract; return tesseract_->init_tesseract_lm(datapath, NULL, language); } // Init only the classifer component of Tesseract. Used to initialize the // specified language when no dawg models are available. int TessBaseAPI::InitWithoutLangModel(const char* datapath, const char* language) { // If the datapath or the language have changed, then start again. if (tesseract_ != NULL && (datapath_ == NULL || language_ == NULL || *datapath_ != datapath || *language_ != language)) { tesseract_->end_tesseract(); delete tesseract_; tesseract_ = NULL; } if (datapath_ == NULL) datapath_ = new STRING(datapath); else *datapath_ = datapath; if (language_ == NULL) language_ = new STRING(language); else *language_ = language; if (tesseract_ == NULL) { tesseract_ = new Tesseract; return tesseract_->init_tesseract_classifier( datapath, output_file_ != NULL ? output_file_->string() : NULL, language, NULL, 0, false); } // For same language and datapath, just reset the adaptive classifier. tesseract_->ResetAdaptiveClassifier(); return 0; } // Read a "config" file containing a set of variable, value pairs. // Searches the standard places: tessdata/configs, tessdata/tessconfigs // and also accepts a relative or absolute path name. void TessBaseAPI::ReadConfigFile(const char* filename, bool global_only) { tesseract_->read_config_file(filename, global_only); } // Set the current page segmentation mode. Defaults to PSM_AUTO. // The mode is stored as an INT_VARIABLE so it can also be modified by // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). 
void TessBaseAPI::SetPageSegMode(PageSegMode mode) { if (tesseract_ == NULL) tesseract_ = new Tesseract; tesseract_->tessedit_pageseg_mode.set_value(mode); } // Return the current page segmentation mode. PageSegMode TessBaseAPI::GetPageSegMode() const { if (tesseract_ == NULL) return PSM_SINGLE_BLOCK; return static_cast( static_cast(tesseract_->tessedit_pageseg_mode)); } // Set the hint for trading accuracy against speed. // Default is AVS_FASTEST, which is the old behaviour. // Note that this is only a hint. Depending on the language and/or // build configuration, speed and accuracy may not be tradeable. // Also note that despite being an enum, any value in the range // AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not // have an effect, depending on the implementation. // The mode is stored as an INT_VARIABLE so it can also be modified by // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string). void TessBaseAPI::SetAccuracyVSpeed(AccuracyVSpeed mode) { if (tesseract_ == NULL) tesseract_ = new Tesseract; tesseract_->tessedit_accuracyvspeed.set_value(mode); } // Recognize a rectangle from an image and return the result as a string. // May be called many times for a single Init. // Currently has no error checking. // Greyscale of 8 and color of 24 or 32 bits per pixel may be given. // Palette color images will not work properly and must be converted to // 24 bit. // Binary images of 1 bit per pixel may also be given but they must be // byte packed with the MSB of the first byte being the first pixel, and a // one pixel is WHITE. For binary images set bytes_per_pixel=0. // The recognized text is returned as a char* which is coded // as UTF8 and must be freed with the delete [] operator. 
char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height) { if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize) return NULL; // Nothing worth doing. // Since this original api didn't give the exact size of the image, // we have to invent a reasonable value. int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel, bytes_per_line); SetRectangle(left, top, width, height); return GetUTF8Text(); } // Call between pages or documents etc to free up memory and forget // adaptive data. void TessBaseAPI::ClearAdaptiveClassifier() { if (tesseract_ == NULL) return; tesseract_->ResetAdaptiveClassifier(); } // Provide an image for Tesseract to recognize. Format is as // TesseractRect above. Does not copy the image buffer, or take // ownership. The source image may be destroyed after Recognize is called, // either explicitly or implicitly via one of the Get*Text functions. // SetImage clears all recognition results, and sets the rectangle to the // full image, so it may be followed immediately by a GetUTF8Text, and it // will automatically perform recognition. void TessBaseAPI::SetImage(const unsigned char* imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line) { if (InternalSetImage()) thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line); } // Provide an image for Tesseract to recognize. As with SetImage above, // Tesseract doesn't take a copy or ownership or pixDestroy the image, so // it must persist until after Recognize. // Pix vs raw, which to use? // Use Pix where possible. A future version of Tesseract may choose to use Pix // as its internal representation and discard IMAGE altogether. 
// Because of that, an implementation that sources and targets Pix may end up // with less copies than an implementation that does not. void TessBaseAPI::SetImage(const Pix* pix) { #ifdef HAVE_LIBLEPT if (InternalSetImage()) thresholder_->SetImage(pix); #endif } // Restrict recognition to a sub-rectangle of the image. Call after SetImage. // Each SetRectangle clears the recogntion results so multiple rectangles // can be recognized with the same image. void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { if (thresholder_ == NULL) return; thresholder_->SetRectangle(left, top, width, height); ClearResults(); } // ONLY available if you have Leptonica installed. // Get a copy of the internal thresholded image from Tesseract. Pix* TessBaseAPI::GetThresholdedImage() { #ifdef HAVE_LIBLEPT if (tesseract_ == NULL) return NULL; if (tesseract_->pix_binary() == NULL) Threshold(tesseract_->mutable_pix_binary()); return pixClone(tesseract_->pix_binary()); #endif } // Get the result of page layout analysis as a leptonica-style // Boxa, Pixa pair, in reading order. // Can be called before or after Recognize. // For now only gets text regions. Boxa* TessBaseAPI::GetRegions(Pixa** pixa) { #ifdef HAVE_LIBLEPT if (block_list_ == NULL || block_list_->empty()) { FindLines(); } int im_height = pixGetHeight(tesseract_->pix_binary()); Boxa* boxa = boxaCreate(block_list_->length()); if (pixa != NULL) { *pixa = pixaCreate(boxaGetCount(boxa)); } BLOCK_IT it(block_list_); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOCK* block = it.data(); POLY_BLOCK* poly = block->poly_block(); TBOX box; if (poly != NULL) { if (!poly->IsText()) continue; // Use only text blocks. 
POLY_BLOCK image_block(poly->points(), poly->isA()); image_block.rotate(block->re_rotation()); box = *image_block.bounding_box(); if (pixa != NULL) { Pix* pix = pixCreate(box.width(), box.height(), 1); PB_LINE_IT *lines; // Block outline is a polygon, so use a PC_LINE_IT to get the // rasterized interior. (Runs of interior pixels on a line.) lines = new PB_LINE_IT(&image_block); for (int y = box.bottom(); y < box.top(); ++y) { ICOORDELT_LIST* segments = lines->get_line(y); if (!segments->empty()) { ICOORDELT_IT s_it(segments); // Each element of segments is a start x and x size of the // run of interior pixels. for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) { int start = s_it.data()->x(); int xext = s_it.data()->y(); // Copy the run from the source image to the block image. pixRasterop(pix, start - box.left(), box.height() - 1 - (y - box.bottom()), xext, 1, PIX_SRC, tesseract_->pix_binary(), start, im_height - 1 - y); } } delete segments; } delete lines; pixaAddPix(*pixa, pix, L_INSERT); } } else { if (!block_list_->singleton()) continue; // A null poly block can only be used if it is the only block. box = block->bounding_box(); if (pixa != NULL) { Pix* pix = pixCreate(box.width(), box.height(), 1); // Just copy the whole block as there is only a bounding box. pixRasterop(pix, 0, 0, box.width(), box.height(), PIX_SRC, tesseract_->pix_binary(), box.left(), im_height - box.top()); pixaAddPix(*pixa, pix, L_INSERT); } } Box* lbox = boxCreate(box.left(), im_height - box.top(), box.width(), box.height()); boxaAddBox(boxa, lbox, L_INSERT); } return boxa; #endif } // Get the textlines as a leptonica-style // Boxa, Pixa pair, in reading order. // Can be called before or after Recognize. // If blockids is not NULL, the block-id of each line is also returned as an // array of one element per line. delete [] after use. 
Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) { #ifdef HAVE_LIBLEPT if (block_list_ == NULL || block_list_->empty()) { FindLines(); } // A local PAGE_RES prevents the clear if Recognize is called after. PAGE_RES page_res(block_list_); PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res); // Count the lines to get a size for the arrays. int line_count = 0; for (page_res_it.restart_page(); page_res_it.word() != NULL; page_res_it.forward()) { if (page_res_it.row() != page_res_it.next_row()) { ++line_count; } } int im_height = pixGetHeight(tesseract_->pix_binary()); Boxa* boxa = boxaCreate(line_count); if (pixa != NULL) *pixa = pixaCreate(line_count); if (blockids != NULL) *blockids = new int[line_count]; int blockid = 0; int lineindex = 0; for (page_res_it.restart_page(); page_res_it.word() != NULL; page_res_it.forward(), ++lineindex) { WERD_RES *word = page_res_it.word(); BLOCK* block = page_res_it.block()->block; // Get the line bounding box. PAGE_RES_IT word_it(page_res_it); // Save start of line. TBOX line_box = word->word->bounding_box(); while (page_res_it.next_row() == page_res_it.row()) { page_res_it.forward(); word = page_res_it.word(); TBOX word_box = word->word->bounding_box(); word_box.rotate(block->re_rotation()); line_box += word_box; } Box* lbox = boxCreate(line_box.left(), im_height - line_box.top(), line_box.width(), line_box.height()); boxaAddBox(boxa, lbox, L_INSERT); if (pixa != NULL) { Pix* pix = pixCreate(line_box.width(), line_box.height(), 1); // Copy all the words to the output pix. 
while (word_it.row() == page_res_it.row()) { word = word_it.word(); TBOX word_box = word->word->bounding_box(); word_box.rotate(block->re_rotation()); pixRasterop(pix, word_box.left() - line_box.left(), line_box.top() - word_box.top(), word_box.width(), word_box.height(), PIX_SRC, tesseract_->pix_binary(), word_box.left(), im_height - word_box.top()); word_it.forward(); } pixaAddPix(*pixa, pix, L_INSERT); pixaAddBox(*pixa, lbox, L_CLONE); } if (blockids != NULL) { (*blockids)[lineindex] = blockid; if (page_res_it.block() != page_res_it.next_block()) ++blockid; } } return boxa; #endif } // Get the words as a leptonica-style // Boxa, Pixa pair, in reading order. // Can be called before or after Recognize. Boxa* TessBaseAPI::GetWords(Pixa** pixa) { #ifdef HAVE_LIBLEPT if (block_list_ == NULL || block_list_->empty()) { FindLines(); } // A local PAGE_RES prevents the clear if Recognize is called after. PAGE_RES page_res(block_list_); PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res); // Count the words to get a size for the arrays. int word_count = 0; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) ++word_count; int im_height = pixGetHeight(tesseract_->pix_binary()); Boxa* boxa = boxaCreate(word_count); if (pixa != NULL) { *pixa = pixaCreate(word_count); } for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); BLOCK* block = page_res_it.block()->block; TBOX box = word->word->bounding_box(); box.rotate(block->re_rotation()); Box* lbox = boxCreate(box.left(), im_height - box.top(), box.width(), box.height()); boxaAddBox(boxa, lbox, L_INSERT); if (pixa != NULL) { Pix* pix = pixCreate(box.width(), box.height(), 1); // Copy the whole word bounding box to the output pix. 
pixRasterop(pix, 0, 0, box.width(), box.height(), PIX_SRC, tesseract_->pix_binary(), box.left(), im_height - box.top()); pixaAddPix(*pixa, pix, L_INSERT); pixaAddBox(*pixa, lbox, L_CLONE); } } return boxa; #endif // HAVE_LIBLEPT } // Dump the internal binary image to a PGM file. void TessBaseAPI::DumpPGM(const char* filename) { if (tesseract_ == NULL) return; IMAGELINE line; line.init(page_image.get_xsize()); FILE *fp = fopen(filename, "w"); fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n", page_image.get_xsize(), page_image.get_ysize()); for (int j = page_image.get_ysize()-1; j >= 0 ; --j) { page_image.get_line(0, j, page_image.get_xsize(), &line, 0); for (int i = 0; i < page_image.get_xsize(); ++i) { uinT8 b = line.pixels[i] ? 255 : 0; fwrite(&b, 1, 1, fp); } } fclose(fp); } // Recognize the tesseract global image and return the result as Tesseract // internal structures. int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) { if (tesseract_ == NULL) return -1; if (thresholder_ == NULL || thresholder_->IsEmpty()) { tprintf("Please call SetImage before attempting recognition."); return -1; } if (page_res_ != NULL) ClearResults(); if (FindLines() != 0) return -1; if (tesseract_->tessedit_resegment_from_boxes) tesseract_->apply_boxes(*input_file_, block_list_); tesseract_->SetBlackAndWhitelist(); page_res_ = new PAGE_RES(block_list_); int result = 0; if (interactive_mode) { tesseract_->pgeditor_main(block_list_); // The page_res is invalid after an interactive session, so cleanup // in a way that lets us continue to the next page without crashing. delete page_res_; page_res_ = NULL; return -1; } else if (tesseract_->tessedit_train_from_boxes) { apply_box_training(*output_file_, block_list_); } else if (tesseract_->global_tessedit_ambigs_training) { FILE *ambigs_output_file = tesseract_->init_ambigs_training(*input_file_); // OCR the page segmented into words by tesseract. 
tesseract_->ambigs_training_segmented( *input_file_, page_res_, monitor, ambigs_output_file); fclose(ambigs_output_file); } else { // Now run the main recognition. // Running base tesseract if the inttemp for the current language loaded. if (tesseract_->inttemp_loaded_) { tesseract_->recog_all_words(page_res_, monitor); } } return result; } // Tests the chopper by exhaustively running chop_one_blob. int TessBaseAPI::RecognizeForChopTest(struct ETEXT_STRUCT* monitor) { if (tesseract_ == NULL) return -1; if (thresholder_ == NULL || thresholder_->IsEmpty()) { tprintf("Please call SetImage before attempting recognition."); return -1; } if (page_res_ != NULL) ClearResults(); if (FindLines() != 0) return -1; // Additional conditions under which chopper test cannot be run if (tesseract_->tessedit_train_from_boxes_word_level || interactive_mode) return -1; ASSERT_HOST(tesseract_->inttemp_loaded_); page_res_ = new PAGE_RES(block_list_); PAGE_RES_IT page_res_it(page_res_); tesseract_->tess_matcher = &Tesseract::tess_default_matcher; tesseract_->tess_tester = NULL; tesseract_->tess_trainer = NULL; while (page_res_it.word() != NULL) { WERD_RES *word_res = page_res_it.word(); WERD *word = word_res->word; if (word->cblob_list()->empty()) { page_res_it.forward(); continue; } WERD *bln_word = make_bln_copy(word, page_res_it.row()->row, page_res_it.block()->block, word_res->x_height, &word_res->denorm); ASSERT_HOST(!bln_word->blob_list()->empty()); TWERD *tessword = make_tess_word(bln_word, NULL); if (tessword->blobs == NULL) { make_tess_word(bln_word, NULL); } TBLOB *pblob; TBLOB *blob; init_match_table(); BLOB_CHOICE_LIST *match_result; BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); tesseract_->tess_denorm = &word_res->denorm; tesseract_->tess_word = bln_word; ASSERT_HOST(tessword->blobs != NULL); for (blob = tessword->blobs, pblob = NULL; blob != NULL; blob = blob->next) { match_result = tesseract_->classify_blob(pblob, blob, blob->next, NULL, 
"chop_word:", Green); if (match_result == NULL) tprintf("Null classifier output!\n"); tesseract_->modify_blob_choice(match_result, 0); ASSERT_HOST(!match_result->empty()); *char_choices += match_result; pblob = blob; } inT32 blob_number; SEAMS seam_list = start_seam_list(tessword->blobs); int right_chop_index = 0; while (tesseract_->chop_one_blob(tessword, char_choices, &blob_number, &seam_list, &right_chop_index)) { } word_res->best_choice = new WERD_CHOICE(); word_res->raw_choice = new WERD_CHOICE(); word_res->best_choice->make_bad(); word_res->raw_choice->make_bad(); tesseract_->getDict().permute_characters(*char_choices, 1000.0, word_res->best_choice, word_res->raw_choice); word_res->outword = make_ed_word(tessword, bln_word); page_res_it.forward(); } return 0; } // Make a text string from the internal data structures. char* TessBaseAPI::GetUTF8Text() { if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0)) return NULL; int total_length = TextLength(NULL); PAGE_RES_IT page_res_it(page_res_); char* result = new char[total_length]; char* ptr = result; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); WERD_CHOICE* choice = word->best_choice; if (choice != NULL) { strcpy(ptr, choice->unichar_string().string()); ptr += choice->unichar_string().length(); if (word->word->flag(W_EOL)) *ptr++ = '\n'; else *ptr++ = ' '; } } *ptr++ = '\n'; *ptr = '\0'; return result; } // Helper returns true if there is a paragraph break between bbox_cur, // and bbox_prev. // TODO(rays) improve and incorporate deeper into tesseract, so other // output methods get the benefit. 
static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev, int right, int line_height) { // Check if the distance between lines is larger than the normal leading, if (fabs((float)(bbox_cur.bottom() - bbox_prev.bottom())) > line_height * 2) return true; // Check if the distance between left bounds of the two lines is nearly the // same as between their right bounds (if so, then both lines probably belong // to the same paragraph, maybe a centered one). if (fabs((float)((bbox_cur.left() - bbox_prev.left()) - (bbox_prev.right() - bbox_cur.right()))) < line_height) return false; // Check if there is a paragraph indent at this line (either -ve or +ve). if (fabs((float)(bbox_cur.left() - bbox_prev.left())) > line_height) return true; // Check if both current and previous line don't reach the right bound of the // block, but the distance is different. This will cause all lines in a verse // to be treated as separate paragraphs, but most probably will not split // block-quotes to separate lines (at least if the text is justified). if (fabs((float)(bbox_cur.right() - bbox_prev.right())) > line_height && right - bbox_cur.right() > line_height && right - bbox_prev.right() > line_height) return true; return false; } // Helper to add the hOCR for a box to the given hocr_str. static void AddBoxTohOCR(const TBOX& box, int image_height, STRING* hocr_str) { hocr_str->add_str_int("' title=\"bbox ", box.left()); hocr_str->add_str_int(" ", image_height - box.top()); hocr_str->add_str_int(" ", box.right()); hocr_str->add_str_int(" ", image_height - box.bottom()); *hocr_str += "\">"; } // Make a HTML-formatted string with hOCR markup from the internal // data structures. // STL removed from original patch submission and refactored by rays. // page_id is 1-based and will appear in the output. 
char* TessBaseAPI::GetHOCRText(int page_id) { if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0)) return NULL; PAGE_RES_IT page_res_it(page_res_); ROW_RES *row = NULL; // current row ROW *real_row = NULL, *prev_row = NULL; BLOCK_RES *block = NULL; // current row BLOCK *real_block = NULL; int lcnt = 1, bcnt = 1, wcnt = 1; STRING hocr_str; hocr_str.add_str_int("
\n"; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { if (block != page_res_it.block ()) { if (block != NULL) { hocr_str += "\n

\n
\n"; } block = page_res_it.block (); // current row real_block = block->block; real_row = NULL; row = NULL; hocr_str.add_str_int("
\n"; } if (row != page_res_it.row ()) { if (row != NULL) { hocr_str += "\n"; } prev_row = real_row; row = page_res_it.row (); // current row real_row = row->row; if (prev_row != NULL && IsParagraphBreak(real_row->bounding_box(), prev_row->bounding_box(), real_block->bounding_box().right(), real_row->x_height() + real_row->ascenders())) hocr_str += "

\n

\n"; hocr_str.add_str_int("certainty()); hocr_str += "\">"; if (word->bold > 0) hocr_str += ""; if (word->italic > 0) hocr_str += ""; hocr_str += choice->unichar_string(); if (word->italic > 0) hocr_str += ""; if (word->bold > 0) hocr_str += ""; hocr_str += ""; if (!word->word->flag(W_EOL)) hocr_str += " "; } } hocr_str += "\n

\n"; hocr_str += "
\n\n"; char *ret = new char[hocr_str.length() + 1]; strcpy(ret, hocr_str.string()); return ret; } static int ConvertWordToBoxText(WERD_RES *word, ROW_RES* row, int left, int bottom, int image_width, int image_height, int page_number, char* word_str) { // Copy the output word and denormalize it back to image coords. WERD copy_outword; copy_outword = *(word->outword); copy_outword.baseline_denormalise(&word->denorm); PBLOB_IT blob_it; blob_it.set_to_list(copy_outword.blob_list()); int length = copy_outword.blob_list()->length(); int output_size = 0; if (length > 0) { for (int index = 0, offset = 0; index < length; offset += word->best_choice->unichar_lengths()[index++], blob_it.forward()) { PBLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (word->tess_failed || blob_box.left() < 0 || blob_box.right() > image_width || blob_box.bottom() < 0 || blob_box.top() > image_height) { // Bounding boxes can be illegal when tess fails on a word. blob_box = word->word->bounding_box(); // Use original word as backup. tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n", blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top()); } // A single classification unit can be composed of several UTF-8 // characters. Append each of them to the result. for (int sub = 0; sub < word->best_choice->unichar_lengths()[index]; ++sub) { char ch = word->best_choice->unichar_string()[offset + sub]; // Tesseract uses space for recognition failure. Fix to a reject // character, kTesseractReject so we don't create illegal box files. 
if (ch == ' ') ch = kTesseractReject; word_str[output_size++] = ch; } sprintf(word_str + output_size, " %d %d %d %d %d\n", blob_box.left() + left, blob_box.bottom() + bottom, blob_box.right() + left, blob_box.top() + bottom, page_number); output_size += strlen(word_str + output_size); } } return output_size; } // Multiplier for max expected textlength assumes typically 5 numbers @ // (5 digits and a space) plus the newline = 5*(5+1)+1. Add to this the // orginal UTF8 characters, and one kMaxCharsPerChar. const int kCharsPerChar = 31; // A maximal single box could occupy 5 numbers at 20 digits (for 64 bit) and a // space plus the newline 5*(20+1)+1 and the maximum length of a UNICHAR. // Test against this on each iteration for safety. const int kMaxCharsPerChar = 106 + UNICHAR_LEN; // The recognized text is returned as a char* which is coded // as a UTF8 box file and must be freed with the delete [] operator. // page_number is a 0-base page index that will appear in the box file. char* TessBaseAPI::GetBoxText(int page_number) { int bottom = image_height_ - (rect_top_ + rect_height_); if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0)) return NULL; int blob_count; int utf8_length = TextLength(&blob_count); int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar; PAGE_RES_IT page_res_it(page_res_); char* result = new char[total_length]; char* ptr = result; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom, image_width_, image_height_, page_number, ptr); // Just in case... if (ptr - result + kMaxCharsPerChar > total_length) break; } *ptr = '\0'; return result; } // Conversion table for non-latin characters. // Maps characters out of the latin set into the latin set. // TODO(rays) incorporate this translation into unicharset. 
const int kUniChs[] = { 0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0 }; // Latin chars corresponding to the unicode chars above. const int kLatinChs[] = { 0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0 }; // The recognized text is returned as a char* which is coded // as UNLV format Latin-1 with specific reject and suspect codes // and must be freed with the delete [] operator. char* TessBaseAPI::GetUNLVText() { if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0)) return NULL; bool tilde_crunch_written = false; bool last_char_was_newline = true; bool last_char_was_tilde = false; int total_length = TextLength(NULL); PAGE_RES_IT page_res_it(page_res_); char* result = new char[total_length]; char* ptr = result; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); // Process the current word. if (word->unlv_crunch_mode != CR_NONE) { if (word->unlv_crunch_mode != CR_DELETE && (!tilde_crunch_written || (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)) { /* Write a space to separate from preceeding good text */ *ptr++ = ' '; last_char_was_tilde = false; } if (!last_char_was_tilde) { // Write a reject char. last_char_was_tilde = true; *ptr++ = kUNLVReject; tilde_crunch_written = true; last_char_was_newline = false; } } } else { // NORMAL PROCESSING of non tilde crunched words. 
tilde_crunch_written = false; if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps) ensure_rep_chars_are_consistent(word); tesseract_->set_unlv_suspects(word); const char* wordstr = word->best_choice->unichar_string().string(); const STRING& lengths = word->best_choice->unichar_lengths(); int length = lengths.length(); int i = 0; int offset = 0; if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') { // Prevent adjacent tilde across words - we know that adjacent tildes // within words have been removed. // Skip the first character. offset = lengths[i++]; } if (i < length && wordstr[offset] != 0) { if (!last_char_was_newline) *ptr++ = ' '; else last_char_was_newline = false; for (; i < length; offset += lengths[i++]) { if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) { *ptr++ = kUNLVReject; last_char_was_tilde = true; } else { if (word->reject_map[i].rejected()) *ptr++ = kUNLVSuspect; UNICHAR ch(wordstr + offset, lengths[i]); int uni_ch = ch.first_uni(); for (int j = 0; kUniChs[j] != 0; ++j) { if (kUniChs[j] == uni_ch) { uni_ch = kLatinChs[j]; break; } } if (uni_ch <= 0xff) { *ptr++ = static_cast(uni_ch); last_char_was_tilde = false; } else { *ptr++ = kUNLVReject; last_char_was_tilde = true; } } } } } if (word->word->flag(W_EOL) && !last_char_was_newline) { /* Add a new line output */ *ptr++ = '\n'; tilde_crunch_written = false; last_char_was_newline = true; last_char_was_tilde = false; } } *ptr++ = '\n'; *ptr = '\0'; return result; } // Returns the average word confidence for Tesseract page result. int TessBaseAPI::MeanTextConf() { int* conf = AllWordConfidences(); if (!conf) return 0; int sum = 0; int *pt = conf; while (*pt >= 0) sum += *pt++; if (pt != conf) sum /= pt - conf; delete [] conf; return sum; } // Returns an array of all word confidences, terminated by -1. 
int* TessBaseAPI::AllWordConfidences() { if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0)) return NULL; int n_word = 0; PAGE_RES_IT res_it(page_res_); for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) n_word++; int* conf = new int[n_word+1]; n_word = 0; for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) { WERD_RES *word = res_it.word(); WERD_CHOICE* choice = word->best_choice; int w_conf = static_cast(100 + 5 * choice->certainty()); // This is the eq for converting Tesseract confidence to 1..100 if (w_conf < 0) w_conf = 0; if (w_conf > 100) w_conf = 100; conf[n_word++] = w_conf; } conf[n_word] = -1; return conf; } // Free up recognition results and any stored image data, without actually // freeing any recognition data that would be time-consuming to reload. // Afterwards, you must call SetImage or TesseractRect before doing // any Recognize or Get* operation. void TessBaseAPI::Clear() { if (thresholder_ != NULL) thresholder_->Clear(); ClearResults(); page_image.destroy(); } // Close down tesseract and free up all memory. End() is equivalent to // destructing and reconstructing your TessBaseAPI. // Once End() has been used, none of the other API functions may be used // other than Init and anything declared above it in the class definition. 
void TessBaseAPI::End() { if (thresholder_ != NULL) { delete thresholder_; thresholder_ = NULL; } if (page_res_ != NULL) { delete page_res_; page_res_ = NULL; } if (block_list_ != NULL) { delete block_list_; block_list_ = NULL; } if (tesseract_ != NULL) { tesseract_->end_tesseract(); delete tesseract_; tesseract_ = NULL; } if (input_file_ != NULL) { delete input_file_; input_file_ = NULL; } if (output_file_ != NULL) { delete output_file_; output_file_ = NULL; } if (datapath_ != NULL) { delete datapath_; datapath_ = NULL; } if (language_ != NULL) { delete language_; language_ = NULL; } } // Check whether a word is valid according to Tesseract's language model // returns 0 if the word is invalid, non-zero if valid int TessBaseAPI::IsValidWord(const char *word) { return tesseract_->getDict().valid_word(word); } bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) { if (page_res_ == NULL) FindLines(); if (block_list_->length() < 1) { return false; } // Get first block BLOCK_IT block_it(block_list_); block_it.move_to_first(); ROW_LIST* rows = block_it.data()->row_list(); if (rows->length() != 1) { return false; } // Get first line of block ROW_IT row_it(rows); row_it.move_to_first(); ROW* row = row_it.data(); // Calculate offset and slope (NOTE: Kind of ugly) *out_offset = static_cast(row->base_line(0.0)); *out_slope = row->base_line(1.0) - row->base_line(0.0); return true; } // Set the letter_is_okay function to point somewhere else. void TessBaseAPI::SetDictFunc(DictFunc f) { if (tesseract_ != NULL) { tesseract_->getDict().letter_is_okay_ = f; } } // Common code for setting the image. bool TessBaseAPI::InternalSetImage() { if (tesseract_ == NULL) { tprintf("Please call Init before attempting to send an image."); return false; } if (thresholder_ == NULL) thresholder_ = new ImageThresholder; ClearResults(); return true; } // Run the thresholder to make the thresholded image. 
If pix is not NULL, // the source is thresholded to pix instead of the internal IMAGE. void TessBaseAPI::Threshold(Pix** pix) { #ifdef HAVE_LIBLEPT if (pix != NULL) thresholder_->ThresholdToPix(pix); else thresholder_->ThresholdToIMAGE(&page_image); #else thresholder_->ThresholdToIMAGE(&page_image); #endif thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_, &image_height_); threshold_done_ = true; } // Find lines from the image making the BLOCK_LIST. int TessBaseAPI::FindLines() { if (!block_list_->empty()) { return 0; } if (tesseract_ == NULL) { tesseract_ = new Tesseract; tesseract_->InitAdaptiveClassifier(); } #ifdef HAVE_LIBLEPT if (tesseract_->pix_binary() == NULL) Threshold(tesseract_->mutable_pix_binary()); #endif if (!threshold_done_) Threshold(NULL); if (tesseract_->SegmentPage(input_file_, &page_image, block_list_) < 0) return -1; ASSERT_HOST(page_image.get_xsize() == rect_width_ || page_image.get_xsize() == rect_width_ - 1); ASSERT_HOST(page_image.get_ysize() == rect_height_ || page_image.get_ysize() == rect_height_ - 1); return 0; } // Delete the pageres and clear the block list ready for a new page. void TessBaseAPI::ClearResults() { threshold_done_ = false; if (tesseract_ != NULL) tesseract_->Clear(); if (page_res_ != NULL) { delete page_res_; page_res_ = NULL; } if (block_list_ == NULL) block_list_ = new BLOCK_LIST; else block_list_->clear(); } // Return the length of the output text string, as UTF8, assuming // one newline per line and one per block, with a terminator, // and assuming a single character reject marker for each rejected character. // Also return the number of recognized blobs in blob_count. int TessBaseAPI::TextLength(int* blob_count) { if (tesseract_ == NULL || page_res_ == NULL) return 0; PAGE_RES_IT page_res_it(page_res_); int total_length = 2; int total_blobs = 0; // Iterate over the data structures to extract the recognition result. 
for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); WERD_CHOICE* choice = word->best_choice; if (choice != NULL) { total_blobs += choice->length() + 1; total_length += choice->unichar_string().length() + 1; for (int i = 0; i < word->reject_map.length(); ++i) { if (word->reject_map[i].rejected()) ++total_length; } } } if (blob_count != NULL) *blob_count = total_blobs; return total_length; } // Estimates the Orientation And Script of the image. // Returns true if the image was processed successfully. bool TessBaseAPI::DetectOS(OSResults* osr) { if (tesseract_ == NULL) return false; ClearResults(); Threshold(NULL); if (input_file_ == NULL) input_file_ = new STRING(kInputFile); return orientation_and_script_detection(*input_file_, osr, tesseract_); } // ____________________________________________________________________________ // Ocropus add-ons. // Find lines from the image making the BLOCK_LIST. BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() { FindLines(); BLOCK_LIST* result = block_list_; block_list_ = NULL; return result; } // Delete a block list. // This is to keep BLOCK_LIST pointer opaque // and let go of including the other headers. void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) { delete block_list; } static ROW *make_tess_ocrrow(float baseline, float xheight, float descender, float ascender) { inT32 xstarts[] = {-32000}; double quad_coeffs[] = {0, 0, baseline}; return new ROW(1, xstarts, quad_coeffs, xheight, ascender - (baseline + xheight), descender - baseline, 0, 0); } // Almost a copy of make_tess_row() from ccmain/tstruct.cpp. 
static void fill_dummy_row(float baseline, float xheight, float descender, float ascender, TEXTROW* tessrow) { tessrow->baseline.segments = 1; tessrow->baseline.xstarts[0] = -32767; tessrow->baseline.xstarts[1] = 32767; tessrow->baseline.quads[0].a = 0; tessrow->baseline.quads[0].b = 0; tessrow->baseline.quads[0].c = bln_baseline_offset; tessrow->xheight.segments = 1; tessrow->xheight.xstarts[0] = -32767; tessrow->xheight.xstarts[1] = 32767; tessrow->xheight.quads[0].a = 0; tessrow->xheight.quads[0].b = 0; tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height; tessrow->lineheight = bln_x_height; tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight; tessrow->descdrop = bln_x_height * (descender - baseline) / xheight; } // Return a TBLOB * from the whole page_image. // To be freed later with free_blob(). TBLOB *make_tesseract_blob(float baseline, float xheight, float descender, float ascender) { BLOCK *block = new BLOCK("a character", TRUE, 0, 0, 0, 0, page_image.get_xsize(), page_image.get_ysize()); // Create C_BLOBs from the page extract_edges(NULL, &page_image, &page_image, ICOORD(page_image.get_xsize(), page_image.get_ysize()), block); // Create one PBLOB from all C_BLOBs C_BLOB_LIST *list = block->blob_list(); C_BLOB_IT c_blob_it(list); PBLOB *pblob = new PBLOB; // will be (hopefully) deleted by the pblob_list for (c_blob_it.mark_cycle_pt(); !c_blob_it.cycled_list(); c_blob_it.forward()) { C_BLOB *c_blob = c_blob_it.data(); PBLOB c_as_p(c_blob, baseline + xheight); merge_blobs(pblob, &c_as_p); } PBLOB_LIST *pblob_list = new PBLOB_LIST; // will be deleted by the word PBLOB_IT pblob_it(pblob_list); pblob_it.add_after_then_move(pblob); // Normalize PBLOB WERD word(pblob_list, 0, " "); ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender); word.baseline_normalise(row); delete row; // Create a TBLOB from PBLOB return make_tess_blob(pblob, /* flatten: */ TRUE); } // Adapt to recognize the current image as the given 
character. // The image must be preloaded and be just an image of a single character. void TessBaseAPI::AdaptToCharacter(const char *unichar_repr, int length, float baseline, float xheight, float descender, float ascender) { UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length); LINE_STATS LineStats; TEXTROW row; fill_dummy_row(baseline, xheight, descender, ascender, &row); GetLineStatsFromRow(&row, &LineStats); TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender); float threshold; UNICHAR_ID best_class = 0; float best_rating = -100; // Classify to get a raw choice. BLOB_CHOICE_LIST choices; tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices, NULL); BLOB_CHOICE_IT choice_it; choice_it.set_to_list(&choices); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { if (choice_it.data()->rating() > best_rating) { best_rating = choice_it.data()->rating(); best_class = choice_it.data()->unichar_id(); } } if (id == best_class) { threshold = matcher_good_threshold; } else { /* the blob was incorrectly classified - find the rating threshold needed to create a template which will correct the error with some margin. However, don't waste time trying to make templates which are too tight. */ threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id); threshold *= .9; const float max_threshold = .125; const float min_threshold = .02; if (threshold > max_threshold) threshold = max_threshold; // I have cuddled the following line to set it out of the strike // of the coverage testing tool. I have no idea how to trigger // this situation nor I have any necessity to do it. 
--mezhirov if (threshold < min_threshold) threshold = min_threshold; } if (blob->outlines) tesseract_->AdaptToChar(blob, &LineStats, id, threshold); free_blob(blob); } PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) { PAGE_RES *page_res = new PAGE_RES(block_list); tesseract_->recog_all_words(page_res, NULL, NULL, 1); return page_res; } PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list, PAGE_RES* pass1_result) { if (!pass1_result) pass1_result = new PAGE_RES(block_list); tesseract_->recog_all_words(pass1_result, NULL, NULL, 2); return pass1_result; } struct TESS_CHAR : ELIST_LINK { char *unicode_repr; int length; // of unicode_repr float cost; TBOX box; TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) { length = (len == -1 ? strlen(repr) : len); unicode_repr = new char[length + 1]; strncpy(unicode_repr, repr, length); } TESS_CHAR() { // Satisfies ELISTIZE. } ~TESS_CHAR() { delete [] unicode_repr; } }; ELISTIZEH(TESS_CHAR) ELISTIZE(TESS_CHAR) static void add_space(TESS_CHAR_IT* it) { TESS_CHAR *t = new TESS_CHAR(0, " "); it->add_after_then_move(t); } static float rating_to_cost(float rating) { rating = 100 + 5*rating; // cuddled that to save from coverage profiler // (I have never seen ratings worse than -100, // but the check won't hurt) if (rating < 0) rating = 0; return rating; } // Extract the OCR results, costs (penalty points for uncertainty), // and the bounding boxes of the characters. 
static void extract_result(TESS_CHAR_IT* out, PAGE_RES* page_res) { PAGE_RES_IT page_res_it(page_res); int word_count = 0; while (page_res_it.word() != NULL) { WERD_RES *word = page_res_it.word(); const char *str = word->best_choice->unichar_string().string(); const char *len = word->best_choice->unichar_lengths().string(); if (word_count) add_space(out); TBOX bln_rect; PBLOB_LIST *blobs = word->outword->blob_list(); PBLOB_IT it(blobs); int n = strlen(len); TBOX** boxes_to_fix = new TBOX*[n]; for (int i = 0; i < n; i++) { PBLOB *blob = it.data(); TBOX current = blob->bounding_box(); bln_rect = bln_rect.bounding_union(current); TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->certainty()), str, *len); tc->box = current; boxes_to_fix[i] = &tc->box; out->add_after_then_move(tc); it.forward(); str += *len; len++; } // Find the word bbox before normalization. // Here we can't use the C_BLOB bboxes directly, // since connected letters are not yet cut. TBOX real_rect = word->word->bounding_box(); // Denormalize boxes by transforming the bbox of the whole bln word // into the denorm bbox (`real_rect') of the whole word. double x_stretch = static_cast(real_rect.width()) / bln_rect.width(); double y_stretch = static_cast(real_rect.height()) / bln_rect.height(); for (int j = 0; j < n; j++) { TBOX *box = boxes_to_fix[j]; int x0 = static_cast(real_rect.left() + x_stretch * (box->left() - bln_rect.left()) + 0.5); int x1 = static_cast(real_rect.left() + x_stretch * (box->right() - bln_rect.left()) + 0.5); int y0 = static_cast(real_rect.bottom() + y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5); int y1 = static_cast(real_rect.bottom() + y_stretch * (box->top() - bln_rect.bottom()) + 0.5); *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1)); } delete [] boxes_to_fix; page_res_it.forward(); word_count++; } } // Extract the OCR results, costs (penalty points for uncertainty), // and the bounding boxes of the characters. 
int TessBaseAPI::TesseractExtractResult(char** text, int** lengths, float** costs, int** x0, int** y0, int** x1, int** y1, PAGE_RES* page_res) { TESS_CHAR_LIST tess_chars; TESS_CHAR_IT tess_chars_it(&tess_chars); extract_result(&tess_chars_it, page_res); tess_chars_it.move_to_first(); int n = tess_chars.length(); int text_len = 0; *lengths = new int[n]; *costs = new float[n]; *x0 = new int[n]; *y0 = new int[n]; *x1 = new int[n]; *y1 = new int[n]; int i = 0; for (tess_chars_it.mark_cycle_pt(); !tess_chars_it.cycled_list(); tess_chars_it.forward(), i++) { TESS_CHAR *tc = tess_chars_it.data(); text_len += (*lengths)[i] = tc->length; (*costs)[i] = tc->cost; (*x0)[i] = tc->box.left(); (*y0)[i] = tc->box.bottom(); (*x1)[i] = tc->box.right(); (*y1)[i] = tc->box.top(); } char *p = *text = new char[text_len]; tess_chars_it.move_to_first(); for (tess_chars_it.mark_cycle_pt(); !tess_chars_it.cycled_list(); tess_chars_it.forward()) { TESS_CHAR *tc = tess_chars_it.data(); strncpy(p, tc->unicode_repr, tc->length); p += tc->length; } return n; } // This method returns the features associated with the current image. // Make sure setimage has been called before calling this method. void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features, int* num_features) { if (page_res_ != NULL) ClearResults(); if (!threshold_done_) Threshold(NULL); // We have only one block, which is of the size of the page. BLOCK_LIST* blocks = new BLOCK_LIST; BLOCK *block = new BLOCK("", // filename. TRUE, // proportional. 0, // kerning. 0, // spacing. 0, // Left. 0, // Bottom. page_image.get_xsize(), // Right. page_image.get_ysize()); // Top. ICOORD bleft, tright; block->bounding_box (bleft, tright); BLOCK_IT block_it_add = blocks; block_it_add.add_to_end(block); ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize()); TEXTROW tessrow; make_tess_row(NULL, // Denormalizer. &tessrow); // Output row. 
LINE_STATS line_stats; GetLineStatsFromRow(&tessrow, &line_stats); // Perform a CC analysis to detect the blobs. BLOCK_IT block_it = blocks; for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { BLOCK* block = block_it.data(); #ifndef GRAPHICS_DISABLED extract_edges(NULL, // Scrollview window. &page_image, // Image. &page_image, // Thresholded image. page_tr, // corner of page. block); // block. #else extract_edges(&page_image, // Image. &page_image, // Thresholded image. page_tr, // corner of page. block); // block. #endif C_BLOB_IT blob_it = block->blob_list(); PBLOB *pblob = new PBLOB; // Iterate over all blobs found and get their features. for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); blob = blob; PBLOB c_as_p(blob, page_image.get_ysize()); merge_blobs(pblob, &c_as_p); } PBLOB_LIST *pblob_list = new PBLOB_LIST; PBLOB_IT pblob_it(pblob_list); pblob_it.add_after_then_move(pblob); WERD word(pblob_list, // Blob list. 0, // Blanks in front. " "); // Correct text. ROW *row = make_tess_ocrrow(0, // baseline. page_image.get_ysize(), // xheight. 0, // ascent. 0); // descent. word.baseline_normalise(row); delete row; if (pblob->out_list () == NULL) { tprintf("Blob list is empty"); } TBLOB* tblob = make_tess_blob(pblob, // Blob. TRUE); // Flatten. CLASS_NORMALIZATION_ARRAY norm_array; inT32 len; *num_features = tesseract_->GetCharNormFeatures( tblob, &line_stats, tesseract_->PreTrainedTemplates, int_features, norm_array, &len); } delete blocks; } // Return the pointer to the i-th dawg loaded into tesseract_ object. const Dawg *TessBaseAPI::GetDawg(int i) const { if (tesseract_ == NULL || i >= NumDawgs()) return NULL; return tesseract_->getDict().GetDawg(i); } // Return the number of dawgs loaded into tesseract_ object. int TessBaseAPI::NumDawgs() const { return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs(); } // Return the language used in the last valid initialization. 
// Yields the empty string when there is no tesseract_ instance or when its
// language string is NULL.
const char* TessBaseAPI::GetLastInitLanguage() const {
  if (tesseract_ == NULL)
    return "";
  if (tesseract_->lang.string() == NULL)
    return "";
  return tesseract_->lang.string();
}
}  // namespace tesseract.