mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-18 19:39:24 +08:00
542 lines
27 KiB
C
542 lines
27 KiB
C
|
///////////////////////////////////////////////////////////////////////
|
||
|
// File: tesseractclass.h
|
||
|
// Description: An instance of Tesseract. For thread safety, *every*
|
||
|
// global variable goes in here, directly, or indirectly.
|
||
|
// Author: Ray Smith
|
||
|
// Created: Fri Mar 07 08:17:01 PST 2008
|
||
|
//
|
||
|
// (C) Copyright 2008, Google Inc.
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
//
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
|
||
|
#ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
|
||
|
#define TESSERACT_CCMAIN_TESSERACTCLASS_H__
|
||
|
|
||
|
#include "varable.h"
|
||
|
#include "wordrec.h"
|
||
|
#include "ocrclass.h"
|
||
|
#include "control.h"
|
||
|
#include "docqual.h"
|
||
|
|
||
|
// Forward declarations for types that the Tesseract class declaration
// mentions only through pointers or references, so their full definitions
// are not required here.
class CHAR_SAMPLES_LIST;
class CHAR_SAMPLE_LIST;
class PAGE_RES;
class PAGE_RES_IT;
class BLOCK_LIST;
class TO_BLOCK_LIST;
class IMAGE;
class WERD_RES;
class ROW;
class TBOX;
class SVMenuNode;
struct Pix;  // Declared as a struct, unlike the other types listed here.
class WERD_CHOICE;
class WERD;
class BLOB_CHOICE_LIST_CLIST;
|
||
|
|
||
|
|
||
|
// Top-level class for all tesseract global instance data.
|
||
|
// This class either holds or points to all data used by an instance
|
||
|
// of Tesseract, including the memory allocator. When this is
|
||
|
// complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
|
||
|
//
|
||
|
// NOTE to developers: Do not create cyclic dependencies through this class!
|
||
|
// The directory dependency tree must remain a tree! To keep this clean,
|
||
|
// lower-level code (eg in ccutil, the bottom level) must never need to
|
||
|
// know about the content of a higher-level directory.
|
||
|
// The following scheme will grant the easiest access to lower-level
|
||
|
// global members without creating a cyclic dependency:
|
||
|
// ccmain inherits wordrec, includes textord as a member
|
||
|
// wordrec inherits classify
|
||
|
// classify inherits ccstruct, includes dict as a member
|
||
|
// ccstruct inherits c_util, includes image as a member
|
||
|
// c_util inherits cc_util
|
||
|
// textord has a pointer to ccstruct, but doesn't own it.
|
||
|
// dict has a pointer to ccstruct, but doesn't own it.
|
||
|
//
|
||
|
// NOTE: that each level contains members that correspond to global
|
||
|
// data that is defined (and used) at that level, not necessarily where
|
||
|
// the type is defined so for instance:
|
||
|
// BOOL_VAR (textord_show_blobs, FALSE, "Display unsorted blobs");
|
||
|
// goes inside the Textord class, not the cc_util class.
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
class Tesseract : public Wordrec {
|
||
|
public:
|
||
|
Tesseract();
|
||
|
~Tesseract();
|
||
|
|
||
|
void Clear();
|
||
|
|
||
|
// Simple accessors.
|
||
|
const FCOORD& reskew() const {
|
||
|
return reskew_;
|
||
|
}
|
||
|
// Destroy any existing pix and return a pointer to the pointer.
|
||
|
Pix** mutable_pix_binary() {
|
||
|
Clear();
|
||
|
return &pix_binary_;
|
||
|
}
|
||
|
Pix* pix_binary() const {
|
||
|
return pix_binary_;
|
||
|
}
|
||
|
|
||
|
void SetBlackAndWhitelist();
|
||
|
int SegmentPage(const STRING* input_file,
|
||
|
IMAGE* image, BLOCK_LIST* blocks);
|
||
|
int AutoPageSeg(int width, int height, int resolution,
|
||
|
bool single_column, IMAGE* image,
|
||
|
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
|
||
|
|
||
|
//// control.h /////////////////////////////////////////////////////////
|
||
|
void recog_all_words( //process words
|
||
|
PAGE_RES *page_res, //page structure
|
||
|
//progress monitor
|
||
|
volatile ETEXT_DESC *monitor,
|
||
|
TBOX *target_word_box=0L,
|
||
|
inT16 dopasses=0
|
||
|
);
|
||
|
void classify_word_pass1( //recog one word
|
||
|
WERD_RES *word, //word to do
|
||
|
ROW *row,
|
||
|
BLOCK* block,
|
||
|
BOOL8 cluster_adapt,
|
||
|
CHAR_SAMPLES_LIST *char_clusters,
|
||
|
CHAR_SAMPLE_LIST *chars_waiting);
|
||
|
void recog_pseudo_word( //recognize blobs
|
||
|
BLOCK_LIST *block_list, //blocks to check
|
||
|
TBOX &selection_box);
|
||
|
|
||
|
// This method returns all the blobs in the specified blocks.
|
||
|
// It's the caller's responsibility to destroy the returned list.
|
||
|
C_BLOB_LIST* get_blobs_from_blocks(BLOCK_LIST* blocks // blocks to look at.
|
||
|
);
|
||
|
|
||
|
// This method can be used to perform word-level training using box files.
|
||
|
// TODO: this can be modified to perform training in general case too.
|
||
|
void train_word_level_with_boxes(
|
||
|
const STRING& box_file, // File with boxes.
|
||
|
const STRING& out_file, // Output file.
|
||
|
BLOCK_LIST* blocks // Blocks to use.
|
||
|
);
|
||
|
void fix_rep_char(WERD_RES *word);
|
||
|
void fix_quotes( //make double quotes
|
||
|
WERD_CHOICE *choice, //choice to fix
|
||
|
WERD *word, //word to do //char choices
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||
|
ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
|
||
|
const char *lengths);
|
||
|
void match_word_pass2( //recog one word
|
||
|
WERD_RES *word, //word to do
|
||
|
ROW *row,
|
||
|
BLOCK* block,
|
||
|
float x_height);
|
||
|
void classify_word_pass2( //word to do
|
||
|
WERD_RES *word,
|
||
|
BLOCK* block,
|
||
|
ROW *row);
|
||
|
BOOL8 recog_interactive( //recognize blobs
|
||
|
BLOCK *block, //block
|
||
|
ROW *row, //row of word
|
||
|
WERD *word //word to recognize
|
||
|
);
|
||
|
void fix_hyphens( //crunch double hyphens
|
||
|
WERD_CHOICE *choice, //choice to fix
|
||
|
WERD *word, //word to do //char choices
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||
|
void set_word_fonts(
|
||
|
WERD_RES *word, // word to adapt to
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices); // detailed results
|
||
|
void font_recognition_pass( //good chars in word
|
||
|
PAGE_RES_IT &page_res_it);
|
||
|
|
||
|
//// output.h //////////////////////////////////////////////////////////
|
||
|
|
||
|
void output_pass( //Tess output pass //send to api
|
||
|
PAGE_RES_IT &page_res_it,
|
||
|
BOOL8 write_to_shm,
|
||
|
TBOX *target_word_box);
|
||
|
FILE *open_outfile( //open .map & .unlv file
|
||
|
const char *extension);
|
||
|
void write_results( //output a word
|
||
|
PAGE_RES_IT &page_res_it, //full info
|
||
|
char newline_type, //type of newline
|
||
|
BOOL8 force_eol, //override tilde crunch?
|
||
|
BOOL8 write_to_shm //send to api
|
||
|
);
|
||
|
void set_unlv_suspects(WERD_RES *word);
|
||
|
UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated?
|
||
|
BOOL8 acceptable_number_string(const char *s,
|
||
|
const char *lengths);
|
||
|
inT16 count_alphanums(const WERD_CHOICE &word);
|
||
|
inT16 count_alphas(const WERD_CHOICE &word);
|
||
|
//// tessedit.h ////////////////////////////////////////////////////////
|
||
|
void read_config_file(const char *filename, bool global_only);
|
||
|
int init_tesseract(const char *arg0,
|
||
|
const char *textbase,
|
||
|
const char *language,
|
||
|
char **configs,
|
||
|
int configs_size,
|
||
|
bool configs_global_only);
|
||
|
|
||
|
int init_tesseract_lm(const char *arg0,
|
||
|
const char *textbase,
|
||
|
const char *language);
|
||
|
|
||
|
// Initializes the tesseract classifier without loading language models.
|
||
|
int init_tesseract_classifier(const char *arg0,
|
||
|
const char *textbase,
|
||
|
const char *language,
|
||
|
char **configs,
|
||
|
int configs_size,
|
||
|
bool configs_global_only);
|
||
|
|
||
|
void recognize_page(STRING& image_name);
|
||
|
void end_tesseract();
|
||
|
|
||
|
bool init_tesseract_lang_data(const char *arg0,
|
||
|
const char *textbase,
|
||
|
const char *language,
|
||
|
char **configs,
|
||
|
int configs_size,
|
||
|
bool configs_global_only);
|
||
|
|
||
|
//// pgedit.h //////////////////////////////////////////////////////////
|
||
|
SVMenuNode *build_menu_new();
|
||
|
void pgeditor_main(BLOCK_LIST *blocks);
|
||
|
void process_image_event( // action in image win
|
||
|
const SVEvent &event);
|
||
|
void pgeditor_read_file( // of serialised file
|
||
|
STRING &filename,
|
||
|
BLOCK_LIST *blocks // block list to add to
|
||
|
);
|
||
|
void do_new_source( // serialise
|
||
|
);
|
||
|
BOOL8 process_cmd_win_event( // UI command semantics
|
||
|
inT32 cmd_event, // which menu item?
|
||
|
char *new_value // any prompt data
|
||
|
);
|
||
|
//// reject.h //////////////////////////////////////////////////////////
|
||
|
const char *char_ambiguities(char c);
|
||
|
void make_reject_map( //make rej map for wd //detailed results
|
||
|
WERD_RES *word,
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
ROW *row,
|
||
|
inT16 pass //1st or 2nd?
|
||
|
);
|
||
|
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
|
||
|
inT16 first_alphanum_index(const char *word,
|
||
|
const char *word_lengths);
|
||
|
inT16 first_alphanum_offset(const char *word,
|
||
|
const char *word_lengths);
|
||
|
inT16 alpha_count(const char *word,
|
||
|
const char *word_lengths);
|
||
|
BOOL8 word_contains_non_1_digit(const char *word,
|
||
|
const char *word_lengths);
|
||
|
void dont_allow_1Il(WERD_RES *word);
|
||
|
inT16 count_alphanums( //how many alphanums
|
||
|
WERD_RES *word);
|
||
|
BOOL8 repeated_ch_string(const char *rep_ch_str,
|
||
|
const char *lengths);
|
||
|
void flip_0O(WERD_RES *word);
|
||
|
BOOL8 non_0_digit(UNICHAR_ID unichar_id);
|
||
|
BOOL8 non_O_upper(UNICHAR_ID unichar_id);
|
||
|
BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row);
|
||
|
void nn_match_word( //Match a word
|
||
|
WERD_RES *word,
|
||
|
ROW *row);
|
||
|
void nn_recover_rejects(WERD_RES *word, ROW *row);
|
||
|
BOOL8 test_ambig_word( //test for ambiguity
|
||
|
WERD_RES *word);
|
||
|
void set_done( //set done flag
|
||
|
WERD_RES *word,
|
||
|
inT16 pass);
|
||
|
inT16 safe_dict_word(const WERD_CHOICE &word);
|
||
|
void flip_hyphens(WERD_RES *word);
|
||
|
//// adaptions.h ///////////////////////////////////////////////////////
|
||
|
void adapt_to_good_ems(WERD_RES *word,
|
||
|
CHAR_SAMPLES_LIST *char_clusters,
|
||
|
CHAR_SAMPLE_LIST *chars_waiting);
|
||
|
void adapt_to_good_samples(WERD_RES *word,
|
||
|
CHAR_SAMPLES_LIST *char_clusters,
|
||
|
CHAR_SAMPLE_LIST *chars_waiting);
|
||
|
BOOL8 word_adaptable( //should we adapt?
|
||
|
WERD_RES *word,
|
||
|
uinT16 mode);
|
||
|
void reject_suspect_ems(WERD_RES *word);
|
||
|
void collect_ems_for_adaption(WERD_RES *word,
|
||
|
CHAR_SAMPLES_LIST *char_clusters,
|
||
|
CHAR_SAMPLE_LIST *chars_waiting);
|
||
|
void collect_characters_for_adaption(WERD_RES *word,
|
||
|
CHAR_SAMPLES_LIST *char_clusters,
|
||
|
CHAR_SAMPLE_LIST *chars_waiting);
|
||
|
void check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
|
||
|
CHAR_SAMPLE *sample,
|
||
|
CHAR_SAMPLES *best_cluster);
|
||
|
void cluster_sample(CHAR_SAMPLE *sample,
|
||
|
CHAR_SAMPLES_LIST *char_clusters,
|
||
|
CHAR_SAMPLE_LIST *chars_waiting);
|
||
|
void complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
|
||
|
CHAR_SAMPLE_LIST *chars_waiting);
|
||
|
|
||
|
//// tfacepp.cpp ///////////////////////////////////////////////////////
|
||
|
WERD_CHOICE *recog_word_recursive( //recog one owrd
|
||
|
WERD *word, //word to do
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
//matcher function
|
||
|
POLY_MATCHER matcher,
|
||
|
//tester function
|
||
|
POLY_TESTER tester,
|
||
|
//trainer function
|
||
|
POLY_TESTER trainer,
|
||
|
BOOL8 testing, //true if answer driven
|
||
|
//raw result
|
||
|
WERD_CHOICE *&raw_choice,
|
||
|
//list of blob lists
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
WERD *&outword //bln word output
|
||
|
);
|
||
|
WERD_CHOICE *recog_word( //recog one owrd
|
||
|
WERD *word, //word to do
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
POLY_MATCHER matcher, //matcher function
|
||
|
POLY_TESTER tester, //tester function
|
||
|
POLY_TESTER trainer, //trainer function
|
||
|
BOOL8 testing, //true if answer driven
|
||
|
WERD_CHOICE *&raw_choice, //raw result
|
||
|
//list of blob lists
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
WERD *&outword //bln word output
|
||
|
);
|
||
|
WERD_CHOICE *split_and_recog_word( //recog one owrd
|
||
|
WERD *word, //word to do
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
//matcher function
|
||
|
POLY_MATCHER matcher,
|
||
|
//tester function
|
||
|
POLY_TESTER tester,
|
||
|
//trainer function
|
||
|
POLY_TESTER trainer,
|
||
|
BOOL8 testing, //true if answer driven
|
||
|
//raw result
|
||
|
WERD_CHOICE *&raw_choice,
|
||
|
//list of blob lists
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
WERD *&outword //bln word output
|
||
|
);
|
||
|
//// fixspace.cpp ///////////////////////////////////////////////////////
|
||
|
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
|
||
|
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
|
||
|
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
|
||
|
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
|
||
|
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
|
||
|
void fix_fuzzy_space_list( //space explorer
|
||
|
WERD_RES_LIST &best_perm,
|
||
|
ROW *row,
|
||
|
BLOCK* block);
|
||
|
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
|
||
|
void fix_fuzzy_spaces( //find fuzzy words
|
||
|
volatile ETEXT_DESC *monitor, //progress monitor
|
||
|
inT32 word_count, //count of words in doc
|
||
|
PAGE_RES *page_res);
|
||
|
//// docqual.cpp ////////////////////////////////////////////////////////
|
||
|
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
|
||
|
BOOL8 potential_word_crunch(WERD_RES *word,
|
||
|
GARBAGE_LEVEL garbage_level,
|
||
|
BOOL8 ok_dict_word);
|
||
|
void tilde_crunch(PAGE_RES_IT &page_res_it);
|
||
|
void unrej_good_quality_words( //unreject potential
|
||
|
PAGE_RES_IT &page_res_it);
|
||
|
void doc_and_block_rejection( //reject big chunks
|
||
|
PAGE_RES_IT &page_res_it,
|
||
|
BOOL8 good_quality_doc);
|
||
|
void quality_based_rejection(PAGE_RES_IT &page_res_it,
|
||
|
BOOL8 good_quality_doc);
|
||
|
void convert_bad_unlv_chs(WERD_RES *word_res);
|
||
|
void merge_tess_fails(WERD_RES *word_res);
|
||
|
void tilde_delete(PAGE_RES_IT &page_res_it);
|
||
|
void insert_rej_cblobs(WERD_RES *word);
|
||
|
//// pagewalk.cpp ///////////////////////////////////////////////////////
|
||
|
void
|
||
|
process_selected_words (
|
||
|
BLOCK_LIST * block_list, //blocks to check
|
||
|
//function to call
|
||
|
TBOX & selection_box,
|
||
|
BOOL8 (tesseract::Tesseract::*word_processor) (
|
||
|
BLOCK *,
|
||
|
ROW *,
|
||
|
WERD *));
|
||
|
//// tessbox.cpp ///////////////////////////////////////////////////////
|
||
|
void tess_add_doc_word( //test acceptability
|
||
|
WERD_CHOICE *word_choice //after context
|
||
|
);
|
||
|
void tess_adapter( //adapt to word
|
||
|
WERD *word, //bln word
|
||
|
DENORM *denorm, //de-normalise
|
||
|
const WERD_CHOICE& choice, //string for word
|
||
|
const WERD_CHOICE& raw_choice, //before context
|
||
|
const char *rejmap //reject map
|
||
|
);
|
||
|
WERD_CHOICE *test_segment_pass2( //recog one word
|
||
|
WERD *word, //bln word to do
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
POLY_MATCHER matcher, //matcher function
|
||
|
POLY_TESTER tester, //tester function
|
||
|
//raw result
|
||
|
WERD_CHOICE *&raw_choice,
|
||
|
//list of blob lists
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
WERD *&outword //bln word output
|
||
|
);
|
||
|
WERD_CHOICE *tess_segment_pass1( //recog one word
|
||
|
WERD *word, //bln word to do
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
POLY_MATCHER matcher, //matcher function
|
||
|
//raw result
|
||
|
WERD_CHOICE *&raw_choice,
|
||
|
//list of blob lists
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
WERD *&outword //bln word output
|
||
|
);
|
||
|
WERD_CHOICE *tess_segment_pass2( //recog one word
|
||
|
WERD *word, //bln word to do
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
POLY_MATCHER matcher, //matcher function
|
||
|
//raw result
|
||
|
WERD_CHOICE *&raw_choice,
|
||
|
//list of blob lists
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
WERD *&outword //bln word output
|
||
|
);
|
||
|
WERD_CHOICE *correct_segment_pass2( //recog one word
|
||
|
WERD *word, //bln word to do
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
POLY_MATCHER matcher, //matcher function
|
||
|
POLY_TESTER tester, //tester function
|
||
|
//raw result
|
||
|
WERD_CHOICE *&raw_choice,
|
||
|
//list of blob lists
|
||
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||
|
WERD *&outword //bln word output
|
||
|
);
|
||
|
void tess_default_matcher( //call tess
|
||
|
PBLOB *pblob, //previous blob
|
||
|
PBLOB *blob, //blob to match
|
||
|
PBLOB *nblob, //next blob
|
||
|
WERD *word, //word it came from
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
BLOB_CHOICE_LIST *ratings, //list of results
|
||
|
const char* script
|
||
|
);
|
||
|
void tess_bn_matcher( //call tess
|
||
|
PBLOB *pblob, //previous blob
|
||
|
PBLOB *blob, //blob to match
|
||
|
PBLOB *nblob, //next blob
|
||
|
WERD *word, //word it came from
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
BLOB_CHOICE_LIST *ratings //list of results
|
||
|
);
|
||
|
void tess_cn_matcher( //call tess
|
||
|
PBLOB *pblob, //previous blob
|
||
|
PBLOB *blob, //blob to match
|
||
|
PBLOB *nblob, //next blob
|
||
|
WERD *word, //word it came from
|
||
|
DENORM *denorm, //de-normaliser
|
||
|
BLOB_CHOICE_LIST *ratings, //list of results
|
||
|
// Sorted array of CP_RESULT_STRUCT from class pruner.
|
||
|
CLASS_PRUNER_RESULTS cpresults
|
||
|
);
|
||
|
BOOL8 tess_adaptable_word( //test adaptability
|
||
|
WERD *word, //word to test
|
||
|
WERD_CHOICE *word_choice, //after context
|
||
|
WERD_CHOICE *raw_choice //before context
|
||
|
);
|
||
|
BOOL8 tess_acceptable_word( //test acceptability
|
||
|
WERD_CHOICE *word_choice, //after context
|
||
|
WERD_CHOICE *raw_choice //before context
|
||
|
);
|
||
|
//// applybox.cpp //////////////////////////////////////////////////////
|
||
|
void apply_box_testing(BLOCK_LIST *block_list);
|
||
|
void apply_boxes(const STRING& fname,
|
||
|
BLOCK_LIST *block_list //real blocks
|
||
|
);
|
||
|
// converts an array of boxes to a block list
|
||
|
int Boxes2BlockList(int box_cnt, TBOX *boxes, BLOCK_LIST *block_list,
|
||
|
bool right2left);
|
||
|
//// blobcmp.cpp ///////////////////////////////////////////////////////
|
||
|
float compare_tess_blobs(TBLOB *blob1,
|
||
|
TEXTROW *row1,
|
||
|
TBLOB *blob2,
|
||
|
TEXTROW *row2);
|
||
|
//// paircmp.cpp ///////////////////////////////////////////////////////
|
||
|
float compare_bln_blobs( //match 2 blobs
|
||
|
PBLOB *blob1, //first blob
|
||
|
DENORM *denorm1,
|
||
|
PBLOB *blob2, //other blob
|
||
|
DENORM *denorm2);
|
||
|
float compare_blobs( //match 2 blobs
|
||
|
PBLOB *blob1, //first blob
|
||
|
ROW *row1, //row it came from
|
||
|
PBLOB *blob2, //other blob
|
||
|
ROW *row2);
|
||
|
BOOL8 compare_blob_pairs( //blob processor
|
||
|
BLOCK *,
|
||
|
ROW *row, //row it came from
|
||
|
WERD *,
|
||
|
PBLOB *blob //blob to compare
|
||
|
);
|
||
|
//// fixxht.cpp ///////////////////////////////////////////////////////
|
||
|
void check_block_occ(WERD_RES *word_res);
|
||
|
|
||
|
//// Data members ///////////////////////////////////////////////////////
|
||
|
BOOL_VAR_H(tessedit_resegment_from_boxes, false,
|
||
|
"Take segmentation and labeling from box file");
|
||
|
BOOL_VAR_H(tessedit_train_from_boxes, false,
|
||
|
"Generate training data from boxed chars");
|
||
|
BOOL_VAR_H(tessedit_dump_pageseg_images, false,
|
||
|
"Dump itermediate images made during page segmentation");
|
||
|
INT_VAR_H(tessedit_pageseg_mode, 2,
|
||
|
"Page seg mode: 0=auto, 1=col, 2=block, 3=line, 4=word, 6=char"
|
||
|
" (Values from PageSegMode enum in baseapi.h)");
|
||
|
INT_VAR_H(tessedit_accuracyvspeed, 0,
|
||
|
"Accuracy V Speed tradeoff: 0 fastest, 100 most accurate"
|
||
|
" (Values from AccuracyVSpeed enum in baseapi.h)");
|
||
|
BOOL_VAR_H(tessedit_train_from_boxes_word_level, false,
|
||
|
"Generate training data from boxed chars at word level.");
|
||
|
STRING_VAR_H(tessedit_char_blacklist, "",
|
||
|
"Blacklist of chars not to recognize");
|
||
|
STRING_VAR_H(tessedit_char_whitelist, "",
|
||
|
"Whitelist of chars to recognize");
|
||
|
BOOL_VAR_H(global_tessedit_ambigs_training, false,
|
||
|
"Perform training for ambiguities");
|
||
|
//// ambigsrecog.cpp /////////////////////////////////////////////////////////
|
||
|
FILE *init_ambigs_training(const STRING &fname);
|
||
|
void ambigs_training_segmented(const STRING &fname,
|
||
|
PAGE_RES *page_res,
|
||
|
volatile ETEXT_DESC *monitor,
|
||
|
FILE *output_file);
|
||
|
void ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
|
||
|
const char *label,
|
||
|
FILE *output_file);
|
||
|
private:
|
||
|
Pix* pix_binary_;
|
||
|
FCOORD deskew_;
|
||
|
FCOORD reskew_;
|
||
|
bool hindi_image_;
|
||
|
};
|
||
|
|
||
|
} // namespace tesseract
|
||
|
|
||
|
|
||
|
#endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__
|