mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
7870d67c21
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@157 d0cd1f9f-072b-0410-8dd7-cf729c803f20
199 lines
9.5 KiB
C
199 lines
9.5 KiB
C
/**********************************************************************
 * File:        control.h  (Formerly control.h)
 * Description: Module-independent matcher controller.
 * Author:      Ray Smith
 * Created:     Thu Apr 23 11:09:58 BST 1992
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
#ifndef CONTROL_H
|
|
#define CONTROL_H
|
|
|
|
#include "varable.h"
|
|
#include "ocrblock.h"
|
|
//#include "epapdest.h"
|
|
#include "ratngs.h"
|
|
#include "statistc.h"
|
|
//#include "epapconv.h"
|
|
#include "ocrshell.h"
|
|
#include "pageres.h"
|
|
//TODO (wanke) why does the app. path have to be so weird here?
|
|
#include "charsample.h"
|
|
#include "notdll.h"
|
|
|
|
// Letter-case classification of a word. This is the return type of
// acceptable_word_string() declared later in this header.
// The enumerators take the implicit values 0..5 in the order listed.
enum ACCEPTABLE_WERD_TYPE
{
  AC_UNACCEPTABLE,               //Unacceptable word
  AC_LOWER_CASE,                 //ALL lower case
  AC_UPPER_CASE,                 //ALL upper case
  AC_INITIAL_CAP,                //ALL but initial lc
  AC_LC_ABBREV,                  //a.b.c.
  AC_UC_ABBREV                   //A.B.C.
};
|
|
|
|
// Pointer to a callback taking a blob, an iterator over blob choices and an
// opaque void* argument, and returning a BOOL8.
// NOTE(review): the name suggests the callback decides whether a blob is
// rejected; the meaning of the return value is not visible here - confirm
// against the callers.
typedef BOOL8 (*BLOB_REJECTOR) (PBLOB *, BLOB_CHOICE_IT *, void *);
|
|
|
|
extern INT_VAR_H (tessedit_single_match, FALSE, "Top choice only from CP");
|
|
//extern BOOL_VAR_H(tessedit_small_match,FALSE,"Use small matrix matcher");
|
|
extern BOOL_VAR_H (tessedit_print_text, FALSE, "Write text to stdout");
|
|
extern BOOL_VAR_H (tessedit_draw_words, FALSE, "Draw source words");
|
|
extern BOOL_VAR_H (tessedit_draw_outwords, FALSE, "Draw output words");
|
|
extern BOOL_VAR_H (tessedit_training_wiseowl, FALSE,
|
|
"Call WO to learn blobs");
|
|
extern BOOL_VAR_H (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
|
|
extern BOOL_VAR_H (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
|
|
extern BOOL_VAR_H (tessedit_dump_choices, FALSE, "Dump char choices");
|
|
extern BOOL_VAR_H (tessedit_fix_fuzzy_spaces, TRUE,
|
|
"Try to improve fuzzy spaces");
|
|
extern BOOL_VAR_H (tessedit_unrej_any_wd, FALSE,
|
|
"Dont bother with word plausibility");
|
|
extern BOOL_VAR_H (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
|
|
extern BOOL_VAR_H (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
|
|
extern BOOL_VAR_H (tessedit_reject_suspect_fullstops, FALSE,
|
|
"Reject suspect fullstops");
|
|
extern BOOL_VAR_H (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
|
|
extern BOOL_VAR_H (tessedit_cluster_adaption_on, TRUE,
|
|
"Do our own adaption - ems only");
|
|
extern BOOL_VAR_H (tessedit_enable_doc_dict, TRUE,
|
|
"Add words to the document dictionary");
|
|
extern BOOL_VAR_H (word_occ_first, FALSE, "Do word occ before re-est xht");
|
|
extern BOOL_VAR_H (tessedit_xht_fiddles_on_done_wds, TRUE,
|
|
"Apply xht fix up even if done");
|
|
extern BOOL_VAR_H (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
|
|
"Apply xht fix up even in no rejects");
|
|
extern INT_VAR_H (x_ht_check_word_occ, 2, "Check Char Block occupancy");
|
|
extern INT_VAR_H (x_ht_stringency, 1, "How many confirmed a/n to accept?");
|
|
extern BOOL_VAR_H (x_ht_quality_check, TRUE, "Dont allow worse quality");
|
|
extern BOOL_VAR_H (tessedit_debug_block_rejection, FALSE,
|
|
"Block and Row stats");
|
|
extern INT_VAR_H (debug_x_ht_level, 0, "Reestimate debug");
|
|
extern BOOL_VAR_H (rej_use_xht, TRUE, "Individual rejection control");
|
|
extern BOOL_VAR_H (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
|
|
extern STRING_VAR_H (chs_leading_punct, "('`\"", "Leading punctuation");
|
|
extern
|
|
STRING_VAR_H (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
|
|
extern STRING_VAR_H (chs_trailing_punct2, ")'`\"",
|
|
"2nd Trailing punctuation");
|
|
extern double_VAR_H (quality_rej_pc, 0.08,
|
|
"good_quality_doc lte rejection limit");
|
|
extern double_VAR_H (quality_blob_pc, 0.0,
|
|
"good_quality_doc gte good blobs limit");
|
|
extern double_VAR_H (quality_outline_pc, 1.0,
|
|
"good_quality_doc lte outline error limit");
|
|
extern double_VAR_H (quality_char_pc, 0.95,
|
|
"good_quality_doc gte good char limit");
|
|
extern INT_VAR_H (quality_min_initial_alphas_reqd, 2,
|
|
"alphas in a good word");
|
|
extern BOOL_VAR_H (tessedit_tess_adapt_to_rejmap, FALSE,
|
|
"Use reject map to control Tesseract adaption");
|
|
extern INT_VAR_H (tessedit_tess_adaption_mode, 3,
|
|
"Adaptation decision algorithm for tess");
|
|
extern INT_VAR_H (tessedit_em_adaption_mode, 62,
|
|
"Adaptation decision algorithm for ems matrix matcher");
|
|
extern BOOL_VAR_H (tessedit_cluster_adapt_after_pass1, FALSE,
|
|
"Adapt using clusterer after pass 1");
|
|
extern BOOL_VAR_H (tessedit_cluster_adapt_after_pass2, FALSE,
|
|
"Adapt using clusterer after pass 1");
|
|
extern BOOL_VAR_H (tessedit_cluster_adapt_after_pass3, FALSE,
|
|
"Adapt using clusterer after pass 1");
|
|
extern BOOL_VAR_H (tessedit_cluster_adapt_before_pass1, FALSE,
|
|
"Adapt using clusterer before Tess adaping during pass 1");
|
|
extern INT_VAR_H (tessedit_cluster_adaption_mode, 0,
|
|
"Adaptation decision algorithm for matrix matcher");
|
|
extern BOOL_VAR_H (tessedit_adaption_debug, FALSE,
|
|
"Generate and print debug information for adaption");
|
|
extern BOOL_VAR_H (tessedit_minimal_rej_pass1, FALSE,
|
|
"Do minimal rejection on pass 1 output");
|
|
extern BOOL_VAR_H (tessedit_test_adaption, FALSE,
|
|
"Test adaption criteria");
|
|
extern BOOL_VAR_H (tessedit_global_adaption, FALSE,
|
|
"Adapt to all docs over time");
|
|
extern BOOL_VAR_H (tessedit_matcher_log, FALSE, "Log matcher activity");
|
|
extern INT_VAR_H (tessedit_test_adaption_mode, 3,
|
|
"Adaptation decision algorithm for tess");
|
|
extern BOOL_VAR_H (test_pt, FALSE, "Test for point");
|
|
extern double_VAR_H (test_pt_x, 99999.99, "xcoord");
|
|
extern double_VAR_H (test_pt_y, 99999.99, "ycoord");
|
|
// Recognize blobs.
//   block_list    - blocks to check
//   selection_box - passed by non-const reference; presumably the region
//                   whose blobs are recognized (confirm against the
//                   definition).
void recog_pseudo_word(BLOCK_LIST *block_list,  //blocks to check
                       TBOX &selection_box);
|
|
// Recognize blobs for a single word.
//   (unnamed) - block
//   row       - row of word
//   word      - word to recognize
// Returns a BOOL8; its meaning is not documented in this header.
BOOL8 recog_interactive(BLOCK *,     //block
                        ROW *row,    //row of word
                        WERD *word); //word to recognize
|
|
// Process words.
//   page_res        - page structure
//   monitor         - progress monitor
//   target_word_box - optional; defaults to a null pointer
//   dopasses        - optional; defaults to 0
// NOTE(review): the effect of the two optional arguments is not described
// here; see the definition.
void recog_all_words(PAGE_RES *page_res,            //page structure
                     volatile ETEXT_DESC *monitor,  //progress monitor
                     TBOX *target_word_box=0L,
                     inT16 dopasses=0);
|
|
|
|
// Recog one word (pass 1).
//   word          - word to do
//   row           - row passed along with the word
//   cluster_adapt - BOOL8 flag; presumably enables cluster adaption (confirm)
//   char_clusters - list of CHAR_SAMPLES
//   chars_waiting - list of CHAR_SAMPLE
void classify_word_pass1(WERD_RES *word,  //word to do
                         ROW *row,
                         BOOL8 cluster_adapt,
                         CHAR_SAMPLES_LIST *char_clusters,
                         CHAR_SAMPLE_LIST *chars_waiting);
|
|
//word to do
|
|
void classify_word_pass2(WERD_RES *word, ROW *row);
|
|
// Recog one word.
//   word     - word to do
//   row      - row passed along with the word
//   x_height - x-height value supplied by the caller
void match_word_pass2(WERD_RES *word,  //word to do
                      ROW *row,
                      float x_height);
|
|
// Handle a "repeated char" word.
//   word - word to do
void fix_rep_char(WERD_RES *word);  //word to do
|
|
// Make double quotes.
//   choice       - string to fix
//   word         - word to do
//   blob_choices - char choices
void fix_quotes(WERD_CHOICE *choice,                    //string to fix
                WERD *word,                             //word to do
                BLOB_CHOICE_LIST_CLIST *blob_choices);  //char choices
|
|
// Crunch double hyphens.
//   choice       - string to fix
//   word         - word to do
//   blob_choices - char choices
void fix_hyphens(WERD_CHOICE *choice,                    //string to fix
                 WERD *word,                             //word to do
                 BLOB_CHOICE_LIST_CLIST *blob_choices);  //char choices
|
|
// Combine 2 blobs.
//   blob1 - dest blob
//   blob2 - source blob
void merge_blobs(PBLOB *blob1,   //dest blob
                 PBLOB *blob2);  //source blob
|
|
// Dump chars in word.
//   (unnamed PBLOB*)  - blob
//   (unnamed DENORM*) - de-normaliser
//   correct           - correctly segmented
//   text              - correct text
//   count             - chars in text
//   ratings           - list of results
void choice_dump_tester(PBLOB *,                     //blob
                        DENORM *,                    //de-normaliser
                        BOOL8 correct,               //correctly segmented
                        char *text,                  //correct text
                        inT32 count,                 //chars in text
                        BLOB_CHOICE_LIST *ratings);  //list of results
|
|
// Returns a WERD* derived from src_word, given its row, an x-height and a
// DENORM pointer.
// NOTE(review): "bln" presumably means baseline-normalised, and the result is
// presumably a new copy; ownership of the returned word is not stated here -
// confirm against the definition.
WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm);
|
|
// Classify a word string as one of the ACCEPTABLE_WERD_TYPE values.
//   s       - the word string (read-only)
//   lengths - companion read-only string
// NOTE(review): lengths presumably holds per-character byte lengths for s;
// confirm against the definition.
ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
                                            const char *lengths);
|
|
// Debug helper taking a word result and an integer location code; returns a
// BOOL8.
// NOTE(review): presumably tests the word against the test_pt / test_pt_x /
// test_pt_y variables declared above; the meaning of location is not visible
// in this header - confirm against the definition.
BOOL8 check_debug_pt(WERD_RES *word, int location);
|
|
// Font handling for the good chars in a word.
//   word         - word to adapt to
//   blob_choices - detailed results
void set_word_fonts(WERD_RES *word,                         //word to adapt to
                    BLOB_CHOICE_LIST_CLIST *blob_choices);  //detailed results
|
|
// Font recognition pass over the page results.
//   page_res_it - page-result iterator, passed by non-const reference
// NOTE(review): the original summary comment here read "good chars in word",
// the same text used on set_word_fonts; it may have been copied - confirm.
void font_recognition_pass(PAGE_RES_IT &page_res_it);
|
|
// Add one row's contribution to the font statistics.
//   row    - current row
//   fonts  - font stats
//   italic - output count
//   bold   - output count
// NOTE(review): the original summary comment here read "good chars in word",
// the same text used on set_word_fonts; it may have been copied - confirm.
void add_in_one_row(ROW_RES *row,  //current row
                    STATS *fonts,  //font stats
                    inT8 *italic,  //output count
                    inT8 *bold);   //output count
|
|
// Find the modal font in the font statistics.
//   fonts      - font stats
//   font_out   - output font
//   font_count - output count
// NOTE(review): the original summary comment here read "good chars in word",
// the same text used on set_word_fonts; it may have been copied - confirm.
void find_modal_font(STATS *fonts,       //font stats
                     inT8 *font_out,     //output font
                     inT8 *font_count);  //output count
|
|
#endif
|