/* Source provenance (repository mirror page metadata):
 *   mirror of https://github.com/tesseract-ocr/tesseract.git
 *   synced 2024-12-19 03:54:10 +08:00
 *   7870d67c21
 *   git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@157 d0cd1f9f-072b-0410-8dd7-cf729c803f20
 *   156 lines, 7.0 KiB, C
 */
/******************************************************************
 * File:        docqual.h  (Formerly docqual.h)
 * Description: Document Quality Metrics
 * Author:      Phil Cheatle
 * Created:     Mon May 9 11:27:28 BST 1994
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

// Include guard; it is closed by the #endif at the end of this header.
#ifndef DOCQUAL_H
#define DOCQUAL_H

// Project headers.
// NOTE(review): presumably these (directly or transitively) provide the
// *_VAR_H macros and the WERD_RES / ROW / PAGE_RES_IT / inT16 / BOOL8 names
// used by the declarations in this file -- confirm.
#include "control.h"
#include "notdll.h"

// Garbage classification of a word.
// This type is returned by garbage_word() and taken as a parameter by
// terrible_word_crunch() and potential_word_crunch(), all declared in this
// header.
// NOTE(review): the enumerators look like they are ordered from least to
// most suspicious; confirm against the implementation before relying on
// ordered comparisons.
enum GARBAGE_LEVEL
{
  G_NEVER_CRUNCH,
  G_OK,
  G_DODGY,
  G_TERRIBLE
};

extern STRING_VAR_H (outlines_odd, "%| ", "Non standard number of outlines");
|
|
extern STRING_VAR_H (outlines_2, "ij!?%\":;",
|
|
"Non standard number of outlines");
|
|
extern BOOL_VAR_H (docqual_excuse_outline_errs, FALSE,
|
|
"Allow outline errs in unrejection?");
|
|
extern BOOL_VAR_H (tessedit_good_quality_unrej, TRUE,
|
|
"Reduce rejection on good docs");
|
|
extern BOOL_VAR_H (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
|
|
extern double_VAR_H (tessedit_reject_doc_percent, 65.00,
|
|
"%rej allowed before rej whole doc");
|
|
extern double_VAR_H (tessedit_reject_block_percent, 45.00,
|
|
"%rej allowed before rej whole block");
|
|
extern double_VAR_H (tessedit_reject_row_percent, 40.00,
|
|
"%rej allowed before rej whole row");
|
|
extern double_VAR_H (tessedit_whole_wd_rej_row_percent, 70.00,
|
|
"%of row rejects in whole word rejects which prevents whole row rejection");
|
|
extern BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds, TRUE,
|
|
"Only rej partially rejected words in block rejection");
|
|
extern BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds, TRUE,
|
|
"Only rej partially rejected words in row rejection");
|
|
extern BOOL_VAR_H (tessedit_dont_blkrej_good_wds, FALSE,
|
|
"Use word segmentation quality metric");
|
|
extern BOOL_VAR_H (tessedit_dont_rowrej_good_wds, FALSE,
|
|
"Use word segmentation quality metric");
|
|
extern INT_VAR_H (tessedit_preserve_min_wd_len, 2,
|
|
"Only preserve wds longer than this");
|
|
extern BOOL_VAR_H (tessedit_row_rej_good_docs, TRUE,
|
|
"Apply row rejection to good docs");
|
|
extern double_VAR_H (tessedit_good_doc_still_rowrej_wd, 1.1,
|
|
"rej good doc wd if more than this fraction rejected");
|
|
extern BOOL_VAR_H (tessedit_reject_bad_qual_wds, TRUE,
|
|
"Reject all bad quality wds");
|
|
extern BOOL_VAR_H (tessedit_debug_doc_rejection, FALSE, "Page stats");
|
|
extern BOOL_VAR_H (tessedit_debug_quality_metrics, FALSE,
|
|
"Output data to debug file");
|
|
extern BOOL_VAR_H (bland_unrej, FALSE, "unrej potential with no chekcs");
|
|
extern double_VAR_H (quality_rowrej_pc, 1.1,
|
|
"good_quality_doc gte good char limit");
|
|
extern BOOL_VAR_H (unlv_tilde_crunching, TRUE,
|
|
"Mark v.bad words for tilde crunch");
|
|
extern BOOL_VAR_H (crunch_early_merge_tess_fails, TRUE,
|
|
"Before word crunch?");
|
|
extern BOOL_VAR_H (crunch_early_convert_bad_unlv_chs, FALSE,
|
|
"Take out ~^ early?");
|
|
extern double_VAR_H (crunch_terrible_rating, 80.0, "crunch rating lt this");
|
|
extern BOOL_VAR_H (crunch_terrible_garbage, TRUE, "As it says");
|
|
extern double_VAR_H (crunch_poor_garbage_cert, -9.0,
|
|
"crunch garbage cert lt this");
|
|
extern double_VAR_H (crunch_poor_garbage_rate, 60,
|
|
"crunch garbage rating lt this");
|
|
extern double_VAR_H (crunch_pot_poor_rate, 40,
|
|
"POTENTIAL crunch rating lt this");
|
|
extern double_VAR_H (crunch_pot_poor_cert, -8.0,
|
|
"POTENTIAL crunch cert lt this");
|
|
extern BOOL_VAR_H (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
|
|
extern double_VAR_H (crunch_del_rating, 60,
|
|
"POTENTIAL crunch rating lt this");
|
|
extern double_VAR_H (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
|
|
extern double_VAR_H (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
|
|
extern double_VAR_H (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
|
|
extern double_VAR_H (crunch_del_min_width, 3.0,
|
|
"Del if word width lt xht x this");
|
|
extern double_VAR_H (crunch_del_high_word, 1.5,
|
|
"Del if word gt xht x this above bl");
|
|
extern double_VAR_H (crunch_del_low_word, 0.5,
|
|
"Del if word gt xht x this below bl");
|
|
extern double_VAR_H (crunch_small_outlines_size, 0.6,
|
|
"Small if lt xht x this");
|
|
extern INT_VAR_H (crunch_rating_max, 10, "For adj length in rating per ch");
|
|
extern INT_VAR_H (crunch_pot_indicators, 1,
|
|
"How many potential indicators needed");
|
|
extern BOOL_VAR_H (crunch_leave_ok_strings, TRUE,
|
|
"Dont touch sensible strings");
|
|
extern BOOL_VAR_H (crunch_accept_ok, TRUE, "Use acceptability in okstring");
|
|
extern BOOL_VAR_H (crunch_leave_accept_strings, FALSE,
|
|
"Dont pot crunch sensible strings");
|
|
extern BOOL_VAR_H (crunch_include_numerals, FALSE, "Fiddle alpha figures");
|
|
extern INT_VAR_H (crunch_leave_lc_strings, 4,
|
|
"Dont crunch words with long lower case strings");
|
|
extern INT_VAR_H (crunch_leave_uc_strings, 4,
|
|
"Dont crunch words with long lower case strings");
|
|
extern INT_VAR_H (crunch_long_repetitions, 3,
|
|
"Crunch words with long repetitions");
|
|
extern INT_VAR_H (crunch_debug, 0, "As it says");
|
|
inT16 word_blob_quality( //Blob seg changes
|
|
WERD_RES *word,
|
|
ROW *row);
|
|
BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2);
|
|
inT16 word_outline_errs( //Outline count errs
|
|
WERD_RES *word);
|
|
void word_char_quality( //Blob seg changes
|
|
WERD_RES *word,
|
|
ROW *row,
|
|
inT16 *match_count,
|
|
inT16 *accepted_match_count);
|
|
void unrej_good_chs(WERD_RES *word, ROW *row);
|
|
void print_boxes(WERD *word);
|
|
inT16 count_outline_errs(char c, inT16 outline_count);
|
|
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc);
|
|
void unrej_good_quality_words( //unreject potential
|
|
PAGE_RES_IT &page_res_it);
|
|
void doc_and_block_rejection( //reject big chunks
|
|
PAGE_RES_IT &page_res_it,
|
|
BOOL8 good_quality_doc);
|
|
void reject_whole_page(PAGE_RES_IT &page_res_it);
|
|
void tilde_crunch(PAGE_RES_IT &page_res_it);
|
|
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
|
|
BOOL8 potential_word_crunch(WERD_RES *word,
|
|
GARBAGE_LEVEL garbage_level,
|
|
BOOL8 ok_dict_word);
|
|
void tilde_delete(PAGE_RES_IT &page_res_it);
|
|
//word to do
|
|
void convert_bad_unlv_chs(WERD_RES *word_res);
|
|
//word to do
|
|
void merge_tess_fails(WERD_RES *word_res);
|
|
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
|
|
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
|
|
inT16 failure_count(WERD_RES *word);
|
|
BOOL8 noise_outlines(WERD *word);
|
|
//word to do
|
|
void insert_rej_cblobs(WERD_RES *word);
|
|
#endif  // DOCQUAL_H