mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 19:19:05 +08:00
c4f4840fbe
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@163 d0cd1f9f-072b-0410-8dd7-cf729c803f20
285 lines
9.7 KiB
C++
285 lines
9.7 KiB
C++
/**********************************************************************
|
|
* File: rejctmap.h (Formerly rejmap.h)
|
|
* Description: REJ and REJMAP class functions.
|
|
* Author: Phil Cheatle
|
|
* Created: Thu Jun 9 13:46:38 BST 1994
|
|
*
|
|
* (C) Copyright 1994, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
|
|
This module may look unneccessarily verbose, but here's the philosophy...
|
|
|
|
ALL processing of the reject map is done in this module. There are lots of
|
|
separate calls to set reject/accept flags. These have DELIBERATELY been kept
|
|
distinct so that this module can decide what to do.
|
|
|
|
Basically, there is a flag for each sort of rejection or acceptance. This
|
|
provides a history of what has happened to EACH character.
|
|
|
|
Determining whether a character is CURRENTLY rejected depends on implicit
|
|
understanding of the SEQUENCE of possible calls. The flags are defined and
|
|
grouped in the REJ_FLAGS enum. These groupings are used in determining a
|
|
characters CURRENT rejection status. Basically, a character is ACCEPTED if
|
|
|
|
none of the permanent rej flags are set
|
|
AND ( the character has never been rejected
|
|
OR an accept flag is set which is LATER than the latest reject flag )
|
|
|
|
IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
|
|
OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
|
|
**********************************************************************/
|
|
|
|
#ifndef REJCTMAP_H
|
|
#define REJCTMAP_H
|
|
|
|
#ifdef __UNIX__
|
|
#include <assert.h>
|
|
#endif
|
|
#include "memry.h"
|
|
#include "bits16.h"
|
|
#include "varable.h"
|
|
#include "notdll.h"
|
|
|
|
extern BOOL_VAR_H (rejword_only_set_if_accepted, TRUE,
|
|
"Mimic old reject_word");
|
|
extern BOOL_VAR_H (rejmap_allow_more_good_qual, FALSE,
|
|
"Use initial good qual setting");
|
|
extern BOOL_VAR_H (rej_use_1Il_rej, TRUE, "1Il rejection enabled");
|
|
|
|
enum REJ_FLAGS
|
|
{
|
|
/* Reject modes which are NEVER overridden */
|
|
R_TESS_FAILURE, // PERM Tess didnt classify
|
|
R_SMALL_XHT, // PERM Xht too small
|
|
R_EDGE_CHAR, // PERM Too close to edge of image
|
|
R_1IL_CONFLICT, // PERM 1Il confusion
|
|
R_POSTNN_1IL, // PERM 1Il unrejected by NN
|
|
R_REJ_CBLOB, // PERM Odd blob
|
|
R_MM_REJECT, // PERM Matrix match rejection (m's)
|
|
R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend
|
|
|
|
/* Initial reject modes (pre NN_ACCEPT) */
|
|
R_POOR_MATCH, // TEMP Ray's original heuristic (Not used)
|
|
R_NOT_TESS_ACCEPTED, // TEMP Tess didnt accept WERD
|
|
R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD
|
|
R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD
|
|
|
|
/* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */
|
|
R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop
|
|
R_DUBIOUS, // TEMP Post NN dodgy chars
|
|
R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN
|
|
R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest
|
|
R_XHT_FIXUP, // TEMP Xht tests unsure
|
|
|
|
/* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */
|
|
R_BAD_QUALITY, // TEMP Quality metrics bad for WERD
|
|
|
|
/* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accep*/
|
|
R_DOC_REJ, // TEMP Document rejection
|
|
R_BLOCK_REJ, // TEMP Block rejection
|
|
R_ROW_REJ, // TEMP Row rejection
|
|
R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space
|
|
|
|
/* Accept modes which occur inbetween the above rejection groups */
|
|
R_NN_ACCEPT, //NN acceptance
|
|
R_HYPHEN_ACCEPT, //Hyphen acceptance
|
|
R_MM_ACCEPT, //Matrix match acceptance
|
|
R_QUALITY_ACCEPT, //Accept word in good quality doc
|
|
R_MINIMAL_REJ_ACCEPT //Accept EVERYTHING except tess failures
|
|
};
|
|
|
|
/* REJECT MAP VALUES */
|
|
|
|
#define MAP_ACCEPT '1'
|
|
#define MAP_REJECT_PERM '0'
|
|
#define MAP_REJECT_TEMP '2'
|
|
#define MAP_REJECT_POTENTIAL '3'
|
|
|
|
class REJ
|
|
{
|
|
BITS16 flags1;
|
|
BITS16 flags2;
|
|
|
|
void set_flag(REJ_FLAGS rej_flag) {
|
|
if (rej_flag < 16)
|
|
flags1.turn_on_bit (rej_flag);
|
|
else
|
|
flags2.turn_on_bit (rej_flag - 16);
|
|
}
|
|
|
|
BOOL8 rej_before_nn_accept();
|
|
BOOL8 rej_between_nn_and_mm();
|
|
BOOL8 rej_between_mm_and_quality_accept();
|
|
BOOL8 rej_between_quality_and_minimal_rej_accept();
|
|
BOOL8 rej_before_mm_accept();
|
|
BOOL8 rej_before_quality_accept();
|
|
|
|
public:
|
|
REJ() { //constructor
|
|
}
|
|
|
|
REJ( //classwise copy
|
|
const REJ &source) {
|
|
flags1 = source.flags1;
|
|
flags2 = source.flags2;
|
|
}
|
|
|
|
REJ & operator= ( //assign REJ
|
|
const REJ & source) { //from this
|
|
flags1 = source.flags1;
|
|
flags2 = source.flags2;
|
|
return *this;
|
|
}
|
|
|
|
BOOL8 flag(REJ_FLAGS rej_flag) {
|
|
if (rej_flag < 16)
|
|
return flags1.bit (rej_flag);
|
|
else
|
|
return flags2.bit (rej_flag - 16);
|
|
}
|
|
|
|
char display_char() {
|
|
if (perm_rejected ())
|
|
return MAP_REJECT_PERM;
|
|
else if (accept_if_good_quality ())
|
|
return MAP_REJECT_POTENTIAL;
|
|
else if (rejected ())
|
|
return MAP_REJECT_TEMP;
|
|
else
|
|
return MAP_ACCEPT;
|
|
}
|
|
|
|
BOOL8 perm_rejected(); //Is char perm reject?
|
|
|
|
BOOL8 rejected(); //Is char rejected?
|
|
|
|
BOOL8 accepted() { //Is char accepted?
|
|
return !rejected ();
|
|
}
|
|
|
|
//potential rej?
|
|
BOOL8 accept_if_good_quality();
|
|
|
|
BOOL8 recoverable() {
|
|
return (rejected () && !perm_rejected ());
|
|
}
|
|
|
|
void setrej_tess_failure(); //Tess generated blank
|
|
void setrej_small_xht(); //Small xht char/wd
|
|
void setrej_edge_char(); //Close to image edge
|
|
void setrej_1Il_conflict(); //Initial reject map
|
|
void setrej_postNN_1Il(); //1Il after NN
|
|
void setrej_rej_cblob(); //Insert duff blob
|
|
void setrej_mm_reject(); //Matrix matcher
|
|
//Odd repeated char
|
|
void setrej_bad_repetition();
|
|
void setrej_poor_match(); //Failed Rays heuristic
|
|
//TEMP reject_word
|
|
void setrej_not_tess_accepted();
|
|
//TEMP reject_word
|
|
void setrej_contains_blanks();
|
|
void setrej_bad_permuter(); //POTENTIAL reject_word
|
|
void setrej_hyphen(); //PostNN dubious hyph or .
|
|
void setrej_dubious(); //PostNN dubious limit
|
|
void setrej_no_alphanums(); //TEMP reject_word
|
|
void setrej_mostly_rej(); //TEMP reject_word
|
|
void setrej_xht_fixup(); //xht fixup
|
|
void setrej_bad_quality(); //TEMP reject_word
|
|
void setrej_doc_rej(); //TEMP reject_word
|
|
void setrej_block_rej(); //TEMP reject_word
|
|
void setrej_row_rej(); //TEMP reject_word
|
|
void setrej_unlv_rej(); //TEMP reject_word
|
|
void setrej_nn_accept(); //NN Flipped a char
|
|
void setrej_hyphen_accept(); //Good aspect ratio
|
|
void setrej_mm_accept(); //Matrix matcher
|
|
//Quality flip a char
|
|
void setrej_quality_accept();
|
|
//Accept all except blank
|
|
void setrej_minimal_rej_accept();
|
|
|
|
void full_print(FILE *fp);
|
|
};
|
|
|
|
class REJMAP
|
|
{
|
|
REJ *ptr; //ptr to the chars
|
|
inT16 len; //Number of chars
|
|
|
|
public:
|
|
REJMAP() { //constructor
|
|
ptr = NULL;
|
|
len = 0;
|
|
}
|
|
|
|
REJMAP( //classwise copy
|
|
const REJMAP &rejmap);
|
|
|
|
REJMAP & operator= ( //assign REJMAP
|
|
const REJMAP & source); //from this
|
|
|
|
~REJMAP () { //destructor
|
|
if (ptr != NULL)
|
|
free_struct (ptr, len * sizeof (REJ), "REJ");
|
|
}
|
|
|
|
void initialise( //Redefine map
|
|
inT16 length);
|
|
|
|
REJ & operator[]( //access function
|
|
inT16 index) const //map index
|
|
{
|
|
ASSERT_HOST (index < len);
|
|
return ptr[index]; //no bounds checks
|
|
}
|
|
|
|
inT32 length() const { //map length
|
|
return len;
|
|
}
|
|
|
|
inT16 accept_count(); //How many accepted?
|
|
|
|
inT16 reject_count() { //How many rejects?
|
|
return len - accept_count ();
|
|
}
|
|
|
|
void remove_pos( //Cut out an element
|
|
inT16 pos); //element to remove
|
|
|
|
void print(FILE *fp);
|
|
|
|
void full_print(FILE *fp);
|
|
|
|
BOOL8 recoverable_rejects(); //Any non perm rejs?
|
|
|
|
BOOL8 quality_recoverable_rejects();
|
|
//Any potential rejs?
|
|
|
|
void rej_word_small_xht(); //Reject whole word
|
|
//Reject whole word
|
|
void rej_word_tess_failure();
|
|
void rej_word_not_tess_accepted();
|
|
//Reject whole word
|
|
//Reject whole word
|
|
void rej_word_contains_blanks();
|
|
//Reject whole word
|
|
void rej_word_bad_permuter();
|
|
void rej_word_xht_fixup(); //Reject whole word
|
|
//Reject whole word
|
|
void rej_word_no_alphanums();
|
|
void rej_word_mostly_rej(); //Reject whole word
|
|
void rej_word_bad_quality(); //Reject whole word
|
|
void rej_word_doc_rej(); //Reject whole word
|
|
void rej_word_block_rej(); //Reject whole word
|
|
void rej_word_row_rej(); //Reject whole word
|
|
};
|
|
#endif
|