tesseract/classify/intmatcher.h
Ray Smith 84920b92b3 Font and classifier output structure cleanup.
Font recognition was poor, due to forcing a 1st and 2nd choice at
a character level, when the total score for the correct font is often
correct at the word level, so allowed the propagation of a full set
of fonts and scores to the word recognizer, which can now decide word
level fonts using the scores instead of simple votes.

Change precipitated a cleanup of output data structures for classifier
results, eliminating ScoredClass and INT_RESULT_STRUCT, with a few
extra elements going in UnicharRating, and using that wherever possible.
That added the extra complexity of 1-rating due to a flip between 0 is
good and 0 is bad for the internal classifier scores before they are
converted to rating and certainty.
2015-05-12 17:24:34 -07:00

205 lines
7.1 KiB
C++

/******************************************************************************
** Filename: intmatcher.h
** Purpose: Interface to high level generic classifier routines.
** Author: Robert Moss
** History: Wed Feb 13 15:24:15 MST 1991, RWM, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef INTMATCHER_H
#define INTMATCHER_H
#include "params.h"
// Character fragments could be present in the trained templates
// but turned on/off on the language-by-language basis or depending
// on particular properties of the corpus (e.g. when we expect the
// images to have low exposure).
extern BOOL_VAR_H(disable_character_fragments, FALSE,
"Do not include character fragments in the"
" results of the classifier");
// Integer-matcher multiplier parameter. Its help text gives the range as
// 0-255; the second macro argument (10) is presumably the default value.
// NOTE(review): this looks like the value handed to
// IntegerMatcher::ApplyCNCorrection() as matcher_multiplier -- confirm
// against the callers.
extern INT_VAR_H(classify_integer_matcher_multiplier, 10,
"Integer Matcher Multiplier 0-255: ");
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "intproto.h"
#include "cutoffs.h"
// Forward declaration only: this header refers to tesseract::UnicharRating
// exclusively through pointers (see IntegerMatcher::Match and
// IntegerMatcher::FindBestMatch), so the full definition is not required.
namespace tesseract {
class UnicharRating;
}
// A (rating, class id) pair. A default-constructed instance has a rating
// of 0.0 and a class id of 0.
struct CP_RESULT_STRUCT {
  FLOAT32 Rating;   // Starts at 0.0f.
  CLASS_ID Class;   // Starts at 0.

  // Initializers are listed in member declaration order.
  CP_RESULT_STRUCT()
      : Rating(0.0f),
        Class(0) {
  }
};
/*----------------------------------------------------------------------------
Variables
-----------------------------------------------------------------------------*/
// Threshold (help text: 0-255) for deciding which protos count as "good"
// during adaptation.
// NOTE(review): presumably supplied as AdaptProtoThreshold to
// IntegerMatcher::FindGoodProtos() -- confirm against the callers.
extern INT_VAR_H(classify_adapt_proto_thresh, 230,
"Threshold for good protos during adaptive 0-255: ");
// Threshold (help text: 0-255) for deciding which features count as "good"
// during adaptation.
// NOTE(review): presumably supplied as AdaptFeatureThreshold to
// IntegerMatcher::Match() / FindBadFeatures() -- confirm against the callers.
extern INT_VAR_H(classify_adapt_feature_thresh, 230,
"Threshold for good features during adaptive 0-255: ");
/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
// Number of index bits of the similarity-to-evidence lookup table. The same
// value (9) is repeated in IntegerMatcher::kEvidenceTableBits.
#define SE_TABLE_BITS 9
// Number of entries in the similarity-to-evidence lookup table (the size of
// IntegerMatcher::similarity_evidence_table_). Derived from SE_TABLE_BITS so
// the two constants cannot drift apart; evaluates to 512.
#define SE_TABLE_SIZE (1 << SE_TABLE_BITS)
// Scratch buffers holding per-config and per-proto evidence. Instances are
// handed to IntegerMatcher::UpdateTablesForFeature() (as a writable pointer)
// and to IntegerMatcher::FindBestMatch() and the debug display routines (as
// a const reference). The member functions are only declared here; their
// definitions live outside this header.
struct ScratchEvidence {
// One evidence byte per config.
uinT8 feature_evidence_[MAX_NUM_CONFIGS];
// One integer accumulator per config.
// NOTE(review): presumably the running sum of feature_evidence_ over all
// features -- confirm in the implementation.
int sum_feature_evidence_[MAX_NUM_CONFIGS];
// Evidence bytes indexed by [proto][proto index].
uinT8 proto_evidence_[MAX_NUM_PROTOS][MAX_PROTO_INDEX];
// NOTE(review): by name, resets the buffers for the given class template;
// exactly which arrays are cleared is not visible from this header.
void Clear(const INT_CLASS class_template);
// NOTE(review): by name, resets only feature_evidence_ -- confirm.
void ClearFeatureEvidence(const INT_CLASS class_template);
// NOTE(review): by name, normalizes sum_feature_evidence_ using the feature
// counts supplied -- the exact formula is in the implementation.
void NormalizeSums(INT_CLASS ClassTemplate, inT16 NumFeatures,
inT32 used_features);
// NOTE(review): by name, folds proto_evidence_ into the per-config sums for
// the configs selected by ConfigMask -- confirm in the implementation.
void UpdateSumOfProtoEvidences(
INT_CLASS ClassTemplate, BIT_VECTOR ConfigMask, inT16 NumFeatures);
};
class IntegerMatcher {
public:
// Integer Matcher Theta Fudge (0-255).
static const int kIntThetaFudge = 128;
// Bits in Similarity to Evidence Lookup (8-9).
static const int kEvidenceTableBits = 9;
// Integer Evidence Truncation Bits (8-14).
static const int kIntEvidenceTruncBits = 14;
// Similarity to Evidence Table Exponential Multiplier.
static const float kSEExponentialMultiplier;
// Center of Similarity Curve.
static const float kSimilarityCenter;
IntegerMatcher() : classify_debug_level_(0) {}
void Init(tesseract::IntParam *classify_debug_level);
void Match(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
inT16 NumFeatures,
const INT_FEATURE_STRUCT* Features,
tesseract::UnicharRating* Result,
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows);
// Applies the CN normalization factor to the given rating and returns
// the modified rating.
float ApplyCNCorrection(float rating, int blob_length,
int normalization_factor, int matcher_multiplier);
int FindGoodProtos(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
uinT16 BlobLength,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
PROTO_ID *ProtoArray,
int AdaptProtoThreshold,
int Debug);
int FindBadFeatures(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
uinT16 BlobLength,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
FEATURE_ID *FeatureArray,
int AdaptFeatureThreshold,
int Debug);
private:
int UpdateTablesForFeature(
INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
int FeatureNum,
const INT_FEATURE_STRUCT* Feature,
ScratchEvidence *evidence,
int Debug);
int FindBestMatch(INT_CLASS ClassTemplate,
const ScratchEvidence &tables,
tesseract::UnicharRating* Result);
#ifndef GRAPHICS_DISABLED
void DebugFeatureProtoError(
INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
const ScratchEvidence &tables,
inT16 NumFeatures,
int Debug);
void DisplayProtoDebugInfo(
INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
const ScratchEvidence &tables,
bool SeparateDebugWindows);
void DisplayFeatureDebugInfo(
INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
inT16 NumFeatures,
const INT_FEATURE_STRUCT* Features,
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows);
#endif
private:
uinT8 similarity_evidence_table_[SE_TABLE_SIZE];
uinT32 evidence_table_mask_;
uinT32 mult_trunc_shift_bits_;
uinT32 table_trunc_shift_bits_;
tesseract::IntParam *classify_debug_level_;
uinT32 evidence_mult_mask_;
};
/**----------------------------------------------------------------------------
Private Function Prototypes
----------------------------------------------------------------------------**/
// Debug helper; only the prototype is visible here and the definition lives
// elsewhere.
// NOTE(review): FeatureNum is declared as INT_FEATURE, whereas
// IntegerMatcher::UpdateTablesForFeature() takes its FeatureNum as int --
// verify the parameter type against the definition.
void IMDebugConfiguration(INT_FEATURE FeatureNum,
uinT16 ActualProtoNum,
uinT8 Evidence,
BIT_VECTOR ConfigMask,
uinT32 ConfigWord);
// Debug helper; only the prototype is visible here and the definition lives
// elsewhere.
// NOTE(review): FeatureEvidence is presumably an array of ConfigCount
// evidence bytes -- confirm against the definition. FeatureNum is declared
// as INT_FEATURE rather than int here as well.
void IMDebugConfigurationSum(INT_FEATURE FeatureNum,
uinT8 *FeatureEvidence,
inT32 ConfigCount);
// Prototype for HeapSort; the definition lives elsewhere.
// NOTE(review): presumably sorts the n entries of ra and permutes rb in
// step with it -- confirm against the definition.
// The `register` storage-class specifier has been dropped from the
// parameters: it is deprecated in C++11 and ill-formed in C++17. It is not
// part of the function type, so this declaration still matches a definition
// that keeps the keyword.
void HeapSort(int n, int ra[], int rb[]);
/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
#endif