tesseract/classify/classify.h

389 lines
17 KiB
C
Raw Normal View History

///////////////////////////////////////////////////////////////////////
// File: classify.h
// Description: classify class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
#define TESSERACT_CLASSIFY_CLASSIFY_H__
#include "adaptive.h"
#include "ccstruct.h"
#include "classify.h"
#include "dict.h"
#include "featdefs.h"
#include "intfx.h"
#include "intmatcher.h"
#include "ratngs.h"
#include "ocrfeatures.h"
#include "unicity_table.h"
class ScrollView;
class WERD_CHOICE;
class WERD_RES;
struct ADAPT_RESULTS;
struct NORM_PROTOS;
namespace tesseract {
// Describes how well a blob has been segmented. "Character" is used below as
// shorthand for "classifiable unit", which is the precise but unwieldy term.
enum CharSegmentationType {
  CST_FRAGMENT = 0,  // Only part of a character.
  CST_WHOLE    = 1,  // Exactly one character, correctly segmented.
  CST_IMPROPER = 2,  // More than one character but fewer than two.
  CST_NGRAM    = 3   // Several characters together.
};
// Classify extends CCStruct with the classification interface. The member
// functions are grouped under banners that each name a .cpp file (presumably
// the file defining that group -- verify), followed by the tunable parameters
// and the state shared between those groups.
// NOTE: the order of the data members is significant: it fixes both the object
// layout and the order in which members are constructed. Do not reorder.
class Classify : public CCStruct {
public:
Classify();
virtual ~Classify();
// Returns a mutable reference to the Dict owned by this object (dict_).
Dict& getDict() {
return dict_;
}
// Set the denorm for classification. Takes a copy.
// denorm must not be NULL: it is dereferenced unconditionally.
void set_denorm(const DENORM* denorm) {
denorm_ = *denorm;
}
/* adaptive.cpp ************************************************************/
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
int ClassPruner(INT_TEMPLATES IntTemplates,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
CLASS_PRUNER_RESULTS Results);
void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
CLASS_CUTOFF_ARRAY Cutoffs);
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
/* normmatch.cpp ************************************************************/
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
void FreeNormProtos();
NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
/* protos.cpp ***************************************************************/
void ReadClassFile();
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
const UNICHARSET& target_unicharset);
/* adaptmatch.cpp ***********************************************************/
// Learn the given word using its chopped_word, seam_array, denorm,
// box_word, best_state, and correct_text to learn both correctly and
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// Otherwise AdaptToBlob is called for adaption within a document.
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
// be learned, otherwise all chars with good correct_text are learned.
void LearnWord(const char* filename, const char *rejmap, WERD_RES *word);
// Builds a blob of length fragments, from the word, starting at start,
// and then learn it, as having the given correct_text.
// If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// Otherwise AdaptToBlob is called for adaption within a document.
// threshold is a magic number required by AdaptToChar and generated by
// GetAdaptThresholds.
// Although it can be partly inferred from the string, segmentation is
// provided to explicitly clarify the character segmentation.
void LearnPieces(const char* filename, int start, int length,
float threshold, CharSegmentationType segmentation,
const char* correct_text, WERD_RES *word);
void InitAdaptiveClassifier(bool load_pre_trained_templates);
void InitAdaptedClass(TBLOB *Blob,
CLASS_ID ClassId,
ADAPT_CLASS Class,
ADAPT_TEMPLATES Templates);
void AdaptToPunc(TBLOB *Blob,
CLASS_ID ClassId,
FLOAT32 Threshold);
void AmbigClassifier(TBLOB *Blob,
INT_TEMPLATES Templates,
UNICHAR_ID *Ambiguities,
ADAPT_RESULTS *Results);
void MasterMatcher(INT_TEMPLATES templates,
inT16 num_features,
INT_FEATURE_ARRAY features,
CLASS_NORMALIZATION_ARRAY norm_factors,
ADAPT_CLASS* classes,
int debug,
int num_classes,
const TBOX& blob_box,
CLASS_PRUNER_RESULTS results,
ADAPT_RESULTS* final_results);
void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
BLOB_CHOICE_LIST *Choices);
void AddNewResult(ADAPT_RESULTS *Results,
CLASS_ID ClassId,
FLOAT32 Rating,
int ConfigId,
int config2);
int GetAdaptiveFeatures(TBLOB *Blob,
INT_FEATURE_ARRAY IntFeatures,
FEATURE_SET *FloatFeatures);
// Declared only when GRAPHICS_DISABLED is not defined.
#ifndef GRAPHICS_DISABLED
void DebugAdaptiveClassifier(TBLOB *Blob,
ADAPT_RESULTS *Results);
#endif
void GetAdaptThresholds (TWERD * Word,
const WERD_CHOICE& BestChoice,
const WERD_CHOICE& BestRawChoice,
FLOAT32 Thresholds[]);
PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
int NumBadFeat,
FEATURE_ID BadFeat[],
INT_CLASS IClass,
ADAPT_CLASS Class,
BIT_VECTOR TempProtoMask);
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
CLASS_ID ClassId,
int NumFeatures,
INT_FEATURE_ARRAY Features,
FEATURE_SET FloatFeatures);
void MakePermanent(ADAPT_TEMPLATES Templates,
CLASS_ID ClassId,
int ConfigId,
TBLOB *Blob);
void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
void RemoveExtraPuncs(ADAPT_RESULTS *Results);
void RemoveBadMatches(ADAPT_RESULTS *Results);
void SetAdaptiveThreshold(FLOAT32 Threshold);
void ShowBestMatchFor(TBLOB *Blob,
CLASS_ID ClassId,
BOOL8 AdaptiveOn,
BOOL8 PreTrainedOn);
UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
ADAPT_TEMPLATES Templates,
ADAPT_RESULTS *Results);
int CharNormClassifier(TBLOB *Blob,
INT_TEMPLATES Templates,
ADAPT_RESULTS *Results);
UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
CLASS_ID CorrectClass);
void DoAdaptiveMatch(TBLOB *Blob,
ADAPT_RESULTS *Results);
void AdaptToChar(TBLOB *Blob,
CLASS_ID ClassId,
FLOAT32 Threshold);
int AdaptableWord(TWERD *Word,
const WERD_CHOICE &BestChoiceWord,
const WERD_CHOICE &RawChoiceWord);
void EndAdaptiveClassifier();
void PrintAdaptiveStatistics(FILE *File);
// NOTE: the "Settup" spelling is part of the declared names; keep it.
void SettupPass1();
void SettupPass2();
void AdaptiveClassifier(TBLOB *Blob,
BLOB_CHOICE_LIST *Choices,
CLASS_PRUNER_RESULTS cp_results);
void ClassifyAsNoise(ADAPT_RESULTS *Results);
void ResetAdaptiveClassifier();
int GetBaselineFeatures(TBLOB *Blob,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
inT32 *BlobLength);
FLOAT32 GetBestRatingFor(TBLOB *Blob,
CLASS_ID ClassId);
int GetCharNormFeatures(TBLOB *Blob,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
inT32 *BlobLength,
inT32 *FeatureOutlineIndex);
int GetIntBaselineFeatures(TBLOB *Blob,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
inT32 *BlobLength);
int GetIntCharNormFeatures(TBLOB *Blob,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
inT32 *BlobLength,
inT32 *FeatureOutlineArray);
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
void ResetFeaturesHaveBeenExtracted();
// Reports the classifier as full once at least one adaptation has failed,
// i.e. when NumAdaptationsFailed is greater than zero.
bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
/* float2int.cpp ************************************************************/
void ComputeIntCharNormArray(FEATURE NormFeature,
INT_TEMPLATES Templates,
CLASS_NORMALIZATION_ARRAY CharNormArray);
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
/* intproto.cpp *************************************************************/
INT_TEMPLATES ReadIntTemplates(FILE *File);
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
const UNICHARSET& target_unicharset);
CLASS_ID GetClassToDebug(const char *Prompt);
void ShowMatchDisplay();
/* font detection ***********************************************************/
// Accessors returning mutable references to the font tables held as
// members of this class (fontinfo_table_ and fontset_table_).
UnicityTable<FontInfo>& get_fontinfo_table() {
return fontinfo_table_;
}
UnicityTable<FontSet>& get_fontset_table() {
return fontset_table_;
}
/* mfoutline.cpp ***********************************************************/
void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale);
/* outfeat.cpp ***********************************************************/
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
/* picofeat.cpp ***********************************************************/
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
// Member variables.
// Parameters.
// The *_VAR_H macros are defined outside this file. Each invocation names a
// parameter, a default value and a description string; presumably it expands
// to a member declaration of the matching type -- confirm against the macro
// definitions before relying on the defaults shown here.
// NOTE(review): tessedit_single_match is an INT parameter but is given the
// boolean-style default FALSE.
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
INT_VAR_H(classify_debug_level, 0, "Classify debug level");
/* mfoutline.cpp ***********************************************************/
/* control knobs used to control normalization of outlines */
INT_VAR_H(classify_norm_method, character, "Normalization Method ...");
double_VAR_H(classify_char_norm_range, 0.2,
"Character Normalization Range ...");
double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
/* adaptmatch.cpp ***********************************************************/
BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
// NOTE(review): carries the same description string as
// classify_enable_learning above.
BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
"Use pre-adapted classifier templates");
BOOL_VAR_H(classify_save_adapted_templates, 0,
"Save adapted templates to a file");
BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)");
double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
INT_VAR_H(matcher_min_examples_for_prototyping, 3,
"Reliable Config Threshold");
INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
"Enable adaption even if the ambiguities have not been seen");
double_VAR_H(matcher_clustering_max_angle_delta, 0.015,
"Maximum angle delta for prototype clustering");
double_VAR_H(classify_misfit_junk_penalty, 0.0,
"Penalty to apply when a non-alnum is vertically out of "
"its expected textline position");
BOOL_VAR_H(classify_enable_int_fx, 1, "Enable integer fx");
BOOL_VAR_H(classify_enable_new_adapt_rules, 1, "Enable new adaptation rules");
double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
double_VAR_H(tessedit_class_miss_scale, 0.00390625,
"Scale factor for features not used");
INT_VAR_H(classify_adapt_proto_threshold, 230,
"Threshold for good protos during adaptive 0-255");
INT_VAR_H(classify_adapt_feature_threshold, 230,
"Threshold for good features during adaptive 0-255");
BOOL_VAR_H(disable_character_fragments, FALSE,
"Do not include character fragments in the"
" results of the classifier");
BOOL_VAR_H(matcher_debug_separate_windows, FALSE,
"Use two different windows for debugging the matching: "
"One for the protos and one for the features.");
STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
/* intmatcher.cpp **********************************************************/
INT_VAR_H(classify_class_pruner_threshold, 229,
"Class Pruner Threshold 0-255: ");
INT_VAR_H(classify_class_pruner_multiplier, 30,
"Class Pruner Multiplier 0-255: ");
INT_VAR_H(classify_cp_cutoff_strength, 7,
"Class Pruner CutoffStrength: ");
INT_VAR_H(classify_integer_matcher_multiplier, 14,
"Integer Matcher Multiplier 0-255: ");
// Use class variables to hold onto built-in templates and adapted templates.
INT_TEMPLATES PreTrainedTemplates;
ADAPT_TEMPLATES AdaptedTemplates;
// Create dummy proto and config masks for use with the built-in templates.
BIT_VECTOR AllProtosOn;
BIT_VECTOR PrunedProtos;
BIT_VECTOR AllConfigsOn;
BIT_VECTOR AllProtosOff;
BIT_VECTOR AllConfigsOff;
BIT_VECTOR TempProtoMask;
bool EnableLearning;
/* normmatch.cpp */
NORM_PROTOS *NormProtos;
/* font detection ***********************************************************/
// Backing storage for get_fontinfo_table() and get_fontset_table().
UnicityTable<FontInfo> fontinfo_table_;
UnicityTable<FontSet> fontset_table_;
INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
BOOL_VAR_H(classify_bln_numeric_mode, 0,
"Assume the input is numbers [0-9].");
protected:
IntegerMatcher im_;
FEATURE_DEFS_STRUCT feature_defs_;
// Must be set for the classifier to operate. Usually set in
// Tesseract::recog_word_recursive, being the main word-level entry point.
// Written by set_denorm(), which stores a copy of its argument here.
DENORM denorm_;
private:
// Exposed to callers by reference through getDict().
Dict dict_;
/* variables used to hold performance statistics */
int AdaptiveMatcherCalls;
int BaselineClassifierCalls;
int CharNormClassifierCalls;
int AmbigClassifierCalls;
int NumWordsAdaptedTo;
int NumCharsAdaptedTo;
int NumBaselineClassesTried;
int NumCharNormClassesTried;
int NumAmbigClassesTried;
int NumClassesOutput;
// Read by AdaptiveClassifierIsFull(): any value above zero reports "full".
int NumAdaptationsFailed;
/* variables used to hold onto extracted features. This is used
to map from the old scheme in which baseline features and char norm
features are extracted separately, to the new scheme in which they
are extracted at the same time. */
bool FeaturesHaveBeenExtracted;
bool FeaturesOK;
INT_FEATURE_ARRAY BaselineFeatures;
INT_FEATURE_ARRAY CharNormFeatures;
INT_FX_RESULT_STRUCT FXInfo;
CLASS_CUTOFF_ARRAY CharNormCutoffs;
CLASS_CUTOFF_ARRAY BaselineCutoffs;
// Presumably the window used when debugging learning -- confirm in the
// implementation. Ownership is not visible from this header.
ScrollView* learn_debug_win_;
};
} // namespace tesseract
#endif // TESSERACT_CLASSIFY_CLASSIFY_H__