mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-07 10:17:50 +08:00
4523ce9f7d
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20
389 lines
17 KiB
C++
389 lines
17 KiB
C++
///////////////////////////////////////////////////////////////////////
// File:        classify.h
// Description: classify class.
// Author:      Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
|
|
#define TESSERACT_CLASSIFY_CLASSIFY_H__
|
|
|
|
#include "adaptive.h"
|
|
#include "ccstruct.h"
|
|
#include "classify.h"
|
|
#include "dict.h"
|
|
#include "featdefs.h"
|
|
#include "intfx.h"
|
|
#include "intmatcher.h"
|
|
#include "ratngs.h"
|
|
#include "ocrfeatures.h"
|
|
#include "unicity_table.h"
|
|
|
|
class ScrollView;
|
|
class WERD_CHOICE;
|
|
class WERD_RES;
|
|
struct ADAPT_RESULTS;
|
|
struct NORM_PROTOS;
|
|
|
|
namespace tesseract {
|
|
|
|
// How segmented is a blob. In this enum, "character" refers to a classifiable
// unit, but that is too long and "character" is usually easier to understand.
// The enumerators carry no explicit values, so they are numbered 0..3 in
// declaration order.
enum CharSegmentationType {
  CST_FRAGMENT,  // A partial character.
  CST_WHOLE,     // A correctly segmented character.
  CST_IMPROPER,  // More than one but less than 2 characters.
  CST_NGRAM      // Multiple characters.
};
|
|
|
|
class Classify : public CCStruct {
|
|
public:
|
|
Classify();
|
|
virtual ~Classify();
|
|
Dict& getDict() {
|
|
return dict_;
|
|
}
|
|
|
|
// Set the denorm for classification. Takes a copy.
|
|
void set_denorm(const DENORM* denorm) {
|
|
denorm_ = *denorm;
|
|
}
|
|
|
|
/* adaptive.cpp ************************************************************/
|
|
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
|
|
int ClassPruner(INT_TEMPLATES IntTemplates,
|
|
inT16 NumFeatures,
|
|
INT_FEATURE_ARRAY Features,
|
|
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
|
|
CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
|
|
CLASS_PRUNER_RESULTS Results);
|
|
void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
|
|
CLASS_CUTOFF_ARRAY Cutoffs);
|
|
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
|
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
|
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
|
|
/* normmatch.cpp ************************************************************/
|
|
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
|
|
void FreeNormProtos();
|
|
NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
|
|
/* protos.cpp ***************************************************************/
|
|
void ReadClassFile();
|
|
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
|
|
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
|
|
const UNICHARSET& target_unicharset);
|
|
/* adaptmatch.cpp ***********************************************************/
|
|
// Learn the given word using its chopped_word, seam_array, denorm,
|
|
// box_word, best_state, and correct_text to learn both correctly and
|
|
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
|
|
// is called and the data will be written to a file for static training.
|
|
// Otherwise AdaptToBlob is called for adaption within a document.
|
|
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
|
|
// be learned, otherwise all chars with good correct_text are learned.
|
|
void LearnWord(const char* filename, const char *rejmap, WERD_RES *word);
|
|
|
|
// Builds a blob of length fragments, from the word, starting at start,
|
|
// and then learn it, as having the given correct_text.
|
|
// If filename is not NULL, then LearnBlob
|
|
// is called and the data will be written to a file for static training.
|
|
// Otherwise AdaptToBlob is called for adaption within a document.
|
|
// threshold is a magic number required by AdaptToChar and generated by
|
|
// GetAdaptThresholds.
|
|
// Although it can be partly inferred from the string, segmentation is
|
|
// provided to explicitly clarify the character segmentation.
|
|
void LearnPieces(const char* filename, int start, int length,
|
|
float threshold, CharSegmentationType segmentation,
|
|
const char* correct_text, WERD_RES *word);
|
|
void InitAdaptiveClassifier(bool load_pre_trained_templates);
|
|
void InitAdaptedClass(TBLOB *Blob,
|
|
CLASS_ID ClassId,
|
|
ADAPT_CLASS Class,
|
|
ADAPT_TEMPLATES Templates);
|
|
void AdaptToPunc(TBLOB *Blob,
|
|
CLASS_ID ClassId,
|
|
FLOAT32 Threshold);
|
|
void AmbigClassifier(TBLOB *Blob,
|
|
INT_TEMPLATES Templates,
|
|
UNICHAR_ID *Ambiguities,
|
|
ADAPT_RESULTS *Results);
|
|
void MasterMatcher(INT_TEMPLATES templates,
|
|
inT16 num_features,
|
|
INT_FEATURE_ARRAY features,
|
|
CLASS_NORMALIZATION_ARRAY norm_factors,
|
|
ADAPT_CLASS* classes,
|
|
int debug,
|
|
int num_classes,
|
|
const TBOX& blob_box,
|
|
CLASS_PRUNER_RESULTS results,
|
|
ADAPT_RESULTS* final_results);
|
|
void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
|
|
BLOB_CHOICE_LIST *Choices);
|
|
void AddNewResult(ADAPT_RESULTS *Results,
|
|
CLASS_ID ClassId,
|
|
FLOAT32 Rating,
|
|
int ConfigId,
|
|
int config2);
|
|
int GetAdaptiveFeatures(TBLOB *Blob,
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
FEATURE_SET *FloatFeatures);
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
void DebugAdaptiveClassifier(TBLOB *Blob,
|
|
ADAPT_RESULTS *Results);
|
|
#endif
|
|
void GetAdaptThresholds (TWERD * Word,
|
|
const WERD_CHOICE& BestChoice,
|
|
const WERD_CHOICE& BestRawChoice,
|
|
FLOAT32 Thresholds[]);
|
|
|
|
PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
|
|
int NumBadFeat,
|
|
FEATURE_ID BadFeat[],
|
|
INT_CLASS IClass,
|
|
ADAPT_CLASS Class,
|
|
BIT_VECTOR TempProtoMask);
|
|
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
|
|
CLASS_ID ClassId,
|
|
int NumFeatures,
|
|
INT_FEATURE_ARRAY Features,
|
|
FEATURE_SET FloatFeatures);
|
|
void MakePermanent(ADAPT_TEMPLATES Templates,
|
|
CLASS_ID ClassId,
|
|
int ConfigId,
|
|
TBLOB *Blob);
|
|
void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
|
|
void RemoveExtraPuncs(ADAPT_RESULTS *Results);
|
|
void RemoveBadMatches(ADAPT_RESULTS *Results);
|
|
void SetAdaptiveThreshold(FLOAT32 Threshold);
|
|
void ShowBestMatchFor(TBLOB *Blob,
|
|
CLASS_ID ClassId,
|
|
BOOL8 AdaptiveOn,
|
|
BOOL8 PreTrainedOn);
|
|
UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
|
|
ADAPT_TEMPLATES Templates,
|
|
ADAPT_RESULTS *Results);
|
|
int CharNormClassifier(TBLOB *Blob,
|
|
INT_TEMPLATES Templates,
|
|
ADAPT_RESULTS *Results);
|
|
UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
|
|
CLASS_ID CorrectClass);
|
|
void DoAdaptiveMatch(TBLOB *Blob,
|
|
ADAPT_RESULTS *Results);
|
|
void AdaptToChar(TBLOB *Blob,
|
|
CLASS_ID ClassId,
|
|
FLOAT32 Threshold);
|
|
int AdaptableWord(TWERD *Word,
|
|
const WERD_CHOICE &BestChoiceWord,
|
|
const WERD_CHOICE &RawChoiceWord);
|
|
void EndAdaptiveClassifier();
|
|
void PrintAdaptiveStatistics(FILE *File);
|
|
void SettupPass1();
|
|
void SettupPass2();
|
|
void AdaptiveClassifier(TBLOB *Blob,
|
|
BLOB_CHOICE_LIST *Choices,
|
|
CLASS_PRUNER_RESULTS cp_results);
|
|
void ClassifyAsNoise(ADAPT_RESULTS *Results);
|
|
void ResetAdaptiveClassifier();
|
|
|
|
int GetBaselineFeatures(TBLOB *Blob,
|
|
INT_TEMPLATES Templates,
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
inT32 *BlobLength);
|
|
FLOAT32 GetBestRatingFor(TBLOB *Blob,
|
|
CLASS_ID ClassId);
|
|
int GetCharNormFeatures(TBLOB *Blob,
|
|
INT_TEMPLATES Templates,
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
inT32 *BlobLength,
|
|
inT32 *FeatureOutlineIndex);
|
|
int GetIntBaselineFeatures(TBLOB *Blob,
|
|
INT_TEMPLATES Templates,
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
inT32 *BlobLength);
|
|
int GetIntCharNormFeatures(TBLOB *Blob,
|
|
INT_TEMPLATES Templates,
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
inT32 *BlobLength,
|
|
inT32 *FeatureOutlineArray);
|
|
|
|
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
|
|
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
|
|
|
|
void ResetFeaturesHaveBeenExtracted();
|
|
bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
|
|
/* float2int.cpp ************************************************************/
|
|
void ComputeIntCharNormArray(FEATURE NormFeature,
|
|
INT_TEMPLATES Templates,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray);
|
|
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
|
|
/* intproto.cpp *************************************************************/
|
|
INT_TEMPLATES ReadIntTemplates(FILE *File);
|
|
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
|
|
const UNICHARSET& target_unicharset);
|
|
CLASS_ID GetClassToDebug(const char *Prompt);
|
|
void ShowMatchDisplay();
|
|
/* font detection ***********************************************************/
|
|
UnicityTable<FontInfo>& get_fontinfo_table() {
|
|
return fontinfo_table_;
|
|
}
|
|
UnicityTable<FontSet>& get_fontset_table() {
|
|
return fontset_table_;
|
|
}
|
|
/* mfoutline.cpp ***********************************************************/
|
|
void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale);
|
|
/* outfeat.cpp ***********************************************************/
|
|
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
|
|
/* picofeat.cpp ***********************************************************/
|
|
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
|
|
|
|
|
|
// Member variables.
|
|
|
|
// Parameters.
|
|
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
|
|
BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
|
|
INT_VAR_H(classify_debug_level, 0, "Classify debug level");
|
|
|
|
/* mfoutline.cpp ***********************************************************/
|
|
/* control knobs used to control normalization of outlines */
|
|
INT_VAR_H(classify_norm_method, character, "Normalization Method ...");
|
|
double_VAR_H(classify_char_norm_range, 0.2,
|
|
"Character Normalization Range ...");
|
|
double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
|
|
double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
|
|
double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
|
|
double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
|
|
|
|
/* adaptmatch.cpp ***********************************************************/
|
|
BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
|
|
BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
|
|
BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
|
|
BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
|
|
"Use pre-adapted classifier templates");
|
|
BOOL_VAR_H(classify_save_adapted_templates, 0,
|
|
"Save adapted templates to a file");
|
|
BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
|
|
INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
|
|
INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
|
|
INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
|
|
double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
|
|
double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)");
|
|
double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
|
|
double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
|
|
double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
|
|
double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
|
|
INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
|
|
INT_VAR_H(matcher_min_examples_for_prototyping, 3,
|
|
"Reliable Config Threshold");
|
|
INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
|
|
"Enable adaption even if the ambiguities have not been seen");
|
|
double_VAR_H(matcher_clustering_max_angle_delta, 0.015,
|
|
"Maximum angle delta for prototype clustering");
|
|
double_VAR_H(classify_misfit_junk_penalty, 0.0,
|
|
"Penalty to apply when a non-alnum is vertically out of "
|
|
"its expected textline position");
|
|
BOOL_VAR_H(classify_enable_int_fx, 1, "Enable integer fx");
|
|
BOOL_VAR_H(classify_enable_new_adapt_rules, 1, "Enable new adaptation rules");
|
|
double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
|
|
double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
|
|
double_VAR_H(tessedit_class_miss_scale, 0.00390625,
|
|
"Scale factor for features not used");
|
|
INT_VAR_H(classify_adapt_proto_threshold, 230,
|
|
"Threshold for good protos during adaptive 0-255");
|
|
INT_VAR_H(classify_adapt_feature_threshold, 230,
|
|
"Threshold for good features during adaptive 0-255");
|
|
BOOL_VAR_H(disable_character_fragments, FALSE,
|
|
"Do not include character fragments in the"
|
|
" results of the classifier");
|
|
BOOL_VAR_H(matcher_debug_separate_windows, FALSE,
|
|
"Use two different windows for debugging the matching: "
|
|
"One for the protos and one for the features.");
|
|
STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
|
|
|
|
/* intmatcher.cpp **********************************************************/
|
|
INT_VAR_H(classify_class_pruner_threshold, 229,
|
|
"Class Pruner Threshold 0-255: ");
|
|
INT_VAR_H(classify_class_pruner_multiplier, 30,
|
|
"Class Pruner Multiplier 0-255: ");
|
|
INT_VAR_H(classify_cp_cutoff_strength, 7,
|
|
"Class Pruner CutoffStrength: ");
|
|
INT_VAR_H(classify_integer_matcher_multiplier, 14,
|
|
"Integer Matcher Multiplier 0-255: ");
|
|
|
|
// Use class variables to hold onto built-in templates and adapted templates.
|
|
INT_TEMPLATES PreTrainedTemplates;
|
|
ADAPT_TEMPLATES AdaptedTemplates;
|
|
|
|
// Create dummy proto and config masks for use with the built-in templates.
|
|
BIT_VECTOR AllProtosOn;
|
|
BIT_VECTOR PrunedProtos;
|
|
BIT_VECTOR AllConfigsOn;
|
|
BIT_VECTOR AllProtosOff;
|
|
BIT_VECTOR AllConfigsOff;
|
|
BIT_VECTOR TempProtoMask;
|
|
bool EnableLearning;
|
|
/* normmatch.cpp */
|
|
NORM_PROTOS *NormProtos;
|
|
/* font detection ***********************************************************/
|
|
UnicityTable<FontInfo> fontinfo_table_;
|
|
UnicityTable<FontSet> fontset_table_;
|
|
|
|
INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
|
|
BOOL_VAR_H(classify_bln_numeric_mode, 0,
|
|
"Assume the input is numbers [0-9].");
|
|
protected:
|
|
IntegerMatcher im_;
|
|
FEATURE_DEFS_STRUCT feature_defs_;
|
|
// Must be set for the classifier to operate. Ususally set in
|
|
// Tesseract::recog_word_recursive, being the main word-level entry point.
|
|
DENORM denorm_;
|
|
|
|
private:
|
|
|
|
Dict dict_;
|
|
|
|
/* variables used to hold performance statistics */
|
|
int AdaptiveMatcherCalls;
|
|
int BaselineClassifierCalls;
|
|
int CharNormClassifierCalls;
|
|
int AmbigClassifierCalls;
|
|
int NumWordsAdaptedTo;
|
|
int NumCharsAdaptedTo;
|
|
int NumBaselineClassesTried;
|
|
int NumCharNormClassesTried;
|
|
int NumAmbigClassesTried;
|
|
int NumClassesOutput;
|
|
int NumAdaptationsFailed;
|
|
|
|
/* variables used to hold onto extracted features. This is used
|
|
to map from the old scheme in which baseline features and char norm
|
|
features are extracted separately, to the new scheme in which they
|
|
are extracted at the same time. */
|
|
bool FeaturesHaveBeenExtracted;
|
|
bool FeaturesOK;
|
|
INT_FEATURE_ARRAY BaselineFeatures;
|
|
INT_FEATURE_ARRAY CharNormFeatures;
|
|
INT_FX_RESULT_STRUCT FXInfo;
|
|
|
|
CLASS_CUTOFF_ARRAY CharNormCutoffs;
|
|
CLASS_CUTOFF_ARRAY BaselineCutoffs;
|
|
ScrollView* learn_debug_win_;
|
|
};
|
|
} // namespace tesseract
|
|
|
|
#endif // TESSERACT_CLASSIFY_CLASSIFY_H__
|