mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-28 11:08:15 +08:00
694d3f2c20
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@291 d0cd1f9f-072b-0410-8dd7-cf729c803f20
231 lines
9.6 KiB
C++
231 lines
9.6 KiB
C++
///////////////////////////////////////////////////////////////////////
|
|
// File: classify.h
|
|
// Description: classify class.
|
|
// Author: Samuel Charron
|
|
//
|
|
// (C) Copyright 2006, Google Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
|
|
#define TESSERACT_CLASSIFY_CLASSIFY_H__
|
|
|
|
#include "adaptive.h"
|
|
#include "ccstruct.h"
|
|
#include "classify.h"
|
|
#include "dict.h"
|
|
#include "fxdefs.h"
|
|
#include "intmatcher.h"
|
|
#include "ratngs.h"
|
|
#include "ocrfeatures.h"
|
|
#include "unicity_table.h"
|
|
|
|
class WERD_CHOICE;
|
|
struct ADAPT_RESULTS;
|
|
struct NORM_PROTOS;
|
|
|
|
namespace tesseract {
|
|
class Classify : public CCStruct {
|
|
public:
|
|
Classify();
|
|
~Classify();
|
|
Dict& getDict() {
|
|
return dict_;
|
|
}
|
|
/* adaptive.cpp ************************************************************/
|
|
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
|
|
int ClassPruner(INT_TEMPLATES IntTemplates,
|
|
inT16 NumFeatures,
|
|
INT_FEATURE_ARRAY Features,
|
|
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
|
|
CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
|
|
CLASS_PRUNER_RESULTS Results,
|
|
int Debug);
|
|
void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
|
|
CLASS_CUTOFF_ARRAY Cutoffs);
|
|
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
|
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
|
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
|
|
/* normmatch.cpp ************************************************************/
|
|
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
|
|
void FreeNormProtos();
|
|
NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
|
|
/* protos.cpp ***************************************************************/
|
|
void ReadClassFile();
|
|
INT_TEMPLATES
|
|
CreateIntTemplates(CLASSES FloatProtos,
|
|
const UNICHARSET& target_unicharset);
|
|
/* adaptmatch.cpp ***********************************************************/
|
|
void AdaptToWord(TWERD *Word,
|
|
TEXTROW *Row,
|
|
const WERD_CHOICE& BestChoice,
|
|
const WERD_CHOICE& BestRawChoice,
|
|
const char *rejmap);
|
|
void InitAdaptiveClassifier();
|
|
void InitAdaptedClass(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
CLASS_ID ClassId,
|
|
ADAPT_CLASS Class,
|
|
ADAPT_TEMPLATES Templates);
|
|
void AdaptToPunc(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
CLASS_ID ClassId,
|
|
FLOAT32 Threshold);
|
|
void AmbigClassifier(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
INT_TEMPLATES Templates,
|
|
UNICHAR_ID *Ambiguities,
|
|
ADAPT_RESULTS *Results);
|
|
void MasterMatcher(INT_TEMPLATES templates,
|
|
inT16 num_features,
|
|
INT_FEATURE_ARRAY features,
|
|
CLASS_NORMALIZATION_ARRAY norm_factors,
|
|
ADAPT_CLASS* classes,
|
|
int debug,
|
|
int num_classes,
|
|
CLASS_PRUNER_RESULTS results,
|
|
ADAPT_RESULTS* final_results);
|
|
void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
|
|
BLOB_CHOICE_LIST *Choices);
|
|
void AddNewResult(ADAPT_RESULTS *Results,
|
|
CLASS_ID ClassId,
|
|
FLOAT32 Rating,
|
|
int ConfigId);
|
|
#ifndef GRAPHICS_DISABLED
|
|
void DebugAdaptiveClassifier(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
ADAPT_RESULTS *Results);
|
|
#endif
|
|
void GetAdaptThresholds (TWERD * Word,
|
|
LINE_STATS * LineStats,
|
|
const WERD_CHOICE& BestChoice,
|
|
const WERD_CHOICE& BestRawChoice,
|
|
FLOAT32 Thresholds[]);
|
|
|
|
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
|
|
CLASS_ID ClassId,
|
|
int NumFeatures,
|
|
INT_FEATURE_ARRAY Features,
|
|
FEATURE_SET FloatFeatures);
|
|
void MakePermanent(ADAPT_TEMPLATES Templates,
|
|
CLASS_ID ClassId,
|
|
int ConfigId,
|
|
TBLOB *Blob,
|
|
LINE_STATS *LineStats);
|
|
void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
|
|
void RemoveExtraPuncs(ADAPT_RESULTS *Results);
|
|
void RemoveBadMatches(ADAPT_RESULTS *Results);
|
|
void ShowBestMatchFor(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
CLASS_ID ClassId,
|
|
BOOL8 AdaptiveOn,
|
|
BOOL8 PreTrainedOn);
|
|
UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
ADAPT_TEMPLATES Templates,
|
|
ADAPT_RESULTS *Results);
|
|
int CharNormClassifier(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
INT_TEMPLATES Templates,
|
|
ADAPT_RESULTS *Results);
|
|
UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
CLASS_ID CorrectClass);
|
|
void DoAdaptiveMatch(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
ADAPT_RESULTS *Results);
|
|
void AdaptToChar(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
CLASS_ID ClassId,
|
|
FLOAT32 Threshold);
|
|
int AdaptableWord(TWERD *Word,
|
|
const WERD_CHOICE &BestChoiceWord,
|
|
const WERD_CHOICE &RawChoiceWord);
|
|
void EndAdaptiveClassifier();
|
|
void PrintAdaptiveStatistics(FILE *File);
|
|
void SettupPass1();
|
|
void SettupPass2();
|
|
void AdaptiveClassifier(TBLOB *Blob,
|
|
TBLOB *DotBlob,
|
|
TEXTROW *Row,
|
|
BLOB_CHOICE_LIST *Choices,
|
|
CLASS_PRUNER_RESULTS cp_results);
|
|
void ClassifyAsNoise(ADAPT_RESULTS *Results);
|
|
void ResetAdaptiveClassifier();
|
|
|
|
FLOAT32 GetBestRatingFor(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
CLASS_ID ClassId);
|
|
int GetCharNormFeatures(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
INT_TEMPLATES Templates,
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
inT32 *BlobLength);
|
|
int GetIntCharNormFeatures(TBLOB *Blob,
|
|
LINE_STATS *LineStats,
|
|
INT_TEMPLATES Templates,
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
inT32 *BlobLength);
|
|
|
|
/* float2int.cpp ************************************************************/
|
|
void ComputeIntCharNormArray(FEATURE NormFeature,
|
|
INT_TEMPLATES Templates,
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray);
|
|
/* intproto.cpp *************************************************************/
|
|
INT_TEMPLATES ReadIntTemplates(FILE *File);
|
|
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
|
|
const UNICHARSET& target_unicharset);
|
|
CLASS_ID GetClassToDebug(const char *Prompt);
|
|
/* font detection ***********************************************************/
|
|
UnicityTable<FontInfo>& get_fontinfo_table() {
|
|
return fontinfo_table_;
|
|
}
|
|
UnicityTable<FontSet>& get_fontset_table() {
|
|
return fontset_table_;
|
|
}
|
|
/* adaptmatch.cpp ***********************************************************/
|
|
/* name of current image file being processed */
|
|
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
|
|
/* use class variables to hold onto built-in templates and adapted
|
|
templates */
|
|
INT_TEMPLATES PreTrainedTemplates;
|
|
ADAPT_TEMPLATES AdaptedTemplates;
|
|
// Successful load of inttemp allows base tesseract classfier to be used.
|
|
bool inttemp_loaded_;
|
|
|
|
/* create dummy proto and config masks for use with the built-in templates */
|
|
BIT_VECTOR AllProtosOn;
|
|
BIT_VECTOR PrunedProtos;
|
|
BIT_VECTOR AllConfigsOn;
|
|
BIT_VECTOR AllProtosOff;
|
|
BIT_VECTOR AllConfigsOff;
|
|
BIT_VECTOR TempProtoMask;
|
|
// External control of adaption.
|
|
BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
|
|
// Internal control of Adaption so it doesn't work on pass2.
|
|
BOOL_VAR_H(classify_recog_devanagari, false,
|
|
"Whether recognizing a language with devanagari script.");
|
|
bool EnableLearning;
|
|
/* normmatch.cpp */
|
|
NORM_PROTOS *NormProtos;
|
|
/* font detection ***********************************************************/
|
|
UnicityTable<FontInfo> fontinfo_table_;
|
|
UnicityTable<FontSet> fontset_table_;
|
|
private:
|
|
Dict dict_;
|
|
};
|
|
} // namespace tesseract
|
|
|
|
#endif // TESSERACT_CLASSIFY_CLASSIFY_H__
|