Added simultaneous multi-language capability, Added support for ShapeTable in classifier and training, Refactored class pruner, Added new uniform classifier API, Added new training error counter

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@650 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2012-02-02 02:57:42 +00:00
parent fdd4ffe85e
commit 5bc5e2a0b4
54 changed files with 7474 additions and 1501 deletions

View File

@ -7,13 +7,15 @@ AM_CPPFLAGS = \
include_HEADERS = \
adaptive.h baseline.h blobclass.h chartoname.h \
classify.h cluster.h clusttool.h cutoffs.h \
extern.h extract.h \
errorcounter.h extern.h extract.h \
featdefs.h flexfx.h float2int.h fpoint.h fxdefs.h \
intfeaturedist.h intfeaturemap.h intfeaturespace.h \
intfx.h intmatcher.h intproto.h kdtree.h \
mf.h mfdefs.h mfoutline.h mfx.h \
mastertrainer.h mf.h mfdefs.h mfoutline.h mfx.h \
normfeat.h normmatch.h \
ocrfeatures.h outfeat.h picofeat.h protos.h \
speckle.h xform2d.h
sampleiterator.h shapeclassifier.h shapetable.h \
speckle.h tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h
if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_classify.la
@ -32,12 +34,14 @@ endif
libtesseract_classify_la_SOURCES = \
adaptive.cpp adaptmatch.cpp blobclass.cpp \
chartoname.cpp classify.cpp cluster.cpp clusttool.cpp cutoffs.cpp \
extract.cpp \
errorcounter.cpp extract.cpp \
featdefs.cpp flexfx.cpp float2int.cpp fpoint.cpp fxdefs.cpp \
intfeaturedist.cpp intfeaturemap.cpp intfeaturespace.cpp \
intfx.cpp intmatcher.cpp intproto.cpp kdtree.cpp \
mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \
mastertrainer.cpp mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \
normfeat.cpp normmatch.cpp \
ocrfeatures.cpp outfeat.cpp picofeat.cpp protos.cpp \
speckle.cpp xform2d.cpp
sampleiterator.cpp shapetable.cpp speckle.cpp \
tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp xform2d.cpp

File diff suppressed because it is too large Load Diff

View File

@ -112,11 +112,15 @@ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
return;
}
// label the features with a class name and font name
fprintf (FeatureFile, "\n%s %s ", FontName, BlobText);
if (ValidCharDescription(FeatureDefs, CharDesc)) {
// label the features with a class name and font name
fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
// write micro-features to file and clean up
WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
// write micro-features to file and clean up
WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
} else {
tprintf("Blob learned was invalid!\n");
}
FreeCharDescription(CharDesc);
} // LearnBlob

View File

@ -17,49 +17,19 @@
///////////////////////////////////////////////////////////////////////
#include "classify.h"
#include "fontinfo.h"
#include "intproto.h"
#include "mfoutline.h"
#include "scrollview.h"
#include "shapetable.h"
#include "unicity_table.h"
#include <string.h>
namespace {
// Compare FontInfo structures.
bool compare_fontinfo(const FontInfo& fi1, const FontInfo& fi2) {
// The font properties are required to be the same for two font with the same
// name, so there is no need to test them.
// Consequently, querying the table with only its font name as information is
// enough to retrieve its properties.
return strcmp(fi1.name, fi2.name) == 0;
}
// Compare FontSet structures.
bool compare_font_set(const FontSet& fs1, const FontSet& fs2) {
if (fs1.size != fs2.size)
return false;
for (int i = 0; i < fs1.size; ++i) {
if (fs1.configs[i] != fs2.configs[i])
return false;
}
return true;
}
void delete_callback(FontInfo f) {
if (f.spacing_vec != NULL) {
f.spacing_vec->delete_data_pointers();
delete f.spacing_vec;
}
delete[] f.name;
}
void delete_callback_fs(FontSet fs) {
delete[] fs.configs;
}
}
namespace tesseract {
Classify::Classify()
: INT_MEMBER(tessedit_single_match, FALSE,
: BOOL_MEMBER(prioritize_division, FALSE,
"Prioritize blob division over chopping", this->params()),
INT_MEMBER(tessedit_single_match, FALSE,
"Top choice only from CP", this->params()),
BOOL_MEMBER(classify_enable_learning, true,
"Enable adaptive classifier", this->params()),
@ -120,10 +90,6 @@ Classify::Classify()
"Penalty to apply when a non-alnum is vertically out of "
"its expected textline position",
this->params()),
BOOL_MEMBER(classify_enable_int_fx, 1, "Enable integer fx",
this->params()),
BOOL_MEMBER(classify_enable_new_adapt_rules, 1,
"Enable new adaptation rules", this->params()),
double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
this->params()),
@ -149,28 +115,29 @@ Classify::Classify()
"One for the protos and one for the features.", this->params()),
STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
this->params()),
INT_INIT_MEMBER(classify_class_pruner_threshold, 229,
"Class Pruner Threshold 0-255: ", this->params()),
INT_INIT_MEMBER(classify_class_pruner_multiplier, 30,
"Class Pruner Multiplier 0-255: ", this->params()),
INT_INIT_MEMBER(classify_cp_cutoff_strength, 7,
"Class Pruner CutoffStrength: ", this->params()),
INT_INIT_MEMBER(classify_integer_matcher_multiplier, 14,
"Integer Matcher Multiplier 0-255: ", this->params()),
INT_MEMBER(classify_class_pruner_threshold, 229,
"Class Pruner Threshold 0-255", this->params()),
INT_MEMBER(classify_class_pruner_multiplier, 30,
"Class Pruner Multiplier 0-255: ", this->params()),
INT_MEMBER(classify_cp_cutoff_strength, 7,
"Class Pruner CutoffStrength: ", this->params()),
INT_MEMBER(classify_integer_matcher_multiplier, 14,
"Integer Matcher Multiplier 0-255: ", this->params()),
EnableLearning(true),
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
this->params()),
BOOL_MEMBER(classify_bln_numeric_mode, 0,
"Assume the input is numbers [0-9].", this->params()),
shape_table_(NULL),
dict_(&image_) {
fontinfo_table_.set_compare_callback(
NewPermanentTessCallback(compare_fontinfo));
NewPermanentTessCallback(CompareFontInfo));
fontinfo_table_.set_clear_callback(
NewPermanentTessCallback(delete_callback));
NewPermanentTessCallback(FontInfoDeleteCallback));
fontset_table_.set_compare_callback(
NewPermanentTessCallback(compare_font_set));
NewPermanentTessCallback(CompareFontSet));
fontset_table_.set_clear_callback(
NewPermanentTessCallback(delete_callback_fs));
NewPermanentTessCallback(FontSetDeleteCallback));
AdaptedTemplates = NULL;
PreTrainedTemplates = NULL;
AllProtosOn = NULL;
@ -198,6 +165,9 @@ Classify::Classify()
learn_debug_win_ = NULL;
learn_fragmented_word_debug_win_ = NULL;
learn_fragments_debug_win_ = NULL;
CharNormCutoffs = new uinT16[MAX_NUM_CLASSES];
BaselineCutoffs = new uinT16[MAX_NUM_CLASSES];
}
Classify::~Classify() {
@ -205,6 +175,8 @@ Classify::~Classify() {
delete learn_debug_win_;
delete learn_fragmented_word_debug_win_;
delete learn_fragments_debug_win_;
delete[] CharNormCutoffs;
delete[] BaselineCutoffs;
}
} // namespace tesseract

View File

@ -24,6 +24,7 @@
#include "classify.h"
#include "dict.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "intfx.h"
#include "intmatcher.h"
#include "normalis.h"
@ -42,6 +43,9 @@ static const int kBlankFontinfoId = -2;
namespace tesseract {
struct ShapeRating;
class ShapeTable;
// How segmented is a blob. In this enum, character refers to a classifiable
// unit, but that is too long and character is usually easier to understand.
enum CharSegmentationType {
@ -59,27 +63,41 @@ class Classify : public CCStruct {
return dict_;
}
// Set the denorm for classification. Takes a copy.
void set_denorm(const DENORM* denorm) {
denorm_ = *denorm;
const ShapeTable* shape_table() const {
return shape_table_;
}
/* adaptive.cpp ************************************************************/
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
int ClassPruner(INT_TEMPLATES IntTemplates,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
CLASS_PRUNER_RESULTS Results);
void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
// Runs the class pruner from int_templates on the given features, returning
// the number of classes output in results.
// int_templates Class pruner tables
// num_features Number of features in blob
// features Array of features
// normalization_factors (input) Array of int_templates->NumClasses fudge
// factors from blob normalization process.
// (Indexed by CLASS_INDEX)
// expected_num_features (input) Array of int_templates->NumClasses
// expected number of features for each class.
// (Indexed by CLASS_INDEX)
// results (output) Sorted Array of pruned classes.
// Array must be sized to take the maximum possible
// number of outputs : int_templates->NumClasses.
int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
int num_features,
const INT_FEATURE_STRUCT* features,
const uinT8* normalization_factors,
const uinT16* expected_num_features,
CP_RESULT_STRUCT* results);
void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
CLASS_CUTOFF_ARRAY Cutoffs);
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
/* normmatch.cpp ************************************************************/
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
FLOAT32 ComputeNormMatch(CLASS_ID ClassId,
const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
void FreeNormProtos();
NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
/* protos.cpp ***************************************************************/
@ -88,6 +106,7 @@ class Classify : public CCStruct {
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
const UNICHARSET& target_unicharset);
/* adaptmatch.cpp ***********************************************************/
// Learn the given word using its chopped_word, seam_array, denorm,
// box_word, best_state, and correct_text to learn both correctly and
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
@ -111,36 +130,62 @@ class Classify : public CCStruct {
const char* correct_text, WERD_RES *word);
void InitAdaptiveClassifier(bool load_pre_trained_templates);
void InitAdaptedClass(TBLOB *Blob,
const DENORM& denorm,
CLASS_ID ClassId,
int FontinfoId,
ADAPT_CLASS Class,
ADAPT_TEMPLATES Templates);
void AdaptToPunc(TBLOB *Blob,
const DENORM& denorm,
CLASS_ID ClassId,
int FontinfoId,
FLOAT32 Threshold);
void AmbigClassifier(TBLOB *Blob,
const DENORM& denorm,
INT_TEMPLATES Templates,
ADAPT_CLASS *Classes,
UNICHAR_ID *Ambiguities,
ADAPT_RESULTS *Results);
void MasterMatcher(INT_TEMPLATES templates,
inT16 num_features,
INT_FEATURE_ARRAY features,
CLASS_NORMALIZATION_ARRAY norm_factors,
const INT_FEATURE_STRUCT* features,
const uinT8* norm_factors,
ADAPT_CLASS* classes,
int debug,
int num_classes,
const TBOX& blob_box,
CLASS_PRUNER_RESULTS results,
ADAPT_RESULTS* final_results);
void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
// Converts configs to fonts, and if the result is not adapted, and a
// shape_table_ is present, the shape is expanded to include all
// unichar_ids represented, before applying a set of corrections to the
// distance rating in int_result, (see ComputeCorrectedRating.)
// The results are added to the final_results output.
void ExpandShapesAndApplyCorrections(ADAPT_CLASS* classes,
bool debug,
int class_id,
int bottom, int top,
float cp_rating,
int blob_length,
const uinT8* cn_factors,
INT_RESULT_STRUCT& int_result,
ADAPT_RESULTS* final_results);
// Applies a set of corrections to the distance im_rating,
// including the cn_correction, miss penalty and additional penalty
// for non-alnums being vertical misfits. Returns the corrected distance.
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
double im_rating, int feature_misses,
int bottom, int top,
int blob_length, const uinT8* cn_factors);
void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
ADAPT_RESULTS *Results,
BLOB_CHOICE_LIST *Choices);
void AddNewResult(ADAPT_RESULTS *results,
CLASS_ID class_dd,
CLASS_ID class_id,
int shape_id,
FLOAT32 rating,
bool adapted,
int config,
int config2,
int fontinfo_id,
int fontinfo_id2);
int GetAdaptiveFeatures(TBLOB *Blob,
@ -149,9 +194,11 @@ class Classify : public CCStruct {
#ifndef GRAPHICS_DISABLED
void DebugAdaptiveClassifier(TBLOB *Blob,
const DENORM& denorm,
ADAPT_RESULTS *Results);
#endif
void GetAdaptThresholds (TWERD * Word,
const DENORM& denorm,
const WERD_CHOICE& BestChoice,
const WERD_CHOICE& BestRawChoice,
FLOAT32 Thresholds[]);
@ -171,30 +218,64 @@ class Classify : public CCStruct {
void MakePermanent(ADAPT_TEMPLATES Templates,
CLASS_ID ClassId,
int ConfigId,
const DENORM& denorm,
TBLOB *Blob);
void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
void RemoveExtraPuncs(ADAPT_RESULTS *Results);
void RemoveBadMatches(ADAPT_RESULTS *Results);
void SetAdaptiveThreshold(FLOAT32 Threshold);
void ShowBestMatchFor(TBLOB *Blob,
const DENORM& denorm,
CLASS_ID ClassId,
int shape_id,
BOOL8 AdaptiveOn,
BOOL8 PreTrainedOn);
BOOL8 PreTrainedOn,
ADAPT_RESULTS *Results);
// Returns a string for the classifier class_id: either the corresponding
// unicharset debug_str or the shape_table_ debug str.
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
int class_id, int config_id) const;
// Converts a classifier class_id index with a config ID to:
// shape_table_ present: a shape_table_ index OR
// No shape_table_: a font ID.
// Without shape training, each class_id, config pair represents a single
// unichar id/font combination, so this function looks up the corresponding
// font id.
// With shape training, each class_id, config pair represents a single
// shape table index, so the fontset_table stores the shape table index,
// and the shape_table_ must be consulted to obtain the actual unichar_id/
// font combinations that the shape represents.
int ClassAndConfigIDToFontOrShapeID(int class_id,
int int_result_config) const;
// Converts a shape_table_ index to a classifier class_id index (not a
// unichar-id!). Uses a search, so not fast.
int ShapeIDToClassID(int shape_id) const;
UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
const DENORM& denorm,
ADAPT_TEMPLATES Templates,
ADAPT_RESULTS *Results);
int CharNormClassifier(TBLOB *Blob,
const DENORM& denorm,
INT_TEMPLATES Templates,
ADAPT_RESULTS *Results);
// As CharNormClassifier, but operates on a TrainingSample and outputs to
// a GenericVector of ShapeRating without conversion to classes.
int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample,
GenericVector<ShapeRating>* results);
UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
const DENORM& denorm,
CLASS_ID CorrectClass);
void DoAdaptiveMatch(TBLOB *Blob,
const DENORM& denorm,
ADAPT_RESULTS *Results);
void AdaptToChar(TBLOB *Blob,
const DENORM& denorm,
CLASS_ID ClassId,
int FontinfoId,
FLOAT32 Threshold);
void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class);
void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm,
INT_CLASS_STRUCT* int_class);
int AdaptableWord(TWERD *Word,
const WERD_CHOICE &BestChoiceWord,
const WERD_CHOICE &RawChoiceWord);
@ -203,55 +284,53 @@ class Classify : public CCStruct {
void SettupPass1();
void SettupPass2();
void AdaptiveClassifier(TBLOB *Blob,
const DENORM& denorm,
BLOB_CHOICE_LIST *Choices,
CLASS_PRUNER_RESULTS cp_results);
void ClassifyAsNoise(ADAPT_RESULTS *Results);
void ResetAdaptiveClassifier();
void ResetAdaptiveClassifierInternal();
int GetBaselineFeatures(TBLOB *Blob,
const DENORM& denorm,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
uinT8* CharNormArray,
inT32 *BlobLength);
FLOAT32 GetBestRatingFor(TBLOB *Blob,
CLASS_ID ClassId);
int GetCharNormFeatures(TBLOB *Blob,
const DENORM& denorm,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
uinT8* PrunerNormArray,
uinT8* CharNormArray,
inT32 *BlobLength,
inT32 *FeatureOutlineIndex);
int GetIntBaselineFeatures(TBLOB *Blob,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
inT32 *BlobLength);
int GetIntCharNormFeatures(TBLOB *Blob,
INT_TEMPLATES Templates,
INT_FEATURE_ARRAY IntFeatures,
CLASS_NORMALIZATION_ARRAY CharNormArray,
inT32 *BlobLength,
inT32 *FeatureOutlineArray);
// Computes the char_norm_array for the unicharset and, if not NULL, the
// pruner_array as appropriate according to the existence of the shape_table.
// The norm_feature is deleted as it is almost certainly no longer needed.
void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
INT_TEMPLATES_STRUCT* templates,
uinT8* char_norm_array,
uinT8* pruner_array);
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob);
void ResetFeaturesHaveBeenExtracted();
bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
bool LooksLikeGarbage(TBLOB *blob);
bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob);
void RefreshDebugWindow(ScrollView **win, const char *msg,
int y_offset, const TBOX &wbox);
/* float2int.cpp ************************************************************/
void ComputeIntCharNormArray(FEATURE NormFeature,
INT_TEMPLATES Templates,
CLASS_NORMALIZATION_ARRAY CharNormArray);
void ClearCharNormArray(uinT8* char_norm_array);
void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
uinT8* char_norm_array);
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
/* intproto.cpp *************************************************************/
INT_TEMPLATES ReadIntTemplates(FILE *File);
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
const UNICHARSET& target_unicharset);
CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
bool* pretrained_on);
bool* pretrained_on, int* shape_id);
void ShowMatchDisplay();
/* font detection ***********************************************************/
UnicityTable<FontInfo>& get_fontinfo_table() {
@ -271,6 +350,8 @@ class Classify : public CCStruct {
// Member variables.
// Parameters.
BOOL_VAR_H(prioritize_division, FALSE,
"Prioritize blob division over chopping");
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
INT_VAR_H(classify_debug_level, 0, "Classify debug level");
@ -313,8 +394,6 @@ class Classify : public CCStruct {
double_VAR_H(classify_misfit_junk_penalty, 0.0,
"Penalty to apply when a non-alnum is vertically out of "
"its expected textline position");
BOOL_VAR_H(classify_enable_int_fx, 1, "Enable integer fx");
BOOL_VAR_H(classify_enable_new_adapt_rules, 1, "Enable new adaptation rules");
double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
double_VAR_H(tessedit_class_miss_scale, 0.00390625,
@ -338,7 +417,7 @@ class Classify : public CCStruct {
/* intmatcher.cpp **********************************************************/
INT_VAR_H(classify_class_pruner_threshold, 229,
"Class Pruner Threshold 0-255: ");
"Class Pruner Threshold 0-255");
INT_VAR_H(classify_class_pruner_multiplier, 30,
"Class Pruner Multiplier 0-255: ");
INT_VAR_H(classify_cp_cutoff_strength, 7,
@ -362,17 +441,27 @@ class Classify : public CCStruct {
NORM_PROTOS *NormProtos;
/* font detection ***********************************************************/
UnicityTable<FontInfo> fontinfo_table_;
// Without shape training, each class_id, config pair represents a single
// unichar id/font combination, so each fontset_table_ entry holds font ids
// for each config in the class.
// With shape training, each class_id, config pair represents a single
// shape_table_ index, so the fontset_table_ stores the shape_table_ index,
// and the shape_table_ must be consulted to obtain the actual unichar_id/
// font combinations that the shape represents.
UnicityTable<FontSet> fontset_table_;
INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
BOOL_VAR_H(classify_bln_numeric_mode, 0,
"Assume the input is numbers [0-9].");
protected:
IntegerMatcher im_;
FEATURE_DEFS_STRUCT feature_defs_;
// Must be set for the classifier to operate. Ususally set in
// Tesseract::recog_word_recursive, being the main word-level entry point.
DENORM denorm_;
// If a shape_table_ is present, it is used to remap classifier output in
// ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
// mean an index to the shape_table_ and the choices returned are *all* the
// shape_table_ entries at that index.
ShapeTable* shape_table_;
private:
@ -401,8 +490,17 @@ class Classify : public CCStruct {
INT_FEATURE_ARRAY CharNormFeatures;
INT_FX_RESULT_STRUCT FXInfo;
CLASS_CUTOFF_ARRAY CharNormCutoffs;
CLASS_CUTOFF_ARRAY BaselineCutoffs;
// Expected number of features in the class pruner, used to penalize
// unknowns that have too few features (like a c being classified as e) so
// it doesn't recognize everything as '@' or '#'.
// CharNormCutoffs is for the static classifier (with no shapetable).
// BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
// value in the adaptive classifier. Both are indexed by unichar_id.
// shapetable_cutoffs_ provides a similar value for each shape in the
// shape_table_
uinT16* CharNormCutoffs;
uinT16* BaselineCutoffs;
GenericVector<uinT16> shapetable_cutoffs_;
ScrollView* learn_debug_win_;
ScrollView* learn_fragmented_word_debug_win_;
ScrollView* learn_fragments_debug_win_;

View File

@ -20,6 +20,7 @@
#include "cluster.h"
#include "emalloc.h"
#include "helpers.h"
#include "matrix.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
@ -137,7 +138,7 @@ const double FTable[FTABLE_Y][FTABLE_X] = {
dimension of any feature. Since most features are calculated from numbers
with a precision no better than 1 in 128, the variance should never be
less than the square of this number for parameters whose range is 1. */
#define MINVARIANCE 0.0001
#define MINVARIANCE 0.0004
/* define the absolute minimum number of samples which must be present in
order to accurately test hypotheses about underlying probability
@ -145,7 +146,6 @@ const double FTable[FTABLE_Y][FTABLE_X] = {
before a statistical analysis is attempted; this number should be
equal to MINSAMPLES but can be set to a lower number for early testing
when very few samples are available. */
#define MINBUCKETS 5
#define MINSAMPLESPERBUCKET 5
#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
#define MINSAMPLESNEEDED 1
@ -222,7 +222,6 @@ static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
/* define lookup tables used to compute the number of histogram buckets
that should be used for a given number of samples. */
#define LOOKUPTABLESIZE 8
#define MAXBUCKETS 39
#define MAXDEGREESOFFREEDOM MAXBUCKETS
static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
@ -349,8 +348,7 @@ BOOL8 DistributionOK(BUCKETS *Buckets);
void FreeStatistics(STATISTICS *Statistics);
void FreeBuckets(CLUSTERER* clusterer,
BUCKETS *Buckets);
void FreeBuckets(BUCKETS *Buckets);
void FreeCluster(CLUSTER *Cluster);
@ -425,10 +423,11 @@ MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
// allocate a kd tree to hold the samples
Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
// keep a list of histogram buckets to minimize recomputing them
Clusterer->bucket_cache[0] = NIL_LIST;
Clusterer->bucket_cache[1] = NIL_LIST;
Clusterer->bucket_cache[2] = NIL_LIST;
// Initialize cache of histogram buckets to minimize recomputing them.
for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
Clusterer->bucket_cache[d][c] = NULL;
}
return Clusterer;
} // MakeClusterer
@ -448,8 +447,8 @@ Exceptions: ALREADYCLUSTERED MakeSample can't be called after
ClusterSamples has been called
History: 5/29/89, DSJ, Created.
*****************************************************************************/
SAMPLE *
MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID) {
SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
inT32 CharID) {
SAMPLE *Sample;
int i;
@ -537,9 +536,13 @@ void FreeClusterer(CLUSTERER *Clusterer) {
FreeKDTree (Clusterer->KDTree);
if (Clusterer->Root != NULL)
FreeCluster (Clusterer->Root);
iterate (Clusterer->ProtoList) {
((PROTOTYPE *) (first_node (Clusterer->ProtoList)))->Cluster = NULL;
// Free up all used buckets structures.
for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
if (Clusterer->bucket_cache[d][c] != NULL)
FreeBuckets(Clusterer->bucket_cache[d][c]);
}
memfree(Clusterer);
}
} // FreeClusterer
@ -662,6 +665,8 @@ FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
case uniform:
case D_random:
return (Proto->Variance.Elliptical[Dimension]);
case DISTRIBUTION_COUNT:
ASSERT_HOST(!"Distribution count not allowed!");
}
}
return 0.0f;
@ -1033,7 +1038,6 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
Config->Confidence);
break;
}
FreeBuckets(Clusterer, Buckets);
FreeStatistics(Statistics);
return Proto;
} // MakePrototype
@ -1339,10 +1343,6 @@ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
FreePrototype(Proto);
Proto = NULL;
}
if (UniformBuckets != NULL)
FreeBuckets(Clusterer, UniformBuckets);
if (RandomBuckets != NULL)
FreeBuckets(Clusterer, RandomBuckets);
return (Proto);
} // MakeMixedProto
@ -1623,6 +1623,7 @@ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
Proto->Distrib = NULL;
Proto->Significant = TRUE;
Proto->Merged = FALSE;
Proto->Style = spherical;
Proto->NumSamples = Cluster->SampleCount;
Proto->Cluster = Cluster;
@ -1705,17 +1706,18 @@ BUCKETS *GetBuckets(CLUSTERER* clusterer,
DISTRIBUTION Distribution,
uinT32 SampleCount,
FLOAT64 Confidence) {
// search for an old bucket structure with the same number of buckets
LIST *bucket_cache = clusterer->bucket_cache;
// Get an old bucket structure with the same number of buckets.
uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
BUCKETS *Buckets = (BUCKETS *) first_node(search(
bucket_cache[(int)Distribution], &NumberOfBuckets,
NumBucketsMatch));
BUCKETS *Buckets =
clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
// if a matching bucket structure is found, delete it from the list
if (Buckets != NULL) {
bucket_cache[(int) Distribution] =
delete_d(bucket_cache[(int) Distribution], Buckets, ListEntryMatch);
// If a matching bucket structure is not found, make one and save it.
if (Buckets == NULL) {
Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
Buckets;
} else {
// Just adjust the existing buckets.
if (SampleCount != Buckets->SampleCount)
AdjustBuckets(Buckets, SampleCount);
if (Confidence != Buckets->Confidence) {
@ -1725,9 +1727,6 @@ BUCKETS *GetBuckets(CLUSTERER* clusterer,
Confidence);
}
InitBuckets(Buckets);
} else {
// otherwise create a new structure
Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
}
return Buckets;
} // GetBuckets
@ -1770,14 +1769,14 @@ BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
BOOL8 Symmetrical;
// allocate memory needed for data structure
Buckets = (BUCKETS *) Emalloc(sizeof(BUCKETS));
Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
Buckets->SampleCount = SampleCount;
Buckets->Confidence = Confidence;
Buckets->Count =
(uinT32 *) Emalloc(Buckets->NumberOfBuckets * sizeof (uinT32));
Buckets->ExpectedCount =
(FLOAT32 *) Emalloc(Buckets->NumberOfBuckets * sizeof (FLOAT32));
Buckets->Count = reinterpret_cast<uinT32*>(
Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
// initialize simple fields
Buckets->Distribution = Distribution;
@ -2246,23 +2245,16 @@ void FreeStatistics(STATISTICS *Statistics) {
//---------------------------------------------------------------------------
void FreeBuckets(CLUSTERER* clusterer, BUCKETS *buckets) {
void FreeBuckets(BUCKETS *buckets) {
/*
** Parameters:
** clusterer->bucket_cache
** distribution-indexed cache of old bucket structures.
** buckets pointer to data structure to be freed
** Operation:
** This routine places the specified histogram data structure
** at the front of a list of histograms so that it can be reused
** later if necessary. A separate list is maintained for each
** different type of distribution.
** This routine properly frees the memory used by a BUCKETS.
*/
LIST *bucket_cache = clusterer->bucket_cache;
if (buckets != NULL) {
int dist = (int)buckets->Distribution;
bucket_cache[dist] = (LIST) push(bucket_cache[dist], buckets);
}
Efree(buckets->Count);
Efree(buckets->ExpectedCount);
Efree(buckets);
} // FreeBuckets
@ -2640,8 +2632,10 @@ CLUSTER * Cluster, FLOAT32 MaxIllegal)
}
NumCharInCluster--;
PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
if (PercentIllegal > MaxIllegal)
if (PercentIllegal > MaxIllegal) {
destroy(SearchState);
return (TRUE);
}
}
}
return (FALSE);
@ -2652,17 +2646,10 @@ CLUSTER * Cluster, FLOAT32 MaxIllegal)
// The return value is the sum of norms of the off-diagonal terms of the
// product of a and inv. (A measure of the error.)
double InvertMatrix(const float* input, int size, float* inv) {
double** U; // The upper triangular array.
double* Umem;
double** U_inv; // The inverse of U.
double* U_invmem;
double** L; // The lower triangular array.
double* Lmem;
// Allocate memory for the 2D arrays.
ALLOC_2D_ARRAY(size, size, Umem, U, double);
ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double);
ALLOC_2D_ARRAY(size, size, Lmem, L, double);
GENERIC_2D_ARRAY<double> U(size, size, 0.0);
GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
GENERIC_2D_ARRAY<double> L(size, size, 0.0);
// Initialize the working matrices. U starts as input, L as I and U_inv as O.
int row;

View File

@ -21,6 +21,11 @@
#include "kdtree.h"
#include "oldlist.h"
struct BUCKETS;
#define MINBUCKETS 5
#define MAXBUCKETS 39
/*----------------------------------------------------------------------
Types
----------------------------------------------------------------------*/
@ -51,7 +56,7 @@ typedef struct { // parameters to control clustering
} CLUSTERCONFIG;
typedef enum {
normal, uniform, D_random
normal, uniform, D_random, DISTRIBUTION_COUNT
} DISTRIBUTION;
typedef union {
@ -86,7 +91,8 @@ typedef struct {
CLUSTER *Root; // ptr to root cluster of cluster tree
LIST ProtoList; // list of prototypes
inT32 NumChar; // # of characters represented by samples
LIST bucket_cache[3]; // cache of reusable histograms by distribution type
// cache of reusable histograms by distribution type and number of buckets.
BUCKETS* bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
} CLUSTERER;
typedef struct {
@ -103,7 +109,7 @@ typedef struct {
--------------------------------------------------------------------------*/
CLUSTERER *MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]);
SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID);
SAMPLE *MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, inT32 CharID);
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);

View File

@ -213,6 +213,8 @@ PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
Proto->Magnitude.Elliptical[i] = 1.0 /
(2.0 * Proto->Variance.Elliptical[i]);
break;
case DISTRIBUTION_COUNT:
ASSERT_HOST(!"Distribution count not allowed!");
}
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
}
@ -374,6 +376,8 @@ void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto) {
case D_random:
fprintf (File, " %9s", "random");
break;
case DISTRIBUTION_COUNT:
ASSERT_HOST(!"Distribution count not allowed!");
}
fprintf (File, "\n\t");
WriteNFloats (File, N, Proto->Variance.Elliptical);
@ -392,13 +396,10 @@ Return: None
Exceptions: None
History: 6/6/89, DSJ, Created.
****************************************************************************/
void
WriteNFloats (FILE * File, uinT16 N, FLOAT32 Array[]) {
int i;
for (i = 0; i < N; i++)
fprintf (File, " %9.6f", Array[i]);
fprintf (File, "\n");
// Writes N space-separated floats from Array to File in " %9.6f" format,
// terminated by a single newline. The output is consumed later by the
// corresponding prototype-reading code, so the format must not change.
void WriteNFloats(FILE * File, uinT16 N, FLOAT32 Array[]) {
  int i = 0;
  while (i < N) {
    fprintf(File, " %9.6f", Array[i]);
    ++i;
  }
  fprintf(File, "\n");
}  // WriteNFloats
@ -479,29 +480,3 @@ void WriteProtoList(
}
} /* WriteProtoList */
/** UniformRandomNumber ********************************************************
Parameters: MMin lower range of uniform distribution
MMax upper range of uniform distribution
Globals: None
Operation: This routine computes a random number which comes from a
uniform distribution over the range from MMin to MMax.
Return: Uniform random number
Exceptions: None
History: 6/6/89, DSJ, Created.
*******************************************************************************/
FLOAT32 UniformRandomNumber(FLOAT32 MMin, FLOAT32 MMax) {
double fake_drand48();
FLOAT32 RandomNumber;
RandomNumber = fake_drand48 ();
return (MMin + (RandomNumber * (MMax - MMin)));
} // UniformRandomNumber
/** drand48 *************************************************************
Cheap replacement for drand48 which is not available on the PC.
**********************************************************************/
double fake_drand48() {
return rand () / (RAND_MAX + 1.0);
}

View File

@ -52,8 +52,6 @@ void WriteProtoList(
BOOL8 WriteSigProtos,
BOOL8 WriteInsigProtos);
FLOAT32 UniformRandomNumber(FLOAT32 MMin, FLOAT32 MMax);
//--------------Global Data Definitions and Declarations---------------------
// define errors that can be trapped
#define ILLEGALSAMPLESIZE 5000

View File

@ -39,7 +39,7 @@
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
namespace tesseract {
void Classify::ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
CLASS_CUTOFF_ARRAY Cutoffs) {
/*
** Parameters:
@ -59,6 +59,11 @@ void Classify::ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
int Cutoff;
int i;
if (shape_table_ != NULL) {
if (!shapetable_cutoffs_.DeSerialize(swap, CutoffFile)) {
tprintf("Error during read of shapetable pffmtable!\n");
}
}
for (i = 0; i < MAX_NUM_CLASSES; i++)
Cutoffs[i] = MAX_CUTOFF;

385
classify/errorcounter.cpp Normal file
View File

@ -0,0 +1,385 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "errorcounter.h"
#include "fontinfo.h"
#include "ndminx.h"
#include "sampleiterator.h"
#include "shapeclassifier.h"
#include "shapetable.h"
#include "trainingsample.h"
#include "trainingsampleset.h"
#include "unicity_table.h"
namespace tesseract {
// Tests a classifier, computing its error rate.
// See errorcounter.h for description of arguments.
// Iterates over the samples, calling the classifier in normal/silent mode.
// If the classifier makes a CT_UNICHAR_TOPN_ERR error, and the appropriate
// report_level is set (4 or greater), it will then call the classifier again
// with a debug flag and a keep_this argument to find out what is going on.
// Returns the un-weighted error rate selected by boosting_mode; optional
// outputs (unichar_error, scaled_error, fonts_report) are filled when
// non-NULL.
double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier,
    int report_level, CountTypes boosting_mode,
    const UnicityTable<FontInfo>& fontinfo_table,
    const GenericVector<Pix*>& page_images, SampleIterator* it,
    double* unichar_error, double* scaled_error, STRING* fonts_report) {
  // Size the counters from the iterator's charset/shape/font universe.
  int charsetsize = it->shape_table()->unicharset().size();
  int shapesize = it->CompactCharsetSize();
  int fontsize = it->sample_set()->NumFonts();
  ErrorCounter counter(charsetsize, shapesize, fontsize);
  GenericVector<ShapeRating> results;

  clock_t start = clock();
  int total_samples = 0;
  double unscaled_error = 0.0;
  // Set a number of samples on which to run the classify debug mode.
  // report_level 4 -> 16 debugged errors, 5 -> 25, etc.
  int error_samples = report_level > 3 ? report_level * report_level : 0;
  // Iterate over all the samples, accumulating errors.
  for (it->Begin(); !it->AtEnd(); it->Next()) {
    TrainingSample* mutable_sample = it->MutableSample();
    // Look up the page image for this sample, if one was supplied.
    int page_index = mutable_sample->page_num();
    Pix* page_pix = 0 <= page_index && page_index < page_images.size()
                  ? page_images[page_index] : NULL;
    // No debug, no keep this.
    classifier->ClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID,
                               &results);
    if (mutable_sample->class_id() == 0) {
      // This is junk so use the special counter.
      counter.AccumulateJunk(*it->shape_table(), results, mutable_sample);
    } else if (counter.AccumulateErrors(report_level > 3, boosting_mode,
                                        fontinfo_table, *it->shape_table(),
                                        results, mutable_sample) &&
               error_samples > 0) {
      // Running debug, keep the correct answer, and debug the classifier.
      tprintf("Error on sample %d: Classifier debug output:\n",
              it->GlobalSampleIndex());
      int keep_this = it->GetSparseClassID();
      classifier->ClassifySample(*mutable_sample, page_pix, 1, keep_this,
                                 &results);
      --error_samples;
    }
    ++total_samples;
  }
  double total_time = 1.0 * (clock() - start) / CLOCKS_PER_SEC;
  // Create the appropriate error report.
  unscaled_error = counter.ReportErrors(report_level, boosting_mode,
                                        fontinfo_table,
                                        *it, unichar_error, fonts_report);
  if (scaled_error != NULL) *scaled_error = counter.scaled_error_;
  if (report_level > 1) {
    // It is useful to know the time in microseconds/char.
    tprintf("Errors computed in %.2fs at %.1f μs/char\n",
            total_time, 1000000.0 * total_time / total_samples);
  }
  return unscaled_error;
}
// Private constructor: external callers always go through the static
// ComputeErrorRate entry point, which builds its own ErrorCounter.
// Sets up one zeroed Counts accumulator per font, and an all-zero
// unichar_id x shape_id confusion matrix of the given dimensions.
ErrorCounter::ErrorCounter(int charsetsize, int shapesize, int fontsize)
  : scaled_error_(0.0), unichar_counts_(charsetsize, shapesize, 0) {
  Counts zero_counts;
  font_counts_.init_to_size(fontsize, zero_counts);
}
// Destructor: all members clean up after themselves; nothing to do here.
ErrorCounter::~ErrorCounter() {
}
// Accumulates the errors from the classifier results on a single sample.
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results.
// See the CountTypes comment in errorcounter.h for the decision tree this
// function implements; note that several counters may fire for one sample.
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const UnicityTable<FontInfo>& font_table,
                                    const ShapeTable& shape_table,
                                    const GenericVector<ShapeRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
  int res_index = 0;  // Will end up as the rank of the correct unichar answer.
  bool debug_it = false;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else if (shape_table.GetShape(results[0].shape_id).
      ContainsUnicharAndFont(unichar_id, font_id)) {
    // Top answer is fully correct: both unichar and font match.
    ++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT];
    // Unichar and font OK, but count if multiple unichars.
    if (shape_table.GetShape(results[0].shape_id).size() > 1)
      ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
  } else {
    // This is a top shape error.
    ++font_counts_[font_id].n[CT_SHAPE_TOP_ERR];
    // Check to see if any font in the top choice has attributes that match.
    bool attributes_match = false;
    uinT32 font_props = font_table.get(font_id).properties;
    const Shape& shape = shape_table.GetShape(results[0].shape_id);
    for (int c = 0; c < shape.size() && !attributes_match; ++c) {
      for (int f = 0; f < shape[c].font_ids.size(); ++f) {
        if (font_table.get(shape[c].font_ids[f]).properties == font_props) {
          attributes_match = true;
          break;
        }
      }
    }
    // TODO(rays) It is easy to add counters for individual font attributes
    // here if we want them.
    if (!attributes_match)
      ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
    if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true);
    // Find rank of correct unichar answer. (Ignoring the font.)
    while (res_index < num_results &&
           !shape_table.GetShape(results[res_index].shape_id).
               ContainsUnichar(unichar_id)) {
      ++res_index;
    }
    if (res_index == 0) {
      // The top shape had the right unichar (only the font was wrong).
      // Unichar OK, but count if multiple unichars.
      if (shape_table.GetShape(results[res_index].shape_id).size() > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
      }
    } else {
      // Count maps from unichar id to shape id.
      if (num_results > 0)
        ++unichar_counts_(unichar_id, results[0].shape_id);
      // This is a unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
      if (res_index >= MIN(2, num_results)) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
      if (res_index >= num_results) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
        // Only a top-n error triggers the caller's classifier debug re-run.
        debug_it = debug;
      }
    }
  }
  // Compute mean number of return values and mean rank of correct answer.
  // (An answer that never appears gets rank == num_results.)
  font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
  font_counts_[font_id].n[CT_RANK] += res_index;
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
  }
  if (debug_it) {
    // Dump the full result list for this sample before asking for a debug
    // re-classification.
    tprintf("%d results for char %s font %d :",
            num_results, shape_table.unicharset().id_to_unichar(unichar_id),
            font_id);
    for (int i = 0; i < num_results; ++i) {
      tprintf(" %.3f/%.3f:%s",
              results[i].rating, results[i].font,
              shape_table.DebugStr(results[i].shape_id).string());
    }
    tprintf("\n");
    return true;
  }
  return false;
}
// Accumulates counts for a junk sample. Junk is expected to be rejected, so
// only two outcomes are recorded: CT_REJECTED_JUNK when the classifier gave
// no answer (or a top shape explicitly containing the sample's class id),
// and CT_ACCEPTED_JUNK otherwise. Accepted junk also counts as a boosting
// error via is_error/scaled_error_.
void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table,
                                  const GenericVector<ShapeRating>& results,
                                  TrainingSample* sample) {
  int num_results = results.size();
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  // Acceptable outcomes for junk: no answer at all, or a top shape that
  // contains the sample's own class id.
  bool correctly_rejected =
      num_results == 0 ||
      shape_table.GetShape(results[0].shape_id).ContainsUnichar(unichar_id);
  if (correctly_rejected) {
    ++font_counts_[font_id].n[CT_REJECTED_JUNK];
    sample->set_is_error(false);
  } else {
    // Junk was accepted: an error, so it also contributes its weight to the
    // boosting error sum.
    ++font_counts_[font_id].n[CT_ACCEPTED_JUNK];
    sample->set_is_error(true);
    scaled_error_ += sample->weight();
  }
}
// Creates a report of the error rate. The report_level controls the detail
// that is reported to stderr via tprintf:
// 0   -> no output.
// >=1 -> bottom-line error rate.
// >=3 -> font-level error rate.
// boosting_mode determines the return value. It selects which (un-weighted)
// error rate to return.
// The fontinfo_table from MasterTrainer provides the names of fonts.
// The it determines the current subset of the training samples.
// If not NULL, the top-choice unichar error rate is saved in unichar_error.
// If not NULL, the report string is saved in fonts_report.
// (Ignoring report_level).
double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
                                  const UnicityTable<FontInfo>& fontinfo_table,
                                  const SampleIterator& it,
                                  double* unichar_error,
                                  STRING* fonts_report) {
  // Compute totals over all the fonts and report individual font results
  // when required.
  Counts totals;
  int fontsize = font_counts_.size();
  for (int f = 0; f < fontsize; ++f) {
    // Accumulate counts over fonts.
    totals += font_counts_[f];
    STRING font_report;
    // ReportString returns false (and we skip) for fonts with no samples.
    if (ReportString(font_counts_[f], &font_report)) {
      if (fonts_report != NULL) {
        *fonts_report += fontinfo_table.get(f).name;
        *fonts_report += ": ";
        *fonts_report += font_report;
        *fonts_report += "\n";
      }
      if (report_level > 2) {
        // Report individual font error rates.
        tprintf("%s: %s\n", fontinfo_table.get(f).name, font_report.string());
      }
    }
  }
  if (report_level > 0) {
    // Report the totals.
    STRING total_report;
    if (ReportString(totals, &total_report)) {
      tprintf("TOTAL Scaled Err=%.4g%%, %s\n",
              scaled_error_ * 100.0, total_report.string());
    }
    // Report the worst substitution error only for now.
    if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) {
      const UNICHARSET& unicharset = it.shape_table()->unicharset();
      int charsetsize = unicharset.size();
      int shapesize = it.CompactCharsetSize();
      // Scan the confusion matrix for the single most frequent
      // unichar -> wrong-shape substitution.
      int worst_uni_id = 0;
      int worst_shape_id = 0;
      int worst_err = 0;
      for (int u = 0; u < charsetsize; ++u) {
        for (int s = 0; s < shapesize; ++s) {
          if (unichar_counts_(u, s) > worst_err) {
            worst_err = unichar_counts_(u, s);
            worst_uni_id = u;
            worst_shape_id = s;
          }
        }
      }
      if (worst_err > 0) {
        tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
                worst_uni_id, unicharset.id_to_unichar(worst_uni_id),
                it.shape_table()->DebugStr(worst_shape_id).string(),
                worst_err, totals.n[CT_UNICHAR_TOP1_ERR],
                100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]);
      }
    }
  }
  double rates[CT_SIZE];
  if (!ComputeRates(totals, rates))
    return 0.0;  // No data at all: report a zero error rate.
  // Set output values if asked for.
  if (unichar_error != NULL)
    *unichar_error = rates[CT_UNICHAR_TOP1_ERR];
  return rates[boosting_mode];
}
// Formats the given counts into *report as a human-readable summary followed
// by the raw counts, tab-separated so the line loads directly into a
// spreadsheet. Returns false, leaving *report untouched, if the counts hold
// no data at all.
bool ErrorCounter::ReportString(const Counts& counts, STRING* report) {
  // Convert raw counts to rates first; bail out if there is nothing to say.
  double rates[CT_SIZE];
  if (!ComputeRates(counts, rates))
    return false;
  // With %.4g%% each number normally fits within the width of the format
  // string itself, but allow an extra +eddd per number in case any value
  // overflows into exponent notation.
  const int kMaxExtraLength = 5;  // Length of +eddd.
  // Keep this format string and the snprintf in sync with the CountTypes enum.
  const char* format_str = "ShapeErr=%.4g%%, FontAttr=%.4g%%, "
                           "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], "
                           "Multi=%.4g%%, Rej=%.4g%%, "
                           "Answers=%.3g, Rank=%.3g, "
                           "OKjunk=%.4g%%, Badjunk=%.4g%%";
  int buffer_len = strlen(format_str) + kMaxExtraLength * (CT_SIZE - 1) + 1;
  char* buffer = new char[buffer_len];
  snprintf(buffer, buffer_len, format_str,
           rates[CT_SHAPE_TOP_ERR] * 100.0,
           rates[CT_FONT_ATTR_ERR] * 100.0,
           rates[CT_UNICHAR_TOP1_ERR] * 100.0,
           rates[CT_UNICHAR_TOP2_ERR] * 100.0,
           rates[CT_UNICHAR_TOPN_ERR] * 100.0,
           rates[CT_OK_MULTI_UNICHAR] * 100.0,
           rates[CT_REJECT] * 100.0,
           rates[CT_NUM_RESULTS],
           rates[CT_RANK],
           rates[CT_REJECTED_JUNK] * 100.0,
           rates[CT_ACCEPTED_JUNK] * 100.0);
  *report = buffer;
  delete [] buffer;
  // Append every raw count, each preceded by a tab.
  for (int ct = 0; ct < CT_SIZE; ++ct)
    report->add_str_int("\t", counts.n[ct]);
  return true;
}
// Fills rates (an array of size CT_SIZE) with per-category rates derived
// from counts. Normal-character categories are divided by the total number
// of non-junk samples; junk categories by the number of junk samples.
// Returns false, leaving rates untouched, when there is no data at all.
bool ErrorCounter::ComputeRates(const Counts& counts, double rates[CT_SIZE]) {
  // Every non-junk sample lands in exactly one of these three counters.
  int num_chars = counts.n[CT_SHAPE_TOP_CORRECT] + counts.n[CT_SHAPE_TOP_ERR] +
      counts.n[CT_REJECT];
  int num_junk = counts.n[CT_REJECTED_JUNK] + counts.n[CT_ACCEPTED_JUNK];
  if (num_chars == 0 && num_junk == 0)
    return false;  // Nothing was counted.
  // MAX(x, 1) guards the division when only one of the groups has data.
  double char_denom = static_cast<double>(MAX(num_chars, 1));
  for (int ct = 0; ct <= CT_RANK; ++ct)
    rates[ct] = counts.n[ct] / char_denom;
  double junk_denom = static_cast<double>(MAX(num_junk, 1));
  for (int ct = CT_REJECTED_JUNK; ct <= CT_ACCEPTED_JUNK; ++ct)
    rates[ct] = counts.n[ct] / junk_denom;
  return true;
}
// Default constructor: every counter starts at zero.
ErrorCounter::Counts::Counts() {
  for (int ct = 0; ct < CT_SIZE; ++ct)
    n[ct] = 0;
}
// Element-wise accumulation of another Counts, used when totalling the
// per-font counters.
void ErrorCounter::Counts::operator+=(const Counts& other) {
  for (int i = 0; i < CT_SIZE; ++i)
    n[i] += other.n[i];
}
} // namespace tesseract.

198
classify/errorcounter.h Normal file
View File

@ -0,0 +1,198 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
#define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
#include "genericvector.h"
#include "matrix.h"
struct Pix;
template <typename T> class UnicityTable;
namespace tesseract {
struct FontInfo;
class SampleIterator;
class ShapeClassifier;
class ShapeRating;
class ShapeTable;
class TrainingSample;
// Enumeration of the different types of error count.
// Error counts work as follows:
//
// Ground truth is a valid unichar-id / font-id pair:
//   Number of classifier answers?
//     0                  >0
//     CT_REJECT          BOTH unichar-id and font-id match top shape?
//                          yes!                  no
//                          CT_SHAPE_TOP_CORRECT  CT_SHAPE_TOP_ERR
//                          |                     Font attributes match?
//                          |                       yes!  no
//                          |                       |     CT_FONT_ATTR_ERR
//                          |                     Top unichar-id matches?
//                          |                       yes!  no
//   Top shape-id has multiple unichars?                  CT_UNICHAR_TOP1_ERR
//     yes!                 no                    2nd shape unichar id matches?
//     CT_OK_MULTI_UNICHAR                          yes!  no
//                                                        CT_UNICHAR_TOP2_ERR
//                                                Any unichar-id matches?
//                                                  yes!  no
//                                                        CT_UNICHAR_TOPN_ERR
// Note that multiple counts may be activated for a single sample!
//
// Ground truth is for a fragment/n-gram that is NOT in the unicharset.
// This is called junk and is expected to be rejected:
//   Number of classifier answers?
//     0                 >0
//     CT_REJECTED_JUNK  CT_ACCEPTED_JUNK
//
// Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
// the mean rank of the correct result, counting from 0, and with an error
// receiving the number of answers as the correct rank.
//
// Keep in sync with the ReportString function.
enum CountTypes {
  CT_SHAPE_TOP_CORRECT,  // Top shape id is actually correct.
  CT_SHAPE_TOP_ERR,      // Top shape id is not correct.
  CT_FONT_ATTR_ERR,      // Font attributes incorrect, ignoring unichar.
  CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.
  CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.
  CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.
  CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.
  CT_REJECT,             // Classifier hates this.
  CT_NUM_RESULTS,        // Number of answers produced.
  CT_RANK,               // Rank of correct answer.
  CT_REJECTED_JUNK,      // Junk that was correctly rejected.
  CT_ACCEPTED_JUNK,      // Junk that was incorrectly classified otherwise.

  CT_SIZE                // Number of types for array sizing.
};
// Class to encapsulate all the functionality and sub-structures required
// to count errors for an isolated character classifier (ShapeClassifier).
// All state is private; the only public entry point is the static
// ComputeErrorRate, which constructs, uses and discards an ErrorCounter.
class ErrorCounter {
 public:
  // Computes and returns the unweighted boosting_mode error rate of the given
  // classifier. Can be used for testing, or inside an iterative training
  // system, including one that uses boosting.
  // report_levels:
  // 0 = no output.
  // 1 = bottom-line error rate.
  // 2 = bottom-line error rate + time.
  // 3 = font-level error rate + time.
  // 4 = list of all errors + short classifier debug output on 16 errors.
  // 5 = list of all errors + short classifier debug output on 25 errors.
  // * The boosting_mode determines which error type is used for computing the
  //   scaled_error output, and setting the is_error flag in the samples.
  // * The fontinfo_table is used to get string font names for the debug
  //   output, and also to count font attributes errors.
  // * The page_images vector may contain a Pix* (which may be NULL) for each
  //   page index assigned to the samples.
  // * The it provides encapsulated iteration over some sample set.
  // * The outputs unichar_error, scaled_error and totals_report are all
  //   optional.
  // * If not NULL, unichar error gets the top1 unichar error rate.
  // * Scaled_error gets the error chosen by boosting_mode weighted by the
  //   weights on the samples.
  // * Fonts_report gets a string summarizing the error rates for each font in
  //   both human-readable form and as a tab-separated list of error counts.
  //   The human-readable form is all before the first tab.
  // * The return value is the un-weighted version of the scaled_error.
  static double ComputeErrorRate(ShapeClassifier* classifier,
                                 int report_level, CountTypes boosting_mode,
                                 const UnicityTable<FontInfo>& fontinfo_table,
                                 const GenericVector<Pix*>& page_images,
                                 SampleIterator* it,
                                 double* unichar_error,
                                 double* scaled_error,
                                 STRING* fonts_report);

 private:
  // Simple struct to hold an array of counts, one per CountTypes entry.
  struct Counts {
    Counts();
    // Adds other into this for computing totals.
    void operator+=(const Counts& other);

    int n[CT_SIZE];
  };

  // Constructor is private. Only anticipated use of ErrorCounter is via
  // the static ComputeErrorRate.
  ErrorCounter(int charsetsize, int shapesize, int fontsize);
  ~ErrorCounter();

  // Accumulates the errors from the classifier results on a single sample.
  // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
  // boosting_mode selects the type of error to be used for boosting and the
  // is_error_ member of sample is set according to whether the required type
  // of error occurred. The font_table provides access to font properties
  // for error counting and shape_table is used to understand the relationship
  // between unichar_ids and shape_ids in the results.
  bool AccumulateErrors(bool debug, CountTypes boosting_mode,
                        const UnicityTable<FontInfo>& font_table,
                        const ShapeTable& shape_table,
                        const GenericVector<ShapeRating>& results,
                        TrainingSample* sample);

  // Accumulates counts for junk. Counts only whether the junk was correctly
  // rejected or not.
  void AccumulateJunk(const ShapeTable& shape_table,
                      const GenericVector<ShapeRating>& results,
                      TrainingSample* sample);

  // Creates a report of the error rate. The report_level controls the detail
  // that is reported to stderr via tprintf:
  // 0   -> no output.
  // >=1 -> bottom-line error rate.
  // >=3 -> font-level error rate.
  // boosting_mode determines the return value. It selects which (un-weighted)
  // error rate to return.
  // The fontinfo_table from MasterTrainer provides the names of fonts.
  // The it determines the current subset of the training samples.
  // If not NULL, the top-choice unichar error rate is saved in unichar_error.
  // If not NULL, the report string is saved in fonts_report.
  // (Ignoring report_level).
  double ReportErrors(int report_level, CountTypes boosting_mode,
                      const UnicityTable<FontInfo>& fontinfo_table,
                      const SampleIterator& it,
                      double* unichar_error,
                      STRING* fonts_report);

  // Sets the report string to a combined human and machine-readable report
  // string of the error rates.
  // Returns false if there is no data, leaving report unchanged.
  static bool ReportString(const Counts& counts, STRING* report);

  // Computes the error rates and returns in rates which is an array of size
  // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
  static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]);

  // Total scaled error used by boosting algorithms.
  double scaled_error_;
  // Vector indexed by font_id from the samples of error accumulators.
  GenericVector<Counts> font_counts_;
  // Counts of the results that map each unichar_id (from samples) to an
  // incorrect shape_id.
  GENERIC_2D_ARRAY<int> unichar_counts_;
};
} // namespace tesseract.
#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */

View File

@ -29,15 +29,6 @@ typedef CHAR_FEATURES (*CF_FUNC) ();
-----------------------------------------------------------------------------*/
void ExtractorStub();
/*-----------------------------------------------------------------------------
Global Data Definitions and Declarations
-----------------------------------------------------------------------------*/
/** tables to keep track of the different low level feature extractors */
#define NUM_FX 3
#define DEFAULT_FX 2
int CurrentFx = DEFAULT_FX;
/*-----------------------------------------------------------------------------
Public Code
-----------------------------------------------------------------------------*/

View File

@ -30,64 +30,85 @@
#define ILLEGAL_NUM_SETS 3001
#define PICO_FEATURE_LENGTH 0.05
#define MAX_OUTLINE_FEATURES 100
/*-----------------------------------------------------------------------------
Global Data Definitions and Declarations
-----------------------------------------------------------------------------*/
/* define all of the parameters for the MicroFeature type*/
StartParamDesc (MicroFeatureParams)
DefineParam (0, 0, -0.5, 0.5)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (1, 0, 0.0, 1.0)
const char* kMicroFeatureType = "mf";
const char* kCNFeatureType = "cn";
const char* kIntFeatureType = "if";
const char* kGeoFeatureType = "tb";
// Define all of the parameters for the MicroFeature type.
StartParamDesc(MicroFeatureParams)
DefineParam(0, 0, -0.5, 0.5)
DefineParam(0, 0, -0.25, 0.75)
DefineParam(0, 1, 0.0, 1.0)
DefineParam(1, 0, 0.0, 1.0)
DefineParam (0, 1, -0.5, 0.5)
DefineParam (0, 1, -0.5, 0.5)
EndParamDesc
/* now define the feature type itself (see features.h for info about each
parameter).*/
DefineFeature (MicroFeatureDesc, 5, 1, 4, 50, "Micro", "mf", MicroFeatureParams)
// Now define the feature type itself (see features.h for parameters).
DefineFeature(MicroFeatureDesc, 5, 1, kMicroFeatureType, MicroFeatureParams)
// define all of the parameters for the PicoFeature type
/* define knob that can be used to adjust pico-feature length */
FLOAT32 PicoFeatureLength = PICO_FEATURE_LENGTH;
StartParamDesc (PicoFeatParams)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (1, 0, 0.0, 1.0)
DefineParam (0, 0, -0.5, 0.5)
EndParamDesc
/* now define the feature type itself (see features.h for info about each
parameter).*/
DefineFeature (PicoFeatDesc, 2, 1, 1, MAX_UINT8, "Pico", "pf", PicoFeatParams)
/* define all of the parameters for the NormFeat type*/
// Define all of the parameters for the NormFeat type.
StartParamDesc (CharNormParams)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam(0, 0, -0.25, 0.75)
DefineParam(0, 1, 0.0, 1.0)
DefineParam(0, 0, 0.0, 1.0)
DefineParam(0, 0, 0.0, 1.0)
EndParamDesc
/* now define the feature type itself (see features.h for info about each
parameter).*/
DefineFeature (CharNormDesc, 4, 0, 1, 1, "CharNorm", "cn", CharNormParams)
// Now define the feature type itself (see features.h for parameters).
DefineFeature(CharNormDesc, 4, 0, kCNFeatureType, CharNormParams)
// define all of the parameters for the OutlineFeature type
StartParamDesc (OutlineFeatParams)
DefineParam (0, 0, -0.5, 0.5)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (1, 0, 0.0, 1.0)
// Define all of the parameters for the IntFeature type
StartParamDesc(IntFeatParams)
DefineParam(0, 0, 0.0, 255.0)
DefineParam(0, 0, 0.0, 255.0)
DefineParam(1, 0, 0.0, 255.0)
EndParamDesc
/* now define the feature type itself (see features.h for info about each
parameter).*/
DefineFeature (OutlineFeatDesc, 3, 1, 1, MAX_OUTLINE_FEATURES, "Outline",
"of", OutlineFeatParams)
// Now define the feature type itself (see features.h for parameters).
DefineFeature(IntFeatDesc, 2, 1, kIntFeatureType, IntFeatParams)
// Define all of the parameters for the GeoFeature type
StartParamDesc(GeoFeatParams)
DefineParam(0, 0, 0.0, 255.0)
DefineParam(0, 0, 0.0, 255.0)
DefineParam(0, 0, 0.0, 255.0)
EndParamDesc
// Now define the feature type itself (see features.h for parameters).
DefineFeature(GeoFeatDesc, 3, 0, kGeoFeatureType, GeoFeatParams)
// Other features used for training the adaptive classifier, but not used
// during normal training, therefore not in the DescDefs array.
// Define all of the parameters for the PicoFeature type
// define knob that can be used to adjust pico-feature length.
FLOAT32 PicoFeatureLength = PICO_FEATURE_LENGTH;
StartParamDesc(PicoFeatParams)
DefineParam(0, 0, -0.25, 0.75)
DefineParam(1, 0, 0.0, 1.0)
DefineParam(0, 0, -0.5, 0.5)
EndParamDesc
// Now define the feature type itself (see features.h for parameters).
DefineFeature(PicoFeatDesc, 2, 1, "pf", PicoFeatParams)
// Define all of the parameters for the OutlineFeature type.
StartParamDesc(OutlineFeatParams)
DefineParam(0, 0, -0.5, 0.5)
DefineParam(0, 0, -0.25, 0.75)
DefineParam(0, 0, 0.0, 1.0)
DefineParam(1, 0, 0.0, 1.0)
EndParamDesc
// Now define the feature type itself (see features.h for parameters).
DefineFeature(OutlineFeatDesc, 3, 1, "of", OutlineFeatParams)
// MUST be kept in-sync with ExtractorDefs in fxdefs.cpp.
static const FEATURE_DESC_STRUCT *DescDefs[NUM_FEATURE_TYPES] = {
&MicroFeatureDesc,
&PicoFeatDesc,
&OutlineFeatDesc,
&CharNormDesc
&CharNormDesc,
&IntFeatDesc,
&GeoFeatDesc
};
/*-----------------------------------------------------------------------------
@ -188,6 +209,27 @@ void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
}
} /* WriteCharDescription */
// Return whether all of the fields of the given feature set
// are well defined (not inf or nan).
// Returns true only if at least one finite parameter was seen AND no
// parameter anywhere in the description is NaN or infinite.
// NOTE(review): FeatureDefs is currently unused here — presumably kept for
// signature symmetry with WriteCharDescription; confirm before removing.
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
                          CHAR_DESC CharDesc) {
  bool anything_written = false;  // Saw at least one finite parameter.
  bool well_formed = true;        // No NaN/inf seen anywhere.
  for (int Type = 0; Type < CharDesc->NumFeatureSets; Type++) {
    // Feature sets may be absent (NULL) for some types.
    if (CharDesc->FeatureSets[Type]) {
      for (int i = 0; i < CharDesc->FeatureSets[Type]->NumFeatures; i++) {
        FEATURE feat = CharDesc->FeatureSets[Type]->Features[i];
        for (int p = 0; p < feat->Type->NumParams; p++) {
          if (isnan(feat->Params[p]) || isinf(feat->Params[p]))
            well_formed = false;
          else
            anything_written = true;
        }
      }
    }
  }
  return anything_written && well_formed;
}  /* ValidCharDescription */
/*---------------------------------------------------------------------------*/
/**

View File

@ -25,6 +25,10 @@
/* Enumerate the different types of features currently defined. */
#define NUM_FEATURE_TYPES 4
extern const char* kMicroFeatureType;
extern const char* kCNFeatureType;
extern const char* kIntFeatureType;
extern const char* kGeoFeatureType;
/* define error traps which can be triggered by this module.*/
#define ILLEGAL_SHORT_NAME 2000
@ -58,6 +62,9 @@ void FreeCharDescription(CHAR_DESC CharDesc);
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs);
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
CHAR_DESC CharDesc);
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
FILE *File, CHAR_DESC CharDesc);
@ -74,4 +81,6 @@ extern const FEATURE_DESC_STRUCT MicroFeatureDesc;
extern const FEATURE_DESC_STRUCT PicoFeatDesc;
extern const FEATURE_DESC_STRUCT CharNormDesc;
extern const FEATURE_DESC_STRUCT OutlineFeatDesc;
extern const FEATURE_DESC_STRUCT IntFeatDesc;
extern const FEATURE_DESC_STRUCT GeoFeatDesc;
#endif

View File

@ -22,6 +22,7 @@
#include "normmatch.h"
#include "mfoutline.h"
#include "classify.h"
#include "helpers.h"
#include "picofeat.h"
#define MAX_INT_CHAR_NORM (INT_CHAR_NORM_RANGE - 1)
@ -33,63 +34,44 @@
namespace tesseract {
/**
* For each class in Templates, clear the corresponding
* entry in CharNormArray. CharNormArray is indexed by class
* indicies (as obtained from Templates) rather than class id's.
* For each class in the unicharset, clears the corresponding
* entry in char_norm_array. char_norm_array is indexed by unichar_id.
*
* Globals:
* - none
*
* @param Templates specifies classes currently defined
* @param CharNormArray array to be cleared
* @param char_norm_array array to be cleared
*
* @note Exceptions: none
* @note History: Wed Feb 20 11:20:54 1991, DSJ, Created.
*/
void ClearCharNormArray(INT_TEMPLATES Templates,
CLASS_NORMALIZATION_ARRAY CharNormArray) {
int i;
for (i = 0; i < Templates->NumClasses; i++) {
CharNormArray[i] = 0;
}
void Classify::ClearCharNormArray(uinT8* char_norm_array) {
memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
} /* ClearCharNormArray */
/*---------------------------------------------------------------------------*/
/**
* For each class in Templates, compute the match between
* NormFeature and the normalization protos for that class.
* Convert this number to the range from 0 - 255 and store it
* into CharNormArray. CharNormArray is indexed by class
* indicies (as obtained from Templates) rather than class id's.
* For each class in unicharset, computes the match between
* norm_feature and the normalization protos for that class.
* Converts this number to the range from 0 - 255 and stores it
* into char_norm_array. CharNormArray is indexed by unichar_id.
*
* Globals:
* - none
*
* @param NormFeature character normalization feature
* @param Templates specifies classes currently defined
* @param[out] CharNormArray place to put results
* @param norm_feature character normalization feature
* @param[out] char_norm_array place to put results of size unicharset.size()
*
* @note Exceptions: none
* @note History: Wed Feb 20 11:20:54 1991, DSJ, Created.
*/
void Classify::ComputeIntCharNormArray(
FEATURE NormFeature, INT_TEMPLATES Templates,
CLASS_NORMALIZATION_ARRAY CharNormArray) {
int i;
int NormAdjust;
for (i = 0; i < Templates->NumClasses; i++) {
NormAdjust = (int) (INT_CHAR_NORM_RANGE *
ComputeNormMatch (i, NormFeature, FALSE));
if (NormAdjust < 0)
NormAdjust = 0;
else if (NormAdjust > MAX_INT_CHAR_NORM)
NormAdjust = MAX_INT_CHAR_NORM;
CharNormArray[i] = NormAdjust;
void Classify::ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
uinT8* char_norm_array) {
for (int i = 0; i < unicharset.size(); i++) {
int norm_adjust = static_cast<int>(INT_CHAR_NORM_RANGE *
ComputeNormMatch(i, norm_feature, FALSE));
char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
}
} /* ComputeIntCharNormArray */

View File

@ -27,12 +27,4 @@
#define INT_FEAT_RANGE 256
#define BASELINE_Y_SHIFT (0.25)
/*-----------------------------------------------------------------------------
Public Function Prototypes
-----------------------------------------------------------------------------*/
namespace tesseract {
void ClearCharNormArray(INT_TEMPLATES Templates,
CLASS_NORMALIZATION_ARRAY CharNormArray);
} // namespace tesseract.
#endif

View File

@ -27,15 +27,16 @@
-----------------------------------------------------------------------------*/
// Definitions of extractors separated from feature definitions.
const FEATURE_EXT_STRUCT MicroFeatureExt = { ExtractMicros };
const FEATURE_EXT_STRUCT PicoFeatExt = { NULL };
const FEATURE_EXT_STRUCT OutlineFeatExt = { NULL };
const FEATURE_EXT_STRUCT CharNormExt = { ExtractCharNormFeatures };
const FEATURE_EXT_STRUCT IntFeatExt = { ExtractIntCNFeatures };
const FEATURE_EXT_STRUCT GeoFeatExt = { ExtractIntGeoFeatures };
// MUST be kept in-sync with DescDefs in featdefs.cpp.
const FEATURE_EXT_STRUCT* ExtractorDefs[NUM_FEATURE_TYPES] = {
&MicroFeatureExt,
&PicoFeatExt,
&OutlineFeatExt,
&CharNormExt
&CharNormExt,
&IntFeatExt,
&GeoFeatExt
};
void SetupExtractors(FEATURE_DEFS_STRUCT *FeatureDefs) {

159
classify/intfeaturedist.cpp Normal file
View File

@ -0,0 +1,159 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: intfeaturedist.cpp
// Description: Fast set-difference-based feature distance calculator.
// Created: Thu Sep 01 13:07:30 PDT 2011
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "intfeaturedist.h"
#include "intfeaturemap.h"
namespace tesseract {
// Constructs an empty distance calculator: no feature space attached
// and no presence arrays allocated until Init is called.
IntFeatureDist::IntFeatureDist() {
  size_ = 0;
  total_feature_weight_ = 0.0;
  feature_map_ = NULL;
  features_ = NULL;
  features_delta_one_ = NULL;
  features_delta_two_ = NULL;
}
IntFeatureDist::~IntFeatureDist() {
  // Frees the three bool arrays; feature_map_ is borrowed, not owned.
  Clear();
}
// Initialize the table to the given size of feature space.
// Allocates the three presence arrays (exact, +/-1 offset, +/-2 offset)
// sized to the sparse feature space and clears them all.
void IntFeatureDist::Init(const IntFeatureMap* feature_map) {
  size_ = feature_map->sparse_size();
  Clear();  // Release any previous allocation before re-allocating.
  feature_map_ = feature_map;
  features_ = new bool[size_];
  features_delta_one_ = new bool[size_];
  features_delta_two_ = new bool[size_];
  for (int i = 0; i < size_; ++i) {
    features_[i] = false;
    features_delta_one_[i] = false;
    features_delta_two_[i] = false;
  }
  total_feature_weight_ = 0.0;
}
// Setup the map for the given indexed_features that have been indexed by
// feature_map. With value == true marks each feature as present along with
// its offset neighbors; calling again with value == false on the SAME
// features unmarks them, returning the arrays to their initialized state.
void IntFeatureDist::Set(const GenericVector<int>& indexed_features,
                         int canonical_count, bool value) {
  total_feature_weight_ = canonical_count;
  for (int i = 0; i < indexed_features.size(); ++i) {
    int f = indexed_features[i];
    features_[f] = value;
    // Mark every single-offset neighbor of f (shift or rotation)...
    for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
      if (dir == 0) continue;
      int mapped_f = feature_map_->OffsetFeature(f, dir);
      if (mapped_f >= 0) {
        features_delta_one_[mapped_f] = value;
        // ...and every double-offset neighbor reachable from there.
        for (int dir2 = -kNumOffsetMaps; dir2 <= kNumOffsetMaps; ++dir2) {
          if (dir2 == 0) continue;
          int mapped_f2 = feature_map_->OffsetFeature(mapped_f, dir2);
          if (mapped_f2 >= 0)
            features_delta_two_[mapped_f2] = value;
        }
      }
    }
  }
}
// Compute the distance between the given feature vector and the last
// Set feature vector. Result is normalized by the combined feature
// weight: 0.0 means total overlap, larger values mean more difference.
double IntFeatureDist::FeatureDistance(
    const GenericVector<int>& features) const {
  const int num_test_features = features.size();
  const double denominator = total_feature_weight_ + num_test_features;
  // Start from the worst case (every feature a miss) and subtract
  // credit for each hit, graded by how close the hit is.
  double misses = denominator;
  for (int i = 0; i < num_test_features; ++i) {
    const int index = features[i];
    const double weight = 1.0;
    if (features_[index]) {
      misses -= 2.0 * weight;        // Exact hit.
    } else if (features_delta_one_[index]) {
      misses -= 1.5 * weight;        // Hit at one offset unit.
    } else if (features_delta_two_[index]) {
      misses -= 1.0 * weight;        // Near miss at two offset units.
    }
  }
  return misses / denominator;
}
// Compute the distance between the given feature vector and the last
// Set feature vector, as FeatureDistance, but printing a per-feature
// trace of hits and misses plus dumps of the three presence arrays.
// Returns the same value that FeatureDistance would.
double IntFeatureDist::DebugFeatureDistance(
    const GenericVector<int>& features) const {
  int num_test_features = features.size();
  double denominator = total_feature_weight_ + num_test_features;
  double misses = denominator;
  for (int i = 0; i < num_test_features; ++i) {
    int index = features[i];
    double weight = 1.0;
    INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(features[i]);
    tprintf("Testing feature weight %g:", weight);
    f.print();
    if (features_[index]) {
      // A perfect match.
      misses -= 2.0 * weight;
      tprintf("Perfect hit\n");
    } else if (features_delta_one_[index]) {
      // Hit at one offset unit.
      misses -= 1.5 * weight;
      tprintf("-1 hit\n");
    } else if (features_delta_two_[index]) {
      // A near miss.
      misses -= 1.0 * weight;
      tprintf("-2 hit\n");
    } else {
      tprintf("Total miss\n");
    }
  }
  // Dump each presence array in human-readable form.
  tprintf("Features present:");
  for (int i = 0; i < size_; ++i) {
    if (features_[i]) {
      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
      f.print();
    }
  }
  tprintf("\nMinus one features:");
  for (int i = 0; i < size_; ++i) {
    if (features_delta_one_[i]) {
      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
      f.print();
    }
  }
  tprintf("\nMinus two features:");
  for (int i = 0; i < size_; ++i) {
    if (features_delta_two_[i]) {
      INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
      f.print();
    }
  }
  tprintf("\n");
  return misses / denominator;
}
// Clear all data. Safe to call repeatedly (pointers are NULLed),
// so it can be used both from Init and from the destructor.
void IntFeatureDist::Clear() {
  delete [] features_;
  delete [] features_delta_one_;
  delete [] features_delta_two_;
  features_ = NULL;
  features_delta_one_ = NULL;
  features_delta_two_ = NULL;
}
} // namespace tesseract

80
classify/intfeaturedist.h Normal file
View File

@ -0,0 +1,80 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: intfeaturedist.h
// Description: Fast set-difference-based feature distance calculator.
// Created: Thu Sep 01 12:14:30 PDT 2011
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_INTFEATUREDIST_H_
#define TESSERACT_CLASSIFY_INTFEATUREDIST_H_
#include "genericvector.h"
namespace tesseract {
class IntFeatureMap;
// Feature distance calculator designed to provide a fast distance calculation
// based on set difference between a given feature set and many other feature
// sets in turn.
// Representation of a feature set as an array of bools that are sparsely
// true, and companion arrays that allow fast feature set distance
// calculations with allowance of offsets in position.
// Init is expensive, so for greatest efficiency, to re-initialize for a new
// feature set, use Set(..., false) on the SAME feature set as was used to
// setup with Set(..., true), to return to its initialized state before
// reuse with Set(..., true) on a new feature set.
class IntFeatureDist {
 public:
  IntFeatureDist();
  ~IntFeatureDist();

  // Initialize the bool array to the given size of feature space.
  // The feature_map is just borrowed, and must exist for the entire
  // lifetime of the IntFeatureDist.
  void Init(const IntFeatureMap* feature_map);

  // Setup the map for the given indexed_features that have been indexed by
  // feature_map. After use, use Set(..., false) to reset to the initial state
  // as this is faster than calling Init for sparse spaces.
  void Set(const GenericVector<int>& indexed_features,
           int canonical_count, bool value);

  // Compute the distance between the given feature vector and the last
  // Set feature vector.
  double FeatureDistance(const GenericVector<int>& features) const;
  // As FeatureDistance, but prints a per-feature trace of the match.
  double DebugFeatureDistance(const GenericVector<int>& features) const;

 private:
  // Clear all data.
  void Clear();

  // Size of the indexed feature space.
  int size_;
  // Total weight of features currently stored in the maps.
  double total_feature_weight_;
  // Pointer to IntFeatureMap given at Init to find offset features.
  const IntFeatureMap* feature_map_;
  // Array of bools indicating presence of a feature.
  bool* features_;
  // Array indicating the presence of a feature offset by one unit.
  bool* features_delta_one_;
  // Array indicating the presence of a feature offset by two units.
  bool* features_delta_two_;
};
} // namespace tesseract
#endif // TESSERACT_CLASSIFY_INTFEATUREDIST_H_

245
classify/intfeaturemap.cpp Normal file
View File

@ -0,0 +1,245 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: intfeaturemap.cpp
// Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
// to provide a subspace mapping and fast feature lookup.
// Created: Tue Oct 26 08:58:30 PDT 2010
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "intfeaturemap.h"
#include "intfeaturespace.h"
#include "intfx.h"
// These includes do not exist yet, but will be coming soon.
//#include "sampleiterator.h"
//#include "trainingsample.h"
//#include "trainingsampleset.h"
namespace tesseract {
const int kMaxOffsetDist = 32;
const double kMinPCLengthIncrease = 1.0 / 1024;
// Constructs an unmapped IntFeatureMap; Init must be called before use.
IntFeatureMap::IntFeatureMap() {
  mapping_changed_ = true;
  compact_size_ = 0;
  // No offset look-up tables exist until Init builds them.
  for (int i = 0; i < kNumOffsetMaps; ++i) {
    offset_plus_[i] = NULL;
    offset_minus_[i] = NULL;
  }
}
IntFeatureMap::~IntFeatureMap() {
  // Frees the offset look-up tables.
  Clear();
}
// Pseudo-accessors.
// Quantizes f into the sparse (index) feature space.
int IntFeatureMap::IndexFeature(const INT_FEATURE_STRUCT& f) const {
  return feature_space_.Index(f);
}
// Quantizes f and maps the result to the compact feature space.
int IntFeatureMap::MapFeature(const INT_FEATURE_STRUCT& f) const {
  return feature_map_.SparseToCompact(feature_space_.Index(f));
}
// Maps a sparse index feature to the compact feature space.
int IntFeatureMap::MapIndexFeature(int index_feature) const {
  return feature_map_.SparseToCompact(index_feature);
}
// Returns the (lossy) INT_FEATURE_STRUCT for a sparse index feature.
INT_FEATURE_STRUCT IntFeatureMap::InverseIndexFeature(int index_feature) const {
  return feature_space_.PositionFromIndex(index_feature);
}
// Returns the (lossy) INT_FEATURE_STRUCT for a compact map feature,
// going compact -> sparse -> position.
INT_FEATURE_STRUCT IntFeatureMap::InverseMapFeature(int map_feature) const {
  return feature_space_.PositionFromIndex(
      feature_map_.CompactToSparse(map_feature));
}
// Deletes a compact feature by merging it with -1 in the bidi map,
// and flags the mapping as needing FinalizeMapping.
void IntFeatureMap::DeleteMapFeature(int map_feature) {
  feature_map_.Merge(-1, map_feature);
  mapping_changed_ = true;
}
// Returns true if the given compact feature has been deleted.
bool IntFeatureMap::IsMapFeatureDeleted(int map_feature) const {
  return feature_map_.IsCompactDeleted(map_feature);
}
// Copies the given feature_space and uses it as the index feature map
// from INT_FEATURE_STRUCT.
void IntFeatureMap::Init(const IntFeatureSpace& feature_space) {
feature_space_ = feature_space;
mapping_changed_ = false;
int sparse_size = feature_space_.Size();
feature_map_.Init(sparse_size, true);
feature_map_.Setup();
compact_size_ = feature_map_.CompactSize();
// Initialize look-up tables if needed.
FCOORD dir = FeatureDirection(0);
if (dir.x() == 0.0f && dir.y() == 0.0f)
InitIntegerFX();
// Compute look-up tables to generate offset features.
for (int dir = 0; dir < kNumOffsetMaps; ++dir) {
delete [] offset_plus_[dir];
delete [] offset_minus_[dir];
offset_plus_[dir] = new int[sparse_size];
offset_minus_[dir] = new int[sparse_size];
}
for (int dir = 1; dir <= kNumOffsetMaps; ++dir) {
for (int i = 0; i < sparse_size; ++i) {
int offset_index = ComputeOffsetFeature(i, dir);
offset_plus_[dir - 1][i] = offset_index;
offset_index = ComputeOffsetFeature(i, -dir);
offset_minus_[dir - 1][i] = offset_index;
}
}
}
// Helper to return an offset index feature. In this context an offset
// feature with a dir of +/-1 is a feature of a similar direction,
// but shifted perpendicular to the direction of the feature. An offset
// feature with a dir of +/-2 is feature at the same position, but rotated
// by +/- one [compact] quantum. Returns the index of the generated offset
// feature, or -1 if it doesn't exist. Dir should be in
// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
// A dir of 0 is an identity transformation.
// Both input and output are from the index(sparse) feature space, not
// the mapped/compact feature space, but the offset feature is the minimum
// distance moved from the input to guarantee that it maps to the next
// available quantum in the mapped/compact space.
int IntFeatureMap::OffsetFeature(int index_feature, int dir) const {
  // Identity transformation.
  if (dir == 0)
    return index_feature;
  // Table look-up for directions within [-kNumOffsetMaps, kNumOffsetMaps].
  if (dir > 0 && dir <= kNumOffsetMaps)
    return offset_plus_[dir - 1][index_feature];
  if (dir < 0 && -dir <= kNumOffsetMaps)
    return offset_minus_[-dir - 1][index_feature];
  return -1;  // Out-of-range direction.
}
//#define EXPERIMENT_ON
#ifdef EXPERIMENT_ON // This code is commented out as SampleIterator and
// TrainingSample are not reviewed/checked in yet, but these functions are a
// useful indicator of how an IntFeatureMap is setup.
// Computes the features used by the subset of samples defined by
// the iterator and sets up the feature mapping.
// Returns the size of the compacted feature space.
// NOTE(review): only compiled under EXPERIMENT_ON as SampleIterator and
// TrainingSample were not yet checked in at the time of writing.
int IntFeatureMap::FindNZFeatureMapping(SampleIterator* it) {
  feature_map_.Init(feature_space_.Size(), false);
  int total_samples = 0;
  for (it->Begin(); !it->AtEnd(); it->Next()) {
    const TrainingSample& sample = it->GetSample();
    GenericVector<int> features;
    // Quantize and sort this sample's features into the sparse space.
    feature_space_.IndexAndSortFeatures(sample.features(),
                                        sample.num_features(),
                                        &features);
    int num_features = features.size();
    // Mark each used sparse feature in the bidi map.
    for (int f = 0; f < num_features; ++f)
      feature_map_.SetMap(features[f], true);
    ++total_samples;
  }
  feature_map_.Setup();
  compact_size_ = feature_map_.CompactSize();
  mapping_changed_ = true;
  FinalizeMapping(it);
  tprintf("%d non-zero features found in %d samples\n",
          compact_size_, total_samples);
  return compact_size_;
}
#endif
// After deleting some features, finish setting up the mapping, and map
// all the samples. Returns the size of the compacted feature space.
int IntFeatureMap::FinalizeMapping(SampleIterator* it) {
  if (!mapping_changed_)
    return compact_size_;  // Mapping already up to date.
  feature_map_.CompleteMerges();
  compact_size_ = feature_map_.CompactSize();
#ifdef EXPERIMENT_ON
  it->MapSampleFeatures(*this);
#endif
  mapping_changed_ = false;
  return compact_size_;
}
// Prints the map features from the set in human-readable form.
void IntFeatureMap::DebugMapFeatures(
const GenericVector<int>& map_features) const {
for (int i = 0; i < map_features.size(); ++i) {
INT_FEATURE_STRUCT f = InverseMapFeature(map_features[i]);
f.print();
}
}
// Frees the offset look-up tables and NULLs the pointers so that
// Clear is safe to call more than once.
void IntFeatureMap::Clear() {
  for (int i = 0; i < kNumOffsetMaps; ++i) {
    delete [] offset_plus_[i];
    offset_plus_[i] = NULL;
    delete [] offset_minus_[i];
    offset_minus_[i] = NULL;
  }
}
// Helper to compute an offset index feature. In this context an offset
// feature with a dir of +/-1 is a feature of a similar direction,
// but shifted perpendicular to the direction of the feature. An offset
// feature with a dir of +/-2 is feature at the same position, but rotated
// by +/- one [compact] quantum. Returns the index of the generated offset
// feature, or -1 if it doesn't exist. Dir should be in
// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
// A dir of 0 is an identity transformation.
// Both input and output are from the index(sparse) feature space, not
// the mapped/compact feature space, but the offset feature is the minimum
// distance moved from the input to guarantee that it maps to the next
// available quantum in the mapped/compact space.
int IntFeatureMap::ComputeOffsetFeature(int index_feature, int dir) const {
  // Recover the position of the feature and sanity-check the round trip.
  INT_FEATURE_STRUCT f = InverseIndexFeature(index_feature);
  ASSERT_HOST(IndexFeature(f) == index_feature);
  if (dir == 0) {
    return index_feature;
  } else if (dir == 1 || dir == -1) {
    // Shift perpendicular to the feature direction: rotate the direction
    // vector by 90 degrees and step along it.
    FCOORD feature_dir = FeatureDirection(f.Theta);
    FCOORD rotation90(0.0f, 1.0f);
    feature_dir.rotate(rotation90);
    // Find the nearest existing feature.
    for (int m = 1; m < kMaxOffsetDist; ++m) {
      double x_pos = f.X + feature_dir.x() * (m * dir);
      double y_pos = f.Y + feature_dir.y() * (m * dir);
      int x = IntCastRounded(x_pos);
      int y = IntCastRounded(y_pos);
      if (x >= 0 && x <= MAX_UINT8 && y >= 0 && y <= MAX_UINT8) {
        INT_FEATURE_STRUCT offset_f;
        offset_f.X = x;
        offset_f.Y = y;
        offset_f.Theta = f.Theta;
        int offset_index = IndexFeature(offset_f);
        // Must map to a *different* valid quantum than the input.
        if (offset_index != index_feature && offset_index >= 0)
          return offset_index;  // Found one.
      } else {
        return -1;  // Hit the edge of feature space.
      }
    }
  } else if (dir == 2 || dir == -2) {
    // Rotate in place: step theta (wrapping modulo 256) until the feature
    // maps to a different quantum.
    // Find the nearest existing index_feature.
    for (int m = 1; m < kMaxOffsetDist; ++m) {
      int theta = f.Theta + m * dir / 2;
      INT_FEATURE_STRUCT offset_f;
      offset_f.X = f.X;
      offset_f.Y = f.Y;
      offset_f.Theta = Modulo(theta, 256);
      int offset_index = IndexFeature(offset_f);
      if (offset_index != index_feature && offset_index >= 0)
        return offset_index;  // Found one.
    }
  }
  return -1;  // Nothing within the max distance.
}
} // namespace tesseract.

163
classify/intfeaturemap.h Normal file
View File

@ -0,0 +1,163 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: intfeaturemap.h
// Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
// to provide a subspace mapping and fast feature lookup.
// Created: Tue Oct 26 08:58:30 PDT 2010
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H__
#define TESSERACT_CLASSIFY_INTFEATUREMAP_H__
#include "intfeaturespace.h"
#include "indexmapbidi.h"
#include "intproto.h"
namespace tesseract {
class SampleIterator;
// Number of positive and negative offset maps.
static const int kNumOffsetMaps = 2;
// Class to map a feature space defined by INT_FEATURE_STRUCT to a compact
// down-sampled subspace of actually used features.
// The IntFeatureMap copes with 2 stages of transformation:
// The first step is down-sampling (re-quantization) and converting to a
// single index value from the 3-D input:
// INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and
// the second is a feature-space compaction to map only the feature indices
// that are actually used. This saves space in classifiers that are built
// using the mapped feature space.
// index (sparse) feature <-> map (compact) feature via IndexMapBiDi.
// Although the transformations are reversible, the inverses are lossy and do
// not return the exact input INT_FEATURE_STRUCT, due to the many->one nature
// of both transformations.
class IntFeatureMap {
 public:
  IntFeatureMap();
  ~IntFeatureMap();

  // Accessors.
  // Size of the sparse (indexed, pre-compaction) feature space.
  int sparse_size() const {
    return feature_space_.Size();
  }
  // Size of the compact feature space, as set by Init/FinalizeMapping.
  int compact_size() const {
    return compact_size_;
  }
  const IntFeatureSpace& feature_space() const {
    return feature_space_;
  }
  const IndexMapBiDi& feature_map() const {
    return feature_map_;
  }

  // Pseudo-accessors.
  int IndexFeature(const INT_FEATURE_STRUCT& f) const;
  int MapFeature(const INT_FEATURE_STRUCT& f) const;
  int MapIndexFeature(int index_feature) const;
  INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const;
  INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const;
  void DeleteMapFeature(int map_feature);
  bool IsMapFeatureDeleted(int map_feature) const;

  // Copies the given feature_space and uses it as the index feature map
  // from INT_FEATURE_STRUCT.
  void Init(const IntFeatureSpace& feature_space);

  // Helper to return an offset index feature. In this context an offset
  // feature with a dir of +/-1 is a feature of a similar direction,
  // but shifted perpendicular to the direction of the feature. An offset
  // feature with a dir of +/-2 is feature at the same position, but rotated
  // by +/- one [compact] quantum. Returns the index of the generated offset
  // feature, or -1 if it doesn't exist. Dir should be in
  // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
  // A dir of 0 is an identity transformation.
  // Both input and output are from the index(sparse) feature space, not
  // the mapped/compact feature space, but the offset feature is the minimum
  // distance moved from the input to guarantee that it maps to the next
  // available quantum in the mapped/compact space.
  int OffsetFeature(int index_feature, int dir) const;

  // Computes the features used by the subset of samples defined by
  // the iterator and sets up the feature mapping.
  // Returns the size of the compacted feature space.
  int FindNZFeatureMapping(SampleIterator* it);

  // After deleting some features, finish setting up the mapping, and map
  // all the samples. Returns the size of the compacted feature space.
  int FinalizeMapping(SampleIterator* it);

  // Indexes the given array of features to a vector of sorted indices.
  void IndexAndSortFeatures(const INT_FEATURE_STRUCT* features,
                            int num_features,
                            GenericVector<int>* sorted_features) const {
    feature_space_.IndexAndSortFeatures(features, num_features,
                                        sorted_features);
  }
  // Maps the given array of index/sparse features to an array of map/compact
  // features.
  // Assumes the input is sorted. The output indices are sorted and uniqued.
  // Returns the number of "missed" features, being features that
  // don't map to the compact feature space.
  int MapIndexedFeatures(const GenericVector<int>& index_features,
                         GenericVector<int>* map_features) const {
    return feature_map_.MapFeatures(index_features, map_features);
  }

  // Prints the map features from the set in human-readable form.
  void DebugMapFeatures(const GenericVector<int>& map_features) const;

 private:
  // Frees the offset look-up tables.
  void Clear();

  // Helper to compute an offset index feature. In this context an offset
  // feature with a dir of +/-1 is a feature of a similar direction,
  // but shifted perpendicular to the direction of the feature. An offset
  // feature with a dir of +/-2 is feature at the same position, but rotated
  // by +/- one [compact] quantum. Returns the index of the generated offset
  // feature, or -1 if it doesn't exist. Dir should be in
  // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
  // A dir of 0 is an identity transformation.
  // Both input and output are from the index(sparse) feature space, not
  // the mapped/compact feature space, but the offset feature is the minimum
  // distance moved from the input to guarantee that it maps to the next
  // available quantum in the mapped/compact space.
  int ComputeOffsetFeature(int index_feature, int dir) const;

  // True if the mapping has changed since it was last finalized.
  bool mapping_changed_;
  // Size of the compacted feature space, after unused features are removed.
  int compact_size_;
  // Feature space quantization definition and indexing from INT_FEATURE_STRUCT.
  IntFeatureSpace feature_space_;
  // Mapping from indexed feature space to the compacted space with unused
  // features mapping to -1.
  IndexMapBiDi feature_map_;
  // Index tables to map a feature index to the corresponding feature after a
  // shift perpendicular to the feature direction, or a rotation in place.
  // An entry of -1 indicates that there is no corresponding feature.
  // Array of arrays of size feature_space_.Size() owned by this class.
  int* offset_plus_[kNumOffsetMaps];
  int* offset_minus_[kNumOffsetMaps];

  // Don't use default copy and assign!
  IntFeatureMap(const IntFeatureMap&);
  void operator=(const IntFeatureMap&);
};
} // namespace tesseract.
#endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H__

View File

@ -0,0 +1,143 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: intfeaturespace.cpp
// Description: Indexed feature space based on INT_FEATURE_STRUCT.
// Created: Wed Mar 24 11:21:27 PDT 2010
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "intfeaturespace.h"
#include "intfx.h"
namespace tesseract {
// Constructs an empty (zero-bucket) feature space; Init must be called
// before the space is usable.
IntFeatureSpace::IntFeatureSpace()
  : x_buckets_(0), y_buckets_(0), theta_buckets_(0) {
}
// Sets the quantization of the space: the number of buckets in each of
// the x, y and theta dimensions.
void IntFeatureSpace::Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets) {
  x_buckets_ = xbuckets;
  y_buckets_ = ybuckets;
  theta_buckets_ = thetabuckets;
}
// Serializes the feature space definition to the given file.
// Returns false on error.
bool IntFeatureSpace::Serialize(FILE* fp) const {
  // Short-circuit evaluation preserves the write order: x, y, theta.
  return fwrite(&x_buckets_, sizeof(x_buckets_), 1, fp) == 1 &&
         fwrite(&y_buckets_, sizeof(y_buckets_), 1, fp) == 1 &&
         fwrite(&theta_buckets_, sizeof(theta_buckets_), 1, fp) == 1;
}
// DeSerializes the feature space definition from the given file.
// If swap is true, the data is big/little-endian swapped.
// NOTE(review): swap is currently unused - all three fields are single
// bytes, so no endian conversion is needed.
// Returns false on error.
bool IntFeatureSpace::DeSerialize(bool swap, FILE* fp) {
  // Short-circuit evaluation preserves the read order: x, y, theta.
  return fread(&x_buckets_, sizeof(x_buckets_), 1, fp) == 1 &&
         fread(&y_buckets_, sizeof(y_buckets_), 1, fp) == 1 &&
         fread(&theta_buckets_, sizeof(theta_buckets_), 1, fp) == 1;
}
// Returns an INT_FEATURE_STRUCT corresponding to the given index.
// This is the inverse of the Index member.
// The index decomposes as:
//   x_bucket * (y_buckets_ * theta_buckets_) + y_bucket * theta_buckets_
//   + theta_bucket.
INT_FEATURE_STRUCT IntFeatureSpace::PositionFromIndex(int index) const {
  return PositionFromBuckets(index / (y_buckets_ * theta_buckets_),
                             index / theta_buckets_ % y_buckets_,
                             index % theta_buckets_);
}
// Bulk calls to Index. Maps the given array of features to a vector of
// inT32 indices in the same order as the input.
void IntFeatureSpace::IndexFeatures(const INT_FEATURE_STRUCT* features,
                                    int num_features,
                                    GenericVector<int>* mapped_features) const {
  mapped_features->truncate(0);  // Discard any previous contents.
  for (int i = 0; i < num_features; ++i) {
    mapped_features->push_back(Index(features[i]));
  }
}
// Bulk calls to Index. Maps the given array of features to a vector of
// sorted inT32 indices.
void IntFeatureSpace::IndexAndSortFeatures(
    const INT_FEATURE_STRUCT* features, int num_features,
    GenericVector<int>* sorted_features) const {
  sorted_features->truncate(0);  // Discard any previous contents.
  for (int i = 0; i < num_features; ++i) {
    sorted_features->push_back(Index(features[i]));
  }
  sorted_features->sort();
}
// Returns a feature space index for the given x,y position in a display
// window, or -1 if the feature is a miss. Debug/UI helper: prints the
// mapping of the click position as it goes.
int IntFeatureSpace::XYToFeatureIndex(int x, int y) const {
  // Round the x,y position to a feature. Search for a valid theta.
  INT_FEATURE_STRUCT feature = {static_cast<uinT8>(x), static_cast<uinT8>(y),
                                0, 0};
  int index = -1;
  for (int theta = 0; theta <= MAX_UINT8 && index < 0; ++theta) {
    feature.Theta = theta;
    index = Index(feature);
  }
  if (index < 0) {
    tprintf("(%d,%d) does not exist in feature space!\n", x, y);
    return -1;
  }
  feature = PositionFromIndex(index);
  tprintf("Click at (%d, %d) ->(%d, %d), ->(%d, %d)\n",
          x, y, feature.X, feature.Y, x - feature.X, y - feature.Y);
  // Get the relative position of x,y from the rounded feature.
  x -= feature.X;
  y -= feature.Y;
  if (x != 0 || y != 0) {
    // Derive theta from the angle between the rounded feature and the
    // original click point, scaled into the feature extent.
    double angle = atan2(static_cast<double>(y), static_cast<double>(x)) + PI;
    angle *= kIntFeatureExtent / (2.0 * PI);
    feature.Theta = static_cast<uinT8>(angle + 0.5);
    index = Index(feature);
    if (index < 0) {
      tprintf("Feature failed to map to a valid index:");
      feature.print();
      return -1;
    }
    feature = PositionFromIndex(index);
  }
  feature.print();
  return index;
}
// Returns an INT_FEATURE_STRUCT corresponding to the given bucket coords.
INT_FEATURE_STRUCT IntFeatureSpace::PositionFromBuckets(int x,
                                                        int y,
                                                        int theta) const {
  // Map each bucket index back to (approximately) the center of its cell in
  // the [0,255] input space, clipping into the valid uinT8 range.
  const int pos_x =
      (x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_;
  const int pos_y =
      (y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_;
  const int pos_theta = DivRounded(theta * kIntFeatureExtent, theta_buckets_);
  INT_FEATURE_STRUCT pos = {
      static_cast<uinT8>(ClipToRange(pos_x, 0, MAX_UINT8)),
      static_cast<uinT8>(ClipToRange(pos_y, 0, MAX_UINT8)),
      static_cast<uinT8>(ClipToRange(pos_theta, 0, MAX_UINT8))};
  return pos;
}
} // namespace tesseract.

110
classify/intfeaturespace.h Normal file
View File

@ -0,0 +1,110 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: intfeaturespace.h
// Description: Indexed feature space based on INT_FEATURE_STRUCT.
// Created: Wed Mar 24 10:55:30 PDT 2010
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_INTFEATURESPACE_H__
#define TESSERACT_CLASSIFY_INTFEATURESPACE_H__
#include "genericvector.h"
#include "intproto.h"
// Extent of x,y,theta in the input feature space. [0,255].
const int kIntFeatureExtent = 256;
// Extent of x,y,theta dimensions in the quantized feature space.
const int kBoostXYBuckets = 16;
const int kBoostDirBuckets = 16;
namespace tesseract {
class IndexMap;
// Down-sampling quantization of the INT_FEATURE_STRUCT feature space and
// conversion to a single scalar index value, used as a binary feature space.
// Each of x, y, theta in [0, kIntFeatureExtent) is quantized into a fixed
// number of buckets, and the three bucket indices are packed into a single
// int in [0, Size()).
class IntFeatureSpace {
 public:
  IntFeatureSpace();
  // Default copy constructors and assignment OK!
  // Setup the feature space with the given dimensions.
  void Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets);
  // Serializes the feature space definition to the given file.
  // Returns false on error.
  bool Serialize(FILE* fp) const;
  // DeSerializes the feature space definition from the given file.
  // If swap is true, the data is big/little-endian swapped.
  // Returns false on error.
  bool DeSerialize(bool swap, FILE* fp);
  // Returns the total size of the feature space (number of distinct indices).
  int Size() const {
    return static_cast<int>(x_buckets_) * y_buckets_ * theta_buckets_;
  }
  // Returns an INT_FEATURE_STRUCT corresponding to the given index.
  // This is the inverse of the Index member.
  INT_FEATURE_STRUCT PositionFromIndex(int index) const;
  // Returns a 1-dimensional index corresponding to the given feature value.
  // Range is [0, Size()-1]. Inverse of PositionFromIndex member.
  // Packs as (x_bucket * y_buckets_ + y_bucket) * theta_buckets_ + theta.
  int Index(const INT_FEATURE_STRUCT& f) const {
    return (XBucket(f.X) * y_buckets_ + YBucket(f.Y)) * theta_buckets_ +
        ThetaBucket(f.Theta);
  }
  // Bulk calls to Index. Maps the given array of features to a vector of
  // inT32 indices in the same order as the input.
  void IndexFeatures(const INT_FEATURE_STRUCT* features, int num_features,
                     GenericVector<int>* mapped_features) const;
  // Bulk calls to Index. Maps the given array of features to a vector of
  // sorted inT32 indices.
  void IndexAndSortFeatures(const INT_FEATURE_STRUCT* features,
                            int num_features,
                            GenericVector<int>* sorted_features) const;
  // Returns a feature space index for the given x,y position in a display
  // window, or -1 if the feature is a miss. (Debug/UI helper.)
  int XYToFeatureIndex(int x, int y) const;

 protected:
  // Converters to generate indices for individual feature dimensions.
  int XBucket(int x) const {
    int bucket = x * x_buckets_ / kIntFeatureExtent;
    return ClipToRange(bucket, 0, static_cast<int>(x_buckets_) - 1);
  }
  int YBucket(int y) const {
    int bucket = y * y_buckets_ / kIntFeatureExtent;
    return ClipToRange(bucket, 0, static_cast<int>(y_buckets_) - 1);
  }
  // Use DivRounded for theta so that exactly vertical and horizontal are in
  // the middle of a bucket. The Modulo takes care of the wrap-around.
  int ThetaBucket(int theta) const {
    int bucket = DivRounded(theta * theta_buckets_, kIntFeatureExtent);
    return Modulo(bucket, theta_buckets_);
  }
  // Returns an INT_FEATURE_STRUCT corresponding to the given buckets.
  INT_FEATURE_STRUCT PositionFromBuckets(int x, int y, int theta) const;

  // Feature space definition - serialized.
  // Number of buckets per dimension; product must fit in an int.
  uinT8 x_buckets_;
  uinT8 y_buckets_;
  uinT8 theta_buckets_;
};
} // namespace tesseract.
#endif // TESSERACT_CLASSIFY_INTFEATURESPACE_H__

View File

@ -23,9 +23,13 @@
#include "const.h"
#include "helpers.h"
#include "ccutil.h"
#include "statistc.h"
#include "trainingsample.h"
#ifdef __UNIX__
#endif
using tesseract::TrainingSample;
/**----------------------------------------------------------------------------
Private Function Prototypes
----------------------------------------------------------------------------**/
@ -55,6 +59,10 @@ INT_VAR(classify_radius_gyr_max_exp, 8,
// atan(0.0) ... atan(ATAN_TABLE_SIZE - 1 / ATAN_TABLE_SIZE)
// The entries are in binary degrees where a full circle is 256 binary degrees.
static uinT8 AtanTable[ATAN_TABLE_SIZE];
// Look up table for cos and sin to turn the intfx feature angle to a vector.
// Also protected by atan_table_mutex.
static float cos_table[INT_CHAR_NORM_RANGE];
static float sin_table[INT_CHAR_NORM_RANGE];
// Guards write access to AtanTable so we dont create it more than once.
tesseract::CCUtilMutex atan_table_mutex;
@ -71,11 +79,46 @@ void InitIntegerFX() {
AtanTable[i] =
(uinT8) (atan ((i / (float) ATAN_TABLE_SIZE)) * 128.0 / PI + 0.5);
}
for (int i = 0; i < INT_CHAR_NORM_RANGE; ++i) {
cos_table[i] = cos(i * 2 * PI / INT_CHAR_NORM_RANGE + PI);
sin_table[i] = sin(i * 2 * PI / INT_CHAR_NORM_RANGE + PI);
}
atan_table_init = true;
}
atan_table_mutex.Unlock();
}
// Returns a vector representing the direction of a feature with the given
// theta direction in an INT_FEATURE_STRUCT.
FCOORD FeatureDirection(uinT8 theta) {
  // The lookup tables are filled in by InitIntegerFX.
  const float dir_x = cos_table[theta];
  const float dir_y = sin_table[theta];
  return FCOORD(dir_x, dir_y);
}
// Runs integer feature extraction on the blob and wraps the result in a
// freshly allocated TrainingSample (caller takes ownership), using either
// the character-normalized or baseline-normalized feature set depending on
// mode. Returns NULL if no features were extracted.
TrainingSample* GetIntFeatures(tesseract::NormalizationMode mode,
                               TBLOB *blob, const DENORM& denorm) {
  INT_FEATURE_ARRAY blfeatures;
  INT_FEATURE_ARRAY cnfeatures;
  INT_FX_RESULT_STRUCT fx_info;
  ExtractIntFeat(blob, denorm, blfeatures, cnfeatures, &fx_info, NULL);
  TrainingSample* sample = NULL;
  switch (mode) {
    case tesseract::NM_CHAR_ANISOTROPIC:
      // Character-normalized features.
      if (fx_info.NumCN > 0) {
        sample = TrainingSample::CopyFromFeatures(fx_info, cnfeatures,
                                                  fx_info.NumCN);
      }
      break;
    case tesseract::NM_BASELINE:
      // Baseline-normalized features.
      if (fx_info.NumBL > 0) {
        sample = TrainingSample::CopyFromFeatures(fx_info, blfeatures,
                                                  fx_info.NumBL);
      }
      break;
    default:
      ASSERT_HOST(!"Unsupported normalization mode!");
  }
  return sample;
}
/*--------------------------------------------------------------------------*/
// Extract a set of standard-sized features from Blobs and write them out in
@ -101,7 +144,7 @@ int ExtractIntFeat(TBLOB *Blob,
const DENORM& denorm,
INT_FEATURE_ARRAY BLFeat,
INT_FEATURE_ARRAY CNFeat,
INT_FX_RESULT Results,
INT_FX_RESULT_STRUCT* Results,
inT32 *FeatureOutlineArray) {
TESSLINE *OutLine;
@ -131,6 +174,8 @@ int ExtractIntFeat(TBLOB *Blob,
Results->Ry = 0;
Results->NumBL = 0;
Results->NumCN = 0;
Results->YBottom = MAX_UINT8;
Results->YTop = 0;
// Calculate the centroid (Xmean, Ymean) for the blob.
// We use centroid (instead of center of bounding box or center of smallest
@ -200,6 +245,8 @@ int ExtractIntFeat(TBLOB *Blob,
Iy = 0;
NumBLFeatures = 0;
OutLine = Blob->outlines;
int min_x = 0;
int max_x = 0;
while (OutLine != NULL) {
LoopStart = OutLine->loop;
Loop = LoopStart;
@ -213,6 +260,11 @@ int ExtractIntFeat(TBLOB *Blob,
Loop = Loop->next;
NormX = Loop->pos.x - Xmean;
NormY = Loop->pos.y;
if (NormY < Results->YBottom)
Results->YBottom = ClipToRange(NormY, 0, MAX_UINT8);
if (NormY > Results->YTop)
Results->YTop = ClipToRange(NormY, 0, MAX_UINT8);
UpdateRange(NormX, &min_x, &max_x);
n = 1;
if (!Segment->IsHidden()) {
@ -261,6 +313,7 @@ int ExtractIntFeat(TBLOB *Blob,
while (Loop != LoopStart);
OutLine = OutLine->next;
}
Results->Width = max_x - min_x;
if (Ix == 0)
Ix = 1;
if (Iy == 0)
@ -440,6 +493,7 @@ int SaveFeature(INT_FEATURE_ARRAY FeatureArray,
Feature->X = ClipToRange<inT16>(X, 0, 255);
Feature->Y = ClipToRange<inT16>(Y, 0, 255);
Feature->Theta = Theta;
Feature->CP_misses = 0;
return TRUE;
}

View File

@ -23,31 +23,43 @@
----------------------------------------------------------------------------**/
#include "blobs.h"
#include "intproto.h"
#include "normalis.h"
#include <math.h>
class DENORM;
typedef struct
{
inT32 Length; /* total length of all outlines */
inT16 Xmean, Ymean; /* center of mass of all outlines */
inT16 Rx, Ry; /* radius of gyration */
inT16 NumBL, NumCN; /* number of features extracted */
namespace tesseract {
class TrainingSample;
}
INT_FX_RESULT_STRUCT, *INT_FX_RESULT;
// Summary statistics produced by integer feature extraction for one blob;
// filled in by ExtractIntFeat.
struct INT_FX_RESULT_STRUCT {
  inT32 Length;          // total length of all outlines
  inT16 Xmean, Ymean;    // center of mass of all outlines
  inT16 Rx, Ry;          // radius of gyration
  inT16 NumBL, NumCN;    // number of features extracted
  inT16 Width;           // Width of blob in BLN coords.
  uinT8 YBottom;         // Bottom of blob in BLN coords.
  uinT8 YTop;            // Top of blob in BLN coords.
};
/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
void InitIntegerFX();
// Returns a vector representing the direction of a feature with the given
// theta direction in an INT_FEATURE_STRUCT.
FCOORD FeatureDirection(uinT8 theta);
tesseract::TrainingSample* GetIntFeatures(
tesseract::NormalizationMode mode, TBLOB *blob,
const DENORM& denorm);
int ExtractIntFeat(TBLOB *Blob,
const DENORM& denorm,
INT_FEATURE_ARRAY BLFeat,
INT_FEATURE_ARRAY CNFeat,
INT_FX_RESULT Results,
INT_FX_RESULT_STRUCT* Results,
inT32 *FeatureOutlineArray = 0);
uinT8 BinaryAnglePlusPi(inT32 Y, inT32 X);

View File

@ -23,8 +23,11 @@
#include "intproto.h"
#include "callcpp.h"
#include "scrollview.h"
#include "float2int.h"
#include "globals.h"
#include "helpers.h"
#include "classify.h"
#include "shapetable.h"
#include <math.h>
// Include automatically generated configuration file if running autoconf.
@ -35,6 +38,11 @@
/*----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------*/
// Parameters of the sigmoid used to convert similarity to evidence in the
// similarity_evidence_table_ that is used to convert distance metric to an
// 8 bit evidence value in the secondary matcher. (See IntMatcher::Init).
const float IntegerMatcher::kSEExponentialMultiplier = 0.0;
const float IntegerMatcher::kSimilarityCenter = 0.0075;
static const uinT8 offset_table[256] = {
255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
@ -89,275 +97,360 @@ static const uinT8 next_table[256] = {
0xf8, 0xfc, 0xfc, 0xfe
};
struct ClassPrunerData {
int *class_count_;
int *norm_count_;
int *sort_key_;
int *sort_index_;
int max_classes_;
namespace tesseract {
ClassPrunerData(int max_classes) {
// class_count_ and friends are referenced by indexing off of data in
// class pruner word sized chunks. Each pruner word is of sized
// BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
// BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
// See Classify::ClassPruner in intmatcher.cpp.
max_classes_ = RoundUp(
// Encapsulation of the intermediate data and computations made by the class
// pruner. The class pruner implements a simple linear classifier on binary
// features by heavily quantizing the feature space, and applying
// NUM_BITS_PER_CLASS (2)-bit weights to the features. Lack of resolution in
// weights is compensated by a non-constant bias that is dependent on the
// number of features present.
class ClassPruner {
public:
ClassPruner(int max_classes) {
// The unrolled loop in ComputeScores means that the array sizes need to
// be rounded up so that the array is big enough to accommodate the extra
// entries accessed by the unrolling. Each pruner word is of sized
// BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
// BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
// See ComputeScores.
max_classes_ = max_classes;
rounded_classes_ = RoundUp(
max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);
class_count_ = new int[max_classes_];
norm_count_ = new int[max_classes_];
sort_key_ = new int[max_classes_ + 1];
sort_index_ = new int[max_classes_ + 1];
for (int i = 0; i < max_classes_; i++) {
class_count_ = new int[rounded_classes_];
norm_count_ = new int[rounded_classes_];
sort_key_ = new int[rounded_classes_ + 1];
sort_index_ = new int[rounded_classes_ + 1];
for (int i = 0; i < rounded_classes_; i++) {
class_count_[i] = 0;
}
pruning_threshold_ = 0;
num_features_ = 0;
num_classes_ = 0;
}
~ClassPrunerData() {
~ClassPruner() {
delete []class_count_;
delete []norm_count_;
delete []sort_key_;
delete []sort_index_;
}
};
// Computes the scores for every class in the character set, by summing the
// weights for each feature and stores the sums internally in class_count_.
// Each feature contributes a NUM_BITS_PER_CLASS-bit weight per class, read
// from the quantized 3-D pruner tables in int_templates.
void ComputeScores(const INT_TEMPLATES_STRUCT* int_templates,
                   int num_features, const INT_FEATURE_STRUCT* features) {
  num_features_ = num_features;
  int num_pruners = int_templates->NumClassPruners;
  for (int f = 0; f < num_features; ++f) {
    const INT_FEATURE_STRUCT* feature = &features[f];
    // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
    int x = feature->X * NUM_CP_BUCKETS >> 8;
    int y = feature->Y * NUM_CP_BUCKETS >> 8;
    int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
    int class_id = 0;
    // Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
    // we need a collection of them, indexed by pruner_set.
    for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
      // Look up quantized feature in a 3-D array, an array of weights for
      // each class.
      const uinT32* pruner_word_ptr =
          int_templates->ClassPruners[pruner_set]->p[x][y][theta];
      for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
        uinT32 pruner_word = *pruner_word_ptr++;
        // This inner loop is unrolled to speed up the ClassPruner.
        // Currently gcc would not unroll it unless it is set to O3
        // level of optimization or -funroll-loops is specified.
        /*
        uinT32 class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
        for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
          class_count_[class_id++] += pruner_word & class_mask;
          pruner_word >>= NUM_BITS_PER_CLASS;
        }
        */
        // 16 unrolled iterations: extract each 2-bit weight from the word
        // and accumulate into the running per-class count. The array sizes
        // were rounded up in the constructor to make the over-run at the end
        // of the class list safe.
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        pruner_word >>= NUM_BITS_PER_CLASS;
        class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
      }
    }
  }
}
const float IntegerMatcher::kSEExponentialMultiplier = 0.0;
const float IntegerMatcher::kSimilarityCenter = 0.0075;
// Adjusts the scores according to the number of expected features. Used
// in lieu of a constant bias, this penalizes classes that expect more
// features than there are present. Thus an actual c will score higher for c
// than e, even though almost all the features match e as well as c, because
// e expects more features to be present.
void AdjustForExpectedNumFeatures(const uinT16* expected_num_features,
                                  int cutoff_strength) {
  for (int class_id = 0; class_id < max_classes_; ++class_id) {
    const int expected = expected_num_features[class_id];
    if (num_features_ >= expected)
      continue;  // Enough features present; no penalty.
    const int deficit = expected - num_features_;
    class_count_[class_id] -= class_count_[class_id] * deficit /
        (num_features_ * cutoff_strength + deficit);
  }
}
// Zeros the scores for classes disabled in the unicharset.
// Implements the black-list to recognize a subset of the character set.
void DisableDisabledClasses(const UNICHARSET& unicharset) {
  for (int c = 0; c < max_classes_; ++c) {
    if (unicharset.get_enabled(c))
      continue;
    class_count_[c] = 0;  // This char is disabled!
  }
}
// Zeros the scores of fragments.
// Do not include character fragments in the class pruner
// results if disable_character_fragments is true.
void DisableFragments(const UNICHARSET& unicharset) {
  for (int c = 0; c < max_classes_; ++c) {
    if (unicharset.get_fragment(c))
      class_count_[c] = 0;
  }
}
// Normalizes the counts for xheight, putting the normalized result in
// norm_count_. Applies a simple subtractive penalty for incorrect vertical
// position provided by the normalization_factors array, indexed by
// character class, and scaled by the norm_multiplier.
void NormalizeForXheight(int norm_multiplier,
                         const uinT8* normalization_factors) {
  for (int c = 0; c < max_classes_; ++c) {
    const int penalty = (norm_multiplier * normalization_factors[c]) >> 8;
    norm_count_[c] = class_count_[c] - penalty;
  }
}
// The nop normalization copies the class_count_ array to norm_count_.
void NoNormalization() {
  for (int c = 0; c < max_classes_; ++c)
    norm_count_[c] = class_count_[c];
}
// Prunes the classes using <the maximum count> * pruning_factor/256 as a
// threshold for keeping classes. If max_of_non_fragments, then ignore
// fragments in computing the maximum count.
// Results are left in sort_index_/sort_key_ (1-based, sorted ascending by
// HeapSort), with num_classes_ holding the survivor count.
void PruneAndSort(int pruning_factor, bool max_of_non_fragments,
                  const UNICHARSET& unicharset) {
  // Find the maximum count to scale the pruning threshold from.
  int max_count = 0;
  for (int c = 0; c < max_classes_; ++c) {
    if (norm_count_[c] > max_count &&
        // This additional check is added in order to ensure that
        // the classifier will return at least one non-fragmented
        // character match.
        // TODO(daria): verify that this helps accuracy and does not
        // hurt performance.
        (!max_of_non_fragments || !unicharset.get_fragment(c))) {
      max_count = norm_count_[c];
    }
  }
  // Prune Classes.
  pruning_threshold_ = (max_count * pruning_factor) >> 8;
  // Select Classes.
  // Floor of 1 ensures zero-count classes are always pruned.
  if (pruning_threshold_ < 1)
    pruning_threshold_ = 1;
  num_classes_ = 0;
  for (int class_id = 0; class_id < max_classes_; class_id++) {
    if (norm_count_[class_id] >= pruning_threshold_) {
      // Note: sort arrays are filled from index 1 (1-based), as required by
      // HeapSort below.
      ++num_classes_;
      sort_index_[num_classes_] = class_id;
      sort_key_[num_classes_] = norm_count_[class_id];
    }
  }
  // Sort Classes using Heapsort Algorithm.
  if (num_classes_ > 1)
    HeapSort(num_classes_, sort_key_, sort_index_);
}
// Prints debug info on the class pruner matches for the pruned classes only.
// Walks the same pruner tables as ComputeScores, printing each surviving
// class's per-feature weight. Must be called after PruneAndSort so that
// pruning_threshold_ is valid.
void DebugMatch(const Classify& classify,
                const INT_TEMPLATES_STRUCT* int_templates,
                const INT_FEATURE_STRUCT* features) const {
  int num_pruners = int_templates->NumClassPruners;
  int max_num_classes = int_templates->NumClasses;
  for (int f = 0; f < num_features_; ++f) {
    const INT_FEATURE_STRUCT* feature = &features[f];
    tprintf("F=%3d(%d,%d,%d),", f, feature->X, feature->Y, feature->Theta);
    // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
    int x = feature->X * NUM_CP_BUCKETS >> 8;
    int y = feature->Y * NUM_CP_BUCKETS >> 8;
    int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
    int class_id = 0;
    for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
      // Look up quantized feature in a 3-D array, an array of weights for
      // each class.
      const uinT32* pruner_word_ptr =
          int_templates->ClassPruners[pruner_set]->p[x][y][theta];
      for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
        uinT32 pruner_word = *pruner_word_ptr++;
        // 16 two-bit weights per 32-bit word; print only classes that
        // survived pruning.
        for (int word_class = 0; word_class < 16 &&
             class_id < max_num_classes; ++word_class, ++class_id) {
          if (norm_count_[class_id] >= pruning_threshold_) {
            tprintf(" %s=%d,",
                    classify.ClassIDToDebugStr(int_templates,
                                               class_id, 0).string(),
                    pruner_word & CLASS_PRUNER_CLASS_MASK);
          }
          pruner_word >>= NUM_BITS_PER_CLASS;
        }
      }
    }
    tprintf("\n");
  }
}
// Prints a summary of the pruner result.
// For each surviving class (best first, since sort_index_ is ascending and
// indexed from the top), shows the raw count, expected feature count,
// xheight adjustment, normalized count, and a percentage badness rating.
void SummarizeResult(const Classify& classify,
                     const INT_TEMPLATES_STRUCT* int_templates,
                     const uinT16* expected_num_features,
                     int norm_multiplier,
                     const uinT8* normalization_factors) const {
  tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_);
  for (int i = 0; i < num_classes_; ++i) {
    // sort arrays are 1-based; num_classes_ - i walks from best to worst.
    int class_id = sort_index_[num_classes_ - i];
    STRING class_string = classify.ClassIDToDebugStr(int_templates,
                                                     class_id, 0);
    tprintf("%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n",
            class_string.string(),
            class_count_[class_id],
            expected_num_features[class_id],
            (norm_multiplier * normalization_factors[class_id]) >> 8,
            sort_key_[num_classes_ - i],
            100.0 - 100.0 * sort_key_[num_classes_ - i] /
              (CLASS_PRUNER_CLASS_MASK * num_features_));
  }
}
// Copies the pruned, sorted classes into the output results and returns
// the number of classes.
int SetupResults(CP_RESULT_STRUCT* results) const {
  // Maximum achievable score, used to convert counts to a [0,1] rating
  // where lower is better.
  const float max_score =
      static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_;
  for (int i = 0; i < num_classes_; ++i) {
    // sort arrays are 1-based and ascending, so walk them from the top to
    // emit best-first.
    const int src = num_classes_ - i;
    results[i].Class = sort_index_[src];
    results[i].Rating = 1.0 - sort_key_[src] / max_score;
  }
  return num_classes_;
}
private:
// Array[rounded_classes_] of initial counts for each class.
int *class_count_;
// Array[rounded_classes_] of modified counts for each class after normalizing
// for expected number of features, disabled classes, fragments, and xheights.
int *norm_count_;
// Array[rounded_classes_ +1] of pruned counts that gets sorted
int *sort_key_;
// Array[rounded_classes_ +1] of classes corresponding to sort_key_.
int *sort_index_;
// Number of classes in this class pruner.
int max_classes_;
// Rounded up number of classes used for array sizes.
int rounded_classes_;
// Threshold count applied to prune classes.
int pruning_threshold_;
// The number of features used to compute the scores.
int num_features_;
// Final number of pruned classes.
int num_classes_;
};
/*----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
namespace tesseract {
int Classify::ClassPruner(INT_TEMPLATES IntTemplates,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
CLASS_PRUNER_RESULTS Results) {
// Runs the class pruner from int_templates on the given features, returning
// the number of classes output in results.
// int_templates Class pruner tables
// num_features Number of features in blob
// features Array of features
// normalization_factors Array of fudge factors from blob
// normalization process (by CLASS_INDEX)
// expected_num_features Array of expected number of features
// for each class (by CLASS_INDEX)
// results Sorted Array of pruned classes. Must be an array
// of size at least int_templates->NumClasses.
int Classify::PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
int num_features,
const INT_FEATURE_STRUCT* features,
const uinT8* normalization_factors,
const uinT16* expected_num_features,
CP_RESULT_STRUCT* results) {
/*
** Parameters:
** IntTemplates Class pruner tables
** NumFeatures Number of features in blob
** Features Array of features
** NormalizationFactors Array of fudge factors from blob
** normalization process
** (by CLASS_INDEX)
** ExpectedNumFeatures Array of expected number of features
** for each class
** (by CLASS_INDEX)
** Results Sorted Array of pruned classes
** (by CLASS_ID)
** Operation:
** Prune the classes using a modified fast match table.
** Return a sorted list of classes along with the number
** of pruned classes in that list.
** Return: Number of pruned classes.
** Exceptions: none
** History: Tue Feb 19 10:24:24 MST 1991, RWM, Created.
** Operation:
** Prunes the classes using a modified fast match table.
** Returns a sorted list of classes along with the number
** of pruned classes in that list.
** Return: Number of pruned classes.
** Exceptions: none
** History: Tue Feb 19 10:24:24 MST 1991, RWM, Created.
*/
uinT32 PrunerWord;
inT32 class_index; //index to class
int Word;
uinT32 *BasePrunerAddress;
uinT32 feature_address; //current feature index
INT_FEATURE feature; //current feature
CLASS_PRUNER *ClassPruner;
int PrunerSet;
int NumPruners;
inT32 feature_index; //current feature
ClassPruner pruner(int_templates->NumClasses);
// Compute initial match scores for all classes.
pruner.ComputeScores(int_templates, num_features, features);
// Adjust match scores for number of expected features.
pruner.AdjustForExpectedNumFeatures(expected_num_features,
classify_cp_cutoff_strength);
// Apply disabled classes in unicharset - only works without a shape_table.
if (shape_table_ == NULL)
pruner.DisableDisabledClasses(unicharset);
// If fragments are disabled, remove them, also only without a shape table.
if (disable_character_fragments && shape_table_ == NULL)
pruner.DisableFragments(unicharset);
int MaxNumClasses = IntTemplates->NumClasses;
ClassPrunerData data(IntTemplates->NumClasses);
int *ClassCount = data.class_count_;
int *NormCount = data.norm_count_;
int *SortKey = data.sort_key_;
int *SortIndex = data.sort_index_;
int out_class;
int MaxCount;
int NumClasses;
FLOAT32 max_rating; //max allowed rating
CLASS_ID class_id;
/* Update Class Counts */
NumPruners = IntTemplates->NumClassPruners;
for (feature_index = 0; feature_index < NumFeatures; feature_index++) {
feature = &Features[feature_index];
feature_address = (((feature->X * NUM_CP_BUCKETS >> 8) * NUM_CP_BUCKETS +
(feature->Y * NUM_CP_BUCKETS >> 8)) * NUM_CP_BUCKETS +
(feature->Theta * NUM_CP_BUCKETS >> 8)) << 1;
ClassPruner = IntTemplates->ClassPruner;
class_index = 0;
for (PrunerSet = 0; PrunerSet < NumPruners; PrunerSet++, ClassPruner++) {
BasePrunerAddress = (uinT32 *) (*ClassPruner) + feature_address;
for (Word = 0; Word < WERDS_PER_CP_VECTOR; Word++) {
PrunerWord = *BasePrunerAddress++;
// This inner loop is unrolled to speed up the ClassPruner.
// Currently gcc would not unroll it unless it is set to O3
// level of optimization or -funroll-loops is specified.
/*
uinT32 class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
ClassCount[class_index++] += PrunerWord & class_mask;
PrunerWord >>= NUM_BITS_PER_CLASS;
}
*/
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
PrunerWord >>= NUM_BITS_PER_CLASS;
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
}
}
// If we have good x-heights, apply the given normalization factors.
if (normalization_factors != NULL) {
pruner.NormalizeForXheight(classify_class_pruner_multiplier,
normalization_factors);
} else {
pruner.NoNormalization();
}
// Do the actual pruning and sort the short-list.
pruner.PruneAndSort(classify_class_pruner_threshold,
shape_table_ == NULL, unicharset);
/* Adjust Class Counts for Number of Expected Features */
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
if (NumFeatures < ExpectedNumFeatures[class_id]) {
int deficit = ExpectedNumFeatures[class_id] - NumFeatures;
ClassCount[class_id] -= ClassCount[class_id] * deficit /
(NumFeatures * classify_cp_cutoff_strength + deficit);
}
if (!unicharset.get_enabled(class_id))
ClassCount[class_id] = 0; // This char is disabled!
// Do not include character fragments in the class pruner
// results if disable_character_fragments is true.
if (disable_character_fragments && unicharset.get_fragment(class_id)) {
ClassCount[class_id] = 0;
}
if (classify_debug_level > 2) {
pruner.DebugMatch(*this, int_templates, features);
}
/* Adjust Class Counts for Normalization Factors */
MaxCount = 0;
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
NormCount[class_id] = ClassCount[class_id]
- ((classify_class_pruner_multiplier * NormalizationFactors[class_id])
>> 8);
if (NormCount[class_id] > MaxCount &&
// This additional check is added in order to ensure that
// the classifier will return at least one non-fragmented
// character match.
// TODO(daria): verify that this helps accuracy and does not
// hurt performance.
!unicharset.get_fragment(class_id)) {
MaxCount = NormCount[class_id];
}
}
/* Prune Classes */
MaxCount *= classify_class_pruner_threshold;
MaxCount >>= 8;
/* Select Classes */
if (MaxCount < 1)
MaxCount = 1;
NumClasses = 0;
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
if (NormCount[class_id] >= MaxCount) {
NumClasses++;
SortIndex[NumClasses] = class_id;
SortKey[NumClasses] = NormCount[class_id];
}
}
/* Sort Classes using Heapsort Algorithm */
if (NumClasses > 1)
HeapSort(NumClasses, SortKey, SortIndex);
if (classify_debug_level > 1) {
cprintf ("CP:%d classes, %d features:\n", NumClasses, NumFeatures);
for (class_id = 0; class_id < NumClasses; class_id++) {
cprintf ("%s:C=%d, E=%d, N=%d, Rat=%d\n",
unicharset.debug_str(SortIndex[NumClasses - class_id]).string(),
ClassCount[SortIndex[NumClasses - class_id]],
ExpectedNumFeatures[SortIndex[NumClasses - class_id]],
SortKey[NumClasses - class_id],
1010 - 1000 * SortKey[NumClasses - class_id] /
(CLASS_PRUNER_CLASS_MASK * NumFeatures));
}
if (classify_debug_level > 2) {
NumPruners = IntTemplates->NumClassPruners;
for (feature_index = 0; feature_index < NumFeatures;
feature_index++) {
cprintf ("F=%3d,", feature_index);
feature = &Features[feature_index];
feature_address =
(((feature->X * NUM_CP_BUCKETS >> 8) * NUM_CP_BUCKETS +
(feature->Y * NUM_CP_BUCKETS >> 8)) * NUM_CP_BUCKETS +
(feature->Theta * NUM_CP_BUCKETS >> 8)) << 1;
ClassPruner = IntTemplates->ClassPruner;
class_index = 0;
for (PrunerSet = 0; PrunerSet < NumPruners;
PrunerSet++, ClassPruner++) {
BasePrunerAddress = (uinT32 *) (*ClassPruner)
+ feature_address;
for (Word = 0; Word < WERDS_PER_CP_VECTOR; Word++) {
PrunerWord = *BasePrunerAddress++;
for (class_id = 0; class_id < 16; class_id++, class_index++) {
if (NormCount[class_index] >= MaxCount)
cprintf (" %s=%d,",
unicharset.id_to_unichar(class_index),
PrunerWord & CLASS_PRUNER_CLASS_MASK);
PrunerWord >>= NUM_BITS_PER_CLASS;
}
}
}
cprintf ("\n");
}
cprintf ("Adjustments:");
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
if (NormCount[class_id] > MaxCount)
cprintf(" %s=%d,",
unicharset.id_to_unichar(class_id),
-((classify_class_pruner_multiplier *
NormalizationFactors[class_id]) >> 8));
}
cprintf ("\n");
}
pruner.SummarizeResult(*this, int_templates, expected_num_features,
classify_class_pruner_multiplier,
normalization_factors);
}
/* Set Up Results */
max_rating = 0.0f;
for (class_id = 0, out_class = 0; class_id < NumClasses; class_id++) {
Results[out_class].Class = SortIndex[NumClasses - class_id];
Results[out_class].Rating =
1.0 - SortKey[NumClasses - class_id] /
(static_cast<float>(CLASS_PRUNER_CLASS_MASK) * NumFeatures);
out_class++;
}
NumClasses = out_class;
return NumClasses;
// Convert to the expected output format.
return pruner.SetupResults(results);
}
} // namespace tesseract
@ -366,10 +459,8 @@ int Classify::ClassPruner(INT_TEMPLATES IntTemplates,
void IntegerMatcher::Match(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
uinT16 BlobLength,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
uinT8 NormalizationFactor,
const INT_FEATURE_STRUCT* Features,
INT_RESULT Result,
int AdaptFeatureThreshold,
int Debug,
@ -436,12 +527,11 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate,
tables->UpdateSumOfProtoEvidences(ClassTemplate, ConfigMask, NumFeatures);
tables->NormalizeSums(ClassTemplate, NumFeatures, NumFeatures);
BestMatch = FindBestMatch(ClassTemplate, *tables, BlobLength,
NormalizationFactor, Result);
BestMatch = FindBestMatch(ClassTemplate, *tables, Result);
#ifndef GRAPHICS_DISABLED
if (PrintMatchSummaryOn(Debug))
DebugBestMatch(BestMatch, Result, BlobLength, NormalizationFactor);
DebugBestMatch(BestMatch, Result);
if (MatchDebuggingOn(Debug))
cprintf("Match Complete --------------------------------------------\n");
@ -718,7 +808,7 @@ int IntegerMatcher::UpdateTablesForFeature(
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
int FeatureNum,
INT_FEATURE Feature,
const INT_FEATURE_STRUCT* Feature,
ScratchEvidence *tables,
int Debug) {
/*
@ -1048,7 +1138,7 @@ void IntegerMatcher::DisplayFeatureDebugInfo(
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
const INT_FEATURE_STRUCT* Features,
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows) {
@ -1146,8 +1236,6 @@ void ScratchEvidence::NormalizeSums(
int IntegerMatcher::FindBestMatch(
INT_CLASS ClassTemplate,
const ScratchEvidence &tables,
uinT16 BlobLength,
uinT8 NormalizationFactor,
INT_RESULT Result) {
/*
** Parameters:
@ -1168,7 +1256,7 @@ int IntegerMatcher::FindBestMatch(
/* Find best match */
for (int ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
int rating = tables.sum_feature_evidence_[ConfigNum];
if (*classify_debug_level_ > 1)
if (*classify_debug_level_ > 2)
cprintf("Config %d, rating=%d\n", ConfigNum, rating);
if (rating > BestMatch) {
if (BestMatch > 0) {
@ -1186,31 +1274,28 @@ int IntegerMatcher::FindBestMatch(
}
/* Compute Certainty Rating */
Result->Rating = ((65536.0 - BestMatch) / 65536.0 * BlobLength +
local_matcher_multiplier_ * NormalizationFactor / 256.0) /
(BlobLength + local_matcher_multiplier_);
Result->Rating = (65536.0 - BestMatch) / 65536.0;
return BestMatch;
}
// Blends the character-normalization error into the given rating and
// returns the adjusted rating. The result is a weighted mean of the
// raw rating (weight = blob_length) and the normalization factor
// scaled to [0,1) by /256 (weight = local_matcher_multiplier_).
float IntegerMatcher::ApplyCNCorrection(float rating, int blob_length,
                                        int normalization_factor) {
  double cn_error = local_matcher_multiplier_ * normalization_factor / 256.0;
  double total_weight = blob_length + local_matcher_multiplier_;
  return (rating * blob_length + cn_error) / total_weight;
}
/*---------------------------------------------------------------------------*/
#ifndef GRAPHICS_DISABLED
// Print debug information about the best match for the current class.
void IntegerMatcher::DebugBestMatch(
int BestMatch, INT_RESULT Result, uinT16 BlobLength,
uinT8 NormalizationFactor) {
cprintf("Rating = %5.1f%% Best Config = %3d\n",
100.0 * ((*Result).Rating), (int) ((*Result).Config));
cprintf
("Matcher Error = %5.1f%% Blob Length = %3d Weight = %4.1f%%\n",
100.0 * (65536.0 - BestMatch) / 65536.0, (int) BlobLength,
100.0 * BlobLength / (BlobLength + local_matcher_multiplier_));
cprintf
("Char Norm Error = %5.1f%% Norm Strength = %3d Weight = %4.1f%%\n",
100.0 * NormalizationFactor / 256.0,
local_matcher_multiplier_,
100.0 * local_matcher_multiplier_ /
(BlobLength + local_matcher_multiplier_));
int BestMatch, INT_RESULT Result) {
tprintf("Rating = %5.1f%% Best Config = %3d, Distance = %5.1f\n",
100.0 * Result->Rating, Result->Config,
100.0 * (65536.0 - BestMatch) / 65536.0);
}
#endif

View File

@ -56,8 +56,6 @@ struct CP_RESULT_STRUCT {
typedef CP_RESULT_STRUCT CLASS_PRUNER_RESULTS[MAX_NUM_CLASSES];
typedef uinT8 CLASS_NORMALIZATION_ARRAY[MAX_NUM_CLASSES];
/*----------------------------------------------------------------------------
Variables
-----------------------------------------------------------------------------*/
@ -113,15 +111,18 @@ class IntegerMatcher {
void Match(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
uinT16 BlobLength,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
uinT8 NormalizationFactor,
const INT_FEATURE_STRUCT* Features,
INT_RESULT Result,
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows);
// Applies the CN normalization factor to the given rating and returns
// the modified rating.
float ApplyCNCorrection(float rating, int blob_length,
int normalization_factor);
int FindGoodProtos(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
@ -148,14 +149,12 @@ class IntegerMatcher {
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
int FeatureNum,
INT_FEATURE Feature,
const INT_FEATURE_STRUCT* Feature,
ScratchEvidence *evidence,
int Debug);
int FindBestMatch(INT_CLASS ClassTemplate,
const ScratchEvidence &tables,
uinT16 BlobLength,
uinT8 NormalizationFactor,
INT_RESULT Result);
#ifndef GRAPHICS_DISABLED
@ -179,15 +178,12 @@ class IntegerMatcher {
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
inT16 NumFeatures,
INT_FEATURE_ARRAY Features,
const INT_FEATURE_STRUCT* Features,
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows);
void DebugBestMatch(int BestMatch,
INT_RESULT Result,
uinT16 BlobLength,
uinT8 NormalizationFactor);
void DebugBestMatch(int BestMatch, INT_RESULT Result);
#endif

View File

@ -18,19 +18,6 @@
/*-----------------------------------------------------------------------------
Include Files and Type Defines
-----------------------------------------------------------------------------*/
#include "helpers.h"
#include "intproto.h"
#include "picofeat.h"
#include "mfoutline.h"
#include "emalloc.h"
#include "const.h"
#include "ndminx.h"
#include "svmnode.h"
#include "globals.h"
#include "classify.h"
#include "genericvector.h"
//extern GetPicoFeatureLength();
#include <math.h>
#include <stdio.h>
@ -39,11 +26,29 @@
#include <unistd.h>
#endif
#include "classify.h"
#include "const.h"
#include "emalloc.h"
#include "fontinfo.h"
#include "genericvector.h"
#include "globals.h"
#include "helpers.h"
#include "intproto.h"
#include "mfoutline.h"
#include "ndminx.h"
#include "picofeat.h"
#include "shapetable.h"
#include "svmnode.h"
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
using tesseract::FontInfo;
using tesseract::FontSet;
using tesseract::FontSpacingInfo;
/* match debug display constants*/
#define PROTO_PRUNER_SCALE (4.0)
@ -126,7 +131,7 @@ FLOAT32 BucketStart(int Bucket, FLOAT32 Offset, int NumBuckets);
FLOAT32 BucketEnd(int Bucket, FLOAT32 Offset, int NumBuckets);
void DoFill(FILL_SPEC *FillSpec,
CLASS_PRUNER Pruner,
CLASS_PRUNER_STRUCT* Pruner,
register uinT32 ClassMask,
register uinT32 ClassCount,
register uinT32 WordIndex);
@ -218,7 +223,6 @@ double_VAR(classify_pp_side_pad, 2.5, "Proto Pruner Side Pad");
*/
void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class) {
int Pruner;
uinT32 *Word;
assert (LegalClassId (ClassId));
if (ClassId != Templates->NumClasses) {
@ -231,13 +235,8 @@ void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class) {
if (Templates->NumClasses > MaxNumClassesIn (Templates)) {
Pruner = Templates->NumClassPruners++;
Templates->ClassPruner[Pruner] =
(CLASS_PRUNER) Emalloc (sizeof (CLASS_PRUNER_STRUCT));
for (Word = reinterpret_cast<uinT32*>(Templates->ClassPruner[Pruner]);
Word < reinterpret_cast<uinT32*>(Templates->ClassPruner[Pruner]) +
WERDS_PER_CP;
*Word++ = 0);
Templates->ClassPruners[Pruner] = new CLASS_PRUNER_STRUCT;
memset(Templates->ClassPruners[Pruner], 0, sizeof(CLASS_PRUNER_STRUCT));
}
} /* AddIntClass */
@ -296,14 +295,14 @@ int AddIntProto(INT_CLASS Class) {
ProtoSet = (PROTO_SET) Emalloc(sizeof(PROTO_SET_STRUCT));
Class->ProtoSets[ProtoSetId] = ProtoSet;
for (Word = reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner);
Word < reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner) + WERDS_PER_PP;
*Word++ = 0);
memset(ProtoSet, 0, sizeof(*ProtoSet));
/* reallocate space for the proto lengths and install in class */
Class->ProtoLengths =
(uinT8 *)Erealloc(Class->ProtoLengths,
MaxNumIntProtosIn(Class) * sizeof(uinT8));
memset(&Class->ProtoLengths[Index], 0,
sizeof(*Class->ProtoLengths) * (MaxNumIntProtosIn(Class) - Index));
}
/* initialize proto so its length is zero and it isn't in any configs */
@ -335,7 +334,7 @@ void AddProtoToClassPruner (PROTO Proto, CLASS_ID ClassId,
*/
#define MAX_LEVEL 2
{
CLASS_PRUNER Pruner;
CLASS_PRUNER_STRUCT* Pruner;
uinT32 ClassMask;
uinT32 ClassCount;
uinT32 WordIndex;
@ -636,7 +635,7 @@ INT_TEMPLATES Classify::CreateIntTemplates(CLASSES FloatProtos,
/*---------------------------------------------------------------------------*/
#ifndef GRAPHICS_DISABLED
void DisplayIntFeature(INT_FEATURE Feature, FLOAT32 Evidence) {
void DisplayIntFeature(const INT_FEATURE_STRUCT* Feature, FLOAT32 Evidence) {
/*
** Parameters:
** Feature pico-feature to be displayed
@ -697,7 +696,6 @@ INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs) {
INT_CLASS Class;
PROTO_SET ProtoSet;
int i;
register uinT32 *Word;
assert(MaxNumConfigs <= MAX_NUM_CONFIGS);
@ -713,17 +711,20 @@ INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs) {
for (i = 0; i < Class->NumProtoSets; i++) {
/* allocate space for a proto set, install in class, and initialize */
ProtoSet = (PROTO_SET) Emalloc(sizeof(PROTO_SET_STRUCT));
memset(ProtoSet, 0, sizeof(*ProtoSet));
Class->ProtoSets[i] = ProtoSet;
for (Word = reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner);
Word < reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner) + WERDS_PER_PP;
*Word++ = 0);
/* allocate space for the proto lengths and install in class */
}
if (MaxNumIntProtosIn (Class) > 0) {
Class->ProtoLengths =
(uinT8 *)Emalloc(MaxNumIntProtosIn (Class) * sizeof (uinT8));
memset(Class->ProtoLengths, 0,
MaxNumIntProtosIn(Class) * sizeof(*Class->ProtoLengths));
} else {
Class->ProtoLengths = NULL;
}
memset(Class->ConfigLengths, 0, sizeof(Class->ConfigLengths));
return (Class);
@ -776,120 +777,11 @@ void free_int_templates(INT_TEMPLATES templates) {
for (i = 0; i < templates->NumClasses; i++)
free_int_class(templates->Class[i]);
for (i = 0; i < templates->NumClassPruners; i++)
Efree(templates->ClassPruner[i]);
delete templates->ClassPruners[i];
Efree(templates);
}
/*---------------------------------------------------------------------------*/
// Code to read/write Classify::font*table structures.
namespace {
// Reads a FontInfo from the file: a 32-bit length, the name string of
// that length, then the 32-bit properties word. If swap is true,
// byte-swaps the 32-bit values. Returns false on any read failure or
// on a corrupt (negative) length. On success fi->name owns the newly
// allocated, NUL-terminated name.
bool read_info(FILE* f, FontInfo* fi, bool swap) {
  inT32 size;
  if (fread(&size, sizeof(size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&size);
  // Guard against a corrupt/garbage length before allocating: a negative
  // inT32 would convert to a huge unsigned allocation size.
  if (size < 0) return false;
  char* font_name = new char[size + 1];
  fi->name = font_name;
  if (fread(font_name, sizeof(*font_name), size, f) !=
      static_cast<size_t>(size))
    return false;
  font_name[size] = '\0';
  if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fi->properties);
  return true;
}
bool write_info(FILE* f, const FontInfo& fi) {
inT32 size = strlen(fi.name);
if (fwrite(&size, sizeof(size), 1, f) != 1) return false;
if (fwrite(fi.name, sizeof(*fi.name), size, f) != size) return false;
if (fwrite(&fi.properties, sizeof(fi.properties), 1, f) != 1) return false;
return true;
}
// Reads the per-unichar spacing vector for one font. Each entry is
// x_gap_before, x_gap_after, kern_size, then (if kern_size > 0) the
// kerned unichar-id and x-gap vectors. A negative kern_size encodes a
// NULL slot in fi->spacing_vec. Returns false on read failure; on
// success the entries are owned by *fi via add_spacing().
bool read_spacing_info(FILE *f, FontInfo* fi, bool swap) {
  inT32 vec_size, kern_size;
  if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
  if (swap) Reverse32(&vec_size);
  ASSERT_HOST(vec_size >= 0);
  if (vec_size == 0) return true;
  fi->init_spacing(vec_size);
  for (int i = 0; i < vec_size; ++i) {
    FontSpacingInfo *fs = new FontSpacingInfo();
    if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
        fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
        fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
      delete fs;  // fs was leaked on this early return.
      return false;
    }
    if (swap) {
      ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
      ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
      Reverse32(&kern_size);
    }
    if (kern_size < 0) {  // indication of a NULL entry in fi->spacing_vec
      delete fs;
      continue;
    }
    if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
                          !fs->kerned_x_gaps.DeSerialize(swap, f))) {
      delete fs;  // fs was leaked on this early return.
      return false;
    }
    fi->add_spacing(i, fs);
  }
  return true;
}
bool write_spacing_info(FILE* f, const FontInfo& fi) {
inT32 vec_size = (fi.spacing_vec == NULL) ? 0 : fi.spacing_vec->size();
if (fwrite(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
inT16 x_gap_invalid = -1;
for (int i = 0; i < vec_size; ++i) {
FontSpacingInfo *fs = fi.spacing_vec->get(i);
inT32 kern_size = (fs == NULL) ? -1 : fs->kerned_x_gaps.size();
if (fs == NULL) {
if (fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
return false;
}
} else {
if (fwrite(&(fs->x_gap_before), sizeof(fs->x_gap_before), 1, f) != 1 ||
fwrite(&(fs->x_gap_after), sizeof(fs->x_gap_after), 1, f) != 1 ||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
return false;
}
}
if (kern_size > 0 && (!fs->kerned_unichar_ids.Serialize(f) ||
!fs->kerned_x_gaps.Serialize(f))) {
return false;
}
}
return true;
}
// Reads a FontSet: a 32-bit count followed by that many FontInfo-table
// ids. If swap is true, byte-swaps all 32-bit values. Returns false on
// read failure or a corrupt (negative) count. On success fs->configs
// owns the newly allocated array.
bool read_set(FILE* f, FontSet* fs, bool swap) {
  if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fs->size);
  // Guard against a corrupt/garbage count before allocating: a negative
  // count would convert to a huge unsigned allocation size.
  if (fs->size < 0) return false;
  fs->configs = new int[fs->size];
  for (int i = 0; i < fs->size; ++i) {
    if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
    if (swap)
      Reverse32(&fs->configs[i]);
  }
  return true;
}
// Writes a FontSet: the 32-bit count, then each FontInfo-table id in
// order. Stops and returns false as soon as any write fails.
bool write_set(FILE* f, const FontSet& fs) {
  bool ok = fwrite(&fs.size, sizeof(fs.size), 1, f) == 1;
  for (int c = 0; ok && c < fs.size; ++c) {
    ok = fwrite(&fs.configs[c], sizeof(fs.configs[c]), 1, f) == 1;
  }
  return ok;
}
} // namespace.
namespace tesseract {
INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
/*
@ -909,7 +801,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
int unicharset_size;
int version_id = 0;
INT_TEMPLATES Templates;
CLASS_PRUNER Pruner;
CLASS_PRUNER_STRUCT* Pruner;
INT_CLASS Class;
uinT8 *Lengths;
PROTO_SET ProtoSet;
@ -919,11 +811,11 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
CLASS_ID class_id, max_class_id;
inT16 *IndexFor = new inT16[MAX_NUM_CLASSES];
CLASS_ID *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
CLASS_PRUNER *TempClassPruner = new CLASS_PRUNER[MAX_NUM_CLASS_PRUNERS];
CLASS_PRUNER_STRUCT **TempClassPruner =
new CLASS_PRUNER_STRUCT*[MAX_NUM_CLASS_PRUNERS];
uinT32 SetBitsForMask = // word with NUM_BITS_PER_CLASS
(1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
uinT32 Mask, NewMask, ClassBits;
uinT32 *Word;
int MaxNumConfigs = MAX_NUM_CONFIGS;
int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
@ -979,9 +871,9 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
/* then read in the class pruners */
for (i = 0; i < Templates->NumClassPruners; i++) {
Pruner = (CLASS_PRUNER) Emalloc(sizeof(CLASS_PRUNER_STRUCT));
Pruner = new CLASS_PRUNER_STRUCT;
if ((nread =
fread((char *) Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
fread(Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
File)) != sizeof(CLASS_PRUNER_STRUCT))
cprintf("Bad read of inttemp!\n");
if (swap) {
@ -989,7 +881,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
for (y = 0; y < NUM_CP_BUCKETS; y++) {
for (z = 0; z < NUM_CP_BUCKETS; z++) {
for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
Reverse32(&Pruner[x][y][z][w]);
Reverse32(&Pruner->p[x][y][z][w]);
}
}
}
@ -998,7 +890,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
if (version_id < 2) {
TempClassPruner[i] = Pruner;
} else {
Templates->ClassPruner[i] = Pruner;
Templates->ClassPruners[i] = Pruner;
}
}
@ -1010,11 +902,8 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
if (ClassIdFor[i] > max_class_id)
max_class_id = ClassIdFor[i];
for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
Templates->ClassPruner[i] =
(CLASS_PRUNER) Emalloc(sizeof(CLASS_PRUNER_STRUCT));
for (Word = (uinT32 *) (Templates->ClassPruner[i]);
Word < (uinT32 *) (Templates->ClassPruner[i]) + WERDS_PER_CP;
*Word++ = 0);
Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
}
// Convert class pruners from the old format (indexed by class index)
// to the new format (indexed by class id).
@ -1024,7 +913,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
for (y = 0; y < NUM_CP_BUCKETS; y++)
for (z = 0; z < NUM_CP_BUCKETS; z++)
for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
if (TempClassPruner[i][x][y][z][w] == 0)
if (TempClassPruner[i]->p[x][y][z][w] == 0)
continue;
for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
@ -1033,7 +922,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
// Single out NUM_BITS_PER_CLASS bits relating to class_id.
Mask = SetBitsForMask << b;
ClassBits = TempClassPruner[i][x][y][z][w] & Mask;
ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
// Move these bits to the new position in which they should
// appear (indexed corresponding to the class_id).
new_i = CPrunerIdFor(class_id);
@ -1047,13 +936,13 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
// Copy bits relating to class_id to the correct position
// in Templates->ClassPruner.
NewMask = SetBitsForMask << new_b;
Templates->ClassPruner[new_i][x][y][z][new_w] &= ~NewMask;
Templates->ClassPruner[new_i][x][y][z][new_w] |= ClassBits;
Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
}
}
}
for (i = 0; i < Templates->NumClassPruners; i++) {
Efree (TempClassPruner[i]);
delete TempClassPruner[i];
}
}
@ -1217,7 +1106,6 @@ void Classify::ShowMatchDisplay() {
** History: Thu Mar 21 15:47:33 1991, DSJ, Created.
*/
InitIntMatchWindowIfReqd();
c_clear_window(IntMatchWindow);
if (ProtoDisplayWindow) {
ProtoDisplayWindow->Clear();
}
@ -1227,7 +1115,6 @@ void Classify::ShowMatchDisplay() {
ClearFeatureSpaceWindow(
static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
IntMatchWindow);
IntMatchWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
INT_MAX_X, INT_MAX_Y);
if (ProtoDisplayWindow) {
@ -1299,7 +1186,7 @@ void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
/* then write out the class pruners */
for (i = 0; i < Templates->NumClassPruners; i++)
fwrite(Templates->ClassPruner[i],
fwrite(Templates->ClassPruners[i],
sizeof(CLASS_PRUNER_STRUCT), 1, File);
/* then write out each class */
@ -1385,7 +1272,7 @@ FLOAT32 BucketEnd(int Bucket, FLOAT32 Offset, int NumBuckets) {
/*---------------------------------------------------------------------------*/
void DoFill(FILL_SPEC *FillSpec,
CLASS_PRUNER Pruner,
CLASS_PRUNER_STRUCT* Pruner,
register uinT32 ClassMask,
register uinT32 ClassCount,
register uinT32 WordIndex) {
@ -1421,11 +1308,11 @@ void DoFill(FILL_SPEC *FillSpec,
for (Y = FillSpec->YStart; Y <= FillSpec->YEnd; Y++)
for (Angle = FillSpec->AngleStart;
TRUE; CircularIncrement (Angle, NUM_CP_BUCKETS)) {
OldWord = Pruner[X][Y][Angle][WordIndex];
OldWord = Pruner->p[X][Y][Angle][WordIndex];
if (ClassCount > (OldWord & ClassMask)) {
OldWord &= ~ClassMask;
OldWord |= ClassCount;
Pruner[X][Y][Angle][WordIndex] = OldWord;
Pruner->p[X][Y][Angle][WordIndex] = OldWord;
}
if (Angle == FillSpec->AngleEnd)
break;
@ -1543,7 +1430,7 @@ void FillPPLinearBits(uinT32 ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR],
#ifndef GRAPHICS_DISABLED
namespace tesseract {
CLASS_ID Classify::GetClassToDebug(const char *Prompt, bool* adaptive_on,
bool* pretrained_on) {
bool* pretrained_on, int* shape_id) {
/*
** Parameters:
** Prompt prompt to print while waiting for input from window
@ -1557,26 +1444,57 @@ CLASS_ID Classify::GetClassToDebug(const char *Prompt, bool* adaptive_on,
tprintf("%s\n", Prompt);
SVEvent* ev;
SVEventType ev_type;
int unichar_id = INVALID_UNICHAR_ID;
// Wait until a click or popup event.
do {
ev = IntMatchWindow->AwaitEvent(SVET_ANY);
ev_type = ev->type;
if (ev_type == SVET_POPUP) {
if (unicharset.contains_unichar(ev->parameter)) {
if (ev->command_id == IDA_ADAPTIVE) {
*adaptive_on = true;
*pretrained_on = false;
} else if (ev->command_id == IDA_STATIC) {
if (ev->command_id == IDA_SHAPE_INDEX) {
if (shape_table_ != NULL) {
*shape_id = atoi(ev->parameter);
*adaptive_on = false;
*pretrained_on = true;
if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
int font_id;
shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
&font_id);
tprintf("Shape %d, first unichar=%d, font=%d\n",
*shape_id, unichar_id, font_id);
return unichar_id;
}
tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
} else {
*adaptive_on = true;
*pretrained_on = true;
tprintf("No shape table loaded!\n");
}
} else {
if (unicharset.contains_unichar(ev->parameter)) {
unichar_id = unicharset.unichar_to_id(ev->parameter);
if (ev->command_id == IDA_ADAPTIVE) {
*adaptive_on = true;
*pretrained_on = false;
*shape_id = -1;
} else if (ev->command_id == IDA_STATIC) {
*adaptive_on = false;
*pretrained_on = true;
} else {
*adaptive_on = true;
*pretrained_on = true;
}
if (ev->command_id == IDA_ADAPTIVE || shape_table_ == NULL) {
*shape_id = -1;
return unichar_id;
}
for (int s = 0; s < shape_table_->NumShapes(); ++s) {
if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
tprintf("%s\n", shape_table_->DebugStr(s).string());
}
}
} else {
tprintf("Char class '%s' not found in unicharset",
ev->parameter);
}
return unicharset.unichar_to_id(ev->parameter);
}
tprintf("Char class '%s' not found in unicharset",
ev->parameter);
}
delete ev;
} while (ev_type != SVET_CLICK);
@ -1916,15 +1834,8 @@ void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT* Feature,
// using BinaryAnglePlusPi in intfx.cpp.
Dx = (Length / 2.0) * cos((Feature->Theta / 256.0) * 2.0 * PI - PI);
Dy = (Length / 2.0) * sin((Feature->Theta / 256.0) * 2.0 * PI - PI);
float x_offset = Dy / 4.0;
float y_offset = -Dx / 4.0;
window->SetCursor(X - Dx, Y - Dy);
window->DrawTo(X + Dx, Y + Dy);
// Draw another copy of the feature offset perpendicualar to its direction.
X += x_offset;
Y += y_offset;
window->SetCursor(X - Dx, Y - Dy);
window->SetCursor(X, Y);
window->DrawTo(X + Dx, Y + Dy);
} /* RenderIntFeature */
@ -2047,6 +1958,8 @@ void InitIntMatchWindowIfReqd() {
"x", "Class to debug");
popup_menu->AddChild("Debug Both", IDA_BOTH,
"x", "Class to debug");
popup_menu->AddChild("Debug Shape Index", IDA_SHAPE_INDEX,
"0", "Index to debug");
popup_menu->BuildMenu(IntMatchWindow, false);
}
}

View File

@ -25,7 +25,6 @@
#include "matchdefs.h"
#include "mfoutline.h"
#include "protos.h"
#include "callcpp.h"
#include "scrollview.h"
#include "unicharset.h"
@ -72,11 +71,9 @@
* The position of the the bits recorded for each class in the
* 4th dimension is determined by using CPrunerWordIndexFor(c),
* where c is the corresponding class id. */
typedef uinT32 CLASS_PRUNER_STRUCT
[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR];
typedef
uinT32 (*CLASS_PRUNER)[NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR];
// Bit-packed class pruner counts: indexed by the quantized feature
// coordinates [x][y][theta], then by 32-bit word within the per-cell
// class vector. Each uinT32 word packs NUM_BITS_PER_CLASS-bit counts
// for several consecutive classes.
struct CLASS_PRUNER_STRUCT {
  uinT32 p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR];
};
typedef struct
{
@ -103,86 +100,6 @@ PROTO_SET_STRUCT, *PROTO_SET;
typedef uinT32 CONFIG_PRUNER[NUM_PP_PARAMS][NUM_PP_BUCKETS][4];
// Struct for information about spacing between characters in a particular font.
struct FontSpacingInfo {
  // Default horizontal gaps expected before/after this character.
  inT16 x_gap_before;
  inT16 x_gap_after;
  // Parallel vectors: kerned_x_gaps[i] overrides the default gap when
  // this character is followed by kerned_unichar_ids[i]
  // (see FontInfo::get_spacing).
  GenericVector<UNICHAR_ID> kerned_unichar_ids;
  GenericVector<inT16> kerned_x_gaps;
};
/*
* font_properties contains properties about boldness, italicness, fixed pitch,
* serif, fraktur
*/
struct FontInfo {
  FontInfo() : name(NULL), spacing_vec(NULL) {}
  // NOTE(review): the destructor intentionally frees neither name nor
  // spacing_vec; presumably ownership is released elsewhere (e.g. a
  // table clear callback) -- confirm before adding cleanup here.
  ~FontInfo() {}

  // Reserves unicharset_size spots in spacing_vec.
  void init_spacing(int unicharset_size) {
    spacing_vec = new GenericVector<FontSpacingInfo *>();
    spacing_vec->init_to_size(unicharset_size, NULL);
  }
  // Adds the given pointer to FontSpacingInfo to spacing_vec member
  // (FontInfo class takes ownership of the pointer).
  // Note: init_spacing should be called before calling this function.
  void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
    ASSERT_HOST(spacing_vec != NULL && spacing_vec->size() > uch_id);
    (*spacing_vec)[uch_id] = spacing_info;
  }

  // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID,
  // or NULL if no spacing info is recorded for it.
  const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
    return (spacing_vec == NULL || spacing_vec->size() <= uch_id) ?
        NULL : (*spacing_vec)[uch_id];
  }

  // Fills spacing with the value of the x gap expected between the two given
  // UNICHAR_IDs: the kerning override for the pair when one exists,
  // otherwise prev's x_gap_after plus uch's x_gap_before.
  // Returns true on success, false if either character has no spacing info.
  bool get_spacing(UNICHAR_ID prev_uch_id,
                   UNICHAR_ID uch_id,
                   int *spacing) const {
    const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
    const FontSpacingInfo *fsi = this->get_spacing(uch_id);
    if (prev_fsi == NULL || fsi == NULL) return false;
    // Linear scan of the kerning overrides attached to the previous char.
    int i = 0;
    for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
      if (prev_fsi->kerned_unichar_ids[i] == uch_id) break;
    }
    if (i < prev_fsi->kerned_unichar_ids.size()) {
      *spacing = prev_fsi->kerned_x_gaps[i];
    } else {
      *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
    }
    return true;
  }

  // Property bit accessors; all written as (mask) != 0 for consistency
  // (is_italic previously returned the raw bit-and).
  bool is_italic() const { return (properties & 1) != 0; }
  bool is_bold() const { return (properties & 2) != 0; }
  bool is_fixed_pitch() const { return (properties & 4) != 0; }
  bool is_serif() const { return (properties & 8) != 0; }
  bool is_fraktur() const { return (properties & 16) != 0; }

  // NUL-terminated font name (allocated with new[] by read_info).
  char* name;
  // Bitmask tested by the is_* accessors above.
  uinT32 properties;
  // Horizontal spacing between characters (indexed by UNICHAR_ID).
  GenericVector<FontSpacingInfo *> *spacing_vec;
};
// Every class (character) owns a FontSet that represents all the fonts that can
// render this character.
// Since almost all the characters from the same script share the same set of
// fonts, the sets are shared over multiple classes (see
// Classify::fontset_table_). Thus, a class only store an id to a set.
// Because some fonts cannot render just one character of a set, there are a
// lot of FontSet that differ only by one font. Rather than storing directly
// the FontInfo in the FontSet structure, it's better to share FontInfos among
// FontSets (Classify::fontinfo_table_).
struct FontSet {
  int size;      // Number of entries in configs.
  int* configs;  // FontInfo ids
};
typedef struct
{
@ -203,7 +120,7 @@ typedef struct
int NumClasses;
int NumClassPruners;
INT_CLASS Class[MAX_NUM_CLASSES];
CLASS_PRUNER ClassPruner[MAX_NUM_CLASS_PRUNERS];
CLASS_PRUNER_STRUCT* ClassPruners[MAX_NUM_CLASS_PRUNERS];
}
@ -232,6 +149,7 @@ typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES];
enum IntmatcherDebugAction {
IDA_ADAPTIVE,
IDA_STATIC,
IDA_SHAPE_INDEX,
IDA_BOTH
};
@ -255,7 +173,7 @@ enum IntmatcherDebugAction {
#define ClassForClassId(T,c) ((T)->Class[c])
#define ClassPrunersFor(T) ((T)->ClassPruner)
#define CPrunerIdFor(c) ((c) / CLASSES_PER_CP)
#define CPrunerFor(T,c) ((T)->ClassPruner [CPrunerIdFor (c)])
#define CPrunerFor(T,c) ((T)->ClassPruners[CPrunerIdFor(c)])
#define CPrunerWordIndexFor(c) (((c) % CLASSES_PER_CP) / CLASSES_PER_CP_WERD)
#define CPrunerBitIndexFor(c) (((c) % CLASSES_PER_CP) % CLASSES_PER_CP_WERD)
#define CPrunerMaskFor(L,c) (((L)+1) << CPrunerBitIndexFor (c) * NUM_BITS_PER_CLASS)
@ -300,7 +218,7 @@ void UpdateMatchDisplay();
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class);
void DisplayIntFeature(INT_FEATURE Feature, FLOAT32 Evidence);
void DisplayIntFeature(const INT_FEATURE_STRUCT* Feature, FLOAT32 Evidence);
void DisplayIntProto(INT_CLASS Class, PROTO_ID ProtoId, FLOAT32 Evidence);

967
classify/mastertrainer.cpp Normal file
View File

@ -0,0 +1,967 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: mastertrainer.cpp
// Description: Trainer to build the MasterClassifier.
// Author: Ray Smith
// Created: Wed Nov 03 18:10:01 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "mastertrainer.h"
#include <math.h>
#include <time.h>
#include "allheaders.h"
#include "boxread.h"
#include "classify.h"
#include "errorcounter.h"
#include "featdefs.h"
#include "sampleiterator.h"
#include "shapeclassifier.h"
#include "shapetable.h"
#include "svmnode.h"
namespace tesseract {

// Constants controlling clustering. With a low kMinClusteredShapes and a high
// kMaxUnicharsPerCluster, then kFontMergeDistance is the only limiting factor.
// These are consumed by SetupMasterShapes/ClusterShapes below.
// Min number of shapes in the output.
const int kMinClusteredShapes = 1;
// Max number of unichars in any individual cluster.
const int kMaxUnicharsPerCluster = 2000;
// Mean font distance below which to merge fonts and unichars.
const float kFontMergeDistance = 0.025;
// Constructs a MasterTrainer.
// norm_mode: normalization applied to samples (e.g. baseline vs character).
// shape_analysis: if true, PostLoadCleanup replaces always-fragmented chars
//     with their natural fragment samples.
// replicate_samples: if true, ReplicateAndRandomizeSamplesIfRequired will
//     replicate and perturb the training samples.
// debug_level: verbosity of diagnostic output.
MasterTrainer::MasterTrainer(NormalizationMode norm_mode,
                             bool shape_analysis,
                             bool replicate_samples,
                             int debug_level)
  : norm_mode_(norm_mode), samples_(fontinfo_table_),
    junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
    charsetsize_(0),
    enable_shape_anaylsis_(shape_analysis),
    enable_replication_(replicate_samples),
    fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
  // FontInfo entries own a heap-allocated name, so the table needs custom
  // compare and clear callbacks to match/free them correctly.
  fontinfo_table_.set_compare_callback(
      NewPermanentTessCallback(CompareFontInfo));
  fontinfo_table_.set_clear_callback(
      NewPermanentTessCallback(FontInfoDeleteCallback));
}
MasterTrainer::~MasterTrainer() {
  delete [] fragments_;
  // page_images_ holds owned Pix* loaded by LoadPageImages.
  for (int p = 0; p < page_images_.size(); ++p)
    pixDestroy(&page_images_[p]);
}
// WARNING! Serialize/DeSerialize are only partial, providing
// enough data to get the samples back and display them.
// Writes to the given file. Returns false in case of error.
// NOTE: the write order here is the on-disk format and must exactly match
// the read order in DeSerialize.
bool MasterTrainer::Serialize(FILE* fp) const {
  if (fwrite(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
  if (!unicharset_.save_to_file(fp)) return false;
  if (!feature_space_.Serialize(fp)) return false;
  if (!samples_.Serialize(fp)) return false;
  if (!junk_samples_.Serialize(fp)) return false;
  if (!verify_samples_.Serialize(fp)) return false;
  if (!master_shapes_.Serialize(fp)) return false;
  if (!flat_shapes_.Serialize(fp)) return false;
  // The fontinfo table is written twice: once for the basic info, once for
  // the spacing info.
  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_info)))
    return false;
  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_spacing_info)))
    return false;
  if (!xheights_.Serialize(fp)) return false;
  return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// Must read in exactly the order written by Serialize.
bool MasterTrainer::DeSerialize(bool swap, FILE* fp) {
  if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
  if (swap) {
    ReverseN(&norm_mode_, sizeof(norm_mode_));
  }
  if (!unicharset_.load_from_file(fp)) return false;
  charsetsize_ = unicharset_.size();
  if (!feature_space_.DeSerialize(swap, fp)) return false;
  // The feature map is derived state, rebuilt from the feature space.
  feature_map_.Init(feature_space_);
  if (!samples_.DeSerialize(swap, fp)) return false;
  if (!junk_samples_.DeSerialize(swap, fp)) return false;
  if (!verify_samples_.DeSerialize(swap, fp)) return false;
  if (!master_shapes_.DeSerialize(swap, fp)) return false;
  if (!flat_shapes_.DeSerialize(swap, fp)) return false;
  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap))
    return false;
  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_spacing_info),
                            swap))
    return false;
  if (!xheights_.DeSerialize(swap, fp)) return false;
  return true;
}
// Load an initial unicharset, or set one up if the file cannot be read.
void MasterTrainer::LoadUnicharset(const char* filename) {
  if (!unicharset_.load_from_file(filename)) {
    tprintf("Failed to load unicharset from file %s\n"
            "Building unicharset for training from scratch...\n",
            filename);
    unicharset_.clear();
    // Space character needed to represent NIL_LIST classification.
    unicharset_.unichar_insert(" ");
  }
  charsetsize_ = unicharset_.size();
  // fragments_[id] records fragmentation state per unichar (see AddSample):
  // 0 = never seen fragmented, >0 = junk id of its consistent fragment,
  // -1 = seen whole or fragmented inconsistently.
  delete [] fragments_;
  fragments_ = new int[charsetsize_];
  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
  // All three sample sets are loaded with the same unicharset file.
  samples_.LoadUnicharset(filename);
  junk_samples_.LoadUnicharset(filename);
  verify_samples_.LoadUnicharset(filename);
}
// Reads the samples and their features from the given .tr format file,
// adding them to the trainer with the font_id from the content of the file.
// See mftraining.cpp for a description of the file format.
// If verification, then these are verification samples, not training.
void MasterTrainer::ReadTrainingSamples(FILE *fp,
                                        const FEATURE_DEFS_STRUCT& feature_defs,
                                        bool verification) {
  char buffer[2048];
  // Resolve the indices of the feature types we extract from each CHAR_DESC.
  int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
  int micro_feature_type = ShortNameToFeatureType(feature_defs,
                                                  kMicroFeatureType);
  int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
  int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);
  while (fgets(buffer, sizeof(buffer), fp) != NULL) {
    if (buffer[0] == '\n')
      continue;
    // Each sample starts with a header line: "<fontname> <box file string>".
    char* space = strchr(buffer, ' ');
    if (space == NULL) {
      tprintf("Bad format in tr file, reading fontname, unichar\n");
      continue;
    }
    *space++ = '\0';
    int font_id = GetFontInfoId(buffer);
    int page_number;
    STRING unichar;
    TBOX bounding_box;
    if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
      tprintf("Bad format in tr file, reading box coords\n");
      continue;
    }
    CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
    TrainingSample* sample = new TrainingSample;
    sample->set_font_id(font_id);
    // Offset page numbers by images already loaded so multiple tr/tif pairs
    // can be accumulated (see LoadPageImages).
    sample->set_page_num(page_number + page_images_.size());
    sample->set_bounding_box(bounding_box);
    sample->ExtractCharDesc(int_feature_type, micro_feature_type,
                            cn_feature_type, geo_feature_type, char_desc);
    // AddSample decides whether the sample goes to the training, junk or
    // verification set, and takes ownership of it.
    AddSample(verification, unichar.string(), sample);
    FreeCharDescription(char_desc);
  }
  charsetsize_ = unicharset_.size();
}
// Adds the given single sample to the trainer, setting the classid
// appropriately from the given unichar_str.
// Samples route to one of three sets:
//  - verify_samples_ when verification is true,
//  - samples_ when the unichar is in the unicharset,
//  - junk_samples_ otherwise (fragments, n-grams).
// Also updates fragments_ bookkeeping: a char is considered "always
// fragmented" only if every occurrence is immediately followed by the same
// natural fragment in the junk stream.
void MasterTrainer::AddSample(bool verification, const char* unichar,
                              TrainingSample* sample) {
  if (verification) {
    verify_samples_.AddSample(unichar, sample);
    prev_unichar_id_ = -1;
  } else if (unicharset_.contains_unichar(unichar)) {
    // A real char followed by another real char was not fragmented: mark it.
    if (prev_unichar_id_ >= 0)
      fragments_[prev_unichar_id_] = -1;
    prev_unichar_id_ = samples_.AddSample(unichar, sample);
    if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
      flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
  } else {
    int junk_id = junk_samples_.AddSample(unichar, sample);
    if (prev_unichar_id_ >= 0) {
      CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(unichar);
      if (frag != NULL && frag->is_natural()) {
        // First fragment seen for the previous char records its junk id;
        // a different fragment later invalidates it (-1).
        if (fragments_[prev_unichar_id_] == 0)
          fragments_[prev_unichar_id_] = junk_id;
        else if (fragments_[prev_unichar_id_] != junk_id)
          fragments_[prev_unichar_id_] = -1;
      }
      delete frag;
    }
    prev_unichar_id_ = -1;
  }
}
// Loads all pages from the given tif filename and append to page_images_.
// Must be called after ReadTrainingSamples, as the current number of images
// is used as an offset for page numbers in the samples.
void MasterTrainer::LoadPageImages(const char* filename) {
int page;
Pix* pix;
for (page = 0; (pix = pixReadTiff(filename, page)) != NULL; ++page) {
page_images_.push_back(pix);
}
tprintf("Loaded %d page images from %s\n", page, filename);
}
// Cleans up the samples after initial load from the tr files, and prior to
// saving the MasterTrainer:
// Remaps fragmented chars if running shape anaylsis.
// Sets up the samples appropriately for class/fontwise access.
// Deletes outlier samples.
void MasterTrainer::PostLoadCleanup() {
  if (debug_level_ > 0)
    tprintf("PostLoadCleanup...\n");
  if (enable_shape_anaylsis_)
    ReplaceFragmentedSamples();
  // Normalize the verification samples via an all-samples iterator.
  SampleIterator sample_it;
  sample_it.Init(NULL, NULL, true, &verify_samples_);
  sample_it.NormalizeSamples();
  verify_samples_.OrganizeByFontAndClass();
  samples_.IndexFeatures(feature_space_);
  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
  // against current training.
  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
  samples_.OrganizeByFontAndClass();
  if (debug_level_ > 0)
    tprintf("ComputeCanonicalSamples...\n");
  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
}
// Gets the samples ready for training. Use after both
// ReadTrainingSamples+PostLoadCleanup or DeSerialize.
// Re-indexes the features and computes canonical and cloud features.
void MasterTrainer::PreTrainingSetup() {
  if (debug_level_ > 0)
    tprintf("PreTrainingSetup...\n");
  samples_.IndexFeatures(feature_space_);
  samples_.ComputeCanonicalFeatures();
  if (debug_level_ > 0)
    tprintf("ComputeCloudFeatures...\n");
  samples_.ComputeCloudFeatures(feature_space_.Size());
}
// Sets up the master_shapes_ table, which tells which fonts should stay
// together until they get to a leaf node classifier.
// Clustering happens in two stages: first within each unichar (merging
// fonts), then across unichars, with begin/end fragments clustered in
// separate pools before being appended to the main pool.
void MasterTrainer::SetupMasterShapes() {
  tprintf("Building master shape table\n");
  int num_fonts = samples_.NumFonts();

  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
  ShapeTable char_shapes_end_fragment(samples_.unicharset());
  ShapeTable char_shapes(samples_.unicharset());
  for (int c = 0; c < samples_.charsetsize(); ++c) {
    // One shape per font that actually has samples of this char, then merge
    // similar fonts within the char.
    ShapeTable shapes(samples_.unicharset());
    for (int f = 0; f < num_fonts; ++f) {
      if (samples_.NumClassSamples(f, c, true) > 0)
        shapes.AddShape(c, f);
    }
    ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);

    const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);

    if (fragment == NULL)
      char_shapes.AppendMasterShapes(shapes);
    else if (fragment->is_beginning())
      char_shapes_begin_fragment.AppendMasterShapes(shapes);
    else if (fragment->is_ending())
      char_shapes_end_fragment.AppendMasterShapes(shapes);
    else
      char_shapes.AppendMasterShapes(shapes);
  }
  // Cluster the begin/end fragment pools separately, then fold them into the
  // main pool for a final cross-unichar clustering pass.
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes_begin_fragment);
  char_shapes.AppendMasterShapes(char_shapes_begin_fragment);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes_end_fragment);
  char_shapes.AppendMasterShapes(char_shapes_end_fragment);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes);
  master_shapes_.AppendMasterShapes(char_shapes);
  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
}
// Adds the junk_samples_ to the main samples_ set. Junk samples are initially
// fragments and n-grams (all incorrectly segmented characters).
// Various training functions may result in incorrectly segmented characters
// being added to the unicharset of the main samples, perhaps because they
// form a "radical" decomposition of some (Indic) grapheme, or because they
// just look the same as a real character (like rn/m)
// This function moves all the junk samples, to the main samples_ set, but
// desirable junk, being any sample for which the unichar already exists in
// the samples_ unicharset gets the unichar-ids re-indexed to match, but
// anything else gets re-marked as unichar_id 0 (space character) to identify
// it as junk to the error counter.
void MasterTrainer::IncludeJunk() {
  // Get ids of fragments in junk_samples_ that replace the dead chars.
  const UNICHARSET& junk_set = junk_samples_.unicharset();
  const UNICHARSET& sample_set = samples_.unicharset();
  int num_junks = junk_samples_.num_samples();
  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
  for (int s = 0; s < num_junks; ++s) {
    TrainingSample* sample = junk_samples_.mutable_sample(s);
    int junk_id = sample->class_id();
    const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
    // Re-index into the main unicharset; unknown chars become id 0 (space).
    int sample_id = sample_set.unichar_to_id(junk_utf8);
    if (sample_id == INVALID_UNICHAR_ID)
      sample_id = 0;
    sample->set_class_id(sample_id);
    // extract_sample presumably releases ownership from junk_samples_ so
    // AddSample can take it -- TODO confirm against TrainingSampleSet.
    junk_samples_.extract_sample(s);
    samples_.AddSample(sample_id, sample);
  }
  junk_samples_.DeleteDeadSamples();
  samples_.OrganizeByFontAndClass();
}
// Replicates the samples and perturbs them if the enable_replication_ flag
// is set. MUST be used after the last call to OrganizeByFontAndClass on
// the training samples, ie after IncludeJunk if it is going to be used, as
// OrganizeByFontAndClass will eat the replicated samples into the regular
// samples.
void MasterTrainer::ReplicateAndRandomizeSamplesIfRequired() {
  if (enable_replication_) {
    if (debug_level_ > 0)
      tprintf("ReplicateAndRandomize...\n");
    verify_samples_.ReplicateAndRandomizeSamples();
    samples_.ReplicateAndRandomizeSamples();
    // Re-index since replication added new samples.
    samples_.IndexFeatures(feature_space_);
  }
}
// Loads the basic font properties file into fontinfo_table_.
// Each line has the form:
//   <fontname> <italic> <bold> <fixed> <serif> <fraktur>
// Malformed lines are skipped. Returns false only if the file cannot be
// opened.
bool MasterTrainer::LoadFontInfo(const char* filename) {
  FILE* fp = fopen(filename, "rb");
  if (fp == NULL) {
    fprintf(stderr, "Failed to load font_properties from %s\n", filename);
    return false;
  }
  int italic, bold, fixed, serif, fraktur;
  while (!feof(fp)) {
    FontInfo fontinfo;
    // 1025 = 1024 chars matched by %1024s plus the terminating null.
    // (The previous 1024-byte buffer could be overrun by one byte.)
    char* font_name = new char[1025];
    fontinfo.name = font_name;
    fontinfo.properties = 0;
    fontinfo.universal_id = 0;
    if (fscanf(fp, "%1024s %i %i %i %i %i\n", font_name,
               &italic, &bold, &fixed, &serif, &fraktur) != 6) {
      delete [] font_name;  // Don't leak the name on a malformed line.
      continue;
    }
    // Pack the boolean properties into a bitfield.
    fontinfo.properties =
        (italic << 0) +
        (bold << 1) +
        (fixed << 2) +
        (serif << 3) +
        (fraktur << 4);
    if (!fontinfo_table_.contains(fontinfo)) {
      // The table takes ownership of font_name via its clear callback.
      fontinfo_table_.push_back(fontinfo);
    } else {
      delete [] font_name;  // Duplicate entry: the table already owns a copy.
    }
  }
  fclose(fp);
  return true;
}
// Loads the xheight font properties file into xheights_.
// Each line has the form: <fontname> <xheight>.
// Fonts with no xheight entry get the mean of the read xheights.
// Returns false on failure. A NULL filename is not an error: all fonts are
// simply left at -1 (but the vector is still sized to the fontinfo table).
bool MasterTrainer::LoadXHeights(const char* filename) {
  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
  xheights_.init_to_size(fontinfo_table_.size(), -1);
  if (filename == NULL) return true;
  FILE *f = fopen(filename, "rb");
  if (f == NULL) {
    fprintf(stderr, "Failed to load font xheights from %s\n", filename);
    return false;
  }
  tprintf("Reading x-heights from %s ...\n", filename);
  FontInfo fontinfo;
  fontinfo.properties = 0;  // Not used to lookup in the table.
  fontinfo.universal_id = 0;
  // 1025 = 1024 chars matched by %1024s plus the terminating null.
  // (The previous 1024-byte buffer could be overrun by one byte.)
  char buffer[1025];
  int xht;
  int total_xheight = 0;
  int xheight_count = 0;
  while (!feof(f)) {
    if (fscanf(f, "%1024s %d\n", buffer, &xht) != 2)
      continue;
    fontinfo.name = buffer;
    if (!fontinfo_table_.contains(fontinfo)) continue;
    int fontinfo_id = fontinfo_table_.get_id(fontinfo);
    xheights_[fontinfo_id] = xht;
    total_xheight += xht;
    ++xheight_count;
  }
  fclose(f);  // Was previously leaked on every return path.
  if (xheight_count == 0) {
    fprintf(stderr, "No valid xheights in %s!\n", filename);
    return false;
  }
  // Fill in any fonts that had no entry with the mean xheight.
  int mean_xheight = DivRounded(total_xheight, xheight_count);
  for (int i = 0; i < fontinfo_table_.size(); ++i) {
    if (xheights_[i] < 0)
      xheights_[i] = mean_xheight;
  }
  return true;
}  // LoadXHeights
// Reads spacing stats from filename and adds them to fontinfo_table.
// The font is identified by matching a fontinfo name as a substring of
// filename. Returns false on parse errors; a missing file is NOT an error.
bool MasterTrainer::AddSpacingInfo(const char *filename) {
  FILE* fontinfo_file = fopen(filename, "rb");
  if (fontinfo_file == NULL)
    return true;  // We silently ignore missing files!
  // Find the fontinfo_id.
  int fontinfo_id = GetBestMatchingFontInfoId(filename);
  if (fontinfo_id < 0) {
    tprintf("No font found matching fontinfo filename %s\n", filename);
    fclose(fontinfo_file);
    return false;
  }
  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
  // TODO(rays) scale should probably be a double, but keep as an int for now
  // to duplicate current behavior.
  // NOTE(review): this divides by the stored xheight; a zero entry would
  // crash. Presumably LoadXHeights guarantees positive values -- confirm.
  int scale = kBlnXHeight / xheights_[fontinfo_id];
  int num_unichars;
  char uch[UNICHAR_LEN];
  char kerned_uch[UNICHAR_LEN];
  int x_gap, x_gap_before, x_gap_after, num_kerned;
  ASSERT_HOST(fscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
  FontInfo *fi = fontinfo_table_.get_mutable(fontinfo_id);
  fi->init_spacing(unicharset_.size());
  FontSpacingInfo *spacing = NULL;
  for (int l = 0; l < num_unichars; ++l) {
    // NOTE(review): unbounded %s can overflow uch/kerned_uch if a token
    // exceeds UNICHAR_LEN; input is trusted training data, but a width
    // specifier would be safer.
    if (fscanf(fontinfo_file, "%s %d %d %d",
               uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
      tprintf("Bad format of font spacing file %s\n", filename);
      fclose(fontinfo_file);
      return false;
    }
    bool valid = unicharset_.contains_unichar(uch);
    if (valid) {
      spacing = new FontSpacingInfo();
      spacing->x_gap_before = static_cast<inT16>(x_gap_before * scale);
      spacing->x_gap_after = static_cast<inT16>(x_gap_after * scale);
    }
    // The kerning entries must be consumed from the file even when the
    // unichar is invalid, hence continue (not break) below.
    for (int k = 0; k < num_kerned; ++k) {
      if (fscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
        tprintf("Bad format of font spacing file %s\n", filename);
        fclose(fontinfo_file);
        return false;
      }
      if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
      spacing->kerned_unichar_ids.push_back(
          unicharset_.unichar_to_id(kerned_uch));
      spacing->kerned_x_gaps.push_back(static_cast<inT16>(x_gap * scale));
    }
    if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
  }
  fclose(fontinfo_file);
  return true;
}
// Returns the font id corresponding to the given font name.
// Returns -1 if the font cannot be found.
int MasterTrainer::GetFontInfoId(const char* font_name) {
FontInfo fontinfo;
// We are only borrowing the string, so it is OK to const cast it.
fontinfo.name = const_cast<char*>(font_name);
fontinfo.properties = 0; // Not used to lookup in the table
fontinfo.universal_id = 0;
if (!fontinfo_table_.contains(fontinfo)) {
return -1;
} else {
return fontinfo_table_.get_id(fontinfo);
}
}
// Returns the font_id of the closest matching font name to the given
// filename. It is assumed that a substring of the filename will match
// one of the fonts. If more than one is matched, the longest is returned.
int MasterTrainer::GetBestMatchingFontInfoId(const char* filename) {
  int best_id = -1;
  int longest_match = 0;
  for (int f = 0; f < fontinfo_table_.size(); ++f) {
    const char* candidate = fontinfo_table_.get(f).name;
    if (strstr(filename, candidate) == NULL) continue;
    // Keep the longest matching name in case one font name is a substring
    // of another.
    int len = strlen(candidate);
    if (len > longest_match) {
      longest_match = len;
      best_id = f;
    }
  }
  return best_id;
}
// Sets up a flat shapetable with one shape per class/font combination.
void MasterTrainer::SetupFlatShapeTable(ShapeTable* shape_table) {
  // To exactly mimic the results of the previous implementation, the shapes
  // must be clustered in the order the fonts arrived, and reverse order of
  // the characters within each font.
  // Pass 1: collect the distinct fonts in order of first appearance.
  GenericVector<int> font_order;
  int num_shapes = flat_shapes_.NumShapes();
  for (int s = 0; s < num_shapes; ++s) {
    int font = flat_shapes_.GetShape(s)[0].font_ids[0];
    bool seen = false;
    for (int f = 0; f < font_order.size() && !seen; ++f)
      seen = font_order[f] == font;
    if (!seen)
      font_order.push_back(font);
  }
  // Pass 2: for each font in arrival order, append its shapes in reverse
  // shape order.
  for (int f = 0; f < font_order.size(); ++f) {
    for (int s = num_shapes - 1; s >= 0; --s) {
      if (flat_shapes_.GetShape(s)[0].font_ids[0] == font_order[f])
        shape_table->AddShape(flat_shapes_.GetShape(s));
    }
  }
}
// Sets up a Clusterer for mftraining on a single shape_id.
// Call FreeClusterer on the return value after use.
// On return *num_samples holds the number of samples fed to the clusterer.
CLUSTERER* MasterTrainer::SetupForClustering(
    const ShapeTable& shape_table,
    const FEATURE_DEFS_STRUCT& feature_defs,
    int shape_id,
    int* num_samples) {

  int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
  ASSERT_HOST(num_params == MFCount);
  CLUSTERER* clusterer = MakeClusterer(
      num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);

  // We want to iterate over the samples of just the one shape: map only
  // shape_id into the compact space.
  IndexMapBiDi shape_map;
  shape_map.Init(shape_table.NumShapes(), false);
  shape_map.SetMap(shape_id, true);
  shape_map.Setup();
  // Reverse the order of the samples to match the previous behavior.
  GenericVector<const TrainingSample*> sample_ptrs;
  SampleIterator it;
  it.Init(&shape_map, &shape_table, false, &samples_);
  for (it.Begin(); !it.AtEnd(); it.Next()) {
    sample_ptrs.push_back(&it.GetSample());
  }
  int sample_id = 0;
  // Iterate backwards so clusterer sample order matches legacy training.
  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
    const TrainingSample* sample = sample_ptrs[i];
    int num_features = sample->num_micro_features();
    for (int f = 0; f < num_features; ++f)
      MakeSample(clusterer, sample->micro_features()[f], sample_id);
    ++sample_id;
  }
  *num_samples = sample_id;
  return clusterer;
}
// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
// to the given inttemp_file, and the corresponding pffmtable.
// The unicharset is the original encoding of graphemes, and shape_set should
// match the size of the shape_table, and may possibly be totally fake.
void MasterTrainer::WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
const UNICHARSET& shape_set,
const ShapeTable& shape_table,
CLASS_STRUCT* float_classes,
const char* inttemp_file,
const char* pffmtable_file) {
tesseract::Classify *classify = new tesseract::Classify();
// Move the fontinfo table to classify.
classify->get_fontinfo_table().move(&fontinfo_table_);
INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
shape_set);
FILE* fp = fopen(inttemp_file, "wb");
classify->WriteIntTemplates(fp, int_templates, shape_set);
fclose(fp);
// Now write pffmtable. This is complicated by the fact that the adaptive
// classifier still wants one indexed by unichar-id, but the static
// classifier needs one indexed by its shape class id.
// We put the shapetable_cutoffs in a GenericVector, and compute the
// unicharset cutoffs along the way.
GenericVector<uinT16> shapetable_cutoffs;
GenericVector<uinT16> unichar_cutoffs;
for (int c = 0; c < unicharset.size(); ++c)
unichar_cutoffs.push_back(0);
/* then write out each class */
for (int i = 0; i < int_templates->NumClasses; ++i) {
INT_CLASS Class = ClassForClassId(int_templates, i);
// Todo: Test with min instead of max
// int MaxLength = LengthForConfigId(Class, 0);
uinT16 max_length = 0;
for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
// Todo: Test with min instead of max
// if (LengthForConfigId (Class, config_id) < MaxLength)
uinT16 length = Class->ConfigLengths[config_id];
if (length > max_length)
max_length = Class->ConfigLengths[config_id];
int shape_id = float_classes[i].font_set.get(config_id);
const Shape& shape = shape_table.GetShape(shape_id);
for (int c = 0; c < shape.size(); ++c) {
int unichar_id = shape[c].unichar_id;
if (length > unichar_cutoffs[unichar_id])
unichar_cutoffs[unichar_id] = length;
}
}
shapetable_cutoffs.push_back(max_length);
}
fp = fopen(pffmtable_file, "wb");
shapetable_cutoffs.Serialize(fp);
for (int c = 0; c < unicharset.size(); ++c) {
const char *unichar = unicharset.id_to_unichar(c);
if (strcmp(unichar, " ") == 0) {
unichar = "NULL";
}
fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
}
fclose(fp);
free_int_templates(int_templates);
}
// Generate debug output relating to the canonical distance between the
// two given UTF8 grapheme strings.
void MasterTrainer::DebugCanonical(const char* unichar_str1,
                                   const char* unichar_str2) {
  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
  // An invalid second char falls back to comparing the first with itself.
  if (class_id2 == INVALID_UNICHAR_ID)
    class_id2 = class_id1;
  if (class_id1 == INVALID_UNICHAR_ID) {
    tprintf("No unicharset entry found for %s\n", unichar_str1);
    return;
  } else {
    tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
            class_id1, unichar_str1, class_id2, unichar_str2);
  }
  int num_fonts = samples_.NumFonts();
  const IntFeatureMap& feature_map = feature_map_;
  // Iterate the fonts to get the similarity with other fonst of the same
  // class.
  // Print a column header of font ids, then one row per font of class_id1
  // with the cluster distance to each font of class_id2.
  tprintf("      ");
  for (int f = 0; f < num_fonts; ++f) {
    if (samples_.NumClassSamples(f, class_id2, false) == 0)
      continue;
    tprintf("%6d", f);
  }
  tprintf("\n");
  for (int f1 = 0; f1 < num_fonts; ++f1) {
    // Map the features of the canonical_sample.
    if (samples_.NumClassSamples(f1, class_id1, false) == 0)
      continue;
    tprintf("%4d  ", f1);
    for (int f2 = 0; f2 < num_fonts; ++f2) {
      if (samples_.NumClassSamples(f2, class_id2, false) == 0)
        continue;
      float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
                                            feature_map);
      tprintf(" %5.3f", dist);
    }
    tprintf("\n");
  }
  // Build a fake ShapeTable containing all the sample types.
  // NOTE(review): this table is not used further within this function --
  // possibly left over from removed display code, or relied on for side
  // effects of AddShape; confirm before removing.
  ShapeTable shapes(unicharset_);
  for (int f = 0; f < num_fonts; ++f) {
    if (samples_.NumClassSamples(f, class_id1, true) > 0)
      shapes.AddShape(class_id1, f);
    if (class_id1 != class_id2 &&
        samples_.NumClassSamples(f, class_id2, true) > 0)
      shapes.AddShape(class_id2, f);
  }
}
// Debugging for cloud/canonical features.
// Displays a Features window containing:
// If unichar_str2 is in the unicharset, and canonical_font is non-negative,
// displays the canonical features of the char/font combination in red.
// If unichar_str1 is in the unicharset, and cloud_font is non-negative,
// displays the cloud feature of the char/font combination in green.
// The canonical features are drawn first to show which ones have no
// matches in the cloud features.
// Until the features window is destroyed, each click in the features window
// will display the samples that have that feature in a separate window.
void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
                                   const char* unichar_str2,
                                   int canonical_font) {
  const IntFeatureMap& feature_map = feature_map_;
  const IntFeatureSpace& feature_space = feature_map.feature_space();
  ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
  ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character,
                          f_window);
  // Draw the canonical features (red) first.
  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
    const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
                                                               class_id2);
    for (int f = 0; f < sample->num_features(); ++f) {
      RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
    }
  }
  // Then overlay the cloud features (green), mapping each set bit of the
  // cloud BitVector back to a concrete feature.
  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
    const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
    for (int f = 0; f < cloud.size(); ++f) {
      if (cloud[f]) {
        INT_FEATURE_STRUCT feature =
            feature_map.InverseIndexFeature(f);
        RenderIntFeature(f_window, &feature, ScrollView::GREEN);
      }
    }
  }
  f_window->Update();
  ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
  SVEventType ev_type;
  // Interactive loop: run until the features window is destroyed.
  do {
    SVEvent* ev;
    // Wait until a click or popup event.
    ev = f_window->AwaitEvent(SVET_ANY);
    ev_type = ev->type;
    if (ev_type == SVET_CLICK) {
      int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
      if (feature_index >= 0) {
        // Iterate samples and display those with the feature.
        Shape shape;
        shape.AddToShape(class_id1, cloud_font);
        s_window->Clear();
        samples_.DisplaySamplesWithFeature(feature_index, shape,
                                           feature_space, ScrollView::GREEN,
                                           s_window);
        s_window->Update();
      }
    }
    delete ev;
  } while (ev_type != SVET_DESTROY);
}
// Tests the given test_classifier on the internal samples.
// See TestClassifier for details.
// Thin convenience wrapper that runs TestClassifier over samples_ and
// discards the returned error rate.
void MasterTrainer::TestClassifierOnSamples(int report_level,
                                            bool replicate_samples,
                                            ShapeClassifier* test_classifier,
                                            STRING* report_string) {
  TestClassifier(report_level, replicate_samples, &samples_,
                 test_classifier, report_string);
}
// Tests the given test_classifier on the given samples
// report_levels:
// 0 = no output.
// 1 = bottom-line error rate.
// 2 = bottom-line error rate + time.
// 3 = font-level error rate + time.
// 4 = list of all errors + short classifier debug output on 16 errors.
// 5 = list of all errors + short classifier debug output on 25 errors.
// If replicate_samples is true, then the test is run on an extended test
// sample including replicated and systematically perturbed samples.
// If report_string is non-NULL, a summary of the results for each font
// is appended to the report_string.
// Returns the unichar error rate as computed by ErrorCounter.
double MasterTrainer::TestClassifier(int report_level,
                                     bool replicate_samples,
                                     TrainingSampleSet* samples,
                                     ShapeClassifier* test_classifier,
                                     STRING* report_string) {
  SampleIterator sample_it;
  sample_it.Init(NULL, test_classifier->GetShapeTable(), replicate_samples,
                 samples);
  if (report_level > 0) {
    // Count the samples by a full pass of the iterator, for reporting only.
    int num_samples = 0;
    for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
      ++num_samples;
    tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
            sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
            test_classifier->GetShapeTable()->NumShapes(), num_samples);
    tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
  }
  double unichar_error = 0.0;
  // ErrorCounter does the real work and fills in unichar_error/report_string.
  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
                                 CT_SHAPE_TOP_ERR, fontinfo_table_,
                                 page_images_, &sample_it, &unichar_error,
                                 NULL, report_string);
  return unichar_error;
}
// Returns the average (in some sense) distance between the two given
// shapes, which may contain multiple fonts and/or unichars.
float MasterTrainer::ShapeDistance(const ShapeTable& shapes, int s1, int s2) {
  const IntFeatureMap& feature_map = feature_map_;
  const Shape& shape1 = shapes.GetShape(s1);
  const Shape& shape2 = shapes.GetShape(s2);
  // Single-unichar on both sides: there is little alternative but to compute
  // the squared-order distance between pairs of fonts.
  if (shape1.size() == 1 && shape2.size() == 1) {
    return samples_.UnicharDistance(shape1[0], shape2[0], false, feature_map);
  }
  // Multi-char case: average the pairwise unichar distances, optimizing by
  // computing distances between characters of matching font where possible.
  float total_dist = 0.0f;
  int num_pairs = 0;
  for (int c1 = 0; c1 < shape1.size(); ++c1) {
    for (int c2 = 0; c2 < shape2.size(); ++c2) {
      total_dist += samples_.UnicharDistance(shape1[c1], shape2[c2],
                                             true, feature_map);
      ++num_pairs;
    }
  }
  return total_dist / num_pairs;
}
// Replaces samples that are always fragmented with the corresponding
// fragment samples.
// A char qualifies if fragments_[id] > 0 (set up by AddSample when every
// occurrence was followed by the same natural fragment).
void MasterTrainer::ReplaceFragmentedSamples() {
  if (fragments_ == NULL) return;
  // Remove samples that are replaced by fragments. Each class that was
  // always naturally fragmented should be replaced by its fragments.
  int num_samples = samples_.num_samples();
  for (int s = 0; s < num_samples; ++s) {
    TrainingSample* sample = samples_.mutable_sample(s);
    if (fragments_[sample->class_id()] > 0)
      samples_.KillSample(sample);
  }
  samples_.DeleteDeadSamples();
  // Get ids of fragments in junk_samples_ that replace the dead chars.
  const UNICHARSET& frag_set = junk_samples_.unicharset();
#if 0
  // TODO(rays) The original idea was to replace only graphemes that were
  // always naturally fragmented, but that left a lot of the Indic graphemes
  // out. Determine whether we can go back to that idea now that spacing
  // is fixed in the training images, or whether this code is obsolete.
  bool* good_junk = new bool[frag_set.size()];
  memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());
  for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {
    int frag_ch = fragments_[dead_ch];
    if (frag_ch <= 0) continue;
    const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);
    CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);
    // Mark the chars for all parts of the fragment as good in good_junk.
    for (int part = 0; part < frag->get_total(); ++part) {
      frag->set_pos(part);
      int good_ch = frag_set.unichar_to_id(frag->to_string().string());
      if (good_ch != INVALID_UNICHAR_ID)
        good_junk[good_ch] = true;  // We want this one.
    }
  }
#endif
  // For now just use all the junk that was from natural fragments.
  // Get samples of fragments in junk_samples_ that replace the dead chars.
  int num_junks = junk_samples_.num_samples();
  for (int s = 0; s < num_junks; ++s) {
    TrainingSample* sample = junk_samples_.mutable_sample(s);
    int junk_id = sample->class_id();
    const char* frag_utf8 = frag_set.id_to_unichar(junk_id);
    CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);
    if (frag != NULL && frag->is_natural()) {
      // Move the sample from the junk set to the main set.
      junk_samples_.extract_sample(s);
      samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);
    }
  }
  junk_samples_.DeleteDeadSamples();
  junk_samples_.OrganizeByFontAndClass();
  samples_.OrganizeByFontAndClass();
  // Rebuild the trainer's unicharset from the (possibly extended) sample set.
  unicharset_.clear();
  unicharset_.AppendOtherUnicharset(samples_.unicharset());
  // delete [] good_junk;
  // Fragments_ no longer needed?
  delete [] fragments_;
  fragments_ = NULL;
}
// Runs a hierarchical agglomerative clustering to merge shapes in the given
// shape_table, while satisfying the given constraints:
// * End with at least min_shapes left in shape_table,
// * No shape shall have more than max_shape_unichars in it,
// * Don't merge shapes where the distance between them exceeds max_dist.
// Distance used to mark pairs that must never (re-)merge.
const float kInfiniteDist = 999.0f;
void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,
                                  float max_dist, ShapeTable* shapes) {
  int num_shapes = shapes->NumShapes();
  int max_merges = num_shapes - min_shapes;
  // Upper-triangular distance matrix stored as ragged rows:
  // shape_dists[s1][i] holds the distance between s1 and s1 + 1 + i.
  GenericVector<ShapeDist>* shape_dists =
      new GenericVector<ShapeDist>[num_shapes];
  float min_dist = kInfiniteDist;
  int min_s1 = 0;
  int min_s2 = 0;
  tprintf("Computing shape distances...");
  for (int s1 = 0; s1 < num_shapes; ++s1) {
    for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {
      ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));
      shape_dists[s1].push_back(dist);
      if (dist.distance < min_dist) {
        min_dist = dist.distance;
        min_s1 = s1;
        min_s2 = s2;
      }
    }
    tprintf(" %d", s1);
  }
  tprintf("\n");
  int num_merged = 0;
  // Greedily merge the closest pair until the distance or merge budget
  // is exhausted.
  while (num_merged < max_merges && min_dist < max_dist) {
    tprintf("Distance = %f: ", min_dist);
    int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);
    // Mark the chosen pair so it is never picked again, merged or not.
    shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;
    if (num_unichars > max_shape_unichars) {
      tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n",
              min_s1, min_s2, num_unichars, max_shape_unichars);
    } else {
      // min_s2 is absorbed into min_s1; retire min_s2's row and refresh
      // every distance that involves min_s1.
      shapes->MergeShapes(min_s1, min_s2);
      shape_dists[min_s2].clear();
      ++num_merged;

      for (int s = 0; s < min_s1; ++s) {
        if (!shape_dists[s].empty()) {
          shape_dists[s][min_s1 - s - 1].distance =
              ShapeDistance(*shapes, s, min_s1);
          shape_dists[s][min_s2 - s -1].distance = kInfiniteDist;
        }
      }
      for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
        if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist)
          shape_dists[min_s1][s2 - min_s1 - 1].distance =
              ShapeDistance(*shapes, min_s1, s2);
      }
      // Invalidate the remaining entries that referenced min_s2.
      for (int s = min_s1 + 1; s < min_s2; ++s) {
        if (!shape_dists[s].empty()) {
          shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;
        }
      }
    }
    // Rescan for the new global minimum distance.
    min_dist = kInfiniteDist;
    for (int s1 = 0; s1 < num_shapes; ++s1) {
      for (int i = 0; i < shape_dists[s1].size(); ++i) {
        if (shape_dists[s1][i].distance < min_dist) {
          min_dist = shape_dists[s1][i].distance;
          min_s1 = s1;
          min_s2 = s1 + 1 + i;
        }
      }
    }
  }
  tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist);
  delete [] shape_dists;
  if (debug_level_ > 1) {
    for (int s1 = 0; s1 < num_shapes; ++s1) {
      if (shapes->MasterDestinationIndex(s1) == s1) {
        tprintf("Master shape:%s\n", shapes->DebugStr(s1).string());
      }
    }
  }
}
} // namespace tesseract.

296
classify/mastertrainer.h Normal file
View File

@ -0,0 +1,296 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: mastertrainer.h
// Description: Trainer to build the MasterClassifier.
// Author: Ray Smith
// Created: Wed Nov 03 18:07:01 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_TRAINING_MASTERTRAINER_H__
#define TESSERACT_TRAINING_MASTERTRAINER_H__
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "classify.h"
#include "cluster.h"
#include "intfx.h"
#include "elst.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "indexmapbidi.h"
#include "intfeaturespace.h"
#include "intfeaturemap.h"
#include "intmatcher.h"
#include "params.h"
#include "shapetable.h"
#include "trainingsample.h"
#include "trainingsampleset.h"
#include "unicharset.h"
namespace tesseract {
class ShapeClassifier;
// Simple struct to hold the distance between two shapes during clustering.
struct ShapeDist {
  ShapeDist() : shape1(0), shape2(0), distance(0.0f) {}
  ShapeDist(int s1, int s2, float dist)
      : shape1(s1), shape2(s2), distance(dist) {}

  // Sort operator to sort in ascending order of distance.
  bool operator<(const ShapeDist& other) const {
    return distance < other.distance;
  }

  // Indices (into a ShapeTable) of the pair of shapes being compared.
  int shape1;
  int shape2;
  // Distance between shape1 and shape2; smaller means more similar.
  float distance;
};
// Class to encapsulate training processes that use the TrainingSampleSet.
// Initially supports shape clustering and mftraining.
// Other important features of the MasterTrainer are conditioning the data
// by outlier elimination, replication with perturbation, and serialization.
class MasterTrainer {
 public:
  MasterTrainer(NormalizationMode norm_mode, bool shape_analysis,
                bool replicate_samples, int debug_level);
  ~MasterTrainer();

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Loads an initial unicharset, or sets one up if the file cannot be read.
  void LoadUnicharset(const char* filename);

  // Sets the feature space definition.
  void SetFeatureSpace(const IntFeatureSpace& fs) {
    feature_space_ = fs;
    feature_map_.Init(fs);
  }

  // Reads the samples and their features from the given file,
  // adding them to the trainer with the font_id from the content of the file.
  // If verification, then these are verification samples, not training.
  void ReadTrainingSamples(FILE *fp,
                           const FEATURE_DEFS_STRUCT& feature_defs,
                           bool verification);

  // Adds the given single sample to the trainer, setting the classid
  // appropriately from the given unichar_str.
  void AddSample(bool verification, const char* unichar_str,
                 TrainingSample* sample);

  // Loads all pages from the given tif filename and append to page_images_.
  // Must be called after ReadTrainingSamples, as the current number of images
  // is used as an offset for page numbers in the samples.
  void LoadPageImages(const char* filename);

  // Cleans up the samples after initial load from the tr files, and prior to
  // saving the MasterTrainer:
  // Remaps fragmented chars if running shape analysis.
  // Sets up the samples appropriately for class/fontwise access.
  // Deletes outlier samples.
  void PostLoadCleanup();

  // Gets the samples ready for training. Use after both
  // ReadTrainingSamples+PostLoadCleanup or DeSerialize.
  // Re-indexes the features and computes canonical and cloud features.
  void PreTrainingSetup();

  // Sets up the master_shapes_ table, which tells which fonts should stay
  // together until they get to a leaf node classifier.
  void SetupMasterShapes();

  // Adds the junk_samples_ to the main samples_ set. Junk samples are initially
  // fragments and n-grams (all incorrectly segmented characters).
  // Various training functions may result in incorrectly segmented characters
  // being added to the unicharset of the main samples, perhaps because they
  // form a "radical" decomposition of some (Indic) grapheme, or because they
  // just look the same as a real character (like rn/m)
  // This function moves all the junk samples, to the main samples_ set, but
  // desirable junk, being any sample for which the unichar already exists in
  // the samples_ unicharset gets the unichar-ids re-indexed to match, but
  // anything else gets re-marked as unichar_id 0 (space character) to identify
  // it as junk to the error counter.
  void IncludeJunk();

  // Replicates the samples and perturbs them if the enable_replication_ flag
  // is set. MUST be used after the last call to OrganizeByFontAndClass on
  // the training samples, ie after IncludeJunk if it is going to be used, as
  // OrganizeByFontAndClass will eat the replicated samples into the regular
  // samples.
  void ReplicateAndRandomizeSamplesIfRequired();

  // Loads the basic font properties file into fontinfo_table_.
  // Returns false on failure.
  bool LoadFontInfo(const char* filename);

  // Loads the xheight font properties file into xheights_.
  // Returns false on failure.
  bool LoadXHeights(const char* filename);

  // Reads spacing stats from filename and adds them to fontinfo_table.
  // Returns false on failure.
  bool AddSpacingInfo(const char *filename);

  // Returns the font id corresponding to the given font name.
  // Returns -1 if the font cannot be found.
  int GetFontInfoId(const char* font_name);
  // Returns the font_id of the closest matching font name to the given
  // filename. It is assumed that a substring of the filename will match
  // one of the fonts. If more than one is matched, the longest is returned.
  int GetBestMatchingFontInfoId(const char* filename);

  // Sets up a flat shapetable with one shape per class/font combination.
  void SetupFlatShapeTable(ShapeTable* shape_table);

  // Sets up a Clusterer for mftraining on a single shape_id.
  // Call FreeClusterer on the return value after use.
  CLUSTERER* SetupForClustering(const ShapeTable& shape_table,
                                const FEATURE_DEFS_STRUCT& feature_defs,
                                int shape_id, int* num_samples);

  // Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
  // to the given inttemp_file, and the corresponding pffmtable.
  // The unicharset is the original encoding of graphemes, and shape_set should
  // match the size of the shape_table, and may possibly be totally fake.
  void WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
                                const UNICHARSET& shape_set,
                                const ShapeTable& shape_table,
                                CLASS_STRUCT* float_classes,
                                const char* inttemp_file,
                                const char* pffmtable_file);

  // Accessor for the unicharset of the main (non-junk) samples.
  const UNICHARSET& unicharset() const {
    return samples_.unicharset();
  }
  // Mutable access to the main training sample set.
  TrainingSampleSet* GetSamples() {
    return &samples_;
  }
  const ShapeTable& master_shapes() const {
    return master_shapes_;
  }

  // Generates debug output relating to the canonical distance between the
  // two given UTF8 grapheme strings.
  void DebugCanonical(const char* unichar_str1, const char* unichar_str2);
  // Debugging for cloud/canonical features.
  // Displays a Features window containing:
  // If unichar_str2 is in the unicharset, and canonical_font is non-negative,
  // displays the canonical features of the char/font combination in red.
  // If unichar_str1 is in the unicharset, and cloud_font is non-negative,
  // displays the cloud feature of the char/font combination in green.
  // The canonical features are drawn first to show which ones have no
  // matches in the cloud features.
  // Until the features window is destroyed, each click in the features window
  // will display the samples that have that feature in a separate window.
  void DisplaySamples(const char* unichar_str1, int cloud_font,
                      const char* unichar_str2, int canonical_font);

  // Tests the given test_classifier on the internal samples.
  // See TestClassifier for details.
  void TestClassifierOnSamples(int report_level,
                               bool replicate_samples,
                               ShapeClassifier* test_classifier,
                               STRING* report_string);
  // Tests the given test_classifier on the given samples
  // report_levels:
  // 0 = no output.
  // 1 = bottom-line error rate.
  // 2 = bottom-line error rate + time.
  // 3 = font-level error rate + time.
  // 4 = list of all errors + short classifier debug output on 16 errors.
  // 5 = list of all errors + short classifier debug output on 25 errors.
  // If replicate_samples is true, then the test is run on an extended test
  // sample including replicated and systematically perturbed samples.
  // If report_string is non-NULL, a summary of the results for each font
  // is appended to the report_string.
  double TestClassifier(int report_level,
                        bool replicate_samples,
                        TrainingSampleSet* samples,
                        ShapeClassifier* test_classifier,
                        STRING* report_string);

  // Returns the average (in some sense) distance between the two given
  // shapes, which may contain multiple fonts and/or unichars.
  // This function is public to facilitate testing.
  float ShapeDistance(const ShapeTable& shapes, int s1, int s2);

 private:
  // Replaces samples that are always fragmented with the corresponding
  // fragment samples.
  void ReplaceFragmentedSamples();

  // Runs a hierarchical agglomerative clustering to merge shapes in the given
  // shape_table, while satisfying the given constraints:
  // * End with at least min_shapes left in shape_table,
  // * No shape shall have more than max_shape_unichars in it,
  // * Don't merge shapes where the distance between them exceeds max_dist.
  void ClusterShapes(int min_shapes, int max_shape_unichars,
                     float max_dist, ShapeTable* shape_table);

 private:
  NormalizationMode norm_mode_;
  // Character set we are training for.
  UNICHARSET unicharset_;
  // Original feature space. Subspace mapping is contained in feature_map_.
  IntFeatureSpace feature_space_;
  // Correctly-segmented training samples.
  TrainingSampleSet samples_;
  // Incorrectly-segmented (fragment/n-gram) samples; see IncludeJunk().
  TrainingSampleSet junk_samples_;
  // Held-out samples used for verification rather than training.
  TrainingSampleSet verify_samples_;
  // Master shape table defines what fonts stay together until the leaves.
  ShapeTable master_shapes_;
  // Flat shape table has each unichar/font id pair in a separate shape.
  ShapeTable flat_shapes_;
  // Font metrics gathered from multiple files.
  UnicityTable<FontInfo> fontinfo_table_;
  // Array of xheights indexed by font ids in fontinfo_table_;
  GenericVector<int> xheights_;

  // Non-serialized data initialized by other means or used temporarily
  // during loading of training samples.
  // Number of different class labels in unicharset_.
  int charsetsize_;
  // Flag to indicate that we are running shape analysis and need fragments
  // fixing.
  bool enable_shape_anaylsis_;
  // Flag to indicate that sample replication is required.
  bool enable_replication_;
  // Flag to indicate that junk should be included in samples_.
  bool include_junk_;
  // Array of classids of fragments that replace the correctly segmented chars.
  int* fragments_;
  // Classid of previous correctly segmented sample that was added.
  int prev_unichar_id_;
  // Debug output control.
  int debug_level_;
  // Feature map used to construct reduced feature spaces for compact
  // classifiers.
  IntFeatureMap feature_map_;
  // Vector of Pix pointers used for classifiers that need the image.
  // Indexed by page_num_ in the samples.
  // These images are owned by the trainer and need to be pixDestroyed.
  GenericVector<Pix*> page_images_;
};
} // namespace tesseract.
#endif

View File

@ -26,8 +26,11 @@
typedef enum {
MFXPosition, MFYPosition,
MFLength, MFDirection, MFBulge1, MFBulge2
MFLength, MFDirection, MFBulge1, MFBulge2,
MFCount // For array sizes.
} MF_PARAM_NAME;
typedef float MicroFeature[MFCount];
/*----------------------------------------------------------------------------
Private Function Prototypes
-----------------------------------------------------------------------------*/

View File

@ -76,7 +76,8 @@ MFOUTLINE ConvertOutline(TESSLINE *outline) {
EdgePoint = NextPoint;
} while (EdgePoint != StartPoint);
MakeOutlineCircular(MFOutline);
if (MFOutline != NULL)
MakeOutlineCircular(MFOutline);
return MFOutline;
}
@ -95,7 +96,8 @@ LIST ConvertOutlines(TESSLINE *outline,
while (outline != NULL) {
mf_outline = ConvertOutline(outline);
mf_outlines = push(mf_outlines, mf_outline);
if (mf_outline != NULL)
mf_outlines = push(mf_outlines, mf_outline);
outline = outline->next;
}
return mf_outlines;
@ -404,54 +406,6 @@ void Classify::NormalizeOutlines(LIST Outlines,
} /* NormalizeOutlines */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
void SmearExtremities(MFOUTLINE Outline, FLOAT32 XScale, FLOAT32 YScale) {
/*
** Parameters:
** Outline outline whose extremities are to be smeared
** XScale factor used to normalize outline in x dir
** YScale factor used to normalize outline in y dir
** Globals: none
** Operation:
** This routine smears the extremities of the specified outline.
** It does this by adding a random number between
** -0.5 and 0.5 pixels (that is why X/YScale are needed) to
** the x and y position of the point. This is done so that
** the discrete nature of the original scanned image does not
** affect the statistical clustering used during training.
** Return: none
** Exceptions: none
** History: 1/11/90, DSJ, Created.
*/
MFEDGEPT *Current;
MFOUTLINE EdgePoint;
FLOAT32 MinXSmear;
FLOAT32 MaxXSmear;
FLOAT32 MinYSmear;
FLOAT32 MaxYSmear;
if (Outline != NIL_LIST) {
MinXSmear = -0.5 * XScale;
MaxXSmear = 0.5 * XScale;
MinYSmear = -0.5 * YScale;
MaxYSmear = 0.5 * YScale;
EdgePoint = Outline;
do {
Current = PointAt (EdgePoint);
if (Current->ExtremityMark) {
Current->Point.x +=
UniformRandomNumber(MinXSmear, MaxXSmear);
Current->Point.y +=
UniformRandomNumber(MinYSmear, MaxYSmear);
}
EdgePoint = NextPointAfter (EdgePoint);
}
while (EdgePoint != Outline);
}
} /* SmearExtremities */
/**----------------------------------------------------------------------------
Private Code
----------------------------------------------------------------------------**/

View File

@ -114,8 +114,6 @@ MFOUTLINE NextExtremity(MFOUTLINE EdgePoint);
void NormalizeOutline(MFOUTLINE Outline,
FLOAT32 XOrigin);
void SmearExtremities(MFOUTLINE Outline, FLOAT32 XScale, FLOAT32 YScale);
/*----------------------------------------------------------------------------
Private Function Prototypes
-----------------------------------------------------------------------------*/

View File

@ -102,7 +102,6 @@ CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm) {
Outline = (MFOUTLINE) first_node (RemainingOutlines);
FindDirectionChanges(Outline, classify_min_slope, classify_max_slope);
MarkDirectionChanges(Outline);
SmearExtremities(Outline, XScale, YScale);
MicroFeatures = ConvertToMicroFeatures (Outline, MicroFeatures);
}
FreeOutlines(Outlines);

View File

@ -70,7 +70,8 @@ const double kWidthErrorWeighting = 0.125;
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
namespace tesseract {
FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature,
FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId,
const FEATURE_STRUCT& feature,
BOOL8 DebugMatch) {
/*
** Parameters:
@ -96,12 +97,12 @@ FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature,
/* handle requests for classification as noise */
if (ClassId == NO_CLASS) {
/* kludge - clean up constants and make into control knobs later */
Match = (Feature->Params[CharNormLength] *
Feature->Params[CharNormLength] * 500.0 +
Feature->Params[CharNormRx] *
Feature->Params[CharNormRx] * 8000.0 +
Feature->Params[CharNormRy] *
Feature->Params[CharNormRy] * 8000.0);
Match = (feature.Params[CharNormLength] *
feature.Params[CharNormLength] * 500.0 +
feature.Params[CharNormRx] *
feature.Params[CharNormRx] * 8000.0 +
feature.Params[CharNormRy] *
feature.Params[CharNormRy] * 8000.0);
return (1.0 - NormEvidenceOf (Match));
}
@ -109,38 +110,48 @@ FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature,
Protos = NormProtos->Protos[ClassId];
if (DebugMatch) {
cprintf ("\nFeature = ");
WriteFeature(stdout, Feature);
tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
}
ProtoId = 0;
iterate(Protos) {
Proto = (PROTOTYPE *) first_node (Protos);
Delta = Feature->Params[CharNormY] - Proto->Mean[CharNormY];
Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
Delta = Feature->Params[CharNormRx] - Proto->Mean[CharNormRx];
if (DebugMatch) {
tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
Proto->Mean[CharNormY], Delta,
Proto->Weight.Elliptical[CharNormY], Match);
}
Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
if (DebugMatch) {
tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
Proto->Mean[CharNormRx], Delta,
Proto->Weight.Elliptical[CharNormRx], Match);
}
// Ry is width! See intfx.cpp.
Delta = Feature->Params[CharNormRy] - Proto->Mean[CharNormRy];
Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
if (DebugMatch) {
tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
Proto->Mean[CharNormRy], Delta,
Proto->Weight.Elliptical[CharNormRy]);
}
Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
Delta *= kWidthErrorWeighting;
Match += Delta;
if (DebugMatch) {
tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
Match, Match / classify_norm_adj_midpoint,
NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
}
if (Match < BestMatch)
BestMatch = Match;
if (DebugMatch) {
cprintf ("Proto %1d = ", ProtoId);
WriteNFloats (stdout, NormProtos->NumParams, Proto->Mean);
cprintf (" var = ");
WriteNFloats (stdout, NormProtos->NumParams,
Proto->Variance.Elliptical);
cprintf (" match = ");
PrintNormMatch (stdout, NormProtos->NumParams, Proto, Feature);
}
ProtoId++;
}
return (1.0 - NormEvidenceOf (BestMatch));
return 1.0 - NormEvidenceOf(BestMatch);
} /* ComputeNormMatch */
void Classify::FreeNormProtos() {
@ -230,7 +241,7 @@ NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
*/
NORM_PROTOS *NormProtos;
int i;
char unichar[UNICHAR_LEN + 1];
char unichar[2 * UNICHAR_LEN + 1];
UNICHAR_ID unichar_id;
LIST Protos;
int NumProtos;
@ -256,8 +267,12 @@ NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
Protos =
push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
NormProtos->Protos[unichar_id] = Protos;
} else
cprintf("Error: unichar %s in normproto file is not in unichar set.\n");
} else {
cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
unichar);
for (i = 0; i < NumProtos; i++)
FreePrototype(ReadPrototype (File, NormProtos->NumParams));
}
SkipNewline(File);
}
return (NormProtos);

View File

@ -231,12 +231,11 @@ void WriteFeature(FILE *File, FEATURE Feature) {
for (i = 0; i < Feature->Type->NumParams; i++) {
#ifndef WIN32
assert (!isnan(Feature->Params[i]));
assert(!isnan(Feature->Params[i]));
#endif
fprintf (File, " %12g", Feature->Params[i]);
fprintf(File, " %g", Feature->Params[i]);
}
fprintf (File, "\n");
fprintf(File, "\n");
} /* WriteFeature */

View File

@ -100,11 +100,9 @@ const PARAM_DESC Name[] = {
Macro for describing a new feature. The parameters of the macro
are as follows:
DefineFeature (Name, NumLinear, NumCircular,
MinFeatPerChar, MaxFeatPerChar,
LongName, ShortName, ParamName)
DefineFeature (Name, NumLinear, NumCircular, ShortName, ParamName)
----------------------------------------------------------------------*/
#define DefineFeature(Name, NL, NC, Min, Max, LN, SN, PN) \
#define DefineFeature(Name, NL, NC, SN, PN) \
const FEATURE_DESC_STRUCT Name = { \
((NL) + (NC)), SN, PN};

View File

@ -27,6 +27,7 @@
#include "mfoutline.h"
#include "ocrfeatures.h"
#include "params.h"
#include "trainingsample.h"
#include <math.h>
#include <stdio.h>
@ -221,3 +222,59 @@ void NormalizePicoX(FEATURE_SET FeatureSet) {
Feature->Params[PicoFeatX] -= Origin;
}
} /* NormalizePicoX */
/*---------------------------------------------------------------------------*/
FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) {
/*
 ** Parameters:
 **   blob    blob to extract features from
 **   denorm  normalization/denormalization parameters.
 ** Return: Integer character-normalized features for blob, or NULL if
 **         feature extraction fails. Caller owns the returned set.
 ** Exceptions: none
 ** History: 8/8/2011, rays, Created.
 */
  tesseract::TrainingSample* sample = GetIntFeatures(
      tesseract::NM_CHAR_ANISOTROPIC, blob, denorm);
  if (sample == NULL) return NULL;

  int num_features = sample->num_features();
  const INT_FEATURE_STRUCT* features = sample->features();
  FEATURE_SET feature_set = NewFeatureSet(num_features);
  // Convert each int feature into a float FEATURE with (x, y, dir) params.
  for (int f = 0; f < num_features; ++f) {
    FEATURE feature = NewFeature(&IntFeatDesc);

    feature->Params[IntX] = features[f].X;
    feature->Params[IntY] = features[f].Y;
    feature->Params[IntDir] = features[f].Theta;
    AddFeature(feature_set, feature);
  }
  // The sample was only needed as a feature source.
  delete sample;

  return feature_set;
}                                /* ExtractIntCNFeatures */
/*---------------------------------------------------------------------------*/
FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& denorm) {
/*
 ** Parameters:
 **   blob    blob to extract features from
 **   denorm  normalization/denormalization parameters.
 ** Return: Geometric (top/bottom/width) features for blob, or NULL if
 **         feature extraction fails. Caller owns the returned set.
 ** Exceptions: none
 ** History: 8/8/2011, rays, Created.
 */
  tesseract::TrainingSample* sample = GetIntFeatures(
      tesseract::NM_CHAR_ANISOTROPIC, blob, denorm);
  if (sample == NULL) return NULL;

  // A single FEATURE carries all three geometric params.
  FEATURE_SET feature_set = NewFeatureSet(1);
  FEATURE feature = NewFeature(&IntFeatDesc);

  feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
  feature->Params[GeoTop] = sample->geo_feature(GeoTop);
  feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
  AddFeature(feature_set, feature);
  delete sample;

  return feature_set;
}                                /* ExtractIntGeoFeatures */

View File

@ -24,6 +24,22 @@
#include "ocrfeatures.h"
#include "params.h"
// Enum for the order/type of params in IntFeatDesc.
enum IntParams {
IntX, // x-position (0-255).
IntY, // y-position (0-255).
IntDir // Direction (0-255, circular).
};
// Enum for the order/type of params in GeoFeatDesc.
enum GeoParams {
GeoBottom, // Bounding box bottom in baseline space (0-255).
GeoTop, // Bounding box top in baseline space (0-255).
GeoWidth, // Bounding box width in baseline space (0-255).
GeoCount // Number of geo features.
};
typedef enum
{ PicoFeatY, PicoFeatDir, PicoFeatX }
PICO_FEAT_PARAM_NAME;
@ -42,6 +58,9 @@ extern double_VAR_H(classify_pico_feature_length, 0.05, "Pico Feature Length");
----------------------------------------------------------------------------**/
#define GetPicoFeatureLength() (PicoFeatureLength)
FEATURE_SET ExtractIntCNFeatures(TBLOB *Blob, const DENORM& denorm);
FEATURE_SET ExtractIntGeoFeatures(TBLOB *Blob, const DENORM& denorm);
/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------**/

View File

@ -51,8 +51,11 @@ typedef struct
} PROTO_STRUCT;
typedef PROTO_STRUCT *PROTO;
typedef struct
{
struct CLASS_STRUCT {
CLASS_STRUCT()
: NumProtos(0), MaxNumProtos(0), Prototypes(NULL),
NumConfigs(0), MaxNumConfigs(0), Configurations(NULL) {
}
inT16 NumProtos;
inT16 MaxNumProtos;
PROTO Prototypes;
@ -60,7 +63,7 @@ typedef struct
inT16 MaxNumConfigs;
CONFIGS Configurations;
UnicityTableEqEq<int> font_set;
} CLASS_STRUCT;
};
typedef CLASS_STRUCT *CLASS_TYPE;
typedef CLASS_STRUCT *CLASSES;

262
classify/sampleiterator.cpp Normal file
View File

@ -0,0 +1,262 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "sampleiterator.h"
#include "indexmapbidi.h"
#include "shapetable.h"
#include "trainingsample.h"
#include "trainingsampleset.h"
namespace tesseract {
// ================== SampleIterator Implementation =================

// Constructs an empty iterator over nothing. Init() must be called before
// use; the trailing Begin() leaves the iterator immediately AtEnd().
SampleIterator::SampleIterator()
  : charset_map_(NULL),
    shape_table_(NULL),
    sample_set_(NULL),
    randomize_(false),
    owned_shape_table_(NULL) {
  num_shapes_ = 0;
  Begin();
}
// Destructor releases the owned dummy shape table, if one was built.
SampleIterator::~SampleIterator() {
  Clear();
}
// Deletes the internally-built shape table (if any). The external
// charset_map_/shape_table_/sample_set_ pointers are not owned.
void SampleIterator::Clear() {
  delete owned_shape_table_;
  owned_shape_table_ = NULL;
}
// See class comment for arguments.
// Sets up the iteration source. If shape_table is NULL but charset_map is
// supplied, a private flat ShapeTable (one shape per unichar) is built and
// owned by this iterator so iteration can still proceed shape-by-shape.
void SampleIterator::Init(const IndexMapBiDi* charset_map,
                          const ShapeTable* shape_table,
                          bool randomize,
                          TrainingSampleSet* sample_set) {
  Clear();
  charset_map_ = charset_map;
  shape_table_ = shape_table;
  sample_set_ = sample_set;
  randomize_ = randomize;
  if (shape_table_ == NULL && charset_map_ != NULL) {
    // The caller wishes to iterate by class. The easiest way to do this
    // is to create a dummy shape_table_ that we will own.
    int num_fonts = sample_set_->NumFonts();
    owned_shape_table_ = new ShapeTable(sample_set_->unicharset());
    int charsetsize = sample_set_->unicharset().size();
    for (int c = 0; c < charsetsize; ++c) {
      // We always add a shape for each character to keep the index in sync
      // with the unichar_id.
      int shape_id = owned_shape_table_->AddShape(c, 0);
      // Font 0 is always in the shape; add the other fonts only if they
      // actually have samples of this class.
      for (int f = 1; f < num_fonts; ++f) {
        if (sample_set_->NumClassSamples(f, c, true) > 0) {
          owned_shape_table_->AddToShape(shape_id, c, f);
        }
      }
    }
    shape_table_ = owned_shape_table_;
  }
  if (shape_table_ != NULL) {
    num_shapes_ = shape_table_->NumShapes();
  } else {
    // No shape table at all: iterate raw samples, or the extended
    // (replicated) sample set when randomize is true.
    num_shapes_ = randomize ? sample_set_->num_samples()
                            : sample_set_->num_raw_samples();
  }
  Begin();
}
// Iterator functions designed for use with a simple for loop:
// for (it.Begin(); !it.AtEnd(); it.Next()) {
//   const TrainingSample& sample = it.GetSample();
// }
// Resets all cursors to just before the first sample; the trailing Next()
// advances to the first indexable sample (or straight to the end).
void SampleIterator::Begin() {
  shape_index_ = -1;
  shape_char_index_ = 0;
  num_shape_chars_ = 0;
  shape_font_index_ = 0;
  num_shape_fonts_ = 0;
  sample_index_ = 0;
  num_samples_ = 0;
  // Find the first indexable sample.
  Next();
}
// Returns true when iteration is complete (shape cursor ran off the end).
bool SampleIterator::AtEnd() const {
  return shape_index_ >= num_shapes_;
}
// Returns a const reference to the current sample: looked up by the
// (font, char) of the current shape entry when a shape table is in use,
// otherwise directly by the running sample index.
const TrainingSample& SampleIterator::GetSample() const {
  if (shape_table_ != NULL) {
    const UnicharAndFonts* shape_entry = GetShapeEntry();
    int char_id = shape_entry->unichar_id;
    int font_id = shape_entry->font_ids[shape_font_index_];
    return *sample_set_->GetSample(font_id, char_id, sample_index_);
  } else {
    return *sample_set_->GetSample(shape_index_);
  }
}
// Mutable counterpart of GetSample(): same lookup, non-const result.
TrainingSample* SampleIterator::MutableSample() const {
  if (shape_table_ != NULL) {
    const UnicharAndFonts* shape_entry = GetShapeEntry();
    int char_id = shape_entry->unichar_id;
    int font_id = shape_entry->font_ids[shape_font_index_];
    return sample_set_->MutableSample(font_id, char_id, sample_index_);
  } else {
    return sample_set_->mutable_sample(shape_index_);
  }
}
// Returns the total index (from the original set of samples) of the current
// sample.
int SampleIterator::GlobalSampleIndex() const {
  if (shape_table_ != NULL) {
    const UnicharAndFonts* shape_entry = GetShapeEntry();
    int char_id = shape_entry->unichar_id;
    int font_id = shape_entry->font_ids[shape_font_index_];
    return sample_set_->GlobalSampleIndex(font_id, char_id, sample_index_);
  } else {
    // Without a shape table the shape index IS the sample index.
    return shape_index_;
  }
}
// Returns the index of the current sample in compact charset space, so
// in a 2-class problem between x and y, the returned indices will all be
// 0 or 1, and have nothing to do with the unichar_ids.
// If the charset_map_ is NULL, then this is equal to GetSparseClassID().
int SampleIterator::GetCompactClassID() const {
  if (charset_map_ == NULL)
    return GetSparseClassID();
  // Map the current shape index through the sparse->compact charset map.
  return charset_map_->SparseToCompact(shape_index_);
}
// Returns the index of the current sample in sparse charset space, so
// in a 2-class problem between x and y, the returned indices will all be
// x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
// with a shape_table_.
int SampleIterator::GetSparseClassID() const {
  if (shape_table_ != NULL)
    return shape_index_;
  // No shape table: the sample's own class id is the sparse id.
  return GetSample().class_id();
}
// Moves on to the next indexable sample. If the end is reached, leaves
// the state such that AtEnd() is true.
// Cursor nesting (outermost to innermost): shape -> unichar within shape
// -> font within unichar -> sample within (font, char).
void SampleIterator::Next() {
  if (shape_table_ != NULL) {
    // Next sample in this class/font combination.
    ++sample_index_;
    if (sample_index_ < num_samples_)
      return;
    // Next font in this class in this shape.
    sample_index_ = 0;
    do {
      ++shape_font_index_;
      if (shape_font_index_ >= num_shape_fonts_) {
        // Next unichar in this shape.
        shape_font_index_ = 0;
        ++shape_char_index_;
        if (shape_char_index_ >= num_shape_chars_) {
          // Find the next shape that is mapped in the charset_map_.
          shape_char_index_ = 0;
          do {
            ++shape_index_;
          } while (shape_index_ < num_shapes_ &&
                   charset_map_ != NULL &&
                   charset_map_->SparseToCompact(shape_index_) < 0);
          if (shape_index_ >= num_shapes_)
            return;  // The end.
          num_shape_chars_ = shape_table_->GetShape(shape_index_).size();
        }
      }
      const UnicharAndFonts* shape_entry = GetShapeEntry();
      num_shape_fonts_ = shape_entry->font_ids.size();
      int char_id = shape_entry->unichar_id;
      int font_id = shape_entry->font_ids[shape_font_index_];
      num_samples_ = sample_set_->NumClassSamples(font_id, char_id, randomize_);
      // Keep advancing until a (font, char) combination with samples is found.
    } while (num_samples_ == 0);
  } else {
    // We are just iterating over the samples.
    ++shape_index_;
  }
}
// Returns the size of the compact charset space.
int SampleIterator::CompactCharsetSize() const {
  if (charset_map_ == NULL)
    return SparseCharsetSize();
  return charset_map_->CompactSize();
}
// Returns the size of the sparse charset space: the charset_map's sparse
// size if present, else the number of shapes, else the full charset size.
int SampleIterator::SparseCharsetSize() const {
  if (charset_map_ != NULL)
    return charset_map_->SparseSize();
  if (shape_table_ != NULL)
    return shape_table_->NumShapes();
  return sample_set_->charsetsize();
}
// Apply the supplied feature_space/feature_map transform to all samples
// accessed by this iterator.
void SampleIterator::MapSampleFeatures(const IntFeatureMap& feature_map) {
  Begin();
  while (!AtEnd()) {
    MutableSample()->MapFeatures(feature_map);
    Next();
  }
}
// Adjust the weights of all the samples to be uniform in the given charset.
// Returns the number of samples in the iterator.
int SampleIterator::UniformSamples() {
  int sample_count = 0;
  for (Begin(); !AtEnd(); Next()) {
    // Give every sample the same initial weight; NormalizeSamples below
    // rescales them to sum to 1.
    MutableSample()->set_weight(1.0);
    ++sample_count;
  }
  NormalizeSamples();
  return sample_count;
}
// Normalize the weights of all the samples in the charset_map so they sum
// to 1. Returns the minimum assigned sample weight.
double SampleIterator::NormalizeSamples() {
double total_weight = 0.0;
int sample_count = 0;
for (Begin(); !AtEnd(); Next()) {
const TrainingSample& sample = GetSample();
total_weight += sample.weight();
++sample_count;
}
// Normalize samples.
double min_assigned_sample_weight = 1.0;
if (total_weight > 0.0) {
for (Begin(); !AtEnd(); Next()) {
TrainingSample* sample = MutableSample();
double weight = sample->weight() / total_weight;
if (weight < min_assigned_sample_weight)
min_assigned_sample_weight = weight;
sample->set_weight(weight);
}
}
return min_assigned_sample_weight;
}
// Helper returns the current UnicharAndFont shape_entry.
const UnicharAndFonts* SampleIterator::GetShapeEntry() const {
  return &shape_table_->GetShape(shape_index_)[shape_char_index_];
}
} // namespace tesseract.

195
classify/sampleiterator.h Normal file
View File

@ -0,0 +1,195 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
namespace tesseract {
class IndexMapBiDi;
class IntFeatureMap;
class ShapeTable;
class TrainingSample;
class TrainingSampleSet;
class UnicharAndFonts;
// Iterator class to encapsulate the complex iteration involved in getting
// all samples of all shapes needed for a classification problem.
//
// =====INPUTS TO Init FUNCTION=====
// The charset_map defines a subset of the sample_set classes (with a NULL
// shape_table, or the shape_table classes if not NULL.)
//
// The shape_table (if not NULL) defines the mapping from shapes to
// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
//
// The sample_set holds the samples and provides indexed access to samples
// of font_id/class_id pairs.
//
// If randomize is true, the samples are perturbed slightly, but the
// perturbation is guaranteed to be the same for multiple identical
// iterations.
//
// =====DIFFERENT COMBINATIONS OF INPUTS=====
// NULL shape_table:
// Without a shape_table, everything works in UNICHAR_IDs.
//
// NULL shape_table, NULL charset_map:
// Iterations simply run over the samples in the order the samples occur in the
// input files.
// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
//
// NULL shape_table, non-NULL charset_map:
// When shape_table is NULL, the charset_map indexes unichar_ids directly,
// and an iteration returns all samples of all chars in the charset_map, which
// is a subset of the full unicharset.
// The iteration will be in groups of the same unichar_id, in the order
// defined by the charset_map.
// GetCompactClassID returns the charset_map index of a sample, and
// GetSparseClassID returns the sample UNICHAR_ID.
//
// Non-NULL shape_table:
// With a shape_table, samples are grouped according to the shape_table, so
// multiple UNICHAR_IDs and fonts may be grouped together, and everything
// works in shape_ids.
//
// Non-NULL shape_table, NULL charset_map.
// Iterations simply run over the samples in the order of shape_id.
// GetCompactClassID and GetSparseClassID both return the shape_id.
// (If you want the unichar_id or font_id, the sample still has them.)
//
// Non-NULL shape_table, non-NULL charset_map.
// When shape_table is not NULL, the charset_map indexes and subsets shapes in
// the shape_table, and iterations will be in shape_table order, not
// charset_map order.
// GetCompactClassID returns the charset_map index of a shape, and
// GetSparseClassID returns the shape_id.
//
// =====What is SampleIterator good for?=====
// Inside a classifier training module, the SampleIterator has abstracted away
// all the different modes above.
// Use the following iteration to train your classifier:
// for (it.Begin(); !it.AtEnd(); it.Next()) {
// const TrainingSample& sample = it.GetSample();
// int class_id = it.GetCompactClassID();
// Your classifier may or may not be dealing with a shape_table, and may be
// dealing with some subset of the character/shape set. It doesn't need to
// know and shouldn't care. It is just learning shapes with compact class ids
// in the range [0, it.CompactCharsetSize()).
class SampleIterator {
 public:
  SampleIterator();
  ~SampleIterator();

  void Clear();

  // See class comment for arguments.
  void Init(const IndexMapBiDi* charset_map,
            const ShapeTable* shape_table,
            bool randomize,
            TrainingSampleSet* sample_set);

  // Iterator functions designed for use with a simple for loop:
  // for (it.Begin(); !it.AtEnd(); it.Next()) {
  //   const TrainingSample& sample = it.GetSample();
  //   int class_id = it.GetCompactClassID();
  //   ...
  // }
  void Begin();
  bool AtEnd() const;
  const TrainingSample& GetSample() const;
  TrainingSample* MutableSample() const;
  // Returns the total index (from the original set of samples) of the current
  // sample.
  int GlobalSampleIndex() const;
  // Returns the index of the current sample in compact charset space, so
  // in a 2-class problem between x and y, the returned indices will all be
  // 0 or 1, and have nothing to do with the unichar_ids.
  // If the charset_map_ is NULL, then this is equal to GetSparseClassID().
  int GetCompactClassID() const;
  // Returns the index of the current sample in sparse charset space, so
  // in a 2-class problem between x and y, the returned indices will all be
  // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
  // with a shape_table_.
  int GetSparseClassID() const;
  // Moves on to the next indexable sample. If the end is reached, leaves
  // the state such that AtEnd() is true.
  void Next();

  // Returns the size of the compact charset space.
  int CompactCharsetSize() const;
  // Returns the size of the sparse charset space.
  int SparseCharsetSize() const;

  // NOTE(review): dereferences charset_map_, which may be NULL when Init
  // was called without a charset_map — only call when one was supplied.
  const IndexMapBiDi& charset_map() const {
    return *charset_map_;
  }
  // May return NULL if no shape_table was supplied to Init.
  const ShapeTable* shape_table() const {
    return shape_table_;
  }
  // Sample set operations.
  const TrainingSampleSet* sample_set() const {
    return sample_set_;
  }

  // A set of functions that do something to all the samples accessed by the
  // iterator, as it is currently setup.

  // Apply the supplied feature_space/feature_map transform to all samples
  // accessed by this iterator.
  void MapSampleFeatures(const IntFeatureMap& feature_map);

  // Adjust the weights of all the samples to be uniform in the given charset.
  // Returns the number of samples in the iterator.
  int UniformSamples();

  // Normalize the weights of all the samples defined by the iterator so they
  // sum to 1. Returns the minimum assigned sample weight.
  double NormalizeSamples();

 private:
  // Helper returns the current UnicharAndFont shape_entry.
  const UnicharAndFonts* GetShapeEntry() const;

  // Map to subset the actual charset space.
  const IndexMapBiDi* charset_map_;
  // Shape table to recombine character classes into shapes
  const ShapeTable* shape_table_;
  // The samples to iterate over.
  TrainingSampleSet* sample_set_;
  // Flag to control randomizing the sample features.
  bool randomize_;
  // Shape table owned by this used to iterate character classes.
  ShapeTable* owned_shape_table_;

  // Top-level iteration. Shape index in sparse charset_map space.
  int shape_index_;
  int num_shapes_;
  // Index to the character class within a shape.
  int shape_char_index_;
  int num_shape_chars_;
  // Index to the font within a shape/class pair.
  int shape_font_index_;
  int num_shape_fonts_;
  // The lowest level iteration. sample_index_/num_samples_ counts samples
  // in the current shape/class/font combination.
  int sample_index_;
  int num_samples_;
};
} // namespace tesseract.
#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_

View File

@ -0,0 +1,95 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: shapeclassifier.h
// Description: Base interface class for classifiers that return a
// shape index.
// Author: Ray Smith
// Created: Tue Sep 13 11:26:32 PDT 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
#define TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
template <typename T> class GenericVector;
struct Pix;
namespace tesseract {
class ShapeTable;
class TrainingSample;
// Classifier result from a low-level classification is an index into some
// ShapeTable and a rating.
struct ShapeRating {
  ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f) {}
  ShapeRating(int s, float r)
    : shape_id(s), rating(r), raw(1.0f), font(0.0f) {}

  // qsort-style comparator ordering by rating, highest first. Ties are
  // broken by ascending shape_id so the ordering is fully deterministic.
  static int SortDescendingRating(const void* t1, const void* t2) {
    const ShapeRating* lhs = reinterpret_cast<const ShapeRating*>(t1);
    const ShapeRating* rhs = reinterpret_cast<const ShapeRating*>(t2);
    if (lhs->rating > rhs->rating)
      return -1;
    if (lhs->rating < rhs->rating)
      return 1;
    return lhs->shape_id - rhs->shape_id;
  }

  // Index into some shape table indicates the class of the answer.
  int shape_id;
  // Rating from classifier with 1.0 perfect and 0.0 impossible.
  // Call it a probability if you must.
  float rating;
  // Subsidiary rating that a classifier may use internally.
  float raw;
  // Subsidiary rating that a classifier may use internally.
  float font;
};
// Interface base class for classifiers that produce ShapeRating results.
class ShapeClassifier {
 public:
  // Virtual destructor so implementations can be deleted through this base.
  virtual ~ShapeClassifier() {}

  // Classifies the given [training] sample, writing to results.
  // If page_pix is not NULL, the overriding function may call
  // sample.GetSamplePix(padding, page_pix) to get an image of the sample
  // padded (with real image data) by the given padding to extract features
  // from the image of the character. Other members of TrainingSample:
  // features(), micro_features(), cn_feature(), geo_feature() may be used
  // to get the appropriate tesseract features.
  // If debug is non-zero, then various degrees of classifier dependent debug
  // information is provided.
  // If keep_this (a shape index) is >= 0, then the results should always
  // contain keep_this, and (if possible) anything of intermediate confidence.
  // (Used for answering "Why didn't it get that right?" questions.)
  // The return value is the number of classes saved in results.
  // NOTE that overriding functions MUST clear results unless the classifier
  // is working with a team of such classifiers.
  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
                             int debug, int keep_this,
                             GenericVector<ShapeRating>* results) = 0;

  // Provides access to the ShapeTable that this classifier works with.
  // NOTE(review): presumably the returned table remains owned by the
  // implementation — confirm against concrete classifiers before deleting.
  virtual const ShapeTable* GetShapeTable() const = 0;
};
} // namespace tesseract.
#endif // TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_

452
classify/shapetable.cpp Normal file
View File

@ -0,0 +1,452 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: shapetable.cpp
// Description: Class to map a classifier shape index to unicharset
// indices and font indices.
// Author: Ray Smith
// Created: Tue Nov 02 15:31:32 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "shapetable.h"
#include "intfeaturespace.h"
#include "strngs.h"
#include "unicharset.h"
namespace tesseract {
// Writes to the given file. Returns false in case of error.
bool UnicharAndFonts::Serialize(FILE* fp) {
  // Write the unichar_id as a fixed-size inT32 for portability of the format.
  inT32 uni_id = unichar_id;
  return fwrite(&uni_id, sizeof(uni_id), 1, fp) == 1 && font_ids.Serialize(fp);
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
  inT32 uni_id;
  if (fread(&uni_id, sizeof(uni_id), 1, fp) != 1)
    return false;
  if (swap)
    ReverseN(&uni_id, sizeof(uni_id));
  unichar_id = uni_id;
  return font_ids.DeSerialize(swap, fp);
}
// Sort function to sort a pair of UnicharAndFonts by unichar_id.
int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1);
const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2);
return p1->unichar_id - p2->unichar_id;
}
// Writes to the given file. Returns false in case of error.
bool Shape::Serialize(FILE* fp) {
  return fwrite(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) == 1 &&
         unichars_.SerializeClasses(fp);
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool Shape::DeSerialize(bool swap, FILE* fp) {
  if (fread(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) != 1)
    return false;
  return unichars_.DeSerializeClasses(swap, fp);
}
// Adds a font_id for the given unichar_id. If the unichar_id is not
// in the shape, it is added.
void Shape::AddToShape(int unichar_id, int font_id) {
for (int c = 0; c < unichars_.size(); ++c) {
if (unichars_[c].unichar_id == unichar_id) {
// Found the unichar in the shape table.
GenericVector<int>& font_list = unichars_[c].font_ids;
for (int f = 0; f < font_list.size(); ++f) {
if (font_list[f] == font_id)
return; // Font is already there.
}
font_list.push_back(font_id);
return;
}
}
// Unichar_id is not in shape, so add it to shape.
unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
unichars_sorted_ = unichars_.size() <= 1;
}
// Adds everything in other to this.
void Shape::AddShape(const Shape& other) {
for (int c = 0; c < other.unichars_.size(); ++c) {
for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
AddToShape(other.unichars_[c].unichar_id,
other.unichars_[c].font_ids[f]);
}
}
unichars_sorted_ = unichars_.size() <= 1;
}
// Returns true if the shape contains the given unichar_id, font_id pair.
bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
for (int c = 0; c < unichars_.size(); ++c) {
if (unichars_[c].unichar_id == unichar_id) {
// Found the unichar, so look for the font.
GenericVector<int>& font_list = unichars_[c].font_ids;
for (int f = 0; f < font_list.size(); ++f) {
if (font_list[f] == font_id)
return true;
}
return false;
}
}
return false;
}
// Returns true if the shape contains the given unichar_id, ignoring font.
bool Shape::ContainsUnichar(int unichar_id) const {
  for (int c = 0; c < unichars_.size(); ++c) {
    if (unichars_[c].unichar_id == unichar_id)
      return true;
  }
  return false;
}
// Returns true if the shape contains the given font, ignoring unichar_id.
bool Shape::ContainsFont(int font_id) const {
for (int c = 0; c < unichars_.size(); ++c) {
GenericVector<int>& font_list = unichars_[c].font_ids;
for (int f = 0; f < font_list.size(); ++f) {
if (font_list[f] == font_id)
return true;
}
}
return false;
}
// Returns true if this is a subset (including equal) of other.
bool Shape::IsSubsetOf(const Shape& other) const {
for (int c = 0; c < unichars_.size(); ++c) {
int unichar_id = unichars_[c].unichar_id;
const GenericVector<int>& font_list = unichars_[c].font_ids;
for (int f = 0; f < font_list.size(); ++f) {
if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
return false;
}
}
return true;
}
// Returns true if the lists of unichar ids are the same in this and other,
// ignoring fonts.
// NOT const, as it will sort the unichars on demand.
bool Shape::IsEqualUnichars(Shape* other) {
  if (unichars_.size() != other->unichars_.size())
    return false;
  // Sort both sides (lazily) so a simple element-wise compare works.
  if (!unichars_sorted_)
    SortUnichars();
  if (!other->unichars_sorted_)
    other->SortUnichars();
  const int num_unichars = unichars_.size();
  for (int i = 0; i < num_unichars; ++i) {
    if (unichars_[i].unichar_id != other->unichars_[i].unichar_id)
      return false;
  }
  return true;
}
// Sorts the unichars_ vector by unichar.
// Sets unichars_sorted_ so later set operations (e.g. IsEqualUnichars)
// can skip the sort.
void Shape::SortUnichars() {
  unichars_.sort(UnicharAndFonts::SortByUnicharId);
  unichars_sorted_ = true;
}
// Default constructor: no unicharset attached; DebugStr must not be called
// until set_unicharset is used.
ShapeTable::ShapeTable() : unicharset_(NULL) {
}
// The unicharset reference must outlive this ShapeTable; it is used only
// for producing debug strings (see DebugStr).
ShapeTable::ShapeTable(const UNICHARSET& unicharset)
  : unicharset_(&unicharset) {
}
// Writes to the given file. Returns false in case of error.
bool ShapeTable::Serialize(FILE* fp) const {
  return shape_table_.Serialize(fp);
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
  return shape_table_.DeSerialize(swap, fp);
}
// Returns a string listing the classes/fonts in a shape.
// Requires unicharset_ to be set (constructor or set_unicharset), as it is
// dereferenced for id_to_unichar below.
STRING ShapeTable::DebugStr(int shape_id) const {
  // NOTE(review): the sentinel text says UNICHAR_ID, but it actually guards
  // an out-of-range shape_id.
  if (shape_id < 0 || shape_id >= shape_table_.size())
    return STRING("INVALID_UNICHAR_ID");
  const Shape& shape = GetShape(shape_id);
  STRING result;
  result.add_str_int("Shape", shape_id);
  for (int c = 0; c < shape.size(); ++c) {
    result.add_str_int(" c_id=", shape[c].unichar_id);
    result += "=";
    result += unicharset_->id_to_unichar(shape[c].unichar_id);
    result.add_str_int(", ", shape[c].font_ids.size());
    result += " fonts =";
    for (int f = 0; f < shape[c].font_ids.size(); ++f) {
      result.add_str_int(" ", shape[c].font_ids[f]);
    }
  }
  return result;
}
// Returns a debug string summarizing the table.
STRING ShapeTable::SummaryStr() const {
  int max_unichars = 0;
  int num_multi_shapes = 0;
  int num_master_shapes = 0;
  for (int s = 0; s < shape_table_.size(); ++s) {
    // Only count master shapes; merged shapes are represented by their master.
    if (MasterDestinationIndex(s) != s)
      continue;
    ++num_master_shapes;
    const int shape_size = GetShape(s).size();
    if (shape_size > 1)
      ++num_multi_shapes;
    if (shape_size > max_unichars)
      max_unichars = shape_size;
  }
  STRING result;
  result.add_str_int("Number of shapes = ", num_master_shapes);
  result.add_str_int(" max unichars = ", max_unichars);
  result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
  return result;
}
// Adds a new shape starting with the given unichar_id and font_id.
// Returns the assigned index.
int ShapeTable::AddShape(int unichar_id, int font_id) {
  Shape* shape = new Shape;
  shape->AddToShape(unichar_id, font_id);
  shape_table_.push_back(shape);
  // The new shape is the last element.
  return shape_table_.size() - 1;
}
// Adds a copy of the given shape.
// Returns the assigned index.
int ShapeTable::AddShape(const Shape& other) {
  shape_table_.push_back(new Shape(other));
  return shape_table_.size() - 1;
}
// Removes the shape given by the shape index.
// Note that indices of all shapes above shape_id change as the vector is
// compacted by remove().
void ShapeTable::DeleteShape(int shape_id) {
  delete shape_table_[shape_id];
  // Clear the slot before removal so no dangling pointer is left in the
  // vector even transiently.
  shape_table_[shape_id] = NULL;
  shape_table_.remove(shape_id);
}
// Adds a font_id to the given existing shape index for the given
// unichar_id. If the unichar_id is not in the shape, it is added.
void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
  shape_table_[shape_id]->AddToShape(unichar_id, font_id);
}
// Adds the given shape to the existing shape with the given index.
void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
  shape_table_[shape_id]->AddShape(other);
}
// Returns the id of the shape that contains the given unichar and font.
// If not found, returns -1.
// If font_id < 0, the font_id is ignored and the first shape that matches
// the unichar_id is returned.
int ShapeTable::FindShape(int unichar_id, int font_id) const {
for (int s = 0; s < shape_table_.size(); ++s) {
const Shape& shape = GetShape(s);
for (int c = 0; c < shape.size(); ++c) {
if (shape[c].unichar_id == unichar_id) {
if (font_id < 0)
return s; // We don't care about the font.
for (int f = 0; f < shape[c].font_ids.size(); ++f) {
if (shape[c].font_ids[f] == font_id)
return s;
}
}
}
}
return -1;
}
// Returns the first unichar_id and font_id in the given shape.
void ShapeTable::GetFirstUnicharAndFont(int shape_id,
int* unichar_id, int* font_id) const {
const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
*unichar_id = unichar_and_fonts.unichar_id;
*font_id = unichar_and_fonts.font_ids[0];
}
// Expands all the classes/fonts in the shape individually to build
// a ShapeTable.
// Returns the number of entries that were expanded by pulling in a
// superset master shape from master_shapes.
int ShapeTable::BuildFromShape(const Shape& shape,
                               const ShapeTable& master_shapes) {
  int num_masters = 0;
  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
    for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
      int c = shape[u_ind].unichar_id;
      int f = shape[u_ind].font_ids[f_ind];
      // Only add the (unichar, font) pair once.
      if (FindShape(c, f) < 0) {
        int shape_id = AddShape(c, f);
        int master_id = master_shapes.FindShape(c, f);
        if (master_id >= 0 && shape.size() > 1) {
          const Shape& master = master_shapes.GetShape(master_id);
          // Pull in the master only if it is strictly contained in shape.
          if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) {
            // Add everything else from the master shape.
            shape_table_[shape_id]->AddShape(master);
            ++num_masters;
          }
        }
      }
    }
  }
  return num_masters;
}
// Returns true if the shapes are already merged.
bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) {
  // Two shapes are merged iff they resolve to the same master.
  int master1 = MasterDestinationIndex(shape_id1);
  int master2 = MasterDestinationIndex(shape_id2);
  return master1 == master2;
}
// Returns true if any shape contains multiple unichars.
bool ShapeTable::AnyMultipleUnichars() {
int num_shapes = NumShapes();
for (int s1 = 0; s1 < num_shapes; ++s1) {
if (MasterDestinationIndex(s1) != s1) continue;
if (GetShape(s1).size() > 1)
return true;
}
return false;
}
// Returns the maximum number of unichars over all shapes.
int ShapeTable::MaxNumUnichars() const {
int max_num_unichars = 0;
int num_shapes = NumShapes();
for (int s = 0; s < num_shapes; ++s) {
if (GetShape(s).size() > max_num_unichars)
max_num_unichars = GetShape(s).size();
}
return max_num_unichars;
}
// Merges shapes with a common unichar over the [start, end) interval.
// Assumes single unichar per shape.
// After merging, the table is rebuilt with only master shapes, so all
// shape ids are renumbered.
void ShapeTable::ForceFontMerges(int start, int end) {
  for (int s1 = start; s1 < end; ++s1) {
    // Only consider unmerged, single-unichar shapes as merge sources.
    if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
      int unichar_id = GetShape(s1)[0].unichar_id;
      for (int s2 = s1 + 1; s2 < end; ++s2) {
        if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
            unichar_id == GetShape(s2)[0].unichar_id) {
          MergeShapes(s1, s2);
        }
      }
    }
  }
  // Compact the table down to just the master shapes.
  ShapeTable compacted(*unicharset_);
  compacted.AppendMasterShapes(*this);
  *this = compacted;
}
// Returns the number of unichars in the master shape.
int ShapeTable::MasterUnicharCount(int shape_id) const {
  return GetShape(MasterDestinationIndex(shape_id)).size();
}
// Returns the sum of the font counts in the master shape.
int ShapeTable::MasterFontCount(int shape_id) const {
  const Shape& shape = GetShape(MasterDestinationIndex(shape_id));
  int font_count = 0;
  for (int c = 0; c < shape.size(); ++c)
    font_count += shape[c].font_ids.size();
  return font_count;
}
// Returns the number of unichars that would result from merging the shapes.
int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
// Do it the easy way for now.
int master_id1 = MasterDestinationIndex(shape_id1);
int master_id2 = MasterDestinationIndex(shape_id2);
Shape combined_shape(*shape_table_[master_id1]);
combined_shape.AddShape(*shape_table_[master_id2]);
return combined_shape.size();
}
// Merges two shape_ids, leaving shape_id2 marked as merged.
// The master of shape_id2's group is redirected to shape_id1's master, and
// all of its unichars/fonts are copied into shape_id1's master.
void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
  int master_id1 = MasterDestinationIndex(shape_id1);
  int master_id2 = MasterDestinationIndex(shape_id2);
  // Point master_id2 (and all merged shapes) to master_id1.
  shape_table_[master_id2]->set_destination_index(master_id1);
  // Add all the shapes of master_id2 to master_id1.
  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
  tprintf("Merged shape %d->%d, %d->%d, now with %d unichars: %s\n",
          shape_id1, master_id1, shape_id2, master_id2,
          shape_table_[master_id1]->size(),
          DebugStr(master_id1).string());
}
// Returns the destination of this shape, (if merged), taking into account
// the fact that the destination may itself have been merged.
// Follows the destination_index chain iteratively until it reaches a shape
// that points to itself or has a negative destination (i.e. a master).
int ShapeTable::MasterDestinationIndex(int shape_id) const {
  int current = shape_id;
  for (;;) {
    int dest_id = shape_table_[current]->destination_index();
    if (dest_id == current || dest_id < 0)
      return current;  // current is the master.
    current = dest_id;
  }
}
// Appends the master shapes from other to this.
void ShapeTable::AppendMasterShapes(const ShapeTable& other) {
for (int s = 0; s < other.shape_table_.size(); ++s) {
if (other.shape_table_[s]->destination_index() < 0) {
AddShape(*other.shape_table_[s]);
}
}
}
// Returns the number of master shapes remaining after merging.
int ShapeTable::NumMasterShapes() const {
  int num_masters = 0;
  for (int s = 0; s < shape_table_.size(); ++s) {
    if (shape_table_[s]->destination_index() < 0)
      ++num_masters;
  }
  return num_masters;
}
} // namespace tesseract

227
classify/shapetable.h Normal file
View File

@ -0,0 +1,227 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: shapetable.h
// Description: Class to map a classifier shape index to unicharset
// indices and font indices.
// Author: Ray Smith
// Created: Thu Oct 28 17:46:32 PDT 2010
//
// (C) Copyright 2010, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
#define TESSERACT_CLASSIFY_SHAPETABLE_H_
#include "genericvector.h"
#include "intmatcher.h"
class STRING;
class UNICHARSET;
namespace tesseract {
// Simple struct to hold a set of fonts associated with a single unichar-id.
// A vector of UnicharAndFonts makes a shape.
struct UnicharAndFonts {
  UnicharAndFonts() : unichar_id(0) {
  }
  // Starts the font list with a single font_id.
  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
    font_ids.push_back(font_id);
  }

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp);
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Sort function to sort a pair of UnicharAndFonts by unichar_id.
  static int SortByUnicharId(const void* v1, const void* v2);

  // Ids of the fonts this unichar was associated with.
  GenericVector<int> font_ids;
  // The unichar id this entry represents.
  int unichar_id;
};
// A Shape is a collection of unichar-ids and a list of fonts associated with
// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
// a classifiable unit, and represents a group of characters or parts of
// characters that have a similar or identical shape. Shapes/ShapeTables may
// be organized hierarchically from identical shapes at the leaves to vaguely
// similar shapes near the root.
// Fix: the default constructor left unichars_sorted_ uninitialized; it is
// read by IsEqualUnichars (via the lazy-sort check) before any AddToShape
// call sets it, so initialize it to false.
class Shape {
 public:
  Shape() : unichars_sorted_(false), destination_index_(-1) {}
  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp);
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  int destination_index() const {
    return destination_index_;
  }
  void set_destination_index(int index) {
    destination_index_ = index;
  }
  // Number of UnicharAndFonts entries (distinct unichars) in this shape.
  int size() const {
    return unichars_.size();
  }
  // Returns a UnicharAndFonts entry for the given index, which must be
  // in the range [0, size()).
  const UnicharAndFonts& operator[](int index) const {
    return unichars_[index];
  }
  // Adds a font_id for the given unichar_id. If the unichar_id is not
  // in the shape, it is added.
  void AddToShape(int unichar_id, int font_id);
  // Adds everything in other to this.
  void AddShape(const Shape& other);
  // Returns true if the shape contains the given unichar_id, font_id pair.
  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
  // Returns true if the shape contains the given unichar_id, ignoring font.
  bool ContainsUnichar(int unichar_id) const;
  // Returns true if the shape contains the given font, ignoring unichar_id.
  bool ContainsFont(int font_id) const;
  // Returns true if this is a subset (including equal) of other.
  bool IsSubsetOf(const Shape& other) const;
  // Returns true if the lists of unichar ids are the same in this and other,
  // ignoring fonts.
  // NOT const, as it will sort the unichars on demand.
  bool IsEqualUnichars(Shape* other);

 private:
  // Sorts the unichars_ vector by unichar.
  void SortUnichars();

  // Flag indicates that the unichars are sorted, allowing faster set
  // operations with another shape.
  bool unichars_sorted_;
  // If this Shape is part of a ShapeTable the destination_index_ is the index
  // of some other shape in the ShapeTable with which this shape is merged.
  int destination_index_;
  // Array of unichars, each with a set of fonts. Each unichar has at most
  // one entry in the vector.
  GenericVector<UnicharAndFonts> unichars_;
};
// ShapeTable is a class to encapsulate the triple indirection that is
// used here.
// ShapeTable is a vector of shapes.
// Each shape is a vector of UnicharAndFonts representing the set of unichars
// that the shape represents.
// Each UnicharAndFonts also lists the fonts of the unichar_id that were
// mapped to the shape during training.
class ShapeTable {
 public:
  // Default constructor; set_unicharset must be called before DebugStr.
  ShapeTable();
  // The UNICHARSET reference supplied here, or in set_unicharset below must
  // exist for the entire life of the ShapeTable. It is used only by DebugStr.
  explicit ShapeTable(const UNICHARSET& unicharset);

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Accessors.
  int NumShapes() const {
    return shape_table_.size();
  }
  const UNICHARSET& unicharset() const {
    return *unicharset_;
  }
  // ShapeTable keeps only a pointer to the UNICHARSET, so it must persist
  // for the entire life of the ShapeTable.
  void set_unicharset(const UNICHARSET& unicharset) {
    unicharset_ = &unicharset;
  }
  // Returns a string listing the classes/fonts in a shape.
  STRING DebugStr(int shape_id) const;
  // Returns a debug string summarizing the table.
  STRING SummaryStr() const;

  // Adds a new shape starting with the given unichar_id and font_id.
  // Returns the assigned index.
  int AddShape(int unichar_id, int font_id);
  // Adds a copy of the given shape.
  // Returns the assigned index.
  int AddShape(const Shape& other);
  // Removes the shape given by the shape index. All indices above are changed!
  void DeleteShape(int shape_id);
  // Adds a font_id to the given existing shape index for the given
  // unichar_id. If the unichar_id is not in the shape, it is added.
  void AddToShape(int shape_id, int unichar_id, int font_id);
  // Adds the given shape to the existing shape with the given index.
  void AddShapeToShape(int shape_id, const Shape& other);
  // Returns the id of the shape that contains the given unichar and font.
  // If not found, returns -1.
  // If font_id < 0, the font_id is ignored and the first shape that matches
  // the unichar_id is returned.
  int FindShape(int unichar_id, int font_id) const;
  // Returns the first unichar_id and font_id in the given shape.
  void GetFirstUnicharAndFont(int shape_id,
                              int* unichar_id, int* font_id) const;

  // Accessors for the Shape with the given shape_id.
  const Shape& GetShape(int shape_id) const {
    return *shape_table_[shape_id];
  }
  Shape* MutableShape(int shape_id) {
    return shape_table_[shape_id];
  }

  // Expands all the classes/fonts in the shape individually to build
  // a ShapeTable.
  int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);

  // Returns true if the shapes are already merged.
  bool AlreadyMerged(int shape_id1, int shape_id2);
  // Returns true if any shape contains multiple unichars.
  bool AnyMultipleUnichars();
  // Returns the maximum number of unichars over all shapes.
  int MaxNumUnichars() const;
  // Merges shapes with a common unichar over the [start, end) interval.
  // Assumes single unichar per shape.
  void ForceFontMerges(int start, int end);
  // Returns the number of unichars in the master shape.
  int MasterUnicharCount(int shape_id) const;
  // Returns the sum of the font counts in the master shape.
  int MasterFontCount(int shape_id) const;
  // Returns the number of unichars that would result from merging the shapes.
  int MergedUnicharCount(int shape_id1, int shape_id2) const;
  // Merges two shape_ids, leaving shape_id2 marked as merged.
  void MergeShapes(int shape_id1, int shape_id2);
  // Appends the master shapes from other to this.
  // Used to create a clean ShapeTable from a merged one, or to create a
  // copy of a ShapeTable.
  void AppendMasterShapes(const ShapeTable& other);
  // Returns the number of master shapes remaining after merging.
  int NumMasterShapes() const;
  // Returns the destination of this shape, (if merged), taking into account
  // the fact that the destination may itself have been merged.
  // For a non-merged shape, returns the input shape_id.
  int MasterDestinationIndex(int shape_id) const;

 private:
  // Pointer to a provided unicharset used only by the DebugStr member.
  const UNICHARSET* unicharset_;
  // Vector of pointers to the Shapes in this ShapeTable.
  PointerVector<Shape> shape_table_;
};
} // namespace tesseract.
#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_

View File

@ -70,7 +70,7 @@ void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) {
if (Choices->length() == 0) {
blob_choice =
new BLOB_CHOICE(0, speckle_small_certainty + speckle_large_penalty,
speckle_small_certainty, -1, -1, NULL);
speckle_small_certainty, -1, -1, NULL, 0, 0, false);
temp_it.add_to_end(blob_choice);
return;
}
@ -81,7 +81,7 @@ void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) {
blob_choice = temp_it.data(); // pick the worst choice
temp_it.add_to_end(
new BLOB_CHOICE(0, blob_choice->rating() + speckle_large_penalty,
blob_choice->certainty(), -1, -1, NULL));
blob_choice->certainty(), -1, -1, NULL, 0, 0, false));
} /* AddLargeSpeckleTo */
@ -100,18 +100,8 @@ void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) {
*
* @return TRUE if Blob is speckle, FALSE otherwise.
*/
BOOL8 LargeSpeckle(TBLOB *Blob) {
double speckle_size;
TPOINT TopLeft;
TPOINT BottomRight;
speckle_size = BASELINE_SCALE * speckle_large_max_size;
blob_bounding_box(Blob, &TopLeft, &BottomRight);
if (TopLeft.y - BottomRight.y < speckle_size &&
BottomRight.x - TopLeft.x < speckle_size)
return (TRUE);
else
return (FALSE);
// Returns TRUE if both bounding-box dimensions of the blob are below the
// baseline-normalized speckle threshold (speckle_large_max_size scaled by
// BASELINE_SCALE), i.e. the blob is small enough to be treated as speckle.
BOOL8 LargeSpeckle(TBLOB *blob) {
  double speckle_size = BASELINE_SCALE * speckle_large_max_size;
  TBOX bbox = blob->bounding_box();
  return (bbox.width() < speckle_size && bbox.height() < speckle_size);
} /* LargeSpeckle */

View File

@ -0,0 +1,52 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: tessclassifier.cpp
// Description: Tesseract implementation of a ShapeClassifier.
// Author: Ray Smith
// Created: Tue Nov 22 14:16:25 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "tessclassifier.h"
#include "classify.h"
#include "trainingsample.h"
namespace tesseract {
// Classifies the given [training] sample, writing to results.
// See ShapeClassifier for a full description.
// Returns the number of results written.
// Note: page_pix and keep_this are unused by this implementation.
int TessClassifier::ClassifySample(const TrainingSample& sample,
                                   Pix* page_pix, int debug, int keep_this,
                                   GenericVector<ShapeRating>* results) {
  // The original wrote "debug ? x : 0" inside branches already selected on
  // debug, so the ternaries were dead code; use the constants they always
  // evaluated to.
  if (debug) {
    classify_->matcher_debug_level.set_value(2);
    classify_->matcher_debug_flags.set_value(25);
    classify_->classify_debug_level.set_value(3);
  } else {
    // NOTE(review): matcher_debug_level/matcher_debug_flags are not reset
    // here, so matcher debug output may persist from an earlier debug call
    // -- confirm whether that is intended.
    classify_->classify_debug_level.set_value(0);
  }
  classify_->CharNormTrainingSample(pruner_only_, sample, results);
  return results->size();
}
// Provides access to the ShapeTable that this classifier works with.
// The table is owned by the underlying Classify instance, not by this
// object, and may be NULL if classify_ has no shape table loaded.
const ShapeTable* TessClassifier::GetShapeTable() const {
  const ShapeTable* table = classify_->shape_table();
  return table;
}
} // namespace tesseract

65
classify/tessclassifier.h Normal file
View File

@ -0,0 +1,65 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: tessclassifier.h
// Description: Tesseract implementation of a ShapeClassifier.
// Author: Ray Smith
// Created: Tue Nov 22 14:10:45 PST 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_
#define THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_
#include "shapeclassifier.h"
namespace tesseract {
class Classify;
class TrainingSample;
// Tesseract implementation of a ShapeClassifier.
// Due to limitations in the content of TrainingSample, this currently
// only works for the static classifier and only works if the ShapeTable
// in classify is not NULL.
class TessClassifier : public ShapeClassifier {
 public:
  // Does not take ownership of classify; it must outlive this object.
  TessClassifier(bool pruner_only, tesseract::Classify* classify)
    : pruner_only_(pruner_only), classify_(classify) {}
  virtual ~TessClassifier() {}

  // Classifies the given [training] sample, writing to results.
  // See ShapeClassifier for a full description.
  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
                             int debug, int keep_this,
                             GenericVector<ShapeRating>* results);
  // Provides access to the ShapeTable that this classifier works with.
  virtual const ShapeTable* GetShapeTable() const;

 private:
  // Indicates that this classifier is to use just the ClassPruner, or the
  // full classifier if false.
  bool pruner_only_;
  // Borrowed pointer to the actual Tesseract classifier.
  tesseract::Classify* classify_;
};
} // namespace tesseract
#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_ */

311
classify/trainingsample.cpp Normal file
View File

@ -0,0 +1,311 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "trainingsample.h"
#include <math.h>
#include "allheaders.h"
#include "helpers.h"
#include "intfeaturemap.h"
#include "normfeat.h"
#include "shapetable.h"
namespace tesseract {
ELISTIZE(TrainingSample)
// Center of randomizing operations.
const int kRandomizingCenter = 128;
// Randomizing factors.
const int TrainingSample::kYShiftValues[kSampleYShiftSize] = {
6, 3, -3, -6, 0
};
const double TrainingSample::kScaleValues[kSampleScaleSize] = {
1.0625, 0.9375, 1.0
};
// Destructor frees the two heap-allocated feature arrays owned by this
// sample. Both pointers start NULL, so delete[] is safe on a fresh sample.
TrainingSample::~TrainingSample() {
  delete [] features_;
  delete [] micro_features_;
}
// WARNING! Serialize/DeSerialize do not save/restore the "cache" data
// members, which is mostly the mapped features, and the weight.
// It is assumed these can all be reconstructed from what is saved.
// Writes to the given file. Returns false in case of error.
// Fields are written in a fixed order that DeSerialize must mirror exactly:
// scalar ids, bounding box, the two counts, then the variable-length
// feature arrays and the fixed-size CN/geo features.
bool TrainingSample::Serialize(FILE* fp) const {
  if (fwrite(&class_id_, sizeof(class_id_), 1, fp) != 1) return false;
  if (fwrite(&font_id_, sizeof(font_id_), 1, fp) != 1) return false;
  if (fwrite(&page_num_, sizeof(page_num_), 1, fp) != 1) return false;
  if (!bounding_box_.Serialize(fp)) return false;
  if (fwrite(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
  if (fwrite(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
    return false;
  if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_)
    return false;
  if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_,
             fp) != num_micro_features_)
    return false;
  if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
      kNumCNParams) return false;
  if (fwrite(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount)
    return false;
  return true;
}
// Creates from the given file. Returns NULL in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// The caller owns the returned sample and must delete it.
TrainingSample* TrainingSample::DeSerializeCreate(bool swap, FILE* fp) {
  TrainingSample* sample = new TrainingSample;
  if (!sample->DeSerialize(swap, fp)) {
    // Reading failed part-way; discard the partially-filled sample.
    delete sample;
    sample = NULL;
  }
  return sample;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// Must read fields in exactly the order Serialize writes them.
bool TrainingSample::DeSerialize(bool swap, FILE* fp) {
  if (fread(&class_id_, sizeof(class_id_), 1, fp) != 1) return false;
  if (fread(&font_id_, sizeof(font_id_), 1, fp) != 1) return false;
  if (fread(&page_num_, sizeof(page_num_), 1, fp) != 1) return false;
  if (!bounding_box_.DeSerialize(swap, fp)) return false;
  if (fread(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
  if (fread(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
    return false;
  if (swap) {
    // BUG FIX: font_id_ and page_num_ were previously not byte-swapped,
    // leaving them reversed on other-endian machines while the counts and
    // class id were corrected.
    ReverseN(&class_id_, sizeof(class_id_));
    ReverseN(&font_id_, sizeof(font_id_));
    ReverseN(&page_num_, sizeof(page_num_));
    ReverseN(&num_features_, sizeof(num_features_));
    ReverseN(&num_micro_features_, sizeof(num_micro_features_));
  }
  // Counts are now in native byte order, so the allocations below are sized
  // correctly even on a swapped file.
  delete [] features_;
  features_ = new INT_FEATURE_STRUCT[num_features_];
  if (fread(features_, sizeof(*features_), num_features_, fp) != num_features_)
    return false;
  delete [] micro_features_;
  micro_features_ = new MicroFeature[num_micro_features_];
  if (fread(micro_features_, sizeof(*micro_features_), num_micro_features_,
            fp) != num_micro_features_)
    return false;
  if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
      kNumCNParams) return false;
  if (fread(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount)
    return false;
  // NOTE(review): micro_features_, cn_feature_ and geo_feature_ are read as
  // raw bytes and never byte-swapped -- confirm whether cross-endian data
  // files are expected to round-trip through this path.
  return true;
}
// Saves the given features into a TrainingSample.
// The features are copied, so the caller retains ownership of the input
// array. The caller owns (and must delete) the returned sample.
TrainingSample* TrainingSample::CopyFromFeatures(
    const INT_FX_RESULT_STRUCT& fx_info, const INT_FEATURE_STRUCT* features,
    int num_features) {
  TrainingSample* sample = new TrainingSample;
  sample->num_features_ = num_features;
  sample->features_ = new INT_FEATURE_STRUCT[num_features];
  memcpy(sample->features_, features, num_features * sizeof(features[0]));
  // The geometric feature comes straight from the feature-extractor result.
  sample->geo_feature_[GeoBottom] = fx_info.YBottom;
  sample->geo_feature_[GeoTop] = fx_info.YTop;
  sample->geo_feature_[GeoWidth] = fx_info.Width;
  // NOTE(review): micro_features_ and cn_feature_ are not populated here;
  // micro counts keep the constructor defaults and cn_feature_ stays
  // uninitialized -- confirm callers on this path never read cn_feature_.
  sample->features_are_indexed_ = false;
  sample->features_are_mapped_ = false;
  return sample;
}
// Constructs and returns a copy randomized by the method given by
// the randomizer index. If index is out of [0, kSampleRandomSize) then
// an exact copy is returned. The caller owns the returned sample.
TrainingSample* TrainingSample::RandomizedCopy(int index) const {
  TrainingSample* sample = Copy();
  if (index >= 0 && index < kSampleRandomSize) {
    ++index;  // Remove the first combination.
    // Decompose index into a (vertical shift, scale factor) pair.
    int yshift = kYShiftValues[index / kSampleScaleSize];
    double scaling = kScaleValues[index % kSampleScaleSize];
    for (int i = 0; i < num_features_; ++i) {
      // Scale X about kRandomizingCenter, round to nearest, clip to uinT8.
      double result = (features_[i].X - kRandomizingCenter) * scaling;
      result += kRandomizingCenter;
      sample->features_[i].X = ClipToRange(static_cast<int>(result + 0.5), 0,
                                           MAX_UINT8);
      // Scale Y the same way, then apply the vertical shift.
      result = (features_[i].Y - kRandomizingCenter) * scaling;
      result += kRandomizingCenter + yshift;
      sample->features_[i].Y = ClipToRange(static_cast<int>(result + 0.5), 0,
                                           MAX_UINT8);
    }
  }
  return sample;
}
// Constructs and returns an exact copy. The caller owns the returned sample.
TrainingSample* TrainingSample::Copy() const {
  TrainingSample* sample = new TrainingSample;
  sample->class_id_ = class_id_;
  sample->font_id_ = font_id_;
  // BUG FIX: page_num_ and bounding_box_ were previously not copied, so the
  // "exact copy" silently lost the sample's source page and image location.
  sample->page_num_ = page_num_;
  sample->bounding_box_ = bounding_box_;
  sample->weight_ = weight_;
  sample->sample_index_ = sample_index_;
  sample->num_features_ = num_features_;
  if (num_features_ > 0) {
    sample->features_ = new INT_FEATURE_STRUCT[num_features_];
    memcpy(sample->features_, features_, num_features_ * sizeof(features_[0]));
  }
  sample->num_micro_features_ = num_micro_features_;
  if (num_micro_features_ > 0) {
    sample->micro_features_ = new MicroFeature[num_micro_features_];
    memcpy(sample->micro_features_, micro_features_,
           num_micro_features_ * sizeof(micro_features_[0]));
  }
  memcpy(sample->cn_feature_, cn_feature_, sizeof(*cn_feature_) * kNumCNParams);
  memcpy(sample->geo_feature_, geo_feature_, sizeof(*geo_feature_) * GeoCount);
  // Cache members (max_dist_, mapped_features_, the indexed/mapped flags and
  // is_error_) keep their constructor defaults; they are recomputed on demand.
  return sample;
}
// Extracts the needed information from the CHAR_DESC_STRUCT.
// The four *_type arguments select which FeatureSets slot of char_desc holds
// each of the INT, Micro, CN and Geo feature sets. Missing sets are reported
// with tprintf and leave the corresponding members empty/unchanged.
// NOTE(review): the error messages always name kIntFeatureType /
// kMicroFeatureType, assuming the supplied type indices correspond to those
// standard feature kinds -- confirm with callers.
void TrainingSample::ExtractCharDesc(int int_feature_type,
                                     int micro_type,
                                     int cn_type,
                                     int geo_type,
                                     CHAR_DESC_STRUCT* char_desc) {
  // Extract the INT features.
  if (features_ != NULL) delete [] features_;
  FEATURE_SET_STRUCT* char_features = char_desc->FeatureSets[int_feature_type];
  if (char_features == NULL) {
    tprintf("Error: no features to train on of type %s\n",
            kIntFeatureType);
    num_features_ = 0;
    features_ = NULL;
  } else {
    num_features_ = char_features->NumFeatures;
    features_ = new INT_FEATURE_STRUCT[num_features_];
    for (int f = 0; f < num_features_; ++f) {
      // Quantize each float param down to the uinT8 used by the int matcher.
      features_[f].X =
          static_cast<uinT8>(char_features->Features[f]->Params[IntX]);
      features_[f].Y =
          static_cast<uinT8>(char_features->Features[f]->Params[IntY]);
      features_[f].Theta =
          static_cast<uinT8>(char_features->Features[f]->Params[IntDir]);
      features_[f].CP_misses = 0;
    }
  }
  // Extract the Micro features.
  if (micro_features_ != NULL) delete [] micro_features_;
  char_features = char_desc->FeatureSets[micro_type];
  if (char_features == NULL) {
    tprintf("Error: no features to train on of type %s\n",
            kMicroFeatureType);
    num_micro_features_ = 0;
    micro_features_ = NULL;
  } else {
    num_micro_features_ = char_features->NumFeatures;
    micro_features_ = new MicroFeature[num_micro_features_];
    for (int f = 0; f < num_micro_features_; ++f) {
      for (int d = 0; d < MFCount; ++d) {
        micro_features_[f][d] = char_features->Features[f]->Params[d];
      }
    }
  }
  // Extract the CN feature. Exactly one CN feature is expected.
  char_features = char_desc->FeatureSets[cn_type];
  if (char_features == NULL) {
    tprintf("Error: no CN feature to train on.\n");
  } else {
    ASSERT_HOST(char_features->NumFeatures == 1);
    cn_feature_[CharNormY] = char_features->Features[0]->Params[CharNormY];
    cn_feature_[CharNormLength] =
        char_features->Features[0]->Params[CharNormLength];
    cn_feature_[CharNormRx] = char_features->Features[0]->Params[CharNormRx];
    cn_feature_[CharNormRy] = char_features->Features[0]->Params[CharNormRy];
  }
  // Extract the Geo feature. Exactly one Geo feature is expected.
  char_features = char_desc->FeatureSets[geo_type];
  if (char_features == NULL) {
    tprintf("Error: no Geo feature to train on.\n");
  } else {
    ASSERT_HOST(char_features->NumFeatures == 1);
    geo_feature_[GeoBottom] = char_features->Features[0]->Params[GeoBottom];
    geo_feature_[GeoTop] = char_features->Features[0]->Params[GeoTop];
    geo_feature_[GeoWidth] = char_features->Features[0]->Params[GeoWidth];
  }
  // New raw features invalidate any previously indexed/mapped versions.
  features_are_indexed_ = false;
  features_are_mapped_ = false;
}
// Sets the mapped_features_ from the features_ using the provided
// feature_space to the indexed versions of the features.
// Indexed features share storage with mapped features (mapped_features_);
// the two flags record which interpretation is current.
void TrainingSample::IndexFeatures(const IntFeatureSpace& feature_space) {
  // FIX: removed an unused local GenericVector<int> indexed_features that
  // was declared but never used; the result goes straight to
  // mapped_features_.
  feature_space.IndexAndSortFeatures(features_, num_features_,
                                     &mapped_features_);
  features_are_indexed_ = true;
  features_are_mapped_ = false;
}
// Sets the mapped_features_ from the features_ using the provided
// feature_map.
void TrainingSample::MapFeatures(const IntFeatureMap& feature_map) {
  GenericVector<int> indexed_features;
  // First index the raw features into the map's feature space, then map the
  // indexed features through the feature_map into mapped_features_.
  feature_map.feature_space().IndexAndSortFeatures(features_, num_features_,
                                                   &indexed_features);
  feature_map.MapIndexedFeatures(indexed_features, &mapped_features_);
  features_are_indexed_ = false;
  features_are_mapped_ = true;
}
// Returns a pix representing the sample. (Int features only.)
// Each feature is drawn as a short ray from its (X, Y) position in its
// Theta direction, into a 1-bpp pix of kIntFeatureExtent square.
// The caller must pixDestroy the returned pix.
Pix* TrainingSample::RenderToPix(const UNICHARSET* unicharset) const {
  Pix* pix = pixCreate(kIntFeatureExtent, kIntFeatureExtent, 1);
  for (int f = 0; f < num_features_; ++f) {
    int start_x = features_[f].X;
    int start_y = kIntFeatureExtent - features_[f].Y;
    // Theta is an 8-bit angle: map [0, 256) onto [-pi, pi).
    double dx = cos((features_[f].Theta / 256.0) * 2.0 * PI - PI);
    double dy = -sin((features_[f].Theta / 256.0) * 2.0 * PI - PI);
    for (int i = 0; i <= 5; ++i) {
      int x = static_cast<int>(start_x + dx * i);
      int y = static_cast<int>(start_y + dy * i);
      // FIX: clip to the actual pix size. The bounds were hard-coded to 256,
      // which only matches the pix if kIntFeatureExtent happens to be 256.
      if (x >= 0 && x < kIntFeatureExtent && y >= 0 && y < kIntFeatureExtent)
        pixSetPixel(pix, x, y, 1);
    }
  }
  if (unicharset != NULL)
    pixSetText(pix, unicharset->id_to_unichar(class_id_));
  return pix;
}
// Displays the features in the given window with the given color.
// (Int features only; micro features are not rendered here.)
void TrainingSample::DisplayFeatures(ScrollView::Color color,
                                     ScrollView* window) const {
  for (int f = 0; f < num_features_; ++f) {
    RenderIntFeature(window, &features_[f], color);
  }
}
// Returns a pix of the original sample image. The pix is padded all round
// by padding wherever possible.
// The returned Pix must be pixDestroyed after use.
// If the input page_pix is NULL, NULL is returned.
Pix* TrainingSample::GetSamplePix(int padding, Pix* page_pix) const {
  if (page_pix == NULL)
    return NULL;
  int page_width = pixGetWidth(page_pix);
  int page_height = pixGetHeight(page_pix);
  TBOX padded_box = bounding_box();
  padded_box.pad(padding, padding);
  // Clip the padded_box to the limits of the page.
  TBOX page_box(0, 0, page_width, page_height);
  padded_box &= page_box;
  // BUG FIX: the leptonica Box must be built from the clipped padded_box,
  // not from page_box -- the original clipped out the entire page instead of
  // the padded sample region. (Leptonica boxes are top-left based, hence the
  // page_height - top flip.)
  Box* box = boxCreate(padded_box.left(), page_height - padded_box.top(),
                       padded_box.width(), padded_box.height());
  Pix* sample_pix = pixClipRectangle(page_pix, box, NULL);
  boxDestroy(&box);
  return sample_pix;
}
} // namespace tesseract

240
classify/trainingsample.h Normal file
View File

@ -0,0 +1,240 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_TRAINING_TRAININGSAMPLE_H__
#define TESSERACT_TRAINING_TRAININGSAMPLE_H__
#include "elst.h"
#include "featdefs.h"
#include "intfx.h"
#include "intmatcher.h"
#include "matrix.h"
#include "mf.h"
#include "picofeat.h"
#include "shapetable.h"
#include "unicharset.h"
struct Pix;
namespace tesseract {
class IntFeatureMap;
class IntFeatureSpace;
class ShapeTable;
// Number of elements of cn_feature_.
static const int kNumCNParams = 4;
// Number of ways to shift the features when randomizing.
static const int kSampleYShiftSize = 5;
// Number of ways to scale the features when randomizing.
static const int kSampleScaleSize = 3;
// Total number of different ways to manipulate the features when randomizing.
// The first and last combinations are removed to avoid an excessive
// top movement (first) and an identity transformation (last).
// WARNING: To avoid patterned duplication of samples, be sure to keep
// kSampleRandomSize prime!
// Eg with current values (kSampleYShiftSize = 5 and kSampleScaleSize = 3)
// kSampleRandomSize is 13, which is prime.
static const int kSampleRandomSize = kSampleYShiftSize * kSampleScaleSize - 2;
// ASSERT_IS_PRIME(kSampleRandomSize) !!
class TrainingSample : public ELIST_LINK {
 public:
  // NOTE(review): cn_feature_ and geo_feature_ are NOT initialized by this
  // constructor; they are filled by ExtractCharDesc (both) or
  // CopyFromFeatures (geo only) -- confirm every construction path sets
  // them before they are read.
  TrainingSample()
    : class_id_(INVALID_UNICHAR_ID), font_id_(0), page_num_(0),
      num_features_(0), num_micro_features_(0),
      features_(NULL), micro_features_(NULL), weight_(1.0),
      max_dist_(0.0), sample_index_(0),
      features_are_indexed_(false), features_are_mapped_(false),
      is_error_(false) {
  }
  ~TrainingSample();

  // Saves the given features into a TrainingSample. The features are copied,
  // so may be deleted afterwards. Delete the return value after use.
  static TrainingSample* CopyFromFeatures(const INT_FX_RESULT_STRUCT& fx_info,
                                          const INT_FEATURE_STRUCT* features,
                                          int num_features);
  // Constructs and returns a copy "randomized" by the method given by
  // the randomizer index. If index is out of [0, kSampleRandomSize) then
  // an exact copy is returned.
  TrainingSample* RandomizedCopy(int index) const;
  // Constructs and returns an exact copy.
  TrainingSample* Copy() const;

  // WARNING! Serialize/DeSerialize do not save/restore the "cache" data
  // members, which is mostly the mapped features, and the weight.
  // It is assumed these can all be reconstructed from what is saved.
  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Creates from the given file. Returns NULL in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  static TrainingSample* DeSerializeCreate(bool swap, FILE* fp);
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Extracts the needed information from the CHAR_DESC_STRUCT.
  void ExtractCharDesc(int feature_type, int micro_type,
                       int cn_type, int geo_type,
                       CHAR_DESC_STRUCT* char_desc);

  // Sets the mapped_features_ from the features_ using the provided
  // feature_space to the indexed versions of the features.
  void IndexFeatures(const IntFeatureSpace& feature_space);
  // Sets the mapped_features_ from the features_ using the provided
  // feature_map.
  void MapFeatures(const IntFeatureMap& feature_map);

  // Returns a pix representing the sample. (Int features only.)
  Pix* RenderToPix(const UNICHARSET* unicharset) const;
  // Displays the features in the given window with the given color.
  void DisplayFeatures(ScrollView::Color color, ScrollView* window) const;

  // Returns a pix of the original sample image. The pix is padded all round
  // by padding wherever possible.
  // The returned Pix must be pixDestroyed after use.
  // If the input page_pix is NULL, NULL is returned.
  Pix* GetSamplePix(int padding, Pix* page_pix) const;

  // Accessors.
  UNICHAR_ID class_id() const {
    return class_id_;
  }
  void set_class_id(int id) {
    class_id_ = id;
  }
  int font_id() const {
    return font_id_;
  }
  void set_font_id(int id) {
    font_id_ = id;
  }
  int page_num() const {
    return page_num_;
  }
  void set_page_num(int page) {
    page_num_ = page;
  }
  const TBOX& bounding_box() const {
    return bounding_box_;
  }
  void set_bounding_box(const TBOX& box) {
    bounding_box_ = box;
  }
  int num_features() const {
    return num_features_;
  }
  const INT_FEATURE_STRUCT* features() const {
    return features_;
  }
  int num_micro_features() const {
    return num_micro_features_;
  }
  const MicroFeature* micro_features() const {
    return micro_features_;
  }
  float cn_feature(int index) const {
    return cn_feature_[index];
  }
  int geo_feature(int index) const {
    return geo_feature_[index];
  }
  double weight() const {
    return weight_;
  }
  void set_weight(double value) {
    weight_ = value;
  }
  double max_dist() const {
    return max_dist_;
  }
  void set_max_dist(double value) {
    max_dist_ = value;
  }
  int sample_index() const {
    return sample_index_;
  }
  void set_sample_index(int value) {
    sample_index_ = value;
  }
  bool features_are_mapped() const {
    return features_are_mapped_;
  }
  const GenericVector<int>& mapped_features() const {
    ASSERT_HOST(features_are_mapped_);
    return mapped_features_;
  }
  // Indexed and mapped features share the same storage (mapped_features_);
  // the features_are_* flags record which interpretation is current.
  const GenericVector<int>& indexed_features() const {
    ASSERT_HOST(features_are_indexed_);
    return mapped_features_;
  }
  bool is_error() const {
    return is_error_;
  }
  void set_is_error(bool value) {
    is_error_ = value;
  }

 private:
  // Unichar id that this sample represents. There obviously must be a
  // reference UNICHARSET somewhere. Usually in TrainingSampleSet.
  UNICHAR_ID class_id_;
  // Font id in which this sample was printed. Refers to a fontinfo_table_ in
  // MasterTrainer.
  int font_id_;
  // Number of page that the sample came from.
  int page_num_;
  // Bounding box of sample in original image.
  TBOX bounding_box_;
  // Number of INT_FEATURE_STRUCT in features_ array.
  int num_features_;
  // Number of MicroFeature in micro_features_ array.
  int num_micro_features_;
  // Array of features, owned by this sample.
  INT_FEATURE_STRUCT* features_;
  // Array of micro features, owned by this sample.
  MicroFeature* micro_features_;
  // The one and only CN feature. Indexed by NORM_PARAM_NAME enum.
  float cn_feature_[kNumCNParams];
  // The one and only geometric feature. (Aims at replacing cn_feature_).
  // Indexed by GeoParams enum in picofeat.h
  int geo_feature_[GeoCount];

  // Non-serialized cache data.
  // Weight used for boosting training.
  double weight_;
  // Maximum distance to other samples of same class/font used in computing
  // the canonical sample.
  double max_dist_;
  // Global index of this sample.
  int sample_index_;
  // Indexed/mapped features, as indicated by the bools below.
  GenericVector<int> mapped_features_;
  bool features_are_indexed_;
  bool features_are_mapped_;
  // True if the last classification was an error by the current definition.
  bool is_error_;

  // Randomizing factors.
  static const int kYShiftValues[kSampleYShiftSize];
  static const double kScaleValues[kSampleScaleSize];
};
ELISTIZEH(TrainingSample)
} // namespace tesseract
#endif // TESSERACT_TRAINING_TRAININGSAMPLE_H__

View File

@ -0,0 +1,870 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "trainingsampleset.h"
#include "allheaders.h"
#include "boxread.h"
#include "fontinfo.h"
#include "indexmapbidi.h"
#include "intfeaturedist.h"
#include "intfeaturemap.h"
#include "intfeaturespace.h"
#include "shapetable.h"
#include "trainingsample.h"
#include "unicity_table.h"
namespace tesseract {
const int kTestChar = -1; // 37;
// Max number of distances to compute the squared way
const int kSquareLimit = 25;
// Prime numbers for subsampling distances.
const int kPrime1 = 17;
const int kPrime2 = 13;
// Min samples from which to start discarding outliers.
const int kMinOutlierSamples = 5;
// Constructs an empty FontClassInfo: no raw samples and no canonical
// sample chosen yet (-1 = none).
TrainingSampleSet::FontClassInfo::FontClassInfo()
  : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
}
// Writes to the given file. Returns false in case of error.
// Field order (scalars then sample-index vector) must match DeSerialize.
bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const {
  if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
    return false;
  if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
    return false;
  if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
  if (!samples.Serialize(fp)) return false;
  return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// The scalars are read raw first and byte-reversed afterwards; the samples
// vector handles its own swapping in samples.DeSerialize.
bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
  if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
    return false;
  if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
    return false;
  if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
  if (!samples.DeSerialize(swap, fp)) return false;
  if (swap) {
    ReverseN(&num_raw_samples, sizeof(num_raw_samples));
    ReverseN(&canonical_sample, sizeof(canonical_sample));
    ReverseN(&canonical_dist, sizeof(canonical_dist));
  }
  return true;
}
// Constructs an empty sample set over the given (externally owned) font
// table. The font/class lookup array is built later by
// OrganizeByFontAndClass.
TrainingSampleSet::TrainingSampleSet(const UnicityTable<FontInfo>& font_table)
  : num_raw_samples_(0), unicharset_size_(0),
    font_class_array_(NULL), fontinfo_table_(font_table) {
}
// Destructor. Frees the 2-d font/class lookup array.
// NOTE(review): the TrainingSample pointers in samples_ are not deleted
// here - presumably ownership lies elsewhere; confirm against callers.
TrainingSampleSet::~TrainingSampleSet() {
  delete font_class_array_;
}
// Writes to the given file. Returns false in case of error.
bool TrainingSampleSet::Serialize(FILE* fp) const {
if (!samples_.Serialize(fp)) return false;
if (!unicharset_.save_to_file(fp)) return false;
if (!font_id_map_.Serialize(fp)) return false;
inT8 not_null = font_class_array_ != NULL;
if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
if (not_null) {
if (!font_class_array_->SerializeClasses(fp)) return false;
}
return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
  if (!samples_.DeSerialize(swap, fp)) return false;
  num_raw_samples_ = samples_.size();
  if (!unicharset_.load_from_file(fp)) return false;
  if (!font_id_map_.DeSerialize(swap, fp)) return false;
  // Discard any existing font/class array before reading a fresh one.
  if (font_class_array_ != NULL) {
    delete font_class_array_;
    font_class_array_ = NULL;
  }
  // A flag byte (mirroring Serialize) records whether a font/class array
  // was written.
  inT8 not_null;
  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
  if (not_null) {
    // Size 1x1 is a placeholder; DeSerializeClasses reads the real
    // dimensions from the file.
    FontClassInfo empty;
    font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
    if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
  }
  unicharset_size_ = unicharset_.size();
  return true;
}
// Load an initial unicharset, or set one up if the file cannot be read.
void TrainingSampleSet::LoadUnicharset(const char* filename) {
  bool loaded = unicharset_.load_from_file(filename);
  if (!loaded) {
    // Fall back to an empty charset that grows as samples are added.
    tprintf("Failed to load unicharset from file %s\n"
            "Building unicharset for boosting from scratch...\n",
            filename);
    unicharset_.clear();
    // Space character needed to represent NIL_LIST classification.
    unicharset_.unichar_insert(" ");
  }
  unicharset_size_ = unicharset_.size();
}
// Adds a character sample to this sample set.
// If the unichar is not already in the local unicharset, it is added.
// Returns the unichar_id of the added sample, from the local unicharset,
// or -1 if the unicharset has overflowed MAX_NUM_CLASSES.
int TrainingSampleSet::AddSample(const char* unichar, TrainingSample* sample) {
  bool is_new_char = !unicharset_.contains_unichar(unichar);
  if (is_new_char) {
    unicharset_.unichar_insert(unichar);
    if (unicharset_.size() > MAX_NUM_CLASSES) {
      tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
              "greater than MAX_NUM_CLASSES\n");
      return -1;
    }
  }
  UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
  AddSample(char_id, sample);
  return char_id;
}
// Adds a character sample to this sample set with the given unichar_id,
// which must correspond to the local unicharset (in this).
// Takes ownership of the sample pointer.
void TrainingSampleSet::AddSample(int unichar_id, TrainingSample* sample) {
  sample->set_class_id(unichar_id);
  samples_.push_back(sample);
  // Keep cached sizes in step with the containers.
  unicharset_size_ = unicharset_.size();
  num_raw_samples_ = samples_.size();
}
// Returns the number of samples for the given font,class pair.
// If randomize is true, returns the number of samples accessible
// with randomizing on. (Increases the number of samples if small.)
// OrganizeByFontAndClass must have been already called.
int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
bool randomize) const {
ASSERT_HOST(font_class_array_ != NULL);
if (font_id < 0 || class_id < 0 ||
font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
// There are no samples because the font or class doesn't exist.
return 0;
}
int font_index = font_id_map_.SparseToCompact(font_id);
if (font_index < 0)
return 0; // The font has no samples.
if (randomize)
return (*font_class_array_)(font_index, class_id).samples.size();
else
return (*font_class_array_)(font_index, class_id).num_raw_samples;
}
// Gets a sample by its global index into samples_.
const TrainingSample* TrainingSampleSet::GetSample(int index) const {
  return samples_[index];
}
// Gets a sample by its font, class, index.
// Returns NULL if the font has no samples.
// OrganizeByFontAndClass must have been already called.
const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
                                                   int index) const {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  // The per-cell samples vector holds global indices into samples_.
  const FontClassInfo& info = (*font_class_array_)(font_index, class_id);
  return samples_[info.samples[index]];
}
// Get a sample by its font, class, index. Does not randomize.
// Returns NULL if the font has no samples.
// OrganizeByFontAndClass must have been already called.
TrainingSample* TrainingSampleSet::MutableSample(int font_id, int class_id,
                                                 int index) {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  // The per-cell samples vector holds global indices into samples_.
  const FontClassInfo& info = (*font_class_array_)(font_index, class_id);
  return samples_[info.samples[index]];
}
// Returns a string debug representation of the given sample:
// font, unichar_str, bounding box, page.
STRING TrainingSampleSet::SampleToString(const TrainingSample& sample) const {
  STRING boxfile_str;
  // Format unichar/bbox/page as a box-file line, then prefix the font name
  // looked up in the fontinfo table.
  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
                 sample.bounding_box(), sample.page_num(), &boxfile_str);
  return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
}
// Gets the combined set of features used by all the samples of the given
// font/class combination.
const BitVector& TrainingSampleSet::GetCloudFeatures(
    int font_id, int class_id) const {
  const int font_index = font_id_map_.SparseToCompact(font_id);
  // The font must be present in the compact map to have a cloud at all.
  ASSERT_HOST(font_index >= 0);
  return (*font_class_array_)(font_index, class_id).cloud_features;
}
// Gets the indexed features of the canonical sample of the given
// font/class combination.
const GenericVector<int>& TrainingSampleSet::GetCanonicalFeatures(
    int font_id, int class_id) const {
  const int font_index = font_id_map_.SparseToCompact(font_id);
  // The font must be present in the compact map.
  ASSERT_HOST(font_index >= 0);
  return (*font_class_array_)(font_index, class_id).canonical_features;
}
// Returns the distance between the given UniCharAndFonts pair.
// If matched_fonts, only matching fonts, are considered, unless that yields
// the empty set.
// OrganizeByFontAndClass must have been already called.
float TrainingSampleSet::UnicharDistance(const UnicharAndFonts& uf1,
                                         const UnicharAndFonts& uf2,
                                         bool matched_fonts,
                                         const IntFeatureMap& feature_map) {
  int num_fonts1 = uf1.font_ids.size();
  int c1 = uf1.unichar_id;
  int num_fonts2 = uf2.font_ids.size();
  int c2 = uf2.unichar_id;
  // The result is the mean of the sampled font-pair cluster distances.
  double dist_sum = 0.0;
  int dist_count = 0;
  bool debug = false;
  if (matched_fonts) {
    // Compute distances only where fonts match.
    for (int i = 0; i < num_fonts1; ++i) {
      int f1 = uf1.font_ids[i];
      for (int j = 0; j < num_fonts2; ++j) {
        int f2 = uf2.font_ids[j];
        if (f1 == f2) {
          dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
          ++dist_count;
        }
      }
    }
  } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
    // Small enough sets to compute all the distances.
    for (int i = 0; i < num_fonts1; ++i) {
      int f1 = uf1.font_ids[i];
      for (int j = 0; j < num_fonts2; ++j) {
        int f2 = uf2.font_ids[j];
        dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
        if (debug) {
          tprintf("Cluster dist %d %d %d %d = %g\n",
                  f1, c1, f2, c2,
                  ClusterDistance(f1, c1, f2, c2, feature_map));
        }
        ++dist_count;
      }
    }
  } else {
    // Subsample distances, using the largest set once, and stepping through
    // the smaller set so as to ensure that all the pairs are different.
    // A prime increment that differs from num_fonts2 makes successive
    // second-indices distinct over the walk.
    int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
    int index = 0;
    int num_samples = MAX(num_fonts1, num_fonts2);
    for (int i = 0; i < num_samples; ++i, index += increment) {
      int f1 = uf1.font_ids[i % num_fonts1];
      int f2 = uf2.font_ids[index % num_fonts2];
      if (debug) {
        tprintf("Cluster dist %d %d %d %d = %g\n",
                f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
      }
      dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
      ++dist_count;
    }
  }
  if (dist_count == 0) {
    // No matching fonts: retry over all font pairs before giving up.
    if (matched_fonts)
      return UnicharDistance(uf1, uf2, false, feature_map);
    return 0.0f;
  }
  return dist_sum / dist_count;
}
// Returns the distance between the given pair of font/class pairs.
// Finds in cache or computes and caches.
// Three caches are used: a per-class vector when the fonts are equal, a
// per-font vector when the classes are equal, and a linear-searched list
// for the general case. All caches are filled symmetrically.
// OrganizeByFontAndClass must have been already called.
float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
                                         int font_id2, int class_id2,
                                         const IntFeatureMap& feature_map) {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_index1 = font_id_map_.SparseToCompact(font_id1);
  int font_index2 = font_id_map_.SparseToCompact(font_id2);
  if (font_index1 < 0 || font_index2 < 0)
    return 0.0f;
  FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
  if (font_id1 == font_id2) {
    // Special case cache for speed. Lazily created; -1 marks
    // "not yet computed".
    if (fc_info.unichar_distance_cache.size() == 0)
      fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
    if (fc_info.unichar_distance_cache[class_id2] < 0) {
      // Distance has to be calculated.
      float result = ComputeClusterDistance(font_id1, class_id1,
                                            font_id2, class_id2,
                                            feature_map);
      fc_info.unichar_distance_cache[class_id2] = result;
      // Copy to the symmetric cache entry.
      FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
      if (fc_info2.unichar_distance_cache.size() == 0)
        fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
      fc_info2.unichar_distance_cache[class_id1] = result;
    }
    return fc_info.unichar_distance_cache[class_id2];
  } else if (class_id1 == class_id2) {
    // Another special-case cache for equal class-id, indexed by compact
    // font index.
    if (fc_info.font_distance_cache.size() == 0)
      fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
                                               -1.0f);
    if (fc_info.font_distance_cache[font_index2] < 0) {
      // Distance has to be calculated.
      float result = ComputeClusterDistance(font_id1, class_id1,
                                            font_id2, class_id2,
                                            feature_map);
      fc_info.font_distance_cache[font_index2] = result;
      // Copy to the symmetric cache entry.
      FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
      if (fc_info2.font_distance_cache.size() == 0)
        fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
                                                  -1.0f);
      fc_info2.font_distance_cache[font_index1] = result;
    }
    return fc_info.font_distance_cache[font_index2];
  }
  // Both font and class are different. Linear search for class_id2/font_id2
  // in what is a hopefully short list of distances.
  int cache_index = 0;
  while (cache_index < fc_info.distance_cache.size() &&
         (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
          fc_info.distance_cache[cache_index].font_id != font_id2))
    ++cache_index;
  if (cache_index == fc_info.distance_cache.size()) {
    // Not found in the cache: distance has to be calculated and appended,
    // leaving cache_index pointing at the new entry.
    float result = ComputeClusterDistance(font_id1, class_id1,
                                          font_id2, class_id2,
                                          feature_map);
    FontClassDistance fc_dist = { class_id2, font_id2, result };
    fc_info.distance_cache.push_back(fc_dist);
    // Copy to the symmetric cache entry. We know it isn't there already, as
    // we always copy to the symmetric entry.
    FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
    fc_dist.unichar_id = class_id1;
    fc_dist.font_id = font_id1;
    fc_info2.distance_cache.push_back(fc_dist);
  }
  return fc_info.distance_cache[cache_index].distance;
}
// Computes the distance between the given pair of font/class pairs: the
// fraction of canonical features (of both, since separability is
// asymmetric) that are reliably separable from the other's cloud.
float TrainingSampleSet::ComputeClusterDistance(
    int font_id1, int class_id1, int font_id2, int class_id2,
    const IntFeatureMap& feature_map) const {
  int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
                               feature_map, false);
  dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
                            feature_map, false);
  int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
  denominator += GetCanonicalFeatures(font_id2, class_id2).size();
  // Guard against divide-by-zero (yielding NaN that would be cached by
  // ClusterDistance) when neither class has any canonical features.
  if (denominator == 0) return 0.0f;
  return static_cast<float>(dist) / denominator;
}
// Helper to add a feature and its near neighbors to the good_features.
// levels indicates how many times to compute the offset features of what is
// already there. This is done by iteration rather than recursion.
static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
                            GenericVector<int>* good_features) {
  good_features->push_back(f);
  int start = 0;
  int end = 1;
  // Each pass expands only the features appended by the previous pass.
  while (levels-- > 0) {
    for (int i = start; i < end; ++i) {
      int feature = (*good_features)[i];
      for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
        if (dir == 0) continue;
        int offset_feature = feature_map.OffsetFeature(feature, dir);
        if (offset_feature >= 0)
          good_features->push_back(offset_feature);
      }
    }
    start = end;
    end = good_features->size();
  }
}
// Returns the number of canonical features of font/class 2 for which
// neither the feature nor any of its near neighbors occurs in the cloud
// of font/class 1. Each such feature is a reliable separation between
// the classes, ASSUMING that the canonical sample is sufficiently
// representative that every sample has a feature near that particular
// feature. To check that this is so on the fly would be prohibitively
// expensive, but it might be possible to pre-qualify the canonical features
// to include only those for which this assumption is true.
// ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
// first, or the results will be nonsense.
int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
                                         int font_id2, int class_id2,
                                         const IntFeatureMap& feature_map,
                                         bool thorough) const {
  // NOTE(review): the thorough argument is unused in this body - presumably
  // reserved for a deeper check; confirm against callers.
  int result = 0;
  const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
  if (sample2 == NULL)
    return 0;  // There are no canonical features.
  const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
                                                              class_id2);
  const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
  if (cloud1.size() == 0)
    return canonical2.size();  // There are no cloud features.
  // Find a canonical2 feature that is not in cloud1.
  for (int f = 0; f < canonical2.size(); ++f) {
    int feature = canonical2[f];
    if (cloud1[feature])
      continue;
    // Gather the near neighbours of f.
    GenericVector<int> good_features;
    AddNearFeatures(feature_map, feature, 1, &good_features);
    // Check that none of the good_features are in the cloud.
    int i;
    for (i = 0; i < good_features.size(); ++i) {
      int good_f = good_features[i];
      if (cloud1[good_f]) {
        break;
      }
    }
    if (i < good_features.size())
      continue;  // Found one in the cloud.
    // Neither the feature nor any neighbor is in the cloud: a reliable
    // separation.
    ++result;
  }
  return result;
}
// Returns the total (global) index of the requested sample, or -1 if the
// font has no samples.
// OrganizeByFontAndClass must have been already called.
int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
                                         int index) const {
  ASSERT_HOST(font_class_array_ != NULL);
  const int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0)
    return -1;
  return (*font_class_array_)(font_index, class_id).samples[index];
}
// Gets the canonical sample for the given font, class pair, or NULL if
// the font has no samples or no canonical sample was computed.
// ComputeCanonicalSamples must have been called first.
const TrainingSample* TrainingSampleSet::GetCanonicalSample(
    int font_id, int class_id) const {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  const FontClassInfo& fcinfo = (*font_class_array_)(font_index, class_id);
  // canonical_sample is -1 when no canonical sample exists.
  if (fcinfo.canonical_sample < 0) return NULL;
  return samples_[fcinfo.canonical_sample];
}
// Gets the max distance for the given canonical sample.
// ComputeCanonicalSamples must have been called first.
float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
ASSERT_HOST(font_class_array_ != NULL);
int font_index = font_id_map_.SparseToCompact(font_id);
if (font_index < 0) return 0.0f;
if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
return (*font_class_array_)(font_index, class_id).canonical_dist;
else
return 0.0f;
}
// Generates indexed features for all samples with the supplied feature_space.
void TrainingSampleSet::IndexFeatures(const IntFeatureSpace& feature_space) {
  const int num_samples = samples_.size();
  for (int i = 0; i < num_samples; ++i) {
    samples_[i]->IndexFeatures(feature_space);
  }
}
// Delete outlier samples with few features that are shared with others.
// IndexFeatures must have been called already.
// If debug is true, the deleted outliers (each followed by a neighboring
// sample for comparison) are rendered to outliers.png.
void TrainingSampleSet::DeleteOutliers(const IntFeatureSpace& feature_space,
                                       bool debug) {
  if (font_class_array_ == NULL)
    OrganizeByFontAndClass();
  Pixa* pixa = NULL;
  if (debug)
    pixa = pixaCreate(0);
  GenericVector<int> feature_counts;
  int fs_size = feature_space.Size();
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    for (int c = 0; c < unicharset_size_; ++c) {
      // Create a histogram of the features used by all samples of this
      // font/class combination.
      feature_counts.init_to_size(fs_size, 0);
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      int sample_count = fcinfo.samples.size();
      // Too few samples to judge outliers reliably.
      if (sample_count < kMinOutlierSamples)
        continue;
      for (int i = 0; i < sample_count; ++i) {
        int s = fcinfo.samples[i];
        const GenericVector<int>& features = samples_[s]->indexed_features();
        for (int f = 0; f < features.size(); ++f) {
          ++feature_counts[features[f]];
        }
      }
      for (int i = 0; i < sample_count; ++i) {
        int s = fcinfo.samples[i];
        const TrainingSample& sample = *samples_[s];
        const GenericVector<int>& features = sample.indexed_features();
        // A feature that has a histogram count of 1 is only used by this
        // sample, making it 'bad'. All others are 'good'.
        int good_features = 0;
        int bad_features = 0;
        for (int f = 0; f < features.size(); ++f) {
          if (feature_counts[features[f]] > 1)
            ++good_features;
          else
            ++bad_features;
        }
        // If more than 1/3 features are bad, then this is an outlier.
        if (bad_features * 2 > good_features) {
          // NOTE(review): this message is printed even when debug is false -
          // presumably intentional logging of deletions; confirm.
          tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
                  SampleToString(sample).string(),
                  good_features, bad_features);
          if (debug) {
            pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
            // Add the previous sample as well, so it is easier to see in
            // the output what is wrong with this sample.
            int t;
            if (i == 0)
              t = fcinfo.samples[1];
            else
              t = fcinfo.samples[i - 1];
            const TrainingSample &csample = *samples_[t];
            pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
          }
          // Mark the sample for deletion.
          KillSample(samples_[s]);
        }
      }
    }
  }
  // Truly delete all bad samples and renumber everything.
  DeleteDeadSamples();
  if (pixa != NULL) {
    Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
    pixaDestroy(&pixa);
    pixWrite("outliers.png", pix, IFF_PNG);
    pixDestroy(&pix);
  }
}
// Marks the given sample index for deletion.
// Deletion is actually completed by DeleteDeadSamples.
void TrainingSampleSet::KillSample(TrainingSample* sample) {
  // NOTE(review): this marks via sample_index, but DeleteableSample tests
  // class_id() < 0 - confirm that killing actually makes a sample
  // deleteable.
  sample->set_sample_index(-1);
}
// Deletes all samples with zero features marked by KillSample.
void TrainingSampleSet::DeleteDeadSamples() {
  // compact() removes the entries for which the callback returns true.
  samples_.compact(
      NewPermanentTessCallback(this, &TrainingSampleSet::DeleteableSample));
  num_raw_samples_ = samples_.size();
  // Samples must be re-organized now we have deleted a few.
}
// Callback function returns true if the given sample is to be deleted, due
// to having a negative classid.
bool TrainingSampleSet::DeleteableSample(const TrainingSample* sample) {
  // NOTE(review): KillSample sets sample_index(-1), not class_id - verify
  // which field is the intended deletion marker.
  return sample == NULL || sample->class_id() < 0;
}
// Prints the original (and, if available, mapped) features of the given
// sample to stdout and returns a rendering of the sample as a Pix.
static Pix* DebugSample(const UNICHARSET& unicharset,
                        TrainingSample* sample) {
  tprintf("\nOriginal features:\n");
  const int num_features = sample->num_features();
  for (int f = 0; f < num_features; ++f) {
    sample->features()[f].print();
  }
  if (sample->features_are_mapped()) {
    tprintf("\nMapped features:\n");
    const int num_mapped = sample->mapped_features().size();
    for (int f = 0; f < num_mapped; ++f) {
      tprintf("%d ", sample->mapped_features()[f]);
    }
    tprintf("\n");
  }
  return sample->RenderToPix(&unicharset);
}
// Construct an array to access the samples by font,class pair.
void TrainingSampleSet::OrganizeByFontAndClass() {
  // Font indexes are sparse, so we used a map to compact them, so we can
  // have an efficient 2-d array of fonts and character classes.
  SetupFontIdMap();
  int compact_font_size = font_id_map_.CompactSize();
  // Get a 2-d array of generic vectors.
  if (font_class_array_ != NULL)
    delete font_class_array_;
  FontClassInfo empty;
  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
      compact_font_size, unicharset_size_, empty);
  // Distribute each sample's global index into its font/class cell.
  for (int s = 0; s < samples_.size(); ++s) {
    int font_id = samples_[s]->font_id();
    int class_id = samples_[s]->class_id();
    // Print diagnostics before asserting so the bad ids are visible.
    if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
      tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
              font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
              s);
    }
    ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
    ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
    int font_index = font_id_map_.SparseToCompact(font_id);
    (*font_class_array_)(font_index, class_id).samples.push_back(s);
  }
  // Set the num_raw_samples member of the FontClassInfo, to set the boundary
  // between the raw samples and the replicated ones.
  for (int f = 0; f < compact_font_size; ++f) {
    for (int c = 0; c < unicharset_size_; ++c)
      (*font_class_array_)(f, c).num_raw_samples =
          (*font_class_array_)(f, c).samples.size();
  }
  // This is the global number of samples and also marks the boundary between
  // real and replicated samples.
  num_raw_samples_ = samples_.size();
}
// Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
// index for the font_class_array_.
void TrainingSampleSet::SetupFontIdMap() {
  // Histogram the samples by font_id to find which ids are actually used.
  GenericVector<int> font_counts;
  for (int s = 0; s < samples_.size(); ++s) {
    const int font_id = samples_[s]->font_id();
    // Grow the histogram on demand, as font ids are sparse.
    while (font_counts.size() <= font_id)
      font_counts.push_back(0);
    ++font_counts[font_id];
  }
  // Map only the ids that have at least one sample.
  font_id_map_.Init(font_counts.size(), false);
  for (int f = 0; f < font_counts.size(); ++f) {
    font_id_map_.SetMap(f, font_counts[f] > 0);
  }
  font_id_map_.Setup();
}
// Finds the sample for each font, class pair that has least maximum
// distance to all the other samples of the same font, class: the
// "canonical" sample. Also records each sample's max distance and,
// when debug is true, renders the globally worst pair to worstpair.png.
// OrganizeByFontAndClass must have been already called.
void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map,
                                                bool debug) {
  ASSERT_HOST(font_class_array_ != NULL);
  IntFeatureDist f_table;
  if (debug) tprintf("feature table size %d\n", map.sparse_size());
  f_table.Init(&map);
  // Track the single worst (most distant) pair over all fonts/classes.
  int worst_s1 = 0;
  int worst_s2 = 0;
  double global_worst_dist = 0.0;
  // Compute distances independently for each font and char index.
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    int font_id = font_id_map_.CompactToSparse(font_index);
    for (int c = 0; c < unicharset_size_; ++c) {
      int samples_found = 0;
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      if (fcinfo.samples.size() == 0 ||
          (kTestChar >= 0 && c != kTestChar)) {
        // No samples, or not the single class under study: mark as having
        // no canonical sample.
        fcinfo.canonical_sample = -1;
        fcinfo.canonical_dist = 0.0f;
        if (debug) tprintf("Skipping class %d\n", c);
        continue;
      }
      // The canonical sample will be the one with the min_max_dist, which
      // is the sample with the lowest maximum distance to all other samples.
      double min_max_dist = 2.0;
      // We keep track of the farthest apart pair (max_s1, max_s2) which
      // are max_max_dist apart, so we can see how bad the variability is.
      double max_max_dist = 0.0;
      int max_s1 = 0;
      int max_s2 = 0;
      fcinfo.canonical_sample = fcinfo.samples[0];
      fcinfo.canonical_dist = 0.0f;
      for (int i = 0; i < fcinfo.samples.size(); ++i) {
        int s1 = fcinfo.samples[i];
        const GenericVector<int>& features1 = samples_[s1]->indexed_features();
        f_table.Set(features1, features1.size(), true);
        double max_dist = 0.0;
        // Run the full squared-order search for similar samples. It is still
        // reasonably fast because f_table.FeatureDistance is fast, but we
        // may have to reconsider if we start playing with too many samples
        // of a single char/font.
        for (int j = 0; j < fcinfo.samples.size(); ++j) {
          int s2 = fcinfo.samples[j];
          if (samples_[s2]->class_id() != c ||
              samples_[s2]->font_id() != font_id ||
              s2 == s1)
            continue;
          // Bind by const reference: the original code copied the whole
          // feature vector on every inner iteration of this O(n^2) loop.
          const GenericVector<int>& features2 =
              samples_[s2]->indexed_features();
          double dist = f_table.FeatureDistance(features2);
          int height = samples_[s2]->geo_feature(GeoTop) -
              samples_[s2]->geo_feature(GeoBottom);
          if (dist == 1.0 && height > 64) {
            // TODO(rays) rethink this when the polygonal approximation goes.
            // Currently it is possible for dots and other small characters
            // to be completely different, even within the same class.
            f_table.DebugFeatureDistance(features2);
          }
          if (dist > max_dist) {
            max_dist = dist;
            if (dist > max_max_dist) {
              max_s1 = s1;
              max_s2 = s2;
            }
          }
        }
        // Using Set(..., false) is far faster than re initializing, due to
        // the sparseness of the feature space.
        f_table.Set(features1, features1.size(), false);
        samples_[s1]->set_max_dist(max_dist);
        ++samples_found;
        if (max_dist < min_max_dist) {
          fcinfo.canonical_sample = s1;
          fcinfo.canonical_dist = max_dist;
        }
        UpdateRange(max_dist, &min_max_dist, &max_max_dist);
      }
      if (max_max_dist > global_worst_dist) {
        // Keep a record of the worst pair over all characters/fonts too.
        global_worst_dist = max_max_dist;
        worst_s1 = max_s1;
        worst_s2 = max_s2;
      }
      if (debug) {
        tprintf("Found %d samples of class %d=%s, font %d, "
                "dist range [%g, %g], worst pair= %s, %s\n",
                samples_found, c, unicharset_.debug_str(c).string(),
                font_index, min_max_dist, max_max_dist,
                SampleToString(*samples_[max_s1]).string(),
                SampleToString(*samples_[max_s2]).string());
      }
    }
  }
  if (debug) {
    tprintf("Global worst dist = %g, between sample %d and %d\n",
            global_worst_dist, worst_s1, worst_s2);
    Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
    Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
    pixOr(pix1, pix1, pix2);
    pixWrite("worstpair.png", pix1, IFF_PNG);
    pixDestroy(&pix1);
    pixDestroy(&pix2);
  }
}
// Replicates the samples to a minimum frequency defined by
// 2 * kSampleRandomSize, or for larger counts duplicates all samples.
// After replication, the replicated samples are perturbed slightly, but
// in a predictable and repeatable way.
// Use after OrganizeByFontAndClass().
void TrainingSampleSet::ReplicateAndRandomizeSamples() {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    for (int c = 0; c < unicharset_size_; ++c) {
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      int sample_count = fcinfo.samples.size();
      // min_samples is at least 2 * sample_count, so every non-empty
      // font/class cell is at least doubled.
      int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
      if (sample_count > 0 && sample_count < min_samples) {
        int base_count = sample_count;
        // Cycle through the original samples, appending a perturbed copy of
        // each until min_samples is reached.
        for (int base_index = 0; sample_count < min_samples; ++sample_count) {
          int src_index = fcinfo.samples[base_index++];
          if (base_index >= base_count) base_index = 0;
          // The argument presumably selects a deterministic perturbation
          // variant - see RandomizedCopy for its exact meaning.
          TrainingSample* sample = samples_[src_index]->RandomizedCopy(
              sample_count % kSampleRandomSize);
          int sample_index = samples_.size();
          sample->set_sample_index(sample_index);
          samples_.push_back(sample);
          fcinfo.samples.push_back(sample_index);
        }
      }
    }
  }
}
// Caches the indexed features of the canonical samples.
// ComputeCanonicalSamples must have been already called.
// TODO(rays) see note on ReliablySeparable and try restricting the
// canonical features to those that truly represent all samples.
void TrainingSampleSet::ComputeCanonicalFeatures() {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    int font_id = font_id_map_.CompactToSparse(font_index);
    for (int c = 0; c < unicharset_size_; ++c) {
      int num_samples = NumClassSamples(font_id, c, false);
      if (num_samples == 0)
        continue;
      const TrainingSample* sample = GetCanonicalSample(font_id, c);
      // Guard against a missing canonical sample (eg a class skipped by
      // ComputeCanonicalSamples under kTestChar) instead of dereferencing
      // NULL.
      if (sample == NULL)
        continue;
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      fcinfo.canonical_features = sample->indexed_features();
    }
  }
}
// Computes the combined set of features used by all the samples of each
// font/class combination. Use after ReplicateAndRandomizeSamples.
void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
ASSERT_HOST(font_class_array_ != NULL);
int font_size = font_id_map_.CompactSize();
for (int font_index = 0; font_index < font_size; ++font_index) {
int font_id = font_id_map_.CompactToSparse(font_index);
for (int c = 0; c < unicharset_size_; ++c) {
int num_samples = NumClassSamples(font_id, c, false);
if (num_samples == 0)
continue;
FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
fcinfo.cloud_features.Init(feature_space_size);
for (int s = 0; s < num_samples; ++s) {
const TrainingSample* sample = GetSample(font_id, c, s);
const GenericVector<int>& sample_features = sample->indexed_features();
for (int i = 0; i < sample_features.size(); ++i)
fcinfo.cloud_features.SetBit(sample_features[i]);
}
}
}
}
// Adds all fonts of the given class to the shape.
void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
  const int num_fonts = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < num_fonts; ++font_index) {
    // Translate back from compact index to the real (sparse) font id.
    shape->AddToShape(class_id, font_id_map_.CompactToSparse(font_index));
  }
}
// Display the samples with the given indexed feature that also match
// the given shape.
void TrainingSampleSet::DisplaySamplesWithFeature(int f_index,
                                                  const Shape& shape,
                                                  const IntFeatureSpace& space,
                                                  ScrollView::Color color,
                                                  ScrollView* window) const {
  for (int s = 0; s < num_raw_samples(); ++s) {
    const TrainingSample* sample = GetSample(s);
    if (!shape.ContainsUnichar(sample->class_id()))
      continue;
    // Re-index this sample's features and display it once per occurrence
    // of the requested feature index.
    GenericVector<int> indexed_features;
    space.IndexAndSortFeatures(sample->features(), sample->num_features(),
                               &indexed_features);
    for (int i = 0; i < indexed_features.size(); ++i) {
      if (indexed_features[i] == f_index)
        sample->DisplayFeatures(color, window);
    }
  }
}
} // namespace tesseract.

View File

@ -0,0 +1,290 @@
// Copyright 2010 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__
#define TESSERACT_TRAINING_TRAININGSAMPLESET_H__
#include "bitvector.h"
#include "genericvector.h"
#include "indexmapbidi.h"
#include "matrix.h"
#include "shapetable.h"
#include "trainingsample.h"
class UNICHARSET;
template <typename T> class UnicityTable;
namespace tesseract {
struct FontInfo;
class IntFeatureMap;
class IntFeatureSpace;
class TrainingSample;
class UnicharAndFonts;
// Collection of TrainingSample used for training or testing a classifier.
// Provides several useful methods to operate on the collection as a whole,
// including outlier detection and deletion, providing access by font and
// class, finding the canonical sample, finding the "cloud" features (OR of
// all features in all samples), replication of samples, caching of distance
// metrics.
class TrainingSampleSet {
 public:
  // The fontinfo_table is held by reference and must outlive this object.
  // (It lives in MasterTrainer; see fontinfo_table_ below. Not serialized.)
  explicit TrainingSampleSet(const UnicityTable<FontInfo>& fontinfo_table);
  ~TrainingSampleSet();

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Accessors.
  // Total number of samples, including any replicated/randomized copies.
  int num_samples() const {
    return samples_.size();
  }
  // Number of samples before replication/randomization.
  int num_raw_samples() const {
    return num_raw_samples_;
  }
  // Number of distinct fonts seen in the samples (sparse font-id space).
  int NumFonts() const {
    return font_id_map_.SparseSize();
  }
  // Local character set built up by AddSample/LoadUnicharset.
  const UNICHARSET& unicharset() const {
    return unicharset_;
  }
  // Size of the unicharset to which the 2-d per-class arrays refer.
  int charsetsize() const {
    return unicharset_size_;
  }

  // Loads an initial unicharset, or sets one up if the file cannot be read.
  void LoadUnicharset(const char* filename);

  // Adds a character sample to this sample set.
  // If the unichar is not already in the local unicharset, it is added.
  // Returns the unichar_id of the added sample, from the local unicharset.
  int AddSample(const char* unichar, TrainingSample* sample);
  // Adds a character sample to this sample set with the given unichar_id,
  // which must correspond to the local unicharset (in this).
  void AddSample(int unichar_id, TrainingSample* sample);

  // Returns the number of samples for the given font,class pair.
  // If randomize is true, returns the number of samples accessible
  // with randomizing on. (Increases the number of samples if small.)
  // OrganizeByFontAndClass must have been already called.
  int NumClassSamples(int font_id, int class_id, bool randomize) const;

  // Gets a sample by its index.
  const TrainingSample* GetSample(int index) const;

  // Gets a sample by its font, class, index.
  // OrganizeByFontAndClass must have been already called.
  const TrainingSample* GetSample(int font_id, int class_id, int index) const;

  // Get a sample by its font, class, index. Does not randomize.
  // OrganizeByFontAndClass must have been already called.
  TrainingSample* MutableSample(int font_id, int class_id, int index);

  // Returns a string debug representation of the given sample:
  // font, unichar_str, bounding box, page.
  STRING SampleToString(const TrainingSample& sample) const;

  // Gets the combined set of features used by all the samples of the given
  // font/class combination. (Valid after ComputeCloudFeatures.)
  const BitVector& GetCloudFeatures(int font_id, int class_id) const;
  // Gets the indexed features of the canonical sample of the given
  // font/class combination. (Valid after ComputeCanonicalFeatures.)
  const GenericVector<int>& GetCanonicalFeatures(int font_id,
                                                 int class_id) const;

  // Returns the distance between the given UniCharAndFonts pair.
  // If matched_fonts, only matching fonts, are considered, unless that yields
  // the empty set.
  // OrganizeByFontAndClass must have been already called.
  float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2,
                        bool matched_fonts, const IntFeatureMap& feature_map);

  // Returns the distance between the given pair of font/class pairs.
  // Finds in cache or computes and caches.
  // OrganizeByFontAndClass must have been already called.
  float ClusterDistance(int font_id1, int class_id1,
                        int font_id2, int class_id2,
                        const IntFeatureMap& feature_map);

  // Computes the distance between the given pair of font/class pairs,
  // bypassing the cache.
  float ComputeClusterDistance(int font_id1, int class_id1,
                               int font_id2, int class_id2,
                               const IntFeatureMap& feature_map) const;

  // Returns the number of canonical features of font/class 2 for which
  // neither the feature nor any of its near neighbors occurs in the cloud
  // of font/class 1. Each such feature is a reliable separation between
  // the classes, ASSUMING that the canonical sample is sufficiently
  // representative that every sample has a feature near that particular
  // feature. To check that this is so on the fly would be prohibitively
  // expensive, but it might be possible to pre-qualify the canonical features
  // to include only those for which this assumption is true.
  // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
  // first, or the results will be nonsense.
  int ReliablySeparable(int font_id1, int class_id1,
                        int font_id2, int class_id2,
                        const IntFeatureMap& feature_map,
                        bool thorough) const;

  // Returns the total index of the requested sample.
  // OrganizeByFontAndClass must have been already called.
  int GlobalSampleIndex(int font_id, int class_id, int index) const;

  // Gets the canonical sample for the given font, class pair.
  // ComputeCanonicalSamples must have been called first.
  const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
  // Gets the max distance for the given canonical sample.
  // ComputeCanonicalSamples must have been called first.
  float GetCanonicalDist(int font_id, int class_id) const;

  // Returns a mutable pointer to the sample with the given index.
  // Ownership is retained by this TrainingSampleSet.
  TrainingSample* mutable_sample(int index) {
    return samples_[index];
  }
  // Gets ownership of the sample with the given index, removing it from this.
  // The vacated slot is left NULL; the caller must delete the sample.
  TrainingSample* extract_sample(int index) {
    TrainingSample* sample = samples_[index];
    samples_[index] = NULL;
    return sample;
  }

  // Generates indexed features for all samples with the supplied feature_space.
  void IndexFeatures(const IntFeatureSpace& feature_space);

  // Delete outlier samples with few features that are shared with others.
  // IndexFeatures must have been called already.
  void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug);

  // Marks the given sample for deletion.
  // Deletion is actually completed by DeleteDeadSamples.
  void KillSample(TrainingSample* sample);

  // Deletes all samples with a negative sample index marked by KillSample.
  // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
  // must be called after as the samples have been renumbered.
  void DeleteDeadSamples();

  // Callback function returns true if the given sample is to be deleted, due
  // to having a negative classid.
  bool DeleteableSample(const TrainingSample* sample);

  // Construct an array to access the samples by font,class pair.
  void OrganizeByFontAndClass();

  // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
  // index for the font_class_array_.
  void SetupFontIdMap();

  // Finds the sample for each font, class pair that has least maximum
  // distance to all the other samples of the same font, class.
  // OrganizeByFontAndClass must have been already called.
  void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);

  // Replicates the samples to a minimum frequency defined by
  // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
  // After replication, the replicated samples are perturbed slightly, but
  // in a predictable and repeatable way.
  // Use after OrganizeByFontAndClass().
  void ReplicateAndRandomizeSamples();

  // Caches the indexed features of the canonical samples.
  // ComputeCanonicalSamples must have been already called.
  void ComputeCanonicalFeatures();
  // Computes the combined set of features used by all the samples of each
  // font/class combination. Use after ReplicateAndRandomizeSamples.
  void ComputeCloudFeatures(int feature_space_size);

  // Adds all fonts of the given class to the shape.
  void AddAllFontsForClass(int class_id, Shape* shape) const;

  // Display the samples with the given indexed feature that also match
  // the given shape.
  void DisplaySamplesWithFeature(int f_index, const Shape& shape,
                                 const IntFeatureSpace& feature_space,
                                 ScrollView::Color color,
                                 ScrollView* window) const;

 private:
  // Struct to store a triplet of unichar, font, distance in the distance cache.
  struct FontClassDistance {
    int unichar_id;
    int font_id;  // Real font id.
    float distance;
  };
  // Simple struct to store information related to each font/class combination.
  struct FontClassInfo {
    FontClassInfo();

    // Writes to the given file. Returns false in case of error.
    bool Serialize(FILE* fp) const;
    // Reads from the given file. Returns false in case of error.
    // If swap is true, assumes a big/little-endian swap is needed.
    bool DeSerialize(bool swap, FILE* fp);

    // Number of raw samples.
    inT32 num_raw_samples;
    // Index of the canonical sample.
    inT32 canonical_sample;
    // Max distance of the canonical sample from any other.
    float canonical_dist;
    // Sample indices for the samples, including replicated.
    GenericVector<inT32> samples;

    // Non-serialized cache data below this point.
    // Indexed features of the canonical sample.
    GenericVector<int> canonical_features;
    // The mapped features of all the samples.
    BitVector cloud_features;

    // Caches for ClusterDistance. -1 indicates an entry is not yet set.
    // Caches for other fonts but matching this unichar.
    // Indexed by compact font index from font_id_map_.
    GenericVector<float> font_distance_cache;
    // Caches for other unichars but matching this font. -1 indicates not set.
    GenericVector<float> unichar_distance_cache;
    // Cache for the rest (non matching font and unichar.)
    // A cache of distances computed by ReliablySeparable.
    GenericVector<FontClassDistance> distance_cache;
  };

  // All samples, owned by this set (PointerVector deletes on destruction).
  PointerVector<TrainingSample> samples_;
  // Number of samples before replication/randomization.
  int num_raw_samples_;
  // Character set we are training for.
  UNICHARSET unicharset_;
  // Character set size to which the 2-d arrays below refer.
  int unicharset_size_;
  // Map to allow the font_class_array_ below to be compact.
  // The sparse space is the real font_id, used in samples_ .
  // The compact space is an index to font_class_array_
  IndexMapBiDi font_id_map_;
  // A 2-d array of FontClassInfo holding information related to each
  // (font_id, class_id) pair. Built by OrganizeByFontAndClass.
  GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;
  // Reference to the fontinfo_table_ in MasterTrainer. Provides names
  // for font_ids in the samples. Not serialized!
  const UnicityTable<FontInfo>& fontinfo_table_;
};
} // namespace tesseract.
#endif  // TESSERACT_TRAINING_TRAININGSAMPLESET_H__