mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
Added simultaneous multi-language capability, Added support for ShapeTable in classifier and training, Refactored class pruner, Added new uniform classifier API, Added new training error counter
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@650 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
fdd4ffe85e
commit
5bc5e2a0b4
@ -7,13 +7,15 @@ AM_CPPFLAGS = \
|
||||
include_HEADERS = \
|
||||
adaptive.h baseline.h blobclass.h chartoname.h \
|
||||
classify.h cluster.h clusttool.h cutoffs.h \
|
||||
extern.h extract.h \
|
||||
errorcounter.h extern.h extract.h \
|
||||
featdefs.h flexfx.h float2int.h fpoint.h fxdefs.h \
|
||||
intfeaturedist.h intfeaturemap.h intfeaturespace.h \
|
||||
intfx.h intmatcher.h intproto.h kdtree.h \
|
||||
mf.h mfdefs.h mfoutline.h mfx.h \
|
||||
mastertrainer.h mf.h mfdefs.h mfoutline.h mfx.h \
|
||||
normfeat.h normmatch.h \
|
||||
ocrfeatures.h outfeat.h picofeat.h protos.h \
|
||||
speckle.h xform2d.h
|
||||
sampleiterator.h shapeclassifier.h shapetable.h \
|
||||
speckle.h tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_classify.la
|
||||
@ -32,12 +34,14 @@ endif
|
||||
libtesseract_classify_la_SOURCES = \
|
||||
adaptive.cpp adaptmatch.cpp blobclass.cpp \
|
||||
chartoname.cpp classify.cpp cluster.cpp clusttool.cpp cutoffs.cpp \
|
||||
extract.cpp \
|
||||
errorcounter.cpp extract.cpp \
|
||||
featdefs.cpp flexfx.cpp float2int.cpp fpoint.cpp fxdefs.cpp \
|
||||
intfeaturedist.cpp intfeaturemap.cpp intfeaturespace.cpp \
|
||||
intfx.cpp intmatcher.cpp intproto.cpp kdtree.cpp \
|
||||
mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \
|
||||
mastertrainer.cpp mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \
|
||||
normfeat.cpp normmatch.cpp \
|
||||
ocrfeatures.cpp outfeat.cpp picofeat.cpp protos.cpp \
|
||||
speckle.cpp xform2d.cpp
|
||||
sampleiterator.cpp shapetable.cpp speckle.cpp \
|
||||
tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp xform2d.cpp
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -112,11 +112,15 @@ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
|
||||
return;
|
||||
}
|
||||
|
||||
// label the features with a class name and font name
|
||||
fprintf (FeatureFile, "\n%s %s ", FontName, BlobText);
|
||||
if (ValidCharDescription(FeatureDefs, CharDesc)) {
|
||||
// label the features with a class name and font name
|
||||
fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
|
||||
|
||||
// write micro-features to file and clean up
|
||||
WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
|
||||
// write micro-features to file and clean up
|
||||
WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
|
||||
} else {
|
||||
tprintf("Blob learned was invalid!\n");
|
||||
}
|
||||
FreeCharDescription(CharDesc);
|
||||
|
||||
} // LearnBlob
|
||||
|
@ -17,49 +17,19 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "classify.h"
|
||||
#include "fontinfo.h"
|
||||
#include "intproto.h"
|
||||
#include "mfoutline.h"
|
||||
#include "scrollview.h"
|
||||
#include "shapetable.h"
|
||||
#include "unicity_table.h"
|
||||
#include <string.h>
|
||||
|
||||
namespace {
|
||||
|
||||
// Compare FontInfo structures.
|
||||
bool compare_fontinfo(const FontInfo& fi1, const FontInfo& fi2) {
|
||||
// The font properties are required to be the same for two font with the same
|
||||
// name, so there is no need to test them.
|
||||
// Consequently, querying the table with only its font name as information is
|
||||
// enough to retrieve its properties.
|
||||
return strcmp(fi1.name, fi2.name) == 0;
|
||||
}
|
||||
// Compare FontSet structures.
|
||||
bool compare_font_set(const FontSet& fs1, const FontSet& fs2) {
|
||||
if (fs1.size != fs2.size)
|
||||
return false;
|
||||
for (int i = 0; i < fs1.size; ++i) {
|
||||
if (fs1.configs[i] != fs2.configs[i])
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void delete_callback(FontInfo f) {
|
||||
if (f.spacing_vec != NULL) {
|
||||
f.spacing_vec->delete_data_pointers();
|
||||
delete f.spacing_vec;
|
||||
}
|
||||
delete[] f.name;
|
||||
}
|
||||
void delete_callback_fs(FontSet fs) {
|
||||
delete[] fs.configs;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace tesseract {
|
||||
Classify::Classify()
|
||||
: INT_MEMBER(tessedit_single_match, FALSE,
|
||||
: BOOL_MEMBER(prioritize_division, FALSE,
|
||||
"Prioritize blob division over chopping", this->params()),
|
||||
INT_MEMBER(tessedit_single_match, FALSE,
|
||||
"Top choice only from CP", this->params()),
|
||||
BOOL_MEMBER(classify_enable_learning, true,
|
||||
"Enable adaptive classifier", this->params()),
|
||||
@ -120,10 +90,6 @@ Classify::Classify()
|
||||
"Penalty to apply when a non-alnum is vertically out of "
|
||||
"its expected textline position",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_enable_int_fx, 1, "Enable integer fx",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_enable_new_adapt_rules, 1,
|
||||
"Enable new adaptation rules", this->params()),
|
||||
double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
|
||||
double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
|
||||
this->params()),
|
||||
@ -149,28 +115,29 @@ Classify::Classify()
|
||||
"One for the protos and one for the features.", this->params()),
|
||||
STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
|
||||
this->params()),
|
||||
INT_INIT_MEMBER(classify_class_pruner_threshold, 229,
|
||||
"Class Pruner Threshold 0-255: ", this->params()),
|
||||
INT_INIT_MEMBER(classify_class_pruner_multiplier, 30,
|
||||
"Class Pruner Multiplier 0-255: ", this->params()),
|
||||
INT_INIT_MEMBER(classify_cp_cutoff_strength, 7,
|
||||
"Class Pruner CutoffStrength: ", this->params()),
|
||||
INT_INIT_MEMBER(classify_integer_matcher_multiplier, 14,
|
||||
"Integer Matcher Multiplier 0-255: ", this->params()),
|
||||
INT_MEMBER(classify_class_pruner_threshold, 229,
|
||||
"Class Pruner Threshold 0-255", this->params()),
|
||||
INT_MEMBER(classify_class_pruner_multiplier, 30,
|
||||
"Class Pruner Multiplier 0-255: ", this->params()),
|
||||
INT_MEMBER(classify_cp_cutoff_strength, 7,
|
||||
"Class Pruner CutoffStrength: ", this->params()),
|
||||
INT_MEMBER(classify_integer_matcher_multiplier, 14,
|
||||
"Integer Matcher Multiplier 0-255: ", this->params()),
|
||||
EnableLearning(true),
|
||||
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_bln_numeric_mode, 0,
|
||||
"Assume the input is numbers [0-9].", this->params()),
|
||||
shape_table_(NULL),
|
||||
dict_(&image_) {
|
||||
fontinfo_table_.set_compare_callback(
|
||||
NewPermanentTessCallback(compare_fontinfo));
|
||||
NewPermanentTessCallback(CompareFontInfo));
|
||||
fontinfo_table_.set_clear_callback(
|
||||
NewPermanentTessCallback(delete_callback));
|
||||
NewPermanentTessCallback(FontInfoDeleteCallback));
|
||||
fontset_table_.set_compare_callback(
|
||||
NewPermanentTessCallback(compare_font_set));
|
||||
NewPermanentTessCallback(CompareFontSet));
|
||||
fontset_table_.set_clear_callback(
|
||||
NewPermanentTessCallback(delete_callback_fs));
|
||||
NewPermanentTessCallback(FontSetDeleteCallback));
|
||||
AdaptedTemplates = NULL;
|
||||
PreTrainedTemplates = NULL;
|
||||
AllProtosOn = NULL;
|
||||
@ -198,6 +165,9 @@ Classify::Classify()
|
||||
learn_debug_win_ = NULL;
|
||||
learn_fragmented_word_debug_win_ = NULL;
|
||||
learn_fragments_debug_win_ = NULL;
|
||||
|
||||
CharNormCutoffs = new uinT16[MAX_NUM_CLASSES];
|
||||
BaselineCutoffs = new uinT16[MAX_NUM_CLASSES];
|
||||
}
|
||||
|
||||
Classify::~Classify() {
|
||||
@ -205,6 +175,8 @@ Classify::~Classify() {
|
||||
delete learn_debug_win_;
|
||||
delete learn_fragmented_word_debug_win_;
|
||||
delete learn_fragments_debug_win_;
|
||||
delete[] CharNormCutoffs;
|
||||
delete[] BaselineCutoffs;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "classify.h"
|
||||
#include "dict.h"
|
||||
#include "featdefs.h"
|
||||
#include "fontinfo.h"
|
||||
#include "intfx.h"
|
||||
#include "intmatcher.h"
|
||||
#include "normalis.h"
|
||||
@ -42,6 +43,9 @@ static const int kBlankFontinfoId = -2;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct ShapeRating;
|
||||
class ShapeTable;
|
||||
|
||||
// How segmented is a blob. In this enum, character refers to a classifiable
|
||||
// unit, but that is too long and character is usually easier to understand.
|
||||
enum CharSegmentationType {
|
||||
@ -59,27 +63,41 @@ class Classify : public CCStruct {
|
||||
return dict_;
|
||||
}
|
||||
|
||||
// Set the denorm for classification. Takes a copy.
|
||||
void set_denorm(const DENORM* denorm) {
|
||||
denorm_ = *denorm;
|
||||
const ShapeTable* shape_table() const {
|
||||
return shape_table_;
|
||||
}
|
||||
|
||||
/* adaptive.cpp ************************************************************/
|
||||
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
|
||||
int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
|
||||
int ClassPruner(INT_TEMPLATES IntTemplates,
|
||||
inT16 NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
|
||||
CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
|
||||
CLASS_PRUNER_RESULTS Results);
|
||||
void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
|
||||
// Runs the class pruner from int_templates on the given features, returning
|
||||
// the number of classes output in results.
|
||||
// int_templates Class pruner tables
|
||||
// num_features Number of features in blob
|
||||
// features Array of features
|
||||
// normalization_factors (input) Array of int_templates->NumClasses fudge
|
||||
// factors from blob normalization process.
|
||||
// (Indexed by CLASS_INDEX)
|
||||
// expected_num_features (input) Array of int_templates->NumClasses
|
||||
// expected number of features for each class.
|
||||
// (Indexed by CLASS_INDEX)
|
||||
// results (output) Sorted Array of pruned classes.
|
||||
// Array must be sized to take the maximum possible
|
||||
// number of outputs : int_templates->NumClasses.
|
||||
int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
|
||||
int num_features,
|
||||
const INT_FEATURE_STRUCT* features,
|
||||
const uinT8* normalization_factors,
|
||||
const uinT16* expected_num_features,
|
||||
CP_RESULT_STRUCT* results);
|
||||
void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
|
||||
CLASS_CUTOFF_ARRAY Cutoffs);
|
||||
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
||||
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
||||
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
|
||||
/* normmatch.cpp ************************************************************/
|
||||
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
|
||||
FLOAT32 ComputeNormMatch(CLASS_ID ClassId,
|
||||
const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
|
||||
void FreeNormProtos();
|
||||
NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
|
||||
/* protos.cpp ***************************************************************/
|
||||
@ -88,6 +106,7 @@ class Classify : public CCStruct {
|
||||
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
|
||||
const UNICHARSET& target_unicharset);
|
||||
/* adaptmatch.cpp ***********************************************************/
|
||||
|
||||
// Learn the given word using its chopped_word, seam_array, denorm,
|
||||
// box_word, best_state, and correct_text to learn both correctly and
|
||||
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
|
||||
@ -111,36 +130,62 @@ class Classify : public CCStruct {
|
||||
const char* correct_text, WERD_RES *word);
|
||||
void InitAdaptiveClassifier(bool load_pre_trained_templates);
|
||||
void InitAdaptedClass(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
CLASS_ID ClassId,
|
||||
int FontinfoId,
|
||||
ADAPT_CLASS Class,
|
||||
ADAPT_TEMPLATES Templates);
|
||||
void AdaptToPunc(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
CLASS_ID ClassId,
|
||||
int FontinfoId,
|
||||
FLOAT32 Threshold);
|
||||
void AmbigClassifier(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
INT_TEMPLATES Templates,
|
||||
ADAPT_CLASS *Classes,
|
||||
UNICHAR_ID *Ambiguities,
|
||||
ADAPT_RESULTS *Results);
|
||||
void MasterMatcher(INT_TEMPLATES templates,
|
||||
inT16 num_features,
|
||||
INT_FEATURE_ARRAY features,
|
||||
CLASS_NORMALIZATION_ARRAY norm_factors,
|
||||
const INT_FEATURE_STRUCT* features,
|
||||
const uinT8* norm_factors,
|
||||
ADAPT_CLASS* classes,
|
||||
int debug,
|
||||
int num_classes,
|
||||
const TBOX& blob_box,
|
||||
CLASS_PRUNER_RESULTS results,
|
||||
ADAPT_RESULTS* final_results);
|
||||
void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
|
||||
// Converts configs to fonts, and if the result is not adapted, and a
|
||||
// shape_table_ is present, the shape is expanded to include all
|
||||
// unichar_ids represented, before applying a set of corrections to the
|
||||
// distance rating in int_result, (see ComputeCorrectedRating.)
|
||||
// The results are added to the final_results output.
|
||||
void ExpandShapesAndApplyCorrections(ADAPT_CLASS* classes,
|
||||
bool debug,
|
||||
int class_id,
|
||||
int bottom, int top,
|
||||
float cp_rating,
|
||||
int blob_length,
|
||||
const uinT8* cn_factors,
|
||||
INT_RESULT_STRUCT& int_result,
|
||||
ADAPT_RESULTS* final_results);
|
||||
// Applies a set of corrections to the distance im_rating,
|
||||
// including the cn_correction, miss penalty and additional penalty
|
||||
// for non-alnums being vertical misfits. Returns the corrected distance.
|
||||
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
|
||||
double im_rating, int feature_misses,
|
||||
int bottom, int top,
|
||||
int blob_length, const uinT8* cn_factors);
|
||||
void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
|
||||
ADAPT_RESULTS *Results,
|
||||
BLOB_CHOICE_LIST *Choices);
|
||||
void AddNewResult(ADAPT_RESULTS *results,
|
||||
CLASS_ID class_dd,
|
||||
CLASS_ID class_id,
|
||||
int shape_id,
|
||||
FLOAT32 rating,
|
||||
bool adapted,
|
||||
int config,
|
||||
int config2,
|
||||
int fontinfo_id,
|
||||
int fontinfo_id2);
|
||||
int GetAdaptiveFeatures(TBLOB *Blob,
|
||||
@ -149,9 +194,11 @@ class Classify : public CCStruct {
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void DebugAdaptiveClassifier(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
ADAPT_RESULTS *Results);
|
||||
#endif
|
||||
void GetAdaptThresholds (TWERD * Word,
|
||||
const DENORM& denorm,
|
||||
const WERD_CHOICE& BestChoice,
|
||||
const WERD_CHOICE& BestRawChoice,
|
||||
FLOAT32 Thresholds[]);
|
||||
@ -171,30 +218,64 @@ class Classify : public CCStruct {
|
||||
void MakePermanent(ADAPT_TEMPLATES Templates,
|
||||
CLASS_ID ClassId,
|
||||
int ConfigId,
|
||||
const DENORM& denorm,
|
||||
TBLOB *Blob);
|
||||
void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
|
||||
void RemoveExtraPuncs(ADAPT_RESULTS *Results);
|
||||
void RemoveBadMatches(ADAPT_RESULTS *Results);
|
||||
void SetAdaptiveThreshold(FLOAT32 Threshold);
|
||||
void ShowBestMatchFor(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
CLASS_ID ClassId,
|
||||
int shape_id,
|
||||
BOOL8 AdaptiveOn,
|
||||
BOOL8 PreTrainedOn);
|
||||
BOOL8 PreTrainedOn,
|
||||
ADAPT_RESULTS *Results);
|
||||
// Returns a string for the classifier class_id: either the corresponding
|
||||
// unicharset debug_str or the shape_table_ debug str.
|
||||
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
|
||||
int class_id, int config_id) const;
|
||||
// Converts a classifier class_id index with a config ID to:
|
||||
// shape_table_ present: a shape_table_ index OR
|
||||
// No shape_table_: a font ID.
|
||||
// Without shape training, each class_id, config pair represents a single
|
||||
// unichar id/font combination, so this function looks up the corresponding
|
||||
// font id.
|
||||
// With shape training, each class_id, config pair represents a single
|
||||
// shape table index, so the fontset_table stores the shape table index,
|
||||
// and the shape_table_ must be consulted to obtain the actual unichar_id/
|
||||
// font combinations that the shape represents.
|
||||
int ClassAndConfigIDToFontOrShapeID(int class_id,
|
||||
int int_result_config) const;
|
||||
// Converts a shape_table_ index to a classifier class_id index (not a
|
||||
// unichar-id!). Uses a search, so not fast.
|
||||
int ShapeIDToClassID(int shape_id) const;
|
||||
UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
ADAPT_TEMPLATES Templates,
|
||||
ADAPT_RESULTS *Results);
|
||||
int CharNormClassifier(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
INT_TEMPLATES Templates,
|
||||
ADAPT_RESULTS *Results);
|
||||
|
||||
// As CharNormClassifier, but operates on a TrainingSample and outputs to
|
||||
// a GenericVector of ShapeRating without conversion to classes.
|
||||
int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample,
|
||||
GenericVector<ShapeRating>* results);
|
||||
UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
CLASS_ID CorrectClass);
|
||||
void DoAdaptiveMatch(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
ADAPT_RESULTS *Results);
|
||||
void AdaptToChar(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
CLASS_ID ClassId,
|
||||
int FontinfoId,
|
||||
FLOAT32 Threshold);
|
||||
void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class);
|
||||
void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm,
|
||||
INT_CLASS_STRUCT* int_class);
|
||||
int AdaptableWord(TWERD *Word,
|
||||
const WERD_CHOICE &BestChoiceWord,
|
||||
const WERD_CHOICE &RawChoiceWord);
|
||||
@ -203,55 +284,53 @@ class Classify : public CCStruct {
|
||||
void SettupPass1();
|
||||
void SettupPass2();
|
||||
void AdaptiveClassifier(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
BLOB_CHOICE_LIST *Choices,
|
||||
CLASS_PRUNER_RESULTS cp_results);
|
||||
void ClassifyAsNoise(ADAPT_RESULTS *Results);
|
||||
void ResetAdaptiveClassifier();
|
||||
void ResetAdaptiveClassifierInternal();
|
||||
|
||||
int GetBaselineFeatures(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
INT_TEMPLATES Templates,
|
||||
INT_FEATURE_ARRAY IntFeatures,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
||||
uinT8* CharNormArray,
|
||||
inT32 *BlobLength);
|
||||
FLOAT32 GetBestRatingFor(TBLOB *Blob,
|
||||
CLASS_ID ClassId);
|
||||
int GetCharNormFeatures(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
INT_TEMPLATES Templates,
|
||||
INT_FEATURE_ARRAY IntFeatures,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
||||
uinT8* PrunerNormArray,
|
||||
uinT8* CharNormArray,
|
||||
inT32 *BlobLength,
|
||||
inT32 *FeatureOutlineIndex);
|
||||
int GetIntBaselineFeatures(TBLOB *Blob,
|
||||
INT_TEMPLATES Templates,
|
||||
INT_FEATURE_ARRAY IntFeatures,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
||||
inT32 *BlobLength);
|
||||
int GetIntCharNormFeatures(TBLOB *Blob,
|
||||
INT_TEMPLATES Templates,
|
||||
INT_FEATURE_ARRAY IntFeatures,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
||||
inT32 *BlobLength,
|
||||
inT32 *FeatureOutlineArray);
|
||||
// Computes the char_norm_array for the unicharset and, if not NULL, the
|
||||
// pruner_array as appropriate according to the existence of the shape_table.
|
||||
// The norm_feature is deleted as it is almost certainly no longer needed.
|
||||
void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
|
||||
INT_TEMPLATES_STRUCT* templates,
|
||||
uinT8* char_norm_array,
|
||||
uinT8* pruner_array);
|
||||
|
||||
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
|
||||
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
|
||||
void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob);
|
||||
|
||||
void ResetFeaturesHaveBeenExtracted();
|
||||
bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
|
||||
bool LooksLikeGarbage(TBLOB *blob);
|
||||
bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob);
|
||||
void RefreshDebugWindow(ScrollView **win, const char *msg,
|
||||
int y_offset, const TBOX &wbox);
|
||||
/* float2int.cpp ************************************************************/
|
||||
void ComputeIntCharNormArray(FEATURE NormFeature,
|
||||
INT_TEMPLATES Templates,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray);
|
||||
void ClearCharNormArray(uinT8* char_norm_array);
|
||||
void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
|
||||
uinT8* char_norm_array);
|
||||
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
|
||||
/* intproto.cpp *************************************************************/
|
||||
INT_TEMPLATES ReadIntTemplates(FILE *File);
|
||||
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
|
||||
const UNICHARSET& target_unicharset);
|
||||
CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
|
||||
bool* pretrained_on);
|
||||
bool* pretrained_on, int* shape_id);
|
||||
void ShowMatchDisplay();
|
||||
/* font detection ***********************************************************/
|
||||
UnicityTable<FontInfo>& get_fontinfo_table() {
|
||||
@ -271,6 +350,8 @@ class Classify : public CCStruct {
|
||||
// Member variables.
|
||||
|
||||
// Parameters.
|
||||
BOOL_VAR_H(prioritize_division, FALSE,
|
||||
"Prioritize blob division over chopping");
|
||||
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
|
||||
BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
|
||||
INT_VAR_H(classify_debug_level, 0, "Classify debug level");
|
||||
@ -313,8 +394,6 @@ class Classify : public CCStruct {
|
||||
double_VAR_H(classify_misfit_junk_penalty, 0.0,
|
||||
"Penalty to apply when a non-alnum is vertically out of "
|
||||
"its expected textline position");
|
||||
BOOL_VAR_H(classify_enable_int_fx, 1, "Enable integer fx");
|
||||
BOOL_VAR_H(classify_enable_new_adapt_rules, 1, "Enable new adaptation rules");
|
||||
double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
|
||||
double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
|
||||
double_VAR_H(tessedit_class_miss_scale, 0.00390625,
|
||||
@ -338,7 +417,7 @@ class Classify : public CCStruct {
|
||||
|
||||
/* intmatcher.cpp **********************************************************/
|
||||
INT_VAR_H(classify_class_pruner_threshold, 229,
|
||||
"Class Pruner Threshold 0-255: ");
|
||||
"Class Pruner Threshold 0-255");
|
||||
INT_VAR_H(classify_class_pruner_multiplier, 30,
|
||||
"Class Pruner Multiplier 0-255: ");
|
||||
INT_VAR_H(classify_cp_cutoff_strength, 7,
|
||||
@ -362,17 +441,27 @@ class Classify : public CCStruct {
|
||||
NORM_PROTOS *NormProtos;
|
||||
/* font detection ***********************************************************/
|
||||
UnicityTable<FontInfo> fontinfo_table_;
|
||||
// Without shape training, each class_id, config pair represents a single
|
||||
// unichar id/font combination, so each fontset_table_ entry holds font ids
|
||||
// for each config in the class.
|
||||
// With shape training, each class_id, config pair represents a single
|
||||
// shape_table_ index, so the fontset_table_ stores the shape_table_ index,
|
||||
// and the shape_table_ must be consulted to obtain the actual unichar_id/
|
||||
// font combinations that the shape represents.
|
||||
UnicityTable<FontSet> fontset_table_;
|
||||
|
||||
INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
|
||||
BOOL_VAR_H(classify_bln_numeric_mode, 0,
|
||||
"Assume the input is numbers [0-9].");
|
||||
|
||||
protected:
|
||||
IntegerMatcher im_;
|
||||
FEATURE_DEFS_STRUCT feature_defs_;
|
||||
// Must be set for the classifier to operate. Ususally set in
|
||||
// Tesseract::recog_word_recursive, being the main word-level entry point.
|
||||
DENORM denorm_;
|
||||
// If a shape_table_ is present, it is used to remap classifier output in
|
||||
// ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
|
||||
// mean an index to the shape_table_ and the choices returned are *all* the
|
||||
// shape_table_ entries at that index.
|
||||
ShapeTable* shape_table_;
|
||||
|
||||
private:
|
||||
|
||||
@ -401,8 +490,17 @@ class Classify : public CCStruct {
|
||||
INT_FEATURE_ARRAY CharNormFeatures;
|
||||
INT_FX_RESULT_STRUCT FXInfo;
|
||||
|
||||
CLASS_CUTOFF_ARRAY CharNormCutoffs;
|
||||
CLASS_CUTOFF_ARRAY BaselineCutoffs;
|
||||
// Expected number of features in the class pruner, used to penalize
|
||||
// unknowns that have too few features (like a c being classified as e) so
|
||||
// it doesn't recognize everything as '@' or '#'.
|
||||
// CharNormCutoffs is for the static classifier (with no shapetable).
|
||||
// BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
|
||||
// value in the adaptive classifier. Both are indexed by unichar_id.
|
||||
// shapetable_cutoffs_ provides a similar value for each shape in the
|
||||
// shape_table_
|
||||
uinT16* CharNormCutoffs;
|
||||
uinT16* BaselineCutoffs;
|
||||
GenericVector<uinT16> shapetable_cutoffs_;
|
||||
ScrollView* learn_debug_win_;
|
||||
ScrollView* learn_fragmented_word_debug_win_;
|
||||
ScrollView* learn_fragments_debug_win_;
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include "cluster.h"
|
||||
#include "emalloc.h"
|
||||
#include "helpers.h"
|
||||
#include "matrix.h"
|
||||
#include "tprintf.h"
|
||||
#include "danerror.h"
|
||||
#include "freelist.h"
|
||||
@ -137,7 +138,7 @@ const double FTable[FTABLE_Y][FTABLE_X] = {
|
||||
dimension of any feature. Since most features are calculated from numbers
|
||||
with a precision no better than 1 in 128, the variance should never be
|
||||
less than the square of this number for parameters whose range is 1. */
|
||||
#define MINVARIANCE 0.0001
|
||||
#define MINVARIANCE 0.0004
|
||||
|
||||
/* define the absolute minimum number of samples which must be present in
|
||||
order to accurately test hypotheses about underlying probability
|
||||
@ -145,7 +146,6 @@ const double FTable[FTABLE_Y][FTABLE_X] = {
|
||||
before a statistical analysis is attempted; this number should be
|
||||
equal to MINSAMPLES but can be set to a lower number for early testing
|
||||
when very few samples are available. */
|
||||
#define MINBUCKETS 5
|
||||
#define MINSAMPLESPERBUCKET 5
|
||||
#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
|
||||
#define MINSAMPLESNEEDED 1
|
||||
@ -222,7 +222,6 @@ static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
|
||||
/* define lookup tables used to compute the number of histogram buckets
|
||||
that should be used for a given number of samples. */
|
||||
#define LOOKUPTABLESIZE 8
|
||||
#define MAXBUCKETS 39
|
||||
#define MAXDEGREESOFFREEDOM MAXBUCKETS
|
||||
|
||||
static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
|
||||
@ -349,8 +348,7 @@ BOOL8 DistributionOK(BUCKETS *Buckets);
|
||||
|
||||
void FreeStatistics(STATISTICS *Statistics);
|
||||
|
||||
void FreeBuckets(CLUSTERER* clusterer,
|
||||
BUCKETS *Buckets);
|
||||
void FreeBuckets(BUCKETS *Buckets);
|
||||
|
||||
void FreeCluster(CLUSTER *Cluster);
|
||||
|
||||
@ -425,10 +423,11 @@ MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
|
||||
// allocate a kd tree to hold the samples
|
||||
Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
|
||||
|
||||
// keep a list of histogram buckets to minimize recomputing them
|
||||
Clusterer->bucket_cache[0] = NIL_LIST;
|
||||
Clusterer->bucket_cache[1] = NIL_LIST;
|
||||
Clusterer->bucket_cache[2] = NIL_LIST;
|
||||
// Initialize cache of histogram buckets to minimize recomputing them.
|
||||
for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
|
||||
for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
|
||||
Clusterer->bucket_cache[d][c] = NULL;
|
||||
}
|
||||
|
||||
return Clusterer;
|
||||
} // MakeClusterer
|
||||
@ -448,8 +447,8 @@ Exceptions: ALREADYCLUSTERED MakeSample can't be called after
|
||||
ClusterSamples has been called
|
||||
History: 5/29/89, DSJ, Created.
|
||||
*****************************************************************************/
|
||||
SAMPLE *
|
||||
MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID) {
|
||||
SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
|
||||
inT32 CharID) {
|
||||
SAMPLE *Sample;
|
||||
int i;
|
||||
|
||||
@ -537,9 +536,13 @@ void FreeClusterer(CLUSTERER *Clusterer) {
|
||||
FreeKDTree (Clusterer->KDTree);
|
||||
if (Clusterer->Root != NULL)
|
||||
FreeCluster (Clusterer->Root);
|
||||
iterate (Clusterer->ProtoList) {
|
||||
((PROTOTYPE *) (first_node (Clusterer->ProtoList)))->Cluster = NULL;
|
||||
// Free up all used buckets structures.
|
||||
for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
|
||||
for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
|
||||
if (Clusterer->bucket_cache[d][c] != NULL)
|
||||
FreeBuckets(Clusterer->bucket_cache[d][c]);
|
||||
}
|
||||
|
||||
memfree(Clusterer);
|
||||
}
|
||||
} // FreeClusterer
|
||||
@ -662,6 +665,8 @@ FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
|
||||
case uniform:
|
||||
case D_random:
|
||||
return (Proto->Variance.Elliptical[Dimension]);
|
||||
case DISTRIBUTION_COUNT:
|
||||
ASSERT_HOST(!"Distribution count not allowed!");
|
||||
}
|
||||
}
|
||||
return 0.0f;
|
||||
@ -1033,7 +1038,6 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
|
||||
Config->Confidence);
|
||||
break;
|
||||
}
|
||||
FreeBuckets(Clusterer, Buckets);
|
||||
FreeStatistics(Statistics);
|
||||
return Proto;
|
||||
} // MakePrototype
|
||||
@ -1339,10 +1343,6 @@ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
|
||||
FreePrototype(Proto);
|
||||
Proto = NULL;
|
||||
}
|
||||
if (UniformBuckets != NULL)
|
||||
FreeBuckets(Clusterer, UniformBuckets);
|
||||
if (RandomBuckets != NULL)
|
||||
FreeBuckets(Clusterer, RandomBuckets);
|
||||
return (Proto);
|
||||
} // MakeMixedProto
|
||||
|
||||
@ -1623,6 +1623,7 @@ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
|
||||
Proto->Distrib = NULL;
|
||||
|
||||
Proto->Significant = TRUE;
|
||||
Proto->Merged = FALSE;
|
||||
Proto->Style = spherical;
|
||||
Proto->NumSamples = Cluster->SampleCount;
|
||||
Proto->Cluster = Cluster;
|
||||
@ -1705,17 +1706,18 @@ BUCKETS *GetBuckets(CLUSTERER* clusterer,
|
||||
DISTRIBUTION Distribution,
|
||||
uinT32 SampleCount,
|
||||
FLOAT64 Confidence) {
|
||||
// search for an old bucket structure with the same number of buckets
|
||||
LIST *bucket_cache = clusterer->bucket_cache;
|
||||
// Get an old bucket structure with the same number of buckets.
|
||||
uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
|
||||
BUCKETS *Buckets = (BUCKETS *) first_node(search(
|
||||
bucket_cache[(int)Distribution], &NumberOfBuckets,
|
||||
NumBucketsMatch));
|
||||
BUCKETS *Buckets =
|
||||
clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
|
||||
|
||||
// if a matching bucket structure is found, delete it from the list
|
||||
if (Buckets != NULL) {
|
||||
bucket_cache[(int) Distribution] =
|
||||
delete_d(bucket_cache[(int) Distribution], Buckets, ListEntryMatch);
|
||||
// If a matching bucket structure is not found, make one and save it.
|
||||
if (Buckets == NULL) {
|
||||
Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
|
||||
clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
|
||||
Buckets;
|
||||
} else {
|
||||
// Just adjust the existing buckets.
|
||||
if (SampleCount != Buckets->SampleCount)
|
||||
AdjustBuckets(Buckets, SampleCount);
|
||||
if (Confidence != Buckets->Confidence) {
|
||||
@ -1725,9 +1727,6 @@ BUCKETS *GetBuckets(CLUSTERER* clusterer,
|
||||
Confidence);
|
||||
}
|
||||
InitBuckets(Buckets);
|
||||
} else {
|
||||
// otherwise create a new structure
|
||||
Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
|
||||
}
|
||||
return Buckets;
|
||||
} // GetBuckets
|
||||
@ -1770,14 +1769,14 @@ BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
|
||||
BOOL8 Symmetrical;
|
||||
|
||||
// allocate memory needed for data structure
|
||||
Buckets = (BUCKETS *) Emalloc(sizeof(BUCKETS));
|
||||
Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
|
||||
Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
|
||||
Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
|
||||
Buckets->SampleCount = SampleCount;
|
||||
Buckets->Confidence = Confidence;
|
||||
Buckets->Count =
|
||||
(uinT32 *) Emalloc(Buckets->NumberOfBuckets * sizeof (uinT32));
|
||||
Buckets->ExpectedCount =
|
||||
(FLOAT32 *) Emalloc(Buckets->NumberOfBuckets * sizeof (FLOAT32));
|
||||
Buckets->Count = reinterpret_cast<uinT32*>(
|
||||
Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
|
||||
Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
|
||||
Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
|
||||
|
||||
// initialize simple fields
|
||||
Buckets->Distribution = Distribution;
|
||||
@ -2246,23 +2245,16 @@ void FreeStatistics(STATISTICS *Statistics) {
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
void FreeBuckets(CLUSTERER* clusterer, BUCKETS *buckets) {
|
||||
void FreeBuckets(BUCKETS *buckets) {
|
||||
/*
|
||||
** Parameters:
|
||||
** clusterer->bucket_cache
|
||||
** distribution-indexed cache of old bucket structures.
|
||||
** buckets pointer to data structure to be freed
|
||||
** Operation:
|
||||
** This routine places the specified histogram data structure
|
||||
** at the front of a list of histograms so that it can be reused
|
||||
** later if necessary. A separate list is maintained for each
|
||||
** different type of distribution.
|
||||
** This routine properly frees the memory used by a BUCKETS.
|
||||
*/
|
||||
LIST *bucket_cache = clusterer->bucket_cache;
|
||||
if (buckets != NULL) {
|
||||
int dist = (int)buckets->Distribution;
|
||||
bucket_cache[dist] = (LIST) push(bucket_cache[dist], buckets);
|
||||
}
|
||||
Efree(buckets->Count);
|
||||
Efree(buckets->ExpectedCount);
|
||||
Efree(buckets);
|
||||
} // FreeBuckets
|
||||
|
||||
|
||||
@ -2640,8 +2632,10 @@ CLUSTER * Cluster, FLOAT32 MaxIllegal)
|
||||
}
|
||||
NumCharInCluster--;
|
||||
PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
|
||||
if (PercentIllegal > MaxIllegal)
|
||||
if (PercentIllegal > MaxIllegal) {
|
||||
destroy(SearchState);
|
||||
return (TRUE);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (FALSE);
|
||||
@ -2652,17 +2646,10 @@ CLUSTER * Cluster, FLOAT32 MaxIllegal)
|
||||
// The return value is the sum of norms of the off-diagonal terms of the
|
||||
// product of a and inv. (A measure of the error.)
|
||||
double InvertMatrix(const float* input, int size, float* inv) {
|
||||
double** U; // The upper triangular array.
|
||||
double* Umem;
|
||||
double** U_inv; // The inverse of U.
|
||||
double* U_invmem;
|
||||
double** L; // The lower triangular array.
|
||||
double* Lmem;
|
||||
|
||||
// Allocate memory for the 2D arrays.
|
||||
ALLOC_2D_ARRAY(size, size, Umem, U, double);
|
||||
ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double);
|
||||
ALLOC_2D_ARRAY(size, size, Lmem, L, double);
|
||||
GENERIC_2D_ARRAY<double> U(size, size, 0.0);
|
||||
GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
|
||||
GENERIC_2D_ARRAY<double> L(size, size, 0.0);
|
||||
|
||||
// Initialize the working matrices. U starts as input, L as I and U_inv as O.
|
||||
int row;
|
||||
|
@ -21,6 +21,11 @@
|
||||
#include "kdtree.h"
|
||||
#include "oldlist.h"
|
||||
|
||||
struct BUCKETS;
|
||||
|
||||
#define MINBUCKETS 5
|
||||
#define MAXBUCKETS 39
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
Types
|
||||
----------------------------------------------------------------------*/
|
||||
@ -51,7 +56,7 @@ typedef struct { // parameters to control clustering
|
||||
} CLUSTERCONFIG;
|
||||
|
||||
typedef enum {
|
||||
normal, uniform, D_random
|
||||
normal, uniform, D_random, DISTRIBUTION_COUNT
|
||||
} DISTRIBUTION;
|
||||
|
||||
typedef union {
|
||||
@ -86,7 +91,8 @@ typedef struct {
|
||||
CLUSTER *Root; // ptr to root cluster of cluster tree
|
||||
LIST ProtoList; // list of prototypes
|
||||
inT32 NumChar; // # of characters represented by samples
|
||||
LIST bucket_cache[3]; // cache of reusable histograms by distribution type
|
||||
// cache of reusable histograms by distribution type and number of buckets.
|
||||
BUCKETS* bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
|
||||
} CLUSTERER;
|
||||
|
||||
typedef struct {
|
||||
@ -103,7 +109,7 @@ typedef struct {
|
||||
--------------------------------------------------------------------------*/
|
||||
CLUSTERER *MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]);
|
||||
|
||||
SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID);
|
||||
SAMPLE *MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, inT32 CharID);
|
||||
|
||||
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
|
||||
|
||||
|
@ -213,6 +213,8 @@ PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
|
||||
Proto->Magnitude.Elliptical[i] = 1.0 /
|
||||
(2.0 * Proto->Variance.Elliptical[i]);
|
||||
break;
|
||||
case DISTRIBUTION_COUNT:
|
||||
ASSERT_HOST(!"Distribution count not allowed!");
|
||||
}
|
||||
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
|
||||
}
|
||||
@ -374,6 +376,8 @@ void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto) {
|
||||
case D_random:
|
||||
fprintf (File, " %9s", "random");
|
||||
break;
|
||||
case DISTRIBUTION_COUNT:
|
||||
ASSERT_HOST(!"Distribution count not allowed!");
|
||||
}
|
||||
fprintf (File, "\n\t");
|
||||
WriteNFloats (File, N, Proto->Variance.Elliptical);
|
||||
@ -392,13 +396,10 @@ Return: None
|
||||
Exceptions: None
|
||||
History: 6/6/89, DSJ, Created.
|
||||
****************************************************************************/
|
||||
void
|
||||
WriteNFloats (FILE * File, uinT16 N, FLOAT32 Array[]) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
fprintf (File, " %9.6f", Array[i]);
|
||||
fprintf (File, "\n");
|
||||
void WriteNFloats(FILE * File, uinT16 N, FLOAT32 Array[]) {
|
||||
for (int i = 0; i < N; i++)
|
||||
fprintf(File, " %9.6f", Array[i]);
|
||||
fprintf(File, "\n");
|
||||
} // WriteNFloats
|
||||
|
||||
|
||||
@ -479,29 +480,3 @@ void WriteProtoList(
|
||||
}
|
||||
} /* WriteProtoList */
|
||||
|
||||
/** UniformRandomNumber ********************************************************
|
||||
Parameters: MMin lower range of uniform distribution
|
||||
MMax upper range of uniform distribution
|
||||
Globals: None
|
||||
Operation: This routine computes a random number which comes from a
|
||||
uniform distribution over the range from MMin to MMax.
|
||||
Return: Uniform random number
|
||||
Exceptions: None
|
||||
History: 6/6/89, DSJ, Created.
|
||||
*******************************************************************************/
|
||||
FLOAT32 UniformRandomNumber(FLOAT32 MMin, FLOAT32 MMax) {
|
||||
double fake_drand48();
|
||||
FLOAT32 RandomNumber;
|
||||
|
||||
RandomNumber = fake_drand48 ();
|
||||
return (MMin + (RandomNumber * (MMax - MMin)));
|
||||
} // UniformRandomNumber
|
||||
|
||||
|
||||
/** drand48 *************************************************************
|
||||
Cheap replacement for drand48 which is not available on the PC.
|
||||
**********************************************************************/
|
||||
|
||||
double fake_drand48() {
|
||||
return rand () / (RAND_MAX + 1.0);
|
||||
}
|
||||
|
@ -52,8 +52,6 @@ void WriteProtoList(
|
||||
BOOL8 WriteSigProtos,
|
||||
BOOL8 WriteInsigProtos);
|
||||
|
||||
FLOAT32 UniformRandomNumber(FLOAT32 MMin, FLOAT32 MMax);
|
||||
|
||||
//--------------Global Data Definitions and Declarations---------------------
|
||||
// define errors that can be trapped
|
||||
#define ILLEGALSAMPLESIZE 5000
|
||||
|
@ -39,7 +39,7 @@
|
||||
----------------------------------------------------------------------------**/
|
||||
/*---------------------------------------------------------------------------*/
|
||||
namespace tesseract {
|
||||
void Classify::ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
|
||||
void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
|
||||
CLASS_CUTOFF_ARRAY Cutoffs) {
|
||||
/*
|
||||
** Parameters:
|
||||
@ -59,6 +59,11 @@ void Classify::ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
|
||||
int Cutoff;
|
||||
int i;
|
||||
|
||||
if (shape_table_ != NULL) {
|
||||
if (!shapetable_cutoffs_.DeSerialize(swap, CutoffFile)) {
|
||||
tprintf("Error during read of shapetable pffmtable!\n");
|
||||
}
|
||||
}
|
||||
for (i = 0; i < MAX_NUM_CLASSES; i++)
|
||||
Cutoffs[i] = MAX_CUTOFF;
|
||||
|
||||
|
385
classify/errorcounter.cpp
Normal file
385
classify/errorcounter.cpp
Normal file
@ -0,0 +1,385 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "errorcounter.h"
|
||||
|
||||
#include "fontinfo.h"
|
||||
#include "ndminx.h"
|
||||
#include "sampleiterator.h"
|
||||
#include "shapeclassifier.h"
|
||||
#include "shapetable.h"
|
||||
#include "trainingsample.h"
|
||||
#include "trainingsampleset.h"
|
||||
#include "unicity_table.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Tests a classifier, computing its error rate.
|
||||
// See errorcounter.h for description of arguments.
|
||||
// Iterates over the samples, calling the classifier in normal/silent mode.
|
||||
// If the classifier makes a CT_UNICHAR_TOPN_ERR error, and the appropriate
|
||||
// report_level is set (4 or greater), it will then call the classifier again
|
||||
// with a debug flag and a keep_this argument to find out what is going on.
|
||||
double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier,
|
||||
int report_level, CountTypes boosting_mode,
|
||||
const UnicityTable<FontInfo>& fontinfo_table,
|
||||
const GenericVector<Pix*>& page_images, SampleIterator* it,
|
||||
double* unichar_error, double* scaled_error, STRING* fonts_report) {
|
||||
int charsetsize = it->shape_table()->unicharset().size();
|
||||
int shapesize = it->CompactCharsetSize();
|
||||
int fontsize = it->sample_set()->NumFonts();
|
||||
ErrorCounter counter(charsetsize, shapesize, fontsize);
|
||||
GenericVector<ShapeRating> results;
|
||||
|
||||
clock_t start = clock();
|
||||
int total_samples = 0;
|
||||
double unscaled_error = 0.0;
|
||||
// Set a number of samples on which to run the classify debug mode.
|
||||
int error_samples = report_level > 3 ? report_level * report_level : 0;
|
||||
// Iterate over all the samples, accumulating errors.
|
||||
for (it->Begin(); !it->AtEnd(); it->Next()) {
|
||||
TrainingSample* mutable_sample = it->MutableSample();
|
||||
int page_index = mutable_sample->page_num();
|
||||
Pix* page_pix = 0 <= page_index && page_index < page_images.size()
|
||||
? page_images[page_index] : NULL;
|
||||
// No debug, no keep this.
|
||||
classifier->ClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID,
|
||||
&results);
|
||||
if (mutable_sample->class_id() == 0) {
|
||||
// This is junk so use the special counter.
|
||||
counter.AccumulateJunk(*it->shape_table(), results, mutable_sample);
|
||||
} else if (counter.AccumulateErrors(report_level > 3, boosting_mode,
|
||||
fontinfo_table, *it->shape_table(),
|
||||
results, mutable_sample) &&
|
||||
error_samples > 0) {
|
||||
// Running debug, keep the correct answer, and debug the classifier.
|
||||
tprintf("Error on sample %d: Classifier debug output:\n",
|
||||
it->GlobalSampleIndex());
|
||||
int keep_this = it->GetSparseClassID();
|
||||
classifier->ClassifySample(*mutable_sample, page_pix, 1, keep_this,
|
||||
&results);
|
||||
--error_samples;
|
||||
}
|
||||
++total_samples;
|
||||
}
|
||||
double total_time = 1.0 * (clock() - start) / CLOCKS_PER_SEC;
|
||||
// Create the appropriate error report.
|
||||
unscaled_error = counter.ReportErrors(report_level, boosting_mode,
|
||||
fontinfo_table,
|
||||
*it, unichar_error, fonts_report);
|
||||
if (scaled_error != NULL) *scaled_error = counter.scaled_error_;
|
||||
if (report_level > 1) {
|
||||
// It is useful to know the time in microseconds/char.
|
||||
tprintf("Errors computed in %.2fs at %.1f μs/char\n",
|
||||
total_time, 1000000.0 * total_time / total_samples);
|
||||
}
|
||||
return unscaled_error;
|
||||
}
|
||||
|
||||
// Constructor is private. Only anticipated use of ErrorCounter is via
|
||||
// the static ComputeErrorRate.
|
||||
ErrorCounter::ErrorCounter(int charsetsize, int shapesize, int fontsize)
|
||||
: scaled_error_(0.0), unichar_counts_(charsetsize, shapesize, 0) {
|
||||
Counts empty_counts;
|
||||
font_counts_.init_to_size(fontsize, empty_counts);
|
||||
}
|
||||
ErrorCounter::~ErrorCounter() {
|
||||
}
|
||||
|
||||
// Accumulates the errors from the classifier results on a single sample.
|
||||
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
|
||||
// boosting_mode selects the type of error to be used for boosting and the
|
||||
// is_error_ member of sample is set according to whether the required type
|
||||
// of error occurred. The font_table provides access to font properties
|
||||
// for error counting and shape_table is used to understand the relationship
|
||||
// between unichar_ids and shape_ids in the results
|
||||
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
|
||||
const UnicityTable<FontInfo>& font_table,
|
||||
const ShapeTable& shape_table,
|
||||
const GenericVector<ShapeRating>& results,
|
||||
TrainingSample* sample) {
|
||||
int num_results = results.size();
|
||||
int res_index = 0;
|
||||
bool debug_it = false;
|
||||
int font_id = sample->font_id();
|
||||
int unichar_id = sample->class_id();
|
||||
sample->set_is_error(false);
|
||||
if (num_results == 0) {
|
||||
// Reject. We count rejects as a separate category, but still mark the
|
||||
// sample as an error in case any training module wants to use that to
|
||||
// improve the classifier.
|
||||
sample->set_is_error(true);
|
||||
++font_counts_[font_id].n[CT_REJECT];
|
||||
} else if (shape_table.GetShape(results[0].shape_id).
|
||||
ContainsUnicharAndFont(unichar_id, font_id)) {
|
||||
++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT];
|
||||
// Unichar and font OK, but count if multiple unichars.
|
||||
if (shape_table.GetShape(results[0].shape_id).size() > 1)
|
||||
++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
|
||||
} else {
|
||||
// This is a top shape error.
|
||||
++font_counts_[font_id].n[CT_SHAPE_TOP_ERR];
|
||||
// Check to see if any font in the top choice has attributes that match.
|
||||
bool attributes_match = false;
|
||||
uinT32 font_props = font_table.get(font_id).properties;
|
||||
const Shape& shape = shape_table.GetShape(results[0].shape_id);
|
||||
for (int c = 0; c < shape.size() && !attributes_match; ++c) {
|
||||
for (int f = 0; f < shape[c].font_ids.size(); ++f) {
|
||||
if (font_table.get(shape[c].font_ids[f]).properties == font_props) {
|
||||
attributes_match = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO(rays) It is easy to add counters for individual font attributes
|
||||
// here if we want them.
|
||||
if (!attributes_match)
|
||||
++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
|
||||
if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true);
|
||||
// Find rank of correct unichar answer. (Ignoring the font.)
|
||||
while (res_index < num_results &&
|
||||
!shape_table.GetShape(results[res_index].shape_id).
|
||||
ContainsUnichar(unichar_id)) {
|
||||
++res_index;
|
||||
}
|
||||
if (res_index == 0) {
|
||||
// Unichar OK, but count if multiple unichars.
|
||||
if (shape_table.GetShape(results[res_index].shape_id).size() > 1) {
|
||||
++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
|
||||
}
|
||||
} else {
|
||||
// Count maps from unichar id to shape id.
|
||||
if (num_results > 0)
|
||||
++unichar_counts_(unichar_id, results[0].shape_id);
|
||||
// This is a unichar error.
|
||||
++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
|
||||
if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
|
||||
if (res_index >= MIN(2, num_results)) {
|
||||
// It is also a 2nd choice unichar error.
|
||||
++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
|
||||
if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
|
||||
}
|
||||
if (res_index >= num_results) {
|
||||
// It is also a top-n choice unichar error.
|
||||
++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
|
||||
if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
|
||||
debug_it = debug;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Compute mean number of return values and mean rank of correct answer.
|
||||
font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
|
||||
font_counts_[font_id].n[CT_RANK] += res_index;
|
||||
// If it was an error for boosting then sum the weight.
|
||||
if (sample->is_error()) {
|
||||
scaled_error_ += sample->weight();
|
||||
}
|
||||
if (debug_it) {
|
||||
tprintf("%d results for char %s font %d :",
|
||||
num_results, shape_table.unicharset().id_to_unichar(unichar_id),
|
||||
font_id);
|
||||
for (int i = 0; i < num_results; ++i) {
|
||||
tprintf(" %.3f/%.3f:%s",
|
||||
results[i].rating, results[i].font,
|
||||
shape_table.DebugStr(results[i].shape_id).string());
|
||||
}
|
||||
tprintf("\n");
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Accumulates counts for junk. Counts only whether the junk was correctly
|
||||
// rejected or not.
|
||||
void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table,
|
||||
const GenericVector<ShapeRating>& results,
|
||||
TrainingSample* sample) {
|
||||
// For junk we accept no answer, or an explicit shape answer matching the
|
||||
// class id of the sample.
|
||||
int num_results = results.size();
|
||||
int font_id = sample->font_id();
|
||||
int unichar_id = sample->class_id();
|
||||
if (num_results > 0 &&
|
||||
!shape_table.GetShape(results[0].shape_id).ContainsUnichar(unichar_id)) {
|
||||
// This is a junk error.
|
||||
++font_counts_[font_id].n[CT_ACCEPTED_JUNK];
|
||||
sample->set_is_error(true);
|
||||
// It counts as an error for boosting too so sum the weight.
|
||||
scaled_error_ += sample->weight();
|
||||
} else {
|
||||
// Correctly rejected.
|
||||
++font_counts_[font_id].n[CT_REJECTED_JUNK];
|
||||
sample->set_is_error(false);
|
||||
}
|
||||
}
|
||||
|
||||
// Creates a report of the error rate. The report_level controls the detail
|
||||
// that is reported to stderr via tprintf:
|
||||
// 0 -> no output.
|
||||
// >=1 -> bottom-line error rate.
|
||||
// >=3 -> font-level error rate.
|
||||
// boosting_mode determines the return value. It selects which (un-weighted)
|
||||
// error rate to return.
|
||||
// The fontinfo_table from MasterTrainer provides the names of fonts.
|
||||
// The it determines the current subset of the training samples.
|
||||
// If not NULL, the top-choice unichar error rate is saved in unichar_error.
|
||||
// If not NULL, the report string is saved in fonts_report.
|
||||
// (Ignoring report_level).
|
||||
double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
|
||||
const UnicityTable<FontInfo>& fontinfo_table,
|
||||
const SampleIterator& it,
|
||||
double* unichar_error,
|
||||
STRING* fonts_report) {
|
||||
// Compute totals over all the fonts and report individual font results
|
||||
// when required.
|
||||
Counts totals;
|
||||
int fontsize = font_counts_.size();
|
||||
for (int f = 0; f < fontsize; ++f) {
|
||||
// Accumulate counts over fonts.
|
||||
totals += font_counts_[f];
|
||||
STRING font_report;
|
||||
if (ReportString(font_counts_[f], &font_report)) {
|
||||
if (fonts_report != NULL) {
|
||||
*fonts_report += fontinfo_table.get(f).name;
|
||||
*fonts_report += ": ";
|
||||
*fonts_report += font_report;
|
||||
*fonts_report += "\n";
|
||||
}
|
||||
if (report_level > 2) {
|
||||
// Report individual font error rates.
|
||||
tprintf("%s: %s\n", fontinfo_table.get(f).name, font_report.string());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (report_level > 0) {
|
||||
// Report the totals.
|
||||
STRING total_report;
|
||||
if (ReportString(totals, &total_report)) {
|
||||
tprintf("TOTAL Scaled Err=%.4g%%, %s\n",
|
||||
scaled_error_ * 100.0, total_report.string());
|
||||
}
|
||||
// Report the worst substitution error only for now.
|
||||
if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) {
|
||||
const UNICHARSET& unicharset = it.shape_table()->unicharset();
|
||||
int charsetsize = unicharset.size();
|
||||
int shapesize = it.CompactCharsetSize();
|
||||
int worst_uni_id = 0;
|
||||
int worst_shape_id = 0;
|
||||
int worst_err = 0;
|
||||
for (int u = 0; u < charsetsize; ++u) {
|
||||
for (int s = 0; s < shapesize; ++s) {
|
||||
if (unichar_counts_(u, s) > worst_err) {
|
||||
worst_err = unichar_counts_(u, s);
|
||||
worst_uni_id = u;
|
||||
worst_shape_id = s;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (worst_err > 0) {
|
||||
tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
|
||||
worst_uni_id, unicharset.id_to_unichar(worst_uni_id),
|
||||
it.shape_table()->DebugStr(worst_shape_id).string(),
|
||||
worst_err, totals.n[CT_UNICHAR_TOP1_ERR],
|
||||
100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]);
|
||||
}
|
||||
}
|
||||
}
|
||||
double rates[CT_SIZE];
|
||||
if (!ComputeRates(totals, rates))
|
||||
return 0.0;
|
||||
// Set output values if asked for.
|
||||
if (unichar_error != NULL)
|
||||
*unichar_error = rates[CT_UNICHAR_TOP1_ERR];
|
||||
return rates[boosting_mode];
|
||||
}
|
||||
|
||||
// Sets the report string to a combined human and machine-readable report
|
||||
// string of the error rates.
|
||||
// Returns false if there is no data, leaving report unchanged.
|
||||
bool ErrorCounter::ReportString(const Counts& counts, STRING* report) {
|
||||
// Compute the error rates.
|
||||
double rates[CT_SIZE];
|
||||
if (!ComputeRates(counts, rates))
|
||||
return false;
|
||||
// Using %.4g%%, the length of the output string should exactly match the
|
||||
// length of the format string, but in case of overflow, allow for +eddd
|
||||
// on each number.
|
||||
const int kMaxExtraLength = 5; // Length of +eddd.
|
||||
// Keep this format string and the snprintf in sync with the CountTypes enum.
|
||||
const char* format_str = "ShapeErr=%.4g%%, FontAttr=%.4g%%, "
|
||||
"Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], "
|
||||
"Multi=%.4g%%, Rej=%.4g%%, "
|
||||
"Answers=%.3g, Rank=%.3g, "
|
||||
"OKjunk=%.4g%%, Badjunk=%.4g%%";
|
||||
int max_str_len = strlen(format_str) + kMaxExtraLength * (CT_SIZE - 1) + 1;
|
||||
char* formatted_str = new char[max_str_len];
|
||||
snprintf(formatted_str, max_str_len, format_str,
|
||||
rates[CT_SHAPE_TOP_ERR] * 100.0,
|
||||
rates[CT_FONT_ATTR_ERR] * 100.0,
|
||||
rates[CT_UNICHAR_TOP1_ERR] * 100.0,
|
||||
rates[CT_UNICHAR_TOP2_ERR] * 100.0,
|
||||
rates[CT_UNICHAR_TOPN_ERR] * 100.0,
|
||||
rates[CT_OK_MULTI_UNICHAR] * 100.0,
|
||||
rates[CT_REJECT] * 100.0,
|
||||
rates[CT_NUM_RESULTS],
|
||||
rates[CT_RANK],
|
||||
100.0 * rates[CT_REJECTED_JUNK],
|
||||
100.0 * rates[CT_ACCEPTED_JUNK]);
|
||||
*report = formatted_str;
|
||||
delete [] formatted_str;
|
||||
// Now append each field of counts with a tab in front so the result can
|
||||
// be loaded into a spreadsheet.
|
||||
for (int ct = 0; ct < CT_SIZE; ++ct)
|
||||
report->add_str_int("\t", counts.n[ct]);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Computes the error rates and returns in rates which is an array of size
|
||||
// CT_SIZE. Returns false if there is no data, leaving rates unchanged.
|
||||
bool ErrorCounter::ComputeRates(const Counts& counts, double rates[CT_SIZE]) {
|
||||
int ok_samples = counts.n[CT_SHAPE_TOP_CORRECT] + counts.n[CT_SHAPE_TOP_ERR] +
|
||||
counts.n[CT_REJECT];
|
||||
int junk_samples = counts.n[CT_REJECTED_JUNK] + counts.n[CT_ACCEPTED_JUNK];
|
||||
if (ok_samples == 0 && junk_samples == 0) {
|
||||
// There is no data.
|
||||
return false;
|
||||
}
|
||||
// Compute rates for normal chars.
|
||||
double denominator = static_cast<double>(MAX(ok_samples, 1));
|
||||
for (int ct = 0; ct <= CT_RANK; ++ct)
|
||||
rates[ct] = counts.n[ct] / denominator;
|
||||
// Compute rates for junk.
|
||||
denominator = static_cast<double>(MAX(junk_samples, 1));
|
||||
for (int ct = CT_REJECTED_JUNK; ct <= CT_ACCEPTED_JUNK; ++ct)
|
||||
rates[ct] = counts.n[ct] / denominator;
|
||||
return true;
|
||||
}
|
||||
|
||||
ErrorCounter::Counts::Counts() {
|
||||
memset(n, 0, sizeof(n[0]) * CT_SIZE);
|
||||
}
|
||||
// Adds other into this for computing totals.
|
||||
void ErrorCounter::Counts::operator+=(const Counts& other) {
|
||||
for (int ct = 0; ct < CT_SIZE; ++ct)
|
||||
n[ct] += other.n[ct];
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
|
||||
|
||||
|
||||
|
198
classify/errorcounter.h
Normal file
198
classify/errorcounter.h
Normal file
@ -0,0 +1,198 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
|
||||
#define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
|
||||
|
||||
#include "genericvector.h"
|
||||
#include "matrix.h"
|
||||
|
||||
struct Pix;
|
||||
template <typename T> class UnicityTable;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct FontInfo;
|
||||
class SampleIterator;
|
||||
class ShapeClassifier;
|
||||
class ShapeRating;
|
||||
class ShapeTable;
|
||||
class TrainingSample;
|
||||
|
||||
// Enumeration of the different types of error count.
|
||||
// Error counts work as follows:
|
||||
//
|
||||
// Ground truth is a valid unichar-id / font-id pair:
|
||||
// Number of classifier answers?
|
||||
// 0 >0
|
||||
// CT_REJECT BOTH unichar-id and font-id match top shape?
|
||||
// __________ yes! no
|
||||
// CT_SHAPE_TOP_CORRECT CT_SHAPE_TOP_ERR
|
||||
// | Font attributes match?
|
||||
// | yes! no
|
||||
// | | CT_FONT_ATTR_ERROR
|
||||
// | Top unichar-id matches?
|
||||
// | yes! no
|
||||
// Top shape-id has multiple unichars? CT_UNICHAR_TOP1_ERR
|
||||
// yes! no 2nd shape unichar id matches?
|
||||
// CT_OK_MULTI_UNICHAR ________ yes! no
|
||||
// ___________________ _____ CT_UNICHAR_TOP2_ERR
|
||||
// Any unichar-id matches?
|
||||
// yes! no
|
||||
// ______ CT_UNICHAR_TOPN_ERR
|
||||
// _________________
|
||||
// Note that multiple counts may be activated for a single sample!
|
||||
//
|
||||
// Ground truth is for a fragment/n-gram that is NOT in the unicharset.
|
||||
// This is called junk and is expected to be rejected:
|
||||
// Number of classifier answers?
|
||||
// 0 >0
|
||||
// CT_REJECTED_JUNK CT_ACCEPTED_JUNK
|
||||
//
|
||||
// Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
|
||||
// the mean rank of the correct result, counting from 0, and with an error
|
||||
// receiving the number of answers as the correct rank.
|
||||
//
|
||||
// Keep in sync with the ReportString function.
|
||||
enum CountTypes {
|
||||
CT_SHAPE_TOP_CORRECT, // Top shape id is actually correct.
|
||||
CT_SHAPE_TOP_ERR, // Top shape id is not correct.
|
||||
CT_FONT_ATTR_ERR, // Font attributes incorrect, ignoring unichar.
|
||||
CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id.
|
||||
CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id.
|
||||
CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id.
|
||||
CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others.
|
||||
CT_REJECT, // Classifier hates this.
|
||||
CT_NUM_RESULTS, // Number of answers produced.
|
||||
CT_RANK, // Rank of correct answer.
|
||||
CT_REJECTED_JUNK, // Junk that was correctly rejected.
|
||||
CT_ACCEPTED_JUNK, // Junk that was incorrectly classified otherwise.
|
||||
|
||||
CT_SIZE // Number of types for array sizing.
|
||||
};
|
||||
|
||||
// Class to encapsulate all the functionality and sub-structures required
|
||||
// to count errors for an isolated character classifier (ShapeClassifier).
|
||||
class ErrorCounter {
|
||||
public:
|
||||
// Computes and returns the unweighted boosting_mode error rate of the given
|
||||
// classifier. Can be used for testing, or inside an iterative training
|
||||
// system, including one that uses boosting.
|
||||
// report_levels:
|
||||
// 0 = no output.
|
||||
// 1 = bottom-line error rate.
|
||||
// 2 = bottom-line error rate + time.
|
||||
// 3 = font-level error rate + time.
|
||||
// 4 = list of all errors + short classifier debug output on 16 errors.
|
||||
// 5 = list of all errors + short classifier debug output on 25 errors.
|
||||
// * The boosting_mode determines which error type is used for computing the
|
||||
// scaled_error output, and setting the is_error flag in the samples.
|
||||
// * The fontinfo_table is used to get string font names for the debug
|
||||
// output, and also to count font attributes errors.
|
||||
// * The page_images vector may contain a Pix* (which may be NULL) for each
|
||||
// page index assigned to the samples.
|
||||
// * The it provides encapsulated iteration over some sample set.
|
||||
// * The outputs unichar_error, scaled_error and totals_report are all
|
||||
// optional.
|
||||
// * If not NULL, unichar error gets the top1 unichar error rate.
|
||||
// * Scaled_error gets the error chosen by boosting_mode weighted by the
|
||||
// weights on the samples.
|
||||
// * Fonts_report gets a string summarizing the error rates for each font in
|
||||
// both human-readable form and as a tab-separated list of error counts.
|
||||
// The human-readable form is all before the first tab.
|
||||
// * The return value is the un-weighted version of the scaled_error.
|
||||
static double ComputeErrorRate(ShapeClassifier* classifier,
|
||||
int report_level, CountTypes boosting_mode,
|
||||
const UnicityTable<FontInfo>& fontinfo_table,
|
||||
const GenericVector<Pix*>& page_images,
|
||||
SampleIterator* it,
|
||||
double* unichar_error,
|
||||
double* scaled_error,
|
||||
STRING* fonts_report);
|
||||
|
||||
private:
|
||||
// Simple struct to hold an array of counts.
|
||||
struct Counts {
|
||||
Counts();
|
||||
// Adds other into this for computing totals.
|
||||
void operator+=(const Counts& other);
|
||||
|
||||
int n[CT_SIZE];
|
||||
};
|
||||
|
||||
// Constructor is private. Only anticipated use of ErrorCounter is via
|
||||
// the static ComputeErrorRate.
|
||||
ErrorCounter(int charsetsize, int shapesize, int fontsize);
|
||||
~ErrorCounter();
|
||||
|
||||
// Accumulates the errors from the classifier results on a single sample.
|
||||
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
|
||||
// boosting_mode selects the type of error to be used for boosting and the
|
||||
// is_error_ member of sample is set according to whether the required type
|
||||
// of error occurred. The font_table provides access to font properties
|
||||
// for error counting and shape_table is used to understand the relationship
|
||||
// between unichar_ids and shape_ids in the results
|
||||
bool AccumulateErrors(bool debug, CountTypes boosting_mode,
|
||||
const UnicityTable<FontInfo>& font_table,
|
||||
const ShapeTable& shape_table,
|
||||
const GenericVector<ShapeRating>& results,
|
||||
TrainingSample* sample);
|
||||
|
||||
// Accumulates counts for junk. Counts only whether the junk was correctly
|
||||
// rejected or not.
|
||||
void AccumulateJunk(const ShapeTable& shape_table,
|
||||
const GenericVector<ShapeRating>& results,
|
||||
TrainingSample* sample);
|
||||
|
||||
// Creates a report of the error rate. The report_level controls the detail
|
||||
// that is reported to stderr via tprintf:
|
||||
// 0 -> no output.
|
||||
// >=1 -> bottom-line error rate.
|
||||
// >=3 -> font-level error rate.
|
||||
// boosting_mode determines the return value. It selects which (un-weighted)
|
||||
// error rate to return.
|
||||
// The fontinfo_table from MasterTrainer provides the names of fonts.
|
||||
// The it determines the current subset of the training samples.
|
||||
// If not NULL, the top-choice unichar error rate is saved in unichar_error.
|
||||
// If not NULL, the report string is saved in fonts_report.
|
||||
// (Ignoring report_level).
|
||||
double ReportErrors(int report_level, CountTypes boosting_mode,
|
||||
const UnicityTable<FontInfo>& fontinfo_table,
|
||||
const SampleIterator& it,
|
||||
double* unichar_error,
|
||||
STRING* fonts_report);
|
||||
|
||||
// Sets the report string to a combined human and machine-readable report
|
||||
// string of the error rates.
|
||||
// Returns false if there is no data, leaving report unchanged.
|
||||
static bool ReportString(const Counts& counts, STRING* report);
|
||||
|
||||
// Computes the error rates and returns in rates which is an array of size
|
||||
// CT_SIZE. Returns false if there is no data, leaving rates unchanged.
|
||||
static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]);
|
||||
|
||||
|
||||
// Total scaled error used by boosting algorithms.
|
||||
double scaled_error_;
|
||||
// Vector indexed by font_id from the samples of error accumulators.
|
||||
GenericVector<Counts> font_counts_;
|
||||
// Counts of the results that map each unichar_id (from samples) to an
|
||||
// incorrect shape_id.
|
||||
GENERIC_2D_ARRAY<int> unichar_counts_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */
|
@ -29,15 +29,6 @@ typedef CHAR_FEATURES (*CF_FUNC) ();
|
||||
-----------------------------------------------------------------------------*/
|
||||
void ExtractorStub();
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Global Data Definitions and Declarations
|
||||
-----------------------------------------------------------------------------*/
|
||||
/** tables to keep track of the different low level feature extractors */
|
||||
#define NUM_FX 3
|
||||
#define DEFAULT_FX 2
|
||||
|
||||
int CurrentFx = DEFAULT_FX;
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Public Code
|
||||
-----------------------------------------------------------------------------*/
|
||||
|
@ -30,64 +30,85 @@
|
||||
#define ILLEGAL_NUM_SETS 3001
|
||||
|
||||
#define PICO_FEATURE_LENGTH 0.05
|
||||
#define MAX_OUTLINE_FEATURES 100
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Global Data Definitions and Declarations
|
||||
-----------------------------------------------------------------------------*/
|
||||
/* define all of the parameters for the MicroFeature type*/
|
||||
StartParamDesc (MicroFeatureParams)
|
||||
DefineParam (0, 0, -0.5, 0.5)
|
||||
DefineParam (0, 0, -0.25, 0.75)
|
||||
DefineParam (0, 1, 0.0, 1.0)
|
||||
DefineParam (1, 0, 0.0, 1.0)
|
||||
const char* kMicroFeatureType = "mf";
|
||||
const char* kCNFeatureType = "cn";
|
||||
const char* kIntFeatureType = "if";
|
||||
const char* kGeoFeatureType = "tb";
|
||||
|
||||
// Define all of the parameters for the MicroFeature type.
|
||||
StartParamDesc(MicroFeatureParams)
|
||||
DefineParam(0, 0, -0.5, 0.5)
|
||||
DefineParam(0, 0, -0.25, 0.75)
|
||||
DefineParam(0, 1, 0.0, 1.0)
|
||||
DefineParam(1, 0, 0.0, 1.0)
|
||||
DefineParam (0, 1, -0.5, 0.5)
|
||||
DefineParam (0, 1, -0.5, 0.5)
|
||||
EndParamDesc
|
||||
/* now define the feature type itself (see features.h for info about each
|
||||
parameter).*/
|
||||
DefineFeature (MicroFeatureDesc, 5, 1, 4, 50, "Micro", "mf", MicroFeatureParams)
|
||||
// Now define the feature type itself (see features.h for parameters).
|
||||
DefineFeature(MicroFeatureDesc, 5, 1, kMicroFeatureType, MicroFeatureParams)
|
||||
|
||||
// define all of the parameters for the PicoFeature type
|
||||
/* define knob that can be used to adjust pico-feature length */
|
||||
FLOAT32 PicoFeatureLength = PICO_FEATURE_LENGTH;
|
||||
StartParamDesc (PicoFeatParams)
|
||||
DefineParam (0, 0, -0.25, 0.75)
|
||||
DefineParam (1, 0, 0.0, 1.0)
|
||||
DefineParam (0, 0, -0.5, 0.5)
|
||||
EndParamDesc
|
||||
/* now define the feature type itself (see features.h for info about each
|
||||
parameter).*/
|
||||
DefineFeature (PicoFeatDesc, 2, 1, 1, MAX_UINT8, "Pico", "pf", PicoFeatParams)
|
||||
|
||||
/* define all of the parameters for the NormFeat type*/
|
||||
// Define all of the parameters for the NormFeat type.
|
||||
StartParamDesc (CharNormParams)
|
||||
DefineParam (0, 0, -0.25, 0.75)
|
||||
DefineParam (0, 1, 0.0, 1.0)
|
||||
DefineParam (0, 0, 0.0, 1.0)
|
||||
DefineParam (0, 0, 0.0, 1.0)
|
||||
DefineParam(0, 0, -0.25, 0.75)
|
||||
DefineParam(0, 1, 0.0, 1.0)
|
||||
DefineParam(0, 0, 0.0, 1.0)
|
||||
DefineParam(0, 0, 0.0, 1.0)
|
||||
EndParamDesc
|
||||
/* now define the feature type itself (see features.h for info about each
|
||||
parameter).*/
|
||||
DefineFeature (CharNormDesc, 4, 0, 1, 1, "CharNorm", "cn", CharNormParams)
|
||||
// Now define the feature type itself (see features.h for parameters).
|
||||
DefineFeature(CharNormDesc, 4, 0, kCNFeatureType, CharNormParams)
|
||||
|
||||
// define all of the parameters for the OutlineFeature type
|
||||
StartParamDesc (OutlineFeatParams)
|
||||
DefineParam (0, 0, -0.5, 0.5)
|
||||
DefineParam (0, 0, -0.25, 0.75)
|
||||
DefineParam (0, 0, 0.0, 1.0)
|
||||
DefineParam (1, 0, 0.0, 1.0)
|
||||
// Define all of the parameters for the IntFeature type
|
||||
StartParamDesc(IntFeatParams)
|
||||
DefineParam(0, 0, 0.0, 255.0)
|
||||
DefineParam(0, 0, 0.0, 255.0)
|
||||
DefineParam(1, 0, 0.0, 255.0)
|
||||
EndParamDesc
|
||||
/* now define the feature type itself (see features.h for info about each
|
||||
parameter).*/
|
||||
DefineFeature (OutlineFeatDesc, 3, 1, 1, MAX_OUTLINE_FEATURES, "Outline",
|
||||
"of", OutlineFeatParams)
|
||||
// Now define the feature type itself (see features.h for parameters).
|
||||
DefineFeature(IntFeatDesc, 2, 1, kIntFeatureType, IntFeatParams)
|
||||
|
||||
// Define all of the parameters for the GeoFeature type
|
||||
StartParamDesc(GeoFeatParams)
|
||||
DefineParam(0, 0, 0.0, 255.0)
|
||||
DefineParam(0, 0, 0.0, 255.0)
|
||||
DefineParam(0, 0, 0.0, 255.0)
|
||||
EndParamDesc
|
||||
// Now define the feature type itself (see features.h for parameters).
|
||||
DefineFeature(GeoFeatDesc, 3, 0, kGeoFeatureType, GeoFeatParams)
|
||||
|
||||
// Other features used for training the adaptive classifier, but not used
|
||||
// during normal training, therefore not in the DescDefs array.
|
||||
|
||||
// Define all of the parameters for the PicoFeature type
|
||||
// define knob that can be used to adjust pico-feature length.
|
||||
FLOAT32 PicoFeatureLength = PICO_FEATURE_LENGTH;
|
||||
StartParamDesc(PicoFeatParams)
|
||||
DefineParam(0, 0, -0.25, 0.75)
|
||||
DefineParam(1, 0, 0.0, 1.0)
|
||||
DefineParam(0, 0, -0.5, 0.5)
|
||||
EndParamDesc
|
||||
// Now define the feature type itself (see features.h for parameters).
|
||||
DefineFeature(PicoFeatDesc, 2, 1, "pf", PicoFeatParams)
|
||||
|
||||
// Define all of the parameters for the OutlineFeature type.
|
||||
StartParamDesc(OutlineFeatParams)
|
||||
DefineParam(0, 0, -0.5, 0.5)
|
||||
DefineParam(0, 0, -0.25, 0.75)
|
||||
DefineParam(0, 0, 0.0, 1.0)
|
||||
DefineParam(1, 0, 0.0, 1.0)
|
||||
EndParamDesc
|
||||
// Now define the feature type itself (see features.h for parameters).
|
||||
DefineFeature(OutlineFeatDesc, 3, 1, "of", OutlineFeatParams)
|
||||
|
||||
// MUST be kept in-sync with ExtractorDefs in fxdefs.cpp.
|
||||
static const FEATURE_DESC_STRUCT *DescDefs[NUM_FEATURE_TYPES] = {
|
||||
&MicroFeatureDesc,
|
||||
&PicoFeatDesc,
|
||||
&OutlineFeatDesc,
|
||||
&CharNormDesc
|
||||
&CharNormDesc,
|
||||
&IntFeatDesc,
|
||||
&GeoFeatDesc
|
||||
};
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
@ -188,6 +209,27 @@ void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
|
||||
}
|
||||
} /* WriteCharDescription */
|
||||
|
||||
// Return whether all of the fields of the given feature set
|
||||
// are well defined (not inf or nan).
|
||||
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
|
||||
CHAR_DESC CharDesc) {
|
||||
bool anything_written = false;
|
||||
bool well_formed = true;
|
||||
for (int Type = 0; Type < CharDesc->NumFeatureSets; Type++) {
|
||||
if (CharDesc->FeatureSets[Type]) {
|
||||
for (int i = 0; i < CharDesc->FeatureSets[Type]->NumFeatures; i++) {
|
||||
FEATURE feat = CharDesc->FeatureSets[Type]->Features[i];
|
||||
for (int p = 0; p < feat->Type->NumParams; p++) {
|
||||
if (isnan(feat->Params[p]) || isinf(feat->Params[p]))
|
||||
well_formed = false;
|
||||
else
|
||||
anything_written = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return anything_written && well_formed;
|
||||
} /* ValidCharDescription */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
|
@ -25,6 +25,10 @@
|
||||
|
||||
/* Enumerate the different types of features currently defined. */
|
||||
#define NUM_FEATURE_TYPES 4
|
||||
extern const char* kMicroFeatureType;
|
||||
extern const char* kCNFeatureType;
|
||||
extern const char* kIntFeatureType;
|
||||
extern const char* kGeoFeatureType;
|
||||
|
||||
/* define error traps which can be triggered by this module.*/
|
||||
#define ILLEGAL_SHORT_NAME 2000
|
||||
@ -58,6 +62,9 @@ void FreeCharDescription(CHAR_DESC CharDesc);
|
||||
|
||||
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs);
|
||||
|
||||
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
|
||||
CHAR_DESC CharDesc);
|
||||
|
||||
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
|
||||
FILE *File, CHAR_DESC CharDesc);
|
||||
|
||||
@ -74,4 +81,6 @@ extern const FEATURE_DESC_STRUCT MicroFeatureDesc;
|
||||
extern const FEATURE_DESC_STRUCT PicoFeatDesc;
|
||||
extern const FEATURE_DESC_STRUCT CharNormDesc;
|
||||
extern const FEATURE_DESC_STRUCT OutlineFeatDesc;
|
||||
extern const FEATURE_DESC_STRUCT IntFeatDesc;
|
||||
extern const FEATURE_DESC_STRUCT GeoFeatDesc;
|
||||
#endif
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "normmatch.h"
|
||||
#include "mfoutline.h"
|
||||
#include "classify.h"
|
||||
#include "helpers.h"
|
||||
#include "picofeat.h"
|
||||
|
||||
#define MAX_INT_CHAR_NORM (INT_CHAR_NORM_RANGE - 1)
|
||||
@ -33,63 +34,44 @@
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* For each class in Templates, clear the corresponding
|
||||
* entry in CharNormArray. CharNormArray is indexed by class
|
||||
* indicies (as obtained from Templates) rather than class id's.
|
||||
* For each class in the unicharset, clears the corresponding
|
||||
* entry in char_norm_array. char_norm_array is indexed by unichar_id.
|
||||
*
|
||||
* Globals:
|
||||
* - none
|
||||
*
|
||||
* @param Templates specifies classes currently defined
|
||||
* @param CharNormArray array to be cleared
|
||||
* @param char_norm_array array to be cleared
|
||||
*
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Feb 20 11:20:54 1991, DSJ, Created.
|
||||
*/
|
||||
void ClearCharNormArray(INT_TEMPLATES Templates,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < Templates->NumClasses; i++) {
|
||||
CharNormArray[i] = 0;
|
||||
}
|
||||
|
||||
void Classify::ClearCharNormArray(uinT8* char_norm_array) {
|
||||
memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
|
||||
} /* ClearCharNormArray */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* For each class in Templates, compute the match between
|
||||
* NormFeature and the normalization protos for that class.
|
||||
* Convert this number to the range from 0 - 255 and store it
|
||||
* into CharNormArray. CharNormArray is indexed by class
|
||||
* indicies (as obtained from Templates) rather than class id's.
|
||||
* For each class in unicharset, computes the match between
|
||||
* norm_feature and the normalization protos for that class.
|
||||
* Converts this number to the range from 0 - 255 and stores it
|
||||
* into char_norm_array. CharNormArray is indexed by unichar_id.
|
||||
*
|
||||
* Globals:
|
||||
* - none
|
||||
*
|
||||
* @param NormFeature character normalization feature
|
||||
* @param Templates specifies classes currently defined
|
||||
* @param[out] CharNormArray place to put results
|
||||
* @param norm_feature character normalization feature
|
||||
* @param[out] char_norm_array place to put results of size unicharset.size()
|
||||
*
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Feb 20 11:20:54 1991, DSJ, Created.
|
||||
*/
|
||||
void Classify::ComputeIntCharNormArray(
|
||||
FEATURE NormFeature, INT_TEMPLATES Templates,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray) {
|
||||
int i;
|
||||
int NormAdjust;
|
||||
|
||||
for (i = 0; i < Templates->NumClasses; i++) {
|
||||
NormAdjust = (int) (INT_CHAR_NORM_RANGE *
|
||||
ComputeNormMatch (i, NormFeature, FALSE));
|
||||
if (NormAdjust < 0)
|
||||
NormAdjust = 0;
|
||||
else if (NormAdjust > MAX_INT_CHAR_NORM)
|
||||
NormAdjust = MAX_INT_CHAR_NORM;
|
||||
|
||||
CharNormArray[i] = NormAdjust;
|
||||
void Classify::ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
|
||||
uinT8* char_norm_array) {
|
||||
for (int i = 0; i < unicharset.size(); i++) {
|
||||
int norm_adjust = static_cast<int>(INT_CHAR_NORM_RANGE *
|
||||
ComputeNormMatch(i, norm_feature, FALSE));
|
||||
char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
|
||||
}
|
||||
} /* ComputeIntCharNormArray */
|
||||
|
||||
|
@ -27,12 +27,4 @@
|
||||
#define INT_FEAT_RANGE 256
|
||||
#define BASELINE_Y_SHIFT (0.25)
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Public Function Prototypes
|
||||
-----------------------------------------------------------------------------*/
|
||||
namespace tesseract {
|
||||
void ClearCharNormArray(INT_TEMPLATES Templates,
|
||||
CLASS_NORMALIZATION_ARRAY CharNormArray);
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif
|
||||
|
@ -27,15 +27,16 @@
|
||||
-----------------------------------------------------------------------------*/
|
||||
// Definitions of extractors separated from feature definitions.
|
||||
const FEATURE_EXT_STRUCT MicroFeatureExt = { ExtractMicros };
|
||||
const FEATURE_EXT_STRUCT PicoFeatExt = { NULL };
|
||||
const FEATURE_EXT_STRUCT OutlineFeatExt = { NULL };
|
||||
const FEATURE_EXT_STRUCT CharNormExt = { ExtractCharNormFeatures };
|
||||
const FEATURE_EXT_STRUCT IntFeatExt = { ExtractIntCNFeatures };
|
||||
const FEATURE_EXT_STRUCT GeoFeatExt = { ExtractIntGeoFeatures };
|
||||
|
||||
// MUST be kept in-sync with DescDefs in featdefs.cpp.
|
||||
const FEATURE_EXT_STRUCT* ExtractorDefs[NUM_FEATURE_TYPES] = {
|
||||
&MicroFeatureExt,
|
||||
&PicoFeatExt,
|
||||
&OutlineFeatExt,
|
||||
&CharNormExt
|
||||
&CharNormExt,
|
||||
&IntFeatExt,
|
||||
&GeoFeatExt
|
||||
};
|
||||
|
||||
void SetupExtractors(FEATURE_DEFS_STRUCT *FeatureDefs) {
|
||||
|
159
classify/intfeaturedist.cpp
Normal file
159
classify/intfeaturedist.cpp
Normal file
@ -0,0 +1,159 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intfeaturedist.cpp
|
||||
// Description: Fast set-difference-based feature distance calculator.
|
||||
// Created: Thu Sep 01 13:07:30 PDT 2011
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "intfeaturedist.h"
|
||||
#include "intfeaturemap.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
IntFeatureDist::IntFeatureDist()
|
||||
: size_(0), total_feature_weight_(0.0),
|
||||
feature_map_(NULL), features_(NULL),
|
||||
features_delta_one_(NULL), features_delta_two_(NULL) {
|
||||
}
|
||||
|
||||
IntFeatureDist::~IntFeatureDist() {
|
||||
Clear();
|
||||
}
|
||||
|
||||
// Initialize the table to the given size of feature space.
|
||||
void IntFeatureDist::Init(const IntFeatureMap* feature_map) {
|
||||
size_ = feature_map->sparse_size();
|
||||
Clear();
|
||||
feature_map_ = feature_map;
|
||||
features_ = new bool[size_];
|
||||
features_delta_one_ = new bool[size_];
|
||||
features_delta_two_ = new bool[size_];
|
||||
memset(features_, false, size_ * sizeof(features_[0]));
|
||||
memset(features_delta_one_, false, size_ * sizeof(features_delta_one_[0]));
|
||||
memset(features_delta_two_, false, size_ * sizeof(features_delta_two_[0]));
|
||||
total_feature_weight_ = 0.0;
|
||||
}
|
||||
|
||||
// Setup the map for the given indexed_features that have been indexed by
|
||||
// feature_map.
|
||||
void IntFeatureDist::Set(const GenericVector<int>& indexed_features,
|
||||
int canonical_count, bool value) {
|
||||
total_feature_weight_ = canonical_count;
|
||||
for (int i = 0; i < indexed_features.size(); ++i) {
|
||||
int f = indexed_features[i];
|
||||
features_[f] = value;
|
||||
for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
|
||||
if (dir == 0) continue;
|
||||
int mapped_f = feature_map_->OffsetFeature(f, dir);
|
||||
if (mapped_f >= 0) {
|
||||
features_delta_one_[mapped_f] = value;
|
||||
for (int dir2 = -kNumOffsetMaps; dir2 <= kNumOffsetMaps; ++dir2) {
|
||||
if (dir2 == 0) continue;
|
||||
int mapped_f2 = feature_map_->OffsetFeature(mapped_f, dir2);
|
||||
if (mapped_f2 >= 0)
|
||||
features_delta_two_[mapped_f2] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute the distance between the given feature vector and the last
|
||||
// Set feature vector.
|
||||
double IntFeatureDist::FeatureDistance(
|
||||
const GenericVector<int>& features) const {
|
||||
int num_test_features = features.size();
|
||||
double denominator = total_feature_weight_ + num_test_features;
|
||||
double misses = denominator;
|
||||
for (int i = 0; i < num_test_features; ++i) {
|
||||
int index = features[i];
|
||||
double weight = 1.0;
|
||||
if (features_[index]) {
|
||||
// A perfect match.
|
||||
misses -= 2.0 * weight;
|
||||
} else if (features_delta_one_[index]) {
|
||||
misses -= 1.5 * weight;
|
||||
} else if (features_delta_two_[index]) {
|
||||
// A near miss.
|
||||
misses -= 1.0 * weight;
|
||||
}
|
||||
}
|
||||
return misses / denominator;
|
||||
}
|
||||
|
||||
// Compute the distance between the given feature vector and the last
|
||||
// Set feature vector.
|
||||
double IntFeatureDist::DebugFeatureDistance(
|
||||
const GenericVector<int>& features) const {
|
||||
int num_test_features = features.size();
|
||||
double denominator = total_feature_weight_ + num_test_features;
|
||||
double misses = denominator;
|
||||
for (int i = 0; i < num_test_features; ++i) {
|
||||
int index = features[i];
|
||||
double weight = 1.0;
|
||||
INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(features[i]);
|
||||
tprintf("Testing feature weight %g:", weight);
|
||||
f.print();
|
||||
if (features_[index]) {
|
||||
// A perfect match.
|
||||
misses -= 2.0 * weight;
|
||||
tprintf("Perfect hit\n");
|
||||
} else if (features_delta_one_[index]) {
|
||||
misses -= 1.5 * weight;
|
||||
tprintf("-1 hit\n");
|
||||
} else if (features_delta_two_[index]) {
|
||||
// A near miss.
|
||||
misses -= 1.0 * weight;
|
||||
tprintf("-2 hit\n");
|
||||
} else {
|
||||
tprintf("Total miss\n");
|
||||
}
|
||||
}
|
||||
tprintf("Features present:");
|
||||
for (int i = 0; i < size_; ++i) {
|
||||
if (features_[i]) {
|
||||
INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
|
||||
f.print();
|
||||
}
|
||||
}
|
||||
tprintf("\nMinus one features:");
|
||||
for (int i = 0; i < size_; ++i) {
|
||||
if (features_delta_one_[i]) {
|
||||
INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
|
||||
f.print();
|
||||
}
|
||||
}
|
||||
tprintf("\nMinus two features:");
|
||||
for (int i = 0; i < size_; ++i) {
|
||||
if (features_delta_two_[i]) {
|
||||
INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
|
||||
f.print();
|
||||
}
|
||||
}
|
||||
tprintf("\n");
|
||||
return misses / denominator;
|
||||
}
|
||||
|
||||
// Clear all data.
|
||||
void IntFeatureDist::Clear() {
|
||||
delete [] features_;
|
||||
features_ = NULL;
|
||||
delete [] features_delta_one_;
|
||||
features_delta_one_ = NULL;
|
||||
delete [] features_delta_two_;
|
||||
features_delta_two_ = NULL;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
80
classify/intfeaturedist.h
Normal file
80
classify/intfeaturedist.h
Normal file
@ -0,0 +1,80 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intfeaturedist.h
|
||||
// Description: Fast set-difference-based feature distance calculator.
|
||||
// Created: Thu Sep 01 12:14:30 PDT 2011
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CLASSIFY_INTFEATUREDIST_H_
|
||||
#define TESSERACT_CLASSIFY_INTFEATUREDIST_H_
|
||||
|
||||
#include "genericvector.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class IntFeatureMap;
|
||||
|
||||
// Feature distance calculator designed to provide a fast distance calculation
|
||||
// based on set difference between a given feature set and many other feature
|
||||
// sets in turn.
|
||||
// Representation of a feature set as an array of bools that are sparsely
|
||||
// true, and companion arrays that allow fast feature set distance
|
||||
// calculations with allowance of offsets in position.
|
||||
// Init is expensive, so for greatest efficiency, to re-initialize for a new
|
||||
// feature set, use Set(..., false) on the SAME feature set as was used to
|
||||
// setup with Set(..., true), to return to its initialized state before
|
||||
// reuse with Set(..., true) on a new feature set.
|
||||
class IntFeatureDist {
|
||||
public:
|
||||
IntFeatureDist();
|
||||
~IntFeatureDist();
|
||||
|
||||
// Initialize the bool array to the given size of feature space.
|
||||
// The feature_map is just borrowed, and must exist for the entire
|
||||
// lifetime of the IntFeatureDist.
|
||||
void Init(const IntFeatureMap* feature_map);
|
||||
|
||||
// Setup the map for the given indexed_features that have been indexed by
|
||||
// feature_map. After use, use Set(..., false) to reset to the initial state
|
||||
// as this is faster than calling Init for sparse spaces.
|
||||
void Set(const GenericVector<int>& indexed_features,
|
||||
int canonical_count, bool value);
|
||||
|
||||
// Compute the distance between the given feature vector and the last
|
||||
// Set feature vector.
|
||||
double FeatureDistance(const GenericVector<int>& features) const;
|
||||
double DebugFeatureDistance(const GenericVector<int>& features) const;
|
||||
|
||||
private:
|
||||
// Clear all data.
|
||||
void Clear();
|
||||
|
||||
// Size of the indexed feature space.
|
||||
int size_;
|
||||
// Total weight of features currently stored in the maps.
|
||||
double total_feature_weight_;
|
||||
// Pointer to IntFeatureMap given at Init to find offset features.
|
||||
const IntFeatureMap* feature_map_;
|
||||
// Array of bools indicating presence of a feature.
|
||||
bool* features_;
|
||||
// Array indicating the presence of a feature offset by one unit.
|
||||
bool* features_delta_one_;
|
||||
// Array indicating the presence of a feature offset by two units.
|
||||
bool* features_delta_two_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CLASSIFY_INTFEATUREDIST_H_
|
245
classify/intfeaturemap.cpp
Normal file
245
classify/intfeaturemap.cpp
Normal file
@ -0,0 +1,245 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intfeaturemap.cpp
|
||||
// Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
|
||||
// to provide a subspace mapping and fast feature lookup.
|
||||
// Created: Tue Oct 26 08:58:30 PDT 2010
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "intfeaturemap.h"
|
||||
|
||||
#include "intfeaturespace.h"
|
||||
#include "intfx.h"
|
||||
// These includes do not exist yet, but will be coming soon.
|
||||
//#include "sampleiterator.h"
|
||||
//#include "trainingsample.h"
|
||||
//#include "trainingsampleset.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
const int kMaxOffsetDist = 32;
|
||||
const double kMinPCLengthIncrease = 1.0 / 1024;
|
||||
|
||||
IntFeatureMap::IntFeatureMap()
|
||||
: mapping_changed_(true), compact_size_(0) {
|
||||
for (int dir = 0; dir < kNumOffsetMaps; ++dir) {
|
||||
offset_plus_[dir] = NULL;
|
||||
offset_minus_[dir] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
IntFeatureMap::~IntFeatureMap() {
|
||||
Clear();
|
||||
}
|
||||
|
||||
// Pseudo-accessors.
|
||||
int IntFeatureMap::IndexFeature(const INT_FEATURE_STRUCT& f) const {
|
||||
return feature_space_.Index(f);
|
||||
}
|
||||
int IntFeatureMap::MapFeature(const INT_FEATURE_STRUCT& f) const {
|
||||
return feature_map_.SparseToCompact(feature_space_.Index(f));
|
||||
}
|
||||
int IntFeatureMap::MapIndexFeature(int index_feature) const {
|
||||
return feature_map_.SparseToCompact(index_feature);
|
||||
}
|
||||
INT_FEATURE_STRUCT IntFeatureMap::InverseIndexFeature(int index_feature) const {
|
||||
return feature_space_.PositionFromIndex(index_feature);
|
||||
}
|
||||
INT_FEATURE_STRUCT IntFeatureMap::InverseMapFeature(int map_feature) const {
|
||||
int index = feature_map_.CompactToSparse(map_feature);
|
||||
return feature_space_.PositionFromIndex(index);
|
||||
}
|
||||
void IntFeatureMap::DeleteMapFeature(int map_feature) {
|
||||
feature_map_.Merge(-1, map_feature);
|
||||
mapping_changed_ = true;
|
||||
}
|
||||
bool IntFeatureMap::IsMapFeatureDeleted(int map_feature) const {
|
||||
return feature_map_.IsCompactDeleted(map_feature);
|
||||
}
|
||||
|
||||
// Copies the given feature_space and uses it as the index feature map
|
||||
// from INT_FEATURE_STRUCT.
|
||||
void IntFeatureMap::Init(const IntFeatureSpace& feature_space) {
|
||||
feature_space_ = feature_space;
|
||||
mapping_changed_ = false;
|
||||
int sparse_size = feature_space_.Size();
|
||||
feature_map_.Init(sparse_size, true);
|
||||
feature_map_.Setup();
|
||||
compact_size_ = feature_map_.CompactSize();
|
||||
// Initialize look-up tables if needed.
|
||||
FCOORD dir = FeatureDirection(0);
|
||||
if (dir.x() == 0.0f && dir.y() == 0.0f)
|
||||
InitIntegerFX();
|
||||
// Compute look-up tables to generate offset features.
|
||||
for (int dir = 0; dir < kNumOffsetMaps; ++dir) {
|
||||
delete [] offset_plus_[dir];
|
||||
delete [] offset_minus_[dir];
|
||||
offset_plus_[dir] = new int[sparse_size];
|
||||
offset_minus_[dir] = new int[sparse_size];
|
||||
}
|
||||
for (int dir = 1; dir <= kNumOffsetMaps; ++dir) {
|
||||
for (int i = 0; i < sparse_size; ++i) {
|
||||
int offset_index = ComputeOffsetFeature(i, dir);
|
||||
offset_plus_[dir - 1][i] = offset_index;
|
||||
offset_index = ComputeOffsetFeature(i, -dir);
|
||||
offset_minus_[dir - 1][i] = offset_index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper to return an offset index feature. In this context an offset
|
||||
// feature with a dir of +/-1 is a feature of a similar direction,
|
||||
// but shifted perpendicular to the direction of the feature. An offset
|
||||
// feature with a dir of +/-2 is feature at the same position, but rotated
|
||||
// by +/- one [compact] quantum. Returns the index of the generated offset
|
||||
// feature, or -1 if it doesn't exist. Dir should be in
|
||||
// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
|
||||
// A dir of 0 is an identity transformation.
|
||||
// Both input and output are from the index(sparse) feature space, not
|
||||
// the mapped/compact feature space, but the offset feature is the minimum
|
||||
// distance moved from the input to guarantee that it maps to the next
|
||||
// available quantum in the mapped/compact space.
|
||||
int IntFeatureMap::OffsetFeature(int index_feature, int dir) const {
|
||||
if (dir > 0 && dir <= kNumOffsetMaps)
|
||||
return offset_plus_[dir - 1][index_feature];
|
||||
else if (dir < 0 && -dir <= kNumOffsetMaps)
|
||||
return offset_minus_[-dir - 1][index_feature];
|
||||
else if (dir == 0)
|
||||
return index_feature;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
//#define EXPERIMENT_ON
|
||||
#ifdef EXPERIMENT_ON // This code is commented out as SampleIterator and
|
||||
// TrainingSample are not reviewed/checked in yet, but these functions are a
|
||||
// useful indicator of how an IntFeatureMap is setup.
|
||||
|
||||
// Computes the features used by the subset of samples defined by
|
||||
// the iterator and sets up the feature mapping.
|
||||
// Returns the size of the compacted feature space.
|
||||
int IntFeatureMap::FindNZFeatureMapping(SampleIterator* it) {
|
||||
feature_map_.Init(feature_space_.Size(), false);
|
||||
int total_samples = 0;
|
||||
for (it->Begin(); !it->AtEnd(); it->Next()) {
|
||||
const TrainingSample& sample = it->GetSample();
|
||||
GenericVector<int> features;
|
||||
feature_space_.IndexAndSortFeatures(sample.features(),
|
||||
sample.num_features(),
|
||||
&features);
|
||||
int num_features = features.size();
|
||||
for (int f = 0; f < num_features; ++f)
|
||||
feature_map_.SetMap(features[f], true);
|
||||
++total_samples;
|
||||
}
|
||||
feature_map_.Setup();
|
||||
compact_size_ = feature_map_.CompactSize();
|
||||
mapping_changed_ = true;
|
||||
FinalizeMapping(it);
|
||||
tprintf("%d non-zero features found in %d samples\n",
|
||||
compact_size_, total_samples);
|
||||
return compact_size_;
|
||||
}
|
||||
#endif
|
||||
|
||||
// After deleting some features, finish setting up the mapping, and map
|
||||
// all the samples. Returns the size of the compacted feature space.
|
||||
int IntFeatureMap::FinalizeMapping(SampleIterator* it) {
|
||||
if (mapping_changed_) {
|
||||
feature_map_.CompleteMerges();
|
||||
compact_size_ = feature_map_.CompactSize();
|
||||
#ifdef EXPERIMENT_ON
|
||||
it->MapSampleFeatures(*this);
|
||||
#endif
|
||||
mapping_changed_ = false;
|
||||
}
|
||||
return compact_size_;
|
||||
}
|
||||
|
||||
// Prints the map features from the set in human-readable form.
|
||||
void IntFeatureMap::DebugMapFeatures(
|
||||
const GenericVector<int>& map_features) const {
|
||||
for (int i = 0; i < map_features.size(); ++i) {
|
||||
INT_FEATURE_STRUCT f = InverseMapFeature(map_features[i]);
|
||||
f.print();
|
||||
}
|
||||
}
|
||||
|
||||
void IntFeatureMap::Clear() {
|
||||
for (int dir = 0; dir < kNumOffsetMaps; ++dir) {
|
||||
delete [] offset_plus_[dir];
|
||||
delete [] offset_minus_[dir];
|
||||
offset_plus_[dir] = NULL;
|
||||
offset_minus_[dir] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Helper to compute an offset index feature. In this context an offset
|
||||
// feature with a dir of +/-1 is a feature of a similar direction,
|
||||
// but shifted perpendicular to the direction of the feature. An offset
|
||||
// feature with a dir of +/-2 is feature at the same position, but rotated
|
||||
// by +/- one [compact] quantum. Returns the index of the generated offset
|
||||
// feature, or -1 if it doesn't exist. Dir should be in
|
||||
// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
|
||||
// A dir of 0 is an identity transformation.
|
||||
// Both input and output are from the index(sparse) feature space, not
|
||||
// the mapped/compact feature space, but the offset feature is the minimum
|
||||
// distance moved from the input to guarantee that it maps to the next
|
||||
// available quantum in the mapped/compact space.
|
||||
int IntFeatureMap::ComputeOffsetFeature(int index_feature, int dir) const {
|
||||
INT_FEATURE_STRUCT f = InverseIndexFeature(index_feature);
|
||||
ASSERT_HOST(IndexFeature(f) == index_feature);
|
||||
if (dir == 0) {
|
||||
return index_feature;
|
||||
} else if (dir == 1 || dir == -1) {
|
||||
FCOORD feature_dir = FeatureDirection(f.Theta);
|
||||
FCOORD rotation90(0.0f, 1.0f);
|
||||
feature_dir.rotate(rotation90);
|
||||
// Find the nearest existing feature.
|
||||
for (int m = 1; m < kMaxOffsetDist; ++m) {
|
||||
double x_pos = f.X + feature_dir.x() * (m * dir);
|
||||
double y_pos = f.Y + feature_dir.y() * (m * dir);
|
||||
int x = IntCastRounded(x_pos);
|
||||
int y = IntCastRounded(y_pos);
|
||||
if (x >= 0 && x <= MAX_UINT8 && y >= 0 && y <= MAX_UINT8) {
|
||||
INT_FEATURE_STRUCT offset_f;
|
||||
offset_f.X = x;
|
||||
offset_f.Y = y;
|
||||
offset_f.Theta = f.Theta;
|
||||
int offset_index = IndexFeature(offset_f);
|
||||
if (offset_index != index_feature && offset_index >= 0)
|
||||
return offset_index; // Found one.
|
||||
} else {
|
||||
return -1; // Hit the edge of feature space.
|
||||
}
|
||||
}
|
||||
} else if (dir == 2 || dir == -2) {
|
||||
// Find the nearest existing index_feature.
|
||||
for (int m = 1; m < kMaxOffsetDist; ++m) {
|
||||
int theta = f.Theta + m * dir / 2;
|
||||
INT_FEATURE_STRUCT offset_f;
|
||||
offset_f.X = f.X;
|
||||
offset_f.Y = f.Y;
|
||||
offset_f.Theta = Modulo(theta, 256);
|
||||
int offset_index = IndexFeature(offset_f);
|
||||
if (offset_index != index_feature && offset_index >= 0)
|
||||
return offset_index; // Found one.
|
||||
}
|
||||
}
|
||||
return -1; // Nothing within the max distance.
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
163
classify/intfeaturemap.h
Normal file
163
classify/intfeaturemap.h
Normal file
@ -0,0 +1,163 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intfeaturemap.h
|
||||
// Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
|
||||
// to provide a subspace mapping and fast feature lookup.
|
||||
// Created: Tue Oct 26 08:58:30 PDT 2010
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H__
|
||||
#define TESSERACT_CLASSIFY_INTFEATUREMAP_H__
|
||||
|
||||
#include "intfeaturespace.h"
|
||||
#include "indexmapbidi.h"
|
||||
#include "intproto.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class SampleIterator;
|
||||
|
||||
// Number of positive and negative offset maps.
|
||||
static const int kNumOffsetMaps = 2;
|
||||
|
||||
// Class to map a feature space defined by INT_FEATURE_STRUCT to a compact
|
||||
// down-sampled subspace of actually used features.
|
||||
// The IntFeatureMap copes with 2 stages of transformation:
|
||||
// The first step is down-sampling (re-quantization) and converting to a
|
||||
// single index value from the 3-D input:
|
||||
// INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and
|
||||
// the second is a feature-space compaction to map only the feature indices
|
||||
// that are actually used. This saves space in classifiers that are built
|
||||
// using the mapped feature space.
|
||||
// index (sparse) feature <-> map (compact) feature via IndexMapBiDi.
|
||||
// Although the transformations are reversible, the inverses are lossy and do
|
||||
// not return the exact input INT_FEATURE_STRUCT, due to the many->one nature
|
||||
// of both transformations.
|
||||
class IntFeatureMap {
|
||||
public:
|
||||
IntFeatureMap();
|
||||
~IntFeatureMap();
|
||||
|
||||
// Accessors.
|
||||
int sparse_size() const {
|
||||
return feature_space_.Size();
|
||||
}
|
||||
int compact_size() const {
|
||||
return compact_size_;
|
||||
}
|
||||
const IntFeatureSpace& feature_space() const {
|
||||
return feature_space_;
|
||||
}
|
||||
const IndexMapBiDi& feature_map() const {
|
||||
return feature_map_;
|
||||
}
|
||||
|
||||
// Pseudo-accessors.
|
||||
int IndexFeature(const INT_FEATURE_STRUCT& f) const;
|
||||
int MapFeature(const INT_FEATURE_STRUCT& f) const;
|
||||
int MapIndexFeature(int index_feature) const;
|
||||
INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const;
|
||||
INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const;
|
||||
void DeleteMapFeature(int map_feature);
|
||||
bool IsMapFeatureDeleted(int map_feature) const;
|
||||
|
||||
// Copies the given feature_space and uses it as the index feature map
|
||||
// from INT_FEATURE_STRUCT.
|
||||
void Init(const IntFeatureSpace& feature_space);
|
||||
|
||||
// Helper to return an offset index feature. In this context an offset
|
||||
// feature with a dir of +/-1 is a feature of a similar direction,
|
||||
// but shifted perpendicular to the direction of the feature. An offset
|
||||
// feature with a dir of +/-2 is feature at the same position, but rotated
|
||||
// by +/- one [compact] quantum. Returns the index of the generated offset
|
||||
// feature, or -1 if it doesn't exist. Dir should be in
|
||||
// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
|
||||
// A dir of 0 is an identity transformation.
|
||||
// Both input and output are from the index(sparse) feature space, not
|
||||
// the mapped/compact feature space, but the offset feature is the minimum
|
||||
// distance moved from the input to guarantee that it maps to the next
|
||||
// available quantum in the mapped/compact space.
|
||||
int OffsetFeature(int index_feature, int dir) const;
|
||||
|
||||
// Computes the features used by the subset of samples defined by
|
||||
// the iterator and sets up the feature mapping.
|
||||
// Returns the size of the compacted feature space.
|
||||
int FindNZFeatureMapping(SampleIterator* it);
|
||||
|
||||
// After deleting some features, finish setting up the mapping, and map
|
||||
// all the samples. Returns the size of the compacted feature space.
|
||||
int FinalizeMapping(SampleIterator* it);
|
||||
|
||||
// Indexes the given array of features to a vector of sorted indices.
|
||||
void IndexAndSortFeatures(const INT_FEATURE_STRUCT* features,
|
||||
int num_features,
|
||||
GenericVector<int>* sorted_features) const {
|
||||
feature_space_.IndexAndSortFeatures(features, num_features,
|
||||
sorted_features);
|
||||
}
|
||||
// Maps the given array of index/sparse features to an array of map/compact
|
||||
// features.
|
||||
// Assumes the input is sorted. The output indices are sorted and uniqued.
|
||||
// Returns the number of "missed" features, being features that
|
||||
// don't map to the compact feature space.
|
||||
int MapIndexedFeatures(const GenericVector<int>& index_features,
|
||||
GenericVector<int>* map_features) const {
|
||||
return feature_map_.MapFeatures(index_features, map_features);
|
||||
}
|
||||
|
||||
// Prints the map features from the set in human-readable form.
|
||||
void DebugMapFeatures(const GenericVector<int>& map_features) const;
|
||||
|
||||
private:
|
||||
void Clear();
|
||||
|
||||
// Helper to compute an offset index feature. In this context an offset
|
||||
// feature with a dir of +/-1 is a feature of a similar direction,
|
||||
// but shifted perpendicular to the direction of the feature. An offset
|
||||
// feature with a dir of +/-2 is feature at the same position, but rotated
|
||||
// by +/- one [compact] quantum. Returns the index of the generated offset
|
||||
// feature, or -1 if it doesn't exist. Dir should be in
|
||||
// [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
|
||||
// A dir of 0 is an identity transformation.
|
||||
// Both input and output are from the index(sparse) feature space, not
|
||||
// the mapped/compact feature space, but the offset feature is the minimum
|
||||
// distance moved from the input to guarantee that it maps to the next
|
||||
// available quantum in the mapped/compact space.
|
||||
int ComputeOffsetFeature(int index_feature, int dir) const;
|
||||
|
||||
// True if the mapping has changed since it was last finalized.
|
||||
bool mapping_changed_;
|
||||
// Size of the compacted feature space, after unused features are removed.
|
||||
int compact_size_;
|
||||
// Feature space quantization definition and indexing from INT_FEATURE_STRUCT.
|
||||
IntFeatureSpace feature_space_;
|
||||
// Mapping from indexed feature space to the compacted space with unused
|
||||
// features mapping to -1.
|
||||
IndexMapBiDi feature_map_;
|
||||
// Index tables to map a feature index to the corresponding feature after a
|
||||
// shift perpendicular to the feature direction, or a rotation in place.
|
||||
// An entry of -1 indicates that there is no corresponding feature.
|
||||
// Array of arrays of size feature_space_.Size() owned by this class.
|
||||
int* offset_plus_[kNumOffsetMaps];
|
||||
int* offset_minus_[kNumOffsetMaps];
|
||||
|
||||
// Don't use default copy and assign!
|
||||
IntFeatureMap(const IntFeatureMap&);
|
||||
void operator=(const IntFeatureMap&);
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H__
|
143
classify/intfeaturespace.cpp
Normal file
143
classify/intfeaturespace.cpp
Normal file
@ -0,0 +1,143 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intfeaturespace.cpp
|
||||
// Description: Indexed feature space based on INT_FEATURE_STRUCT.
|
||||
// Created: Wed Mar 24 11:21:27 PDT 2010
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "intfeaturespace.h"
|
||||
#include "intfx.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
IntFeatureSpace::IntFeatureSpace()
|
||||
: x_buckets_(0), y_buckets_(0), theta_buckets_(0) {
|
||||
}
|
||||
|
||||
void IntFeatureSpace::Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets) {
|
||||
x_buckets_ = xbuckets;
|
||||
y_buckets_ = ybuckets;
|
||||
theta_buckets_ = thetabuckets;
|
||||
}
|
||||
|
||||
// Serializes the feature space definition to the given file.
|
||||
// Returns false on error.
|
||||
bool IntFeatureSpace::Serialize(FILE* fp) const {
|
||||
if (fwrite(&x_buckets_, sizeof(x_buckets_), 1, fp) != 1)
|
||||
return false;
|
||||
if (fwrite(&y_buckets_, sizeof(y_buckets_), 1, fp) != 1)
|
||||
return false;
|
||||
if (fwrite(&theta_buckets_, sizeof(theta_buckets_), 1, fp) != 1)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// DeSerializes the feature space definition from the given file.
|
||||
// If swap is true, the data is big/little-endian swapped.
|
||||
// Returns false on error.
|
||||
bool IntFeatureSpace::DeSerialize(bool swap, FILE* fp) {
|
||||
if (fread(&x_buckets_, sizeof(x_buckets_), 1, fp) != 1)
|
||||
return false;
|
||||
if (fread(&y_buckets_, sizeof(y_buckets_), 1, fp) != 1)
|
||||
return false;
|
||||
if (fread(&theta_buckets_, sizeof(theta_buckets_), 1, fp) != 1)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns an INT_FEATURE_STRUCT corresponding to the given index.
|
||||
// This is the inverse of the Index member.
|
||||
INT_FEATURE_STRUCT IntFeatureSpace::PositionFromIndex(int index) const {
|
||||
return PositionFromBuckets(index / (y_buckets_ * theta_buckets_),
|
||||
index / theta_buckets_ % y_buckets_,
|
||||
index % theta_buckets_);
|
||||
}
|
||||
|
||||
// Bulk calls to Index. Maps the given array of features to a vector of
|
||||
// inT32 indices in the same order as the input.
|
||||
void IntFeatureSpace::IndexFeatures(const INT_FEATURE_STRUCT* features,
|
||||
int num_features,
|
||||
GenericVector<int>* mapped_features) const {
|
||||
mapped_features->truncate(0);
|
||||
for (int f = 0; f < num_features; ++f)
|
||||
mapped_features->push_back(Index(features[f]));
|
||||
}
|
||||
|
||||
// Bulk calls to Index. Maps the given array of features to a vector of
|
||||
// sorted inT32 indices.
|
||||
void IntFeatureSpace::IndexAndSortFeatures(
|
||||
const INT_FEATURE_STRUCT* features, int num_features,
|
||||
GenericVector<int>* sorted_features) const {
|
||||
sorted_features->truncate(0);
|
||||
for (int f = 0; f < num_features; ++f)
|
||||
sorted_features->push_back(Index(features[f]));
|
||||
sorted_features->sort();
|
||||
}
|
||||
|
||||
// Returns a feature space index for the given x,y position in a display
|
||||
// window, or -1 if the feature is a miss.
|
||||
int IntFeatureSpace::XYToFeatureIndex(int x, int y) const {
|
||||
// Round the x,y position to a feature. Search for a valid theta.
|
||||
INT_FEATURE_STRUCT feature = {static_cast<uinT8>(x), static_cast<uinT8>(y),
|
||||
0, 0};
|
||||
int index = -1;
|
||||
for (int theta = 0; theta <= MAX_UINT8 && index < 0; ++theta) {
|
||||
feature.Theta = theta;
|
||||
index = Index(feature);
|
||||
}
|
||||
if (index < 0) {
|
||||
tprintf("(%d,%d) does not exist in feature space!\n", x, y);
|
||||
return -1;
|
||||
}
|
||||
feature = PositionFromIndex(index);
|
||||
tprintf("Click at (%d, %d) ->(%d, %d), ->(%d, %d)\n",
|
||||
x, y, feature.X, feature.Y, x - feature.X, y - feature.Y);
|
||||
// Get the relative position of x,y from the rounded feature.
|
||||
x -= feature.X;
|
||||
y -= feature.Y;
|
||||
if (x != 0 || y != 0) {
|
||||
double angle = atan2(static_cast<double>(y), static_cast<double>(x)) + PI;
|
||||
angle *= kIntFeatureExtent / (2.0 * PI);
|
||||
feature.Theta = static_cast<uinT8>(angle + 0.5);
|
||||
index = Index(feature);
|
||||
if (index < 0) {
|
||||
tprintf("Feature failed to map to a valid index:");
|
||||
feature.print();
|
||||
return -1;
|
||||
}
|
||||
feature = PositionFromIndex(index);
|
||||
}
|
||||
feature.print();
|
||||
return index;
|
||||
}
|
||||
|
||||
// Returns an INT_FEATURE_STRUCT corresponding to the given bucket coords.
|
||||
INT_FEATURE_STRUCT IntFeatureSpace::PositionFromBuckets(int x,
|
||||
int y,
|
||||
int theta) const {
|
||||
INT_FEATURE_STRUCT pos = {
|
||||
static_cast<uinT8>(ClipToRange(
|
||||
(x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_,
|
||||
0, MAX_UINT8)),
|
||||
static_cast<uinT8>(ClipToRange(
|
||||
(y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_,
|
||||
0, MAX_UINT8)),
|
||||
static_cast<uinT8>(ClipToRange(
|
||||
DivRounded(theta * kIntFeatureExtent, theta_buckets_),
|
||||
0, MAX_UINT8))};
|
||||
return pos;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
110
classify/intfeaturespace.h
Normal file
110
classify/intfeaturespace.h
Normal file
@ -0,0 +1,110 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: intfeaturespace.h
|
||||
// Description: Indexed feature space based on INT_FEATURE_STRUCT.
|
||||
// Created: Wed Mar 24 10:55:30 PDT 2010
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CLASSIFY_INTFEATURESPACE_H__
|
||||
#define TESSERACT_CLASSIFY_INTFEATURESPACE_H__
|
||||
|
||||
#include "genericvector.h"
|
||||
#include "intproto.h"
|
||||
|
||||
// Extent of x,y,theta in the input feature space. [0,255].
|
||||
const int kIntFeatureExtent = 256;
|
||||
// Extent of x,y,theta dimensions in the quantized feature space.
|
||||
const int kBoostXYBuckets = 16;
|
||||
const int kBoostDirBuckets = 16;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class IndexMap;
|
||||
|
||||
// Down-sampling quantization of the INT_FEATURE_STRUCT feature space and
|
||||
// conversion to a single scalar index value, used as a binary feature space.
|
||||
class IntFeatureSpace {
|
||||
public:
|
||||
IntFeatureSpace();
|
||||
// Default copy constructors and assignment OK!
|
||||
|
||||
// Setup the feature space with the given dimensions.
|
||||
void Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets);
|
||||
|
||||
// Serializes the feature space definition to the given file.
|
||||
// Returns false on error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
|
||||
// DeSerializes the feature space definition from the given file.
|
||||
// If swap is true, the data is big/little-endian swapped.
|
||||
// Returns false on error.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
// Returns the total size of the feature space.
|
||||
int Size() const {
|
||||
return static_cast<int>(x_buckets_) * y_buckets_ * theta_buckets_;
|
||||
}
|
||||
// Returns an INT_FEATURE_STRUCT corresponding to the given index.
|
||||
// This is the inverse of the Index member.
|
||||
INT_FEATURE_STRUCT PositionFromIndex(int index) const;
|
||||
|
||||
// Returns a 1-dimensional index corresponding to the given feature value.
|
||||
// Range is [0, Size()-1]. Inverse of PositionFromIndex member.
|
||||
int Index(const INT_FEATURE_STRUCT& f) const {
|
||||
return (XBucket(f.X) * y_buckets_ + YBucket(f.Y)) * theta_buckets_ +
|
||||
ThetaBucket(f.Theta);
|
||||
}
|
||||
// Bulk calls to Index. Maps the given array of features to a vector of
|
||||
// inT32 indices in the same order as the input.
|
||||
void IndexFeatures(const INT_FEATURE_STRUCT* features, int num_features,
|
||||
GenericVector<int>* mapped_features) const;
|
||||
// Bulk calls to Index. Maps the given array of features to a vector of
|
||||
// sorted inT32 indices.
|
||||
void IndexAndSortFeatures(const INT_FEATURE_STRUCT* features,
|
||||
int num_features,
|
||||
GenericVector<int>* sorted_features) const;
|
||||
// Returns a feature space index for the given x,y position in a display
|
||||
// window, or -1 if the feature is a miss.
|
||||
int XYToFeatureIndex(int x, int y) const;
|
||||
|
||||
protected:
|
||||
// Converters to generate indices for individual feature dimensions.
|
||||
int XBucket(int x) const {
|
||||
int bucket = x * x_buckets_ / kIntFeatureExtent;
|
||||
return ClipToRange(bucket, 0, static_cast<int>(x_buckets_) - 1);
|
||||
}
|
||||
int YBucket(int y) const {
|
||||
int bucket = y * y_buckets_ / kIntFeatureExtent;
|
||||
return ClipToRange(bucket, 0, static_cast<int>(y_buckets_) - 1);
|
||||
}
|
||||
// Use DivRounded for theta so that exactly vertical and horizontal are in
|
||||
// the middle of a bucket. The Modulo takes care of the wrap-around.
|
||||
int ThetaBucket(int theta) const {
|
||||
int bucket = DivRounded(theta * theta_buckets_, kIntFeatureExtent);
|
||||
return Modulo(bucket, theta_buckets_);
|
||||
}
|
||||
// Returns an INT_FEATURE_STRUCT corresponding to the given buckets.
|
||||
INT_FEATURE_STRUCT PositionFromBuckets(int x, int y, int theta) const;
|
||||
|
||||
// Feature space definition - serialized.
|
||||
uinT8 x_buckets_;
|
||||
uinT8 y_buckets_;
|
||||
uinT8 theta_buckets_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
|
||||
#endif // TESSERACT_CLASSIFY_INTFEATURESPACE_H__
|
@ -23,9 +23,13 @@
|
||||
#include "const.h"
|
||||
#include "helpers.h"
|
||||
#include "ccutil.h"
|
||||
#include "statistc.h"
|
||||
#include "trainingsample.h"
|
||||
#ifdef __UNIX__
|
||||
#endif
|
||||
|
||||
using tesseract::TrainingSample;
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Private Function Prototypes
|
||||
----------------------------------------------------------------------------**/
|
||||
@ -55,6 +59,10 @@ INT_VAR(classify_radius_gyr_max_exp, 8,
|
||||
// atan(0.0) ... atan(ATAN_TABLE_SIZE - 1 / ATAN_TABLE_SIZE)
|
||||
// The entries are in binary degrees where a full circle is 256 binary degrees.
|
||||
static uinT8 AtanTable[ATAN_TABLE_SIZE];
|
||||
// Look up table for cos and sin to turn the intfx feature angle to a vector.
|
||||
// Also protected by atan_table_mutex.
|
||||
static float cos_table[INT_CHAR_NORM_RANGE];
|
||||
static float sin_table[INT_CHAR_NORM_RANGE];
|
||||
// Guards write access to AtanTable so we dont create it more than once.
|
||||
tesseract::CCUtilMutex atan_table_mutex;
|
||||
|
||||
@ -71,11 +79,46 @@ void InitIntegerFX() {
|
||||
AtanTable[i] =
|
||||
(uinT8) (atan ((i / (float) ATAN_TABLE_SIZE)) * 128.0 / PI + 0.5);
|
||||
}
|
||||
for (int i = 0; i < INT_CHAR_NORM_RANGE; ++i) {
|
||||
cos_table[i] = cos(i * 2 * PI / INT_CHAR_NORM_RANGE + PI);
|
||||
sin_table[i] = sin(i * 2 * PI / INT_CHAR_NORM_RANGE + PI);
|
||||
}
|
||||
atan_table_init = true;
|
||||
}
|
||||
atan_table_mutex.Unlock();
|
||||
}
|
||||
|
||||
// Returns a vector representing the direction of a feature with the given
|
||||
// theta direction in an INT_FEATURE_STRUCT.
|
||||
FCOORD FeatureDirection(uinT8 theta) {
|
||||
return FCOORD(cos_table[theta], sin_table[theta]);
|
||||
}
|
||||
|
||||
TrainingSample* GetIntFeatures(tesseract::NormalizationMode mode,
|
||||
TBLOB *blob, const DENORM& denorm) {
|
||||
INT_FEATURE_ARRAY blfeatures;
|
||||
INT_FEATURE_ARRAY cnfeatures;
|
||||
INT_FX_RESULT_STRUCT fx_info;
|
||||
ExtractIntFeat(blob, denorm, blfeatures, cnfeatures, &fx_info, NULL);
|
||||
TrainingSample* sample = NULL;
|
||||
if (mode == tesseract::NM_CHAR_ANISOTROPIC) {
|
||||
int num_features = fx_info.NumCN;
|
||||
if (num_features > 0) {
|
||||
sample = TrainingSample::CopyFromFeatures(fx_info, cnfeatures,
|
||||
num_features);
|
||||
}
|
||||
} else if (mode == tesseract::NM_BASELINE) {
|
||||
int num_features = fx_info.NumBL;
|
||||
if (num_features > 0) {
|
||||
sample = TrainingSample::CopyFromFeatures(fx_info, blfeatures,
|
||||
num_features);
|
||||
}
|
||||
} else {
|
||||
ASSERT_HOST(!"Unsupported normalization mode!");
|
||||
}
|
||||
return sample;
|
||||
}
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
// Extract a set of standard-sized features from Blobs and write them out in
|
||||
@ -101,7 +144,7 @@ int ExtractIntFeat(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
INT_FEATURE_ARRAY BLFeat,
|
||||
INT_FEATURE_ARRAY CNFeat,
|
||||
INT_FX_RESULT Results,
|
||||
INT_FX_RESULT_STRUCT* Results,
|
||||
inT32 *FeatureOutlineArray) {
|
||||
|
||||
TESSLINE *OutLine;
|
||||
@ -131,6 +174,8 @@ int ExtractIntFeat(TBLOB *Blob,
|
||||
Results->Ry = 0;
|
||||
Results->NumBL = 0;
|
||||
Results->NumCN = 0;
|
||||
Results->YBottom = MAX_UINT8;
|
||||
Results->YTop = 0;
|
||||
|
||||
// Calculate the centroid (Xmean, Ymean) for the blob.
|
||||
// We use centroid (instead of center of bounding box or center of smallest
|
||||
@ -200,6 +245,8 @@ int ExtractIntFeat(TBLOB *Blob,
|
||||
Iy = 0;
|
||||
NumBLFeatures = 0;
|
||||
OutLine = Blob->outlines;
|
||||
int min_x = 0;
|
||||
int max_x = 0;
|
||||
while (OutLine != NULL) {
|
||||
LoopStart = OutLine->loop;
|
||||
Loop = LoopStart;
|
||||
@ -213,6 +260,11 @@ int ExtractIntFeat(TBLOB *Blob,
|
||||
Loop = Loop->next;
|
||||
NormX = Loop->pos.x - Xmean;
|
||||
NormY = Loop->pos.y;
|
||||
if (NormY < Results->YBottom)
|
||||
Results->YBottom = ClipToRange(NormY, 0, MAX_UINT8);
|
||||
if (NormY > Results->YTop)
|
||||
Results->YTop = ClipToRange(NormY, 0, MAX_UINT8);
|
||||
UpdateRange(NormX, &min_x, &max_x);
|
||||
|
||||
n = 1;
|
||||
if (!Segment->IsHidden()) {
|
||||
@ -261,6 +313,7 @@ int ExtractIntFeat(TBLOB *Blob,
|
||||
while (Loop != LoopStart);
|
||||
OutLine = OutLine->next;
|
||||
}
|
||||
Results->Width = max_x - min_x;
|
||||
if (Ix == 0)
|
||||
Ix = 1;
|
||||
if (Iy == 0)
|
||||
@ -440,6 +493,7 @@ int SaveFeature(INT_FEATURE_ARRAY FeatureArray,
|
||||
Feature->X = ClipToRange<inT16>(X, 0, 255);
|
||||
Feature->Y = ClipToRange<inT16>(Y, 0, 255);
|
||||
Feature->Theta = Theta;
|
||||
Feature->CP_misses = 0;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
@ -23,31 +23,43 @@
|
||||
----------------------------------------------------------------------------**/
|
||||
#include "blobs.h"
|
||||
#include "intproto.h"
|
||||
#include "normalis.h"
|
||||
#include <math.h>
|
||||
|
||||
class DENORM;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
inT32 Length; /* total length of all outlines */
|
||||
inT16 Xmean, Ymean; /* center of mass of all outlines */
|
||||
inT16 Rx, Ry; /* radius of gyration */
|
||||
inT16 NumBL, NumCN; /* number of features extracted */
|
||||
namespace tesseract {
|
||||
class TrainingSample;
|
||||
}
|
||||
|
||||
|
||||
INT_FX_RESULT_STRUCT, *INT_FX_RESULT;
|
||||
struct INT_FX_RESULT_STRUCT {
|
||||
inT32 Length; // total length of all outlines
|
||||
inT16 Xmean, Ymean; // center of mass of all outlines
|
||||
inT16 Rx, Ry; // radius of gyration
|
||||
inT16 NumBL, NumCN; // number of features extracted
|
||||
inT16 Width; // Width of blob in BLN coords.
|
||||
uinT8 YBottom; // Bottom of blob in BLN coords.
|
||||
uinT8 YTop; // Top of blob in BLN coords.
|
||||
};
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Public Function Prototypes
|
||||
----------------------------------------------------------------------------**/
|
||||
void InitIntegerFX();
|
||||
|
||||
// Returns a vector representing the direction of a feature with the given
|
||||
// theta direction in an INT_FEATURE_STRUCT.
|
||||
FCOORD FeatureDirection(uinT8 theta);
|
||||
|
||||
tesseract::TrainingSample* GetIntFeatures(
|
||||
tesseract::NormalizationMode mode, TBLOB *blob,
|
||||
const DENORM& denorm);
|
||||
|
||||
int ExtractIntFeat(TBLOB *Blob,
|
||||
const DENORM& denorm,
|
||||
INT_FEATURE_ARRAY BLFeat,
|
||||
INT_FEATURE_ARRAY CNFeat,
|
||||
INT_FX_RESULT Results,
|
||||
INT_FX_RESULT_STRUCT* Results,
|
||||
inT32 *FeatureOutlineArray = 0);
|
||||
|
||||
uinT8 BinaryAnglePlusPi(inT32 Y, inT32 X);
|
||||
|
@ -23,8 +23,11 @@
|
||||
#include "intproto.h"
|
||||
#include "callcpp.h"
|
||||
#include "scrollview.h"
|
||||
#include "float2int.h"
|
||||
#include "globals.h"
|
||||
#include "helpers.h"
|
||||
#include "classify.h"
|
||||
#include "shapetable.h"
|
||||
#include <math.h>
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
@ -35,6 +38,11 @@
|
||||
/*----------------------------------------------------------------------------
|
||||
Global Data Definitions and Declarations
|
||||
----------------------------------------------------------------------------*/
|
||||
// Parameters of the sigmoid used to convert similarity to evidence in the
|
||||
// similarity_evidence_table_ that is used to convert distance metric to an
|
||||
// 8 bit evidence value in the secondary matcher. (See IntMatcher::Init).
|
||||
const float IntegerMatcher::kSEExponentialMultiplier = 0.0;
|
||||
const float IntegerMatcher::kSimilarityCenter = 0.0075;
|
||||
|
||||
static const uinT8 offset_table[256] = {
|
||||
255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
|
||||
@ -89,275 +97,360 @@ static const uinT8 next_table[256] = {
|
||||
0xf8, 0xfc, 0xfc, 0xfe
|
||||
};
|
||||
|
||||
struct ClassPrunerData {
|
||||
int *class_count_;
|
||||
int *norm_count_;
|
||||
int *sort_key_;
|
||||
int *sort_index_;
|
||||
int max_classes_;
|
||||
namespace tesseract {
|
||||
|
||||
ClassPrunerData(int max_classes) {
|
||||
// class_count_ and friends are referenced by indexing off of data in
|
||||
// class pruner word sized chunks. Each pruner word is of sized
|
||||
// BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
|
||||
// BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
|
||||
// See Classify::ClassPruner in intmatcher.cpp.
|
||||
max_classes_ = RoundUp(
|
||||
// Encapsulation of the intermediate data and computations made by the class
|
||||
// pruner. The class pruner implements a simple linear classifier on binary
|
||||
// features by heavily quantizing the feature space, and applying
|
||||
// NUM_BITS_PER_CLASS (2)-bit weights to the features. Lack of resolution in
|
||||
// weights is compensated by a non-constant bias that is dependent on the
|
||||
// number of features present.
|
||||
class ClassPruner {
|
||||
public:
|
||||
ClassPruner(int max_classes) {
|
||||
// The unrolled loop in ComputeScores means that the array sizes need to
|
||||
// be rounded up so that the array is big enough to accommodate the extra
|
||||
// entries accessed by the unrolling. Each pruner word is of sized
|
||||
// BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
|
||||
// BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
|
||||
// See ComputeScores.
|
||||
max_classes_ = max_classes;
|
||||
rounded_classes_ = RoundUp(
|
||||
max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);
|
||||
class_count_ = new int[max_classes_];
|
||||
norm_count_ = new int[max_classes_];
|
||||
sort_key_ = new int[max_classes_ + 1];
|
||||
sort_index_ = new int[max_classes_ + 1];
|
||||
for (int i = 0; i < max_classes_; i++) {
|
||||
class_count_ = new int[rounded_classes_];
|
||||
norm_count_ = new int[rounded_classes_];
|
||||
sort_key_ = new int[rounded_classes_ + 1];
|
||||
sort_index_ = new int[rounded_classes_ + 1];
|
||||
for (int i = 0; i < rounded_classes_; i++) {
|
||||
class_count_[i] = 0;
|
||||
}
|
||||
pruning_threshold_ = 0;
|
||||
num_features_ = 0;
|
||||
num_classes_ = 0;
|
||||
}
|
||||
|
||||
~ClassPrunerData() {
|
||||
~ClassPruner() {
|
||||
delete []class_count_;
|
||||
delete []norm_count_;
|
||||
delete []sort_key_;
|
||||
delete []sort_index_;
|
||||
}
|
||||
|
||||
};
|
||||
// Computes the scores for every class in the character set, by summing the
|
||||
// weights for each feature and stores the sums internally in class_count_.
|
||||
void ComputeScores(const INT_TEMPLATES_STRUCT* int_templates,
|
||||
int num_features, const INT_FEATURE_STRUCT* features) {
|
||||
num_features_ = num_features;
|
||||
int num_pruners = int_templates->NumClassPruners;
|
||||
for (int f = 0; f < num_features; ++f) {
|
||||
const INT_FEATURE_STRUCT* feature = &features[f];
|
||||
// Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
|
||||
int x = feature->X * NUM_CP_BUCKETS >> 8;
|
||||
int y = feature->Y * NUM_CP_BUCKETS >> 8;
|
||||
int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
|
||||
int class_id = 0;
|
||||
// Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
|
||||
// we need a collection of them, indexed by pruner_set.
|
||||
for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
|
||||
// Look up quantized feature in a 3-D array, an array of weights for
|
||||
// each class.
|
||||
const uinT32* pruner_word_ptr =
|
||||
int_templates->ClassPruners[pruner_set]->p[x][y][theta];
|
||||
for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
|
||||
uinT32 pruner_word = *pruner_word_ptr++;
|
||||
// This inner loop is unrolled to speed up the ClassPruner.
|
||||
// Currently gcc would not unroll it unless it is set to O3
|
||||
// level of optimization or -funroll-loops is specified.
|
||||
/*
|
||||
uinT32 class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
|
||||
for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
|
||||
class_count_[class_id++] += pruner_word & class_mask;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
}
|
||||
*/
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const float IntegerMatcher::kSEExponentialMultiplier = 0.0;
|
||||
const float IntegerMatcher::kSimilarityCenter = 0.0075;
|
||||
// Adjusts the scores according to the number of expected features. Used
|
||||
// in lieu of a constant bias, this penalizes classes that expect more
|
||||
// features than there are present. Thus an actual c will score higher for c
|
||||
// than e, even though almost all the features match e as well as c, because
|
||||
// e expects more features to be present.
|
||||
void AdjustForExpectedNumFeatures(const uinT16* expected_num_features,
|
||||
int cutoff_strength) {
|
||||
for (int class_id = 0; class_id < max_classes_; ++class_id) {
|
||||
if (num_features_ < expected_num_features[class_id]) {
|
||||
int deficit = expected_num_features[class_id] - num_features_;
|
||||
class_count_[class_id] -= class_count_[class_id] * deficit /
|
||||
(num_features_ * cutoff_strength + deficit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Zeros the scores for classes disabled in the unicharset.
|
||||
// Implements the black-list to recognize a subset of the character set.
|
||||
void DisableDisabledClasses(const UNICHARSET& unicharset) {
|
||||
for (int class_id = 0; class_id < max_classes_; ++class_id) {
|
||||
if (!unicharset.get_enabled(class_id))
|
||||
class_count_[class_id] = 0; // This char is disabled!
|
||||
}
|
||||
}
|
||||
|
||||
// Zeros the scores of fragments.
|
||||
void DisableFragments(const UNICHARSET& unicharset) {
|
||||
for (int class_id = 0; class_id < max_classes_; ++class_id) {
|
||||
// Do not include character fragments in the class pruner
|
||||
// results if disable_character_fragments is true.
|
||||
if (unicharset.get_fragment(class_id)) {
|
||||
class_count_[class_id] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Normalizes the counts for xheight, putting the normalized result in
|
||||
// norm_count_. Applies a simple subtractive penalty for incorrect vertical
|
||||
// position provided by the normalization_factors array, indexed by
|
||||
// character class, and scaled by the norm_multiplier.
|
||||
void NormalizeForXheight(int norm_multiplier,
|
||||
const uinT8* normalization_factors) {
|
||||
for (int class_id = 0; class_id < max_classes_; class_id++) {
|
||||
norm_count_[class_id] = class_count_[class_id] -
|
||||
((norm_multiplier * normalization_factors[class_id]) >> 8);
|
||||
}
|
||||
}
|
||||
|
||||
// The nop normalization copies the class_count_ array to norm_count_.
|
||||
void NoNormalization() {
|
||||
for (int class_id = 0; class_id < max_classes_; class_id++) {
|
||||
norm_count_[class_id] = class_count_[class_id];
|
||||
}
|
||||
}
|
||||
|
||||
// Prunes the classes using <the maximum count> * pruning_factor/256 as a
|
||||
// threshold for keeping classes. If max_of_non_fragments, then ignore
|
||||
// fragments in computing the maximum count.
|
||||
void PruneAndSort(int pruning_factor, bool max_of_non_fragments,
|
||||
const UNICHARSET& unicharset) {
|
||||
int max_count = 0;
|
||||
for (int c = 0; c < max_classes_; ++c) {
|
||||
if (norm_count_[c] > max_count &&
|
||||
// This additional check is added in order to ensure that
|
||||
// the classifier will return at least one non-fragmented
|
||||
// character match.
|
||||
// TODO(daria): verify that this helps accuracy and does not
|
||||
// hurt performance.
|
||||
(!max_of_non_fragments || !unicharset.get_fragment(c))) {
|
||||
max_count = norm_count_[c];
|
||||
}
|
||||
}
|
||||
// Prune Classes.
|
||||
pruning_threshold_ = (max_count * pruning_factor) >> 8;
|
||||
// Select Classes.
|
||||
if (pruning_threshold_ < 1)
|
||||
pruning_threshold_ = 1;
|
||||
num_classes_ = 0;
|
||||
for (int class_id = 0; class_id < max_classes_; class_id++) {
|
||||
if (norm_count_[class_id] >= pruning_threshold_) {
|
||||
++num_classes_;
|
||||
sort_index_[num_classes_] = class_id;
|
||||
sort_key_[num_classes_] = norm_count_[class_id];
|
||||
}
|
||||
}
|
||||
|
||||
// Sort Classes using Heapsort Algorithm.
|
||||
if (num_classes_ > 1)
|
||||
HeapSort(num_classes_, sort_key_, sort_index_);
|
||||
}
|
||||
|
||||
// Prints debug info on the class pruner matches for the pruned classes only.
|
||||
void DebugMatch(const Classify& classify,
|
||||
const INT_TEMPLATES_STRUCT* int_templates,
|
||||
const INT_FEATURE_STRUCT* features) const {
|
||||
int num_pruners = int_templates->NumClassPruners;
|
||||
int max_num_classes = int_templates->NumClasses;
|
||||
for (int f = 0; f < num_features_; ++f) {
|
||||
const INT_FEATURE_STRUCT* feature = &features[f];
|
||||
tprintf("F=%3d(%d,%d,%d),", f, feature->X, feature->Y, feature->Theta);
|
||||
// Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
|
||||
int x = feature->X * NUM_CP_BUCKETS >> 8;
|
||||
int y = feature->Y * NUM_CP_BUCKETS >> 8;
|
||||
int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
|
||||
int class_id = 0;
|
||||
for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
|
||||
// Look up quantized feature in a 3-D array, an array of weights for
|
||||
// each class.
|
||||
const uinT32* pruner_word_ptr =
|
||||
int_templates->ClassPruners[pruner_set]->p[x][y][theta];
|
||||
for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
|
||||
uinT32 pruner_word = *pruner_word_ptr++;
|
||||
for (int word_class = 0; word_class < 16 &&
|
||||
class_id < max_num_classes; ++word_class, ++class_id) {
|
||||
if (norm_count_[class_id] >= pruning_threshold_) {
|
||||
tprintf(" %s=%d,",
|
||||
classify.ClassIDToDebugStr(int_templates,
|
||||
class_id, 0).string(),
|
||||
pruner_word & CLASS_PRUNER_CLASS_MASK);
|
||||
}
|
||||
pruner_word >>= NUM_BITS_PER_CLASS;
|
||||
}
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Prints a summary of the pruner result.
|
||||
void SummarizeResult(const Classify& classify,
|
||||
const INT_TEMPLATES_STRUCT* int_templates,
|
||||
const uinT16* expected_num_features,
|
||||
int norm_multiplier,
|
||||
const uinT8* normalization_factors) const {
|
||||
tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_);
|
||||
for (int i = 0; i < num_classes_; ++i) {
|
||||
int class_id = sort_index_[num_classes_ - i];
|
||||
STRING class_string = classify.ClassIDToDebugStr(int_templates,
|
||||
class_id, 0);
|
||||
tprintf("%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n",
|
||||
class_string.string(),
|
||||
class_count_[class_id],
|
||||
expected_num_features[class_id],
|
||||
(norm_multiplier * normalization_factors[class_id]) >> 8,
|
||||
sort_key_[num_classes_ - i],
|
||||
100.0 - 100.0 * sort_key_[num_classes_ - i] /
|
||||
(CLASS_PRUNER_CLASS_MASK * num_features_));
|
||||
}
|
||||
}
|
||||
|
||||
// Copies the pruned, sorted classes into the output results and returns
|
||||
// the number of classes.
|
||||
int SetupResults(CP_RESULT_STRUCT* results) const {
|
||||
for (int c = 0; c < num_classes_; ++c) {
|
||||
results[c].Class = sort_index_[num_classes_ - c];
|
||||
results[c].Rating = 1.0 - sort_key_[num_classes_ - c] /
|
||||
(static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_);
|
||||
}
|
||||
return num_classes_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Array[rounded_classes_] of initial counts for each class.
|
||||
int *class_count_;
|
||||
// Array[rounded_classes_] of modified counts for each class after normalizing
|
||||
// for expected number of features, disabled classes, fragments, and xheights.
|
||||
int *norm_count_;
|
||||
// Array[rounded_classes_ +1] of pruned counts that gets sorted
|
||||
int *sort_key_;
|
||||
// Array[rounded_classes_ +1] of classes corresponding to sort_key_.
|
||||
int *sort_index_;
|
||||
// Number of classes in this class pruner.
|
||||
int max_classes_;
|
||||
// Rounded up number of classes used for array sizes.
|
||||
int rounded_classes_;
|
||||
// Threshold count applied to prune classes.
|
||||
int pruning_threshold_;
|
||||
// The number of features used to compute the scores.
|
||||
int num_features_;
|
||||
// Final number of pruned classes.
|
||||
int num_classes_;
|
||||
};
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
Public Code
|
||||
----------------------------------------------------------------------------*/
|
||||
/*---------------------------------------------------------------------------*/
|
||||
namespace tesseract {
|
||||
int Classify::ClassPruner(INT_TEMPLATES IntTemplates,
|
||||
inT16 NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
|
||||
CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
|
||||
CLASS_PRUNER_RESULTS Results) {
|
||||
// Runs the class pruner from int_templates on the given features, returning
|
||||
// the number of classes output in results.
|
||||
// int_templates Class pruner tables
|
||||
// num_features Number of features in blob
|
||||
// features Array of features
|
||||
// normalization_factors Array of fudge factors from blob
|
||||
// normalization process (by CLASS_INDEX)
|
||||
// expected_num_features Array of expected number of features
|
||||
// for each class (by CLASS_INDEX)
|
||||
// results Sorted Array of pruned classes. Must be an array
|
||||
// of size at least int_templates->NumClasses.
|
||||
int Classify::PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
|
||||
int num_features,
|
||||
const INT_FEATURE_STRUCT* features,
|
||||
const uinT8* normalization_factors,
|
||||
const uinT16* expected_num_features,
|
||||
CP_RESULT_STRUCT* results) {
|
||||
/*
|
||||
** Parameters:
|
||||
** IntTemplates Class pruner tables
|
||||
** NumFeatures Number of features in blob
|
||||
** Features Array of features
|
||||
** NormalizationFactors Array of fudge factors from blob
|
||||
** normalization process
|
||||
** (by CLASS_INDEX)
|
||||
** ExpectedNumFeatures Array of expected number of features
|
||||
** for each class
|
||||
** (by CLASS_INDEX)
|
||||
** Results Sorted Array of pruned classes
|
||||
** (by CLASS_ID)
|
||||
** Operation:
|
||||
** Prune the classes using a modified fast match table.
|
||||
** Return a sorted list of classes along with the number
|
||||
** of pruned classes in that list.
|
||||
** Return: Number of pruned classes.
|
||||
** Exceptions: none
|
||||
** History: Tue Feb 19 10:24:24 MST 1991, RWM, Created.
|
||||
** Operation:
|
||||
** Prunes the classes using a modified fast match table.
|
||||
** Returns a sorted list of classes along with the number
|
||||
** of pruned classes in that list.
|
||||
** Return: Number of pruned classes.
|
||||
** Exceptions: none
|
||||
** History: Tue Feb 19 10:24:24 MST 1991, RWM, Created.
|
||||
*/
|
||||
uinT32 PrunerWord;
|
||||
inT32 class_index; //index to class
|
||||
int Word;
|
||||
uinT32 *BasePrunerAddress;
|
||||
uinT32 feature_address; //current feature index
|
||||
INT_FEATURE feature; //current feature
|
||||
CLASS_PRUNER *ClassPruner;
|
||||
int PrunerSet;
|
||||
int NumPruners;
|
||||
inT32 feature_index; //current feature
|
||||
ClassPruner pruner(int_templates->NumClasses);
|
||||
// Compute initial match scores for all classes.
|
||||
pruner.ComputeScores(int_templates, num_features, features);
|
||||
// Adjust match scores for number of expected features.
|
||||
pruner.AdjustForExpectedNumFeatures(expected_num_features,
|
||||
classify_cp_cutoff_strength);
|
||||
// Apply disabled classes in unicharset - only works without a shape_table.
|
||||
if (shape_table_ == NULL)
|
||||
pruner.DisableDisabledClasses(unicharset);
|
||||
// If fragments are disabled, remove them, also only without a shape table.
|
||||
if (disable_character_fragments && shape_table_ == NULL)
|
||||
pruner.DisableFragments(unicharset);
|
||||
|
||||
int MaxNumClasses = IntTemplates->NumClasses;
|
||||
ClassPrunerData data(IntTemplates->NumClasses);
|
||||
int *ClassCount = data.class_count_;
|
||||
int *NormCount = data.norm_count_;
|
||||
int *SortKey = data.sort_key_;
|
||||
int *SortIndex = data.sort_index_;
|
||||
|
||||
int out_class;
|
||||
int MaxCount;
|
||||
int NumClasses;
|
||||
FLOAT32 max_rating; //max allowed rating
|
||||
CLASS_ID class_id;
|
||||
|
||||
/* Update Class Counts */
|
||||
NumPruners = IntTemplates->NumClassPruners;
|
||||
for (feature_index = 0; feature_index < NumFeatures; feature_index++) {
|
||||
feature = &Features[feature_index];
|
||||
feature_address = (((feature->X * NUM_CP_BUCKETS >> 8) * NUM_CP_BUCKETS +
|
||||
(feature->Y * NUM_CP_BUCKETS >> 8)) * NUM_CP_BUCKETS +
|
||||
(feature->Theta * NUM_CP_BUCKETS >> 8)) << 1;
|
||||
ClassPruner = IntTemplates->ClassPruner;
|
||||
class_index = 0;
|
||||
|
||||
for (PrunerSet = 0; PrunerSet < NumPruners; PrunerSet++, ClassPruner++) {
|
||||
BasePrunerAddress = (uinT32 *) (*ClassPruner) + feature_address;
|
||||
|
||||
for (Word = 0; Word < WERDS_PER_CP_VECTOR; Word++) {
|
||||
PrunerWord = *BasePrunerAddress++;
|
||||
// This inner loop is unrolled to speed up the ClassPruner.
|
||||
// Currently gcc would not unroll it unless it is set to O3
|
||||
// level of optimization or -funroll-loops is specified.
|
||||
/*
|
||||
uinT32 class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
|
||||
for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
|
||||
ClassCount[class_index++] += PrunerWord & class_mask;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
}
|
||||
*/
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
ClassCount[class_index++] += PrunerWord & CLASS_PRUNER_CLASS_MASK;
|
||||
}
|
||||
}
|
||||
// If we have good x-heights, apply the given normalization factors.
|
||||
if (normalization_factors != NULL) {
|
||||
pruner.NormalizeForXheight(classify_class_pruner_multiplier,
|
||||
normalization_factors);
|
||||
} else {
|
||||
pruner.NoNormalization();
|
||||
}
|
||||
// Do the actual pruning and sort the short-list.
|
||||
pruner.PruneAndSort(classify_class_pruner_threshold,
|
||||
shape_table_ == NULL, unicharset);
|
||||
|
||||
/* Adjust Class Counts for Number of Expected Features */
|
||||
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
|
||||
if (NumFeatures < ExpectedNumFeatures[class_id]) {
|
||||
int deficit = ExpectedNumFeatures[class_id] - NumFeatures;
|
||||
ClassCount[class_id] -= ClassCount[class_id] * deficit /
|
||||
(NumFeatures * classify_cp_cutoff_strength + deficit);
|
||||
}
|
||||
if (!unicharset.get_enabled(class_id))
|
||||
ClassCount[class_id] = 0; // This char is disabled!
|
||||
|
||||
// Do not include character fragments in the class pruner
|
||||
// results if disable_character_fragments is true.
|
||||
if (disable_character_fragments && unicharset.get_fragment(class_id)) {
|
||||
ClassCount[class_id] = 0;
|
||||
}
|
||||
if (classify_debug_level > 2) {
|
||||
pruner.DebugMatch(*this, int_templates, features);
|
||||
}
|
||||
|
||||
/* Adjust Class Counts for Normalization Factors */
|
||||
MaxCount = 0;
|
||||
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
|
||||
NormCount[class_id] = ClassCount[class_id]
|
||||
- ((classify_class_pruner_multiplier * NormalizationFactors[class_id])
|
||||
>> 8);
|
||||
if (NormCount[class_id] > MaxCount &&
|
||||
// This additional check is added in order to ensure that
|
||||
// the classifier will return at least one non-fragmented
|
||||
// character match.
|
||||
// TODO(daria): verify that this helps accuracy and does not
|
||||
// hurt performance.
|
||||
!unicharset.get_fragment(class_id)) {
|
||||
MaxCount = NormCount[class_id];
|
||||
}
|
||||
}
|
||||
|
||||
/* Prune Classes */
|
||||
MaxCount *= classify_class_pruner_threshold;
|
||||
MaxCount >>= 8;
|
||||
/* Select Classes */
|
||||
if (MaxCount < 1)
|
||||
MaxCount = 1;
|
||||
NumClasses = 0;
|
||||
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
|
||||
if (NormCount[class_id] >= MaxCount) {
|
||||
NumClasses++;
|
||||
SortIndex[NumClasses] = class_id;
|
||||
SortKey[NumClasses] = NormCount[class_id];
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort Classes using Heapsort Algorithm */
|
||||
if (NumClasses > 1)
|
||||
HeapSort(NumClasses, SortKey, SortIndex);
|
||||
|
||||
if (classify_debug_level > 1) {
|
||||
cprintf ("CP:%d classes, %d features:\n", NumClasses, NumFeatures);
|
||||
for (class_id = 0; class_id < NumClasses; class_id++) {
|
||||
cprintf ("%s:C=%d, E=%d, N=%d, Rat=%d\n",
|
||||
unicharset.debug_str(SortIndex[NumClasses - class_id]).string(),
|
||||
ClassCount[SortIndex[NumClasses - class_id]],
|
||||
ExpectedNumFeatures[SortIndex[NumClasses - class_id]],
|
||||
SortKey[NumClasses - class_id],
|
||||
1010 - 1000 * SortKey[NumClasses - class_id] /
|
||||
(CLASS_PRUNER_CLASS_MASK * NumFeatures));
|
||||
}
|
||||
if (classify_debug_level > 2) {
|
||||
NumPruners = IntTemplates->NumClassPruners;
|
||||
for (feature_index = 0; feature_index < NumFeatures;
|
||||
feature_index++) {
|
||||
cprintf ("F=%3d,", feature_index);
|
||||
feature = &Features[feature_index];
|
||||
feature_address =
|
||||
(((feature->X * NUM_CP_BUCKETS >> 8) * NUM_CP_BUCKETS +
|
||||
(feature->Y * NUM_CP_BUCKETS >> 8)) * NUM_CP_BUCKETS +
|
||||
(feature->Theta * NUM_CP_BUCKETS >> 8)) << 1;
|
||||
ClassPruner = IntTemplates->ClassPruner;
|
||||
class_index = 0;
|
||||
for (PrunerSet = 0; PrunerSet < NumPruners;
|
||||
PrunerSet++, ClassPruner++) {
|
||||
BasePrunerAddress = (uinT32 *) (*ClassPruner)
|
||||
+ feature_address;
|
||||
|
||||
for (Word = 0; Word < WERDS_PER_CP_VECTOR; Word++) {
|
||||
PrunerWord = *BasePrunerAddress++;
|
||||
for (class_id = 0; class_id < 16; class_id++, class_index++) {
|
||||
if (NormCount[class_index] >= MaxCount)
|
||||
cprintf (" %s=%d,",
|
||||
unicharset.id_to_unichar(class_index),
|
||||
PrunerWord & CLASS_PRUNER_CLASS_MASK);
|
||||
PrunerWord >>= NUM_BITS_PER_CLASS;
|
||||
}
|
||||
}
|
||||
}
|
||||
cprintf ("\n");
|
||||
}
|
||||
cprintf ("Adjustments:");
|
||||
for (class_id = 0; class_id < MaxNumClasses; class_id++) {
|
||||
if (NormCount[class_id] > MaxCount)
|
||||
cprintf(" %s=%d,",
|
||||
unicharset.id_to_unichar(class_id),
|
||||
-((classify_class_pruner_multiplier *
|
||||
NormalizationFactors[class_id]) >> 8));
|
||||
}
|
||||
cprintf ("\n");
|
||||
}
|
||||
pruner.SummarizeResult(*this, int_templates, expected_num_features,
|
||||
classify_class_pruner_multiplier,
|
||||
normalization_factors);
|
||||
}
|
||||
|
||||
/* Set Up Results */
|
||||
max_rating = 0.0f;
|
||||
for (class_id = 0, out_class = 0; class_id < NumClasses; class_id++) {
|
||||
Results[out_class].Class = SortIndex[NumClasses - class_id];
|
||||
Results[out_class].Rating =
|
||||
1.0 - SortKey[NumClasses - class_id] /
|
||||
(static_cast<float>(CLASS_PRUNER_CLASS_MASK) * NumFeatures);
|
||||
out_class++;
|
||||
}
|
||||
NumClasses = out_class;
|
||||
return NumClasses;
|
||||
// Convert to the expected output format.
|
||||
return pruner.SetupResults(results);
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
@ -366,10 +459,8 @@ int Classify::ClassPruner(INT_TEMPLATES IntTemplates,
|
||||
void IntegerMatcher::Match(INT_CLASS ClassTemplate,
|
||||
BIT_VECTOR ProtoMask,
|
||||
BIT_VECTOR ConfigMask,
|
||||
uinT16 BlobLength,
|
||||
inT16 NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
uinT8 NormalizationFactor,
|
||||
const INT_FEATURE_STRUCT* Features,
|
||||
INT_RESULT Result,
|
||||
int AdaptFeatureThreshold,
|
||||
int Debug,
|
||||
@ -436,12 +527,11 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate,
|
||||
tables->UpdateSumOfProtoEvidences(ClassTemplate, ConfigMask, NumFeatures);
|
||||
tables->NormalizeSums(ClassTemplate, NumFeatures, NumFeatures);
|
||||
|
||||
BestMatch = FindBestMatch(ClassTemplate, *tables, BlobLength,
|
||||
NormalizationFactor, Result);
|
||||
BestMatch = FindBestMatch(ClassTemplate, *tables, Result);
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
if (PrintMatchSummaryOn(Debug))
|
||||
DebugBestMatch(BestMatch, Result, BlobLength, NormalizationFactor);
|
||||
DebugBestMatch(BestMatch, Result);
|
||||
|
||||
if (MatchDebuggingOn(Debug))
|
||||
cprintf("Match Complete --------------------------------------------\n");
|
||||
@ -718,7 +808,7 @@ int IntegerMatcher::UpdateTablesForFeature(
|
||||
BIT_VECTOR ProtoMask,
|
||||
BIT_VECTOR ConfigMask,
|
||||
int FeatureNum,
|
||||
INT_FEATURE Feature,
|
||||
const INT_FEATURE_STRUCT* Feature,
|
||||
ScratchEvidence *tables,
|
||||
int Debug) {
|
||||
/*
|
||||
@ -1048,7 +1138,7 @@ void IntegerMatcher::DisplayFeatureDebugInfo(
|
||||
BIT_VECTOR ProtoMask,
|
||||
BIT_VECTOR ConfigMask,
|
||||
inT16 NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
const INT_FEATURE_STRUCT* Features,
|
||||
int AdaptFeatureThreshold,
|
||||
int Debug,
|
||||
bool SeparateDebugWindows) {
|
||||
@ -1146,8 +1236,6 @@ void ScratchEvidence::NormalizeSums(
|
||||
int IntegerMatcher::FindBestMatch(
|
||||
INT_CLASS ClassTemplate,
|
||||
const ScratchEvidence &tables,
|
||||
uinT16 BlobLength,
|
||||
uinT8 NormalizationFactor,
|
||||
INT_RESULT Result) {
|
||||
/*
|
||||
** Parameters:
|
||||
@ -1168,7 +1256,7 @@ int IntegerMatcher::FindBestMatch(
|
||||
/* Find best match */
|
||||
for (int ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
|
||||
int rating = tables.sum_feature_evidence_[ConfigNum];
|
||||
if (*classify_debug_level_ > 1)
|
||||
if (*classify_debug_level_ > 2)
|
||||
cprintf("Config %d, rating=%d\n", ConfigNum, rating);
|
||||
if (rating > BestMatch) {
|
||||
if (BestMatch > 0) {
|
||||
@ -1186,31 +1274,28 @@ int IntegerMatcher::FindBestMatch(
|
||||
}
|
||||
|
||||
/* Compute Certainty Rating */
|
||||
Result->Rating = ((65536.0 - BestMatch) / 65536.0 * BlobLength +
|
||||
local_matcher_multiplier_ * NormalizationFactor / 256.0) /
|
||||
(BlobLength + local_matcher_multiplier_);
|
||||
Result->Rating = (65536.0 - BestMatch) / 65536.0;
|
||||
|
||||
return BestMatch;
|
||||
}
|
||||
|
||||
// Applies the CN normalization factor to the given rating and returns
|
||||
// the modified rating.
|
||||
float IntegerMatcher::ApplyCNCorrection(float rating, int blob_length,
|
||||
int normalization_factor) {
|
||||
return (rating * blob_length +
|
||||
local_matcher_multiplier_ * normalization_factor / 256.0) /
|
||||
(blob_length + local_matcher_multiplier_);
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
// Print debug information about the best match for the current class.
|
||||
void IntegerMatcher::DebugBestMatch(
|
||||
int BestMatch, INT_RESULT Result, uinT16 BlobLength,
|
||||
uinT8 NormalizationFactor) {
|
||||
cprintf("Rating = %5.1f%% Best Config = %3d\n",
|
||||
100.0 * ((*Result).Rating), (int) ((*Result).Config));
|
||||
cprintf
|
||||
("Matcher Error = %5.1f%% Blob Length = %3d Weight = %4.1f%%\n",
|
||||
100.0 * (65536.0 - BestMatch) / 65536.0, (int) BlobLength,
|
||||
100.0 * BlobLength / (BlobLength + local_matcher_multiplier_));
|
||||
cprintf
|
||||
("Char Norm Error = %5.1f%% Norm Strength = %3d Weight = %4.1f%%\n",
|
||||
100.0 * NormalizationFactor / 256.0,
|
||||
local_matcher_multiplier_,
|
||||
100.0 * local_matcher_multiplier_ /
|
||||
(BlobLength + local_matcher_multiplier_));
|
||||
int BestMatch, INT_RESULT Result) {
|
||||
tprintf("Rating = %5.1f%% Best Config = %3d, Distance = %5.1f\n",
|
||||
100.0 * Result->Rating, Result->Config,
|
||||
100.0 * (65536.0 - BestMatch) / 65536.0);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -56,8 +56,6 @@ struct CP_RESULT_STRUCT {
|
||||
|
||||
typedef CP_RESULT_STRUCT CLASS_PRUNER_RESULTS[MAX_NUM_CLASSES];
|
||||
|
||||
typedef uinT8 CLASS_NORMALIZATION_ARRAY[MAX_NUM_CLASSES];
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
Variables
|
||||
-----------------------------------------------------------------------------*/
|
||||
@ -113,15 +111,18 @@ class IntegerMatcher {
|
||||
void Match(INT_CLASS ClassTemplate,
|
||||
BIT_VECTOR ProtoMask,
|
||||
BIT_VECTOR ConfigMask,
|
||||
uinT16 BlobLength,
|
||||
inT16 NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
uinT8 NormalizationFactor,
|
||||
const INT_FEATURE_STRUCT* Features,
|
||||
INT_RESULT Result,
|
||||
int AdaptFeatureThreshold,
|
||||
int Debug,
|
||||
bool SeparateDebugWindows);
|
||||
|
||||
// Applies the CN normalization factor to the given rating and returns
|
||||
// the modified rating.
|
||||
float ApplyCNCorrection(float rating, int blob_length,
|
||||
int normalization_factor);
|
||||
|
||||
int FindGoodProtos(INT_CLASS ClassTemplate,
|
||||
BIT_VECTOR ProtoMask,
|
||||
BIT_VECTOR ConfigMask,
|
||||
@ -148,14 +149,12 @@ class IntegerMatcher {
|
||||
BIT_VECTOR ProtoMask,
|
||||
BIT_VECTOR ConfigMask,
|
||||
int FeatureNum,
|
||||
INT_FEATURE Feature,
|
||||
const INT_FEATURE_STRUCT* Feature,
|
||||
ScratchEvidence *evidence,
|
||||
int Debug);
|
||||
|
||||
int FindBestMatch(INT_CLASS ClassTemplate,
|
||||
const ScratchEvidence &tables,
|
||||
uinT16 BlobLength,
|
||||
uinT8 NormalizationFactor,
|
||||
INT_RESULT Result);
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
@ -179,15 +178,12 @@ class IntegerMatcher {
|
||||
BIT_VECTOR ProtoMask,
|
||||
BIT_VECTOR ConfigMask,
|
||||
inT16 NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
const INT_FEATURE_STRUCT* Features,
|
||||
int AdaptFeatureThreshold,
|
||||
int Debug,
|
||||
bool SeparateDebugWindows);
|
||||
|
||||
void DebugBestMatch(int BestMatch,
|
||||
INT_RESULT Result,
|
||||
uinT16 BlobLength,
|
||||
uinT8 NormalizationFactor);
|
||||
void DebugBestMatch(int BestMatch, INT_RESULT Result);
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -18,19 +18,6 @@
|
||||
/*-----------------------------------------------------------------------------
|
||||
Include Files and Type Defines
|
||||
-----------------------------------------------------------------------------*/
|
||||
#include "helpers.h"
|
||||
#include "intproto.h"
|
||||
#include "picofeat.h"
|
||||
#include "mfoutline.h"
|
||||
#include "emalloc.h"
|
||||
#include "const.h"
|
||||
#include "ndminx.h"
|
||||
#include "svmnode.h"
|
||||
#include "globals.h"
|
||||
#include "classify.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
//extern GetPicoFeatureLength();
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
@ -39,11 +26,29 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "classify.h"
|
||||
#include "const.h"
|
||||
#include "emalloc.h"
|
||||
#include "fontinfo.h"
|
||||
#include "genericvector.h"
|
||||
#include "globals.h"
|
||||
#include "helpers.h"
|
||||
#include "intproto.h"
|
||||
#include "mfoutline.h"
|
||||
#include "ndminx.h"
|
||||
#include "picofeat.h"
|
||||
#include "shapetable.h"
|
||||
#include "svmnode.h"
|
||||
|
||||
// Include automatically generated configuration file if running autoconf.
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config_auto.h"
|
||||
#endif
|
||||
|
||||
using tesseract::FontInfo;
|
||||
using tesseract::FontSet;
|
||||
using tesseract::FontSpacingInfo;
|
||||
|
||||
/* match debug display constants*/
|
||||
#define PROTO_PRUNER_SCALE (4.0)
|
||||
|
||||
@ -126,7 +131,7 @@ FLOAT32 BucketStart(int Bucket, FLOAT32 Offset, int NumBuckets);
|
||||
FLOAT32 BucketEnd(int Bucket, FLOAT32 Offset, int NumBuckets);
|
||||
|
||||
void DoFill(FILL_SPEC *FillSpec,
|
||||
CLASS_PRUNER Pruner,
|
||||
CLASS_PRUNER_STRUCT* Pruner,
|
||||
register uinT32 ClassMask,
|
||||
register uinT32 ClassCount,
|
||||
register uinT32 WordIndex);
|
||||
@ -218,7 +223,6 @@ double_VAR(classify_pp_side_pad, 2.5, "Proto Pruner Side Pad");
|
||||
*/
|
||||
void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class) {
|
||||
int Pruner;
|
||||
uinT32 *Word;
|
||||
|
||||
assert (LegalClassId (ClassId));
|
||||
if (ClassId != Templates->NumClasses) {
|
||||
@ -231,13 +235,8 @@ void AddIntClass(INT_TEMPLATES Templates, CLASS_ID ClassId, INT_CLASS Class) {
|
||||
|
||||
if (Templates->NumClasses > MaxNumClassesIn (Templates)) {
|
||||
Pruner = Templates->NumClassPruners++;
|
||||
Templates->ClassPruner[Pruner] =
|
||||
(CLASS_PRUNER) Emalloc (sizeof (CLASS_PRUNER_STRUCT));
|
||||
|
||||
for (Word = reinterpret_cast<uinT32*>(Templates->ClassPruner[Pruner]);
|
||||
Word < reinterpret_cast<uinT32*>(Templates->ClassPruner[Pruner]) +
|
||||
WERDS_PER_CP;
|
||||
*Word++ = 0);
|
||||
Templates->ClassPruners[Pruner] = new CLASS_PRUNER_STRUCT;
|
||||
memset(Templates->ClassPruners[Pruner], 0, sizeof(CLASS_PRUNER_STRUCT));
|
||||
}
|
||||
} /* AddIntClass */
|
||||
|
||||
@ -296,14 +295,14 @@ int AddIntProto(INT_CLASS Class) {
|
||||
|
||||
ProtoSet = (PROTO_SET) Emalloc(sizeof(PROTO_SET_STRUCT));
|
||||
Class->ProtoSets[ProtoSetId] = ProtoSet;
|
||||
for (Word = reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner);
|
||||
Word < reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner) + WERDS_PER_PP;
|
||||
*Word++ = 0);
|
||||
memset(ProtoSet, 0, sizeof(*ProtoSet));
|
||||
|
||||
/* reallocate space for the proto lengths and install in class */
|
||||
Class->ProtoLengths =
|
||||
(uinT8 *)Erealloc(Class->ProtoLengths,
|
||||
MaxNumIntProtosIn(Class) * sizeof(uinT8));
|
||||
memset(&Class->ProtoLengths[Index], 0,
|
||||
sizeof(*Class->ProtoLengths) * (MaxNumIntProtosIn(Class) - Index));
|
||||
}
|
||||
|
||||
/* initialize proto so its length is zero and it isn't in any configs */
|
||||
@ -335,7 +334,7 @@ void AddProtoToClassPruner (PROTO Proto, CLASS_ID ClassId,
|
||||
*/
|
||||
#define MAX_LEVEL 2
|
||||
{
|
||||
CLASS_PRUNER Pruner;
|
||||
CLASS_PRUNER_STRUCT* Pruner;
|
||||
uinT32 ClassMask;
|
||||
uinT32 ClassCount;
|
||||
uinT32 WordIndex;
|
||||
@ -636,7 +635,7 @@ INT_TEMPLATES Classify::CreateIntTemplates(CLASSES FloatProtos,
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void DisplayIntFeature(INT_FEATURE Feature, FLOAT32 Evidence) {
|
||||
void DisplayIntFeature(const INT_FEATURE_STRUCT* Feature, FLOAT32 Evidence) {
|
||||
/*
|
||||
** Parameters:
|
||||
** Feature pico-feature to be displayed
|
||||
@ -697,7 +696,6 @@ INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs) {
|
||||
INT_CLASS Class;
|
||||
PROTO_SET ProtoSet;
|
||||
int i;
|
||||
register uinT32 *Word;
|
||||
|
||||
assert(MaxNumConfigs <= MAX_NUM_CONFIGS);
|
||||
|
||||
@ -713,17 +711,20 @@ INT_CLASS NewIntClass(int MaxNumProtos, int MaxNumConfigs) {
|
||||
for (i = 0; i < Class->NumProtoSets; i++) {
|
||||
/* allocate space for a proto set, install in class, and initialize */
|
||||
ProtoSet = (PROTO_SET) Emalloc(sizeof(PROTO_SET_STRUCT));
|
||||
memset(ProtoSet, 0, sizeof(*ProtoSet));
|
||||
Class->ProtoSets[i] = ProtoSet;
|
||||
for (Word = reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner);
|
||||
Word < reinterpret_cast<uinT32*>(ProtoSet->ProtoPruner) + WERDS_PER_PP;
|
||||
*Word++ = 0);
|
||||
|
||||
/* allocate space for the proto lengths and install in class */
|
||||
}
|
||||
if (MaxNumIntProtosIn (Class) > 0) {
|
||||
Class->ProtoLengths =
|
||||
(uinT8 *)Emalloc(MaxNumIntProtosIn (Class) * sizeof (uinT8));
|
||||
memset(Class->ProtoLengths, 0,
|
||||
MaxNumIntProtosIn(Class) * sizeof(*Class->ProtoLengths));
|
||||
} else {
|
||||
Class->ProtoLengths = NULL;
|
||||
}
|
||||
memset(Class->ConfigLengths, 0, sizeof(Class->ConfigLengths));
|
||||
|
||||
return (Class);
|
||||
|
||||
@ -776,120 +777,11 @@ void free_int_templates(INT_TEMPLATES templates) {
|
||||
for (i = 0; i < templates->NumClasses; i++)
|
||||
free_int_class(templates->Class[i]);
|
||||
for (i = 0; i < templates->NumClassPruners; i++)
|
||||
Efree(templates->ClassPruner[i]);
|
||||
delete templates->ClassPruners[i];
|
||||
Efree(templates);
|
||||
}
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
// Code to read/write Classify::font*table structures.
|
||||
namespace {
|
||||
bool read_info(FILE* f, FontInfo* fi, bool swap) {
|
||||
inT32 size;
|
||||
if (fread(&size, sizeof(size), 1, f) != 1) return false;
|
||||
if (swap)
|
||||
Reverse32(&size);
|
||||
char* font_name = new char[size + 1];
|
||||
fi->name = font_name;
|
||||
if (fread(font_name, sizeof(*font_name), size, f) != size) return false;
|
||||
font_name[size] = '\0';
|
||||
if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
|
||||
if (swap)
|
||||
Reverse32(&fi->properties);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool write_info(FILE* f, const FontInfo& fi) {
|
||||
inT32 size = strlen(fi.name);
|
||||
if (fwrite(&size, sizeof(size), 1, f) != 1) return false;
|
||||
if (fwrite(fi.name, sizeof(*fi.name), size, f) != size) return false;
|
||||
if (fwrite(&fi.properties, sizeof(fi.properties), 1, f) != 1) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool read_spacing_info(FILE *f, FontInfo* fi, bool swap) {
|
||||
inT32 vec_size, kern_size;
|
||||
if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
|
||||
if (swap) Reverse32(&vec_size);
|
||||
ASSERT_HOST(vec_size >= 0);
|
||||
if (vec_size == 0) return true;
|
||||
fi->init_spacing(vec_size);
|
||||
for (int i = 0; i < vec_size; ++i) {
|
||||
FontSpacingInfo *fs = new FontSpacingInfo();
|
||||
if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
|
||||
fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
|
||||
fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
|
||||
return false;
|
||||
}
|
||||
if (swap) {
|
||||
ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
|
||||
ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
|
||||
Reverse32(&kern_size);
|
||||
}
|
||||
if (kern_size < 0) { // indication of a NULL entry in fi->spacing_vec
|
||||
delete fs;
|
||||
continue;
|
||||
}
|
||||
if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
|
||||
!fs->kerned_x_gaps.DeSerialize(swap, f))) {
|
||||
return false;
|
||||
}
|
||||
fi->add_spacing(i, fs);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool write_spacing_info(FILE* f, const FontInfo& fi) {
|
||||
inT32 vec_size = (fi.spacing_vec == NULL) ? 0 : fi.spacing_vec->size();
|
||||
if (fwrite(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
|
||||
inT16 x_gap_invalid = -1;
|
||||
for (int i = 0; i < vec_size; ++i) {
|
||||
FontSpacingInfo *fs = fi.spacing_vec->get(i);
|
||||
inT32 kern_size = (fs == NULL) ? -1 : fs->kerned_x_gaps.size();
|
||||
if (fs == NULL) {
|
||||
if (fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
|
||||
fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
|
||||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (fwrite(&(fs->x_gap_before), sizeof(fs->x_gap_before), 1, f) != 1 ||
|
||||
fwrite(&(fs->x_gap_after), sizeof(fs->x_gap_after), 1, f) != 1 ||
|
||||
fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (kern_size > 0 && (!fs->kerned_unichar_ids.Serialize(f) ||
|
||||
!fs->kerned_x_gaps.Serialize(f))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool read_set(FILE* f, FontSet* fs, bool swap) {
|
||||
if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
|
||||
if (swap)
|
||||
Reverse32(&fs->size);
|
||||
fs->configs = new int[fs->size];
|
||||
for (int i = 0; i < fs->size; ++i) {
|
||||
if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
|
||||
if (swap)
|
||||
Reverse32(&fs->configs[i]);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool write_set(FILE* f, const FontSet& fs) {
|
||||
if (fwrite(&fs.size, sizeof(fs.size), 1, f) != 1) return false;
|
||||
for (int i = 0; i < fs.size; ++i) {
|
||||
if (fwrite(&fs.configs[i], sizeof(fs.configs[i]), 1, f) != 1) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace.
|
||||
|
||||
namespace tesseract {
|
||||
INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
/*
|
||||
@ -909,7 +801,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
int unicharset_size;
|
||||
int version_id = 0;
|
||||
INT_TEMPLATES Templates;
|
||||
CLASS_PRUNER Pruner;
|
||||
CLASS_PRUNER_STRUCT* Pruner;
|
||||
INT_CLASS Class;
|
||||
uinT8 *Lengths;
|
||||
PROTO_SET ProtoSet;
|
||||
@ -919,11 +811,11 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
CLASS_ID class_id, max_class_id;
|
||||
inT16 *IndexFor = new inT16[MAX_NUM_CLASSES];
|
||||
CLASS_ID *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
|
||||
CLASS_PRUNER *TempClassPruner = new CLASS_PRUNER[MAX_NUM_CLASS_PRUNERS];
|
||||
CLASS_PRUNER_STRUCT **TempClassPruner =
|
||||
new CLASS_PRUNER_STRUCT*[MAX_NUM_CLASS_PRUNERS];
|
||||
uinT32 SetBitsForMask = // word with NUM_BITS_PER_CLASS
|
||||
(1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
|
||||
uinT32 Mask, NewMask, ClassBits;
|
||||
uinT32 *Word;
|
||||
int MaxNumConfigs = MAX_NUM_CONFIGS;
|
||||
int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
|
||||
|
||||
@ -979,9 +871,9 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
|
||||
/* then read in the class pruners */
|
||||
for (i = 0; i < Templates->NumClassPruners; i++) {
|
||||
Pruner = (CLASS_PRUNER) Emalloc(sizeof(CLASS_PRUNER_STRUCT));
|
||||
Pruner = new CLASS_PRUNER_STRUCT;
|
||||
if ((nread =
|
||||
fread((char *) Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
|
||||
fread(Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
|
||||
File)) != sizeof(CLASS_PRUNER_STRUCT))
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
if (swap) {
|
||||
@ -989,7 +881,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
for (y = 0; y < NUM_CP_BUCKETS; y++) {
|
||||
for (z = 0; z < NUM_CP_BUCKETS; z++) {
|
||||
for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
|
||||
Reverse32(&Pruner[x][y][z][w]);
|
||||
Reverse32(&Pruner->p[x][y][z][w]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -998,7 +890,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
if (version_id < 2) {
|
||||
TempClassPruner[i] = Pruner;
|
||||
} else {
|
||||
Templates->ClassPruner[i] = Pruner;
|
||||
Templates->ClassPruners[i] = Pruner;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1010,11 +902,8 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
if (ClassIdFor[i] > max_class_id)
|
||||
max_class_id = ClassIdFor[i];
|
||||
for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
|
||||
Templates->ClassPruner[i] =
|
||||
(CLASS_PRUNER) Emalloc(sizeof(CLASS_PRUNER_STRUCT));
|
||||
for (Word = (uinT32 *) (Templates->ClassPruner[i]);
|
||||
Word < (uinT32 *) (Templates->ClassPruner[i]) + WERDS_PER_CP;
|
||||
*Word++ = 0);
|
||||
Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
|
||||
memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
|
||||
}
|
||||
// Convert class pruners from the old format (indexed by class index)
|
||||
// to the new format (indexed by class id).
|
||||
@ -1024,7 +913,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
for (y = 0; y < NUM_CP_BUCKETS; y++)
|
||||
for (z = 0; z < NUM_CP_BUCKETS; z++)
|
||||
for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
|
||||
if (TempClassPruner[i][x][y][z][w] == 0)
|
||||
if (TempClassPruner[i]->p[x][y][z][w] == 0)
|
||||
continue;
|
||||
for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
|
||||
bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
|
||||
@ -1033,7 +922,7 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
|
||||
// Single out NUM_BITS_PER_CLASS bits relating to class_id.
|
||||
Mask = SetBitsForMask << b;
|
||||
ClassBits = TempClassPruner[i][x][y][z][w] & Mask;
|
||||
ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
|
||||
// Move these bits to the new position in which they should
|
||||
// appear (indexed corresponding to the class_id).
|
||||
new_i = CPrunerIdFor(class_id);
|
||||
@ -1047,13 +936,13 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
// Copy bits relating to class_id to the correct position
|
||||
// in Templates->ClassPruner.
|
||||
NewMask = SetBitsForMask << new_b;
|
||||
Templates->ClassPruner[new_i][x][y][z][new_w] &= ~NewMask;
|
||||
Templates->ClassPruner[new_i][x][y][z][new_w] |= ClassBits;
|
||||
Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
|
||||
Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (i = 0; i < Templates->NumClassPruners; i++) {
|
||||
Efree (TempClassPruner[i]);
|
||||
delete TempClassPruner[i];
|
||||
}
|
||||
}
|
||||
|
||||
@ -1217,7 +1106,6 @@ void Classify::ShowMatchDisplay() {
|
||||
** History: Thu Mar 21 15:47:33 1991, DSJ, Created.
|
||||
*/
|
||||
InitIntMatchWindowIfReqd();
|
||||
c_clear_window(IntMatchWindow);
|
||||
if (ProtoDisplayWindow) {
|
||||
ProtoDisplayWindow->Clear();
|
||||
}
|
||||
@ -1227,7 +1115,6 @@ void Classify::ShowMatchDisplay() {
|
||||
ClearFeatureSpaceWindow(
|
||||
static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
|
||||
IntMatchWindow);
|
||||
|
||||
IntMatchWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
|
||||
INT_MAX_X, INT_MAX_Y);
|
||||
if (ProtoDisplayWindow) {
|
||||
@ -1299,7 +1186,7 @@ void Classify::WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
|
||||
|
||||
/* then write out the class pruners */
|
||||
for (i = 0; i < Templates->NumClassPruners; i++)
|
||||
fwrite(Templates->ClassPruner[i],
|
||||
fwrite(Templates->ClassPruners[i],
|
||||
sizeof(CLASS_PRUNER_STRUCT), 1, File);
|
||||
|
||||
/* then write out each class */
|
||||
@ -1385,7 +1272,7 @@ FLOAT32 BucketEnd(int Bucket, FLOAT32 Offset, int NumBuckets) {
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
void DoFill(FILL_SPEC *FillSpec,
|
||||
CLASS_PRUNER Pruner,
|
||||
CLASS_PRUNER_STRUCT* Pruner,
|
||||
register uinT32 ClassMask,
|
||||
register uinT32 ClassCount,
|
||||
register uinT32 WordIndex) {
|
||||
@ -1421,11 +1308,11 @@ void DoFill(FILL_SPEC *FillSpec,
|
||||
for (Y = FillSpec->YStart; Y <= FillSpec->YEnd; Y++)
|
||||
for (Angle = FillSpec->AngleStart;
|
||||
TRUE; CircularIncrement (Angle, NUM_CP_BUCKETS)) {
|
||||
OldWord = Pruner[X][Y][Angle][WordIndex];
|
||||
OldWord = Pruner->p[X][Y][Angle][WordIndex];
|
||||
if (ClassCount > (OldWord & ClassMask)) {
|
||||
OldWord &= ~ClassMask;
|
||||
OldWord |= ClassCount;
|
||||
Pruner[X][Y][Angle][WordIndex] = OldWord;
|
||||
Pruner->p[X][Y][Angle][WordIndex] = OldWord;
|
||||
}
|
||||
if (Angle == FillSpec->AngleEnd)
|
||||
break;
|
||||
@ -1543,7 +1430,7 @@ void FillPPLinearBits(uinT32 ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR],
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
namespace tesseract {
|
||||
CLASS_ID Classify::GetClassToDebug(const char *Prompt, bool* adaptive_on,
|
||||
bool* pretrained_on) {
|
||||
bool* pretrained_on, int* shape_id) {
|
||||
/*
|
||||
** Parameters:
|
||||
** Prompt prompt to print while waiting for input from window
|
||||
@ -1557,26 +1444,57 @@ CLASS_ID Classify::GetClassToDebug(const char *Prompt, bool* adaptive_on,
|
||||
tprintf("%s\n", Prompt);
|
||||
SVEvent* ev;
|
||||
SVEventType ev_type;
|
||||
int unichar_id = INVALID_UNICHAR_ID;
|
||||
// Wait until a click or popup event.
|
||||
do {
|
||||
ev = IntMatchWindow->AwaitEvent(SVET_ANY);
|
||||
ev_type = ev->type;
|
||||
if (ev_type == SVET_POPUP) {
|
||||
if (unicharset.contains_unichar(ev->parameter)) {
|
||||
if (ev->command_id == IDA_ADAPTIVE) {
|
||||
*adaptive_on = true;
|
||||
*pretrained_on = false;
|
||||
} else if (ev->command_id == IDA_STATIC) {
|
||||
if (ev->command_id == IDA_SHAPE_INDEX) {
|
||||
if (shape_table_ != NULL) {
|
||||
*shape_id = atoi(ev->parameter);
|
||||
*adaptive_on = false;
|
||||
*pretrained_on = true;
|
||||
if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
|
||||
int font_id;
|
||||
shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
|
||||
&font_id);
|
||||
tprintf("Shape %d, first unichar=%d, font=%d\n",
|
||||
*shape_id, unichar_id, font_id);
|
||||
return unichar_id;
|
||||
}
|
||||
tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
|
||||
} else {
|
||||
*adaptive_on = true;
|
||||
*pretrained_on = true;
|
||||
tprintf("No shape table loaded!\n");
|
||||
}
|
||||
} else {
|
||||
if (unicharset.contains_unichar(ev->parameter)) {
|
||||
unichar_id = unicharset.unichar_to_id(ev->parameter);
|
||||
if (ev->command_id == IDA_ADAPTIVE) {
|
||||
*adaptive_on = true;
|
||||
*pretrained_on = false;
|
||||
*shape_id = -1;
|
||||
} else if (ev->command_id == IDA_STATIC) {
|
||||
*adaptive_on = false;
|
||||
*pretrained_on = true;
|
||||
} else {
|
||||
*adaptive_on = true;
|
||||
*pretrained_on = true;
|
||||
}
|
||||
if (ev->command_id == IDA_ADAPTIVE || shape_table_ == NULL) {
|
||||
*shape_id = -1;
|
||||
return unichar_id;
|
||||
}
|
||||
for (int s = 0; s < shape_table_->NumShapes(); ++s) {
|
||||
if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
|
||||
tprintf("%s\n", shape_table_->DebugStr(s).string());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tprintf("Char class '%s' not found in unicharset",
|
||||
ev->parameter);
|
||||
}
|
||||
return unicharset.unichar_to_id(ev->parameter);
|
||||
}
|
||||
tprintf("Char class '%s' not found in unicharset",
|
||||
ev->parameter);
|
||||
}
|
||||
delete ev;
|
||||
} while (ev_type != SVET_CLICK);
|
||||
@ -1916,15 +1834,8 @@ void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT* Feature,
|
||||
// using BinaryAnglePlusPi in intfx.cpp.
|
||||
Dx = (Length / 2.0) * cos((Feature->Theta / 256.0) * 2.0 * PI - PI);
|
||||
Dy = (Length / 2.0) * sin((Feature->Theta / 256.0) * 2.0 * PI - PI);
|
||||
float x_offset = Dy / 4.0;
|
||||
float y_offset = -Dx / 4.0;
|
||||
|
||||
window->SetCursor(X - Dx, Y - Dy);
|
||||
window->DrawTo(X + Dx, Y + Dy);
|
||||
// Draw another copy of the feature offset perpendicualar to its direction.
|
||||
X += x_offset;
|
||||
Y += y_offset;
|
||||
window->SetCursor(X - Dx, Y - Dy);
|
||||
window->SetCursor(X, Y);
|
||||
window->DrawTo(X + Dx, Y + Dy);
|
||||
} /* RenderIntFeature */
|
||||
|
||||
@ -2047,6 +1958,8 @@ void InitIntMatchWindowIfReqd() {
|
||||
"x", "Class to debug");
|
||||
popup_menu->AddChild("Debug Both", IDA_BOTH,
|
||||
"x", "Class to debug");
|
||||
popup_menu->AddChild("Debug Shape Index", IDA_SHAPE_INDEX,
|
||||
"0", "Index to debug");
|
||||
popup_menu->BuildMenu(IntMatchWindow, false);
|
||||
}
|
||||
}
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include "matchdefs.h"
|
||||
#include "mfoutline.h"
|
||||
#include "protos.h"
|
||||
#include "callcpp.h"
|
||||
#include "scrollview.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
@ -72,11 +71,9 @@
|
||||
* The position of the the bits recorded for each class in the
|
||||
* 4th dimension is determined by using CPrunerWordIndexFor(c),
|
||||
* where c is the corresponding class id. */
|
||||
typedef uinT32 CLASS_PRUNER_STRUCT
|
||||
[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR];
|
||||
|
||||
typedef
|
||||
uinT32 (*CLASS_PRUNER)[NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR];
|
||||
struct CLASS_PRUNER_STRUCT {
|
||||
uinT32 p[NUM_CP_BUCKETS][NUM_CP_BUCKETS][NUM_CP_BUCKETS][WERDS_PER_CP_VECTOR];
|
||||
};
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@ -103,86 +100,6 @@ PROTO_SET_STRUCT, *PROTO_SET;
|
||||
|
||||
typedef uinT32 CONFIG_PRUNER[NUM_PP_PARAMS][NUM_PP_BUCKETS][4];
|
||||
|
||||
// Struct for information about spacing between characters in a particular font.
|
||||
struct FontSpacingInfo {
|
||||
inT16 x_gap_before;
|
||||
inT16 x_gap_after;
|
||||
GenericVector<UNICHAR_ID> kerned_unichar_ids;
|
||||
GenericVector<inT16> kerned_x_gaps;
|
||||
};
|
||||
|
||||
/*
|
||||
* font_properties contains properties about boldness, italicness, fixed pitch,
|
||||
* serif, fraktur
|
||||
*/
|
||||
struct FontInfo {
|
||||
FontInfo() : name(NULL), spacing_vec(NULL) {}
|
||||
~FontInfo() {}
|
||||
// Reserves unicharset_size spots in spacing_vec.
|
||||
void init_spacing(int unicharset_size) {
|
||||
spacing_vec = new GenericVector<FontSpacingInfo *>();
|
||||
spacing_vec->init_to_size(unicharset_size, NULL);
|
||||
}
|
||||
// Adds the given pointer to FontSpacingInfo to spacing_vec member
|
||||
// (FontInfo class takes ownership of the pointer).
|
||||
// Note: init_spacing should be called before calling this function.
|
||||
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
|
||||
ASSERT_HOST(spacing_vec != NULL && spacing_vec->size() > uch_id);
|
||||
(*spacing_vec)[uch_id] = spacing_info;
|
||||
}
|
||||
|
||||
// Returns the pointer to FontSpacingInfo for the given UNICHAR_ID.
|
||||
const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
|
||||
return (spacing_vec == NULL || spacing_vec->size() <= uch_id) ?
|
||||
NULL : (*spacing_vec)[uch_id];
|
||||
}
|
||||
|
||||
// Fills spacing with the value of the x gap expected between the two given
|
||||
// UNICHAR_IDs. Returns true on success.
|
||||
bool get_spacing(UNICHAR_ID prev_uch_id,
|
||||
UNICHAR_ID uch_id,
|
||||
int *spacing) const {
|
||||
const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
|
||||
const FontSpacingInfo *fsi = this->get_spacing(uch_id);
|
||||
if (prev_fsi == NULL || fsi == NULL) return false;
|
||||
int i = 0;
|
||||
for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
|
||||
if (prev_fsi->kerned_unichar_ids[i] == uch_id) break;
|
||||
}
|
||||
if (i < prev_fsi->kerned_unichar_ids.size()) {
|
||||
*spacing = prev_fsi->kerned_x_gaps[i];
|
||||
} else {
|
||||
*spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool is_italic() const { return properties & 1; }
|
||||
bool is_bold() const { return (properties & 2) != 0; }
|
||||
bool is_fixed_pitch() const { return (properties & 4) != 0; }
|
||||
bool is_serif() const { return (properties & 8) != 0; }
|
||||
bool is_fraktur() const { return (properties & 16) != 0; }
|
||||
|
||||
char* name;
|
||||
uinT32 properties;
|
||||
// Horizontal spacing between characters (indexed by UNICHAR_ID).
|
||||
GenericVector<FontSpacingInfo *> *spacing_vec;
|
||||
};
|
||||
|
||||
// Every class (character) owns a FontSet that represents all the fonts that can
|
||||
// render this character.
|
||||
// Since almost all the characters from the same script share the same set of
|
||||
// fonts, the sets are shared over multiple classes (see
|
||||
// Classify::fontset_table_). Thus, a class only store an id to a set.
|
||||
// Because some fonts cannot render just one character of a set, there are a
|
||||
// lot of FontSet that differ only by one font. Rather than storing directly
|
||||
// the FontInfo in the FontSet structure, it's better to share FontInfos among
|
||||
// FontSets (Classify::fontinfo_table_).
|
||||
struct FontSet {
|
||||
int size;
|
||||
int* configs; // FontInfo ids
|
||||
};
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@ -203,7 +120,7 @@ typedef struct
|
||||
int NumClasses;
|
||||
int NumClassPruners;
|
||||
INT_CLASS Class[MAX_NUM_CLASSES];
|
||||
CLASS_PRUNER ClassPruner[MAX_NUM_CLASS_PRUNERS];
|
||||
CLASS_PRUNER_STRUCT* ClassPruners[MAX_NUM_CLASS_PRUNERS];
|
||||
}
|
||||
|
||||
|
||||
@ -232,6 +149,7 @@ typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES];
|
||||
enum IntmatcherDebugAction {
|
||||
IDA_ADAPTIVE,
|
||||
IDA_STATIC,
|
||||
IDA_SHAPE_INDEX,
|
||||
IDA_BOTH
|
||||
};
|
||||
|
||||
@ -255,7 +173,7 @@ enum IntmatcherDebugAction {
|
||||
#define ClassForClassId(T,c) ((T)->Class[c])
|
||||
#define ClassPrunersFor(T) ((T)->ClassPruner)
|
||||
#define CPrunerIdFor(c) ((c) / CLASSES_PER_CP)
|
||||
#define CPrunerFor(T,c) ((T)->ClassPruner [CPrunerIdFor (c)])
|
||||
#define CPrunerFor(T,c) ((T)->ClassPruners[CPrunerIdFor(c)])
|
||||
#define CPrunerWordIndexFor(c) (((c) % CLASSES_PER_CP) / CLASSES_PER_CP_WERD)
|
||||
#define CPrunerBitIndexFor(c) (((c) % CLASSES_PER_CP) % CLASSES_PER_CP_WERD)
|
||||
#define CPrunerMaskFor(L,c) (((L)+1) << CPrunerBitIndexFor (c) * NUM_BITS_PER_CLASS)
|
||||
@ -300,7 +218,7 @@ void UpdateMatchDisplay();
|
||||
|
||||
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class);
|
||||
|
||||
void DisplayIntFeature(INT_FEATURE Feature, FLOAT32 Evidence);
|
||||
void DisplayIntFeature(const INT_FEATURE_STRUCT* Feature, FLOAT32 Evidence);
|
||||
|
||||
void DisplayIntProto(INT_CLASS Class, PROTO_ID ProtoId, FLOAT32 Evidence);
|
||||
|
||||
|
967
classify/mastertrainer.cpp
Normal file
967
classify/mastertrainer.cpp
Normal file
@ -0,0 +1,967 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: mastertrainer.cpp
|
||||
// Description: Trainer to build the MasterClassifier.
|
||||
// Author: Ray Smith
|
||||
// Created: Wed Nov 03 18:10:01 PDT 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "mastertrainer.h"
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include "allheaders.h"
|
||||
#include "boxread.h"
|
||||
#include "classify.h"
|
||||
#include "errorcounter.h"
|
||||
#include "featdefs.h"
|
||||
#include "sampleiterator.h"
|
||||
#include "shapeclassifier.h"
|
||||
#include "shapetable.h"
|
||||
#include "svmnode.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Constants controlling clustering. With a low kMinClusteredShapes and a high
|
||||
// kMaxUnicharsPerCluster, then kFontMergeDistance is the only limiting factor.
|
||||
// Min number of shapes in the output.
|
||||
const int kMinClusteredShapes = 1;
|
||||
// Max number of unichars in any individual cluster.
|
||||
const int kMaxUnicharsPerCluster = 2000;
|
||||
// Mean font distance below which to merge fonts and unichars.
|
||||
const float kFontMergeDistance = 0.025;
|
||||
|
||||
MasterTrainer::MasterTrainer(NormalizationMode norm_mode,
|
||||
bool shape_analysis,
|
||||
bool replicate_samples,
|
||||
int debug_level)
|
||||
: norm_mode_(norm_mode), samples_(fontinfo_table_),
|
||||
junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
|
||||
charsetsize_(0),
|
||||
enable_shape_anaylsis_(shape_analysis),
|
||||
enable_replication_(replicate_samples),
|
||||
fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
|
||||
fontinfo_table_.set_compare_callback(
|
||||
NewPermanentTessCallback(CompareFontInfo));
|
||||
fontinfo_table_.set_clear_callback(
|
||||
NewPermanentTessCallback(FontInfoDeleteCallback));
|
||||
}
|
||||
|
||||
MasterTrainer::~MasterTrainer() {
|
||||
delete [] fragments_;
|
||||
for (int p = 0; p < page_images_.size(); ++p)
|
||||
pixDestroy(&page_images_[p]);
|
||||
}
|
||||
|
||||
// WARNING! Serialize/DeSerialize are only partial, providing
// enough data to get the samples back and display them.
// Writes to the given file. Returns false in case of error.
// The write order here must exactly mirror the read order in DeSerialize.
bool MasterTrainer::Serialize(FILE* fp) const {
  if (fwrite(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
  if (!unicharset_.save_to_file(fp)) return false;
  if (!feature_space_.Serialize(fp)) return false;
  if (!samples_.Serialize(fp)) return false;
  if (!junk_samples_.Serialize(fp)) return false;
  if (!verify_samples_.Serialize(fp)) return false;
  if (!master_shapes_.Serialize(fp)) return false;
  if (!flat_shapes_.Serialize(fp)) return false;
  // The fontinfo table is written twice: basic info, then spacing info,
  // matching the two reads in DeSerialize.
  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_info)))
    return false;
  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_spacing_info)))
    return false;
  if (!xheights_.Serialize(fp)) return false;
  return true;
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// The read order must exactly mirror the write order in Serialize.
bool MasterTrainer::DeSerialize(bool swap, FILE* fp) {
  if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
  if (swap) {
    ReverseN(&norm_mode_, sizeof(norm_mode_));
  }
  if (!unicharset_.load_from_file(fp)) return false;
  charsetsize_ = unicharset_.size();
  if (!feature_space_.DeSerialize(swap, fp)) return false;
  // The feature map is derived from the feature space, so it is rebuilt
  // here rather than being serialized itself.
  feature_map_.Init(feature_space_);
  if (!samples_.DeSerialize(swap, fp)) return false;
  if (!junk_samples_.DeSerialize(swap, fp)) return false;
  if (!verify_samples_.DeSerialize(swap, fp)) return false;
  if (!master_shapes_.DeSerialize(swap, fp)) return false;
  if (!flat_shapes_.DeSerialize(swap, fp)) return false;
  // Two reads of the fontinfo table: basic info, then spacing info.
  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap))
    return false;
  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_spacing_info),
                            swap))
    return false;
  if (!xheights_.DeSerialize(swap, fp)) return false;
  return true;
}
|
||||
|
||||
// Load an initial unicharset, or set one up if the file cannot be read.
|
||||
void MasterTrainer::LoadUnicharset(const char* filename) {
|
||||
if (!unicharset_.load_from_file(filename)) {
|
||||
tprintf("Failed to load unicharset from file %s\n"
|
||||
"Building unicharset for training from scratch...\n",
|
||||
filename);
|
||||
unicharset_.clear();
|
||||
// Space character needed to represent NIL_LIST classification.
|
||||
unicharset_.unichar_insert(" ");
|
||||
}
|
||||
charsetsize_ = unicharset_.size();
|
||||
delete [] fragments_;
|
||||
fragments_ = new int[charsetsize_];
|
||||
memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
|
||||
samples_.LoadUnicharset(filename);
|
||||
junk_samples_.LoadUnicharset(filename);
|
||||
verify_samples_.LoadUnicharset(filename);
|
||||
}
|
||||
|
||||
// Reads the samples and their features from the given .tr format file,
// adding them to the trainer with the font_id from the content of the file.
// See mftraining.cpp for a description of the file format.
// If verification, then these are verification samples, not training.
void MasterTrainer::ReadTrainingSamples(FILE *fp,
                                        const FEATURE_DEFS_STRUCT& feature_defs,
                                        bool verification) {
  char buffer[2048];
  // Resolve the feature-type indices once, outside the read loop.
  int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
  int micro_feature_type = ShortNameToFeatureType(feature_defs,
                                                  kMicroFeatureType);
  int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
  int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);

  while (fgets(buffer, sizeof(buffer), fp) != NULL) {
    if (buffer[0] == '\n')
      continue;

    // Each sample begins with a header line: "<fontname> <box-file string>".
    char* space = strchr(buffer, ' ');
    if (space == NULL) {
      tprintf("Bad format in tr file, reading fontname, unichar\n");
      continue;
    }
    *space++ = '\0';
    int font_id = GetFontInfoId(buffer);
    int page_number;
    STRING unichar;
    TBOX bounding_box;
    if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
      tprintf("Bad format in tr file, reading box coords\n");
      continue;
    }
    CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
    TrainingSample* sample = new TrainingSample;
    sample->set_font_id(font_id);
    // Offset page numbers by the images already loaded so multiple
    // tr/tif file pairs can be concatenated.
    sample->set_page_num(page_number + page_images_.size());
    sample->set_bounding_box(bounding_box);
    sample->ExtractCharDesc(int_feature_type, micro_feature_type,
                            cn_feature_type, geo_feature_type, char_desc);
    // AddSample takes ownership of sample; char_desc is only borrowed and
    // must be freed here.
    AddSample(verification, unichar.string(), sample);
    FreeCharDescription(char_desc);
  }
  charsetsize_ = unicharset_.size();
}
|
||||
|
||||
// Adds the given single sample to the trainer, setting the classid
// appropriately from the given unichar_str. Takes ownership of sample.
// Samples whose unichar is not in the main unicharset go to junk_samples_.
// fragments_ tracks, per main-charset class id, whether the class has only
// ever been seen as a natural fragment: 0 = no fragment seen yet, >0 = the
// junk id of its (single) observed fragment, -1 = seen whole or with
// conflicting fragments (so it must not be replaced by fragments).
void MasterTrainer::AddSample(bool verification, const char* unichar,
                              TrainingSample* sample) {
  if (verification) {
    verify_samples_.AddSample(unichar, sample);
    prev_unichar_id_ = -1;
  } else if (unicharset_.contains_unichar(unichar)) {
    // A whole (non-fragment) occurrence: the previous class can no longer
    // be a purely-fragmented one.
    if (prev_unichar_id_ >= 0)
      fragments_[prev_unichar_id_] = -1;
    prev_unichar_id_ = samples_.AddSample(unichar, sample);
    // Maintain one flat shape per (class, font) combination.
    if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
      flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
  } else {
    int junk_id = junk_samples_.AddSample(unichar, sample);
    if (prev_unichar_id_ >= 0) {
      CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(unichar);
      if (frag != NULL && frag->is_natural()) {
        // Record the fragment id for the previous class, or invalidate it
        // if a different fragment was already recorded.
        if (fragments_[prev_unichar_id_] == 0)
          fragments_[prev_unichar_id_] = junk_id;
        else if (fragments_[prev_unichar_id_] != junk_id)
          fragments_[prev_unichar_id_] = -1;
      }
      delete frag;
    }
    prev_unichar_id_ = -1;
  }
}
|
||||
|
||||
// Loads all pages from the given tif filename and append to page_images_.
|
||||
// Must be called after ReadTrainingSamples, as the current number of images
|
||||
// is used as an offset for page numbers in the samples.
|
||||
void MasterTrainer::LoadPageImages(const char* filename) {
|
||||
int page;
|
||||
Pix* pix;
|
||||
for (page = 0; (pix = pixReadTiff(filename, page)) != NULL; ++page) {
|
||||
page_images_.push_back(pix);
|
||||
}
|
||||
tprintf("Loaded %d page images from %s\n", page, filename);
|
||||
}
|
||||
|
||||
// Cleans up the samples after initial load from the tr files, and prior to
// saving the MasterTrainer:
// Remaps fragmented chars if running shape anaylsis.
// Sets up the samples appropriately for class/fontwise access.
// Deletes outlier samples.
void MasterTrainer::PostLoadCleanup() {
  if (debug_level_ > 0)
    tprintf("PostLoadCleanup...\n");
  if (enable_shape_anaylsis_)
    ReplaceFragmentedSamples();
  // Normalize the verification set and organize it for font/class access.
  SampleIterator sample_it;
  sample_it.Init(NULL, NULL, true, &verify_samples_);
  sample_it.NormalizeSamples();
  verify_samples_.OrganizeByFontAndClass();

  // Index features before canonical-sample computation, which uses them.
  samples_.IndexFeatures(feature_space_);
  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
  // against current training.
  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
  samples_.OrganizeByFontAndClass();
  if (debug_level_ > 0)
    tprintf("ComputeCanonicalSamples...\n");
  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
}
|
||||
|
||||
// Gets the samples ready for training. Use after both
// ReadTrainingSamples+PostLoadCleanup or DeSerialize.
// Re-indexes the features and computes canonical and cloud features.
void MasterTrainer::PreTrainingSetup() {
  if (debug_level_ > 0)
    tprintf("PreTrainingSetup...\n");
  // Same indexing step as PostLoadCleanup; needed again here because this
  // may be entered straight after DeSerialize.
  samples_.IndexFeatures(feature_space_);
  samples_.ComputeCanonicalFeatures();
  if (debug_level_ > 0)
    tprintf("ComputeCloudFeatures...\n");
  samples_.ComputeCloudFeatures(feature_space_.Size());
}
|
||||
|
||||
// Sets up the master_shapes_ table, which tells which fonts should stay
// together until they get to a leaf node classifier.
void MasterTrainer::SetupMasterShapes() {
  tprintf("Building master shape table\n");
  int num_fonts = samples_.NumFonts();

  // Shapes are accumulated in three pools: whole characters (plus middle
  // fragments), beginning fragments, and ending fragments.
  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
  ShapeTable char_shapes_end_fragment(samples_.unicharset());
  ShapeTable char_shapes(samples_.unicharset());
  for (int c = 0; c < samples_.charsetsize(); ++c) {
    // Phase 1: within each character class, cluster the fonts that have
    // samples of it (max 1 unichar per cluster at this stage).
    ShapeTable shapes(samples_.unicharset());
    for (int f = 0; f < num_fonts; ++f) {
      if (samples_.NumClassSamples(f, c, true) > 0)
        shapes.AddShape(c, f);
    }
    ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);

    const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);

    // Route the clustered shapes to the appropriate pool.
    if (fragment == NULL)
      char_shapes.AppendMasterShapes(shapes);
    else if (fragment->is_beginning())
      char_shapes_begin_fragment.AppendMasterShapes(shapes);
    else if (fragment->is_ending())
      char_shapes_end_fragment.AppendMasterShapes(shapes);
    else
      char_shapes.AppendMasterShapes(shapes);
  }
  // Phase 2: cluster across characters within each pool, merging the two
  // fragment pools into char_shapes before the final clustering pass.
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes_begin_fragment);
  char_shapes.AppendMasterShapes(char_shapes_begin_fragment);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes_end_fragment);
  char_shapes.AppendMasterShapes(char_shapes_end_fragment);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes);
  master_shapes_.AppendMasterShapes(char_shapes);
  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
}
|
||||
|
||||
// Adds the junk_samples_ to the main samples_ set. Junk samples are initially
// fragments and n-grams (all incorrectly segmented characters).
// Various training functions may result in incorrectly segmented characters
// being added to the unicharset of the main samples, perhaps because they
// form a "radical" decomposition of some (Indic) grapheme, or because they
// just look the same as a real character (like rn/m)
// This function moves all the junk samples, to the main samples_ set, but
// desirable junk, being any sample for which the unichar already exists in
// the samples_ unicharset gets the unichar-ids re-indexed to match, but
// anything else gets re-marked as unichar_id 0 (space character) to identify
// it as junk to the error counter.
void MasterTrainer::IncludeJunk() {
  // Get ids of fragments in junk_samples_ that replace the dead chars.
  const UNICHARSET& junk_set = junk_samples_.unicharset();
  const UNICHARSET& sample_set = samples_.unicharset();
  int num_junks = junk_samples_.num_samples();
  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
  for (int s = 0; s < num_junks; ++s) {
    TrainingSample* sample = junk_samples_.mutable_sample(s);
    // Re-index the junk class id into the main sample set's unicharset.
    int junk_id = sample->class_id();
    const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
    int sample_id = sample_set.unichar_to_id(junk_utf8);
    if (sample_id == INVALID_UNICHAR_ID)
      sample_id = 0;  // True junk: marked as space for the error counter.
    sample->set_class_id(sample_id);
    // Transfer ownership of the sample from junk_samples_ to samples_.
    junk_samples_.extract_sample(s);
    samples_.AddSample(sample_id, sample);
  }
  // Remove the now-empty slots left by extract_sample, then rebuild the
  // font/class organization of the enlarged main set.
  junk_samples_.DeleteDeadSamples();
  samples_.OrganizeByFontAndClass();
}
|
||||
|
||||
// Replicates the samples and perturbs them if the enable_replication_ flag
|
||||
// is set. MUST be used after the last call to OrganizeByFontAndClass on
|
||||
// the training samples, ie after IncludeJunk if it is going to be used, as
|
||||
// OrganizeByFontAndClass will eat the replicated samples into the regular
|
||||
// samples.
|
||||
void MasterTrainer::ReplicateAndRandomizeSamplesIfRequired() {
|
||||
if (enable_replication_) {
|
||||
if (debug_level_ > 0)
|
||||
tprintf("ReplicateAndRandomize...\n");
|
||||
verify_samples_.ReplicateAndRandomizeSamples();
|
||||
samples_.ReplicateAndRandomizeSamples();
|
||||
samples_.IndexFeatures(feature_space_);
|
||||
}
|
||||
}
|
||||
|
||||
// Loads the basic font properties file into fontinfo_table_.
|
||||
// Returns false on failure.
|
||||
bool MasterTrainer::LoadFontInfo(const char* filename) {
|
||||
FILE* fp = fopen(filename, "rb");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "Failed to load font_properties from %s\n", filename);
|
||||
return false;
|
||||
}
|
||||
int italic, bold, fixed, serif, fraktur;
|
||||
while (!feof(fp)) {
|
||||
FontInfo fontinfo;
|
||||
char* font_name = new char[1024];
|
||||
fontinfo.name = font_name;
|
||||
fontinfo.properties = 0;
|
||||
fontinfo.universal_id = 0;
|
||||
if (fscanf(fp, "%1024s %i %i %i %i %i\n", font_name,
|
||||
&italic, &bold, &fixed, &serif, &fraktur) != 6)
|
||||
continue;
|
||||
fontinfo.properties =
|
||||
(italic << 0) +
|
||||
(bold << 1) +
|
||||
(fixed << 2) +
|
||||
(serif << 3) +
|
||||
(fraktur << 4);
|
||||
if (!fontinfo_table_.contains(fontinfo)) {
|
||||
fontinfo_table_.push_back(fontinfo);
|
||||
}
|
||||
}
|
||||
fclose(fp);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Loads the xheight font properties file into xheights_.
|
||||
// Returns false on failure.
|
||||
bool MasterTrainer::LoadXHeights(const char* filename) {
|
||||
tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
|
||||
xheights_.init_to_size(fontinfo_table_.size(), -1);
|
||||
if (filename == NULL) return true;
|
||||
FILE *f = fopen(filename, "rb");
|
||||
if (f == NULL) {
|
||||
fprintf(stderr, "Failed to load font xheights from %s\n", filename);
|
||||
return false;
|
||||
}
|
||||
tprintf("Reading x-heights from %s ...\n", filename);
|
||||
FontInfo fontinfo;
|
||||
fontinfo.properties = 0; // Not used to lookup in the table.
|
||||
fontinfo.universal_id = 0;
|
||||
char buffer[1024];
|
||||
int xht;
|
||||
int total_xheight = 0;
|
||||
int xheight_count = 0;
|
||||
while (!feof(f)) {
|
||||
if (fscanf(f, "%1024s %d\n", buffer, &xht) != 2)
|
||||
continue;
|
||||
fontinfo.name = buffer;
|
||||
if (!fontinfo_table_.contains(fontinfo)) continue;
|
||||
int fontinfo_id = fontinfo_table_.get_id(fontinfo);
|
||||
xheights_[fontinfo_id] = xht;
|
||||
total_xheight += xht;
|
||||
++xheight_count;
|
||||
}
|
||||
if (xheight_count == 0) {
|
||||
fprintf(stderr, "No valid xheights in %s!\n", filename);
|
||||
return false;
|
||||
}
|
||||
int mean_xheight = DivRounded(total_xheight, xheight_count);
|
||||
for (int i = 0; i < fontinfo_table_.size(); ++i) {
|
||||
if (xheights_[i] < 0)
|
||||
xheights_[i] = mean_xheight;
|
||||
}
|
||||
return true;
|
||||
} // LoadXHeights
|
||||
|
||||
// Reads spacing stats from filename and adds them to fontinfo_table.
// Returns true if the file is simply missing (silently ignored) or parses
// cleanly; false on a font-matching or format error.
bool MasterTrainer::AddSpacingInfo(const char *filename) {
  FILE* fontinfo_file = fopen(filename, "rb");
  if (fontinfo_file == NULL)
    return true;  // We silently ignore missing files!
  // Find the fontinfo_id.
  int fontinfo_id = GetBestMatchingFontInfoId(filename);
  if (fontinfo_id < 0) {
    tprintf("No font found matching fontinfo filename %s\n", filename);
    fclose(fontinfo_file);
    return false;
  }
  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
  // TODO(rays) scale should probably be a double, but keep as an int for now
  // to duplicate current behavior.
  // NOTE(review): divides by the font's x-height; assumes it is non-zero —
  // confirm LoadXHeights can never leave a 0 entry.
  int scale = kBlnXHeight / xheights_[fontinfo_id];
  int num_unichars;
  char uch[UNICHAR_LEN];
  char kerned_uch[UNICHAR_LEN];
  int x_gap, x_gap_before, x_gap_after, num_kerned;
  ASSERT_HOST(fscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
  FontInfo *fi = fontinfo_table_.get_mutable(fontinfo_id);
  fi->init_spacing(unicharset_.size());
  FontSpacingInfo *spacing = NULL;
  for (int l = 0; l < num_unichars; ++l) {
    // NOTE(review): %s is unbounded; a token longer than UNICHAR_LEN-1
    // overflows uch/kerned_uch — consider adding a field width.
    if (fscanf(fontinfo_file, "%s %d %d %d",
               uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
      tprintf("Bad format of font spacing file %s\n", filename);
      fclose(fontinfo_file);
      return false;
    }
    // Only unichars present in the charset get a FontSpacingInfo; spacing
    // values are scaled into baseline-normalized coordinates.
    bool valid = unicharset_.contains_unichar(uch);
    if (valid) {
      spacing = new FontSpacingInfo();
      spacing->x_gap_before = static_cast<inT16>(x_gap_before * scale);
      spacing->x_gap_after = static_cast<inT16>(x_gap_after * scale);
    }
    // The kerning entries must still be consumed even when uch is invalid,
    // to stay in sync with the file format.
    for (int k = 0; k < num_kerned; ++k) {
      if (fscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
        tprintf("Bad format of font spacing file %s\n", filename);
        fclose(fontinfo_file);
        // NOTE(review): 'spacing' is leaked on this path when valid is true.
        return false;
      }
      if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
      spacing->kerned_unichar_ids.push_back(
          unicharset_.unichar_to_id(kerned_uch));
      spacing->kerned_x_gaps.push_back(static_cast<inT16>(x_gap * scale));
    }
    // add_spacing takes ownership of spacing.
    if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
  }
  fclose(fontinfo_file);
  return true;
}
|
||||
|
||||
// Returns the font id corresponding to the given font name.
|
||||
// Returns -1 if the font cannot be found.
|
||||
int MasterTrainer::GetFontInfoId(const char* font_name) {
|
||||
FontInfo fontinfo;
|
||||
// We are only borrowing the string, so it is OK to const cast it.
|
||||
fontinfo.name = const_cast<char*>(font_name);
|
||||
fontinfo.properties = 0; // Not used to lookup in the table
|
||||
fontinfo.universal_id = 0;
|
||||
if (!fontinfo_table_.contains(fontinfo)) {
|
||||
return -1;
|
||||
} else {
|
||||
return fontinfo_table_.get_id(fontinfo);
|
||||
}
|
||||
}
|
||||
// Returns the font_id of the closest matching font name to the given
|
||||
// filename. It is assumed that a substring of the filename will match
|
||||
// one of the fonts. If more than one is matched, the longest is returned.
|
||||
int MasterTrainer::GetBestMatchingFontInfoId(const char* filename) {
|
||||
int fontinfo_id = -1;
|
||||
int best_len = 0;
|
||||
for (int f = 0; f < fontinfo_table_.size(); ++f) {
|
||||
if (strstr(filename, fontinfo_table_.get(f).name) != NULL) {
|
||||
int len = strlen(fontinfo_table_.get(f).name);
|
||||
// Use the longest matching length in case a substring of a font matched.
|
||||
if (len > best_len) {
|
||||
best_len = len;
|
||||
fontinfo_id = f;
|
||||
}
|
||||
}
|
||||
}
|
||||
return fontinfo_id;
|
||||
}
|
||||
|
||||
// Sets up a flat shapetable with one shape per class/font combination.
void MasterTrainer::SetupFlatShapeTable(ShapeTable* shape_table) {
  // To exactly mimic the results of the previous implementation, the shapes
  // must be clustered in order the fonts arrived, and reverse order of the
  // characters within each font.
  // Get a list of the fonts in the order they appeared.
  GenericVector<int> active_fonts;
  int num_shapes = flat_shapes_.NumShapes();
  for (int s = 0; s < num_shapes; ++s) {
    // Each flat shape holds a single (unichar, font) pair, so font_ids[0]
    // is its only font.
    int font = flat_shapes_.GetShape(s)[0].font_ids[0];
    int f = 0;
    for (f = 0; f < active_fonts.size(); ++f) {
      if (active_fonts[f] == font)
        break;
    }
    if (f == active_fonts.size())
      active_fonts.push_back(font);
  }
  // For each font in order, add all the shapes with that font in reverse order.
  int num_fonts = active_fonts.size();
  for (int f = 0; f < num_fonts; ++f) {
    for (int s = num_shapes - 1; s >= 0; --s) {
      int font = flat_shapes_.GetShape(s)[0].font_ids[0];
      if (font == active_fonts[f]) {
        shape_table->AddShape(flat_shapes_.GetShape(s));
      }
    }
  }
}
|
||||
|
||||
// Sets up a Clusterer for mftraining on a single shape_id.
// Call FreeClusterer on the return value after use.
// The clusterer is loaded with the micro-features of every sample in the
// given shape; *num_samples receives the number of samples used.
CLUSTERER* MasterTrainer::SetupForClustering(
    const ShapeTable& shape_table,
    const FEATURE_DEFS_STRUCT& feature_defs,
    int shape_id,
    int* num_samples) {

  int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
  ASSERT_HOST(num_params == MFCount);
  CLUSTERER* clusterer = MakeClusterer(
      num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);

  // We want to iterate over the samples of just the one shape.
  IndexMapBiDi shape_map;
  shape_map.Init(shape_table.NumShapes(), false);
  shape_map.SetMap(shape_id, true);
  shape_map.Setup();
  // Reverse the order of the samples to match the previous behavior.
  GenericVector<const TrainingSample*> sample_ptrs;
  SampleIterator it;
  it.Init(&shape_map, &shape_table, false, &samples_);
  for (it.Begin(); !it.AtEnd(); it.Next()) {
    sample_ptrs.push_back(&it.GetSample());
  }
  int sample_id = 0;
  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
    const TrainingSample* sample = sample_ptrs[i];
    // Each micro-feature of the sample becomes one clusterer data point,
    // all tagged with the same sample_id.
    int num_features = sample->num_micro_features();
    for (int f = 0; f < num_features; ++f)
      MakeSample(clusterer, sample->micro_features()[f], sample_id);
    ++sample_id;
  }
  *num_samples = sample_id;
  return clusterer;
}
|
||||
|
||||
// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
|
||||
// to the given inttemp_file, and the corresponding pffmtable.
|
||||
// The unicharset is the original encoding of graphemes, and shape_set should
|
||||
// match the size of the shape_table, and may possibly be totally fake.
|
||||
void MasterTrainer::WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
|
||||
const UNICHARSET& shape_set,
|
||||
const ShapeTable& shape_table,
|
||||
CLASS_STRUCT* float_classes,
|
||||
const char* inttemp_file,
|
||||
const char* pffmtable_file) {
|
||||
tesseract::Classify *classify = new tesseract::Classify();
|
||||
// Move the fontinfo table to classify.
|
||||
classify->get_fontinfo_table().move(&fontinfo_table_);
|
||||
INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
|
||||
shape_set);
|
||||
FILE* fp = fopen(inttemp_file, "wb");
|
||||
classify->WriteIntTemplates(fp, int_templates, shape_set);
|
||||
fclose(fp);
|
||||
// Now write pffmtable. This is complicated by the fact that the adaptive
|
||||
// classifier still wants one indexed by unichar-id, but the static
|
||||
// classifier needs one indexed by its shape class id.
|
||||
// We put the shapetable_cutoffs in a GenericVector, and compute the
|
||||
// unicharset cutoffs along the way.
|
||||
GenericVector<uinT16> shapetable_cutoffs;
|
||||
GenericVector<uinT16> unichar_cutoffs;
|
||||
for (int c = 0; c < unicharset.size(); ++c)
|
||||
unichar_cutoffs.push_back(0);
|
||||
/* then write out each class */
|
||||
for (int i = 0; i < int_templates->NumClasses; ++i) {
|
||||
INT_CLASS Class = ClassForClassId(int_templates, i);
|
||||
// Todo: Test with min instead of max
|
||||
// int MaxLength = LengthForConfigId(Class, 0);
|
||||
uinT16 max_length = 0;
|
||||
for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
|
||||
// Todo: Test with min instead of max
|
||||
// if (LengthForConfigId (Class, config_id) < MaxLength)
|
||||
uinT16 length = Class->ConfigLengths[config_id];
|
||||
if (length > max_length)
|
||||
max_length = Class->ConfigLengths[config_id];
|
||||
int shape_id = float_classes[i].font_set.get(config_id);
|
||||
const Shape& shape = shape_table.GetShape(shape_id);
|
||||
for (int c = 0; c < shape.size(); ++c) {
|
||||
int unichar_id = shape[c].unichar_id;
|
||||
if (length > unichar_cutoffs[unichar_id])
|
||||
unichar_cutoffs[unichar_id] = length;
|
||||
}
|
||||
}
|
||||
shapetable_cutoffs.push_back(max_length);
|
||||
}
|
||||
fp = fopen(pffmtable_file, "wb");
|
||||
shapetable_cutoffs.Serialize(fp);
|
||||
for (int c = 0; c < unicharset.size(); ++c) {
|
||||
const char *unichar = unicharset.id_to_unichar(c);
|
||||
if (strcmp(unichar, " ") == 0) {
|
||||
unichar = "NULL";
|
||||
}
|
||||
fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
|
||||
}
|
||||
fclose(fp);
|
||||
free_int_templates(int_templates);
|
||||
}
|
||||
|
||||
// Generate debug output relating to the canonical distance between the
// two given UTF8 grapheme strings.
// Prints a matrix of cluster distances between every pair of fonts that
// have samples of the two classes.
void MasterTrainer::DebugCanonical(const char* unichar_str1,
                                   const char* unichar_str2) {
  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
  // If the second string is unknown, compare the first class with itself.
  if (class_id2 == INVALID_UNICHAR_ID)
    class_id2 = class_id1;
  if (class_id1 == INVALID_UNICHAR_ID) {
    tprintf("No unicharset entry found for %s\n", unichar_str1);
    return;
  } else {
    tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
            class_id1, unichar_str1, class_id2, unichar_str2);
  }
  int num_fonts = samples_.NumFonts();
  const IntFeatureMap& feature_map = feature_map_;
  // Iterate the fonts to get the similarity with other fonts of the same
  // class. First print the column headers (font ids with samples of
  // class_id2).
  tprintf("      ");
  for (int f = 0; f < num_fonts; ++f) {
    if (samples_.NumClassSamples(f, class_id2, false) == 0)
      continue;
    tprintf("%6d", f);
  }
  tprintf("\n");
  // One row per font with samples of class_id1.
  for (int f1 = 0; f1 < num_fonts; ++f1) {
    // Map the features of the canonical_sample.
    if (samples_.NumClassSamples(f1, class_id1, false) == 0)
      continue;
    tprintf("%4d  ", f1);
    for (int f2 = 0; f2 < num_fonts; ++f2) {
      if (samples_.NumClassSamples(f2, class_id2, false) == 0)
        continue;
      float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
                                            feature_map);
      tprintf(" %5.3f", dist);
    }
    tprintf("\n");
  }
  // Build a fake ShapeTable containing all the sample types.
  // NOTE(review): this table is built but never used after construction —
  // presumably leftover from removed display code; confirm before relying
  // on it.
  ShapeTable shapes(unicharset_);
  for (int f = 0; f < num_fonts; ++f) {
    if (samples_.NumClassSamples(f, class_id1, true) > 0)
      shapes.AddShape(class_id1, f);
    if (class_id1 != class_id2 &&
        samples_.NumClassSamples(f, class_id2, true) > 0)
      shapes.AddShape(class_id2, f);
  }
}
|
||||
|
||||
// Debugging for cloud/canonical features.
// Displays a Features window containing:
// If unichar_str2 is in the unicharset, and canonical_font is non-negative,
// displays the canonical features of the char/font combination in red.
// If unichar_str1 is in the unicharset, and cloud_font is non-negative,
// displays the cloud feature of the char/font combination in green.
// The canonical features are drawn first to show which ones have no
// matches in the cloud features.
// Until the features window is destroyed, each click in the features window
// will display the samples that have that feature in a separate window.
void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
                                   const char* unichar_str2,
                                   int canonical_font) {
  const IntFeatureMap& feature_map = feature_map_;
  const IntFeatureSpace& feature_space = feature_map.feature_space();
  ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
  ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character,
                          f_window);
  // Draw the canonical sample's features (red) first.
  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
    const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
                                                               class_id2);
    for (int f = 0; f < sample->num_features(); ++f) {
      RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
    }
  }
  // Then overlay the cloud features (green). The cloud is a bit vector over
  // indexed features, so each set bit is mapped back to a feature to draw.
  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
    const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
    for (int f = 0; f < cloud.size(); ++f) {
      if (cloud[f]) {
        INT_FEATURE_STRUCT feature =
            feature_map.InverseIndexFeature(f);
        RenderIntFeature(f_window, &feature, ScrollView::GREEN);
      }
    }
  }
  f_window->Update();
  // Interactive loop: each click shows the samples containing the clicked
  // feature in the s_window, until the features window is destroyed.
  ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
  SVEventType ev_type;
  do {
    SVEvent* ev;
    // Wait until a click or popup event.
    ev = f_window->AwaitEvent(SVET_ANY);
    ev_type = ev->type;
    if (ev_type == SVET_CLICK) {
      int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
      if (feature_index >= 0) {
        // Iterate samples and display those with the feature.
        Shape shape;
        shape.AddToShape(class_id1, cloud_font);
        s_window->Clear();
        samples_.DisplaySamplesWithFeature(feature_index, shape,
                                           feature_space, ScrollView::GREEN,
                                           s_window);
        s_window->Update();
      }
    }
    delete ev;
  } while (ev_type != SVET_DESTROY);
}
|
||||
|
||||
// Tests the given test_classifier on the internal samples.
// See TestClassifier for details.
void MasterTrainer::TestClassifierOnSamples(int report_level,
                                            bool replicate_samples,
                                            ShapeClassifier* test_classifier,
                                            STRING* report_string) {
  // Thin wrapper: run the full test over the main training sample set.
  TestClassifier(report_level, replicate_samples, &samples_,
                 test_classifier, report_string);
}
|
||||
|
||||
// Tests the given test_classifier on the given samples
// report_levels:
// 0 = no output.
// 1 = bottom-line error rate.
// 2 = bottom-line error rate + time.
// 3 = font-level error rate + time.
// 4 = list of all errors + short classifier debug output on 16 errors.
// 5 = list of all errors + short classifier debug output on 25 errors.
// If replicate_samples is true, then the test is run on an extended test
// sample including replicated and systematically perturbed samples.
// If report_string is non-NULL, a summary of the results for each font
// is appended to the report_string.
// Returns the unichar error rate computed by ErrorCounter::ComputeErrorRate.
double MasterTrainer::TestClassifier(int report_level,
                                     bool replicate_samples,
                                     TrainingSampleSet* samples,
                                     ShapeClassifier* test_classifier,
                                     STRING* report_string) {
  // Iterate the samples restricted to the classifier's shape table.
  SampleIterator sample_it;
  sample_it.Init(NULL, test_classifier->GetShapeTable(), replicate_samples,
                 samples);
  if (report_level > 0) {
    // Count the samples for reporting purposes only.
    int num_samples = 0;
    for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
      ++num_samples;
    tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
            sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
            test_classifier->GetShapeTable()->NumShapes(), num_samples);
    tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
  }
  double unichar_error = 0.0;
  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
                                 CT_SHAPE_TOP_ERR, fontinfo_table_,
                                 page_images_, &sample_it, &unichar_error,
                                 NULL, report_string);
  return unichar_error;
}
|
||||
|
||||
// Returns the average (in some sense) distance between the two given
|
||||
// shapes, which may contain multiple fonts and/or unichars.
|
||||
float MasterTrainer::ShapeDistance(const ShapeTable& shapes, int s1, int s2) {
|
||||
const IntFeatureMap& feature_map = feature_map_;
|
||||
const Shape& shape1 = shapes.GetShape(s1);
|
||||
const Shape& shape2 = shapes.GetShape(s2);
|
||||
int num_chars1 = shape1.size();
|
||||
int num_chars2 = shape2.size();
|
||||
float dist_sum = 0.0f;
|
||||
int dist_count = 0;
|
||||
if (num_chars1 > 1 || num_chars2 > 1) {
|
||||
// In the multi-char case try to optimize the calculation by computing
|
||||
// distances between characters of matching font where possible.
|
||||
for (int c1 = 0; c1 < num_chars1; ++c1) {
|
||||
for (int c2 = 0; c2 < num_chars2; ++c2) {
|
||||
dist_sum += samples_.UnicharDistance(shape1[c1], shape2[c2],
|
||||
true, feature_map);
|
||||
++dist_count;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// In the single unichar case, there is little alternative, but to compute
|
||||
// the squared-order distance between pairs of fonts.
|
||||
dist_sum = samples_.UnicharDistance(shape1[0], shape2[0],
|
||||
false, feature_map);
|
||||
++dist_count;
|
||||
}
|
||||
return dist_sum / dist_count;
|
||||
}
|
||||
|
||||
// Replaces samples that are always fragmented with the corresponding
|
||||
// fragment samples.
|
||||
void MasterTrainer::ReplaceFragmentedSamples() {
|
||||
if (fragments_ == NULL) return;
|
||||
// Remove samples that are replaced by fragments. Each class that was
|
||||
// always naturally fragmented should be replaced by its fragments.
|
||||
int num_samples = samples_.num_samples();
|
||||
for (int s = 0; s < num_samples; ++s) {
|
||||
TrainingSample* sample = samples_.mutable_sample(s);
|
||||
if (fragments_[sample->class_id()] > 0)
|
||||
samples_.KillSample(sample);
|
||||
}
|
||||
samples_.DeleteDeadSamples();
|
||||
|
||||
// Get ids of fragments in junk_samples_ that replace the dead chars.
|
||||
const UNICHARSET& frag_set = junk_samples_.unicharset();
|
||||
#if 0
|
||||
// TODO(rays) The original idea was to replace only graphemes that were
|
||||
// always naturally fragmented, but that left a lot of the Indic graphemes
|
||||
// out. Determine whether we can go back to that idea now that spacing
|
||||
// is fixed in the training images, or whether this code is obsolete.
|
||||
bool* good_junk = new bool[frag_set.size()];
|
||||
memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());
|
||||
for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {
|
||||
int frag_ch = fragments_[dead_ch];
|
||||
if (frag_ch <= 0) continue;
|
||||
const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);
|
||||
CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);
|
||||
// Mark the chars for all parts of the fragment as good in good_junk.
|
||||
for (int part = 0; part < frag->get_total(); ++part) {
|
||||
frag->set_pos(part);
|
||||
int good_ch = frag_set.unichar_to_id(frag->to_string().string());
|
||||
if (good_ch != INVALID_UNICHAR_ID)
|
||||
good_junk[good_ch] = true; // We want this one.
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// For now just use all the junk that was from natural fragments.
|
||||
// Get samples of fragments in junk_samples_ that replace the dead chars.
|
||||
int num_junks = junk_samples_.num_samples();
|
||||
for (int s = 0; s < num_junks; ++s) {
|
||||
TrainingSample* sample = junk_samples_.mutable_sample(s);
|
||||
int junk_id = sample->class_id();
|
||||
const char* frag_utf8 = frag_set.id_to_unichar(junk_id);
|
||||
CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);
|
||||
if (frag != NULL && frag->is_natural()) {
|
||||
junk_samples_.extract_sample(s);
|
||||
samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);
|
||||
}
|
||||
}
|
||||
junk_samples_.DeleteDeadSamples();
|
||||
junk_samples_.OrganizeByFontAndClass();
|
||||
samples_.OrganizeByFontAndClass();
|
||||
unicharset_.clear();
|
||||
unicharset_.AppendOtherUnicharset(samples_.unicharset());
|
||||
// delete [] good_junk;
|
||||
// Fragments_ no longer needed?
|
||||
delete [] fragments_;
|
||||
fragments_ = NULL;
|
||||
}
|
||||
|
||||
// Runs a hierarchical agglomerative clustering to merge shapes in the given
|
||||
// shape_table, while satisfying the given constraints:
|
||||
// * End with at least min_shapes left in shape_table,
|
||||
// * No shape shall have more than max_shape_unichars in it,
|
||||
// * Don't merge shapes where the distance between them exceeds max_dist.
|
||||
const float kInfiniteDist = 999.0f;
|
||||
void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,
|
||||
float max_dist, ShapeTable* shapes) {
|
||||
int num_shapes = shapes->NumShapes();
|
||||
int max_merges = num_shapes - min_shapes;
|
||||
GenericVector<ShapeDist>* shape_dists =
|
||||
new GenericVector<ShapeDist>[num_shapes];
|
||||
float min_dist = kInfiniteDist;
|
||||
int min_s1 = 0;
|
||||
int min_s2 = 0;
|
||||
tprintf("Computing shape distances...");
|
||||
for (int s1 = 0; s1 < num_shapes; ++s1) {
|
||||
for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {
|
||||
ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));
|
||||
shape_dists[s1].push_back(dist);
|
||||
if (dist.distance < min_dist) {
|
||||
min_dist = dist.distance;
|
||||
min_s1 = s1;
|
||||
min_s2 = s2;
|
||||
}
|
||||
}
|
||||
tprintf(" %d", s1);
|
||||
}
|
||||
tprintf("\n");
|
||||
int num_merged = 0;
|
||||
while (num_merged < max_merges && min_dist < max_dist) {
|
||||
tprintf("Distance = %f: ", min_dist);
|
||||
int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);
|
||||
shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;
|
||||
if (num_unichars > max_shape_unichars) {
|
||||
tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n",
|
||||
min_s1, min_s2, num_unichars, max_shape_unichars);
|
||||
} else {
|
||||
shapes->MergeShapes(min_s1, min_s2);
|
||||
shape_dists[min_s2].clear();
|
||||
++num_merged;
|
||||
|
||||
for (int s = 0; s < min_s1; ++s) {
|
||||
if (!shape_dists[s].empty()) {
|
||||
shape_dists[s][min_s1 - s - 1].distance =
|
||||
ShapeDistance(*shapes, s, min_s1);
|
||||
shape_dists[s][min_s2 - s -1].distance = kInfiniteDist;
|
||||
}
|
||||
}
|
||||
for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
|
||||
if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist)
|
||||
shape_dists[min_s1][s2 - min_s1 - 1].distance =
|
||||
ShapeDistance(*shapes, min_s1, s2);
|
||||
}
|
||||
for (int s = min_s1 + 1; s < min_s2; ++s) {
|
||||
if (!shape_dists[s].empty()) {
|
||||
shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;
|
||||
}
|
||||
}
|
||||
}
|
||||
min_dist = kInfiniteDist;
|
||||
for (int s1 = 0; s1 < num_shapes; ++s1) {
|
||||
for (int i = 0; i < shape_dists[s1].size(); ++i) {
|
||||
if (shape_dists[s1][i].distance < min_dist) {
|
||||
min_dist = shape_dists[s1][i].distance;
|
||||
min_s1 = s1;
|
||||
min_s2 = s1 + 1 + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist);
|
||||
delete [] shape_dists;
|
||||
if (debug_level_ > 1) {
|
||||
for (int s1 = 0; s1 < num_shapes; ++s1) {
|
||||
if (shapes->MasterDestinationIndex(s1) == s1) {
|
||||
tprintf("Master shape:%s\n", shapes->DebugStr(s1).string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract.
|
296
classify/mastertrainer.h
Normal file
296
classify/mastertrainer.h
Normal file
@ -0,0 +1,296 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: mastertrainer.h
|
||||
// Description: Trainer to build the MasterClassifier.
|
||||
// Author: Ray Smith
|
||||
// Created: Wed Nov 03 18:07:01 PDT 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_TRAINING_MASTERTRAINER_H__
|
||||
#define TESSERACT_TRAINING_MASTERTRAINER_H__
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Include Files and Type Defines
|
||||
----------------------------------------------------------------------------**/
|
||||
#include "classify.h"
|
||||
#include "cluster.h"
|
||||
#include "intfx.h"
|
||||
#include "elst.h"
|
||||
#include "featdefs.h"
|
||||
#include "fontinfo.h"
|
||||
#include "indexmapbidi.h"
|
||||
#include "intfeaturespace.h"
|
||||
#include "intfeaturemap.h"
|
||||
#include "intmatcher.h"
|
||||
#include "params.h"
|
||||
#include "shapetable.h"
|
||||
#include "trainingsample.h"
|
||||
#include "trainingsampleset.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class ShapeClassifier;
|
||||
|
||||
// Simple struct to hold the distance between two shapes during clustering.
|
||||
struct ShapeDist {
|
||||
ShapeDist() : shape1(0), shape2(0), distance(0.0f) {}
|
||||
ShapeDist(int s1, int s2, float dist)
|
||||
: shape1(s1), shape2(s2), distance(dist) {}
|
||||
|
||||
// Sort operator to sort in ascending order of distance.
|
||||
bool operator<(const ShapeDist& other) const {
|
||||
return distance < other.distance;
|
||||
}
|
||||
|
||||
int shape1;
|
||||
int shape2;
|
||||
float distance;
|
||||
};
|
||||
|
||||
// Class to encapsulate training processes that use the TrainingSampleSet.
|
||||
// Initially supports shape clustering and mftrainining.
|
||||
// Other important features of the MasterTrainer are conditioning the data
|
||||
// by outlier elimination, replication with perturbation, and serialization.
|
||||
class MasterTrainer {
|
||||
public:
|
||||
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis,
|
||||
bool replicate_samples, int debug_level);
|
||||
~MasterTrainer();
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
// Loads an initial unicharset, or sets one up if the file cannot be read.
|
||||
void LoadUnicharset(const char* filename);
|
||||
|
||||
// Sets the feature space definition.
|
||||
void SetFeatureSpace(const IntFeatureSpace& fs) {
|
||||
feature_space_ = fs;
|
||||
feature_map_.Init(fs);
|
||||
}
|
||||
|
||||
// Reads the samples and their features from the given file,
|
||||
// adding them to the trainer with the font_id from the content of the file.
|
||||
// If verification, then these are verification samples, not training.
|
||||
void ReadTrainingSamples(FILE *fp,
|
||||
const FEATURE_DEFS_STRUCT& feature_defs,
|
||||
bool verification);
|
||||
|
||||
// Adds the given single sample to the trainer, setting the classid
|
||||
// appropriately from the given unichar_str.
|
||||
void AddSample(bool verification, const char* unichar_str,
|
||||
TrainingSample* sample);
|
||||
|
||||
// Loads all pages from the given tif filename and append to page_images_.
|
||||
// Must be called after ReadTrainingSamples, as the current number of images
|
||||
// is used as an offset for page numbers in the samples.
|
||||
void LoadPageImages(const char* filename);
|
||||
|
||||
// Cleans up the samples after initial load from the tr files, and prior to
|
||||
// saving the MasterTrainer:
|
||||
// Remaps fragmented chars if running shape anaylsis.
|
||||
// Sets up the samples appropriately for class/fontwise access.
|
||||
// Deletes outlier samples.
|
||||
void PostLoadCleanup();
|
||||
|
||||
// Gets the samples ready for training. Use after both
|
||||
// ReadTrainingSamples+PostLoadCleanup or DeSerialize.
|
||||
// Re-indexes the features and computes canonical and cloud features.
|
||||
void PreTrainingSetup();
|
||||
|
||||
// Sets up the master_shapes_ table, which tells which fonts should stay
|
||||
// together until they get to a leaf node classifier.
|
||||
void SetupMasterShapes();
|
||||
|
||||
// Adds the junk_samples_ to the main samples_ set. Junk samples are initially
|
||||
// fragments and n-grams (all incorrectly segmented characters).
|
||||
// Various training functions may result in incorrectly segmented characters
|
||||
// being added to the unicharset of the main samples, perhaps because they
|
||||
// form a "radical" decomposition of some (Indic) grapheme, or because they
|
||||
// just look the same as a real character (like rn/m)
|
||||
// This function moves all the junk samples, to the main samples_ set, but
|
||||
// desirable junk, being any sample for which the unichar already exists in
|
||||
// the samples_ unicharset gets the unichar-ids re-indexed to match, but
|
||||
// anything else gets re-marked as unichar_id 0 (space character) to identify
|
||||
// it as junk to the error counter.
|
||||
void IncludeJunk();
|
||||
|
||||
// Replicates the samples and perturbs them if the enable_replication_ flag
|
||||
// is set. MUST be used after the last call to OrganizeByFontAndClass on
|
||||
// the training samples, ie after IncludeJunk if it is going to be used, as
|
||||
// OrganizeByFontAndClass will eat the replicated samples into the regular
|
||||
// samples.
|
||||
void ReplicateAndRandomizeSamplesIfRequired();
|
||||
|
||||
// Loads the basic font properties file into fontinfo_table_.
|
||||
// Returns false on failure.
|
||||
bool LoadFontInfo(const char* filename);
|
||||
|
||||
// Loads the xheight font properties file into xheights_.
|
||||
// Returns false on failure.
|
||||
bool LoadXHeights(const char* filename);
|
||||
|
||||
// Reads spacing stats from filename and adds them to fontinfo_table.
|
||||
// Returns false on failure.
|
||||
bool AddSpacingInfo(const char *filename);
|
||||
|
||||
// Returns the font id corresponding to the given font name.
|
||||
// Returns -1 if the font cannot be found.
|
||||
int GetFontInfoId(const char* font_name);
|
||||
// Returns the font_id of the closest matching font name to the given
|
||||
// filename. It is assumed that a substring of the filename will match
|
||||
// one of the fonts. If more than one is matched, the longest is returned.
|
||||
int GetBestMatchingFontInfoId(const char* filename);
|
||||
|
||||
// Sets up a flat shapetable with one shape per class/font combination.
|
||||
void SetupFlatShapeTable(ShapeTable* shape_table);
|
||||
|
||||
// Sets up a Clusterer for mftraining on a single shape_id.
|
||||
// Call FreeClusterer on the return value after use.
|
||||
CLUSTERER* SetupForClustering(const ShapeTable& shape_table,
|
||||
const FEATURE_DEFS_STRUCT& feature_defs,
|
||||
int shape_id, int* num_samples);
|
||||
|
||||
// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
|
||||
// to the given inttemp_file, and the corresponding pffmtable.
|
||||
// The unicharset is the original encoding of graphemes, and shape_set should
|
||||
// match the size of the shape_table, and may possibly be totally fake.
|
||||
void WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
|
||||
const UNICHARSET& shape_set,
|
||||
const ShapeTable& shape_table,
|
||||
CLASS_STRUCT* float_classes,
|
||||
const char* inttemp_file,
|
||||
const char* pffmtable_file);
|
||||
|
||||
const UNICHARSET& unicharset() const {
|
||||
return samples_.unicharset();
|
||||
}
|
||||
TrainingSampleSet* GetSamples() {
|
||||
return &samples_;
|
||||
}
|
||||
const ShapeTable& master_shapes() const {
|
||||
return master_shapes_;
|
||||
}
|
||||
|
||||
// Generates debug output relating to the canonical distance between the
|
||||
// two given UTF8 grapheme strings.
|
||||
void DebugCanonical(const char* unichar_str1, const char* unichar_str2);
|
||||
// Debugging for cloud/canonical features.
|
||||
// Displays a Features window containing:
|
||||
// If unichar_str2 is in the unicharset, and canonical_font is non-negative,
|
||||
// displays the canonical features of the char/font combination in red.
|
||||
// If unichar_str1 is in the unicharset, and cloud_font is non-negative,
|
||||
// displays the cloud feature of the char/font combination in green.
|
||||
// The canonical features are drawn first to show which ones have no
|
||||
// matches in the cloud features.
|
||||
// Until the features window is destroyed, each click in the features window
|
||||
// will display the samples that have that feature in a separate window.
|
||||
void DisplaySamples(const char* unichar_str1, int cloud_font,
|
||||
const char* unichar_str2, int canonical_font);
|
||||
|
||||
// Tests the given test_classifier on the internal samples.
|
||||
// See TestClassifier for details.
|
||||
void TestClassifierOnSamples(int report_level,
|
||||
bool replicate_samples,
|
||||
ShapeClassifier* test_classifier,
|
||||
STRING* report_string);
|
||||
// Tests the given test_classifier on the given samples
|
||||
// report_levels:
|
||||
// 0 = no output.
|
||||
// 1 = bottom-line error rate.
|
||||
// 2 = bottom-line error rate + time.
|
||||
// 3 = font-level error rate + time.
|
||||
// 4 = list of all errors + short classifier debug output on 16 errors.
|
||||
// 5 = list of all errors + short classifier debug output on 25 errors.
|
||||
// If replicate_samples is true, then the test is run on an extended test
|
||||
// sample including replicated and systematically perturbed samples.
|
||||
// If report_string is non-NULL, a summary of the results for each font
|
||||
// is appended to the report_string.
|
||||
double TestClassifier(int report_level,
|
||||
bool replicate_samples,
|
||||
TrainingSampleSet* samples,
|
||||
ShapeClassifier* test_classifier,
|
||||
STRING* report_string);
|
||||
|
||||
// Returns the average (in some sense) distance between the two given
|
||||
// shapes, which may contain multiple fonts and/or unichars.
|
||||
// This function is public to facilitate testing.
|
||||
float ShapeDistance(const ShapeTable& shapes, int s1, int s2);
|
||||
|
||||
private:
|
||||
// Replaces samples that are always fragmented with the corresponding
|
||||
// fragment samples.
|
||||
void ReplaceFragmentedSamples();
|
||||
|
||||
// Runs a hierarchical agglomerative clustering to merge shapes in the given
|
||||
// shape_table, while satisfying the given constraints:
|
||||
// * End with at least min_shapes left in shape_table,
|
||||
// * No shape shall have more than max_shape_unichars in it,
|
||||
// * Don't merge shapes where the distance between them exceeds max_dist.
|
||||
void ClusterShapes(int min_shapes, int max_shape_unichars,
|
||||
float max_dist, ShapeTable* shape_table);
|
||||
|
||||
private:
|
||||
NormalizationMode norm_mode_;
|
||||
// Character set we are training for.
|
||||
UNICHARSET unicharset_;
|
||||
// Original feature space. Subspace mapping is contained in feature_map_.
|
||||
IntFeatureSpace feature_space_;
|
||||
TrainingSampleSet samples_;
|
||||
TrainingSampleSet junk_samples_;
|
||||
TrainingSampleSet verify_samples_;
|
||||
// Master shape table defines what fonts stay together until the leaves.
|
||||
ShapeTable master_shapes_;
|
||||
// Flat shape table has each unichar/font id pair in a separate shape.
|
||||
ShapeTable flat_shapes_;
|
||||
// Font metrics gathered from multiple files.
|
||||
UnicityTable<FontInfo> fontinfo_table_;
|
||||
// Array of xheights indexed by font ids in fontinfo_table_;
|
||||
GenericVector<int> xheights_;
|
||||
|
||||
// Non-serialized data initialized by other means or used temporarily
|
||||
// during loading of training samples.
|
||||
// Number of different class labels in unicharset_.
|
||||
int charsetsize_;
|
||||
// Flag to indicate that we are running shape analysis and need fragments
|
||||
// fixing.
|
||||
bool enable_shape_anaylsis_;
|
||||
// Flag to indicate that sample replication is required.
|
||||
bool enable_replication_;
|
||||
// Flag to indicate that junk should be included in samples_.
|
||||
bool include_junk_;
|
||||
// Array of classids of fragments that replace the correctly segmented chars.
|
||||
int* fragments_;
|
||||
// Classid of previous correctly segmented sample that was added.
|
||||
int prev_unichar_id_;
|
||||
// Debug output control.
|
||||
int debug_level_;
|
||||
// Feature map used to construct reduced feature spaces for compact
|
||||
// classifiers.
|
||||
IntFeatureMap feature_map_;
|
||||
// Vector of Pix pointers used for classifiers that need the image.
|
||||
// Indexed by page_num_ in the samples.
|
||||
// These images are owned by the trainer and need to be pixDestroyed.
|
||||
GenericVector<Pix*> page_images_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif
|
@ -26,8 +26,11 @@
|
||||
|
||||
typedef enum {
|
||||
MFXPosition, MFYPosition,
|
||||
MFLength, MFDirection, MFBulge1, MFBulge2
|
||||
MFLength, MFDirection, MFBulge1, MFBulge2,
|
||||
MFCount // For array sizes.
|
||||
} MF_PARAM_NAME;
|
||||
|
||||
typedef float MicroFeature[MFCount];
|
||||
/*----------------------------------------------------------------------------
|
||||
Private Function Prototypes
|
||||
-----------------------------------------------------------------------------*/
|
||||
|
@ -76,7 +76,8 @@ MFOUTLINE ConvertOutline(TESSLINE *outline) {
|
||||
EdgePoint = NextPoint;
|
||||
} while (EdgePoint != StartPoint);
|
||||
|
||||
MakeOutlineCircular(MFOutline);
|
||||
if (MFOutline != NULL)
|
||||
MakeOutlineCircular(MFOutline);
|
||||
return MFOutline;
|
||||
}
|
||||
|
||||
@ -95,7 +96,8 @@ LIST ConvertOutlines(TESSLINE *outline,
|
||||
|
||||
while (outline != NULL) {
|
||||
mf_outline = ConvertOutline(outline);
|
||||
mf_outlines = push(mf_outlines, mf_outline);
|
||||
if (mf_outline != NULL)
|
||||
mf_outlines = push(mf_outlines, mf_outline);
|
||||
outline = outline->next;
|
||||
}
|
||||
return mf_outlines;
|
||||
@ -404,54 +406,6 @@ void Classify::NormalizeOutlines(LIST Outlines,
|
||||
} /* NormalizeOutlines */
|
||||
} // namespace tesseract
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
void SmearExtremities(MFOUTLINE Outline, FLOAT32 XScale, FLOAT32 YScale) {
|
||||
/*
|
||||
** Parameters:
|
||||
** Outline outline whose extremities are to be smeared
|
||||
** XScale factor used to normalize outline in x dir
|
||||
** YScale factor used to normalize outline in y dir
|
||||
** Globals: none
|
||||
** Operation:
|
||||
** This routine smears the extremities of the specified outline.
|
||||
** It does this by adding a random number between
|
||||
** -0.5 and 0.5 pixels (that is why X/YScale are needed) to
|
||||
** the x and y position of the point. This is done so that
|
||||
** the discrete nature of the original scanned image does not
|
||||
** affect the statistical clustering used during training.
|
||||
** Return: none
|
||||
** Exceptions: none
|
||||
** History: 1/11/90, DSJ, Created.
|
||||
*/
|
||||
MFEDGEPT *Current;
|
||||
MFOUTLINE EdgePoint;
|
||||
FLOAT32 MinXSmear;
|
||||
FLOAT32 MaxXSmear;
|
||||
FLOAT32 MinYSmear;
|
||||
FLOAT32 MaxYSmear;
|
||||
|
||||
if (Outline != NIL_LIST) {
|
||||
MinXSmear = -0.5 * XScale;
|
||||
MaxXSmear = 0.5 * XScale;
|
||||
MinYSmear = -0.5 * YScale;
|
||||
MaxYSmear = 0.5 * YScale;
|
||||
EdgePoint = Outline;
|
||||
do {
|
||||
Current = PointAt (EdgePoint);
|
||||
if (Current->ExtremityMark) {
|
||||
Current->Point.x +=
|
||||
UniformRandomNumber(MinXSmear, MaxXSmear);
|
||||
Current->Point.y +=
|
||||
UniformRandomNumber(MinYSmear, MaxYSmear);
|
||||
}
|
||||
|
||||
EdgePoint = NextPointAfter (EdgePoint);
|
||||
}
|
||||
while (EdgePoint != Outline);
|
||||
}
|
||||
} /* SmearExtremities */
|
||||
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Private Code
|
||||
----------------------------------------------------------------------------**/
|
||||
|
@ -114,8 +114,6 @@ MFOUTLINE NextExtremity(MFOUTLINE EdgePoint);
|
||||
void NormalizeOutline(MFOUTLINE Outline,
|
||||
FLOAT32 XOrigin);
|
||||
|
||||
void SmearExtremities(MFOUTLINE Outline, FLOAT32 XScale, FLOAT32 YScale);
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
Private Function Prototypes
|
||||
-----------------------------------------------------------------------------*/
|
||||
|
@ -102,7 +102,6 @@ CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm) {
|
||||
Outline = (MFOUTLINE) first_node (RemainingOutlines);
|
||||
FindDirectionChanges(Outline, classify_min_slope, classify_max_slope);
|
||||
MarkDirectionChanges(Outline);
|
||||
SmearExtremities(Outline, XScale, YScale);
|
||||
MicroFeatures = ConvertToMicroFeatures (Outline, MicroFeatures);
|
||||
}
|
||||
FreeOutlines(Outlines);
|
||||
|
@ -70,7 +70,8 @@ const double kWidthErrorWeighting = 0.125;
|
||||
----------------------------------------------------------------------------**/
|
||||
/*---------------------------------------------------------------------------*/
|
||||
namespace tesseract {
|
||||
FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature,
|
||||
FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId,
|
||||
const FEATURE_STRUCT& feature,
|
||||
BOOL8 DebugMatch) {
|
||||
/*
|
||||
** Parameters:
|
||||
@ -96,12 +97,12 @@ FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature,
|
||||
/* handle requests for classification as noise */
|
||||
if (ClassId == NO_CLASS) {
|
||||
/* kludge - clean up constants and make into control knobs later */
|
||||
Match = (Feature->Params[CharNormLength] *
|
||||
Feature->Params[CharNormLength] * 500.0 +
|
||||
Feature->Params[CharNormRx] *
|
||||
Feature->Params[CharNormRx] * 8000.0 +
|
||||
Feature->Params[CharNormRy] *
|
||||
Feature->Params[CharNormRy] * 8000.0);
|
||||
Match = (feature.Params[CharNormLength] *
|
||||
feature.Params[CharNormLength] * 500.0 +
|
||||
feature.Params[CharNormRx] *
|
||||
feature.Params[CharNormRx] * 8000.0 +
|
||||
feature.Params[CharNormRy] *
|
||||
feature.Params[CharNormRy] * 8000.0);
|
||||
return (1.0 - NormEvidenceOf (Match));
|
||||
}
|
||||
|
||||
@ -109,38 +110,48 @@ FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature,
|
||||
Protos = NormProtos->Protos[ClassId];
|
||||
|
||||
if (DebugMatch) {
|
||||
cprintf ("\nFeature = ");
|
||||
WriteFeature(stdout, Feature);
|
||||
tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
|
||||
}
|
||||
|
||||
ProtoId = 0;
|
||||
iterate(Protos) {
|
||||
Proto = (PROTOTYPE *) first_node (Protos);
|
||||
Delta = Feature->Params[CharNormY] - Proto->Mean[CharNormY];
|
||||
Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
|
||||
Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
|
||||
Delta = Feature->Params[CharNormRx] - Proto->Mean[CharNormRx];
|
||||
if (DebugMatch) {
|
||||
tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
|
||||
Proto->Mean[CharNormY], Delta,
|
||||
Proto->Weight.Elliptical[CharNormY], Match);
|
||||
}
|
||||
Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
|
||||
Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
|
||||
if (DebugMatch) {
|
||||
tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
|
||||
Proto->Mean[CharNormRx], Delta,
|
||||
Proto->Weight.Elliptical[CharNormRx], Match);
|
||||
}
|
||||
// Ry is width! See intfx.cpp.
|
||||
Delta = Feature->Params[CharNormRy] - Proto->Mean[CharNormRy];
|
||||
Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
|
||||
if (DebugMatch) {
|
||||
tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
|
||||
Proto->Mean[CharNormRy], Delta,
|
||||
Proto->Weight.Elliptical[CharNormRy]);
|
||||
}
|
||||
Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
|
||||
Delta *= kWidthErrorWeighting;
|
||||
Match += Delta;
|
||||
if (DebugMatch) {
|
||||
tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
|
||||
Match, Match / classify_norm_adj_midpoint,
|
||||
NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
|
||||
}
|
||||
|
||||
if (Match < BestMatch)
|
||||
BestMatch = Match;
|
||||
|
||||
if (DebugMatch) {
|
||||
cprintf ("Proto %1d = ", ProtoId);
|
||||
WriteNFloats (stdout, NormProtos->NumParams, Proto->Mean);
|
||||
cprintf (" var = ");
|
||||
WriteNFloats (stdout, NormProtos->NumParams,
|
||||
Proto->Variance.Elliptical);
|
||||
cprintf (" match = ");
|
||||
PrintNormMatch (stdout, NormProtos->NumParams, Proto, Feature);
|
||||
}
|
||||
ProtoId++;
|
||||
}
|
||||
return (1.0 - NormEvidenceOf (BestMatch));
|
||||
return 1.0 - NormEvidenceOf(BestMatch);
|
||||
} /* ComputeNormMatch */
|
||||
|
||||
void Classify::FreeNormProtos() {
|
||||
@ -230,7 +241,7 @@ NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
|
||||
*/
|
||||
NORM_PROTOS *NormProtos;
|
||||
int i;
|
||||
char unichar[UNICHAR_LEN + 1];
|
||||
char unichar[2 * UNICHAR_LEN + 1];
|
||||
UNICHAR_ID unichar_id;
|
||||
LIST Protos;
|
||||
int NumProtos;
|
||||
@ -256,8 +267,12 @@ NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
|
||||
Protos =
|
||||
push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
|
||||
NormProtos->Protos[unichar_id] = Protos;
|
||||
} else
|
||||
cprintf("Error: unichar %s in normproto file is not in unichar set.\n");
|
||||
} else {
|
||||
cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
|
||||
unichar);
|
||||
for (i = 0; i < NumProtos; i++)
|
||||
FreePrototype(ReadPrototype (File, NormProtos->NumParams));
|
||||
}
|
||||
SkipNewline(File);
|
||||
}
|
||||
return (NormProtos);
|
||||
|
@ -231,12 +231,11 @@ void WriteFeature(FILE *File, FEATURE Feature) {
|
||||
|
||||
for (i = 0; i < Feature->Type->NumParams; i++) {
|
||||
#ifndef WIN32
|
||||
assert (!isnan(Feature->Params[i]));
|
||||
assert(!isnan(Feature->Params[i]));
|
||||
#endif
|
||||
fprintf (File, " %12g", Feature->Params[i]);
|
||||
fprintf(File, " %g", Feature->Params[i]);
|
||||
}
|
||||
fprintf (File, "\n");
|
||||
|
||||
fprintf(File, "\n");
|
||||
} /* WriteFeature */
|
||||
|
||||
|
||||
|
@ -100,11 +100,9 @@ const PARAM_DESC Name[] = {
|
||||
Macro for describing a new feature. The parameters of the macro
|
||||
are as follows:
|
||||
|
||||
DefineFeature (Name, NumLinear, NumCircular,
|
||||
MinFeatPerChar, MaxFeatPerChar,
|
||||
LongName, ShortName, ParamName)
|
||||
DefineFeature (Name, NumLinear, NumCircular, ShortName, ParamName)
|
||||
----------------------------------------------------------------------*/
|
||||
#define DefineFeature(Name, NL, NC, Min, Max, LN, SN, PN) \
|
||||
#define DefineFeature(Name, NL, NC, SN, PN) \
|
||||
const FEATURE_DESC_STRUCT Name = { \
|
||||
((NL) + (NC)), SN, PN};
|
||||
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "mfoutline.h"
|
||||
#include "ocrfeatures.h"
|
||||
#include "params.h"
|
||||
#include "trainingsample.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
@ -221,3 +222,59 @@ void NormalizePicoX(FEATURE_SET FeatureSet) {
|
||||
Feature->Params[PicoFeatX] -= Origin;
|
||||
}
|
||||
} /* NormalizePicoX */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) {
|
||||
/*
|
||||
** Parameters:
|
||||
** blob blob to extract features from
|
||||
** denorm normalization/denormalization parameters.
|
||||
** Return: Integer character-normalized features for blob.
|
||||
** Exceptions: none
|
||||
** History: 8/8/2011, rays, Created.
|
||||
*/
|
||||
tesseract::TrainingSample* sample = GetIntFeatures(
|
||||
tesseract::NM_CHAR_ANISOTROPIC, blob, denorm);
|
||||
if (sample == NULL) return NULL;
|
||||
|
||||
int num_features = sample->num_features();
|
||||
const INT_FEATURE_STRUCT* features = sample->features();
|
||||
FEATURE_SET feature_set = NewFeatureSet(num_features);
|
||||
for (int f = 0; f < num_features; ++f) {
|
||||
FEATURE feature = NewFeature(&IntFeatDesc);
|
||||
|
||||
feature->Params[IntX] = features[f].X;
|
||||
feature->Params[IntY] = features[f].Y;
|
||||
feature->Params[IntDir] = features[f].Theta;
|
||||
AddFeature(feature_set, feature);
|
||||
}
|
||||
delete sample;
|
||||
|
||||
return feature_set;
|
||||
} /* ExtractIntCNFeatures */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& denorm) {
|
||||
/*
|
||||
** Parameters:
|
||||
** blob blob to extract features from
|
||||
** denorm normalization/denormalization parameters.
|
||||
** Return: Geometric (top/bottom/width) features for blob.
|
||||
** Exceptions: none
|
||||
** History: 8/8/2011, rays, Created.
|
||||
*/
|
||||
tesseract::TrainingSample* sample = GetIntFeatures(
|
||||
tesseract::NM_CHAR_ANISOTROPIC, blob, denorm);
|
||||
if (sample == NULL) return NULL;
|
||||
|
||||
FEATURE_SET feature_set = NewFeatureSet(1);
|
||||
FEATURE feature = NewFeature(&IntFeatDesc);
|
||||
|
||||
feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
|
||||
feature->Params[GeoTop] = sample->geo_feature(GeoTop);
|
||||
feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
|
||||
AddFeature(feature_set, feature);
|
||||
delete sample;
|
||||
|
||||
return feature_set;
|
||||
} /* ExtractIntGeoFeatures */
|
||||
|
@ -24,6 +24,22 @@
|
||||
#include "ocrfeatures.h"
|
||||
#include "params.h"
|
||||
|
||||
// Enum for the order/type of params in IntFeatDesc.
|
||||
enum IntParams {
|
||||
IntX, // x-position (0-255).
|
||||
IntY, // y-position (0-255).
|
||||
IntDir // Direction (0-255, circular).
|
||||
};
|
||||
|
||||
// Enum for the order/type of params in GeoFeatDesc.
|
||||
enum GeoParams {
|
||||
GeoBottom, // Bounding box bottom in baseline space (0-255).
|
||||
GeoTop, // Bounding box top in baseline space (0-255).
|
||||
GeoWidth, // Bounding box width in baseline space (0-255).
|
||||
|
||||
GeoCount // Number of geo features.
|
||||
};
|
||||
|
||||
typedef enum
|
||||
{ PicoFeatY, PicoFeatDir, PicoFeatX }
|
||||
PICO_FEAT_PARAM_NAME;
|
||||
@ -42,6 +58,9 @@ extern double_VAR_H(classify_pico_feature_length, 0.05, "Pico Feature Length");
|
||||
----------------------------------------------------------------------------**/
|
||||
#define GetPicoFeatureLength() (PicoFeatureLength)
|
||||
|
||||
FEATURE_SET ExtractIntCNFeatures(TBLOB *Blob, const DENORM& denorm);
|
||||
FEATURE_SET ExtractIntGeoFeatures(TBLOB *Blob, const DENORM& denorm);
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Global Data Definitions and Declarations
|
||||
----------------------------------------------------------------------------**/
|
||||
|
@ -51,8 +51,11 @@ typedef struct
|
||||
} PROTO_STRUCT;
|
||||
typedef PROTO_STRUCT *PROTO;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
struct CLASS_STRUCT {
|
||||
CLASS_STRUCT()
|
||||
: NumProtos(0), MaxNumProtos(0), Prototypes(NULL),
|
||||
NumConfigs(0), MaxNumConfigs(0), Configurations(NULL) {
|
||||
}
|
||||
inT16 NumProtos;
|
||||
inT16 MaxNumProtos;
|
||||
PROTO Prototypes;
|
||||
@ -60,7 +63,7 @@ typedef struct
|
||||
inT16 MaxNumConfigs;
|
||||
CONFIGS Configurations;
|
||||
UnicityTableEqEq<int> font_set;
|
||||
} CLASS_STRUCT;
|
||||
};
|
||||
typedef CLASS_STRUCT *CLASS_TYPE;
|
||||
typedef CLASS_STRUCT *CLASSES;
|
||||
|
||||
|
262
classify/sampleiterator.cpp
Normal file
262
classify/sampleiterator.cpp
Normal file
@ -0,0 +1,262 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "sampleiterator.h"
|
||||
|
||||
#include "indexmapbidi.h"
|
||||
#include "shapetable.h"
|
||||
#include "trainingsample.h"
|
||||
#include "trainingsampleset.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// ================== SampleIterator Implementation =================
|
||||
|
||||
SampleIterator::SampleIterator()
|
||||
: charset_map_(NULL),
|
||||
shape_table_(NULL),
|
||||
sample_set_(NULL),
|
||||
randomize_(false),
|
||||
owned_shape_table_(NULL) {
|
||||
num_shapes_ = 0;
|
||||
Begin();
|
||||
}
|
||||
|
||||
SampleIterator::~SampleIterator() {
|
||||
Clear();
|
||||
}
|
||||
|
||||
void SampleIterator::Clear() {
|
||||
delete owned_shape_table_;
|
||||
owned_shape_table_ = NULL;
|
||||
}
|
||||
|
||||
// See class comment for arguments.
|
||||
void SampleIterator::Init(const IndexMapBiDi* charset_map,
|
||||
const ShapeTable* shape_table,
|
||||
bool randomize,
|
||||
TrainingSampleSet* sample_set) {
|
||||
Clear();
|
||||
charset_map_ = charset_map;
|
||||
shape_table_ = shape_table;
|
||||
sample_set_ = sample_set;
|
||||
randomize_ = randomize;
|
||||
if (shape_table_ == NULL && charset_map_ != NULL) {
|
||||
// The caller wishes to iterate by class. The easiest way to do this
|
||||
// is to create a dummy shape_table_ that we will own.
|
||||
int num_fonts = sample_set_->NumFonts();
|
||||
owned_shape_table_ = new ShapeTable(sample_set_->unicharset());
|
||||
int charsetsize = sample_set_->unicharset().size();
|
||||
for (int c = 0; c < charsetsize; ++c) {
|
||||
// We always add a shape for each character to keep the index in sync
|
||||
// with the unichar_id.
|
||||
int shape_id = owned_shape_table_->AddShape(c, 0);
|
||||
for (int f = 1; f < num_fonts; ++f) {
|
||||
if (sample_set_->NumClassSamples(f, c, true) > 0) {
|
||||
owned_shape_table_->AddToShape(shape_id, c, f);
|
||||
}
|
||||
}
|
||||
}
|
||||
shape_table_ = owned_shape_table_;
|
||||
}
|
||||
if (shape_table_ != NULL) {
|
||||
num_shapes_ = shape_table_->NumShapes();
|
||||
} else {
|
||||
num_shapes_ = randomize ? sample_set_->num_samples()
|
||||
: sample_set_->num_raw_samples();
|
||||
}
|
||||
Begin();
|
||||
}
|
||||
|
||||
// Iterator functions designed for use with a simple for loop:
|
||||
// for (it.Begin(); !it.AtEnd(); it.Next()) {
|
||||
// const TrainingSample& sample = it.GetSample();
|
||||
// }
|
||||
void SampleIterator::Begin() {
|
||||
shape_index_ = -1;
|
||||
shape_char_index_ = 0;
|
||||
num_shape_chars_ = 0;
|
||||
shape_font_index_ = 0;
|
||||
num_shape_fonts_ = 0;
|
||||
sample_index_ = 0;
|
||||
num_samples_ = 0;
|
||||
// Find the first indexable sample.
|
||||
Next();
|
||||
}
|
||||
|
||||
bool SampleIterator::AtEnd() const {
|
||||
return shape_index_ >= num_shapes_;
|
||||
}
|
||||
|
||||
const TrainingSample& SampleIterator::GetSample() const {
|
||||
if (shape_table_ != NULL) {
|
||||
const UnicharAndFonts* shape_entry = GetShapeEntry();
|
||||
int char_id = shape_entry->unichar_id;
|
||||
int font_id = shape_entry->font_ids[shape_font_index_];
|
||||
return *sample_set_->GetSample(font_id, char_id, sample_index_);
|
||||
} else {
|
||||
return *sample_set_->GetSample(shape_index_);
|
||||
}
|
||||
}
|
||||
|
||||
TrainingSample* SampleIterator::MutableSample() const {
|
||||
if (shape_table_ != NULL) {
|
||||
const UnicharAndFonts* shape_entry = GetShapeEntry();
|
||||
int char_id = shape_entry->unichar_id;
|
||||
int font_id = shape_entry->font_ids[shape_font_index_];
|
||||
return sample_set_->MutableSample(font_id, char_id, sample_index_);
|
||||
} else {
|
||||
return sample_set_->mutable_sample(shape_index_);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the total index (from the original set of samples) of the current
|
||||
// sample.
|
||||
int SampleIterator::GlobalSampleIndex() const {
|
||||
if (shape_table_ != NULL) {
|
||||
const UnicharAndFonts* shape_entry = GetShapeEntry();
|
||||
int char_id = shape_entry->unichar_id;
|
||||
int font_id = shape_entry->font_ids[shape_font_index_];
|
||||
return sample_set_->GlobalSampleIndex(font_id, char_id, sample_index_);
|
||||
} else {
|
||||
return shape_index_;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the index of the current sample in compact charset space, so
|
||||
// in a 2-class problem between x and y, the returned indices will all be
|
||||
// 0 or 1, and have nothing to do with the unichar_ids.
|
||||
// If the charset_map_ is NULL, then this is equal to GetSparseClassID().
|
||||
int SampleIterator::GetCompactClassID() const {
|
||||
return charset_map_ != NULL ? charset_map_->SparseToCompact(shape_index_)
|
||||
: GetSparseClassID();
|
||||
}
|
||||
// Returns the index of the current sample in sparse charset space, so
|
||||
// in a 2-class problem between x and y, the returned indices will all be
|
||||
// x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
|
||||
// with a shape_table_.
|
||||
int SampleIterator::GetSparseClassID() const {
|
||||
return shape_table_ != NULL ? shape_index_ : GetSample().class_id();
|
||||
}
|
||||
|
||||
// Moves on to the next indexable sample. If the end is reached, leaves
|
||||
// the state such that AtEnd() is true.
|
||||
void SampleIterator::Next() {
|
||||
if (shape_table_ != NULL) {
|
||||
// Next sample in this class/font combination.
|
||||
++sample_index_;
|
||||
if (sample_index_ < num_samples_)
|
||||
return;
|
||||
// Next font in this class in this shape.
|
||||
sample_index_ = 0;
|
||||
do {
|
||||
++shape_font_index_;
|
||||
if (shape_font_index_ >= num_shape_fonts_) {
|
||||
// Next unichar in this shape.
|
||||
shape_font_index_ = 0;
|
||||
++shape_char_index_;
|
||||
if (shape_char_index_ >= num_shape_chars_) {
|
||||
// Find the next shape that is mapped in the charset_map_.
|
||||
shape_char_index_ = 0;
|
||||
do {
|
||||
++shape_index_;
|
||||
} while (shape_index_ < num_shapes_ &&
|
||||
charset_map_ != NULL &&
|
||||
charset_map_->SparseToCompact(shape_index_) < 0);
|
||||
if (shape_index_ >= num_shapes_)
|
||||
return; // The end.
|
||||
num_shape_chars_ = shape_table_->GetShape(shape_index_).size();
|
||||
}
|
||||
}
|
||||
const UnicharAndFonts* shape_entry = GetShapeEntry();
|
||||
num_shape_fonts_ = shape_entry->font_ids.size();
|
||||
int char_id = shape_entry->unichar_id;
|
||||
int font_id = shape_entry->font_ids[shape_font_index_];
|
||||
num_samples_ = sample_set_->NumClassSamples(font_id, char_id, randomize_);
|
||||
} while (num_samples_ == 0);
|
||||
} else {
|
||||
// We are just iterating over the samples.
|
||||
++shape_index_;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the size of the compact charset space.
|
||||
int SampleIterator::CompactCharsetSize() const {
|
||||
return charset_map_ != NULL ? charset_map_->CompactSize()
|
||||
: SparseCharsetSize();
|
||||
}
|
||||
|
||||
// Returns the size of the sparse charset space.
|
||||
int SampleIterator::SparseCharsetSize() const {
|
||||
return charset_map_ != NULL
|
||||
? charset_map_->SparseSize()
|
||||
: (shape_table_ != NULL ? shape_table_->NumShapes()
|
||||
: sample_set_->charsetsize());
|
||||
}
|
||||
|
||||
// Apply the supplied feature_space/feature_map transform to all samples
|
||||
// accessed by this iterator.
|
||||
void SampleIterator::MapSampleFeatures(const IntFeatureMap& feature_map) {
|
||||
for (Begin(); !AtEnd(); Next()) {
|
||||
TrainingSample* sample = MutableSample();
|
||||
sample->MapFeatures(feature_map);
|
||||
}
|
||||
}
|
||||
|
||||
// Adjust the weights of all the samples to be uniform in the given charset.
|
||||
// Returns the number of samples in the iterator.
|
||||
int SampleIterator::UniformSamples() {
|
||||
int num_good_samples = 0;
|
||||
for (Begin(); !AtEnd(); Next()) {
|
||||
TrainingSample* sample = MutableSample();
|
||||
sample->set_weight(1.0);
|
||||
++num_good_samples;
|
||||
}
|
||||
NormalizeSamples();
|
||||
return num_good_samples;
|
||||
}
|
||||
|
||||
// Normalize the weights of all the samples in the charset_map so they sum
|
||||
// to 1. Returns the minimum assigned sample weight.
|
||||
double SampleIterator::NormalizeSamples() {
|
||||
double total_weight = 0.0;
|
||||
int sample_count = 0;
|
||||
for (Begin(); !AtEnd(); Next()) {
|
||||
const TrainingSample& sample = GetSample();
|
||||
total_weight += sample.weight();
|
||||
++sample_count;
|
||||
}
|
||||
// Normalize samples.
|
||||
double min_assigned_sample_weight = 1.0;
|
||||
if (total_weight > 0.0) {
|
||||
for (Begin(); !AtEnd(); Next()) {
|
||||
TrainingSample* sample = MutableSample();
|
||||
double weight = sample->weight() / total_weight;
|
||||
if (weight < min_assigned_sample_weight)
|
||||
min_assigned_sample_weight = weight;
|
||||
sample->set_weight(weight);
|
||||
}
|
||||
}
|
||||
return min_assigned_sample_weight;
|
||||
}
|
||||
|
||||
// Helper returns the current UnicharAndFont shape_entry.
|
||||
const UnicharAndFonts* SampleIterator::GetShapeEntry() const {
|
||||
const Shape& shape = shape_table_->GetShape(shape_index_);
|
||||
return &shape[shape_char_index_];
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
195
classify/sampleiterator.h
Normal file
195
classify/sampleiterator.h
Normal file
@ -0,0 +1,195 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
|
||||
#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class IndexMapBiDi;
|
||||
class IntFeatureMap;
|
||||
class ShapeTable;
|
||||
class TrainingSample;
|
||||
class TrainingSampleSet;
|
||||
class UnicharAndFonts;
|
||||
|
||||
// Iterator class to encapsulate the complex iteration involved in getting
|
||||
// all samples of all shapes needed for a classification problem.
|
||||
//
|
||||
// =====INPUTS TO Init FUNCTION=====
|
||||
// The charset_map defines a subset of the sample_set classes (with a NULL
|
||||
// shape_table, or the shape_table classes if not NULL.)
|
||||
//
|
||||
// The shape_table (if not NULL) defines the mapping from shapes to
|
||||
// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
|
||||
//
|
||||
// The sample_set holds the samples and provides indexed access to samples
|
||||
// of font_id/class_id pairs.
|
||||
//
|
||||
// If randomize is true, the samples are perturbed slightly, but the
|
||||
// perturbation is guaranteed to be the same for multiple identical
|
||||
// iterations.
|
||||
//
|
||||
// =====DIFFERENT COMBINATIONS OF INPUTS=====
|
||||
// NULL shape_table:
|
||||
// Without a shape_table, everything works in UNICHAR_IDs.
|
||||
//
|
||||
// NULL shape_table, NULL charset_map:
|
||||
// Iterations simply run over the samples in the order the samples occur in the
|
||||
// input files.
|
||||
// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
|
||||
//
|
||||
// NULL shape_table, non-NULL charset_map:
|
||||
// When shape_table is NULL, the charset_map indexes unichar_ids directly,
|
||||
// and an iteration returns all samples of all chars in the charset_map, which
|
||||
// is a subset of the full unicharset.
|
||||
// The iteration will be in groups of the same unichar_id, in the order
|
||||
// defined by the charset_map.
|
||||
// GetCompactClassID returns the charset_map index of a sample, and
|
||||
// GetSparseClassID returns the sample UNICHAR_ID.
|
||||
//
|
||||
// Non-NULL shape_table:
|
||||
// With a shape_table, samples are grouped according to the shape_table, so
|
||||
// multiple UNICHAR_IDs and fonts may be grouped together, and everything
|
||||
// works in shape_ids.
|
||||
//
|
||||
// Non-NULL shape_table, NULL charset_map.
|
||||
// Iterations simply run over the samples in the order of shape_id.
|
||||
// GetCompactClassID and GetSparseClassID both return the shape_id.
|
||||
// (If you want the unichar_id or font_id, the sample still has them.)
|
||||
//
|
||||
// Non-NULL shape_table, non-NULL charset_map.
|
||||
// When shape_table is not NULL, the charset_map indexes and subsets shapes in
|
||||
// the shape_table, and iterations will be in shape_table order, not
|
||||
// charset_map order.
|
||||
// GetCompactClassID returns the charset_map index of a shape, and
|
||||
// GetSparseClassID returns the shape_id.
|
||||
//
|
||||
// =====What is SampleIterator good for?=====
|
||||
// Inside a classifier training module, the SampleIterator has abstracted away
|
||||
// all the different modes above.
|
||||
// Use the following iteration to train your classifier:
|
||||
// for (it.Begin(); !it.AtEnd(); it.Next()) {
|
||||
// const TrainingSample& sample = it.GetSample();
|
||||
// int class_id = it.GetCompactClassID();
|
||||
// Your classifier may or may not be dealing with a shape_table, and may be
|
||||
// dealing with some subset of the character/shape set. It doesn't need to
|
||||
// know and shouldn't care. It is just learning shapes with compact class ids
|
||||
// in the range [0, it.CompactCharsetSize()).
|
||||
class SampleIterator {
|
||||
public:
|
||||
SampleIterator();
|
||||
~SampleIterator();
|
||||
|
||||
void Clear();
|
||||
|
||||
// See class comment for arguments.
|
||||
void Init(const IndexMapBiDi* charset_map,
|
||||
const ShapeTable* shape_table,
|
||||
bool randomize,
|
||||
TrainingSampleSet* sample_set);
|
||||
|
||||
// Iterator functions designed for use with a simple for loop:
|
||||
// for (it.Begin(); !it.AtEnd(); it.Next()) {
|
||||
// const TrainingSample& sample = it.GetSample();
|
||||
// int class_id = it.GetCompactClassID();
|
||||
// ...
|
||||
// }
|
||||
void Begin();
|
||||
bool AtEnd() const;
|
||||
const TrainingSample& GetSample() const;
|
||||
TrainingSample* MutableSample() const;
|
||||
// Returns the total index (from the original set of samples) of the current
|
||||
// sample.
|
||||
int GlobalSampleIndex() const;
|
||||
// Returns the index of the current sample in compact charset space, so
|
||||
// in a 2-class problem between x and y, the returned indices will all be
|
||||
// 0 or 1, and have nothing to do with the unichar_ids.
|
||||
// If the charset_map_ is NULL, then this is equal to GetSparseClassID().
|
||||
int GetCompactClassID() const;
|
||||
// Returns the index of the current sample in sparse charset space, so
|
||||
// in a 2-class problem between x and y, the returned indices will all be
|
||||
// x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
|
||||
// with a shape_table_.
|
||||
int GetSparseClassID() const;
|
||||
// Moves on to the next indexable sample. If the end is reached, leaves
|
||||
// the state such that AtEnd() is true.
|
||||
void Next();
|
||||
|
||||
// Returns the size of the compact charset space.
|
||||
int CompactCharsetSize() const;
|
||||
// Returns the size of the sparse charset space.
|
||||
int SparseCharsetSize() const;
|
||||
|
||||
const IndexMapBiDi& charset_map() const {
|
||||
return *charset_map_;
|
||||
}
|
||||
const ShapeTable* shape_table() const {
|
||||
return shape_table_;
|
||||
}
|
||||
// Sample set operations.
|
||||
const TrainingSampleSet* sample_set() const {
|
||||
return sample_set_;
|
||||
}
|
||||
|
||||
// A set of functions that do something to all the samples accessed by the
|
||||
// iterator, as it is currently setup.
|
||||
|
||||
// Apply the supplied feature_space/feature_map transform to all samples
|
||||
// accessed by this iterator.
|
||||
void MapSampleFeatures(const IntFeatureMap& feature_map);
|
||||
|
||||
// Adjust the weights of all the samples to be uniform in the given charset.
|
||||
// Returns the number of samples in the iterator.
|
||||
int UniformSamples();
|
||||
|
||||
// Normalize the weights of all the samples defined by the iterator so they
|
||||
// sum to 1. Returns the minimum assigned sample weight.
|
||||
double NormalizeSamples();
|
||||
|
||||
private:
|
||||
// Helper returns the current UnicharAndFont shape_entry.
|
||||
const UnicharAndFonts* GetShapeEntry() const;
|
||||
|
||||
// Map to subset the actual charset space.
|
||||
const IndexMapBiDi* charset_map_;
|
||||
// Shape table to recombine character classes into shapes
|
||||
const ShapeTable* shape_table_;
|
||||
// The samples to iterate over.
|
||||
TrainingSampleSet* sample_set_;
|
||||
// Flag to control randomizing the sample features.
|
||||
bool randomize_;
|
||||
// Shape table owned by this used to iterate character classes.
|
||||
ShapeTable* owned_shape_table_;
|
||||
|
||||
// Top-level iteration. Shape index in sparse charset_map space.
|
||||
int shape_index_;
|
||||
int num_shapes_;
|
||||
// Index to the character class within a shape.
|
||||
int shape_char_index_;
|
||||
int num_shape_chars_;
|
||||
// Index to the font within a shape/class pair.
|
||||
int shape_font_index_;
|
||||
int num_shape_fonts_;
|
||||
// The lowest level iteration. sample_index_/num_samples_ counts samples
|
||||
// in the current shape/class/font combination.
|
||||
int sample_index_;
|
||||
int num_samples_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
|
95
classify/shapeclassifier.h
Normal file
95
classify/shapeclassifier.h
Normal file
@ -0,0 +1,95 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: shapeclassifier.h
|
||||
// Description: Base interface class for classifiers that return a
|
||||
// shape index.
|
||||
// Author: Ray Smith
|
||||
// Created: Tue Sep 13 11:26:32 PDT 2011
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
|
||||
#define TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
|
||||
|
||||
template <typename T> class GenericVector;
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class ShapeTable;
|
||||
class TrainingSample;
|
||||
|
||||
// Classifier result from a low-level classification is an index into some
|
||||
// ShapeTable and a rating.
|
||||
struct ShapeRating {
|
||||
ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f) {}
|
||||
ShapeRating(int s, float r)
|
||||
: shape_id(s), rating(r), raw(1.0f), font(0.0f) {}
|
||||
|
||||
// Sort function to sort ratings appropriately by descending rating.
|
||||
static int SortDescendingRating(const void* t1, const void* t2) {
|
||||
const ShapeRating* a = reinterpret_cast<const ShapeRating *>(t1);
|
||||
const ShapeRating* b = reinterpret_cast<const ShapeRating *>(t2);
|
||||
if (a->rating > b->rating) {
|
||||
return -1;
|
||||
} else if (a->rating < b->rating) {
|
||||
return 1;
|
||||
} else {
|
||||
return a->shape_id - b->shape_id;
|
||||
}
|
||||
}
|
||||
|
||||
// Index into some shape table indicates the class of the answer.
|
||||
int shape_id;
|
||||
// Rating from classifier with 1.0 perfect and 0.0 impossible.
|
||||
// Call it a probability if you must.
|
||||
float rating;
|
||||
// Subsidiary rating that a classifier may use internally.
|
||||
float raw;
|
||||
// Subsidiary rating that a classifier may use internally.
|
||||
float font;
|
||||
};
|
||||
|
||||
// Interface base class for classifiers that produce ShapeRating results.
|
||||
class ShapeClassifier {
|
||||
public:
|
||||
virtual ~ShapeClassifier() {}
|
||||
|
||||
// Classifies the given [training] sample, writing to results.
|
||||
// If page_pix is not NULL, the overriding function may call
|
||||
// sample.GetSamplePix(padding, page_pix) to get an image of the sample
|
||||
// padded (with real image data) by the given padding to extract features
|
||||
// from the image of the character. Other members of TrainingSample:
|
||||
// features(), micro_features(), cn_feature(), geo_feature() may be used
|
||||
// to get the appropriate tesseract features.
|
||||
// If debug is non-zero, then various degrees of classifier dependent debug
|
||||
// information is provided.
|
||||
// If keep_this (a shape index) is >= 0, then the results should always
|
||||
// contain keep_this, and (if possible) anything of intermediate confidence.
|
||||
// (Used for answering "Why didn't it get that right?" questions.)
|
||||
// The return value is the number of classes saved in results.
|
||||
// NOTE that overriding functions MUST clear results unless the classifier
|
||||
// is working with a team of such classifiers.
|
||||
virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
|
||||
int debug, int keep_this,
|
||||
GenericVector<ShapeRating>* results) = 0;
|
||||
|
||||
// Provides access to the ShapeTable that this classifier works with.
|
||||
virtual const ShapeTable* GetShapeTable() const = 0;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
|
452
classify/shapetable.cpp
Normal file
452
classify/shapetable.cpp
Normal file
@ -0,0 +1,452 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: shapetable.cpp
|
||||
// Description: Class to map a classifier shape index to unicharset
|
||||
// indices and font indices.
|
||||
// Author: Ray Smith
|
||||
// Created: Tue Nov 02 15:31:32 PDT 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "shapetable.h"
|
||||
|
||||
#include "intfeaturespace.h"
|
||||
#include "strngs.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool UnicharAndFonts::Serialize(FILE* fp) {
|
||||
inT32 uni_id = unichar_id;
|
||||
if (fwrite(&uni_id, sizeof(uni_id), 1, fp) != 1) return false;
|
||||
if (!font_ids.Serialize(fp)) return false;
|
||||
return true;
|
||||
}
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
|
||||
inT32 uni_id;
|
||||
if (fread(&uni_id, sizeof(uni_id), 1, fp) != 1) return false;
|
||||
if (swap)
|
||||
ReverseN(&uni_id, sizeof(uni_id));
|
||||
unichar_id = uni_id;
|
||||
if (!font_ids.DeSerialize(swap, fp)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Sort function to sort a pair of UnicharAndFonts by unichar_id.
|
||||
int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
|
||||
const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1);
|
||||
const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2);
|
||||
return p1->unichar_id - p2->unichar_id;
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Shape::Serialize(FILE* fp) {
|
||||
if (fwrite(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) != 1)
|
||||
return false;
|
||||
if (!unichars_.SerializeClasses(fp)) return false;
|
||||
return true;
|
||||
}
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool Shape::DeSerialize(bool swap, FILE* fp) {
|
||||
if (fread(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) != 1)
|
||||
return false;
|
||||
if (!unichars_.DeSerializeClasses(swap, fp)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Adds a font_id for the given unichar_id. If the unichar_id is not
|
||||
// in the shape, it is added.
|
||||
void Shape::AddToShape(int unichar_id, int font_id) {
|
||||
for (int c = 0; c < unichars_.size(); ++c) {
|
||||
if (unichars_[c].unichar_id == unichar_id) {
|
||||
// Found the unichar in the shape table.
|
||||
GenericVector<int>& font_list = unichars_[c].font_ids;
|
||||
for (int f = 0; f < font_list.size(); ++f) {
|
||||
if (font_list[f] == font_id)
|
||||
return; // Font is already there.
|
||||
}
|
||||
font_list.push_back(font_id);
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Unichar_id is not in shape, so add it to shape.
|
||||
unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
|
||||
unichars_sorted_ = unichars_.size() <= 1;
|
||||
}
|
||||
|
||||
// Adds everything in other to this.
|
||||
void Shape::AddShape(const Shape& other) {
|
||||
for (int c = 0; c < other.unichars_.size(); ++c) {
|
||||
for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
|
||||
AddToShape(other.unichars_[c].unichar_id,
|
||||
other.unichars_[c].font_ids[f]);
|
||||
}
|
||||
}
|
||||
unichars_sorted_ = unichars_.size() <= 1;
|
||||
}
|
||||
|
||||
// Returns true if the shape contains the given unichar_id, font_id pair.
|
||||
bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
|
||||
for (int c = 0; c < unichars_.size(); ++c) {
|
||||
if (unichars_[c].unichar_id == unichar_id) {
|
||||
// Found the unichar, so look for the font.
|
||||
GenericVector<int>& font_list = unichars_[c].font_ids;
|
||||
for (int f = 0; f < font_list.size(); ++f) {
|
||||
if (font_list[f] == font_id)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the shape contains the given unichar_id, ignoring font.
|
||||
bool Shape::ContainsUnichar(int unichar_id) const {
|
||||
for (int c = 0; c < unichars_.size(); ++c) {
|
||||
if (unichars_[c].unichar_id == unichar_id) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the shape contains the given font, ignoring unichar_id.
|
||||
bool Shape::ContainsFont(int font_id) const {
|
||||
for (int c = 0; c < unichars_.size(); ++c) {
|
||||
GenericVector<int>& font_list = unichars_[c].font_ids;
|
||||
for (int f = 0; f < font_list.size(); ++f) {
|
||||
if (font_list[f] == font_id)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if this is a subset (including equal) of other.
|
||||
bool Shape::IsSubsetOf(const Shape& other) const {
|
||||
for (int c = 0; c < unichars_.size(); ++c) {
|
||||
int unichar_id = unichars_[c].unichar_id;
|
||||
const GenericVector<int>& font_list = unichars_[c].font_ids;
|
||||
for (int f = 0; f < font_list.size(); ++f) {
|
||||
if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if the lists of unichar ids are the same in this and other,
// ignoring fonts.
// NOT const, as it will sort the unichars on demand.
bool Shape::IsEqualUnichars(Shape* other) {
  // Different sizes can never match; test cheaply before sorting anything.
  if (unichars_.size() != other->unichars_.size()) return false;
  // Lazily sort both sides so that equal sets compare element-by-element
  // regardless of the order in which unichars were added.
  if (!unichars_sorted_) SortUnichars();
  if (!other->unichars_sorted_) other->SortUnichars();
  for (int c = 0; c < unichars_.size(); ++c) {
    if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
      return false;
  }
  return true;
}
|
||||
|
||||
// Sorts the unichars_ vector by unichar.
void Shape::SortUnichars() {
  unichars_.sort(UnicharAndFonts::SortByUnicharId);
  // Record sortedness so IsEqualUnichars can skip re-sorting next time.
  unichars_sorted_ = true;
}
|
||||
|
||||
// Default constructor: no unicharset attached. DebugStr must not be used
// until set_unicharset() has been called.
ShapeTable::ShapeTable() : unicharset_(NULL) {
}
// The unicharset must outlive this ShapeTable; it is used only by DebugStr.
ShapeTable::ShapeTable(const UNICHARSET& unicharset)
  : unicharset_(&unicharset) {
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool ShapeTable::Serialize(FILE* fp) const {
|
||||
if (!shape_table_.Serialize(fp)) return false;
|
||||
return true;
|
||||
}
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
|
||||
if (!shape_table_.DeSerialize(swap, fp)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns a string listing the classes/fonts in a shape.
|
||||
STRING ShapeTable::DebugStr(int shape_id) const {
|
||||
if (shape_id < 0 || shape_id >= shape_table_.size())
|
||||
return STRING("INVALID_UNICHAR_ID");
|
||||
const Shape& shape = GetShape(shape_id);
|
||||
STRING result;
|
||||
result.add_str_int("Shape", shape_id);
|
||||
for (int c = 0; c < shape.size(); ++c) {
|
||||
result.add_str_int(" c_id=", shape[c].unichar_id);
|
||||
result += "=";
|
||||
result += unicharset_->id_to_unichar(shape[c].unichar_id);
|
||||
result.add_str_int(", ", shape[c].font_ids.size());
|
||||
result += " fonts =";
|
||||
for (int f = 0; f < shape[c].font_ids.size(); ++f) {
|
||||
result.add_str_int(" ", shape[c].font_ids[f]);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns a debug string summarizing the table.
|
||||
STRING ShapeTable::SummaryStr() const {
|
||||
int max_unichars = 0;
|
||||
int num_multi_shapes = 0;
|
||||
int num_master_shapes = 0;
|
||||
for (int s = 0; s < shape_table_.size(); ++s) {
|
||||
if (MasterDestinationIndex(s) != s) continue;
|
||||
++num_master_shapes;
|
||||
int shape_size = GetShape(s).size();
|
||||
if (shape_size > 1)
|
||||
++num_multi_shapes;
|
||||
if (shape_size > max_unichars)
|
||||
max_unichars = shape_size;
|
||||
}
|
||||
STRING result;
|
||||
result.add_str_int("Number of shapes = ", num_master_shapes);
|
||||
result.add_str_int(" max unichars = ", max_unichars);
|
||||
result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Adds a new shape starting with the given unichar_id and font_id.
|
||||
// Returns the assigned index.
|
||||
int ShapeTable::AddShape(int unichar_id, int font_id) {
|
||||
int index = shape_table_.size();
|
||||
Shape* shape = new Shape;
|
||||
shape->AddToShape(unichar_id, font_id);
|
||||
shape_table_.push_back(shape);
|
||||
return index;
|
||||
}
|
||||
|
||||
// Adds a copy of the given shape.
|
||||
// Returns the assigned index.
|
||||
int ShapeTable::AddShape(const Shape& other) {
|
||||
int index = shape_table_.size();
|
||||
Shape* shape = new Shape(other);
|
||||
shape_table_.push_back(shape);
|
||||
return index;
|
||||
}
|
||||
|
||||
// Removes the shape given by the shape index.
void ShapeTable::DeleteShape(int shape_id) {
  // Free the shape and null the slot before removal, presumably so the
  // owning PointerVector does not also delete it - TODO: confirm.
  delete shape_table_[shape_id];
  shape_table_[shape_id] = NULL;
  // remove() shifts later entries down: all shape ids above shape_id change
  // (see the warning on the declaration in shapetable.h).
  shape_table_.remove(shape_id);
}
|
||||
|
||||
// Adds a font_id to the given existing shape index for the given
|
||||
// unichar_id. If the unichar_id is not in the shape, it is added.
|
||||
void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
|
||||
Shape& shape = *shape_table_[shape_id];
|
||||
shape.AddToShape(unichar_id, font_id);
|
||||
}
|
||||
|
||||
// Adds the given shape to the existing shape with the given index.
|
||||
void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
|
||||
Shape& shape = *shape_table_[shape_id];
|
||||
shape.AddShape(other);
|
||||
}
|
||||
|
||||
// Returns the id of the shape that contains the given unichar and font.
|
||||
// If not found, returns -1.
|
||||
// If font_id < 0, the font_id is ignored and the first shape that matches
|
||||
// the unichar_id is returned.
|
||||
int ShapeTable::FindShape(int unichar_id, int font_id) const {
|
||||
for (int s = 0; s < shape_table_.size(); ++s) {
|
||||
const Shape& shape = GetShape(s);
|
||||
for (int c = 0; c < shape.size(); ++c) {
|
||||
if (shape[c].unichar_id == unichar_id) {
|
||||
if (font_id < 0)
|
||||
return s; // We don't care about the font.
|
||||
for (int f = 0; f < shape[c].font_ids.size(); ++f) {
|
||||
if (shape[c].font_ids[f] == font_id)
|
||||
return s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Returns the first unichar_id and font_id in the given shape.
// Assumes the shape is non-empty and its first entry has at least one
// font - TODO: confirm callers guarantee this.
void ShapeTable::GetFirstUnicharAndFont(int shape_id,
                                        int* unichar_id, int* font_id) const {
  const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
  *unichar_id = unichar_and_fonts.unichar_id;
  *font_id = unichar_and_fonts.font_ids[0];
}
|
||||
|
||||
// Expands all the classes/fonts in the shape individually to build
// a ShapeTable.
// Returns the number of shapes that absorbed a master shape in the process.
int ShapeTable::BuildFromShape(const Shape& shape,
                               const ShapeTable& master_shapes) {
  int num_masters = 0;
  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
    for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
      int c = shape[u_ind].unichar_id;
      int f = shape[u_ind].font_ids[f_ind];
      // Only create a new shape if no existing shape covers this pair.
      if (FindShape(c, f) < 0) {
        int shape_id = AddShape(c, f);
        int master_id = master_shapes.FindShape(c, f);
        // If the pair belongs to a master shape that is strictly contained
        // in the input shape, pull in the rest of that master.
        if (master_id >= 0 && shape.size() > 1) {
          const Shape& master = master_shapes.GetShape(master_id);
          if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) {
            // Add everything else from the master shape.
            shape_table_[shape_id]->AddShape(master);
            ++num_masters;
          }
        }
      }
    }
  }
  return num_masters;
}
|
||||
|
||||
// Returns true if the shapes are already merged.
|
||||
bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) {
|
||||
return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
|
||||
}
|
||||
|
||||
// Returns true if any shape contains multiple unichars.
|
||||
bool ShapeTable::AnyMultipleUnichars() {
|
||||
int num_shapes = NumShapes();
|
||||
for (int s1 = 0; s1 < num_shapes; ++s1) {
|
||||
if (MasterDestinationIndex(s1) != s1) continue;
|
||||
if (GetShape(s1).size() > 1)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns the maximum number of unichars over all shapes.
|
||||
int ShapeTable::MaxNumUnichars() const {
|
||||
int max_num_unichars = 0;
|
||||
int num_shapes = NumShapes();
|
||||
for (int s = 0; s < num_shapes; ++s) {
|
||||
if (GetShape(s).size() > max_num_unichars)
|
||||
max_num_unichars = GetShape(s).size();
|
||||
}
|
||||
return max_num_unichars;
|
||||
}
|
||||
|
||||
|
||||
// Merges shapes with a common unichar over the [start, end) interval.
// Assumes single unichar per shape.
void ShapeTable::ForceFontMerges(int start, int end) {
  for (int s1 = start; s1 < end; ++s1) {
    // Only un-merged, single-unichar shapes are considered as merge targets.
    if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
      int unichar_id = GetShape(s1)[0].unichar_id;
      for (int s2 = s1 + 1; s2 < end; ++s2) {
        // Merge every later un-merged single-unichar shape with the same
        // unichar into s1.
        if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
            unichar_id == GetShape(s2)[0].unichar_id) {
          MergeShapes(s1, s2);
        }
      }
    }
  }
  // Rebuild the table keeping only the surviving master shapes, discarding
  // the entries that were merged away.
  ShapeTable compacted(*unicharset_);
  compacted.AppendMasterShapes(*this);
  *this = compacted;
}
|
||||
|
||||
// Returns the number of unichars in the master shape.
|
||||
int ShapeTable::MasterUnicharCount(int shape_id) const {
|
||||
int master_id = MasterDestinationIndex(shape_id);
|
||||
return GetShape(master_id).size();
|
||||
}
|
||||
|
||||
// Returns the sum of the font counts in the master shape.
|
||||
int ShapeTable::MasterFontCount(int shape_id) const {
|
||||
int master_id = MasterDestinationIndex(shape_id);
|
||||
const Shape& shape = GetShape(master_id);
|
||||
int font_count = 0;
|
||||
for (int c = 0; c < shape.size(); ++c) {
|
||||
font_count += shape[c].font_ids.size();
|
||||
}
|
||||
return font_count;
|
||||
}
|
||||
|
||||
// Returns the number of unichars that would result from merging the shapes.
|
||||
int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
|
||||
// Do it the easy way for now.
|
||||
int master_id1 = MasterDestinationIndex(shape_id1);
|
||||
int master_id2 = MasterDestinationIndex(shape_id2);
|
||||
Shape combined_shape(*shape_table_[master_id1]);
|
||||
combined_shape.AddShape(*shape_table_[master_id2]);
|
||||
return combined_shape.size();
|
||||
}
|
||||
|
||||
// Merges two shape_ids, leaving shape_id2 marked as merged.
// Logs the merge unconditionally via tprintf.
void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
  // Resolve both ids to their current masters before merging.
  int master_id1 = MasterDestinationIndex(shape_id1);
  int master_id2 = MasterDestinationIndex(shape_id2);
  // Point master_id2 (and all merged shapes) to master_id1.
  shape_table_[master_id2]->set_destination_index(master_id1);
  // Add all the shapes of master_id2 to master_id1.
  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
  tprintf("Merged shape %d->%d, %d->%d, now with %d unichars: %s\n",
          shape_id1, master_id1, shape_id2, master_id2,
          shape_table_[master_id1]->size(),
          DebugStr(master_id1).string());
}
|
||||
|
||||
// Returns the destination of this shape, (if merged), taking into account
|
||||
// the fact that the destination may itself have been merged.
|
||||
int ShapeTable::MasterDestinationIndex(int shape_id) const {
|
||||
int dest_id = shape_table_[shape_id]->destination_index();
|
||||
if (dest_id == shape_id || dest_id < 0)
|
||||
return shape_id; // Is master already.
|
||||
int master_id = shape_table_[dest_id]->destination_index();
|
||||
if (master_id == dest_id || master_id < 0)
|
||||
return dest_id; // Dest is the master and shape_id points to it.
|
||||
master_id = MasterDestinationIndex(master_id);
|
||||
return master_id;
|
||||
}
|
||||
|
||||
// Appends the master shapes from other to this.
|
||||
void ShapeTable::AppendMasterShapes(const ShapeTable& other) {
|
||||
for (int s = 0; s < other.shape_table_.size(); ++s) {
|
||||
if (other.shape_table_[s]->destination_index() < 0) {
|
||||
AddShape(*other.shape_table_[s]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the number of master shapes remaining after merging.
|
||||
int ShapeTable::NumMasterShapes() const {
|
||||
int num_shapes = 0;
|
||||
for (int s = 0; s < shape_table_.size(); ++s) {
|
||||
if (shape_table_[s]->destination_index() < 0)
|
||||
++num_shapes;
|
||||
}
|
||||
return num_shapes;
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
|
227
classify/shapetable.h
Normal file
227
classify/shapetable.h
Normal file
@ -0,0 +1,227 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: shapetable.h
|
||||
// Description: Class to map a classifier shape index to unicharset
|
||||
// indices and font indices.
|
||||
// Author: Ray Smith
|
||||
// Created: Thu Oct 28 17:46:32 PDT 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
|
||||
#define TESSERACT_CLASSIFY_SHAPETABLE_H_
|
||||
|
||||
#include "genericvector.h"
|
||||
#include "intmatcher.h"
|
||||
|
||||
class STRING;
|
||||
class UNICHARSET;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Simple struct to hold a set of fonts associated with a single unichar-id.
// A vector of UnicharAndFonts makes a shape.
struct UnicharAndFonts {
  UnicharAndFonts() : unichar_id(0) {
  }
  // Starts the font list with the single given font_id.
  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
    font_ids.push_back(font_id);
  }

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp);
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Sort function to sort a pair of UnicharAndFonts by unichar_id.
  static int SortByUnicharId(const void* v1, const void* v2);

  // Ids of the fonts that were mapped to this unichar during training.
  GenericVector<int> font_ids;
  // Id of the unichar in the relevant UNICHARSET.
  int unichar_id;
};
|
||||
|
||||
// A Shape is a collection of unichar-ids and a list of fonts associated with
|
||||
// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
|
||||
// a classifiable unit, and represents a group of characters or parts of
|
||||
// characters that have a similar or identical shape. Shapes/ShapeTables may
|
||||
// be organized hierarchically from identical shapes at the leaves to vaguely
|
||||
// similar shapes near the root.
|
||||
class Shape {
|
||||
public:
|
||||
Shape() : destination_index_(-1) {}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp);
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
int destination_index() const {
|
||||
return destination_index_;
|
||||
}
|
||||
void set_destination_index(int index) {
|
||||
destination_index_ = index;
|
||||
}
|
||||
int size() const {
|
||||
return unichars_.size();
|
||||
}
|
||||
// Returns a UnicharAndFonts entry for the given index, which must be
|
||||
// in the range [0, size()).
|
||||
const UnicharAndFonts& operator[](int index) const {
|
||||
return unichars_[index];
|
||||
}
|
||||
// Adds a font_id for the given unichar_id. If the unichar_id is not
|
||||
// in the shape, it is added.
|
||||
void AddToShape(int unichar_id, int font_id);
|
||||
// Adds everything in other to this.
|
||||
void AddShape(const Shape& other);
|
||||
// Returns true if the shape contains the given unichar_id, font_id pair.
|
||||
bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
|
||||
// Returns true if the shape contains the given unichar_id, ignoring font.
|
||||
bool ContainsUnichar(int unichar_id) const;
|
||||
// Returns true if the shape contains the given font, ignoring unichar_id.
|
||||
bool ContainsFont(int font_id) const;
|
||||
// Returns true if this is a subset (including equal) of other.
|
||||
bool IsSubsetOf(const Shape& other) const;
|
||||
// Returns true if the lists of unichar ids are the same in this and other,
|
||||
// ignoring fonts.
|
||||
// NOT const, as it will sort the unichars on demand.
|
||||
bool IsEqualUnichars(Shape* other);
|
||||
|
||||
private:
|
||||
// Sorts the unichars_ vector by unichar.
|
||||
void SortUnichars();
|
||||
|
||||
// Flag indicates that the unichars are sorted, allowing faster set
|
||||
// operations with another shape.
|
||||
bool unichars_sorted_;
|
||||
// If this Shape is part of a ShapeTable the destiation_index_ is the index
|
||||
// of some other shape in the ShapeTable with which this shape is merged.
|
||||
int destination_index_;
|
||||
// Array of unichars, each with a set of fonts. Each unichar has at most
|
||||
// one entry in the vector.
|
||||
GenericVector<UnicharAndFonts> unichars_;
|
||||
};
|
||||
|
||||
// ShapeTable is a class to encapsulate the triple indirection that is
// used here.
// ShapeTable is a vector of shapes.
// Each shape is a vector of UnicharAndFonts representing the set of unichars
// that the shape represents.
// Each UnicharAndFonts also lists the fonts of the unichar_id that were
// mapped to the shape during training.
class ShapeTable {
 public:
  ShapeTable();
  // The UNICHARSET reference supplied here, or in set_unicharset below must
  // exist for the entire life of the ShapeTable. It is used only by DebugStr.
  explicit ShapeTable(const UNICHARSET& unicharset);

  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Accessors.
  int NumShapes() const {
    return shape_table_.size();
  }
  const UNICHARSET& unicharset() const {
    return *unicharset_;
  }
  // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
  // entire life of the ShapeTable.
  void set_unicharset(const UNICHARSET& unicharset) {
    unicharset_ = &unicharset;
  }
  // Returns a string listing the classes/fonts in a shape.
  STRING DebugStr(int shape_id) const;
  // Returns a debug string summarizing the table.
  STRING SummaryStr() const;

  // Adds a new shape starting with the given unichar_id and font_id.
  // Returns the assigned index.
  int AddShape(int unichar_id, int font_id);
  // Adds a copy of the given shape.
  // Returns the assigned index.
  int AddShape(const Shape& other);
  // Removes the shape given by the shape index. All indices above are changed!
  void DeleteShape(int shape_id);
  // Adds a font_id to the given existing shape index for the given
  // unichar_id. If the unichar_id is not in the shape, it is added.
  void AddToShape(int shape_id, int unichar_id, int font_id);
  // Adds the given shape to the existing shape with the given index.
  void AddShapeToShape(int shape_id, const Shape& other);
  // Returns the id of the shape that contains the given unichar and font.
  // If not found, returns -1.
  // If font_id < 0, the font_id is ignored and the first shape that matches
  // the unichar_id is returned.
  int FindShape(int unichar_id, int font_id) const;
  // Returns the first unichar_id and font_id in the given shape.
  void GetFirstUnicharAndFont(int shape_id,
                              int* unichar_id, int* font_id) const;

  // Accessors for the Shape with the given shape_id.
  const Shape& GetShape(int shape_id) const {
    return *shape_table_[shape_id];
  }
  Shape* MutableShape(int shape_id) {
    return shape_table_[shape_id];
  }

  // Expands all the classes/fonts in the shape individually to build
  // a ShapeTable.
  int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);

  // Returns true if the shapes are already merged.
  bool AlreadyMerged(int shape_id1, int shape_id2);
  // Returns true if any shape contains multiple unichars.
  bool AnyMultipleUnichars();
  // Returns the maximum number of unichars over all shapes.
  int MaxNumUnichars() const;
  // Merges shapes with a common unichar over the [start, end) interval.
  // Assumes single unichar per shape.
  void ForceFontMerges(int start, int end);
  // Returns the number of unichars in the master shape.
  int MasterUnicharCount(int shape_id) const;
  // Returns the sum of the font counts in the master shape.
  int MasterFontCount(int shape_id) const;
  // Returns the number of unichars that would result from merging the shapes.
  int MergedUnicharCount(int shape_id1, int shape_id2) const;
  // Merges two shape_ids, leaving shape_id2 marked as merged.
  void MergeShapes(int shape_id1, int shape_id2);
  // Appends the master shapes from other to this.
  // Used to create a clean ShapeTable from a merged one, or to create a
  // copy of a ShapeTable.
  void AppendMasterShapes(const ShapeTable& other);
  // Returns the number of master shapes remaining after merging.
  int NumMasterShapes() const;
  // Returns the destination of this shape, (if merged), taking into account
  // the fact that the destination may itself have been merged.
  // For a non-merged shape, returns the input shape_id.
  int MasterDestinationIndex(int shape_id) const;

 private:
  // Pointer to a provided unicharset used only by the Debugstr member.
  // May be NULL after default construction until set_unicharset is called.
  const UNICHARSET* unicharset_;
  // Vector of pointers to the Shapes in this ShapeTable.
  PointerVector<Shape> shape_table_;
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_
|
@ -70,7 +70,7 @@ void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) {
|
||||
if (Choices->length() == 0) {
|
||||
blob_choice =
|
||||
new BLOB_CHOICE(0, speckle_small_certainty + speckle_large_penalty,
|
||||
speckle_small_certainty, -1, -1, NULL);
|
||||
speckle_small_certainty, -1, -1, NULL, 0, 0, false);
|
||||
temp_it.add_to_end(blob_choice);
|
||||
return;
|
||||
}
|
||||
@ -81,7 +81,7 @@ void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) {
|
||||
blob_choice = temp_it.data(); // pick the worst choice
|
||||
temp_it.add_to_end(
|
||||
new BLOB_CHOICE(0, blob_choice->rating() + speckle_large_penalty,
|
||||
blob_choice->certainty(), -1, -1, NULL));
|
||||
blob_choice->certainty(), -1, -1, NULL, 0, 0, false));
|
||||
} /* AddLargeSpeckleTo */
|
||||
|
||||
|
||||
@ -100,18 +100,8 @@ void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) {
|
||||
*
|
||||
* @return TRUE if Blob is speckle, FALSE otherwise.
|
||||
*/
|
||||
BOOL8 LargeSpeckle(TBLOB *Blob) {
|
||||
double speckle_size;
|
||||
TPOINT TopLeft;
|
||||
TPOINT BottomRight;
|
||||
|
||||
speckle_size = BASELINE_SCALE * speckle_large_max_size;
|
||||
blob_bounding_box(Blob, &TopLeft, &BottomRight);
|
||||
|
||||
if (TopLeft.y - BottomRight.y < speckle_size &&
|
||||
BottomRight.x - TopLeft.x < speckle_size)
|
||||
return (TRUE);
|
||||
else
|
||||
return (FALSE);
|
||||
|
||||
BOOL8 LargeSpeckle(TBLOB *blob) {
|
||||
double speckle_size = BASELINE_SCALE * speckle_large_max_size;
|
||||
TBOX bbox = blob->bounding_box();
|
||||
return (bbox.width() < speckle_size && bbox.height() < speckle_size);
|
||||
} /* LargeSpeckle */
|
||||
|
52
classify/tessclassifier.cpp
Normal file
52
classify/tessclassifier.cpp
Normal file
@ -0,0 +1,52 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: tessclassifier.cpp
|
||||
// Description: Tesseract implementation of a ShapeClassifier.
|
||||
// Author: Ray Smith
|
||||
// Created: Tue Nov 22 14:16:25 PST 2011
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "tessclassifier.h"
|
||||
|
||||
#include "classify.h"
|
||||
#include "trainingsample.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Classifies the given [training] sample, writing to results.
|
||||
// See ShapeClassifier for a full description.
|
||||
int TessClassifier::ClassifySample(const TrainingSample& sample,
|
||||
Pix* page_pix, int debug, int keep_this,
|
||||
GenericVector<ShapeRating>* results) {
|
||||
if (debug) {
|
||||
classify_->matcher_debug_level.set_value(debug ? 2 : 0);
|
||||
classify_->matcher_debug_flags.set_value(debug ? 25 : 0);
|
||||
classify_->classify_debug_level.set_value(debug ? 3 : 0);
|
||||
} else {
|
||||
classify_->classify_debug_level.set_value(debug ? 2 : 0);
|
||||
}
|
||||
classify_->CharNormTrainingSample(pruner_only_, sample, results);
|
||||
return results->size();
|
||||
}
|
||||
|
||||
// Provides access to the ShapeTable that this classifier works with.
// Returns whatever classify_->shape_table() holds - may be NULL if no
// shape table was loaded; TODO: confirm callers handle NULL.
const ShapeTable* TessClassifier::GetShapeTable() const {
  return classify_->shape_table();
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
|
65
classify/tessclassifier.h
Normal file
65
classify/tessclassifier.h
Normal file
@ -0,0 +1,65 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: tessclassifier.h
|
||||
// Description: Tesseract implementation of a ShapeClassifier.
|
||||
// Author: Ray Smith
|
||||
// Created: Tue Nov 22 14:10:45 PST 2011
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_
|
||||
#define THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_
|
||||
|
||||
#include "shapeclassifier.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class Classify;
|
||||
class TrainingSample;
|
||||
|
||||
// Tesseract implementation of a ShapeClassifier.
// Due to limitations in the content of TrainingSample, this currently
// only works for the static classifier and only works if the ShapeTable
// in classify is not NULL.
class TessClassifier : public ShapeClassifier {
 public:
  // The classify pointer is borrowed and must outlive this object.
  TessClassifier(bool pruner_only, tesseract::Classify* classify)
    : pruner_only_(pruner_only), classify_(classify) {}
  virtual ~TessClassifier() {}

  // Classifies the given [training] sample, writing to results.
  // See ShapeClassifier for a full description.
  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
                             int debug, int keep_this,
                             GenericVector<ShapeRating>* results);
  // Provides access to the ShapeTable that this classifier works with.
  virtual const ShapeTable* GetShapeTable() const;

 private:
  // Indicates that this classifier is to use just the ClassPruner, or the
  // full classifier if false.
  bool pruner_only_;
  // Borrowed pointer to the actual Tesseract classifier.
  tesseract::Classify* classify_;
};
|
||||
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_TESSCLASSIFIER_H_ */
|
311
classify/trainingsample.cpp
Normal file
311
classify/trainingsample.cpp
Normal file
@ -0,0 +1,311 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "trainingsample.h"
|
||||
|
||||
#include <math.h>
|
||||
#include "allheaders.h"
|
||||
#include "helpers.h"
|
||||
#include "intfeaturemap.h"
|
||||
#include "normfeat.h"
|
||||
#include "shapetable.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
ELISTIZE(TrainingSample)
|
||||
|
||||
// Center of randomizing operations.
const int kRandomizingCenter = 128;

// Randomizing factors.
// Vertical shifts applied when generating randomized sample variants.
const int TrainingSample::kYShiftValues[kSampleYShiftSize] = {
  6, 3, -3, -6, 0
};
// Scale factors applied when generating randomized sample variants.
const double TrainingSample::kScaleValues[kSampleScaleSize] = {
  1.0625, 0.9375, 1.0
};

// Frees the owned feature arrays.
TrainingSample::~TrainingSample() {
  delete [] features_;
  delete [] micro_features_;
}
||||
|
||||
// WARNING! Serialize/DeSerialize do not save/restore the "cache" data
// members, which is mostly the mapped features, and the weight.
// It is assumed these can all be reconstructed from what is saved.
// Writes to the given file. Returns false in case of error.
bool TrainingSample::Serialize(FILE* fp) const {
  // Fixed-size scalar fields first.
  if (fwrite(&class_id_, sizeof(class_id_), 1, fp) != 1) return false;
  if (fwrite(&font_id_, sizeof(font_id_), 1, fp) != 1) return false;
  if (fwrite(&page_num_, sizeof(page_num_), 1, fp) != 1) return false;
  if (!bounding_box_.Serialize(fp)) return false;
  // Array lengths are written before the arrays so DeSerialize can size
  // the buffers it allocates.
  if (fwrite(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
  if (fwrite(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
    return false;
  if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_)
    return false;
  if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_,
             fp) != num_micro_features_)
    return false;
  // cn_feature_ and geo_feature_ have compile-time-fixed lengths, so no
  // counts are stored for them.
  if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
      kNumCNParams) return false;
  if (fwrite(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount)
    return false;
  return true;
}
|
||||
|
||||
// Creates from the given file. Returns NULL in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
TrainingSample* TrainingSample::DeSerializeCreate(bool swap, FILE* fp) {
|
||||
TrainingSample* sample = new TrainingSample;
|
||||
if (sample->DeSerialize(swap, fp)) return sample;
|
||||
delete sample;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool TrainingSample::DeSerialize(bool swap, FILE* fp) {
|
||||
if (fread(&class_id_, sizeof(class_id_), 1, fp) != 1) return false;
|
||||
if (fread(&font_id_, sizeof(font_id_), 1, fp) != 1) return false;
|
||||
if (fread(&page_num_, sizeof(page_num_), 1, fp) != 1) return false;
|
||||
if (!bounding_box_.DeSerialize(swap, fp)) return false;
|
||||
if (fread(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
|
||||
if (fread(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
|
||||
return false;
|
||||
if (swap) {
|
||||
ReverseN(&class_id_, sizeof(class_id_));
|
||||
ReverseN(&num_features_, sizeof(num_features_));
|
||||
ReverseN(&num_micro_features_, sizeof(num_micro_features_));
|
||||
}
|
||||
delete [] features_;
|
||||
features_ = new INT_FEATURE_STRUCT[num_features_];
|
||||
if (fread(features_, sizeof(*features_), num_features_, fp) != num_features_)
|
||||
return false;
|
||||
delete [] micro_features_;
|
||||
micro_features_ = new MicroFeature[num_micro_features_];
|
||||
if (fread(micro_features_, sizeof(*micro_features_), num_micro_features_,
|
||||
fp) != num_micro_features_)
|
||||
return false;
|
||||
if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) !=
|
||||
kNumCNParams) return false;
|
||||
if (fread(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Saves the given features into a TrainingSample.
// The features array is deep-copied, so the caller may delete it afterwards.
// Only the geometric entries (bottom/top/width from fx_info) are filled in;
// class_id_, font_id_ etc. keep their constructed defaults.
// NOTE(review): cn_feature_ and micro_features_ are NOT set here —
// presumably the caller fills them in before use/serialization; confirm.
TrainingSample* TrainingSample::CopyFromFeatures(
    const INT_FX_RESULT_STRUCT& fx_info, const INT_FEATURE_STRUCT* features,
    int num_features) {
  TrainingSample* sample = new TrainingSample;
  sample->num_features_ = num_features;
  sample->features_ = new INT_FEATURE_STRUCT[num_features];
  memcpy(sample->features_, features, num_features * sizeof(features[0]));
  sample->geo_feature_[GeoBottom] = fx_info.YBottom;
  sample->geo_feature_[GeoTop] = fx_info.YTop;
  sample->geo_feature_[GeoWidth] = fx_info.Width;
  // The fresh copy has no valid indexed/mapped feature cache.
  sample->features_are_indexed_ = false;
  sample->features_are_mapped_ = false;
  return sample;
}
|
||||
|
||||
// Constructs and returns a copy randomized by the method given by
|
||||
// the randomizer index. If index is out of [0, kSampleRandomSize) then
|
||||
// an exact copy is returned.
|
||||
TrainingSample* TrainingSample::RandomizedCopy(int index) const {
|
||||
TrainingSample* sample = Copy();
|
||||
if (index >= 0 && index < kSampleRandomSize) {
|
||||
++index; // Remove the first combination.
|
||||
int yshift = kYShiftValues[index / kSampleScaleSize];
|
||||
double scaling = kScaleValues[index % kSampleScaleSize];
|
||||
for (int i = 0; i < num_features_; ++i) {
|
||||
double result = (features_[i].X - kRandomizingCenter) * scaling;
|
||||
result += kRandomizingCenter;
|
||||
sample->features_[i].X = ClipToRange(static_cast<int>(result + 0.5), 0,
|
||||
MAX_UINT8);
|
||||
result = (features_[i].Y - kRandomizingCenter) * scaling;
|
||||
result += kRandomizingCenter + yshift;
|
||||
sample->features_[i].Y = ClipToRange(static_cast<int>(result + 0.5), 0,
|
||||
MAX_UINT8);
|
||||
}
|
||||
}
|
||||
return sample;
|
||||
}
|
||||
|
||||
// Constructs and returns an exact copy.
|
||||
TrainingSample* TrainingSample::Copy() const {
|
||||
TrainingSample* sample = new TrainingSample;
|
||||
sample->class_id_ = class_id_;
|
||||
sample->font_id_ = font_id_;
|
||||
sample->weight_ = weight_;
|
||||
sample->sample_index_ = sample_index_;
|
||||
sample->num_features_ = num_features_;
|
||||
if (num_features_ > 0) {
|
||||
sample->features_ = new INT_FEATURE_STRUCT[num_features_];
|
||||
memcpy(sample->features_, features_, num_features_ * sizeof(features_[0]));
|
||||
}
|
||||
sample->num_micro_features_ = num_micro_features_;
|
||||
if (num_micro_features_ > 0) {
|
||||
sample->micro_features_ = new MicroFeature[num_micro_features_];
|
||||
memcpy(sample->micro_features_, micro_features_,
|
||||
num_micro_features_ * sizeof(micro_features_[0]));
|
||||
}
|
||||
memcpy(sample->cn_feature_, cn_feature_, sizeof(*cn_feature_) * kNumCNParams);
|
||||
memcpy(sample->geo_feature_, geo_feature_, sizeof(*geo_feature_) * GeoCount);
|
||||
return sample;
|
||||
}
|
||||
|
||||
// Extracts the needed information from the CHAR_DESC_STRUCT.
// Pulls four feature sets out of char_desc (selected by the four type
// indices) and stores them as this sample's int, micro, CN and geometric
// features. Missing sets are reported but non-fatal: the corresponding
// members are left empty/unchanged.
void TrainingSample::ExtractCharDesc(int int_feature_type,
                                     int micro_type,
                                     int cn_type,
                                     int geo_type,
                                     CHAR_DESC_STRUCT* char_desc) {
  // Extract the INT features.
  if (features_ != NULL) delete [] features_;
  FEATURE_SET_STRUCT* char_features = char_desc->FeatureSets[int_feature_type];
  if (char_features == NULL) {
    tprintf("Error: no features to train on of type %s\n",
            kIntFeatureType);
    num_features_ = 0;
    features_ = NULL;
  } else {
    num_features_ = char_features->NumFeatures;
    features_ = new INT_FEATURE_STRUCT[num_features_];
    for (int f = 0; f < num_features_; ++f) {
      // Narrow each float param into the uinT8 X/Y/Theta representation.
      features_[f].X =
          static_cast<uinT8>(char_features->Features[f]->Params[IntX]);
      features_[f].Y =
          static_cast<uinT8>(char_features->Features[f]->Params[IntY]);
      features_[f].Theta =
          static_cast<uinT8>(char_features->Features[f]->Params[IntDir]);
      features_[f].CP_misses = 0;
    }
  }
  // Extract the Micro features.
  if (micro_features_ != NULL) delete [] micro_features_;
  char_features = char_desc->FeatureSets[micro_type];
  if (char_features == NULL) {
    tprintf("Error: no features to train on of type %s\n",
            kMicroFeatureType);
    num_micro_features_ = 0;
    micro_features_ = NULL;
  } else {
    num_micro_features_ = char_features->NumFeatures;
    micro_features_ = new MicroFeature[num_micro_features_];
    for (int f = 0; f < num_micro_features_; ++f) {
      for (int d = 0; d < MFCount; ++d) {
        micro_features_[f][d] = char_features->Features[f]->Params[d];
      }
    }
  }
  // Extract the CN feature. There must be exactly one if present.
  char_features = char_desc->FeatureSets[cn_type];
  if (char_features == NULL) {
    tprintf("Error: no CN feature to train on.\n");
  } else {
    ASSERT_HOST(char_features->NumFeatures == 1);
    cn_feature_[CharNormY] = char_features->Features[0]->Params[CharNormY];
    cn_feature_[CharNormLength] =
        char_features->Features[0]->Params[CharNormLength];
    cn_feature_[CharNormRx] = char_features->Features[0]->Params[CharNormRx];
    cn_feature_[CharNormRy] = char_features->Features[0]->Params[CharNormRy];
  }
  // Extract the Geo feature. There must be exactly one if present.
  char_features = char_desc->FeatureSets[geo_type];
  if (char_features == NULL) {
    tprintf("Error: no Geo feature to train on.\n");
  } else {
    ASSERT_HOST(char_features->NumFeatures == 1);
    geo_feature_[GeoBottom] = char_features->Features[0]->Params[GeoBottom];
    geo_feature_[GeoTop] = char_features->Features[0]->Params[GeoTop];
    geo_feature_[GeoWidth] = char_features->Features[0]->Params[GeoWidth];
  }
  // New raw features invalidate any cached indexed/mapped features.
  features_are_indexed_ = false;
  features_are_mapped_ = false;
}
|
||||
|
||||
// Sets the mapped_features_ from the features_ using the provided
|
||||
// feature_space to the indexed versions of the features.
|
||||
void TrainingSample::IndexFeatures(const IntFeatureSpace& feature_space) {
|
||||
GenericVector<int> indexed_features;
|
||||
feature_space.IndexAndSortFeatures(features_, num_features_,
|
||||
&mapped_features_);
|
||||
features_are_indexed_ = true;
|
||||
features_are_mapped_ = false;
|
||||
}
|
||||
|
||||
// Sets the mapped_features_ from the features using the provided
|
||||
// feature_map.
|
||||
void TrainingSample::MapFeatures(const IntFeatureMap& feature_map) {
|
||||
GenericVector<int> indexed_features;
|
||||
feature_map.feature_space().IndexAndSortFeatures(features_, num_features_,
|
||||
&indexed_features);
|
||||
feature_map.MapIndexedFeatures(indexed_features, &mapped_features_);
|
||||
features_are_indexed_ = false;
|
||||
features_are_mapped_ = true;
|
||||
}
|
||||
|
||||
// Returns a pix representing the sample. (Int features only.)
// Each feature is drawn as a 6-pixel stroke starting at (X, Y) in the
// direction encoded by Theta (a 0-255 binary angle).
// The returned Pix must be pixDestroyed by the caller.
Pix* TrainingSample::RenderToPix(const UNICHARSET* unicharset) const {
  Pix* pix = pixCreate(kIntFeatureExtent, kIntFeatureExtent, 1);
  for (int f = 0; f < num_features_; ++f) {
    int start_x = features_[f].X;
    // Feature Y grows upwards; Pix rows grow downwards, so flip.
    int start_y = kIntFeatureExtent - features_[f].Y;
    double dx = cos((features_[f].Theta / 256.0) * 2.0 * PI - PI);
    double dy = -sin((features_[f].Theta / 256.0) * 2.0 * PI - PI);
    for (int i = 0; i <= 5; ++i) {
      int x = static_cast<int>(start_x + dx * i);
      int y = static_cast<int>(start_y + dy * i);
      // NOTE(review): bounds are hard-coded to 256 — assumes
      // kIntFeatureExtent == 256; confirm if the extent ever changes.
      if (x >= 0 && x < 256 && y >= 0 && y < 256)
        pixSetPixel(pix, x, y, 1);
    }
  }
  // Label the pix with the unichar if a unicharset is available.
  if (unicharset != NULL)
    pixSetText(pix, unicharset->id_to_unichar(class_id_));
  return pix;
}
|
||||
|
||||
// Displays the features in the given window with the given color.
|
||||
void TrainingSample::DisplayFeatures(ScrollView::Color color,
|
||||
ScrollView* window) const {
|
||||
for (int f = 0; f < num_features_; ++f) {
|
||||
RenderIntFeature(window, &features_[f], color);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a pix of the original sample image. The pix is padded all round
|
||||
// by padding wherever possible.
|
||||
// The returned Pix must be pixDestroyed after use.
|
||||
// If the input page_pix is NULL, NULL is returned.
|
||||
Pix* TrainingSample::GetSamplePix(int padding, Pix* page_pix) const {
|
||||
if (page_pix == NULL)
|
||||
return NULL;
|
||||
int page_width = pixGetWidth(page_pix);
|
||||
int page_height = pixGetHeight(page_pix);
|
||||
TBOX padded_box = bounding_box();
|
||||
padded_box.pad(padding, padding);
|
||||
// Clip the padded_box to the limits of the page
|
||||
TBOX page_box(0, 0, page_width, page_height);
|
||||
padded_box &= page_box;
|
||||
Box* box = boxCreate(page_box.left(), page_height - page_box.top(),
|
||||
page_box.width(), page_box.height());
|
||||
Pix* sample_pix = pixClipRectangle(page_pix, box, NULL);
|
||||
boxDestroy(&box);
|
||||
return sample_pix;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
240
classify/trainingsample.h
Normal file
240
classify/trainingsample.h
Normal file
@ -0,0 +1,240 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_TRAINING_TRAININGSAMPLE_H__
|
||||
#define TESSERACT_TRAINING_TRAININGSAMPLE_H__
|
||||
|
||||
#include "elst.h"
|
||||
#include "featdefs.h"
|
||||
#include "intfx.h"
|
||||
#include "intmatcher.h"
|
||||
#include "matrix.h"
|
||||
#include "mf.h"
|
||||
#include "picofeat.h"
|
||||
#include "shapetable.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class IntFeatureMap;
|
||||
class IntFeatureSpace;
|
||||
class ShapeTable;
|
||||
|
||||
// Number of elements of cn_feature_.
static const int kNumCNParams = 4;
// Number of ways to shift the features when randomizing.
static const int kSampleYShiftSize = 5;
// Number of ways to scale the features when randomizing.
static const int kSampleScaleSize = 3;
// Total number of different ways to manipulate the features when randomizing.
// The first and last combinations are removed to avoid an excessive
// top movement (first) and an identity transformation (last).
// WARNING: To avoid patterned duplication of samples, be sure to keep
// kSampleRandomSize prime!
// Eg with current values (kSampleYShiftSize = 5 and kSampleScaleSize = 3)
// kSampleRandomSize is 13, which is prime.
static const int kSampleRandomSize = kSampleYShiftSize * kSampleScaleSize - 2;
// ASSERT_IS_PRIME(kSampleRandomSize) !!
|
||||
|
||||
// A single character sample used for training/evaluating classifiers:
// the extracted int/micro/CN/geometric features of one blob, plus identity
// (class, font, page, bounding box) and non-serialized training caches.
class TrainingSample : public ELIST_LINK {
 public:
  TrainingSample()
    : class_id_(INVALID_UNICHAR_ID), font_id_(0), page_num_(0),
      num_features_(0), num_micro_features_(0),
      features_(NULL), micro_features_(NULL), weight_(1.0),
      max_dist_(0.0), sample_index_(0),
      features_are_indexed_(false), features_are_mapped_(false),
      is_error_(false) {
  }
  ~TrainingSample();

  // Saves the given features into a TrainingSample. The features are copied,
  // so may be deleted afterwards. Delete the return value after use.
  static TrainingSample* CopyFromFeatures(const INT_FX_RESULT_STRUCT& fx_info,
                                          const INT_FEATURE_STRUCT* features,
                                          int num_features);
  // Constructs and returns a copy "randomized" by the method given by
  // the randomizer index. If index is out of [0, kSampleRandomSize) then
  // an exact copy is returned.
  TrainingSample* RandomizedCopy(int index) const;
  // Constructs and returns an exact copy.
  TrainingSample* Copy() const;

  // WARNING! Serialize/DeSerialize do not save/restore the "cache" data
  // members, which is mostly the mapped features, and the weight.
  // It is assumed these can all be reconstructed from what is saved.
  // Writes to the given file. Returns false in case of error.
  bool Serialize(FILE* fp) const;
  // Creates from the given file. Returns NULL in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  static TrainingSample* DeSerializeCreate(bool swap, FILE* fp);
  // Reads from the given file. Returns false in case of error.
  // If swap is true, assumes a big/little-endian swap is needed.
  bool DeSerialize(bool swap, FILE* fp);

  // Extracts the needed information from the CHAR_DESC_STRUCT.
  void ExtractCharDesc(int feature_type, int micro_type,
                       int cn_type, int geo_type,
                       CHAR_DESC_STRUCT* char_desc);

  // Sets the mapped_features_ from the features_ using the provided
  // feature_space to the indexed versions of the features.
  void IndexFeatures(const IntFeatureSpace& feature_space);
  // Sets the mapped_features_ from the features_ using the provided
  // feature_map.
  void MapFeatures(const IntFeatureMap& feature_map);

  // Returns a pix representing the sample. (Int features only.)
  Pix* RenderToPix(const UNICHARSET* unicharset) const;
  // Displays the features in the given window with the given color.
  void DisplayFeatures(ScrollView::Color color, ScrollView* window) const;

  // Returns a pix of the original sample image. The pix is padded all round
  // by padding wherever possible.
  // The returned Pix must be pixDestroyed after use.
  // If the input page_pix is NULL, NULL is returned.
  Pix* GetSamplePix(int padding, Pix* page_pix) const;

  // Accessors.
  UNICHAR_ID class_id() const {
    return class_id_;
  }
  void set_class_id(int id) {
    class_id_ = id;
  }
  int font_id() const {
    return font_id_;
  }
  void set_font_id(int id) {
    font_id_ = id;
  }
  int page_num() const {
    return page_num_;
  }
  void set_page_num(int page) {
    page_num_ = page;
  }
  const TBOX& bounding_box() const {
    return bounding_box_;
  }
  void set_bounding_box(const TBOX& box) {
    bounding_box_ = box;
  }
  int num_features() const {
    return num_features_;
  }
  const INT_FEATURE_STRUCT* features() const {
    return features_;
  }
  int num_micro_features() const {
    return num_micro_features_;
  }
  const MicroFeature* micro_features() const {
    return micro_features_;
  }
  float cn_feature(int index) const {
    return cn_feature_[index];
  }
  int geo_feature(int index) const {
    return geo_feature_[index];
  }
  double weight() const {
    return weight_;
  }
  void set_weight(double value) {
    weight_ = value;
  }
  double max_dist() const {
    return max_dist_;
  }
  void set_max_dist(double value) {
    max_dist_ = value;
  }
  int sample_index() const {
    return sample_index_;
  }
  void set_sample_index(int value) {
    sample_index_ = value;
  }
  bool features_are_mapped() const {
    return features_are_mapped_;
  }
  // Both accessors below return the same underlying vector; the asserts
  // guarantee it currently holds the interpretation the caller asked for.
  const GenericVector<int>& mapped_features() const {
    ASSERT_HOST(features_are_mapped_);
    return mapped_features_;
  }
  const GenericVector<int>& indexed_features() const {
    ASSERT_HOST(features_are_indexed_);
    return mapped_features_;
  }
  bool is_error() const {
    return is_error_;
  }
  void set_is_error(bool value) {
    is_error_ = value;
  }

 private:
  // Unichar id that this sample represents. There obviously must be a
  // reference UNICHARSET somewhere. Usually in TrainingSampleSet.
  UNICHAR_ID class_id_;
  // Font id in which this sample was printed. Refers to a fontinfo_table_ in
  // MasterTrainer.
  int font_id_;
  // Number of page that the sample came from.
  int page_num_;
  // Bounding box of sample in original image.
  TBOX bounding_box_;
  // Number of INT_FEATURE_STRUCT in features_ array.
  int num_features_;
  // Number of MicroFeature in micro_features_ array.
  int num_micro_features_;
  // Owned array of int features, deleted by the destructor.
  INT_FEATURE_STRUCT* features_;
  // Owned array of micro features, deleted by the destructor.
  MicroFeature* micro_features_;
  // The one and only CN feature. Indexed by NORM_PARAM_NAME enum.
  float cn_feature_[kNumCNParams];
  // The one and only geometric feature. (Aims at replacing cn_feature_).
  // Indexed by GeoParams enum in picofeat.h
  int geo_feature_[GeoCount];

  // Non-serialized cache data.
  // Weight used for boosting training.
  double weight_;
  // Maximum distance to other samples of same class/font used in computing
  // the canonical sample.
  double max_dist_;
  // Global index of this sample.
  int sample_index_;
  // Indexed/mapped features, as indicated by the bools below.
  GenericVector<int> mapped_features_;
  bool features_are_indexed_;
  bool features_are_mapped_;
  // True if the last classification was an error by the current definition.
  bool is_error_;

  // Randomizing factors.
  static const int kYShiftValues[kSampleYShiftSize];
  static const double kScaleValues[kSampleScaleSize];
};
|
||||
|
||||
ELISTIZEH(TrainingSample)
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_TRAININGSAMPLE_H__
|
870
classify/trainingsampleset.cpp
Normal file
870
classify/trainingsampleset.cpp
Normal file
@ -0,0 +1,870 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "trainingsampleset.h"
|
||||
#include "allheaders.h"
|
||||
#include "boxread.h"
|
||||
#include "fontinfo.h"
|
||||
#include "indexmapbidi.h"
|
||||
#include "intfeaturedist.h"
|
||||
#include "intfeaturemap.h"
|
||||
#include "intfeaturespace.h"
|
||||
#include "shapetable.h"
|
||||
#include "trainingsample.h"
|
||||
#include "unicity_table.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Debug hook: set to a class id (eg 37) to trace that char — presumably;
// -1 disables. TODO confirm against the rest of trainingsampleset.cpp.
const int kTestChar = -1;  // 37;
// Max number of distances to compute the squared way
const int kSquareLimit = 25;
// Prime numbers for subsampling distances.
const int kPrime1 = 17;
const int kPrime2 = 13;
// Min samples from which to start discarding outliers.
const int kMinOutlierSamples = 5;
|
||||
|
||||
// Default FontClassInfo: no samples and no canonical sample chosen yet.
TrainingSampleSet::FontClassInfo::FontClassInfo()
  : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
}

// Writes to the given file. Returns false in case of error.
// Field order defines the format; DeSerialize must match.
bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const {
  if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
    return false;
  if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
    return false;
  if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
  if (!samples.Serialize(fp)) return false;
  return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
  if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
    return false;
  if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
    return false;
  if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
  if (!samples.DeSerialize(swap, fp)) return false;
  if (swap) {
    // samples handles its own swapping; fix up the scalars read above.
    ReverseN(&num_raw_samples, sizeof(num_raw_samples));
    ReverseN(&canonical_sample, sizeof(canonical_sample));
    ReverseN(&canonical_dist, sizeof(canonical_dist));
  }
  return true;
}
|
||||
|
||||
// Keeps a reference to font_table, which must outlive this set.
TrainingSampleSet::TrainingSampleSet(const UnicityTable<FontInfo>& font_table)
  : num_raw_samples_(0), unicharset_size_(0),
    font_class_array_(NULL), fontinfo_table_(font_table) {
}

// NOTE(review): only the font/class array is deleted here; the TrainingSample
// pointers in samples_ are presumably owned and freed by the samples_
// container or elsewhere — confirm before changing ownership.
TrainingSampleSet::~TrainingSampleSet() {
  delete font_class_array_;
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
// Order: samples, unicharset, font id map, then an optional font/class
// array guarded by a one-byte presence flag.
bool TrainingSampleSet::Serialize(FILE* fp) const {
  if (!samples_.Serialize(fp)) return false;
  if (!unicharset_.save_to_file(fp)) return false;
  if (!font_id_map_.Serialize(fp)) return false;
  // One byte records whether font_class_array_ exists.
  inT8 not_null = font_class_array_ != NULL;
  if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
  if (not_null) {
    if (!font_class_array_->SerializeClasses(fp)) return false;
  }
  return true;
}

// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// Derived counts (num_raw_samples_, unicharset_size_) are recomputed from
// the loaded data rather than stored in the file.
bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
  if (!samples_.DeSerialize(swap, fp)) return false;
  num_raw_samples_ = samples_.size();
  if (!unicharset_.load_from_file(fp)) return false;
  if (!font_id_map_.DeSerialize(swap, fp)) return false;
  // Drop any existing array before conditionally reloading it.
  if (font_class_array_ != NULL) {
    delete font_class_array_;
    font_class_array_ = NULL;
  }
  inT8 not_null;
  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
  if (not_null) {
    // DeSerializeClasses resizes the 1x1 placeholder to the stored shape.
    FontClassInfo empty;
    font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
    if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
  }
  unicharset_size_ = unicharset_.size();
  return true;
}
|
||||
|
||||
// Load an initial unicharset, or set one up if the file cannot be read.
|
||||
void TrainingSampleSet::LoadUnicharset(const char* filename) {
|
||||
if (!unicharset_.load_from_file(filename)) {
|
||||
tprintf("Failed to load unicharset from file %s\n"
|
||||
"Building unicharset for boosting from scratch...\n",
|
||||
filename);
|
||||
unicharset_.clear();
|
||||
// Space character needed to represent NIL_LIST classification.
|
||||
unicharset_.unichar_insert(" ");
|
||||
}
|
||||
unicharset_size_ = unicharset_.size();
|
||||
}
|
||||
|
||||
// Adds a character sample to this sample set.
|
||||
// If the unichar is not already in the local unicharset, it is added.
|
||||
// Returns the unichar_id of the added sample, from the local unicharset.
|
||||
int TrainingSampleSet::AddSample(const char* unichar, TrainingSample* sample) {
|
||||
if (!unicharset_.contains_unichar(unichar)) {
|
||||
unicharset_.unichar_insert(unichar);
|
||||
if (unicharset_.size() > MAX_NUM_CLASSES) {
|
||||
tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
|
||||
"greater than MAX_NUM_CLASSES\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
|
||||
AddSample(char_id, sample);
|
||||
return char_id;
|
||||
}
|
||||
|
||||
// Adds a character sample to this sample set with the given unichar_id,
// which must correspond to the local unicharset (in this).
// The set takes the sample pointer — presumably assuming ownership, given
// the unconditional push_back; confirm against callers.
void TrainingSampleSet::AddSample(int unichar_id, TrainingSample* sample) {
  sample->set_class_id(unichar_id);
  samples_.push_back(sample);
  // Keep the derived counts in sync with the containers.
  num_raw_samples_ = samples_.size();
  unicharset_size_ = unicharset_.size();
}
|
||||
|
||||
// Returns the number of samples for the given font,class pair.
|
||||
// If randomize is true, returns the number of samples accessible
|
||||
// with randomizing on. (Increases the number of samples if small.)
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
|
||||
bool randomize) const {
|
||||
ASSERT_HOST(font_class_array_ != NULL);
|
||||
if (font_id < 0 || class_id < 0 ||
|
||||
font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
|
||||
// There are no samples because the font or class doesn't exist.
|
||||
return 0;
|
||||
}
|
||||
int font_index = font_id_map_.SparseToCompact(font_id);
|
||||
if (font_index < 0)
|
||||
return 0; // The font has no samples.
|
||||
if (randomize)
|
||||
return (*font_class_array_)(font_index, class_id).samples.size();
|
||||
else
|
||||
return (*font_class_array_)(font_index, class_id).num_raw_samples;
|
||||
}
|
||||
|
||||
// Gets a sample by its global index into samples_.
const TrainingSample* TrainingSampleSet::GetSample(int index) const {
  return samples_[index];
}

// Gets a sample by its font, class, index.
// OrganizeByFontAndClass must have been already called.
// Returns NULL if the font has no samples. class_id and index are not
// range-checked here.
const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
                                                   int index) const {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  // samples holds global indices into samples_.
  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
  return samples_[sample_index];
}

// Get a sample by its font, class, index. Does not randomize.
// OrganizeByFontAndClass must have been already called.
// Mutable twin of the const GetSample above.
TrainingSample* TrainingSampleSet::MutableSample(int font_id, int class_id,
                                                 int index) {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_index = font_id_map_.SparseToCompact(font_id);
  if (font_index < 0) return NULL;
  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
  return samples_[sample_index];
}
|
||||
|
||||
// Returns a string debug representation of the given sample:
// font, unichar_str, bounding box, page.
// The box-file portion is built by MakeBoxFileStr; the font name is looked
// up in the referenced fontinfo_table_.
STRING TrainingSampleSet::SampleToString(const TrainingSample& sample) const {
  STRING boxfile_str;
  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
                 sample.bounding_box(), sample.page_num(), &boxfile_str);
  return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
}
|
||||
|
||||
// Gets the combined set of features used by all the samples of the given
// font/class combination.
// The font must exist in the compact map (asserted); class_id is not
// range-checked.
const BitVector& TrainingSampleSet::GetCloudFeatures(
    int font_id, int class_id) const {
  int font_index = font_id_map_.SparseToCompact(font_id);
  ASSERT_HOST(font_index >= 0);
  return (*font_class_array_)(font_index, class_id).cloud_features;
}
// Gets the indexed features of the canonical sample of the given
// font/class combination.
// Same preconditions as GetCloudFeatures.
const GenericVector<int>& TrainingSampleSet::GetCanonicalFeatures(
    int font_id, int class_id) const {
  int font_index = font_id_map_.SparseToCompact(font_id);
  ASSERT_HOST(font_index >= 0);
  return (*font_class_array_)(font_index, class_id).canonical_features;
}
|
||||
|
||||
// Returns the distance between the given UniCharAndFonts pair.
|
||||
// If matched_fonts, only matching fonts, are considered, unless that yields
|
||||
// the empty set.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
float TrainingSampleSet::UnicharDistance(const UnicharAndFonts& uf1,
|
||||
const UnicharAndFonts& uf2,
|
||||
bool matched_fonts,
|
||||
const IntFeatureMap& feature_map) {
|
||||
int num_fonts1 = uf1.font_ids.size();
|
||||
int c1 = uf1.unichar_id;
|
||||
int num_fonts2 = uf2.font_ids.size();
|
||||
int c2 = uf2.unichar_id;
|
||||
double dist_sum = 0.0;
|
||||
int dist_count = 0;
|
||||
bool debug = false;
|
||||
if (matched_fonts) {
|
||||
// Compute distances only where fonts match.
|
||||
for (int i = 0; i < num_fonts1; ++i) {
|
||||
int f1 = uf1.font_ids[i];
|
||||
for (int j = 0; j < num_fonts2; ++j) {
|
||||
int f2 = uf2.font_ids[j];
|
||||
if (f1 == f2) {
|
||||
dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
|
||||
++dist_count;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
|
||||
// Small enough sets to compute all the distances.
|
||||
for (int i = 0; i < num_fonts1; ++i) {
|
||||
int f1 = uf1.font_ids[i];
|
||||
for (int j = 0; j < num_fonts2; ++j) {
|
||||
int f2 = uf2.font_ids[j];
|
||||
dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
|
||||
if (debug) {
|
||||
tprintf("Cluster dist %d %d %d %d = %g\n",
|
||||
f1, c1, f2, c2,
|
||||
ClusterDistance(f1, c1, f2, c2, feature_map));
|
||||
}
|
||||
++dist_count;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Subsample distances, using the largest set once, and stepping through
|
||||
// the smaller set so as to ensure that all the pairs are different.
|
||||
int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
|
||||
int index = 0;
|
||||
int num_samples = MAX(num_fonts1, num_fonts2);
|
||||
for (int i = 0; i < num_samples; ++i, index += increment) {
|
||||
int f1 = uf1.font_ids[i % num_fonts1];
|
||||
int f2 = uf2.font_ids[index % num_fonts2];
|
||||
if (debug) {
|
||||
tprintf("Cluster dist %d %d %d %d = %g\n",
|
||||
f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
|
||||
}
|
||||
dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
|
||||
++dist_count;
|
||||
}
|
||||
}
|
||||
if (dist_count == 0) {
|
||||
if (matched_fonts)
|
||||
return UnicharDistance(uf1, uf2, false, feature_map);
|
||||
return 0.0f;
|
||||
}
|
||||
return dist_sum / dist_count;
|
||||
}
|
||||
|
||||
// Returns the distance between the given pair of font/class pairs.
// Finds in cache or computes and caches.
// OrganizeByFontAndClass must have been already called.
// Three caches are used, chosen by which ids match:
//   same font  -> unichar_distance_cache, indexed by the other class_id.
//   same class -> font_distance_cache, indexed by the other compact font.
//   neither    -> distance_cache, a linear list of FontClassDistance.
// Results are always mirrored into the symmetric entry.
float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
                                         int font_id2, int class_id2,
                                         const IntFeatureMap& feature_map) {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_index1 = font_id_map_.SparseToCompact(font_id1);
  int font_index2 = font_id_map_.SparseToCompact(font_id2);
  if (font_index1 < 0 || font_index2 < 0)
    return 0.0f;
  FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
  if (font_id1 == font_id2) {
    // Special case cache for speed.
    // Caches are lazily allocated; -1 marks "not yet computed".
    if (fc_info.unichar_distance_cache.size() == 0)
      fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
    if (fc_info.unichar_distance_cache[class_id2] < 0) {
      // Distance has to be calculated.
      float result = ComputeClusterDistance(font_id1, class_id1,
                                            font_id2, class_id2,
                                            feature_map);
      fc_info.unichar_distance_cache[class_id2] = result;
      // Copy to the symmetric cache entry.
      FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
      if (fc_info2.unichar_distance_cache.size() == 0)
        fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
      fc_info2.unichar_distance_cache[class_id1] = result;
    }
    return fc_info.unichar_distance_cache[class_id2];
  } else if (class_id1 == class_id2) {
    // Another special-case cache for equal class-id.
    if (fc_info.font_distance_cache.size() == 0)
      fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
                                               -1.0f);
    if (fc_info.font_distance_cache[font_index2] < 0) {
      // Distance has to be calculated.
      float result = ComputeClusterDistance(font_id1, class_id1,
                                            font_id2, class_id2,
                                            feature_map);
      fc_info.font_distance_cache[font_index2] = result;
      // Copy to the symmetric cache entry.
      FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
      if (fc_info2.font_distance_cache.size() == 0)
        fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
                                                  -1.0f);
      fc_info2.font_distance_cache[font_index1] = result;
    }
    return fc_info.font_distance_cache[font_index2];
  }
  // Both font and class are different. Linear search for class_id2/font_id2
  // in what is a hopefully short list of distances.
  int cache_index = 0;
  while (cache_index < fc_info.distance_cache.size() &&
         (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
          fc_info.distance_cache[cache_index].font_id != font_id2))
    ++cache_index;
  if (cache_index == fc_info.distance_cache.size()) {
    // Distance has to be calculated.
    float result = ComputeClusterDistance(font_id1, class_id1,
                                          font_id2, class_id2,
                                          feature_map);
    FontClassDistance fc_dist = { class_id2, font_id2, result };
    // Append at cache_index, which is then read by the return below.
    fc_info.distance_cache.push_back(fc_dist);
    // Copy to the symmetric cache entry. We know it isn't there already, as
    // we always copy to the symmetric entry.
    FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
    fc_dist.unichar_id = class_id1;
    fc_dist.font_id = font_id1;
    fc_info2.distance_cache.push_back(fc_dist);
  }
  return fc_info.distance_cache[cache_index].distance;
}
|
||||
|
||||
// Computes the distance between the given pair of font/class pairs.
|
||||
float TrainingSampleSet::ComputeClusterDistance(
|
||||
int font_id1, int class_id1, int font_id2, int class_id2,
|
||||
const IntFeatureMap& feature_map) const {
|
||||
int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
|
||||
feature_map, false);
|
||||
dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
|
||||
feature_map, false);
|
||||
int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
|
||||
denominator += GetCanonicalFeatures(font_id2, class_id2).size();
|
||||
return static_cast<float>(dist) / denominator;
|
||||
}
|
||||
|
||||
// Helper to add a feature and its near neighbors to the good_features.
// levels indicates how many times to compute the offset features of what is
// already there. This is done by iteration rather than recursion.
// Note: duplicates may be appended if neighborhoods overlap; callers only
// scan the list, so duplicates are harmless.
static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
                            GenericVector<int>* good_features) {
  int prev_num_features = 0;
  good_features->push_back(f);
  int num_features = 1;
  for (int level = 0; level < levels; ++level) {
    // Expand only the frontier added by the previous level:
    // indices [prev_num_features, num_features). Earlier entries have
    // already had their neighbors added.
    for (int i = prev_num_features; i < num_features; ++i) {
      int feature = (*good_features)[i];
      for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
        if (dir == 0) continue;  // Offset 0 is the feature itself.
        int f1 = feature_map.OffsetFeature(feature, dir);
        // OffsetFeature returns a negative index when no neighbor exists
        // in that direction.
        if (f1 >= 0) {
          good_features->push_back(f1);
        }
      }
    }
    prev_num_features = num_features;
    num_features = good_features->size();
  }
}
|
||||
|
||||
// Returns the number of canonical features of font/class 2 for which
// neither the feature nor any of its near neighbors occurs in the cloud
// of font/class 1. Each such feature is a reliable separation between
// the classes, ASSUMING that the canonical sample is sufficiently
// representative that every sample has a feature near that particular
// feature. To check that this is so on the fly would be prohibitively
// expensive, but it might be possible to pre-qualify the canonical features
// to include only those for which this assumption is true.
// ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
// first, or the results will be nonsense.
int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
                                         int font_id2, int class_id2,
                                         const IntFeatureMap& feature_map,
                                         bool thorough) const {
  // NOTE(review): the thorough parameter is not used in this body -- confirm
  // whether a deeper neighbor search (levels > 1) was intended when true.
  int result = 0;
  const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
  if (sample2 == NULL)
    return 0;  // There are no canonical features.
  const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
                                                              class_id2);
  const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
  if (cloud1.size() == 0)
    return canonical2.size();  // There are no cloud features.

  // Find a canonical2 feature that is not in cloud1.
  for (int f = 0; f < canonical2.size(); ++f) {
    int feature = canonical2[f];
    if (cloud1[feature])
      continue;
    // Gather the near neighbours of f (1 level of offsets).
    GenericVector<int> good_features;
    AddNearFeatures(feature_map, feature, 1, &good_features);
    // Check that none of the good_features are in the cloud.
    int i;
    for (i = 0; i < good_features.size(); ++i) {
      int good_f = good_features[i];
      if (cloud1[good_f]) {
        break;
      }
    }
    if (i < good_features.size())
      continue;  // Found one in the cloud.
    // Neither the feature nor any neighbor is in the cloud: it separates.
    ++result;
  }
  return result;
}
|
||||
|
||||
// Returns the total index of the requested sample.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
|
||||
int index) const {
|
||||
ASSERT_HOST(font_class_array_ != NULL);
|
||||
int font_index = font_id_map_.SparseToCompact(font_id);
|
||||
if (font_index < 0) return -1;
|
||||
return (*font_class_array_)(font_index, class_id).samples[index];
|
||||
}
|
||||
|
||||
// Gets the canonical sample for the given font, class pair.
|
||||
// ComputeCanonicalSamples must have been called first.
|
||||
const TrainingSample* TrainingSampleSet::GetCanonicalSample(
|
||||
int font_id, int class_id) const {
|
||||
ASSERT_HOST(font_class_array_ != NULL);
|
||||
int font_index = font_id_map_.SparseToCompact(font_id);
|
||||
if (font_index < 0) return NULL;
|
||||
int sample_index = (*font_class_array_)(font_index,
|
||||
class_id).canonical_sample;
|
||||
return sample_index >= 0 ? samples_[sample_index] : NULL;
|
||||
}
|
||||
|
||||
// Gets the max distance for the given canonical sample.
|
||||
// ComputeCanonicalSamples must have been called first.
|
||||
float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
|
||||
ASSERT_HOST(font_class_array_ != NULL);
|
||||
int font_index = font_id_map_.SparseToCompact(font_id);
|
||||
if (font_index < 0) return 0.0f;
|
||||
if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
|
||||
return (*font_class_array_)(font_index, class_id).canonical_dist;
|
||||
else
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
// Generates indexed features for all samples with the supplied feature_space.
|
||||
void TrainingSampleSet::IndexFeatures(const IntFeatureSpace& feature_space) {
|
||||
for (int s = 0; s < samples_.size(); ++s)
|
||||
samples_[s]->IndexFeatures(feature_space);
|
||||
}
|
||||
|
||||
// Delete outlier samples with few features that are shared with others.
// IndexFeatures must have been called already.
// If debug is true, the deleted outliers (each followed by its predecessor
// for comparison) are rendered to outliers.png.
void TrainingSampleSet::DeleteOutliers(const IntFeatureSpace& feature_space,
                                       bool debug) {
  if (font_class_array_ == NULL)
    OrganizeByFontAndClass();
  Pixa* pixa = NULL;
  if (debug)
    pixa = pixaCreate(0);
  GenericVector<int> feature_counts;
  int fs_size = feature_space.Size();
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    for (int c = 0; c < unicharset_size_; ++c) {
      // Create a histogram of the features used by all samples of this
      // font/class combination.
      feature_counts.init_to_size(fs_size, 0);
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      int sample_count = fcinfo.samples.size();
      // Too few samples to judge what is an outlier.
      if (sample_count < kMinOutlierSamples)
        continue;
      for (int i = 0; i < sample_count; ++i) {
        int s = fcinfo.samples[i];
        const GenericVector<int>& features = samples_[s]->indexed_features();
        for (int f = 0; f < features.size(); ++f) {
          ++feature_counts[features[f]];
        }
      }
      // Second pass: classify each sample's features against the histogram.
      for (int i = 0; i < sample_count; ++i) {
        int s = fcinfo.samples[i];
        const TrainingSample& sample = *samples_[s];
        const GenericVector<int>& features = sample.indexed_features();
        // A feature that has a histogram count of 1 is only used by this
        // sample, making it 'bad'. All others are 'good'.
        int good_features = 0;
        int bad_features = 0;
        for (int f = 0; f < features.size(); ++f) {
          if (feature_counts[features[f]] > 1)
            ++good_features;
          else
            ++bad_features;
        }
        // If more than 1/3 features are bad, then this is an outlier.
        if (bad_features * 2 > good_features) {
          tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
                  SampleToString(sample).string(),
                  good_features, bad_features);
          if (debug) {
            pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
            // Add the previous sample as well, so it is easier to see in
            // the output what is wrong with this sample.
            // Assumes kMinOutlierSamples >= 2 so samples[1] exists when
            // i == 0 -- TODO confirm the constant's value.
            int t;
            if (i == 0)
              t = fcinfo.samples[1];
            else
              t = fcinfo.samples[i - 1];
            const TrainingSample &csample = *samples_[t];
            pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
          }
          // Mark the sample for deletion.
          KillSample(samples_[s]);
        }
      }
    }
  }
  // Truly delete all bad samples and renumber everything.
  DeleteDeadSamples();
  if (pixa != NULL) {
    Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
    pixaDestroy(&pixa);
    pixWrite("outliers.png", pix, IFF_PNG);
    pixDestroy(&pix);
  }
}
|
||||
|
||||
// Marks the given sample index for deletion.
// Deletion is actually completed by DeleteDeadSamples.
void TrainingSampleSet::KillSample(TrainingSample* sample) {
  // NOTE(review): this sets sample_index to -1, but DeleteableSample tests
  // class_id() < 0 -- confirm that set_sample_index(-1) also marks the
  // class_id as negative, or killed samples may survive DeleteDeadSamples.
  sample->set_sample_index(-1);
}
|
||||
|
||||
// Deletes all samples with zero features marked by KillSample.
void TrainingSampleSet::DeleteDeadSamples() {
  // compact() removes entries for which the callback returns true and
  // renumbers the survivors in place.
  samples_.compact(
      NewPermanentTessCallback(this, &TrainingSampleSet::DeleteableSample));
  num_raw_samples_ = samples_.size();
  // Samples must be re-organized now we have deleted a few.
  // (Callers are expected to invoke OrganizeByFontAndClass again.)
}
|
||||
|
||||
// Callback function returns true if the given sample is to be deleted, due
|
||||
// to having a negative classid.
|
||||
bool TrainingSampleSet::DeleteableSample(const TrainingSample* sample) {
|
||||
return sample == NULL || sample->class_id() < 0;
|
||||
}
|
||||
|
||||
// Prints the sample's original (and, if present, mapped) features to stderr
// and returns a rendering of the sample. Caller owns the returned Pix.
static Pix* DebugSample(const UNICHARSET& unicharset,
                        TrainingSample* sample) {
  tprintf("\nOriginal features:\n");
  const int num_features = sample->num_features();
  for (int f = 0; f < num_features; ++f) {
    sample->features()[f].print();
  }
  if (sample->features_are_mapped()) {
    tprintf("\nMapped features:\n");
    const int num_mapped = sample->mapped_features().size();
    for (int f = 0; f < num_mapped; ++f) {
      tprintf("%d ", sample->mapped_features()[f]);
    }
    tprintf("\n");
  }
  return sample->RenderToPix(&unicharset);
}
|
||||
|
||||
// Construct an array to access the samples by font,class pair.
|
||||
void TrainingSampleSet::OrganizeByFontAndClass() {
|
||||
// Font indexes are sparse, so we used a map to compact them, so we can
|
||||
// have an efficient 2-d array of fonts and character classes.
|
||||
SetupFontIdMap();
|
||||
int compact_font_size = font_id_map_.CompactSize();
|
||||
// Get a 2-d array of generic vectors.
|
||||
if (font_class_array_ != NULL)
|
||||
delete font_class_array_;
|
||||
FontClassInfo empty;
|
||||
font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
|
||||
compact_font_size, unicharset_size_, empty);
|
||||
for (int s = 0; s < samples_.size(); ++s) {
|
||||
int font_id = samples_[s]->font_id();
|
||||
int class_id = samples_[s]->class_id();
|
||||
if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
|
||||
tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
|
||||
font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
|
||||
s);
|
||||
}
|
||||
ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
|
||||
ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
|
||||
int font_index = font_id_map_.SparseToCompact(font_id);
|
||||
(*font_class_array_)(font_index, class_id).samples.push_back(s);
|
||||
}
|
||||
// Set the num_raw_samples member of the FontClassInfo, to set the boundary
|
||||
// between the raw samples and the replicated ones.
|
||||
for (int f = 0; f < compact_font_size; ++f) {
|
||||
for (int c = 0; c < unicharset_size_; ++c)
|
||||
(*font_class_array_)(f, c).num_raw_samples =
|
||||
(*font_class_array_)(f, c).samples.size();
|
||||
}
|
||||
// This is the global number of samples and also marks the boundary between
|
||||
// real and replicated samples.
|
||||
num_raw_samples_ = samples_.size();
|
||||
}
|
||||
|
||||
// Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
|
||||
// index for the font_class_array_.
|
||||
void TrainingSampleSet::SetupFontIdMap() {
|
||||
// Number of samples for each font_id.
|
||||
GenericVector<int> font_counts;
|
||||
for (int s = 0; s < samples_.size(); ++s) {
|
||||
int font_id = samples_[s]->font_id();
|
||||
while (font_id >= font_counts.size())
|
||||
font_counts.push_back(0);
|
||||
++font_counts[font_id];
|
||||
}
|
||||
font_id_map_.Init(font_counts.size(), false);
|
||||
for (int f = 0; f < font_counts.size(); ++f) {
|
||||
font_id_map_.SetMap(f, font_counts[f] > 0);
|
||||
}
|
||||
font_id_map_.Setup();
|
||||
}
|
||||
|
||||
|
||||
// Finds the sample for each font, class pair that has least maximum
// distance to all the other samples of the same font, class.
// OrganizeByFontAndClass must have been already called.
void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map,
                                                bool debug) {
  ASSERT_HOST(font_class_array_ != NULL);
  IntFeatureDist f_table;
  if (debug) tprintf("feature table size %d\n", map.sparse_size());
  f_table.Init(&map);
  // Track the globally farthest-apart pair for debug output.
  int worst_s1 = 0;
  int worst_s2 = 0;
  double global_worst_dist = 0.0;
  // Compute distances independently for each font and char index.
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    int font_id = font_id_map_.CompactToSparse(font_index);
    for (int c = 0; c < unicharset_size_; ++c) {
      int samples_found = 0;
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      if (fcinfo.samples.size() == 0 ||
          (kTestChar >= 0 && c != kTestChar)) {
        // Nothing to do, or restricted to a single test character.
        fcinfo.canonical_sample = -1;
        fcinfo.canonical_dist = 0.0f;
        if (debug) tprintf("Skipping class %d\n", c);
        continue;
      }
      // The canonical sample will be the one with the min_max_dist, which
      // is the sample with the lowest maximum distance to all other samples.
      double min_max_dist = 2.0;
      // We keep track of the farthest apart pair (max_s1, max_s2) which
      // are max_max_dist apart, so we can see how bad the variability is.
      double max_max_dist = 0.0;
      int max_s1 = 0;
      int max_s2 = 0;
      fcinfo.canonical_sample = fcinfo.samples[0];
      fcinfo.canonical_dist = 0.0f;
      for (int i = 0; i < fcinfo.samples.size(); ++i) {
        int s1 = fcinfo.samples[i];
        const GenericVector<int>& features1 = samples_[s1]->indexed_features();
        // Load s1's features into the distance table for fast comparison.
        f_table.Set(features1, features1.size(), true);
        double max_dist = 0.0;
        // Run the full squared-order search for similar samples. It is still
        // reasonably fast because f_table.FeatureDistance is fast, but we
        // may have to reconsider if we start playing with too many samples
        // of a single char/font.
        for (int j = 0; j < fcinfo.samples.size(); ++j) {
          int s2 = fcinfo.samples[j];
          if (samples_[s2]->class_id() != c ||
              samples_[s2]->font_id() != font_id ||
              s2 == s1)
            continue;
          GenericVector<int> features2 = samples_[s2]->indexed_features();
          double dist = f_table.FeatureDistance(features2);
          int height = samples_[s2]->geo_feature(GeoTop) -
              samples_[s2]->geo_feature(GeoBottom);
          if (dist == 1.0 && height > 64) {
            // TODO(rays) rethink this when the polygonal approximation goes.
            // Currently it is possible for dots and other small characters
            // to be completely different, even within the same class.
            f_table.DebugFeatureDistance(features2);
          }
          if (dist > max_dist) {
            max_dist = dist;
            if (dist > max_max_dist) {
              max_s1 = s1;
              max_s2 = s2;
            }
          }
        }
        // Using Set(..., false) is far faster than re initializing, due to
        // the sparseness of the feature space.
        f_table.Set(features1, features1.size(), false);
        samples_[s1]->set_max_dist(max_dist);
        ++samples_found;
        // New best candidate: s1 is closer to its farthest peer than any
        // previous sample was.
        if (max_dist < min_max_dist) {
          fcinfo.canonical_sample = s1;
          fcinfo.canonical_dist = max_dist;
        }
        UpdateRange(max_dist, &min_max_dist, &max_max_dist);
      }
      if (max_max_dist > global_worst_dist) {
        // Keep a record of the worst pair over all characters/fonts too.
        global_worst_dist = max_max_dist;
        worst_s1 = max_s1;
        worst_s2 = max_s2;
      }
      if (debug) {
        tprintf("Found %d samples of class %d=%s, font %d, "
                "dist range [%g, %g], worst pair= %s, %s\n",
                samples_found, c, unicharset_.debug_str(c).string(),
                font_index, min_max_dist, max_max_dist,
                SampleToString(*samples_[max_s1]).string(),
                SampleToString(*samples_[max_s2]).string());
      }
    }
  }
  if (debug) {
    // Render the globally worst pair overlaid into worstpair.png.
    tprintf("Global worst dist = %g, between sample %d and %d\n",
            global_worst_dist, worst_s1, worst_s2);
    Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
    Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
    pixOr(pix1, pix1, pix2);
    pixWrite("worstpair.png", pix1, IFF_PNG);
    pixDestroy(&pix1);
    pixDestroy(&pix2);
  }
}
|
||||
|
||||
// Replicates the samples to a minimum frequency defined by
// 2 * kSampleRandomSize, or for larger counts duplicates all samples.
// After replication, the replicated samples are perturbed slightly, but
// in a predictable and repeatable way.
// Use after OrganizeByFontAndClass().
void TrainingSampleSet::ReplicateAndRandomizeSamples() {
  ASSERT_HOST(font_class_array_ != NULL);
  int font_size = font_id_map_.CompactSize();
  for (int font_index = 0; font_index < font_size; ++font_index) {
    for (int c = 0; c < unicharset_size_; ++c) {
      FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
      int sample_count = fcinfo.samples.size();
      // Target size: either 2 * kSampleRandomSize for small sets, or double
      // the existing count for larger ones.
      int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
      if (sample_count > 0 && sample_count < min_samples) {
        int base_count = sample_count;
        // Cycle base_index through the original samples, appending a
        // perturbed copy of each until min_samples is reached.
        for (int base_index = 0; sample_count < min_samples; ++sample_count) {
          int src_index = fcinfo.samples[base_index++];
          if (base_index >= base_count) base_index = 0;  // Wrap around.
          // The randomization seed is derived from sample_count, so the
          // perturbation is deterministic and repeatable.
          TrainingSample* sample = samples_[src_index]->RandomizedCopy(
              sample_count % kSampleRandomSize);
          int sample_index = samples_.size();
          sample->set_sample_index(sample_index);
          samples_.push_back(sample);
          fcinfo.samples.push_back(sample_index);
        }
      }
    }
  }
}
|
||||
|
||||
// Caches the indexed features of the canonical samples.
|
||||
// ComputeCanonicalSamples must have been already called.
|
||||
// TODO(rays) see note on ReliablySeparable and try restricting the
|
||||
// canonical features to those that truly represent all samples.
|
||||
void TrainingSampleSet::ComputeCanonicalFeatures() {
|
||||
ASSERT_HOST(font_class_array_ != NULL);
|
||||
int font_size = font_id_map_.CompactSize();
|
||||
for (int font_index = 0; font_index < font_size; ++font_index) {
|
||||
int font_id = font_id_map_.CompactToSparse(font_index);
|
||||
for (int c = 0; c < unicharset_size_; ++c) {
|
||||
int num_samples = NumClassSamples(font_id, c, false);
|
||||
if (num_samples == 0)
|
||||
continue;
|
||||
const TrainingSample* sample = GetCanonicalSample(font_id, c);
|
||||
FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
|
||||
fcinfo.canonical_features = sample->indexed_features();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes the combined set of features used by all the samples of each
|
||||
// font/class combination. Use after ReplicateAndRandomizeSamples.
|
||||
void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
|
||||
ASSERT_HOST(font_class_array_ != NULL);
|
||||
int font_size = font_id_map_.CompactSize();
|
||||
for (int font_index = 0; font_index < font_size; ++font_index) {
|
||||
int font_id = font_id_map_.CompactToSparse(font_index);
|
||||
for (int c = 0; c < unicharset_size_; ++c) {
|
||||
int num_samples = NumClassSamples(font_id, c, false);
|
||||
if (num_samples == 0)
|
||||
continue;
|
||||
FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
|
||||
fcinfo.cloud_features.Init(feature_space_size);
|
||||
for (int s = 0; s < num_samples; ++s) {
|
||||
const TrainingSample* sample = GetSample(font_id, c, s);
|
||||
const GenericVector<int>& sample_features = sample->indexed_features();
|
||||
for (int i = 0; i < sample_features.size(); ++i)
|
||||
fcinfo.cloud_features.SetBit(sample_features[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Adds all fonts of the given class to the shape.
|
||||
void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
|
||||
for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
|
||||
int font_id = font_id_map_.CompactToSparse(f);
|
||||
shape->AddToShape(class_id, font_id);
|
||||
}
|
||||
}
|
||||
|
||||
// Display the samples with the given indexed feature that also match
|
||||
// the given shape.
|
||||
void TrainingSampleSet::DisplaySamplesWithFeature(int f_index,
|
||||
const Shape& shape,
|
||||
const IntFeatureSpace& space,
|
||||
ScrollView::Color color,
|
||||
ScrollView* window) const {
|
||||
for (int s = 0; s < num_raw_samples(); ++s) {
|
||||
const TrainingSample* sample = GetSample(s);
|
||||
if (shape.ContainsUnichar(sample->class_id())) {
|
||||
GenericVector<int> indexed_features;
|
||||
space.IndexAndSortFeatures(sample->features(), sample->num_features(),
|
||||
&indexed_features);
|
||||
for (int f = 0; f < indexed_features.size(); ++f) {
|
||||
if (indexed_features[f] == f_index) {
|
||||
sample->DisplayFeatures(color, window);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract.
|
290
classify/trainingsampleset.h
Normal file
290
classify/trainingsampleset.h
Normal file
@ -0,0 +1,290 @@
|
||||
// Copyright 2010 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__
|
||||
#define TESSERACT_TRAINING_TRAININGSAMPLESET_H__
|
||||
|
||||
#include "bitvector.h"
|
||||
#include "genericvector.h"
|
||||
#include "indexmapbidi.h"
|
||||
#include "matrix.h"
|
||||
#include "shapetable.h"
|
||||
#include "trainingsample.h"
|
||||
|
||||
class UNICHARSET;
|
||||
template <typename T> class UnicityTable;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct FontInfo;
|
||||
class IntFeatureMap;
|
||||
class IntFeatureSpace;
|
||||
class TrainingSample;
|
||||
class UnicharAndFonts;
|
||||
|
||||
// Collection of TrainingSample used for training or testing a classifier.
|
||||
// Provides several useful methods to operate on the collection as a whole,
|
||||
// including outlier detection and deletion, providing access by font and
|
||||
// class, finding the canonical sample, finding the "cloud" features (OR of
|
||||
// all features in all samples), replication of samples, caching of distance
|
||||
// metrics.
|
||||
class TrainingSampleSet {
|
||||
public:
|
||||
explicit TrainingSampleSet(const UnicityTable<FontInfo>& fontinfo_table);
|
||||
~TrainingSampleSet();
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
// Accessors
|
||||
int num_samples() const {
|
||||
return samples_.size();
|
||||
}
|
||||
int num_raw_samples() const {
|
||||
return num_raw_samples_;
|
||||
}
|
||||
int NumFonts() const {
|
||||
return font_id_map_.SparseSize();
|
||||
}
|
||||
const UNICHARSET& unicharset() const {
|
||||
return unicharset_;
|
||||
}
|
||||
int charsetsize() const {
|
||||
return unicharset_size_;
|
||||
}
|
||||
|
||||
// Loads an initial unicharset, or sets one up if the file cannot be read.
|
||||
void LoadUnicharset(const char* filename);
|
||||
|
||||
// Adds a character sample to this sample set.
|
||||
// If the unichar is not already in the local unicharset, it is added.
|
||||
// Returns the unichar_id of the added sample, from the local unicharset.
|
||||
int AddSample(const char* unichar, TrainingSample* sample);
|
||||
// Adds a character sample to this sample set with the given unichar_id,
|
||||
// which must correspond to the local unicharset (in this).
|
||||
void AddSample(int unichar_id, TrainingSample* sample);
|
||||
|
||||
// Returns the number of samples for the given font,class pair.
|
||||
// If randomize is true, returns the number of samples accessible
|
||||
// with randomizing on. (Increases the number of samples if small.)
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
int NumClassSamples(int font_id, int class_id, bool randomize) const;
|
||||
|
||||
// Gets a sample by its index.
|
||||
const TrainingSample* GetSample(int index) const;
|
||||
|
||||
// Gets a sample by its font, class, index.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
const TrainingSample* GetSample(int font_id, int class_id, int index) const;
|
||||
|
||||
// Get a sample by its font, class, index. Does not randomize.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
TrainingSample* MutableSample(int font_id, int class_id, int index);
|
||||
|
||||
// Returns a string debug representation of the given sample:
|
||||
// font, unichar_str, bounding box, page.
|
||||
STRING SampleToString(const TrainingSample& sample) const;
|
||||
|
||||
// Gets the combined set of features used by all the samples of the given
|
||||
// font/class combination.
|
||||
const BitVector& GetCloudFeatures(int font_id, int class_id) const;
|
||||
// Gets the indexed features of the canonical sample of the given
|
||||
// font/class combination.
|
||||
const GenericVector<int>& GetCanonicalFeatures(int font_id,
|
||||
int class_id) const;
|
||||
|
||||
// Returns the distance between the given UniCharAndFonts pair.
|
||||
// If matched_fonts, only matching fonts, are considered, unless that yields
|
||||
// the empty set.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2,
|
||||
bool matched_fonts, const IntFeatureMap& feature_map);
|
||||
|
||||
// Returns the distance between the given pair of font/class pairs.
|
||||
// Finds in cache or computes and caches.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
float ClusterDistance(int font_id1, int class_id1,
|
||||
int font_id2, int class_id2,
|
||||
const IntFeatureMap& feature_map);
|
||||
|
||||
// Computes the distance between the given pair of font/class pairs.
|
||||
float ComputeClusterDistance(int font_id1, int class_id1,
|
||||
int font_id2, int class_id2,
|
||||
const IntFeatureMap& feature_map) const;
|
||||
|
||||
// Returns the number of canonical features of font/class 2 for which
|
||||
// neither the feature nor any of its near neighbors occurs in the cloud
|
||||
// of font/class 1. Each such feature is a reliable separation between
|
||||
// the classes, ASSUMING that the canonical sample is sufficiently
|
||||
// representative that every sample has a feature near that particular
|
||||
// feature. To check that this is so on the fly would be prohibitively
|
||||
// expensive, but it might be possible to pre-qualify the canonical features
|
||||
// to include only those for which this assumption is true.
|
||||
// ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
|
||||
// first, or the results will be nonsense.
|
||||
int ReliablySeparable(int font_id1, int class_id1,
|
||||
int font_id2, int class_id2,
|
||||
const IntFeatureMap& feature_map,
|
||||
bool thorough) const;
|
||||
|
||||
|
||||
// Returns the total index of the requested sample.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
int GlobalSampleIndex(int font_id, int class_id, int index) const;
|
||||
|
||||
// Gets the canonical sample for the given font, class pair.
|
||||
// ComputeCanonicalSamples must have been called first.
|
||||
const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
|
||||
// Gets the max distance for the given canonical sample.
|
||||
// ComputeCanonicalSamples must have been called first.
|
||||
float GetCanonicalDist(int font_id, int class_id) const;
|
||||
|
||||
// Returns a mutable pointer to the sample with the given index.
|
||||
TrainingSample* mutable_sample(int index) {
|
||||
return samples_[index];
|
||||
}
|
||||
// Gets ownership of the sample with the given index, removing it from this.
|
||||
TrainingSample* extract_sample(int index) {
|
||||
TrainingSample* sample = samples_[index];
|
||||
samples_[index] = NULL;
|
||||
return sample;
|
||||
}
|
||||
|
||||
// Generates indexed features for all samples with the supplied feature_space.
|
||||
void IndexFeatures(const IntFeatureSpace& feature_space);
|
||||
|
||||
// Delete outlier samples with few features that are shared with others.
|
||||
// IndexFeatures must have been called already.
|
||||
void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug);
|
||||
|
||||
// Marks the given sample for deletion.
|
||||
// Deletion is actually completed by DeleteDeadSamples.
|
||||
void KillSample(TrainingSample* sample);
|
||||
|
||||
// Deletes all samples with a negative sample index marked by KillSample.
|
||||
// Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
|
||||
// must be called after as the samples have been renumbered.
|
||||
void DeleteDeadSamples();
|
||||
|
||||
// Callback function returns true if the given sample is to be deleted, due
|
||||
// to having a negative classid.
|
||||
bool DeleteableSample(const TrainingSample* sample);
|
||||
|
||||
// Construct an array to access the samples by font,class pair.
|
||||
void OrganizeByFontAndClass();
|
||||
|
||||
// Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
|
||||
// index for the font_class_array_.
|
||||
void SetupFontIdMap();
|
||||
|
||||
// Finds the sample for each font, class pair that has least maximum
|
||||
// distance to all the other samples of the same font, class.
|
||||
// OrganizeByFontAndClass must have been already called.
|
||||
void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);
|
||||
|
||||
// Replicates the samples to a minimum frequency defined by
|
||||
// 2 * kSampleRandomSize, or for larger counts duplicates all samples.
|
||||
// After replication, the replicated samples are perturbed slightly, but
|
||||
// in a predictable and repeatable way.
|
||||
// Use after OrganizeByFontAndClass().
|
||||
void ReplicateAndRandomizeSamples();
|
||||
|
||||
// Caches the indexed features of the canonical samples.
|
||||
// ComputeCanonicalSamples must have been already called.
|
||||
void ComputeCanonicalFeatures();
|
||||
// Computes the combined set of features used by all the samples of each
|
||||
// font/class combination. Use after ReplicateAndRandomizeSamples.
|
||||
void ComputeCloudFeatures(int feature_space_size);
|
||||
|
||||
// Adds all fonts of the given class to the shape.
|
||||
void AddAllFontsForClass(int class_id, Shape* shape) const;
|
||||
|
||||
// Display the samples with the given indexed feature that also match
|
||||
// the given shape.
|
||||
void DisplaySamplesWithFeature(int f_index, const Shape& shape,
|
||||
const IntFeatureSpace& feature_space,
|
||||
ScrollView::Color color,
|
||||
ScrollView* window) const;
|
||||
|
||||
private:
|
||||
// Struct to store a triplet of unichar, font, distance in the distance cache.
|
||||
struct FontClassDistance {
|
||||
int unichar_id;
|
||||
int font_id; // Real font id.
|
||||
float distance;
|
||||
};
|
||||
// Simple struct to store information related to each font/class combination.
|
||||
struct FontClassInfo {
|
||||
FontClassInfo();
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
// Number of raw samples.
|
||||
inT32 num_raw_samples;
|
||||
// Index of the canonical sample.
|
||||
inT32 canonical_sample;
|
||||
// Max distance of the canonical sample from any other.
|
||||
float canonical_dist;
|
||||
// Sample indices for the samples, including replicated.
|
||||
GenericVector<inT32> samples;
|
||||
|
||||
// Non-serialized cache data.
|
||||
// Indexed features of the canonical sample.
|
||||
GenericVector<int> canonical_features;
|
||||
// The mapped features of all the samples.
|
||||
BitVector cloud_features;
|
||||
|
||||
// Caches for ClusterDistance.
|
||||
// Caches for other fonts but matching this unichar. -1 indicates not set.
|
||||
// Indexed by compact font index from font_id_map_.
|
||||
GenericVector<float> font_distance_cache;
|
||||
// Caches for other unichars but matching this font. -1 indicates not set.
|
||||
GenericVector<float> unichar_distance_cache;
|
||||
// Cache for the rest (non matching font and unichar.)
|
||||
// A cache of distances computed by ReliablySeparable.
|
||||
GenericVector<FontClassDistance> distance_cache;
|
||||
};
|
||||
|
||||
PointerVector<TrainingSample> samples_;
|
||||
// Number of samples before replication/randomization.
|
||||
int num_raw_samples_;
|
||||
// Character set we are training for.
|
||||
UNICHARSET unicharset_;
|
||||
// Character set size to which the 2-d arrays below refer.
|
||||
int unicharset_size_;
|
||||
// Map to allow the font_class_array_ below to be compact.
|
||||
// The sparse space is the real font_id, used in samples_ .
|
||||
// The compact space is an index to font_class_array_
|
||||
IndexMapBiDi font_id_map_;
|
||||
// A 2-d array of FontClassInfo holding information related to each
|
||||
// (font_id, class_id) pair.
|
||||
GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;
|
||||
|
||||
// Reference to the fontinfo_table_ in MasterTrainer. Provides names
|
||||
// for font_ids in the samples. Not serialized!
|
||||
const UnicityTable<FontInfo>& fontinfo_table_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
|
||||
#endif // TRAININGSAMPLESETSET_H_
|
Loading…
Reference in New Issue
Block a user