Made some major classifier and clustering improvements

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@130 d0cd1f9f-072b-0410-8dd7-cf729c803f20
theraysmith 2008-02-01 00:07:59 +00:00
parent 166c867d84
commit 6b5e0c4046
10 changed files with 1133 additions and 1971 deletions

View File

@ -62,7 +62,7 @@ float compare_tess_blobs(TBLOB *blob1,
SetBaseLineMatch();
IntegerMatcher (ClassForClassId (ad_templates->Templates, CMP_CLASS),
AllProtosOn, AllConfigsOn, fcount, fcount,
int_features, 0, 0, &int_result, testedit_match_debug);
int_features, 0, &int_result, testedit_match_debug);
FreeFeatureSet(float_features);
if (int_result.Rating < 0)
int_result.Rating = MAX_FLOAT32;

File diff suppressed because it is too large.

View File

@ -19,6 +19,7 @@
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>
@ -281,6 +282,7 @@ PROTOTYPE *MakeDegenerateProto(UINT16 N,
INT32 MinSamples);
PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
CLUSTERCONFIG *Config,
CLUSTER *Cluster,
STATISTICS *Statistics);
@ -1037,7 +1039,7 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
}
if (HOTELLING && Config->ProtoStyle == elliptical) {
Proto = TestEllipticalProto(Clusterer, Cluster, Statistics);
Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
if (Proto != NULL) {
FreeStatistics(Statistics);
return Proto;
@ -1129,6 +1131,7 @@ PROTOTYPE *MakeDegenerateProto( //this was MinSample
/** TestEllipticalProto ****************************************************
Parameters: Clusterer data struct containing samples being clustered
Config provides the magic number of samples that make a good cluster
Cluster cluster to be made into an elliptical prototype
Statistics statistical info about cluster
Globals: None
@ -1141,24 +1144,60 @@ Operation: This routine tests the specified cluster to see if **
Return: Pointer to new elliptical prototype or NULL.
****************************************************************************/
PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
CLUSTERCONFIG *Config,
CLUSTER *Cluster,
STATISTICS *Statistics) {
// Fractional margin around Config->MagicSamples: clusters whose total
// sample count lies within this fraction of the magic size (the number of
// training samples for the character) get their FTable threshold boosted
// by kFTableBoostMargin, making such clusters more likely to stay together.
const double kMagicSampleMargin = 0.0625;
const double kFTableBoostMargin = 2.0;
int N = Clusterer->SampleSize;
CLUSTER* Left = Cluster->Left;
CLUSTER* Right = Cluster->Right;
if (Left == NULL || Right == NULL)
return NULL;
int TotalDims = Left->SampleCount + Right->SampleCount;
if (TotalDims < N + 1)
if (TotalDims < N + 1 || TotalDims < 2)
return NULL;
FLOAT32* Inverse = (FLOAT32 *) Emalloc(N * N * sizeof(FLOAT32));
FLOAT32* Delta = (FLOAT32*) Emalloc(N * sizeof(FLOAT32));
double err = InvertMatrix(Statistics->CoVariance, N, Inverse);
if (err > 1) {
cprintf("Clustering error: Matrix inverse failed with error %g\n", err);
const int kMatrixSize = N * N * sizeof(FLOAT32);
FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
// Compute a new covariance matrix that only uses essential features.
for (int i = 0; i < N; ++i) {
int row_offset = i * N;
if (!Clusterer->ParamDesc[i].NonEssential) {
for (int j = 0; j < N; ++j) {
if (!Clusterer->ParamDesc[j].NonEssential)
Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
else
Covariance[j + row_offset] = 0.0f;
}
} else {
for (int j = 0; j < N; ++j) {
if (i == j)
Covariance[j + row_offset] = 1.0f;
else
Covariance[j + row_offset] = 0.0f;
}
}
}
double err = InvertMatrix(Covariance, N, Inverse);
if (err > 1) {
tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
}
int EssentialN = 0;
for (int dim = 0; dim < N; ++dim) {
Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
if (!Clusterer->ParamDesc[dim].NonEssential) {
Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
++EssentialN;
} else {
Delta[dim] = 0.0f;
}
}
// Compute Hotelling's T-squared.
double Tsq = 0.0;
@ -1169,19 +1208,30 @@ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
}
Tsq += Delta[x] * temp;
}
memfree(Covariance);
memfree(Inverse);
memfree(Delta);
Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
double F = Tsq * (TotalDims - N - 1) / ((TotalDims - N) * 2);
int Fx = N;
// Changed this function to match the formula in
// Statistical Methods in Medical Research p 473
// By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
// Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
int Fx = EssentialN;
if (Fx > FTABLE_X)
Fx = FTABLE_X;
--Fx;
int Fy = TotalDims - N - 1;
int Fy = TotalDims - EssentialN - 1;
if (Fy > FTABLE_Y)
Fy = FTABLE_Y;
--Fy;
if (F < FTable[Fy][Fx]) {
double FTarget = FTable[Fy][Fx];
if (Config->MagicSamples > 0 &&
TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
// Give magic-sized clusters a magic FTable boost.
FTarget += kFTableBoostMargin;
}
if (F < FTarget) {
return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
}
return NULL;
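The split test added above is Hotelling's two-sample T-squared converted to an F statistic, F = Tsq * (n - p - 1) / ((n - 2) * p) with n = TotalDims and p = EssentialN, then compared against FTable[Fy][Fx] (boosted for magic-sized clusters). Below is a minimal standalone sketch of that decision, not the Tesseract code; the critical value is passed in instead of being read from the F table, and the struct is hypothetical.

struct SplitTest {
  double tsq;          // Delta' * Inverse * Delta over essential dimensions
  int total_samples;   // TotalDims: samples in the left plus the right child
  int essential_dims;  // EssentialN: essential feature dimensions
  int magic_samples;   // Config->MagicSamples; 0 disables the boost
};

// Returns true if the two children look like a single elliptical cluster,
// i.e. the F statistic stays below the (possibly boosted) critical value.
bool ChildrenBelongTogether(const SplitTest& t, double f_critical) {
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;
  double n = t.total_samples;
  double p = t.essential_dims;
  double f = t.tsq * (n - p - 1.0) / ((n - 2.0) * p);
  double f_target = f_critical;
  if (t.magic_samples > 0 &&
      n >= t.magic_samples * (1.0 - kMagicSampleMargin) &&
      n <= t.magic_samples * (1.0 + kMagicSampleMargin)) {
    f_target += kFTableBoostMargin;  // keep magic-sized clusters together
  }
  return f < f_target;
}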

View File

@ -55,6 +55,7 @@ typedef struct // parameters to control clustering
// more than 1 feature in that cluster
FLOAT32 Independence; // desired independence between dimensions
FLOAT64 Confidence; // desired confidence in prototypes created
int MagicSamples; // Ideal number of samples in a cluster.
}
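MagicSamples becomes the sixth field of the clustering config, which is why the brace initializers in cntraining.cpp and mftraining.cpp later in this commit gain a trailing 0; the trainers then overwrite it per character before clustering. A rough sketch of the shape, with field names inferred from the -M/-B/-I/-C comments in the trainers and therefore to be treated as assumptions:

enum ProtoStyleSketch { spherical, elliptical, mixed, automatic };

struct ClusterConfigSketch {
  ProtoStyleSketch ProtoStyle;  // spherical, elliptical, or mixed
  float MinSamples;             // -M: min fraction of samples per cluster
  float MaxIllegal;             // -B
  float Independence;           // -I
  double Confidence;            // -C
  int MagicSamples;             // ideal number of samples in a cluster
};

// Mirrors the updated initializers, e.g. { elliptical, 0.025, 0.05, 0.8, 1e-3, 0 };
// at run time the trainers set config.MagicSamples to the character's sample count.
ClusterConfigSketch config = { elliptical, 0.025f, 0.05f, 0.8f, 1e-3, 0 };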
@ -80,8 +81,13 @@ FLOATUNION;
typedef struct proto
{
unsigned Significant:1; // TRUE if prototype is significant
unsigned Merged:1; // Merged after clustering so do not output
// but kept for display purposes. If it has no
// samples then it was actually merged.
// Otherwise it matched an already significant
// cluster.
unsigned Style:2; // spherical, elliptical, or mixed
unsigned NumSamples:29; // number of samples in the cluster
unsigned NumSamples:28; // number of samples in the cluster
CLUSTER *Cluster; // ptr to cluster which made prototype
DISTRIBUTION *Distrib; // different distribution for each dimension
FLOAT32 *Mean; // prototype mean
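With the new Merged bit a prototype is in one of four states, which DisplayProtoList in mftraining.cpp (later in this commit) renders in four colors. A small sketch of that classification; the enum is hypothetical and only mirrors the color choices made there.

enum ProtoColor { Green, Blue, Magenta, Red };

ProtoColor ClassifyProto(bool significant, bool merged, unsigned num_samples) {
  if (significant) return Green;      // kept as a significant prototype
  if (num_samples == 0) return Blue;  // merged away: its samples were absorbed
  if (merged) return Magenta;         // matched an already significant cluster
  return Red;                         // insignificant and not merged
}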
@ -129,19 +135,22 @@ CLUSTERER *MakeClusterer (INT16 SampleSize, PARAM_DESC ParamDesc[]);
SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], INT32 CharID);
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
void FreeClusterer(CLUSTERER *Clusterer);
void FreeProtoList(LIST *ProtoList);
void FreePrototype(void *arg); //PROTOTYPE *Prototype);
CLUSTER *NextSample(LIST *SearchState);
FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension);
FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension);
INT32 MergeClusters(INT16 N, PARAM_DESC ParamDesc[], INT32 n1, INT32 n2,
FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]);
//--------------Global Data Definitions and Declarations---------------------------
// define errors that can be trapped

View File

@ -41,7 +41,7 @@
StartParamDesc (MicroFeatureParams)
DefineParam (0, 0, -0.5, 0.5)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (1, 0, 0.0, 1.0)
DefineParam (0, 1, -0.5, 0.5)
DefineParam (0, 1, -0.5, 0.5)
@ -65,9 +65,9 @@ DefineFeature (PicoFeatDesc, 2, 1, 1, MAX_UINT8, "Pico", "pf", PicoFeatParams)
/* define all of the parameters for the NormFeat type*/
StartParamDesc (CharNormParams)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
EndParamDesc
/* now define the feature type itself (see features.h for info about each
parameter).*/
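These DefineParam edits flip the second argument, which appears to be the parameter descriptor's NonEssential flag (the first being Circular, then Min and Max): the kdtree.cpp and cluster.cpp changes below skip NonEssential dimensions when choosing split levels and when running the elliptical F test. A sketch of that reading, with hypothetical names; the per-parameter comments (x, y, length, direction, bulges) are an inference, not taken from this diff.

struct ParamDescSketch {
  bool Circular;      // wraps around, e.g. a direction scaled into [0,1)
  bool NonEssential;  // excluded from KD-tree splits and the elliptical F test
  float Min, Max;
};

// The updated MicroFeature descriptors: length and the two bulges are now
// non-essential, so only x, y and direction drive clustering decisions.
const ParamDescSketch kMicroFeatureParamsSketch[] = {
  {false, false, -0.5f, 0.5f},    // center x
  {false, false, -0.25f, 0.75f},  // center y
  {false, true, 0.0f, 1.0f},      // length (second argument 0 -> 1 here)
  {true, false, 0.0f, 1.0f},      // direction (circular)
  {false, true, -0.5f, 0.5f},     // bulge 1
  {false, true, -0.5f, 0.5f},     // bulge 2
};

int CountEssential(const ParamDescSketch* desc, int n) {
  int essential = 0;
  for (int i = 0; i < n; ++i)
    if (!desc[i].NonEssential) ++essential;
  return essential;  // EssentialN in TestEllipticalProto counts these
}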

File diff suppressed because it is too large.

View File

@ -30,6 +30,7 @@ typedef struct
FLOAT32 Rating;
UINT8 Config;
UINT8 Config2;
UINT16 FeatureMisses;
}
@ -38,8 +39,7 @@ INT_RESULT_STRUCT, *INT_RESULT;
typedef struct
{
FLOAT32 Rating;
FLOAT32 Rating2;
UINT32 config_mask;
INT_RESULT_STRUCT IMResult;
CLASS_ID Class;
}
@ -68,42 +68,12 @@ int ClassPruner(INT_TEMPLATES IntTemplates,
CLASS_PRUNER_RESULTS Results,
int Debug);
int feature_pruner(INT_TEMPLATES IntTemplates,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
INT32 NumClasses,
CLASS_PRUNER_RESULTS Results);
int prune_configs(INT_TEMPLATES IntTemplates,
INT32 min_misses,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
CLASS_NORMALIZATION_ARRAY NormalizationFactors,
INT32 class_count,
UINT16 BlobLength,
CLASS_PRUNER_RESULTS Results,
int Debug);
void PruningMatcher(INT_CLASS ClassTemplate,
UINT16 BlobLength,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
INT32 min_misses,
UINT8 NormalizationFactor,
INT_RESULT Result,
int Debug);
void config_mask_to_proto_mask(INT_CLASS ClassTemplate,
BIT_VECTOR config_mask,
BIT_VECTOR proto_mask);
void IntegerMatcher(INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
UINT16 BlobLength,
INT16 NumFeatures,
INT_FEATURE_ARRAY Features,
INT32 min_misses,
UINT8 NormalizationFactor,
INT_RESULT Result,
int Debug);
@ -126,19 +96,19 @@ int FindBadFeatures(INT_CLASS ClassTemplate,
FEATURE_ID *FeatureArray,
int Debug);
void InitIntegerMatcher();
void InitIntegerMatcherVars();
void PrintIntMatcherStats(FILE *f);
void SetProtoThresh(FLOAT32 Threshold);
void SetFeatureThresh(FLOAT32 Threshold);
void SetBaseLineMatch();
void SetCharNormMatch();
/**----------------------------------------------------------------------------
Private Function Prototypes
@ -160,14 +130,7 @@ void IMDebugConfigurationSum(INT_FEATURE FeatureNum,
UINT8 *FeatureEvidence,
INT32 ConfigCount);
void PMUpdateTablesForFeature (INT_CLASS ClassTemplate,
int FeatureNum,
INT_FEATURE Feature,
UINT8 FeatureEvidence[MAX_NUM_CONFIGS],
int SumOfFeatureEvidence[MAX_NUM_CONFIGS],
int Debug);
void IMUpdateTablesForFeature (INT_CLASS ClassTemplate,
int IMUpdateTablesForFeature (INT_CLASS ClassTemplate,
BIT_VECTOR ProtoMask,
BIT_VECTOR ConfigMask,
int FeatureNum,
@ -209,10 +172,6 @@ UINT8
ProtoEvidence[MAX_NUM_PROTOS]
[MAX_PROTO_INDEX], INT16 NumFeatures);
void PMNormalizeSumOfEvidences (INT_CLASS ClassTemplate,
int SumOfFeatureEvidence[MAX_NUM_CONFIGS],
INT16 NumFeatures, INT32 used_features);
void IMNormalizeSumOfEvidences (INT_CLASS ClassTemplate,
int SumOfFeatureEvidence[MAX_NUM_CONFIGS],
INT16 NumFeatures, INT32 used_features);
@ -229,7 +188,7 @@ void IMDebugBestMatch(int BestMatch,
UINT8 NormalizationFactor);
#endif
void HeapSort (int n, register INT16 ra[], register UINT8 rb[]);
void HeapSort (int n, register int ra[], register int rb[]);
/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
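HeapSort's arrays widen from INT16/UINT8 to plain int. Assuming, as the paired arrays suggest, that rb[] carries payload values reordered alongside the keys in ra[], a generic sketch of such a paired heap sort looks like this (0-based here, independent of the actual intmatcher implementation):

#include <utility>

void PairedHeapSort(int n, int key[], int payload[]) {
  auto sift_down = [&](int root, int end) {
    while (2 * root + 1 <= end) {
      int child = 2 * root + 1;
      if (child + 1 <= end && key[child] < key[child + 1]) ++child;
      if (key[root] >= key[child]) return;
      std::swap(key[root], key[child]);
      std::swap(payload[root], payload[child]);
      root = child;
    }
  };
  for (int start = n / 2 - 1; start >= 0; --start) sift_down(start, n - 1);
  for (int end = n - 1; end > 0; --end) {
    std::swap(key[0], key[end]);
    std::swap(payload[0], payload[end]);
    sift_down(0, end - 1);
  }
}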

View File

@ -61,6 +61,26 @@ static jmp_buf QuickExit;
static void_proc WalkAction;
// Helper function to find the next essential dimension in a cycle.
static int NextLevel(int level) {
do {
++level;
if (level >= N)
level = 0;
} while (KeyDesc[level].NonEssential);
return level;
}
// Helper function to find the previous essential dimension in a cycle.
static int PrevLevel(int level) {
do {
--level;
if (level < 0)
level = N - 1;
} while (KeyDesc[level].NonEssential);
return level;
}
/**----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------**/
@ -136,7 +156,7 @@ MakeKDTree (INT16 KeySize, PARAM_DESC KeyDesc[]) {
/*---------------------------------------------------------------------------*/
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) {
/*
** Parameters:
** Tree K-D tree in which data is to be stored
@ -164,7 +184,7 @@ void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) {
KeyDesc = &(Tree->KeyDesc[0]);
PtrToNode = &(Tree->Root.Left);
Node = *PtrToNode;
Level = 0;
Level = NextLevel(-1);
while (Node != NULL) {
if (Key[Level] < Node->BranchPoint) {
PtrToNode = &(Node->Left);
@ -176,9 +196,7 @@ void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) {
if (Key[Level] < Node->RightBranch)
Node->RightBranch = Key[Level];
}
Level++;
if (Level >= N)
Level = 0;
Level = NextLevel(Level);
Node = *PtrToNode;
}
@ -239,7 +257,7 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
KeyDesc = &(Tree->KeyDesc[0]);
Father = &(Tree->Root);
Current = Father->Left;
Level = 0;
Level = NextLevel(-1);
/* search tree for node to be deleted */
while ((Current != NULL) && (!NodeFound (Current, Key, Data))) {
@ -249,9 +267,7 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
else
Current = Current->Right;
Level++;
if (Level >= N)
Level = 0;
Level = NextLevel(Level);
}
if (Current != NULL) { /* if node to be deleted was found */
@ -271,15 +287,11 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
else
break;
Level++;
if (Level >= N)
Level = 0;
Level = NextLevel(Level);
}
/* compute level of replacement node's father */
Level--;
if (Level < 0)
Level = N - 1;
Level = PrevLevel(Level);
/* disconnect replacement node from its father */
if (FatherReplacement->Left == Replacement) {
@ -304,7 +316,7 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) {
else
Father->Right = Replacement;
}
FreeKDNode(Current);
}
} /* KDDelete */
@ -381,7 +393,7 @@ void *NBuffer, FLOAT32 DBuffer[]) {
/*---------------------------------------------------------------------------*/
void KDWalk(KDTREE *Tree, void_proc Action) {
/*
** Parameters:
** Tree ptr to K-D tree to be walked
@ -401,12 +413,12 @@ void KDWalk(KDTREE *Tree, void_proc Action) {
*/
WalkAction = Action;
if (Tree->Root.Left != NULL)
Walk (Tree->Root.Left, 0);
Walk (Tree->Root.Left, NextLevel(-1));
} /* KDWalk */
/*---------------------------------------------------------------------------*/
void FreeKDTree(KDTREE *Tree) {
/*
** Parameters:
** Tree tree data structure to be released
@ -424,7 +436,7 @@ void FreeKDTree(KDTREE *Tree) {
** 5/26/89, DSJ, Created.
*/
FreeSubTree (Tree->Root.Left);
memfree(Tree);
} /* FreeKDTree */
@ -496,7 +508,7 @@ MakeKDNode (FLOAT32 Key[], char *Data, int Index) {
/*---------------------------------------------------------------------------*/
void FreeKDNode(KDNODE *Node) {
/*
** Parameters:
** Node ptr to node data structure to be freed
@ -516,7 +528,7 @@ void FreeKDNode(KDNODE *Node) {
/*---------------------------------------------------------------------------*/
void Search(int Level, KDNODE *SubTree) {
/*
** Parameters:
** Level level in tree of sub-tree to be searched
@ -561,12 +573,12 @@ void Search(int Level, KDNODE *SubTree) {
Distance[NumberOfNeighbors] = d;
NumberOfNeighbors++;
if (NumberOfNeighbors == MaxNeighbors)
FindMaxDistance();
}
else {
Neighbor[Furthest] = SubTree->Data;
Distance[Furthest] = d;
FindMaxDistance();
}
}
if (QueryPoint[Level] < SubTree->BranchPoint) {
@ -575,7 +587,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMax[Level];
LBMax[Level] = SubTree->RightBranch;
if (SubTree->Left != NULL)
Search (Level + 1, SubTree->Left);
Search (NextLevel(Level), SubTree->Left);
SBMax[Level] = OldSBoxEdge;
LBMax[Level] = OldLBoxEdge;
OldSBoxEdge = SBMin[Level];
@ -583,7 +595,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMin[Level];
LBMin[Level] = SubTree->LeftBranch;
if ((SubTree->Right != NULL) && QueryIntersectsSearch ())
Search (Level + 1, SubTree->Right);
Search (NextLevel(Level), SubTree->Right);
SBMin[Level] = OldSBoxEdge;
LBMin[Level] = OldLBoxEdge;
}
@ -593,7 +605,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMin[Level];
LBMin[Level] = SubTree->LeftBranch;
if (SubTree->Right != NULL)
Search (Level + 1, SubTree->Right);
Search (NextLevel(Level), SubTree->Right);
SBMin[Level] = OldSBoxEdge;
LBMin[Level] = OldLBoxEdge;
OldSBoxEdge = SBMax[Level];
@ -601,7 +613,7 @@ void Search(int Level, KDNODE *SubTree) {
OldLBoxEdge = LBMax[Level];
LBMax[Level] = SubTree->RightBranch;
if ((SubTree->Left != NULL) && QueryIntersectsSearch ())
Search (Level + 1, SubTree->Left);
Search (NextLevel(Level), SubTree->Left);
SBMax[Level] = OldSBoxEdge;
LBMax[Level] = OldLBoxEdge;
}
@ -657,7 +669,7 @@ register FLOAT32 p1[], register FLOAT32 p2[]) {
/*---------------------------------------------------------------------------*/
void FindMaxDistance() {
/*
** Parameters:
** None
@ -690,7 +702,7 @@ void FindMaxDistance() {
/*---------------------------------------------------------------------------*/
int QueryIntersectsSearch() {
/*
** Parameters:
** None
@ -765,7 +777,7 @@ int QueryIntersectsSearch() {
/*---------------------------------------------------------------------------*/
int QueryInSearch() {
/*
** Parameters:
** None
@ -813,7 +825,7 @@ int QueryInSearch() {
/*---------------------------------------------------------------------------*/
void Walk(KDNODE *SubTree, INT32 Level) {
/*
** Parameters:
** SubTree ptr to root of subtree to be walked
@ -842,17 +854,17 @@ void Walk(KDNODE *SubTree, INT32 Level) {
else {
(*WalkAction) (SubTree->Data, preorder, Level);
if (SubTree->Left != NULL)
Walk (SubTree->Left, Level + 1);
Walk (SubTree->Left, NextLevel(Level));
(*WalkAction) (SubTree->Data, postorder, Level);
if (SubTree->Right != NULL)
Walk (SubTree->Right, Level + 1);
Walk (SubTree->Right, NextLevel(Level));
(*WalkAction) (SubTree->Data, endorder, Level);
}
} /* Walk */
/*---------------------------------------------------------------------------*/
void FreeSubTree(KDNODE *SubTree) {
/*
** Parameters:
** SubTree ptr to root node of sub-tree to be freed
@ -867,6 +879,6 @@ void FreeSubTree(KDNODE *SubTree) {
if (SubTree != NULL) {
FreeSubTree (SubTree->Left);
FreeSubTree (SubTree->Right);
memfree(SubTree);
}
} /* FreeSubTree */
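The kdtree.cpp changes above replace the plain Level + 1 / Level - 1 cycling with the new NextLevel()/PrevLevel() helpers, so split levels only ever land on essential dimensions. A standalone illustration of that cycle, with the essential/non-essential pattern chosen arbitrarily:

#include <cstdio>

int NextEssential(const bool non_essential[], int n, int level) {
  do {
    if (++level >= n) level = 0;
  } while (non_essential[level]);
  return level;
}

int main() {
  // Six dimensions; dims 2, 4 and 5 are non-essential.
  const bool non_essential[6] = {false, false, true, false, true, true};
  int level = NextEssential(non_essential, 6, -1);  // first essential dim: 0
  for (int i = 0; i < 5; ++i) {
    std::printf("%d ", level);
    level = NextEssential(non_essential, 6, level);
  }
  std::printf("\n");  // prints: 0 1 3 0 1
  return 0;
}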

View File

@ -49,6 +49,7 @@ int row_number; /* cjn: fixes link problem */
typedef struct
{
char *Label;
int SampleCount;
LIST List;
}
LABELEDLISTNODE, *LABELEDLIST;
@ -143,7 +144,7 @@ static BOOL8 ShowInsignificantProtos = FALSE;
//-M 0.025 -B 0.05 -I 0.8 -C 1e-3
static CLUSTERCONFIG Config =
{
elliptical, 0.025, 0.05, 0.8, 1e-3
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
};
static FLOAT32 RoundingAccuracy = 0.0;
@ -235,6 +236,7 @@ int main (
//printf ("\nClustering %s ...", CharSample->Label);
Clusterer = SetUpForClustering(CharSample);
float SavedMinSamples = Config.MinSamples;
Config.MagicSamples = CharSample->SampleCount;
while (Config.MinSamples > 0.001) {
ProtoList = ClusterSamples(Clusterer, &Config);
if (NumberOfProtos(ProtoList, 1, 0) > 0)
@ -451,6 +453,7 @@ void ReadTrainingSamples (
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
}
CharSample->List = push (CharSample->List, FeatureSamples);
CharSample->SampleCount++;
for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
if (Type != i)
FreeFeatureSet (FeaturesOfType (CharDesc, i));
@ -513,6 +516,7 @@ LABELEDLIST NewLabeledList (
LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
strcpy (LabeledList->Label, Label);
LabeledList->List = NIL;
LabeledList->SampleCount = 0;
return (LabeledList);
} /* NewLabeledList */

View File

@ -32,12 +32,14 @@
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "mf.h"
#include "general.h"
#include "clusttool.h"
#include "cluster.h"
#include "protos.h"
#include "minmax.h"
#include "debug.h"
#include "tprintf.h"
#include "const.h"
#include "mergenf.h"
#include "name2char.h"
@ -50,18 +52,21 @@
#include <string.h>
#include <stdio.h>
#define _USE_MATH_DEFINES
#include <math.h>
#define MAXNAMESIZE 80
#define MAX_NUM_SAMPLES 10000
#define PROGRAM_FEATURE_TYPE "mf"
#define MINSD (1.0f / 128.0f)
#define MINSD_ANGLE (1.0f / 64.0f)
int row_number; /* cjn: fixes link problem */
typedef struct
{
char *Label;
int SampleCount;
LIST List;
}
LABELEDLISTNODE, *LABELEDLIST;
@ -151,6 +156,9 @@ PARAMDESC *ConvertToPARAMDESC(
PARAM_DESC* Param_Desc,
int N);
*/
void MergeInsignificantProtos(LIST ProtoList, const char* label,
CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
LIST RemoveInsignificantProtos(
LIST ProtoList,
BOOL8 KeepSigProtos,
@ -184,21 +192,51 @@ static BOOL8 ShowInsignificantProtos = FALSE;
// global variable to hold configuration parameters to control clustering
// -M 0.40 -B 0.05 -I 1.0 -C 1e-6.
static CLUSTERCONFIG Config =
{ elliptical, 0.40, 0.05, 1.0, 1e-6 };
{ elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
static FLOAT32 RoundingAccuracy = 0.0;
static FLOAT32 RoundingAccuracy = 0.0f;
// The unicharset used during mftraining
static UNICHARSET unicharset_mftraining;
const char* test_ch = "";
/*----------------------------------------------------------------------------
Public Code
-----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
int main (
int argc,
char **argv)
void DisplayProtoList(const char* ch, LIST protolist) {
void* window = c_create_window("Char samples", 50, 200,
520, 520, -130.0, 130.0, -130.0, 130.0);
LIST proto = protolist;
iterate(proto) {
PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
if (prototype->Significant)
c_line_color_index(window, Green);
else if (prototype->NumSamples == 0)
c_line_color_index(window, Blue);
else if (prototype->Merged)
c_line_color_index(window, Magenta);
else
c_line_color_index(window, Red);
float x = CenterX(prototype->Mean);
float y = CenterY(prototype->Mean);
double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
c_move(window, (x - dx) * 256, (y - dy) * 256);
c_draw(window, (x + dx) * 256, (y + dy) * 256);
if (prototype->Significant)
tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
x, y, dx, dy, prototype->NumSamples);
else if (prototype->NumSamples > 0 && !prototype->Merged)
tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
x, y, dx, dy, prototype->NumSamples);
}
c_make_current(window);
}
/*---------------------------------------------------------------------------*/
int main (int argc, char **argv) {
/*
** Parameters:
** argc number of command line arguments
@ -231,123 +269,119 @@ int main (
** History: Fri Aug 18 08:56:17 1989, DSJ, Created.
** Mon May 18 1998, Christy Russson, Revision started.
*/
char *PageName;
FILE *TrainingPage;
FILE *OutFile;
LIST CharList;
CLUSTERER *Clusterer = NULL;
LIST ProtoList = NIL;
LABELEDLIST CharSample;
PROTOTYPE *Prototype;
LIST ClassList = NIL;
int Cid, Pid;
PROTO Proto;
PROTO_STRUCT DummyProto;
BIT_VECTOR Config2;
MERGE_CLASS MergeClass;
INT_TEMPLATES IntTemplates;
LIST pCharList, pProtoList;
char Filename[MAXNAMESIZE];
{
char *PageName;
FILE *TrainingPage;
FILE *OutFile;
LIST CharList;
CLUSTERER *Clusterer = NULL;
LIST ProtoList = NIL;
LABELEDLIST CharSample;
PROTOTYPE *Prototype;
LIST ClassList = NIL;
int Cid, Pid;
PROTO Proto;
PROTO_STRUCT DummyProto;
BIT_VECTOR Config2;
MERGE_CLASS MergeClass;
INT_TEMPLATES IntTemplates;
LIST pCharList, pProtoList;
char Filename[MAXNAMESIZE];
// Clean the unichar set
unicharset_mftraining.clear();
// Space character needed to represent NIL classification
unicharset_mftraining.unichar_insert(" ");
ParseArguments (argc, argv);
InitFastTrainerVars ();
InitSubfeatureVars ();
while ((PageName = GetNextFilename()) != NULL) {
printf ("Reading %s ...\n", PageName);
TrainingPage = Efopen (PageName, "r");
CharList = ReadTrainingSamples (TrainingPage);
fclose (TrainingPage);
//WriteTrainingSamples (Directory, CharList);
pCharList = CharList;
iterate(pCharList) {
//Cluster
CharSample = (LABELEDLIST) first_node (pCharList);
// printf ("\nClustering %s ...", CharSample->Label);
Clusterer = SetUpForClustering(CharSample);
Config.MagicSamples = CharSample->SampleCount;
ProtoList = ClusterSamples(Clusterer, &Config);
CleanUpUnusedData(ProtoList);
ParseArguments (argc, argv);
InitFastTrainerVars ();
InitSubfeatureVars ();
while ((PageName = GetNextFilename()) != NULL)
{
printf ("Reading %s ...\n", PageName);
TrainingPage = Efopen (PageName, "r");
CharList = ReadTrainingSamples (TrainingPage);
fclose (TrainingPage);
//WriteTrainingSamples (Directory, CharList);
pCharList = CharList;
iterate(pCharList)
{
//Cluster
CharSample = (LABELEDLIST) first_node (pCharList);
// printf ("\nClustering %s ...", CharSample->Label);
Clusterer = SetUpForClustering(CharSample);
ProtoList = ClusterSamples(Clusterer, &Config);
//WriteClusteredTrainingSamples (Directory, ProtoList, Clusterer, CharSample);
CleanUpUnusedData(ProtoList);
//Merge
MergeInsignificantProtos(ProtoList, CharSample->Label,
Clusterer, &Config);
if (strcmp(test_ch, CharSample->Label) == 0)
DisplayProtoList(test_ch, ProtoList);
ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos,
ShowInsignificantProtos,
Clusterer->SampleSize);
FreeClusterer(Clusterer);
MergeClass = FindClass (ClassList, CharSample->Label);
if (MergeClass == NULL) {
MergeClass = NewLabeledClass (CharSample->Label);
ClassList = push (ClassList, MergeClass);
}
Cid = AddConfigToClass(MergeClass->Class);
pProtoList = ProtoList;
iterate (pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
//Merge
ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos,
ShowInsignificantProtos, Clusterer->SampleSize);
FreeClusterer(Clusterer);
MergeClass = FindClass (ClassList, CharSample->Label);
if (MergeClass == NULL)
{
MergeClass = NewLabeledClass (CharSample->Label);
ClassList = push (ClassList, MergeClass);
}
Cid = AddConfigToClass(MergeClass->Class);
pProtoList = ProtoList;
iterate (pProtoList)
{
Prototype = (PROTOTYPE *) first_node (pProtoList);
// see if proto can be approximated by existing proto
Pid = FindClosestExistingProto (MergeClass->Class, MergeClass->NumMerged, Prototype);
if (Pid == NO_PROTO)
{
Pid = AddProtoToClass (MergeClass->Class);
Proto = ProtoIn (MergeClass->Class, Pid);
MakeNewFromOld (Proto, Prototype);
MergeClass->NumMerged[Pid] = 1;
}
else
{
MakeNewFromOld (&DummyProto, Prototype);
ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto,
(FLOAT32) MergeClass->NumMerged[Pid], 1.0,
ProtoIn (MergeClass->Class, Pid));
MergeClass->NumMerged[Pid] ++;
}
Config2 = ConfigIn (MergeClass->Class, Cid);
AddProtoToConfig (Pid, Config2);
}
FreeProtoList (&ProtoList);
}
FreeTrainingSamples (CharList);
}
//WriteMergedTrainingSamples(Directory,ClassList);
WriteMicrofeat(Directory, ClassList);
InitIntProtoVars ();
InitPrototypes ();
SetUpForFloat2Int(ClassList);
IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining);
strcpy (Filename, "");
if (Directory != NULL)
{
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "inttemp");
// see if proto can be approximated by existing proto
Pid = FindClosestExistingProto(MergeClass->Class,
MergeClass->NumMerged, Prototype);
if (Pid == NO_PROTO) {
Pid = AddProtoToClass (MergeClass->Class);
Proto = ProtoIn (MergeClass->Class, Pid);
MakeNewFromOld (Proto, Prototype);
MergeClass->NumMerged[Pid] = 1;
}
else {
MakeNewFromOld (&DummyProto, Prototype);
ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto,
(FLOAT32) MergeClass->NumMerged[Pid], 1.0,
ProtoIn (MergeClass->Class, Pid));
MergeClass->NumMerged[Pid] ++;
}
Config2 = ConfigIn (MergeClass->Class, Cid);
AddProtoToConfig (Pid, Config2);
}
FreeProtoList (&ProtoList);
}
FreeTrainingSamples (CharList);
}
//WriteMergedTrainingSamples(Directory,ClassList);
WriteMicrofeat(Directory, ClassList);
InitIntProtoVars ();
InitPrototypes ();
SetUpForFloat2Int(ClassList);
IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining);
strcpy (Filename, "");
if (Directory != NULL) {
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "inttemp");
#ifdef __UNIX__
OutFile = Efopen (Filename, "w");
#else
OutFile = Efopen (Filename, "wb");
#endif
WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining);
fclose (OutFile);
strcpy (Filename, "");
if (Directory != NULL)
{
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "pffmtable");
// Now create pffmtable.
WritePFFMTable(IntTemplates, Filename);
printf ("Done!\n"); /**/
FreeLabeledClassList (ClassList);
WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining);
fclose (OutFile);
strcpy (Filename, "");
if (Directory != NULL) {
strcat (Filename, Directory);
strcat (Filename, "/");
}
strcat (Filename, "pffmtable");
// Now create pffmtable.
WritePFFMTable(IntTemplates, Filename);
printf ("Done!\n"); /**/
FreeLabeledClassList (ClassList);
return 0;
} /* main */
@ -438,8 +472,8 @@ char **argv)
case 'R':
ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy );
if ( ParametersRead != 1 ) Error = TRUE;
else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
else if ( RoundingAccuracy > 0.01f ) RoundingAccuracy = 0.01f;
else if ( RoundingAccuracy < 0.0f ) RoundingAccuracy = 0.0f;
break;
case 'S':
switch ( tessoptarg[0] )
@ -547,9 +581,12 @@ LIST ReadTrainingSamples (
for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
FEATURE f = FeatureSamples->Features[feature];
for (int dim =0; dim < f->Type->NumParams; ++dim)
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
f->Params[dim] += dim == MFDirection ?
UniformRandomNumber(-MINSD_ANGLE, MINSD_ANGLE) :
UniformRandomNumber(-MINSD, MINSD);
}
CharSample->List = push (CharSample->List, FeatureSamples);
CharSample->SampleCount++;
for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
if (Type != i)
FreeFeatureSet (FeaturesOfType (CharDesc, i));
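Each sample's parameters get a small uniform jitter before clustering, presumably to enforce a minimum spread, and with this change the direction dimension (MFDirection) gets a larger jitter of 1/64 versus 1/128 for the other dimensions. A minimal sketch of that per-dimension choice; UniformRandom below merely stands in for the trainer's UniformRandomNumber:

#include <cstdlib>

const float kMinSD = 1.0f / 128.0f;      // MINSD
const float kMinSDAngle = 1.0f / 64.0f;  // MINSD_ANGLE

float UniformRandom(float lo, float hi) {
  return lo + (hi - lo) * (std::rand() / (RAND_MAX + 1.0f));
}

void JitterFeature(float* params, int num_params, int direction_index) {
  for (int dim = 0; dim < num_params; ++dim) {
    float amplitude = (dim == direction_index) ? kMinSDAngle : kMinSD;
    params[dim] += UniformRandom(-amplitude, amplitude);
  }
}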
@ -631,6 +668,7 @@ LABELEDLIST NewLabeledList (
LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
strcpy (LabeledList->Label, Label);
LabeledList->List = NIL;
LabeledList->SampleCount = 0;
return (LabeledList);
} /* NewLabeledList */
@ -1030,7 +1068,7 @@ CLUSTERER *SetUpForClustering(
if (Sample == NULL)
Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
for (j=0; j < N; j++)
if (RoundingAccuracy != 0.0)
if (RoundingAccuracy != 0.0f)
Sample[j] = round(FeatureSet->Features[i]->Params[j], RoundingAccuracy);
else
Sample[j] = FeatureSet->Features[i]->Params[j];
@ -1043,6 +1081,71 @@ CLUSTERER *SetUpForClustering(
} /* SetUpForClustering */
/*------------------------------------------------------------------------*/
void MergeInsignificantProtos(LIST ProtoList, const char* label,
CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
PROTOTYPE *Prototype;
bool debug = strcmp(test_ch, label) == 0;
LIST pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
if (Prototype->Significant || Prototype->Merged)
continue;
FLOAT32 best_dist = 0.125;
PROTOTYPE* best_match = NULL;
// Find the nearest alive prototype.
LIST list_it = ProtoList;
iterate(list_it) {
PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
if (test_p != Prototype && !test_p->Merged) {
FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
Clusterer->ParamDesc,
Prototype->Mean, test_p->Mean);
if (dist < best_dist) {
best_match = test_p;
best_dist = dist;
}
}
}
if (best_match != NULL && !best_match->Significant) {
if (debug)
tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
best_match->NumSamples, Prototype->NumSamples,
best_match->Mean[0], best_match->Mean[1],
Prototype->Mean[0], Prototype->Mean[1]);
best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
Clusterer->ParamDesc,
best_match->NumSamples,
Prototype->NumSamples,
best_match->Mean,
best_match->Mean, Prototype->Mean);
Prototype->NumSamples = 0;
Prototype->Merged = 1;
} else if (best_match != NULL) {
if (debug)
tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
Prototype->Mean[0], Prototype->Mean[1],
best_match->Mean[0], best_match->Mean[1]);
Prototype->Merged = 1;
}
}
// Mark significant those that now have enough samples.
int min_samples = (INT32) (Config->MinSamples * Clusterer->NumChar);
pProtoList = ProtoList;
iterate(pProtoList) {
Prototype = (PROTOTYPE *) first_node (pProtoList);
// Process insignificant protos that do not match a green one
if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
!Prototype->Merged) {
if (debug)
tprintf("Red proto at %g,%g becoming green\n",
Prototype->Mean[0], Prototype->Mean[1]);
Prototype->Significant = true;
}
}
} /* MergeInsignificantProtos */
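MergeInsignificantProtos relies on MergeClusters (declared in cluster.h above) to fold a small prototype into its nearest live neighbour. For ordinary, non-circular dimensions that merge is a sample-count-weighted mean; here is a sketch of that case only, since the real routine also handles wrap-around for circular dimensions:

// Returns the combined sample count and writes the weighted mean into merged[],
// mirroring the MergeClusters(N, ParamDesc, n1, n2, m, m1, m2) call shape.
int MergeMeansSketch(int dims, int n1, int n2,
                     float merged[], const float m1[], const float m2[]) {
  int total = n1 + n2;
  for (int i = 0; i < dims; ++i)
    merged[i] = (n1 * m1[i] + n2 * m2[i]) / static_cast<float>(total);
  return total;
}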
/*------------------------------------------------------------------------*/
LIST RemoveInsignificantProtos(
LIST ProtoList,