/******************************************************************************
 ** Filename: mfTraining.c
 ** Purpose:  Separates training pages into files for each character.
 **           Strips from files only the features and their parameters of
 **           the feature type mf.
 ** Author:   Dan Johnson
 ** Revision: Christy Russon
 ** Environment: HPUX 6.5
 ** Library:     HPUX 6.5
 ** History:  Fri Aug 18 08:53:50 1989, DSJ, Created.
 **           5/25/90, DSJ, Adapted to multiple feature types.
 **           Tuesday, May 17, 1998 Changes made to make feature specific and
 **           simplify structures. First step in simplifying training process.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
******************************************************************************/ /**---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------**/ #include "oldlist.h" #include "efio.h" #include "emalloc.h" #include "featdefs.h" #include "tessopt.h" #include "ocrfeatures.h" #include "mf.h" #include "general.h" #include "clusttool.h" #include "cluster.h" #include "protos.h" #include "minmax.h" #include "debug.h" #include "tprintf.h" #include "const.h" #include "mergenf.h" #include "name2char.h" #include "intproto.h" #include "variables.h" #include "freelist.h" #include "efio.h" #include "danerror.h" #include "globals.h" #include #include #define _USE_MATH_DEFINES #include #ifdef WIN32 #ifndef M_PI #define M_PI 3.14159265358979323846 #endif #endif #define MAXNAMESIZE 80 #define MAX_NUM_SAMPLES 10000 #define PROGRAM_FEATURE_TYPE "mf" #define MINSD (1.0f / 128.0f) #define MINSD_ANGLE (1.0f / 64.0f) int row_number; /* cjn: fixes link problem */ typedef struct { char *Label; int SampleCount; LIST List; } LABELEDLISTNODE, *LABELEDLIST; typedef struct { char* Label; int NumMerged[MAX_NUM_PROTOS]; CLASS_TYPE Class; }MERGE_CLASS_NODE; typedef MERGE_CLASS_NODE* MERGE_CLASS; #define round(x,frag)(floor(x/frag+.5)*frag) /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ int main ( int argc, char **argv); /**---------------------------------------------------------------------------- Private Function Prototypes ----------------------------------------------------------------------------**/ void ParseArguments( int argc, char **argv); char *GetNextFilename (); LIST ReadTrainingSamples ( FILE *File); LABELEDLIST FindList ( LIST List, char *Label); MERGE_CLASS FindClass ( LIST List, char *Label); LABELEDLIST NewLabeledList ( 
char *Label); MERGE_CLASS NewLabeledClass ( char *Label); void WriteTrainingSamples ( char *Directory, LIST CharList); void WriteClusteredTrainingSamples ( char *Directory, LIST ProtoList, CLUSTERER *Clusterer, LABELEDLIST CharSample); /**/ void WriteMergedTrainingSamples( char *Directory, LIST ClassList); void WriteMicrofeat( char *Directory, LIST ClassList); void WriteProtos( FILE* File, MERGE_CLASS MergeClass); void WriteConfigs( FILE* File, CLASS_TYPE Class); void FreeTrainingSamples ( LIST CharList); void FreeLabeledClassList ( LIST ClassList); void FreeLabeledList ( LABELEDLIST LabeledList); CLUSTERER *SetUpForClustering( LABELEDLIST CharSample); /* PARAMDESC *ConvertToPARAMDESC( PARAM_DESC* Param_Desc, int N); */ void MergeInsignificantProtos(LIST ProtoList, const char* label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config); LIST RemoveInsignificantProtos( LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N); void CleanUpUnusedData( LIST ProtoList); void Normalize ( float *Values); void SetUpForFloat2Int( LIST LabeledClassList); void WritePFFMTable(INT_TEMPLATES Templates, const char* filename); //--------------Global Data Definitions and Declarations-------------- static char FontName[MAXNAMESIZE]; // globals used for parsing command line arguments static char *Directory = NULL; static int MaxNumSamples = MAX_NUM_SAMPLES; static int Argc; static char **Argv; // globals used to control what information is saved in the output file static BOOL8 ShowAllSamples = FALSE; static BOOL8 ShowSignificantProtos = TRUE; static BOOL8 ShowInsignificantProtos = FALSE; // global variable to hold configuration parameters to control clustering // -M 0.40 -B 0.05 -I 1.0 -C 1e-6. 
static CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }; static FLOAT32 RoundingAccuracy = 0.0f; // The unicharset used during mftraining static UNICHARSET unicharset_mftraining; const char* test_ch = ""; /*---------------------------------------------------------------------------- Public Code -----------------------------------------------------------------------------*/ void DisplayProtoList(const char* ch, LIST protolist) { void* window = c_create_window("Char samples", 50, 200, 520, 520, -130.0, 130.0, -130.0, 130.0); LIST proto = protolist; iterate(proto) { PROTOTYPE* prototype = reinterpret_cast(first_node(proto)); if (prototype->Significant) c_line_color_index(window, Green); else if (prototype->NumSamples == 0) c_line_color_index(window, Blue); else if (prototype->Merged) c_line_color_index(window, Magenta); else c_line_color_index(window, Red); float x = CenterX(prototype->Mean); float y = CenterY(prototype->Mean); double angle = OrientationOf(prototype->Mean) * 2 * M_PI; float dx = static_cast(LengthOf(prototype->Mean) * cos(angle) / 2); float dy = static_cast(LengthOf(prototype->Mean) * sin(angle) / 2); c_move(window, (x - dx) * 256, (y - dy) * 256); c_draw(window, (x + dx) * 256, (y + dy) * 256); if (prototype->Significant) tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototype->NumSamples); else if (prototype->NumSamples > 0 && !prototype->Merged) tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototype->NumSamples); } c_make_current(window); } /*---------------------------------------------------------------------------*/ int main (int argc, char **argv) { /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** 
Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** The result of this program is a binary inttemp file used by ** the OCR engine. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. ** Mon May 18 1998, Christy Russson, Revistion started. */ char *PageName; FILE *TrainingPage; FILE *OutFile; LIST CharList; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL; LABELEDLIST CharSample; PROTOTYPE *Prototype; LIST ClassList = NIL; int Cid, Pid; PROTO Proto; PROTO_STRUCT DummyProto; BIT_VECTOR Config2; MERGE_CLASS MergeClass; INT_TEMPLATES IntTemplates; LIST pCharList, pProtoList; char Filename[MAXNAMESIZE]; // Clean the unichar set unicharset_mftraining.clear(); // Space character needed to represent NIL classification unicharset_mftraining.unichar_insert(" "); ParseArguments (argc, argv); InitFastTrainerVars (); InitSubfeatureVars (); while ((PageName = GetNextFilename()) != NULL) { printf ("Reading %s ...\n", PageName); TrainingPage = Efopen (PageName, "r"); CharList = ReadTrainingSamples (TrainingPage); fclose (TrainingPage); //WriteTrainingSamples (Directory, CharList); pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST) first_node (pCharList); // printf ("\nClustering %s ...", CharSample->Label); Clusterer = SetUpForClustering(CharSample); Config.MagicSamples = CharSample->SampleCount; ProtoList = ClusterSamples(Clusterer, &Config); CleanUpUnusedData(ProtoList); //Merge MergeInsignificantProtos(ProtoList, CharSample->Label, Clusterer, &Config); if (strcmp(test_ch, CharSample->Label) == 0) DisplayProtoList(test_ch, ProtoList); ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos, ShowInsignificantProtos, Clusterer->SampleSize); FreeClusterer(Clusterer); MergeClass = FindClass (ClassList, CharSample->Label); if (MergeClass == NULL) { 
MergeClass = NewLabeledClass (CharSample->Label); ClassList = push (ClassList, MergeClass); } Cid = AddConfigToClass(MergeClass->Class); pProtoList = ProtoList; iterate (pProtoList) { Prototype = (PROTOTYPE *) first_node (pProtoList); // see if proto can be approximated by existing proto Pid = FindClosestExistingProto(MergeClass->Class, MergeClass->NumMerged, Prototype); if (Pid == NO_PROTO) { Pid = AddProtoToClass (MergeClass->Class); Proto = ProtoIn (MergeClass->Class, Pid); MakeNewFromOld (Proto, Prototype); MergeClass->NumMerged[Pid] = 1; } else { MakeNewFromOld (&DummyProto, Prototype); ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto, (FLOAT32) MergeClass->NumMerged[Pid], 1.0, ProtoIn (MergeClass->Class, Pid)); MergeClass->NumMerged[Pid] ++; } Config2 = ConfigIn (MergeClass->Class, Cid); AddProtoToConfig (Pid, Config2); } FreeProtoList (&ProtoList); } FreeTrainingSamples (CharList); } //WriteMergedTrainingSamples(Directory,ClassList); WriteMicrofeat(Directory, ClassList); InitIntProtoVars (); InitPrototypes (); SetUpForFloat2Int(ClassList); IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "inttemp"); #ifdef __UNIX__ OutFile = Efopen (Filename, "w"); #else OutFile = Efopen (Filename, "wb"); #endif WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining); fclose (OutFile); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "pffmtable"); // Now create pffmtable. 
WritePFFMTable(IntTemplates, Filename); printf ("Done!\n"); /**/ FreeLabeledClassList (ClassList); return 0; } /* main */ /**---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ void ParseArguments( int argc, char **argv) /* ** Parameters: ** argc number of command line arguments to parse ** argv command line arguments ** Globals: ** ShowAllSamples flag controlling samples display ** ShowSignificantProtos flag controlling proto display ** ShowInsignificantProtos flag controlling proto display ** Config current clustering parameters ** tessoptarg, tessoptind defined by tessopt sys call ** Argc, Argv global copies of argc and argv ** Operation: ** This routine parses the command line arguments that were ** passed to the program. The legal arguments are: ** -d "turn off display of samples" ** -p "turn off significant protos" ** -n "turn off insignificant proto" ** -S [ spherical | elliptical | mixed | automatic ] ** -M MinSamples "min samples per prototype (%)" ** -B MaxIllegal "max illegal chars per cluster (%)" ** -I Independence "0 to 1" ** -C Confidence "1e-200 to 1.0" ** -D Directory ** -N MaxNumSamples ** -R RoundingAccuracy ** Return: none ** Exceptions: Illegal options terminate the program. ** History: 7/24/89, DSJ, Created. 
*/ { int Option; int ParametersRead; BOOL8 Error; Error = FALSE; Argc = argc; Argv = argv; while (( Option = tessopt( argc, argv, "R:N:D:C:I:M:B:S:d:n:p" )) != EOF ) { switch ( Option ) { case 'n': ShowInsignificantProtos = FALSE; break; case 'p': ShowSignificantProtos = FALSE; break; case 'd': ShowAllSamples = FALSE; break; case 'C': ParametersRead = sscanf( tessoptarg, "%lf", &(Config.Confidence) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.Confidence > 1 ) Config.Confidence = 1; else if ( Config.Confidence < 0 ) Config.Confidence = 0; break; case 'I': ParametersRead = sscanf( tessoptarg, "%f", &(Config.Independence) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.Independence > 1 ) Config.Independence = 1; else if ( Config.Independence < 0 ) Config.Independence = 0; break; case 'M': ParametersRead = sscanf( tessoptarg, "%f", &(Config.MinSamples) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.MinSamples > 1 ) Config.MinSamples = 1; else if ( Config.MinSamples < 0 ) Config.MinSamples = 0; break; case 'B': ParametersRead = sscanf( tessoptarg, "%f", &(Config.MaxIllegal) ); if ( ParametersRead != 1 ) Error = TRUE; else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1; else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0; break; case 'R': ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy ); if ( ParametersRead != 1 ) Error = TRUE; else if ( RoundingAccuracy > 0.01f ) RoundingAccuracy = 0.01f; else if ( RoundingAccuracy < 0.0f ) RoundingAccuracy = 0.0f; break; case 'S': switch ( tessoptarg[0] ) { case 's': Config.ProtoStyle = spherical; break; case 'e': Config.ProtoStyle = elliptical; break; case 'm': Config.ProtoStyle = mixed; break; case 'a': Config.ProtoStyle = automatic; break; default: Error = TRUE; } break; case 'D': Directory = tessoptarg; break; case 'N': if (sscanf (tessoptarg, "%d", &MaxNumSamples) != 1 || MaxNumSamples <= 0) Error = TRUE; break; case '?': Error = TRUE; break; } if ( Error ) { fprintf 
(stderr, "usage: %s [-D] [-P] [-N]\n", argv[0] ); fprintf (stderr, "\t[-S ProtoStyle]\n"); fprintf (stderr, "\t[-M MinSamples] [-B MaxBad] [-I Independence] [-C Confidence]\n" ); fprintf (stderr, "\t[-d directory] [-n MaxNumSamples] [ TrainingPage ... ]\n"); exit (2); } } } // ParseArguments /*---------------------------------------------------------------------------*/ char *GetNextFilename () /* ** Parameters: none ** Globals: ** tessoptind defined by tessopt sys call ** Argc, Argv global copies of argc and argv ** Operation: ** This routine returns the next command line argument. If ** there are no remaining command line arguments, it returns ** NULL. This routine should only be called after all option ** arguments have been parsed and removed with ParseArguments. ** Return: Next command line argument or NULL. ** Exceptions: none ** History: Fri Aug 18 09:34:12 1989, DSJ, Created. */ { if (tessoptind < Argc) return (Argv [tessoptind++]); else return (NULL); } /* GetNextFilename */ /*---------------------------------------------------------------------------*/ LIST ReadTrainingSamples ( FILE *File) /* ** Parameters: ** File open text file to read samples from ** Globals: none ** Operation: ** This routine reads training samples from a file and ** places them into a data structure which organizes the ** samples by FontName and CharName. It then returns this ** data structure. ** Return: none ** Exceptions: none ** History: Fri Aug 18 13:11:39 1989, DSJ, Created. ** Tue May 17 1998 simplifications to structure, illiminated ** font, and feature specification levels of structure. 
*/ { char unichar[UNICHAR_LEN + 1]; LABELEDLIST CharSample; FEATURE_SET FeatureSamples; LIST TrainingSamples = NIL; CHAR_DESC CharDesc; int Type, i; while (fscanf (File, "%s %s", FontName, unichar) == 2) { if (!unicharset_mftraining.contains_unichar(unichar)) { unicharset_mftraining.unichar_insert(unichar); if (unicharset_mftraining.size() > MAX_NUM_CLASSES) { cprintf("Error: Size of unicharset of mftraining is " "greater than MAX_NUM_CLASSES\n"); exit(1); } } CharSample = FindList (TrainingSamples, unichar); if (CharSample == NULL) { CharSample = NewLabeledList (unichar); TrainingSamples = push (TrainingSamples, CharSample); } CharDesc = ReadCharDescription (File); Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE); FeatureSamples = FeaturesOfType(CharDesc, Type); for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) { FEATURE f = FeatureSamples->Features[feature]; for (int dim =0; dim < f->Type->NumParams; ++dim) f->Params[dim] += dim == MFDirection ? UniformRandomNumber(-MINSD_ANGLE, MINSD_ANGLE) : UniformRandomNumber(-MINSD, MINSD); } CharSample->List = push (CharSample->List, FeatureSamples); CharSample->SampleCount++; for (i = 0; i < NumFeatureSetsIn (CharDesc); i++) if (Type != i) FreeFeatureSet (FeaturesOfType (CharDesc, i)); free (CharDesc); } return (TrainingSamples); } /* ReadTrainingSamples */ /*---------------------------------------------------------------------------*/ LABELEDLIST FindList ( LIST List, char *Label) /* ** Parameters: ** List list to search ** Label label to search for ** Globals: none ** Operation: ** This routine searches thru a list of labeled lists to find ** a list with the specified label. If a matching labeled list ** cannot be found, NULL is returned. ** Return: Labeled list with the specified Label or NULL. ** Exceptions: none ** History: Fri Aug 18 15:57:41 1989, DSJ, Created. 
*/ { LABELEDLIST LabeledList; iterate (List) { LabeledList = (LABELEDLIST) first_node (List); if (strcmp (LabeledList->Label, Label) == 0) return (LabeledList); } return (NULL); } /* FindList */ /*----------------------------------------------------------------------------*/ MERGE_CLASS FindClass ( LIST List, char *Label) { MERGE_CLASS MergeClass; iterate (List) { MergeClass = (MERGE_CLASS) first_node (List); if (strcmp (MergeClass->Label, Label) == 0) return (MergeClass); } return (NULL); } /* FindClass */ /*---------------------------------------------------------------------------*/ LABELEDLIST NewLabeledList ( char *Label) /* ** Parameters: ** Label label for new list ** Globals: none ** Operation: ** This routine allocates a new, empty labeled list and gives ** it the specified label. ** Return: New, empty labeled list. ** Exceptions: none ** History: Fri Aug 18 16:08:46 1989, DSJ, Created. */ { LABELEDLIST LabeledList; LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE)); LabeledList->Label = (char*)Emalloc (strlen (Label)+1); strcpy (LabeledList->Label, Label); LabeledList->List = NIL; LabeledList->SampleCount = 0; return (LabeledList); } /* NewLabeledList */ /*---------------------------------------------------------------------------*/ MERGE_CLASS NewLabeledClass ( char *Label) { MERGE_CLASS MergeClass; MergeClass = (MERGE_CLASS) Emalloc (sizeof (MERGE_CLASS_NODE)); MergeClass->Label = (char*)Emalloc (strlen (Label)+1); strcpy (MergeClass->Label, Label); MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS); return (MergeClass); } /* NewLabeledClass */ /*---------------------------------------------------------------------------*/ void WriteTrainingSamples ( char *Directory, LIST CharList) /* ** Parameters: ** Directory directory to place sample files into ** FontList list of fonts used in the training samples ** Globals: ** MaxNumSamples max number of samples per class to write ** Operation: ** This routine writes the specified samples 
into files which ** are organized according to the font name and character name ** of the samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 16:17:06 1989, DSJ, Created. */ { LABELEDLIST CharSample; FEATURE_SET FeatureSet; LIST FeatureList; FILE *File; char Filename[MAXNAMESIZE]; int NumSamples; iterate (CharList) // iterate thru all of the fonts { CharSample = (LABELEDLIST) first_node (CharList); // construct the full pathname for the current samples file strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, FontName); strcat (Filename, "/"); strcat (Filename, CharSample->Label); strcat (Filename, "."); strcat (Filename, PROGRAM_FEATURE_TYPE); printf ("\nWriting %s ...", Filename); /* if file does not exist, create a new one with an appropriate header; otherwise append samples to the existing file */ File = fopen (Filename, "r"); if (File == NULL) { File = Efopen (Filename, "w"); WriteOldParamDesc (File, DefinitionOf (ShortNameToFeatureType (PROGRAM_FEATURE_TYPE))); } else { fclose (File); File = Efopen (Filename, "a"); } // append samples onto the file FeatureList = CharSample->List; NumSamples = 0; iterate (FeatureList) { if (NumSamples >= MaxNumSamples) break; FeatureSet = (FEATURE_SET) first_node (FeatureList); WriteFeatureSet (File, FeatureSet); NumSamples++; } fclose (File); } } /* WriteTrainingSamples */ /*----------------------------------------------------------------------------*/ void WriteClusteredTrainingSamples ( char *Directory, LIST ProtoList, CLUSTERER *Clusterer, LABELEDLIST CharSample) /* ** Parameters: ** Directory directory to place sample files into ** Globals: ** MaxNumSamples max number of samples per class to write ** Operation: ** This routine writes the specified samples into files which ** are organized according to the font name and character name ** of the samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 16:17:06 1989, DSJ, Created. 
*/ { FILE *File; char Filename[MAXNAMESIZE]; strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, FontName); strcat (Filename, "/"); strcat (Filename, CharSample->Label); strcat (Filename, "."); strcat (Filename, PROGRAM_FEATURE_TYPE); strcat (Filename, ".p"); printf ("\nWriting %s ...", Filename); File = Efopen (Filename, "w"); WriteProtoList(File, Clusterer->SampleSize, Clusterer->ParamDesc, ProtoList, ShowSignificantProtos, ShowInsignificantProtos); fclose (File); } /* WriteClusteredTrainingSamples */ /*---------------------------------------------------------------------------*/ void WriteMergedTrainingSamples( char *Directory, LIST ClassList) { FILE *File; char Filename[MAXNAMESIZE]; MERGE_CLASS MergeClass; iterate (ClassList) { MergeClass = (MERGE_CLASS) first_node (ClassList); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "Merged/"); strcat (Filename, MergeClass->Label); strcat (Filename, PROTO_SUFFIX); printf ("\nWriting Merged %s ...", Filename); File = Efopen (Filename, "w"); WriteOldProtoFile (File, MergeClass->Class); fclose (File); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "Merged/"); strcat (Filename, MergeClass->Label); strcat (Filename, CONFIG_SUFFIX); printf ("\nWriting Merged %s ...", Filename); File = Efopen (Filename, "w"); WriteOldConfigFile (File, MergeClass->Class); fclose (File); } } // WriteMergedTrainingSamples /*--------------------------------------------------------------------------*/ void WriteMicrofeat( char *Directory, LIST ClassList) { FILE *File; char Filename[MAXNAMESIZE]; MERGE_CLASS MergeClass; strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "Microfeat"); File = Efopen (Filename, "w"); printf ("\nWriting Merged %s ...", Filename); 
iterate(ClassList) { MergeClass = (MERGE_CLASS) first_node (ClassList); WriteProtos(File, MergeClass); WriteConfigs(File, MergeClass->Class); } fclose (File); } // WriteMicrofeat /*---------------------------------------------------------------------------*/ void WriteProtos( FILE* File, MERGE_CLASS MergeClass) { float Values[3]; int i; PROTO Proto; fprintf(File, "%s\n", MergeClass->Label); fprintf(File, "%d\n", NumProtosIn(MergeClass->Class)); for(i=0; i < NumProtosIn(MergeClass->Class); i++) { Proto = ProtoIn(MergeClass->Class,i); fprintf(File, "\t%8.4f %8.4f %8.4f %8.4f ", ProtoX(Proto), ProtoY(Proto), ProtoLength(Proto), ProtoAngle(Proto)); Values[0] = ProtoX(Proto); Values[1] = ProtoY(Proto); Values[2] = ProtoAngle(Proto); Normalize(Values); fprintf(File, "%8.4f %8.4f %8.4f\n", Values[0], Values[1], Values[2]); } } // WriteProtos /*----------------------------------------------------------------------------*/ void WriteConfigs( FILE* File, CLASS_TYPE Class) { BIT_VECTOR Config; int i, j, WordsPerConfig; WordsPerConfig = WordsInVectorOfSize(NumProtosIn(Class)); fprintf(File, "%d %d\n", NumConfigsIn(Class),WordsPerConfig); for(i=0; i < NumConfigsIn(Class); i++) { Config = ConfigIn(Class,i); for(j=0; j < WordsPerConfig; j++) fprintf(File, "%08x ", Config[j]); fprintf(File, "\n"); } fprintf(File, "\n"); } // WriteConfigs /*---------------------------------------------------------------------------*/ void FreeTrainingSamples ( LIST CharList) /* ** Parameters: ** FontList list of all fonts in document ** Globals: none ** Operation: ** This routine deallocates all of the space allocated to ** the specified list of training samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 17:44:27 1989, DSJ, Created. 
*/ { LABELEDLIST CharSample; FEATURE_SET FeatureSet; LIST FeatureList; // printf ("FreeTrainingSamples...\n"); iterate (CharList) /* iterate thru all of the fonts */ { CharSample = (LABELEDLIST) first_node (CharList); FeatureList = CharSample->List; iterate (FeatureList) /* iterate thru all of the classes */ { FeatureSet = (FEATURE_SET) first_node (FeatureList); FreeFeatureSet (FeatureSet); } FreeLabeledList (CharSample); } destroy (CharList); } /* FreeTrainingSamples */ /*-----------------------------------------------------------------------------*/ void FreeLabeledClassList ( LIST ClassList) /* ** Parameters: ** FontList list of all fonts in document ** Globals: none ** Operation: ** This routine deallocates all of the space allocated to ** the specified list of training samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 17:44:27 1989, DSJ, Created. */ { MERGE_CLASS MergeClass; iterate (ClassList) /* iterate thru all of the fonts */ { MergeClass = (MERGE_CLASS) first_node (ClassList); free (MergeClass->Label); FreeClass(MergeClass->Class); free (MergeClass); } destroy (ClassList); } /* FreeLabeledClassList */ /*---------------------------------------------------------------------------*/ void FreeLabeledList ( LABELEDLIST LabeledList) /* ** Parameters: ** LabeledList labeled list to be freed ** Globals: none ** Operation: ** This routine deallocates all of the memory consumed by ** a labeled list. It does not free any memory which may be ** consumed by the items in the list. ** Return: none ** Exceptions: none ** History: Fri Aug 18 17:52:45 1989, DSJ, Created. */ { destroy (LabeledList->List); free (LabeledList->Label); free (LabeledList); } /* FreeLabeledList */ /*---------------------------------------------------------------------------*/ CLUSTERER *SetUpForClustering( LABELEDLIST CharSample) /* ** Parameters: ** CharSample: LABELEDLIST that holds all the feature information for a ** given character. 
** Globals: ** None ** Operation: ** This routine reads samples from a LABELEDLIST and enters ** those samples into a clusterer data structure. This ** data structure is then returned to the caller. ** Return: ** Pointer to new clusterer data structure. ** Exceptions: ** None ** History: ** 8/16/89, DSJ, Created. */ { uinT16 N; int i, j; FLOAT32 *Sample = NULL; CLUSTERER *Clusterer; inT32 CharID; LIST FeatureList = NULL; FEATURE_SET FeatureSet = NULL; FEATURE_DESC FeatureDesc = NULL; // PARAM_DESC* ParamDesc; FeatureDesc = DefinitionOf(ShortNameToFeatureType(PROGRAM_FEATURE_TYPE)); N = FeatureDesc->NumParams; // ParamDesc = ConvertToPARAMDESC(FeatureDesc->ParamDesc, N); Clusterer = MakeClusterer(N,FeatureDesc->ParamDesc); // free(ParamDesc); FeatureList = CharSample->List; CharID = 0; iterate(FeatureList) { if (CharID >= MaxNumSamples) break; FeatureSet = (FEATURE_SET) first_node (FeatureList); for (i=0; i < FeatureSet->MaxNumFeatures; i++) { if (Sample == NULL) Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); for (j=0; j < N; j++) if (RoundingAccuracy != 0.0f) Sample[j] = round(FeatureSet->Features[i]->Params[j], RoundingAccuracy); else Sample[j] = FeatureSet->Features[i]->Params[j]; MakeSample (Clusterer, Sample, CharID); } CharID++; } if ( Sample != NULL ) free( Sample ); return( Clusterer ); } /* SetUpForClustering */ /*------------------------------------------------------------------------*/ void MergeInsignificantProtos(LIST ProtoList, const char* label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { PROTOTYPE *Prototype; bool debug = strcmp(test_ch, label) == 0; LIST pProtoList = ProtoList; iterate(pProtoList) { Prototype = (PROTOTYPE *) first_node (pProtoList); if (Prototype->Significant || Prototype->Merged) continue; FLOAT32 best_dist = 0.125; PROTOTYPE* best_match = NULL; // Find the nearest alive prototype. 
LIST list_it = ProtoList; iterate(list_it) { PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it); if (test_p != Prototype && !test_p->Merged) { FLOAT32 dist = ComputeDistance(Clusterer->SampleSize, Clusterer->ParamDesc, Prototype->Mean, test_p->Mean); if (dist < best_dist) { best_match = test_p; best_dist = dist; } } } if (best_match != NULL && !best_match->Significant) { if (debug) tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n", best_match->NumSamples, Prototype->NumSamples, best_match->Mean[0], best_match->Mean[1], Prototype->Mean[0], Prototype->Mean[1]); best_match->NumSamples = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, best_match->NumSamples, Prototype->NumSamples, best_match->Mean, best_match->Mean, Prototype->Mean); Prototype->NumSamples = 0; Prototype->Merged = 1; } else if (best_match != NULL) { if (debug) tprintf("Red proto at %g,%g matched a green one at %g,%g\n", Prototype->Mean[0], Prototype->Mean[1], best_match->Mean[0], best_match->Mean[1]); Prototype->Merged = 1; } } // Mark significant those that now have enough samples. 
int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar); pProtoList = ProtoList; iterate(pProtoList) { Prototype = (PROTOTYPE *) first_node (pProtoList); // Process insignificant protos that do not match a green one if (!Prototype->Significant && Prototype->NumSamples >= min_samples && !Prototype->Merged) { if (debug) tprintf("Red proto at %g,%g becoming green\n", Prototype->Mean[0], Prototype->Mean[1]); Prototype->Significant = true; } } } /* MergeInsignificantProtos */ /*------------------------------------------------------------------------*/ LIST RemoveInsignificantProtos( LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N) { LIST NewProtoList = NIL; LIST pProtoList; PROTOTYPE* Proto; PROTOTYPE* NewProto; int i; pProtoList = ProtoList; iterate(pProtoList) { Proto = (PROTOTYPE *) first_node (pProtoList); if ((Proto->Significant && KeepSigProtos) || (!Proto->Significant && KeepInsigProtos)) { NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE)); NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); NewProto->Significant = Proto->Significant; NewProto->Style = Proto->Style; NewProto->NumSamples = Proto->NumSamples; NewProto->Cluster = NULL; NewProto->Distrib = NULL; for (i=0; i < N; i++) NewProto->Mean[i] = Proto->Mean[i]; if (Proto->Variance.Elliptical != NULL) { NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); for (i=0; i < N; i++) NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i]; } else NewProto->Variance.Elliptical = NULL; //--------------------------------------------- if (Proto->Magnitude.Elliptical != NULL) { NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); for (i=0; i < N; i++) NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i]; } else NewProto->Magnitude.Elliptical = NULL; //------------------------------------------------ if (Proto->Weight.Elliptical != NULL) { NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); for (i=0; i < 
N; i++) NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i]; } else NewProto->Weight.Elliptical = NULL; NewProto->TotalMagnitude = Proto->TotalMagnitude; NewProto->LogMagnitude = Proto->LogMagnitude; NewProtoList = push_last(NewProtoList, NewProto); } } //FreeProtoList (ProtoList); return (NewProtoList); } /* RemoveInsignificantProtos */ /*-----------------------------------------------------------------------------*/ void CleanUpUnusedData( LIST ProtoList) { PROTOTYPE* Prototype; iterate(ProtoList) { Prototype = (PROTOTYPE *) first_node (ProtoList); if(Prototype->Variance.Elliptical != NULL) { memfree(Prototype->Variance.Elliptical); Prototype->Variance.Elliptical = NULL; } if(Prototype->Magnitude.Elliptical != NULL) { memfree(Prototype->Magnitude.Elliptical); Prototype->Magnitude.Elliptical = NULL; } if(Prototype->Weight.Elliptical != NULL) { memfree(Prototype->Weight.Elliptical); Prototype->Weight.Elliptical = NULL; } } } /*--------------------------------------------------------------------------*/ void Normalize ( float *Values) { register float Slope; register float Intercept; register float Normalizer; Slope = tan (Values [2] * 2 * PI); Intercept = Values [1] - Slope * Values [0]; Normalizer = 1 / sqrt (Slope * Slope + 1.0); Values [0] = Slope * Normalizer; Values [1] = - Normalizer; Values [2] = Intercept * Normalizer; } // Normalize /** SetUpForFloat2Int **************************************************/ void SetUpForFloat2Int( LIST LabeledClassList) { MERGE_CLASS MergeClass; CLASS_TYPE Class; int NumProtos; int NumConfigs; int NumWords; int i, j; float Values[3]; PROTO NewProto; PROTO OldProto; BIT_VECTOR NewConfig; BIT_VECTOR OldConfig; // printf("Float2Int ...\n"); iterate(LabeledClassList) { MergeClass = (MERGE_CLASS) first_node (LabeledClassList); Class = &TrainingData[unicharset_mftraining.unichar_to_id( MergeClass->Label)]; NumProtos = NumProtosIn(MergeClass->Class); NumConfigs = NumConfigsIn(MergeClass->Class); NumProtosIn(Class) = 
NumProtos; Class->MaxNumProtos = NumProtos; Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos); for(i=0; i < NumProtos; i++) { NewProto = ProtoIn(Class, i); OldProto = ProtoIn(MergeClass->Class, i); Values[0] = ProtoX(OldProto); Values[1] = ProtoY(OldProto); Values[2] = ProtoAngle(OldProto); Normalize(Values); ProtoX(NewProto) = ProtoX(OldProto); ProtoY(NewProto) = ProtoY(OldProto); ProtoLength(NewProto) = ProtoLength(OldProto); ProtoAngle(NewProto) = ProtoAngle(OldProto); CoefficientA(NewProto) = Values[0]; CoefficientB(NewProto) = Values[1]; CoefficientC(NewProto) = Values[2]; } NumConfigsIn(Class) = NumConfigs; Class->MaxNumConfigs = NumConfigs; Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs); NumWords = WordsInVectorOfSize(NumProtos); for(i=0; i < NumConfigs; i++) { NewConfig = NewBitVector(NumProtos); OldConfig = ConfigIn(MergeClass->Class, i); for(j=0; j < NumWords; j++) NewConfig[j] = OldConfig[j]; ConfigIn(Class, i) = NewConfig; } } } // SetUpForFloat2Int /*--------------------------------------------------------------------------*/ void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) { FILE* fp = Efopen(filename, "wb"); /* then write out each class */ for (int i = 0; i < NumClassesIn (Templates); i++) { int MaxLength = 0; INT_CLASS Class = ClassForIndex (Templates, i); for (int ConfigId = 0; ConfigId < NumIntConfigsIn (Class); ConfigId++) { if (LengthForConfigId (Class, ConfigId) > MaxLength) MaxLength = LengthForConfigId (Class, ConfigId); } fprintf(fp, "%s %d\n", unicharset_mftraining.id_to_unichar( ClassIdForIndex(Templates, i)), MaxLength); } fclose(fp); } // WritePFFMTable