mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-04 01:39:16 +08:00
e33ae59f4d
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@653 d0cd1f9f-072b-0410-8dd7-cf729c803f20
970 lines
31 KiB
C++
970 lines
31 KiB
C++
// Copyright 2008 Google Inc. All Rights Reserved.
|
|
// Author: scharron@google.com (Samuel Charron)
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "commontraining.h"
|
|
|
|
#ifndef USE_STD_NAMESPACE
|
|
#include "base/init_google.h"
|
|
#include "base/commandlineflags.h"
|
|
#endif
|
|
#include "allheaders.h"
|
|
#include "ccutil.h"
|
|
#include "classify.h"
|
|
#include "oldlist.h"
|
|
#include "globals.h"
|
|
#include "mf.h"
|
|
#include "clusttool.h"
|
|
#include "cluster.h"
|
|
#include "tessopt.h"
|
|
#include "efio.h"
|
|
#include "emalloc.h"
|
|
#include "featdefs.h"
|
|
#include "fontinfo.h"
|
|
#include "intfeaturespace.h"
|
|
#include "mastertrainer.h"
|
|
#include "tessdatamanager.h"
|
|
#include "tprintf.h"
|
|
#include "freelist.h"
|
|
#include "params.h"
|
|
#include "shapetable.h"
|
|
#include "unicity_table.h"
|
|
|
|
#include <math.h>
|
|
|
|
using tesseract::CCUtil;
|
|
using tesseract::FontInfo;
|
|
using tesseract::IntFeatureSpace;
|
|
using tesseract::ParamUtils;
|
|
using tesseract::ShapeTable;
|
|
|
|
// Global Variables.
|
|
// global variable to hold configuration parameters to control clustering
|
|
// -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
|
|
CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
|
|
|
|
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
|
|
INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
|
|
STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
|
|
STRING_PARAM_FLAG(D, "", "Directory to write output files to");
|
|
STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
|
|
STRING_PARAM_FLAG(X, "", "File listing font xheights");
|
|
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
|
|
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
|
|
STRING_PARAM_FLAG(input_trainer, "", "File to load trainer from");
|
|
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
|
|
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
|
|
|
|
// The usage strings are different as the DEFINE_* flags are available on
|
|
// the command line, but the *_VAR flags are set through a config file with
|
|
// some of them available through special command-line args.
|
|
#ifndef USE_STD_NAMESPACE
|
|
const char* kUsage = "[flags] [ .tr files ... ]\n";
|
|
#else
|
|
const char* kUsage = "[-c configfile]\n"
|
|
"\t[-D Directory]\n"
|
|
"\t[-M MinSamples] [-B MaxBad] [-I Independence] [-C Confidence]\n"
|
|
"\t[-U InputUnicharset]\n"
|
|
"\t[-O OutputUnicharset]\n"
|
|
"\t[-F FontInfoFile]\n"
|
|
"\t[-X InputXHeightsFile]\n"
|
|
"\t[-S InputShapeTable]\n"
|
|
"\t[ .tr files ... ]\n";
|
|
#endif
|
|
|
|
FEATURE_DEFS_STRUCT feature_defs;
|
|
CCUtil ccutil;
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void ParseArguments(int* argc, char ***argv) {
|
|
/*
|
|
** Parameters:
|
|
** argc number of command line arguments to parse
|
|
** argv command line arguments
|
|
** Globals:
|
|
** ShowSignificantProtos flag controlling proto display
|
|
** ShowInsignificantProtos flag controlling proto display
|
|
** Config current clustering parameters
|
|
** tessoptarg, tessoptind defined by tessopt sys call
|
|
** Argc, Argv global copies of argc and argv
|
|
** Operation:
|
|
** This routine parses the command line arguments that were
|
|
** passed to the program. The legal arguments are shown in the usage
|
|
** message below:
|
|
|
|
** Return: none
|
|
** Exceptions: Illegal options terminate the program.
|
|
** History: 7/24/89, DSJ, Created.
|
|
*/
|
|
#ifndef USE_STD_NAMESPACE
|
|
InitGoogle(kUsage, argc, argv, true);
|
|
tessoptind = 1;
|
|
#else
|
|
int Option;
|
|
int ParametersRead;
|
|
BOOL8 Error;
|
|
|
|
Error = FALSE;
|
|
while ((Option = tessopt(*argc, *argv, "F:O:U:D:C:I:M:B:S:X:c:")) != EOF) {
|
|
switch (Option) {
|
|
case 'C':
|
|
ParametersRead = sscanf(tessoptarg, "%lf", &(Config.Confidence) );
|
|
if ( ParametersRead != 1 ) Error = TRUE;
|
|
else if ( Config.Confidence > 1 ) Config.Confidence = 1;
|
|
else if ( Config.Confidence < 0 ) Config.Confidence = 0;
|
|
break;
|
|
case 'I':
|
|
ParametersRead = sscanf(tessoptarg, "%f", &(Config.Independence) );
|
|
if ( ParametersRead != 1 ) Error = TRUE;
|
|
else if ( Config.Independence > 1 ) Config.Independence = 1;
|
|
else if ( Config.Independence < 0 ) Config.Independence = 0;
|
|
break;
|
|
case 'M':
|
|
ParametersRead = sscanf(tessoptarg, "%f", &(Config.MinSamples) );
|
|
if ( ParametersRead != 1 ) Error = TRUE;
|
|
else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
|
|
else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
|
|
break;
|
|
case 'B':
|
|
ParametersRead = sscanf(tessoptarg, "%f", &(Config.MaxIllegal) );
|
|
if ( ParametersRead != 1 ) Error = TRUE;
|
|
else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
|
|
else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
|
|
break;
|
|
case 'c':
|
|
FLAGS_configfile.set_value(tessoptarg);
|
|
break;
|
|
case 'D':
|
|
FLAGS_D.set_value(tessoptarg);
|
|
break;
|
|
case 'U':
|
|
FLAGS_U.set_value(tessoptarg);
|
|
break;
|
|
case 'O':
|
|
FLAGS_O.set_value(tessoptarg);
|
|
break;
|
|
case 'F':
|
|
FLAGS_F.set_value(tessoptarg);
|
|
break;
|
|
case 'X':
|
|
FLAGS_X.set_value(tessoptarg);
|
|
break;
|
|
case '?':
|
|
Error = TRUE;
|
|
break;
|
|
}
|
|
if (Error) {
|
|
fprintf(stderr, "Usage: %s %s\n", (*argv)[0], kUsage);
|
|
exit(2);
|
|
}
|
|
}
|
|
#endif
|
|
// Set additional parameters from config file if specified.
|
|
if (!FLAGS_configfile.empty()) {
|
|
tesseract::ParamUtils::ReadParamsFile(
|
|
FLAGS_configfile.c_str(),
|
|
tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
|
|
ccutil.params());
|
|
}
|
|
} // ParseArguments
|
|
|
|
namespace tesseract {
|
|
|
|
// Helper loads shape table from the given file.
|
|
ShapeTable* LoadShapeTable(const STRING& file_prefix) {
|
|
ShapeTable* shape_table = NULL;
|
|
STRING shape_table_file = file_prefix;
|
|
shape_table_file += kShapeTableFileSuffix;
|
|
FILE* shape_fp = fopen(shape_table_file.string(), "rb");
|
|
if (shape_fp != NULL) {
|
|
shape_table = new ShapeTable;
|
|
if (!shape_table->DeSerialize(false, shape_fp)) {
|
|
delete shape_table;
|
|
shape_table = NULL;
|
|
tprintf("Error: Failed to read shape table %s\n",
|
|
shape_table_file.string());
|
|
} else {
|
|
int num_shapes = shape_table->NumShapes();
|
|
tprintf("Read shape table %s of %d shapes\n",
|
|
shape_table_file.string(), num_shapes);
|
|
}
|
|
fclose(shape_fp);
|
|
} else {
|
|
tprintf("Warning: No shape table file present: %s\n",
|
|
shape_table_file.string());
|
|
}
|
|
return shape_table;
|
|
}
|
|
|
|
// Helper to write the shape_table.
|
|
void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
|
|
STRING shape_table_file = file_prefix;
|
|
shape_table_file += kShapeTableFileSuffix;
|
|
FILE* fp = fopen(shape_table_file.string(), "wb");
|
|
if (fp != NULL) {
|
|
if (!shape_table.Serialize(fp)) {
|
|
fprintf(stderr, "Error writing shape table: %s\n",
|
|
shape_table_file.string());
|
|
}
|
|
fclose(fp);
|
|
} else {
|
|
fprintf(stderr, "Error creating shape table: %s\n",
|
|
shape_table_file.string());
|
|
}
|
|
}
|
|
|
|
// Creates a MasterTraininer and loads the training data into it:
|
|
// Initializes feature_defs and IntegerFX.
|
|
// Loads the shape_table if shape_table != NULL.
|
|
// Loads initial unicharset from -U command-line option.
|
|
// If FLAGS_input_trainer is set, loads the majority of data from there, else:
|
|
// Loads font info from -F option.
|
|
// Loads xheights from -X option.
|
|
// Loads samples from .tr files in remaining command-line args.
|
|
// Deletes outliers and computes canonical samples.
|
|
// If FLAGS_output_trainer is set, saves the trainer for future use.
|
|
// Computes canonical and cloud features.
|
|
// If shape_table is not NULL, but failed to load, make a fake flat one,
|
|
// as shape clustering was not run.
|
|
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
|
|
bool replication,
|
|
ShapeTable** shape_table,
|
|
STRING* file_prefix) {
|
|
InitFeatureDefs(&feature_defs);
|
|
InitIntegerFX();
|
|
*file_prefix = "";
|
|
if (!FLAGS_D.empty()) {
|
|
*file_prefix += FLAGS_D.c_str();
|
|
*file_prefix += "/";
|
|
}
|
|
// If we are shape clustering (NULL shape_table) or we successfully load
|
|
// a shape_table written by a previous shape clustering, then
|
|
// shape_analysis will be true, meaning that the MasterTrainer will replace
|
|
// some members of the unicharset with their fragments.
|
|
bool shape_analysis = false;
|
|
if (shape_table != NULL) {
|
|
*shape_table = LoadShapeTable(*file_prefix);
|
|
if (*shape_table != NULL)
|
|
shape_analysis = true;
|
|
} else {
|
|
shape_analysis = true;
|
|
}
|
|
MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
|
|
shape_analysis,
|
|
replication,
|
|
FLAGS_debug_level);
|
|
if (FLAGS_input_trainer.empty()) {
|
|
trainer->LoadUnicharset(FLAGS_U.c_str());
|
|
// Get basic font information from font_properties.
|
|
if (!FLAGS_F.empty()) {
|
|
if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
|
|
delete trainer;
|
|
return NULL;
|
|
}
|
|
}
|
|
if (!FLAGS_X.empty()) {
|
|
if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
|
|
delete trainer;
|
|
return NULL;
|
|
}
|
|
}
|
|
IntFeatureSpace fs;
|
|
fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
|
|
trainer->SetFeatureSpace(fs);
|
|
const char* page_name;
|
|
// Load training data from .tr files on the command line.
|
|
while ((page_name = GetNextFilename(argc, argv)) != NULL) {
|
|
tprintf("Reading %s ...\n", page_name);
|
|
FILE* fp = Efopen(page_name, "rb");
|
|
trainer->ReadTrainingSamples(fp, feature_defs, false);
|
|
fclose(fp);
|
|
|
|
// If there is a file with [lang].[fontname].exp[num].fontinfo present,
|
|
// read font spacing information in to fontinfo_table.
|
|
int pagename_len = strlen(page_name);
|
|
char *fontinfo_file_name = new char[pagename_len + 7];
|
|
strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
|
|
strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
|
|
trainer->AddSpacingInfo(fontinfo_file_name);
|
|
delete[] fontinfo_file_name;
|
|
|
|
// Load the images into memory if required by the classifier.
|
|
if (FLAGS_load_images) {
|
|
STRING image_name = page_name;
|
|
// Chop off the tr and replace with tif. Extension must be tif!
|
|
image_name.truncate_at(image_name.length() - 2);
|
|
image_name += "tif";
|
|
trainer->LoadPageImages(image_name.string());
|
|
}
|
|
}
|
|
trainer->PostLoadCleanup();
|
|
// Write the master trainer if required.
|
|
if (!FLAGS_output_trainer.empty()) {
|
|
FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
|
|
if (fp == NULL) {
|
|
tprintf("Can't create saved trainer data!\n");
|
|
} else {
|
|
trainer->Serialize(fp);
|
|
fclose(fp);
|
|
}
|
|
}
|
|
} else {
|
|
bool success = false;
|
|
tprintf("Loading master trainer from file:%s\n",
|
|
FLAGS_input_trainer.c_str());
|
|
FILE* fp = fopen(FLAGS_input_trainer.c_str(), "rb");
|
|
if (fp == NULL) {
|
|
tprintf("Can't read file %s to initialize master trainer\n",
|
|
FLAGS_input_trainer.c_str());
|
|
} else {
|
|
success = trainer->DeSerialize(false, fp);
|
|
fclose(fp);
|
|
}
|
|
if (!success) {
|
|
tprintf("Deserialize of master trainer failed!\n");
|
|
delete trainer;
|
|
return NULL;
|
|
}
|
|
}
|
|
trainer->PreTrainingSetup();
|
|
if (!FLAGS_O.empty() &&
|
|
!trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
|
|
fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
|
|
delete trainer;
|
|
return NULL;
|
|
}
|
|
if (shape_table != NULL) {
|
|
// If we previously failed to load a shapetable, then shape clustering
|
|
// wasn't run so make a flat one now.
|
|
if (*shape_table == NULL) {
|
|
*shape_table = new ShapeTable;
|
|
trainer->SetupFlatShapeTable(*shape_table);
|
|
tprintf("Flat shape table summary: %s\n",
|
|
(*shape_table)->SummaryStr().string());
|
|
}
|
|
(*shape_table)->set_unicharset(trainer->unicharset());
|
|
}
|
|
return trainer;
|
|
}
|
|
|
|
} // namespace tesseract.
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
const char *GetNextFilename(int argc, const char* const * argv) {
|
|
/*
|
|
** Parameters: none
|
|
** Globals:
|
|
** tessoptind defined by tessopt sys call
|
|
** Operation:
|
|
** This routine returns the next command line argument. If
|
|
** there are no remaining command line arguments, it returns
|
|
** NULL. This routine should only be called after all option
|
|
** arguments have been parsed and removed with ParseArguments.
|
|
** Return: Next command line argument or NULL.
|
|
** Exceptions: none
|
|
** History: Fri Aug 18 09:34:12 1989, DSJ, Created.
|
|
*/
|
|
if (tessoptind < argc)
|
|
return argv[tessoptind++];
|
|
else
|
|
return NULL;
|
|
} /* GetNextFilename */
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
LABELEDLIST FindList (
|
|
LIST List,
|
|
char *Label)
|
|
|
|
/*
|
|
** Parameters:
|
|
** List list to search
|
|
** Label label to search for
|
|
** Globals: none
|
|
** Operation:
|
|
** This routine searches thru a list of labeled lists to find
|
|
** a list with the specified label. If a matching labeled list
|
|
** cannot be found, NULL is returned.
|
|
** Return: Labeled list with the specified Label or NULL.
|
|
** Exceptions: none
|
|
** History: Fri Aug 18 15:57:41 1989, DSJ, Created.
|
|
*/
|
|
|
|
{
|
|
LABELEDLIST LabeledList;
|
|
|
|
iterate (List)
|
|
{
|
|
LabeledList = (LABELEDLIST) first_node (List);
|
|
if (strcmp (LabeledList->Label, Label) == 0)
|
|
return (LabeledList);
|
|
}
|
|
return (NULL);
|
|
|
|
} /* FindList */
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
LABELEDLIST NewLabeledList (
|
|
const char *Label)
|
|
|
|
/*
|
|
** Parameters:
|
|
** Label label for new list
|
|
** Globals: none
|
|
** Operation:
|
|
** This routine allocates a new, empty labeled list and gives
|
|
** it the specified label.
|
|
** Return: New, empty labeled list.
|
|
** Exceptions: none
|
|
** History: Fri Aug 18 16:08:46 1989, DSJ, Created.
|
|
*/
|
|
|
|
{
|
|
LABELEDLIST LabeledList;
|
|
|
|
LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
|
|
LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
|
|
strcpy (LabeledList->Label, Label);
|
|
LabeledList->List = NIL_LIST;
|
|
LabeledList->SampleCount = 0;
|
|
LabeledList->font_sample_count = 0;
|
|
return (LabeledList);
|
|
|
|
} /* NewLabeledList */
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
// TODO(rays) This is now used only by cntraining. Convert cntraining to use
|
|
// the new method or get rid of it entirely.
|
|
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
|
|
const char *feature_name, int max_samples,
|
|
UNICHARSET* unicharset,
|
|
FILE* file, LIST* training_samples) {
|
|
/*
|
|
** Parameters:
|
|
** file open text file to read samples from
|
|
** Globals: none
|
|
** Operation:
|
|
** This routine reads training samples from a file and
|
|
** places them into a data structure which organizes the
|
|
** samples by FontName and CharName. It then returns this
|
|
** data structure.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Fri Aug 18 13:11:39 1989, DSJ, Created.
|
|
** Tue May 17 1998 simplifications to structure, illiminated
|
|
** font, and feature specification levels of structure.
|
|
*/
|
|
char buffer[2048];
|
|
char unichar[UNICHAR_LEN + 1];
|
|
LABELEDLIST char_sample;
|
|
FEATURE_SET feature_samples;
|
|
CHAR_DESC char_desc;
|
|
int i;
|
|
int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
|
|
// Zero out the font_sample_count for all the classes.
|
|
LIST it = *training_samples;
|
|
iterate(it) {
|
|
char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
|
|
char_sample->font_sample_count = 0;
|
|
}
|
|
|
|
while (fgets(buffer, 2048, file) != NULL) {
|
|
if (buffer[0] == '\n')
|
|
continue;
|
|
|
|
sscanf(buffer, "%*s %s", unichar);
|
|
if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
|
|
unicharset->unichar_insert(unichar);
|
|
if (unicharset->size() > MAX_NUM_CLASSES) {
|
|
tprintf("Error: Size of unicharset in training is "
|
|
"greater than MAX_NUM_CLASSES\n");
|
|
exit(1);
|
|
}
|
|
}
|
|
char_sample = FindList(*training_samples, unichar);
|
|
if (char_sample == NULL) {
|
|
char_sample = NewLabeledList(unichar);
|
|
*training_samples = push(*training_samples, char_sample);
|
|
}
|
|
char_desc = ReadCharDescription(feature_defs, file);
|
|
feature_samples = char_desc->FeatureSets[feature_type];
|
|
if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
|
|
char_sample->List = push(char_sample->List, feature_samples);
|
|
char_sample->SampleCount++;
|
|
char_sample->font_sample_count++;
|
|
} else {
|
|
FreeFeatureSet(feature_samples);
|
|
}
|
|
for (i = 0; i < char_desc->NumFeatureSets; i++) {
|
|
if (feature_type != i)
|
|
FreeFeatureSet(char_desc->FeatureSets[i]);
|
|
}
|
|
free(char_desc);
|
|
}
|
|
} // ReadTrainingSamples
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void FreeTrainingSamples(LIST CharList) {
|
|
/*
|
|
** Parameters:
|
|
** FontList list of all fonts in document
|
|
** Globals: none
|
|
** Operation:
|
|
** This routine deallocates all of the space allocated to
|
|
** the specified list of training samples.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Fri Aug 18 17:44:27 1989, DSJ, Created.
|
|
*/
|
|
LABELEDLIST char_sample;
|
|
FEATURE_SET FeatureSet;
|
|
LIST FeatureList;
|
|
|
|
|
|
iterate(CharList) { /* iterate thru all of the fonts */
|
|
char_sample = (LABELEDLIST) first_node(CharList);
|
|
FeatureList = char_sample->List;
|
|
iterate(FeatureList) { /* iterate thru all of the classes */
|
|
FeatureSet = (FEATURE_SET) first_node(FeatureList);
|
|
FreeFeatureSet(FeatureSet);
|
|
}
|
|
FreeLabeledList(char_sample);
|
|
}
|
|
destroy(CharList);
|
|
} /* FreeTrainingSamples */
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void FreeLabeledList(LABELEDLIST LabeledList) {
|
|
/*
|
|
** Parameters:
|
|
** LabeledList labeled list to be freed
|
|
** Globals: none
|
|
** Operation:
|
|
** This routine deallocates all of the memory consumed by
|
|
** a labeled list. It does not free any memory which may be
|
|
** consumed by the items in the list.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Fri Aug 18 17:52:45 1989, DSJ, Created.
|
|
*/
|
|
destroy(LabeledList->List);
|
|
free(LabeledList->Label);
|
|
free(LabeledList);
|
|
} /* FreeLabeledList */
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs,
|
|
LABELEDLIST char_sample,
|
|
const char* program_feature_type) {
|
|
/*
|
|
** Parameters:
|
|
** char_sample: LABELEDLIST that holds all the feature information for a
|
|
** given character.
|
|
** Globals:
|
|
** None
|
|
** Operation:
|
|
** This routine reads samples from a LABELEDLIST and enters
|
|
** those samples into a clusterer data structure. This
|
|
** data structure is then returned to the caller.
|
|
** Return:
|
|
** Pointer to new clusterer data structure.
|
|
** Exceptions:
|
|
** None
|
|
** History:
|
|
** 8/16/89, DSJ, Created.
|
|
*/
|
|
uinT16 N;
|
|
int i, j;
|
|
FLOAT32 *Sample = NULL;
|
|
CLUSTERER *Clusterer;
|
|
inT32 CharID;
|
|
LIST FeatureList = NULL;
|
|
FEATURE_SET FeatureSet = NULL;
|
|
|
|
int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
|
|
N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
|
|
Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
|
|
|
|
FeatureList = char_sample->List;
|
|
CharID = 0;
|
|
iterate(FeatureList) {
|
|
FeatureSet = (FEATURE_SET) first_node(FeatureList);
|
|
for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
|
|
if (Sample == NULL)
|
|
Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
|
|
for (j = 0; j < N; j++)
|
|
Sample[j] = FeatureSet->Features[i]->Params[j];
|
|
MakeSample (Clusterer, Sample, CharID);
|
|
}
|
|
CharID++;
|
|
}
|
|
if ( Sample != NULL ) free( Sample );
|
|
return( Clusterer );
|
|
|
|
} /* SetUpForClustering */
|
|
|
|
/*------------------------------------------------------------------------*/
|
|
void MergeInsignificantProtos(LIST ProtoList, const char* label,
|
|
CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
|
|
PROTOTYPE *Prototype;
|
|
bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
|
|
|
|
LIST pProtoList = ProtoList;
|
|
iterate(pProtoList) {
|
|
Prototype = (PROTOTYPE *) first_node (pProtoList);
|
|
if (Prototype->Significant || Prototype->Merged)
|
|
continue;
|
|
FLOAT32 best_dist = 0.125;
|
|
PROTOTYPE* best_match = NULL;
|
|
// Find the nearest alive prototype.
|
|
LIST list_it = ProtoList;
|
|
iterate(list_it) {
|
|
PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
|
|
if (test_p != Prototype && !test_p->Merged) {
|
|
FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
|
|
Clusterer->ParamDesc,
|
|
Prototype->Mean, test_p->Mean);
|
|
if (dist < best_dist) {
|
|
best_match = test_p;
|
|
best_dist = dist;
|
|
}
|
|
}
|
|
}
|
|
if (best_match != NULL && !best_match->Significant) {
|
|
if (debug)
|
|
tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
|
|
best_match->NumSamples, Prototype->NumSamples,
|
|
best_match->Mean[0], best_match->Mean[1],
|
|
Prototype->Mean[0], Prototype->Mean[1]);
|
|
best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
|
|
Clusterer->ParamDesc,
|
|
best_match->NumSamples,
|
|
Prototype->NumSamples,
|
|
best_match->Mean,
|
|
best_match->Mean, Prototype->Mean);
|
|
Prototype->NumSamples = 0;
|
|
Prototype->Merged = 1;
|
|
} else if (best_match != NULL) {
|
|
if (debug)
|
|
tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
|
|
Prototype->Mean[0], Prototype->Mean[1],
|
|
best_match->Mean[0], best_match->Mean[1]);
|
|
Prototype->Merged = 1;
|
|
}
|
|
}
|
|
// Mark significant those that now have enough samples.
|
|
int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
|
|
pProtoList = ProtoList;
|
|
iterate(pProtoList) {
|
|
Prototype = (PROTOTYPE *) first_node (pProtoList);
|
|
// Process insignificant protos that do not match a green one
|
|
if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
|
|
!Prototype->Merged) {
|
|
if (debug)
|
|
tprintf("Red proto at %g,%g becoming green\n",
|
|
Prototype->Mean[0], Prototype->Mean[1]);
|
|
Prototype->Significant = true;
|
|
}
|
|
}
|
|
} /* MergeInsignificantProtos */
|
|
|
|
/*-----------------------------------------------------------------------------*/
|
|
void CleanUpUnusedData(
|
|
LIST ProtoList)
|
|
{
|
|
PROTOTYPE* Prototype;
|
|
|
|
iterate(ProtoList)
|
|
{
|
|
Prototype = (PROTOTYPE *) first_node (ProtoList);
|
|
if(Prototype->Variance.Elliptical != NULL)
|
|
{
|
|
memfree(Prototype->Variance.Elliptical);
|
|
Prototype->Variance.Elliptical = NULL;
|
|
}
|
|
if(Prototype->Magnitude.Elliptical != NULL)
|
|
{
|
|
memfree(Prototype->Magnitude.Elliptical);
|
|
Prototype->Magnitude.Elliptical = NULL;
|
|
}
|
|
if(Prototype->Weight.Elliptical != NULL)
|
|
{
|
|
memfree(Prototype->Weight.Elliptical);
|
|
Prototype->Weight.Elliptical = NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*------------------------------------------------------------------------*/
|
|
LIST RemoveInsignificantProtos(
|
|
LIST ProtoList,
|
|
BOOL8 KeepSigProtos,
|
|
BOOL8 KeepInsigProtos,
|
|
int N)
|
|
|
|
{
|
|
LIST NewProtoList = NIL_LIST;
|
|
LIST pProtoList;
|
|
PROTOTYPE* Proto;
|
|
PROTOTYPE* NewProto;
|
|
int i;
|
|
|
|
pProtoList = ProtoList;
|
|
iterate(pProtoList)
|
|
{
|
|
Proto = (PROTOTYPE *) first_node (pProtoList);
|
|
if ((Proto->Significant && KeepSigProtos) ||
|
|
(!Proto->Significant && KeepInsigProtos))
|
|
{
|
|
NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
|
|
|
|
NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
|
|
NewProto->Significant = Proto->Significant;
|
|
NewProto->Style = Proto->Style;
|
|
NewProto->NumSamples = Proto->NumSamples;
|
|
NewProto->Cluster = NULL;
|
|
NewProto->Distrib = NULL;
|
|
|
|
for (i=0; i < N; i++)
|
|
NewProto->Mean[i] = Proto->Mean[i];
|
|
if (Proto->Variance.Elliptical != NULL)
|
|
{
|
|
NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
|
|
for (i=0; i < N; i++)
|
|
NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
|
|
}
|
|
else
|
|
NewProto->Variance.Elliptical = NULL;
|
|
//---------------------------------------------
|
|
if (Proto->Magnitude.Elliptical != NULL)
|
|
{
|
|
NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
|
|
for (i=0; i < N; i++)
|
|
NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
|
|
}
|
|
else
|
|
NewProto->Magnitude.Elliptical = NULL;
|
|
//------------------------------------------------
|
|
if (Proto->Weight.Elliptical != NULL)
|
|
{
|
|
NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
|
|
for (i=0; i < N; i++)
|
|
NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
|
|
}
|
|
else
|
|
NewProto->Weight.Elliptical = NULL;
|
|
|
|
NewProto->TotalMagnitude = Proto->TotalMagnitude;
|
|
NewProto->LogMagnitude = Proto->LogMagnitude;
|
|
NewProtoList = push_last(NewProtoList, NewProto);
|
|
}
|
|
}
|
|
FreeProtoList(&ProtoList);
|
|
return (NewProtoList);
|
|
} /* RemoveInsignificantProtos */
|
|
|
|
/*----------------------------------------------------------------------------*/
|
|
MERGE_CLASS FindClass (
|
|
LIST List,
|
|
const char *Label)
|
|
{
|
|
MERGE_CLASS MergeClass;
|
|
|
|
iterate (List)
|
|
{
|
|
MergeClass = (MERGE_CLASS) first_node (List);
|
|
if (strcmp (MergeClass->Label, Label) == 0)
|
|
return (MergeClass);
|
|
}
|
|
return (NULL);
|
|
|
|
} /* FindClass */
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
MERGE_CLASS NewLabeledClass (
|
|
const char *Label)
|
|
{
|
|
MERGE_CLASS MergeClass;
|
|
|
|
MergeClass = new MERGE_CLASS_NODE;
|
|
MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
|
|
strcpy (MergeClass->Label, Label);
|
|
MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
|
|
return (MergeClass);
|
|
|
|
} /* NewLabeledClass */
|
|
|
|
/*-----------------------------------------------------------------------------*/
|
|
void FreeLabeledClassList (
|
|
LIST ClassList)
|
|
|
|
/*
|
|
** Parameters:
|
|
** FontList list of all fonts in document
|
|
** Globals: none
|
|
** Operation:
|
|
** This routine deallocates all of the space allocated to
|
|
** the specified list of training samples.
|
|
** Return: none
|
|
** Exceptions: none
|
|
** History: Fri Aug 18 17:44:27 1989, DSJ, Created.
|
|
*/
|
|
|
|
{
|
|
MERGE_CLASS MergeClass;
|
|
|
|
iterate (ClassList) /* iterate thru all of the fonts */
|
|
{
|
|
MergeClass = (MERGE_CLASS) first_node (ClassList);
|
|
free (MergeClass->Label);
|
|
FreeClass(MergeClass->Class);
|
|
delete MergeClass;
|
|
}
|
|
destroy (ClassList);
|
|
|
|
} /* FreeLabeledClassList */
|
|
|
|
/** SetUpForFloat2Int **************************************************/
|
|
CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
|
|
LIST LabeledClassList) {
|
|
MERGE_CLASS MergeClass;
|
|
CLASS_TYPE Class;
|
|
int NumProtos;
|
|
int NumConfigs;
|
|
int NumWords;
|
|
int i, j;
|
|
float Values[3];
|
|
PROTO NewProto;
|
|
PROTO OldProto;
|
|
BIT_VECTOR NewConfig;
|
|
BIT_VECTOR OldConfig;
|
|
|
|
// printf("Float2Int ...\n");
|
|
|
|
CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
|
|
iterate(LabeledClassList)
|
|
{
|
|
UnicityTableEqEq<int> font_set;
|
|
MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
|
|
Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
|
|
NumProtos = MergeClass->Class->NumProtos;
|
|
NumConfigs = MergeClass->Class->NumConfigs;
|
|
font_set.move(&MergeClass->Class->font_set);
|
|
Class->NumProtos = NumProtos;
|
|
Class->MaxNumProtos = NumProtos;
|
|
Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
|
|
for(i=0; i < NumProtos; i++)
|
|
{
|
|
NewProto = ProtoIn(Class, i);
|
|
OldProto = ProtoIn(MergeClass->Class, i);
|
|
Values[0] = OldProto->X;
|
|
Values[1] = OldProto->Y;
|
|
Values[2] = OldProto->Angle;
|
|
Normalize(Values);
|
|
NewProto->X = OldProto->X;
|
|
NewProto->Y = OldProto->Y;
|
|
NewProto->Length = OldProto->Length;
|
|
NewProto->Angle = OldProto->Angle;
|
|
NewProto->A = Values[0];
|
|
NewProto->B = Values[1];
|
|
NewProto->C = Values[2];
|
|
}
|
|
|
|
Class->NumConfigs = NumConfigs;
|
|
Class->MaxNumConfigs = NumConfigs;
|
|
Class->font_set.move(&font_set);
|
|
Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
|
|
NumWords = WordsInVectorOfSize(NumProtos);
|
|
for(i=0; i < NumConfigs; i++)
|
|
{
|
|
NewConfig = NewBitVector(NumProtos);
|
|
OldConfig = MergeClass->Class->Configurations[i];
|
|
for(j=0; j < NumWords; j++)
|
|
NewConfig[j] = OldConfig[j];
|
|
Class->Configurations[i] = NewConfig;
|
|
}
|
|
}
|
|
return float_classes;
|
|
} // SetUpForFloat2Int
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
void Normalize (
|
|
float *Values)
|
|
{
|
|
register float Slope;
|
|
register float Intercept;
|
|
register float Normalizer;
|
|
|
|
Slope = tan (Values [2] * 2 * PI);
|
|
Intercept = Values [1] - Slope * Values [0];
|
|
Normalizer = 1 / sqrt (Slope * Slope + 1.0);
|
|
|
|
Values [0] = Slope * Normalizer;
|
|
Values [1] = - Normalizer;
|
|
Values [2] = Intercept * Normalizer;
|
|
} // Normalize
|
|
|
|
/*-------------------------------------------------------------------------*/
|
|
void FreeNormProtoList (
|
|
LIST CharList)
|
|
|
|
{
|
|
LABELEDLIST char_sample;
|
|
|
|
iterate (CharList) /* iterate thru all of the fonts */
|
|
{
|
|
char_sample = (LABELEDLIST) first_node (CharList);
|
|
FreeLabeledList (char_sample);
|
|
}
|
|
destroy (CharList);
|
|
|
|
} // FreeNormProtoList
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
void AddToNormProtosList(
|
|
LIST* NormProtoList,
|
|
LIST ProtoList,
|
|
char* CharName)
|
|
{
|
|
PROTOTYPE* Proto;
|
|
LABELEDLIST LabeledProtoList;
|
|
|
|
LabeledProtoList = NewLabeledList(CharName);
|
|
iterate(ProtoList)
|
|
{
|
|
Proto = (PROTOTYPE *) first_node (ProtoList);
|
|
LabeledProtoList->List = push(LabeledProtoList->List, Proto);
|
|
}
|
|
*NormProtoList = push(*NormProtoList, LabeledProtoList);
|
|
}
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
int NumberOfProtos(
|
|
LIST ProtoList,
|
|
BOOL8 CountSigProtos,
|
|
BOOL8 CountInsigProtos)
|
|
{
|
|
int N = 0;
|
|
PROTOTYPE *Proto;
|
|
|
|
iterate(ProtoList)
|
|
{
|
|
Proto = (PROTOTYPE *) first_node ( ProtoList );
|
|
if (( Proto->Significant && CountSigProtos ) ||
|
|
( ! Proto->Significant && CountInsigProtos ) )
|
|
N++;
|
|
}
|
|
return(N);
|
|
}
|