tesseract/training/mftraining.cpp

320 lines
12 KiB
C++

/******************************************************************************
** Filename: mftraining.c
** Purpose: Separates training pages into files for each character.
** Strips from files only the features and their parameters of
the feature type mf.
** Author: Dan Johnson
** Revisment: Christy Russon
** Environment: HPUX 6.5
** Library: HPUX 6.5
** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
** 5/25/90, DSJ, Adapted to multiple feature types.
** Tuesday, May 17, 1998 Changes made to make feature specific and
** simplify structures. First step in simplifying training process.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
/*----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------*/
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
#include <string.h>
#include <stdio.h>
#define _USE_MATH_DEFINES
#include <math.h>
#ifdef _WIN32
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#endif
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "commontraining.h"
#include "danerror.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "genericvector.h"
#include "indexmapbidi.h"
#include "intproto.h"
#include "mastertrainer.h"
#include "mergenf.h"
#include "mf.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "oldlist.h"
#include "protos.h"
#include "shapetable.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"
using tesseract::IndexMapBiDi;
using tesseract::MasterTrainer;
using tesseract::Shape;
using tesseract::ShapeTable;
#define PROGRAM_FEATURE_TYPE "mf"
// Max length of a fake shape label.
const int kMaxShapeLabelLength = 10;
DECLARE_STRING_PARAM_FLAG(test_ch);
/*----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------*/
// Forward declaration of the program entry point, which is defined later
// in this file.
int main (
int argc,
char **argv);
/*----------------------------------------------------------------------------
Public Code
-----------------------------------------------------------------------------*/
#ifndef GRAPHICS_DISABLED
static void DisplayProtoList(const char* ch, LIST protolist) {
void* window = c_create_window("Char samples", 50, 200,
520, 520, -130.0, 130.0, -130.0, 130.0);
LIST proto = protolist;
iterate(proto) {
PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
if (prototype->Significant)
c_line_color_index(window, Green);
else if (prototype->NumSamples == 0)
c_line_color_index(window, Blue);
else if (prototype->Merged)
c_line_color_index(window, Magenta);
else
c_line_color_index(window, Red);
float x = CenterX(prototype->Mean);
float y = CenterY(prototype->Mean);
double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
c_move(window, (x - dx) * 256, (y - dy) * 256);
c_draw(window, (x + dx) * 256, (y + dy) * 256);
if (prototype->Significant)
tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
x, y, dx, dy, prototype->NumSamples);
else if (prototype->NumSamples > 0 && !prototype->Merged)
tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
x, y, dx, dy, prototype->NumSamples);
}
c_make_current(window);
}
#endif // GRAPHICS_DISABLED
// Runs clustering for one config (shape_id) and folds the resulting protos
// into the labeled class named class_label.
//   shape_id     - id passed to the trainer to set up clustering; also
//                  appended to the class font_set.
//   class_label  - label used to find (or create) the MERGE_CLASS.
//   mf_classes   - list of labeled classes built so far.
//   shape_table  - shape table handed to the trainer.
//   trainer      - supplies the clusterer via SetupForClustering.
// Returns mf_classes, with a new class pushed on it if class_label was not
// already present.
// Uses the file-level Config and feature_defs; Config.MagicSamples is
// overwritten with the sample count reported by the trainer.
static LIST ClusterOneConfig(int shape_id, const char* class_label,
                             LIST mf_classes,
                             const ShapeTable& shape_table,
                             MasterTrainer* trainer) {
  int sample_count;
  CLUSTERER* clusterer = trainer->SetupForClustering(
      shape_table, feature_defs, shape_id, &sample_count);
  Config.MagicSamples = sample_count;

  LIST protos = ClusterSamples(clusterer, &Config);
  CleanUpUnusedData(protos);
  // Merge protos where reasonable, so that more of them become significant
  // and together represent almost all samples of the class/font.
  MergeInsignificantProtos(protos, class_label, clusterer, &Config);
#ifndef GRAPHICS_DISABLED
  if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) {
    DisplayProtoList(FLAGS_test_ch.c_str(), protos);
  }
#endif  // GRAPHICS_DISABLED
  // Drop the protos that will not be used in the inttemp output file.
  // SampleSize must be read before the clusterer is freed.
  protos = RemoveInsignificantProtos(protos, true, false,
                                     clusterer->SampleSize);
  FreeClusterer(clusterer);

  // Locate the class for this label, creating it on first use.
  MERGE_CLASS target = FindClass(mf_classes, class_label);
  if (target == NULL) {
    target = NewLabeledClass(class_label);
    mf_classes = push(mf_classes, target);
  }
  int config_id = AddConfigToClass(target->Class);
  target->Class->font_set.push_back(shape_id);

  LIST cursor = protos;
  iterate(cursor) {
    PROTOTYPE* clustered = reinterpret_cast<PROTOTYPE*>(first_node(cursor));
    // See whether an existing proto of the class can stand in for this one.
    int p_id = FindClosestExistingProto(target->Class, target->NumMerged,
                                        clustered);
    if (p_id != NO_PROTO) {
      // Merge into the similar proto, weighting the existing proto by the
      // number of protos already merged into it.
      PROTO_STRUCT converted;
      MakeNewFromOld(&converted, clustered);
      PROTO_STRUCT* existing = ProtoIn(target->Class, p_id);
      ComputeMergedProto(existing, &converted,
                         static_cast<FLOAT32>(target->NumMerged[p_id]),
                         1.0,
                         existing);
      target->NumMerged[p_id]++;
    } else {
      // Nothing matched, so add a brand new proto to the class.
      p_id = AddProtoToClass(target->Class);
      MakeNewFromOld(ProtoIn(target->Class, p_id), clustered);
      target->NumMerged[p_id] = 1;
    }
    AddProtoToConfig(p_id, target->Class->Configurations[config_id]);
  }
  FreeProtoList(&protos);
  return mf_classes;
}
// Builds the index mapping from shape ids in shape_table to compact ids in
// config_map. Every shape whose unichar list compares equal
// (Shape::IsEqualUnichars) to that of an earlier, still-unmerged shape is
// merged with that earlier shape, so shapes sharing the same unichars end up
// on one compact index; the individual shapes then act as the configs
// (font combinations) of that index.
static void SetupConfigMap(ShapeTable* shape_table, IndexMapBiDi* config_map) {
  const int shape_count = shape_table->NumShapes();
  config_map->Init(shape_count, true);
  config_map->Setup();
  for (int lead = 0; lead < shape_count; ++lead) {
    // Ids that no longer map to themselves have already been merged.
    if (config_map->SparseToCompact(lead) != lead) {
      continue;
    }
    Shape* lead_shape = shape_table->MutableShape(lead);
    // Merge every later shape that has the same unichars as the lead shape.
    for (int other = lead + 1; other < shape_count; ++other) {
      Shape* candidate = shape_table->MutableShape(other);
      if (candidate->IsEqualUnichars(lead_shape)) {
        config_map->Merge(lead, other);
      }
    }
  }
  config_map->CompleteMerges();
}
/**
 * This program reads in a text file consisting of feature
 * samples from a training page in the following format:
 * @verbatim
   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...
   @endverbatim
 * The result of this program is a binary inttemp file used by
 * the OCR engine. The shape table and a pffmtable file are written as
 * well; the inttemp and pffmtable names are built from the file prefix
 * returned by the training data loader.
 * @param argc number of command line arguments
 * @param argv array of command line arguments
 * @return 0 on success, 1 if the training data could not be loaded
 * @note Exceptions: none
 * @note History: Fri Aug 18 08:56:17 1989, DSJ, Created.
 * @note History: Mon May 18 1998, Christy Russon, Revision started.
 */
int main (int argc, char **argv) {
  ParseArguments(&argc, &argv);

  ShapeTable* shape_table = NULL;
  STRING file_prefix;
  // Load the training data.
  MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
                                                       false,
                                                       &shape_table,
                                                       &file_prefix);
  if (trainer == NULL)
    return 1;  // Failed.

  // Setup an index mapping from the shapes in the shape table to the classes
  // that will be trained. Shapes with the same list of unichars share a
  // class, and the configs represent the different combinations of fonts.
  IndexMapBiDi config_map;
  SetupConfigMap(shape_table, &config_map);

  WriteShapeTable(file_prefix, *shape_table);
  // If the shape_table is flat, then either we didn't run shape clustering, or
  // it did nothing, so we just output the trainer's unicharset.
  // Otherwise shape_set will hold a fake unicharset with an entry for each
  // shape in the shape table, and we will output that instead.
  UNICHARSET shape_set;
  const UNICHARSET* unicharset = &trainer->unicharset();
  // If we ran shapeclustering (and it worked) then at least one shape will
  // have multiple unichars, so we have to build a fake unicharset.
  if (shape_table->AnyMultipleUnichars()) {
    unicharset = &shape_set;
    // Now build a fake unicharset for the compact shape space to keep the
    // output modules happy that we are doing things correctly.
    int num_shapes = config_map.CompactSize();
    for (int s = 0; s < num_shapes; ++s) {
      char shape_label[kMaxShapeLabelLength + 1];
      snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s);
      shape_set.unichar_insert(shape_label);
    }
  }

  // Now train each config separately.
  int num_configs = shape_table->NumShapes();
  LIST mf_classes = NIL_LIST;
  for (int s = 0; s < num_configs; ++s) {
    int unichar_id, font_id;
    if (unicharset == &shape_set) {
      // Using fake unichar_ids from the config_map/shape_set.
      unichar_id = config_map.SparseToCompact(s);
    } else {
      // Get the real unichar_id from the shape table/unicharset.
      shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
    }
    const char* class_label = unicharset->id_to_unichar(unichar_id);
    mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
                                  trainer);
  }

  STRING inttemp_file = file_prefix;
  inttemp_file += "inttemp";
  STRING pffmtable_file = file_prefix;
  pffmtable_file += "pffmtable";
  CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
  // Now write the inttemp and pffmtable.
  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
                                    *shape_table, float_classes,
                                    inttemp_file.string(),
                                    pffmtable_file.string());

  // Release everything that was allocated for training.
  for (int c = 0; c < unicharset->size(); ++c) {
    FreeClassFields(&float_classes[c]);
  }
  delete [] float_classes;
  FreeLabeledClassList(mf_classes);
  delete trainer;
  delete shape_table;
  printf("Done!\n");
  if (!FLAGS_test_ch.empty()) {
    // If we are displaying debug window(s), wait for the user to look at them.
    printf("Hit return to exit...\n");
    // Stop on newline, and also on EOF: once stdin is closed or fails,
    // getchar() keeps returning EOF, so waiting for '\n' alone would spin
    // forever (e.g. when run with stdin redirected from /dev/null).
    int key;
    do {
      key = getchar();
    } while (key != '\n' && key != EOF);
  }
  return 0;
} /* main */