mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-25 11:39:06 +08:00
d11dc049e3
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1015 d0cd1f9f-072b-0410-8dd7-cf729c803f20
313 lines
13 KiB
C++
313 lines
13 KiB
C++
// Copyright 2010 Google Inc. All Rights Reserved.
|
|
// Author: rays@google.com (Ray Smith)
|
|
///////////////////////////////////////////////////////////////////////
|
|
// File: mastertrainer.h
|
|
// Description: Trainer to build the MasterClassifier.
|
|
// Author: Ray Smith
|
|
// Created: Wed Nov 03 18:07:01 PDT 2010
|
|
//
|
|
// (C) Copyright 2010, Google Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
#ifndef TESSERACT_TRAINING_MASTERTRAINER_H__
|
|
#define TESSERACT_TRAINING_MASTERTRAINER_H__
|
|
|
|
/**----------------------------------------------------------------------------
|
|
Include Files and Type Defines
|
|
----------------------------------------------------------------------------**/
|
|
#include "classify.h"
|
|
#include "cluster.h"
|
|
#include "intfx.h"
|
|
#include "elst.h"
|
|
#include "errorcounter.h"
|
|
#include "featdefs.h"
|
|
#include "fontinfo.h"
|
|
#include "indexmapbidi.h"
|
|
#include "intfeaturespace.h"
|
|
#include "intfeaturemap.h"
|
|
#include "intmatcher.h"
|
|
#include "params.h"
|
|
#include "shapetable.h"
|
|
#include "trainingsample.h"
|
|
#include "trainingsampleset.h"
|
|
#include "unicharset.h"
|
|
|
|
namespace tesseract {
|
|
|
|
class ShapeClassifier;
|
|
|
|
// Simple struct to hold the distance between two shapes during clustering.
|
|
struct ShapeDist {
|
|
ShapeDist() : shape1(0), shape2(0), distance(0.0f) {}
|
|
ShapeDist(int s1, int s2, float dist)
|
|
: shape1(s1), shape2(s2), distance(dist) {}
|
|
|
|
// Sort operator to sort in ascending order of distance.
|
|
bool operator<(const ShapeDist& other) const {
|
|
return distance < other.distance;
|
|
}
|
|
|
|
int shape1;
|
|
int shape2;
|
|
float distance;
|
|
};
|
|
|
|
// Class to encapsulate training processes that use the TrainingSampleSet.
|
|
// Initially supports shape clustering and mftrainining.
|
|
// Other important features of the MasterTrainer are conditioning the data
|
|
// by outlier elimination, replication with perturbation, and serialization.
|
|
class MasterTrainer {
|
|
public:
|
|
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis,
|
|
bool replicate_samples, int debug_level);
|
|
~MasterTrainer();
|
|
|
|
// Writes to the given file. Returns false in case of error.
|
|
bool Serialize(FILE* fp) const;
|
|
// Reads from the given file. Returns false in case of error.
|
|
// If swap is true, assumes a big/little-endian swap is needed.
|
|
bool DeSerialize(bool swap, FILE* fp);
|
|
|
|
// Loads an initial unicharset, or sets one up if the file cannot be read.
|
|
void LoadUnicharset(const char* filename);
|
|
|
|
// Sets the feature space definition.
|
|
void SetFeatureSpace(const IntFeatureSpace& fs) {
|
|
feature_space_ = fs;
|
|
feature_map_.Init(fs);
|
|
}
|
|
|
|
// Reads the samples and their features from the given file,
|
|
// adding them to the trainer with the font_id from the content of the file.
|
|
// If verification, then these are verification samples, not training.
|
|
void ReadTrainingSamples(const char* page_name,
|
|
const FEATURE_DEFS_STRUCT& feature_defs,
|
|
bool verification);
|
|
|
|
// Adds the given single sample to the trainer, setting the classid
|
|
// appropriately from the given unichar_str.
|
|
void AddSample(bool verification, const char* unichar_str,
|
|
TrainingSample* sample);
|
|
|
|
// Loads all pages from the given tif filename and append to page_images_.
|
|
// Must be called after ReadTrainingSamples, as the current number of images
|
|
// is used as an offset for page numbers in the samples.
|
|
void LoadPageImages(const char* filename);
|
|
|
|
// Cleans up the samples after initial load from the tr files, and prior to
|
|
// saving the MasterTrainer:
|
|
// Remaps fragmented chars if running shape anaylsis.
|
|
// Sets up the samples appropriately for class/fontwise access.
|
|
// Deletes outlier samples.
|
|
void PostLoadCleanup();
|
|
|
|
// Gets the samples ready for training. Use after both
|
|
// ReadTrainingSamples+PostLoadCleanup or DeSerialize.
|
|
// Re-indexes the features and computes canonical and cloud features.
|
|
void PreTrainingSetup();
|
|
|
|
// Sets up the master_shapes_ table, which tells which fonts should stay
|
|
// together until they get to a leaf node classifier.
|
|
void SetupMasterShapes();
|
|
|
|
// Adds the junk_samples_ to the main samples_ set. Junk samples are initially
|
|
// fragments and n-grams (all incorrectly segmented characters).
|
|
// Various training functions may result in incorrectly segmented characters
|
|
// being added to the unicharset of the main samples, perhaps because they
|
|
// form a "radical" decomposition of some (Indic) grapheme, or because they
|
|
// just look the same as a real character (like rn/m)
|
|
// This function moves all the junk samples, to the main samples_ set, but
|
|
// desirable junk, being any sample for which the unichar already exists in
|
|
// the samples_ unicharset gets the unichar-ids re-indexed to match, but
|
|
// anything else gets re-marked as unichar_id 0 (space character) to identify
|
|
// it as junk to the error counter.
|
|
void IncludeJunk();
|
|
|
|
// Replicates the samples and perturbs them if the enable_replication_ flag
|
|
// is set. MUST be used after the last call to OrganizeByFontAndClass on
|
|
// the training samples, ie after IncludeJunk if it is going to be used, as
|
|
// OrganizeByFontAndClass will eat the replicated samples into the regular
|
|
// samples.
|
|
void ReplicateAndRandomizeSamplesIfRequired();
|
|
|
|
// Loads the basic font properties file into fontinfo_table_.
|
|
// Returns false on failure.
|
|
bool LoadFontInfo(const char* filename);
|
|
|
|
// Loads the xheight font properties file into xheights_.
|
|
// Returns false on failure.
|
|
bool LoadXHeights(const char* filename);
|
|
|
|
// Reads spacing stats from filename and adds them to fontinfo_table.
|
|
// Returns false on failure.
|
|
bool AddSpacingInfo(const char *filename);
|
|
|
|
// Returns the font id corresponding to the given font name.
|
|
// Returns -1 if the font cannot be found.
|
|
int GetFontInfoId(const char* font_name);
|
|
// Returns the font_id of the closest matching font name to the given
|
|
// filename. It is assumed that a substring of the filename will match
|
|
// one of the fonts. If more than one is matched, the longest is returned.
|
|
int GetBestMatchingFontInfoId(const char* filename);
|
|
|
|
// Returns the filename of the tr file corresponding to the command-line
|
|
// argument with the given index.
|
|
const STRING& GetTRFileName(int index) const {
|
|
return tr_filenames_[index];
|
|
}
|
|
|
|
// Sets up a flat shapetable with one shape per class/font combination.
|
|
void SetupFlatShapeTable(ShapeTable* shape_table);
|
|
|
|
// Sets up a Clusterer for mftraining on a single shape_id.
|
|
// Call FreeClusterer on the return value after use.
|
|
CLUSTERER* SetupForClustering(const ShapeTable& shape_table,
|
|
const FEATURE_DEFS_STRUCT& feature_defs,
|
|
int shape_id, int* num_samples);
|
|
|
|
// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
|
|
// to the given inttemp_file, and the corresponding pffmtable.
|
|
// The unicharset is the original encoding of graphemes, and shape_set should
|
|
// match the size of the shape_table, and may possibly be totally fake.
|
|
void WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
|
|
const UNICHARSET& shape_set,
|
|
const ShapeTable& shape_table,
|
|
CLASS_STRUCT* float_classes,
|
|
const char* inttemp_file,
|
|
const char* pffmtable_file);
|
|
|
|
const UNICHARSET& unicharset() const {
|
|
return samples_.unicharset();
|
|
}
|
|
TrainingSampleSet* GetSamples() {
|
|
return &samples_;
|
|
}
|
|
const ShapeTable& master_shapes() const {
|
|
return master_shapes_;
|
|
}
|
|
|
|
// Generates debug output relating to the canonical distance between the
|
|
// two given UTF8 grapheme strings.
|
|
void DebugCanonical(const char* unichar_str1, const char* unichar_str2);
|
|
#ifndef GRAPHICS_DISABLED
|
|
// Debugging for cloud/canonical features.
|
|
// Displays a Features window containing:
|
|
// If unichar_str2 is in the unicharset, and canonical_font is non-negative,
|
|
// displays the canonical features of the char/font combination in red.
|
|
// If unichar_str1 is in the unicharset, and cloud_font is non-negative,
|
|
// displays the cloud feature of the char/font combination in green.
|
|
// The canonical features are drawn first to show which ones have no
|
|
// matches in the cloud features.
|
|
// Until the features window is destroyed, each click in the features window
|
|
// will display the samples that have that feature in a separate window.
|
|
void DisplaySamples(const char* unichar_str1, int cloud_font,
|
|
const char* unichar_str2, int canonical_font);
|
|
#endif // GRAPHICS_DISABLED
|
|
|
|
void TestClassifierVOld(bool replicate_samples,
|
|
ShapeClassifier* test_classifier,
|
|
ShapeClassifier* old_classifier);
|
|
|
|
// Tests the given test_classifier on the internal samples.
|
|
// See TestClassifier for details.
|
|
void TestClassifierOnSamples(CountTypes error_mode,
|
|
int report_level,
|
|
bool replicate_samples,
|
|
ShapeClassifier* test_classifier,
|
|
STRING* report_string);
|
|
// Tests the given test_classifier on the given samples
|
|
// error_mode indicates what counts as an error.
|
|
// report_levels:
|
|
// 0 = no output.
|
|
// 1 = bottom-line error rate.
|
|
// 2 = bottom-line error rate + time.
|
|
// 3 = font-level error rate + time.
|
|
// 4 = list of all errors + short classifier debug output on 16 errors.
|
|
// 5 = list of all errors + short classifier debug output on 25 errors.
|
|
// If replicate_samples is true, then the test is run on an extended test
|
|
// sample including replicated and systematically perturbed samples.
|
|
// If report_string is non-NULL, a summary of the results for each font
|
|
// is appended to the report_string.
|
|
double TestClassifier(CountTypes error_mode,
|
|
int report_level,
|
|
bool replicate_samples,
|
|
TrainingSampleSet* samples,
|
|
ShapeClassifier* test_classifier,
|
|
STRING* report_string);
|
|
|
|
// Returns the average (in some sense) distance between the two given
|
|
// shapes, which may contain multiple fonts and/or unichars.
|
|
// This function is public to facilitate testing.
|
|
float ShapeDistance(const ShapeTable& shapes, int s1, int s2);
|
|
|
|
private:
|
|
// Replaces samples that are always fragmented with the corresponding
|
|
// fragment samples.
|
|
void ReplaceFragmentedSamples();
|
|
|
|
// Runs a hierarchical agglomerative clustering to merge shapes in the given
|
|
// shape_table, while satisfying the given constraints:
|
|
// * End with at least min_shapes left in shape_table,
|
|
// * No shape shall have more than max_shape_unichars in it,
|
|
// * Don't merge shapes where the distance between them exceeds max_dist.
|
|
void ClusterShapes(int min_shapes, int max_shape_unichars,
|
|
float max_dist, ShapeTable* shape_table);
|
|
|
|
private:
|
|
NormalizationMode norm_mode_;
|
|
// Character set we are training for.
|
|
UNICHARSET unicharset_;
|
|
// Original feature space. Subspace mapping is contained in feature_map_.
|
|
IntFeatureSpace feature_space_;
|
|
TrainingSampleSet samples_;
|
|
TrainingSampleSet junk_samples_;
|
|
TrainingSampleSet verify_samples_;
|
|
// Master shape table defines what fonts stay together until the leaves.
|
|
ShapeTable master_shapes_;
|
|
// Flat shape table has each unichar/font id pair in a separate shape.
|
|
ShapeTable flat_shapes_;
|
|
// Font metrics gathered from multiple files.
|
|
FontInfoTable fontinfo_table_;
|
|
// Array of xheights indexed by font ids in fontinfo_table_;
|
|
GenericVector<inT32> xheights_;
|
|
|
|
// Non-serialized data initialized by other means or used temporarily
|
|
// during loading of training samples.
|
|
// Number of different class labels in unicharset_.
|
|
int charsetsize_;
|
|
// Flag to indicate that we are running shape analysis and need fragments
|
|
// fixing.
|
|
bool enable_shape_anaylsis_;
|
|
// Flag to indicate that sample replication is required.
|
|
bool enable_replication_;
|
|
// Array of classids of fragments that replace the correctly segmented chars.
|
|
int* fragments_;
|
|
// Classid of previous correctly segmented sample that was added.
|
|
int prev_unichar_id_;
|
|
// Debug output control.
|
|
int debug_level_;
|
|
// Feature map used to construct reduced feature spaces for compact
|
|
// classifiers.
|
|
IntFeatureMap feature_map_;
|
|
// Vector of Pix pointers used for classifiers that need the image.
|
|
// Indexed by page_num_ in the samples.
|
|
// These images are owned by the trainer and need to be pixDestroyed.
|
|
GenericVector<Pix*> page_images_;
|
|
// Vector of filenames of loaded tr files.
|
|
GenericVector<STRING> tr_filenames_;
|
|
};
|
|
|
|
} // namespace tesseract.
|
|
|
|
#endif
|