tesseract/classify/sampleiterator.h

196 lines
7.3 KiB
C++

// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
namespace tesseract {
class IndexMapBiDi;
class IntFeatureMap;
class ShapeTable;
class TrainingSample;
class TrainingSampleSet;
class UnicharAndFonts;
// Iterator class to encapsulate the complex iteration involved in getting
// all samples of all shapes needed for a classification problem.
//
// =====INPUTS TO Init FUNCTION=====
// The charset_map defines a subset of the sample_set classes (with a NULL
// shape_table, or the shape_table classes if not NULL.)
//
// The shape_table (if not NULL) defines the mapping from shapes to
// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
//
// The sample_set holds the samples and provides indexed access to samples
// of font_id/class_id pairs.
//
// If randomize is true, the samples are perturbed slightly, but the
// perturbation is guaranteed to be the same for multiple identical
// iterations.
//
// =====DIFFERENT COMBINATIONS OF INPUTS=====
// NULL shape_table:
// Without a shape_table, everything works in UNICHAR_IDs.
//
// NULL shape_table, NULL charset_map:
// Iterations simply run over the samples in the order the samples occur in the
// input files.
// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
//
// NULL shape_table, non-NULL charset_map:
// When shape_table is NULL, the charset_map indexes unichar_ids directly,
// and an iteration returns all samples of all chars in the charset_map, which
// is a subset of the full unicharset.
// The iteration will be in groups of the same unichar_id, in the order
// defined by the charset_map.
// GetCompactClassID returns the charset_map index of a sample, and
// GetSparseClassID returns the sample UNICHAR_ID.
//
// Non-NULL shape_table:
// With a shape_table, samples are grouped according to the shape_table, so
// multiple UNICHAR_IDs and fonts may be grouped together, and everything
// works in shape_ids.
//
// Non-NULL shape_table, NULL charset_map.
// Iterations simply run over the samples in the order of shape_id.
// GetCompactClassID and GetSparseClassID both return the shape_id.
// (If you want the unichar_id or font_id, the sample still has them.)
//
// Non-NULL shape_table, non-NULL charset_map.
// When shape_table is not NULL, the charset_map indexes and subsets shapes in
// the shape_table, and iterations will be in shape_table order, not
// charset_map order.
// GetCompactClassID returns the charset_map index of a shape, and
// GetSparseClassID returns the shape_id.
//
// =====What is SampleIterator good for?=====
// Inside a classifier training module, the SampleIterator has abstracted away
// all the different modes above.
// Use the following iteration to train your classifier:
// for (it.Begin(); !it.AtEnd(); it.Next()) {
// const TrainingSample& sample = it.GetSample();
// int class_id = it.GetCompactClassID();
// Your classifier may or may not be dealing with a shape_table, and may be
// dealing with some subset of the character/shape set. It doesn't need to
// know and shouldn't care. It is just learning shapes with compact class ids
// in the range [0, it.CompactCharsetSize()).
class SampleIterator {
public:
SampleIterator();
~SampleIterator();
void Clear();
// See class comment for arguments.
void Init(const IndexMapBiDi* charset_map,
const ShapeTable* shape_table,
bool randomize,
TrainingSampleSet* sample_set);
// Iterator functions designed for use with a simple for loop:
// for (it.Begin(); !it.AtEnd(); it.Next()) {
// const TrainingSample& sample = it.GetSample();
// int class_id = it.GetCompactClassID();
// ...
// }
void Begin();
bool AtEnd() const;
const TrainingSample& GetSample() const;
TrainingSample* MutableSample() const;
// Returns the total index (from the original set of samples) of the current
// sample.
int GlobalSampleIndex() const;
// Returns the index of the current sample in compact charset space, so
// in a 2-class problem between x and y, the returned indices will all be
// 0 or 1, and have nothing to do with the unichar_ids.
// If the charset_map_ is NULL, then this is equal to GetSparseClassID().
int GetCompactClassID() const;
// Returns the index of the current sample in sparse charset space, so
// in a 2-class problem between x and y, the returned indices will all be
// x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
// with a shape_table_.
int GetSparseClassID() const;
// Moves on to the next indexable sample. If the end is reached, leaves
// the state such that AtEnd() is true.
void Next();
// Returns the size of the compact charset space.
int CompactCharsetSize() const;
// Returns the size of the sparse charset space.
int SparseCharsetSize() const;
const IndexMapBiDi& charset_map() const {
return *charset_map_;
}
const ShapeTable* shape_table() const {
return shape_table_;
}
// Sample set operations.
const TrainingSampleSet* sample_set() const {
return sample_set_;
}
// A set of functions that do something to all the samples accessed by the
// iterator, as it is currently setup.
// Apply the supplied feature_space/feature_map transform to all samples
// accessed by this iterator.
void MapSampleFeatures(const IntFeatureMap& feature_map);
// Adjust the weights of all the samples to be uniform in the given charset.
// Returns the number of samples in the iterator.
int UniformSamples();
// Normalize the weights of all the samples defined by the iterator so they
// sum to 1. Returns the minimum assigned sample weight.
double NormalizeSamples();
private:
// Helper returns the current UnicharAndFont shape_entry.
const UnicharAndFonts* GetShapeEntry() const;
// Map to subset the actual charset space.
const IndexMapBiDi* charset_map_;
// Shape table to recombine character classes into shapes
const ShapeTable* shape_table_;
// The samples to iterate over.
TrainingSampleSet* sample_set_;
// Flag to control randomizing the sample features.
bool randomize_;
// Shape table owned by this used to iterate character classes.
ShapeTable* owned_shape_table_;
// Top-level iteration. Shape index in sparse charset_map space.
int shape_index_;
int num_shapes_;
// Index to the character class within a shape.
int shape_char_index_;
int num_shape_chars_;
// Index to the font within a shape/class pair.
int shape_font_index_;
int num_shape_fonts_;
// The lowest level iteration. sample_index_/num_samples_ counts samples
// in the current shape/class/font combination.
int sample_index_;
int num_samples_;
};
} // namespace tesseract.
#endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_