Refactored the classifier to make it easier to add new classifiers, and generalized the feature extractor to allow fx from grey

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@873 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-07 01:42:41 +08:00 · 2013-09-23 15:15:06 +00:00 · 2013-09-23 15:15:06 +00:00 · 99edf4ccbd
commit 99edf4ccbd
parent 2aafc9df24
48 changed files with 2192 additions and 1797 deletions
--- a/classify/Makefile.am
+++ b/classify/Makefile.am
@ -9,7 +9,7 @@ AM_CPPFLAGS += -DTESS_EXPORTS \
 endif

 noinst_HEADERS = \
-    adaptive.h baseline.h blobclass.h chartoname.h \
+    adaptive.h blobclass.h chartoname.h \
    classify.h cluster.h clusttool.h cutoffs.h \
    errorcounter.h extern.h extract.h \
    featdefs.h flexfx.h float2int.h fpoint.h fxdefs.h \
@ -19,7 +19,7 @@ noinst_HEADERS = \
    normfeat.h normmatch.h \
    ocrfeatures.h outfeat.h picofeat.h protos.h \
    sampleiterator.h shapeclassifier.h shapetable.h \
-    speckle.h tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h
+    tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h

 if !USING_MULTIPLELIBS
 noinst_LTLIBRARIES = libtesseract_classify.la
@ -45,7 +45,7 @@ libtesseract_classify_la_SOURCES = \
    mastertrainer.cpp mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \
    normfeat.cpp normmatch.cpp \
    ocrfeatures.cpp outfeat.cpp picofeat.cpp protos.cpp \
-    sampleiterator.cpp shapetable.cpp speckle.cpp \
+    sampleiterator.cpp shapeclassifier.cpp shapetable.cpp \
    tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp xform2d.cpp


--- a/classify/adaptmatch.cpp
+++ b/classify/adaptmatch.cpp
--- a/classify/baseline.h
+++ b/classify/baseline.h
@ -1,41 +0,0 @@
-/* -*-C-*-
- ********************************************************************************
- *
- * File:        baseline.h  (Formerly baseline.h)
- * Description:
- * Author:       Mark Seaman, SW Productivity
- * Created:      Fri Oct 16 14:37:00 1987
- * Modified:     Wed Feb 27 13:39:35 1991 (Mark Seaman) marks@hpgrlt
- * Language:     C
- * Package:      N/A
- * Status:       Reusable Software Component
- *
- * (c) Copyright 1987, Hewlett-Packard Company.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- *************************************************************************/
-#ifndef BASELINE_H
-#define BASELINE_H
-
-/*----------------------------------------------------------------------
-              I n c l u d e s
----------------------------------------------------------------------*/
-#include "host.h"
-#include "blobs.h"
-#include "params.h"
-
-/*----------------------------------------------------------------------
-              T y p e s
----------------------------------------------------------------------*/
-#define BASELINE_OFFSET 64
-#define BASELINE_SCALE  128
-
-#endif
--- a/classify/blobclass.cpp
+++ b/classify/blobclass.cpp
@ -49,8 +49,11 @@ extern char imagefile[];
 ----------------------------------------------------------------------------**/

 /*---------------------------------------------------------------------------*/
+// As with all TBLOBs, Blob is in baseline normalized coords.
+// See SetupBLCNDenorms in intfx.cpp for other args.
 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
-               TBLOB * Blob, const DENORM& denorm, const char* BlobText) {
+               TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
+               const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
 /*
 **      Parameters:
 **              Blob            blob whose micro-features are to be learned
@ -95,18 +98,20 @@ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
    cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
  }

-  LearnBlob(FeatureDefs, FeatureFile, Blob, denorm, BlobText,
-            CurrFontName.string());
+  LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
+            BlobText, CurrFontName.string());
 }                                // LearnBlob

 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
-               TBLOB* Blob, const DENORM& denorm,
+               TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
+               const INT_FX_RESULT_STRUCT& fx_info,
               const char* BlobText, const char* FontName) {
  CHAR_DESC CharDesc;

  ASSERT_HOST(FeatureFile != NULL);

-  CharDesc = ExtractBlobFeatures(FeatureDefs, denorm, Blob);
+  CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
+                                 Blob);
  if (CharDesc == NULL) {
    cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
    return;
--- a/classify/blobclass.h
+++ b/classify/blobclass.h
@ -40,11 +40,14 @@
          Public Function Prototypes
 ----------------------------------------------------------------------------**/
 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
-               TBLOB * Blob, const DENORM& denorm, const char* BlobText);
+               TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
+               const INT_FX_RESULT_STRUCT& fx_info,
+               const char* BlobText);

 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* File, TBLOB* Blob,
-               const DENORM& denorm, const char* BlobText,
-               const char* FontName);
+               const DENORM& bl_denorm, const DENORM& cn_denorm,
+               const INT_FX_RESULT_STRUCT& fx_info,
+               const char* BlobText, const char* FontName);

 /**----------------------------------------------------------------------------
        Global Data Definitions and Declarations
--- a/classify/classify.cpp
+++ b/classify/classify.cpp
@ -26,6 +26,7 @@
 #include "intproto.h"
 #include "mfoutline.h"
 #include "scrollview.h"
+#include "shapeclassifier.h"
 #include "shapetable.h"
 #include "unicity_table.h"
 #include <string.h>
@ -52,6 +53,11 @@ Classify::Classify()
                  this->params()),  /* PREV DEFAULT 0.1 */
    double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...",
                  this->params()),  /* PREV DEFAULT 0.3 */
+    double_MEMBER(classify_max_rating_ratio, 1.5,
+                  "Veto ratio between classifier ratings", this->params()),
+    double_MEMBER(classify_max_certainty_margin, 5.5,
+                  "Veto difference between classifier certainties",
+                  this->params()),
    BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
                this->params()),
    BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
@ -65,6 +71,8 @@ Classify::Classify()
               "Save adapted templates to a file", this->params()),
    BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
                this->params()),
+    BOOL_MEMBER(classify_nonlinear_norm, 0,
+                "Non-linear stroke-density normalization", this->params()),
    INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
    INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
    INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
@ -100,6 +108,12 @@ Classify::Classify()
                  this->params()),
    double_MEMBER(tessedit_class_miss_scale, 0.00390625,
                  "Scale factor for features not used", this->params()),
+    double_MEMBER(classify_adapted_pruning_factor, 2.5,
+                  "Prune poor adapted results this much worse than best result",
+                  this->params()),
+    double_MEMBER(classify_adapted_pruning_threshold, -1.0,
+                  "Threshold at which classify_adapted_pruning_factor starts",
+                  this->params()),
    INT_MEMBER(classify_adapt_proto_threshold, 230,
               "Threshold for good protos during adaptive 0-255",
               this->params()),
@ -122,19 +136,24 @@ Classify::Classify()
                  this->params()),
    INT_MEMBER(classify_class_pruner_threshold, 229,
               "Class Pruner Threshold 0-255", this->params()),
-    INT_MEMBER(classify_class_pruner_multiplier, 30,
+    INT_MEMBER(classify_class_pruner_multiplier, 15,
               "Class Pruner Multiplier 0-255:       ", this->params()),
    INT_MEMBER(classify_cp_cutoff_strength, 7,
               "Class Pruner CutoffStrength:         ", this->params()),
-    INT_MEMBER(classify_integer_matcher_multiplier, 14,
+    INT_MEMBER(classify_integer_matcher_multiplier, 10,
               "Integer Matcher Multiplier  0-255:   ", this->params()),
    EnableLearning(true),
    INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
               this->params()),
    BOOL_MEMBER(classify_bln_numeric_mode, 0,
                "Assume the input is numbers [0-9].", this->params()),
+    double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
+                  this->params()),
+    double_MEMBER(speckle_rating_penalty, 10.0,
+                  "Penalty to add to worst rating for noise", this->params()),
    shape_table_(NULL),
-    dict_(&image_) {
+    dict_(&image_),
+    static_classifier_(NULL) {
  fontinfo_table_.set_compare_callback(
      NewPermanentTessCallback(CompareFontInfo));
  fontinfo_table_.set_clear_callback(
@ -184,4 +203,45 @@ Classify::~Classify() {
  delete[] BaselineCutoffs;
 }

+
+// Takes ownership of the given classifier, and uses it for future calls
+// to CharNormClassifier.
+void Classify::SetStaticClassifier(ShapeClassifier* static_classifier) {
+  delete static_classifier_;
+  static_classifier_ = static_classifier;
+}
+
+// Moved from speckle.cpp
+// Adds a noise classification result that is a bit worse than the worst
+// current result, or the worst possible result if no current results.
+void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {
+    BLOB_CHOICE_IT bc_it(choices);
+  // If there is no classifier result, we will use the worst possible certainty
+  // and corresponding rating.
+  float certainty = -getDict().certainty_scale;
+  float rating = rating_scale * blob_length;
+  if (!choices->empty() && blob_length > 0) {
+    bc_it.move_to_last();
+    BLOB_CHOICE* worst_choice = bc_it.data();
+    // Add speckle_rating_penalty to worst rating, matching old value.
+    rating = worst_choice->rating() + speckle_rating_penalty;
+    // Compute the certainty to correspond to the rating. (Used to be kept
+    // the same, but that messes up the language model search.)
+    certainty = -rating * getDict().certainty_scale /
+        (rating_scale * blob_length);
+  }
+  BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
+                                             -1, -1, 0, 0, MAX_FLOAT32, 0,
+                                             BCC_SPECKLE_CLASSIFIER);
+  bc_it.add_to_end(blob_choice);
+}
+
+// Returns true if the blob is small enough to be a large speckle.
+bool Classify::LargeSpeckle(const TBLOB &blob) {
+  double speckle_size = kBlnXHeight * speckle_large_max_size;
+  TBOX bbox = blob.bounding_box();
+  return bbox.width() < speckle_size && bbox.height() < speckle_size;
+}
+
+
 }  // namespace tesseract
--- a/classify/classify.h
+++ b/classify/classify.h
@ -43,8 +43,10 @@ static const int kBlankFontinfoId = -2;

 namespace tesseract {

+class ShapeClassifier;
 struct ShapeRating;
 class ShapeTable;
+struct UnicharRating;

 // How segmented is a blob. In this enum, character refers to a classifiable
 // unit, but that is too long and character is usually easier to understand.
@ -67,6 +69,17 @@ class Classify : public CCStruct {
    return shape_table_;
  }

+  // Takes ownership of the given classifier, and uses it for future calls
+  // to CharNormClassifier.
+  void SetStaticClassifier(ShapeClassifier* static_classifier);
+
+  // Adds a noise classification result that is a bit worse than the worst
+  // current result, or the worst possible result if no current results.
+  void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices);
+
+  // Returns true if the blob is small enough to be a large speckle.
+  bool LargeSpeckle(const TBLOB &blob);
+
  /* adaptive.cpp ************************************************************/
  ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
  int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
@ -112,9 +125,7 @@ class Classify : public CCStruct {
  // incorrectly segmented blobs. If filename is not NULL, then LearnBlob
  // is called and the data will be written to a file for static training.
  // Otherwise AdaptToBlob is called for adaption within a document.
-  // If rejmap is not NULL, then only chars with a rejmap entry of '1' will
-  // be learned, otherwise all chars with good correct_text are learned.
-  void LearnWord(const char* filename, const char *rejmap, WERD_RES *word);
+  void LearnWord(const char* filename, WERD_RES *word);

  // Builds a blob of length fragments, from the word, starting at start,
  // and then learn it, as having the given correct_text.
@ -130,18 +141,15 @@ class Classify : public CCStruct {
                   const char* correct_text, WERD_RES *word);
  void InitAdaptiveClassifier(bool load_pre_trained_templates);
  void InitAdaptedClass(TBLOB *Blob,
-                        const DENORM& denorm,
                        CLASS_ID ClassId,
                        int FontinfoId,
                        ADAPT_CLASS Class,
                        ADAPT_TEMPLATES Templates);
  void AdaptToPunc(TBLOB *Blob,
-                   const DENORM& denorm,
                   CLASS_ID ClassId,
                   int FontinfoId,
                   FLOAT32 Threshold);
  void AmbigClassifier(TBLOB *Blob,
-                       const DENORM& denorm,
                       INT_TEMPLATES Templates,
                       ADAPT_CLASS *Classes,
                       UNICHAR_ID *Ambiguities,
@ -194,15 +202,8 @@ class Classify : public CCStruct {

 #ifndef GRAPHICS_DISABLED
  void DebugAdaptiveClassifier(TBLOB *Blob,
-                               const DENORM& denorm,
                               ADAPT_RESULTS *Results);
 #endif
-  void GetAdaptThresholds (TWERD * Word,
-                           const DENORM& denorm,
-                           const WERD_CHOICE& BestChoice,
-                           const WERD_CHOICE& BestRawChoice,
-                           FLOAT32 Thresholds[]);
-
  PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
                             int NumBadFeat,
                             FEATURE_ID BadFeat[],
@ -218,19 +219,14 @@ class Classify : public CCStruct {
  void MakePermanent(ADAPT_TEMPLATES Templates,
                     CLASS_ID ClassId,
                     int ConfigId,
-                     const DENORM& denorm,
                     TBLOB *Blob);
  void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
  void RemoveExtraPuncs(ADAPT_RESULTS *Results);
  void RemoveBadMatches(ADAPT_RESULTS *Results);
  void SetAdaptiveThreshold(FLOAT32 Threshold);
-  void ShowBestMatchFor(TBLOB *Blob,
-                        const DENORM& denorm,
-                        CLASS_ID ClassId,
-                        int shape_id,
-                        BOOL8 AdaptiveOn,
-                        BOOL8 PreTrainedOn,
-                        ADAPT_RESULTS *Results);
+  void ShowBestMatchFor(int shape_id,
+                        const INT_FEATURE_STRUCT* features,
+                        int num_features);
  // Returns a string for the classifier class_id: either the corresponding
  // unicharset debug_str or the shape_table_ debug str.
  STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
@ -251,59 +247,46 @@ class Classify : public CCStruct {
  // unichar-id!). Uses a search, so not fast.
  int ShapeIDToClassID(int shape_id) const;
  UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
-                                 const DENORM& denorm,
                                 ADAPT_TEMPLATES Templates,
                                 ADAPT_RESULTS *Results);
  int CharNormClassifier(TBLOB *Blob,
-                         const DENORM& denorm,
                         INT_TEMPLATES Templates,
                         ADAPT_RESULTS *Results);

  // As CharNormClassifier, but operates on a TrainingSample and outputs to
  // a GenericVector of ShapeRating without conversion to classes.
-  int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample,
-                             GenericVector<ShapeRating>* results);
-  UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
-                             const DENORM& denorm,
-                             CLASS_ID CorrectClass);
-  void DoAdaptiveMatch(TBLOB *Blob,
-                       const DENORM& denorm,
-                       ADAPT_RESULTS *Results);
+  int CharNormTrainingSample(bool pruner_only, int keep_this,
+                             const TrainingSample& sample,
+                             GenericVector<UnicharRating>* results);
+  UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass);
+  void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results);
  void AdaptToChar(TBLOB *Blob,
-                   const DENORM& denorm,
                   CLASS_ID ClassId,
                   int FontinfoId,
                   FLOAT32 Threshold);
-  void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm,
-                          INT_CLASS_STRUCT* int_class);
-  int AdaptableWord(TWERD *Word,
-                  const WERD_CHOICE &BestChoiceWord,
-                  const WERD_CHOICE &RawChoiceWord);
+  void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class);
+  bool AdaptableWord(WERD_RES* word);
  void EndAdaptiveClassifier();
  void PrintAdaptiveStatistics(FILE *File);
  void SettupPass1();
  void SettupPass2();
  void AdaptiveClassifier(TBLOB *Blob,
-                          const DENORM& denorm,
                          BLOB_CHOICE_LIST *Choices,
                          CLASS_PRUNER_RESULTS cp_results);
  void ClassifyAsNoise(ADAPT_RESULTS *Results);
  void ResetAdaptiveClassifierInternal();

  int GetBaselineFeatures(TBLOB *Blob,
-                          const DENORM& denorm,
                          INT_TEMPLATES Templates,
                          INT_FEATURE_ARRAY IntFeatures,
                          uinT8* CharNormArray,
                          inT32 *BlobLength);
  int GetCharNormFeatures(TBLOB *Blob,
-                          const DENORM& denorm,
                          INT_TEMPLATES Templates,
                          INT_FEATURE_ARRAY IntFeatures,
                          uinT8* PrunerNormArray,
                          uinT8* CharNormArray,
-                          inT32 *BlobLength,
-                          inT32 *FeatureOutlineIndex);
+                          inT32 *BlobLength);
  // Computes the char_norm_array for the unicharset and, if not NULL, the
  // pruner_array as appropriate according to the existence of the shape_table.
  // The norm_feature is deleted as it is almost certainly no longer needed.
@ -313,13 +296,54 @@ class Classify : public CCStruct {
                             uinT8* pruner_array);

  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
-  void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob);
+  void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);

  void ResetFeaturesHaveBeenExtracted();
  bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
-  bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob);
+  bool LooksLikeGarbage(TBLOB *blob);
  void RefreshDebugWindow(ScrollView **win, const char *msg,
                          int y_offset, const TBOX &wbox);
+  // intfx.cpp
+  // Computes the DENORMS for bl(baseline) and cn(character) normalization
+  // during feature extraction. The input denorm describes the current state
+  // of the blob, which is usually a baseline-normalized word.
+  // The transforms that are set up are as follows:
+  // Baseline Normalized (bl) Output:
+  //   We center the grapheme by aligning the x-coordinate of its centroid with
+  //   x=128 and leaving the already-baseline-normalized y as-is.
+  //
+  // Character Normalized (cn) Output:
+  //   We align the grapheme's centroid at the origin and scale it
+  //   asymmetrically in x and y so that the 2nd moments are a standard value
+  //   (51.2) ie the result is vaguely square.
+  // If classify_nonlinear_norm is true:
+  //   A non-linear normalization is set up that attempts to evenly distribute
+  //   edges across x and y.
+  //
+  // Some of the fields of fx_info are also setup:
+  // Length: Total length of outline.
+  // Rx:     Rounded y second moment. (Reversed by convention.)
+  // Ry:     Rounded x second moment.
+  // Xmean:  Rounded x center of mass of the blob.
+  // Ymean:  Rounded y center of mass of the blob.
+  static void SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm,
+                               DENORM* bl_denorm, DENORM* cn_denorm,
+                               INT_FX_RESULT_STRUCT* fx_info);
+
+  // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
+  // (x,y) position and angle as measured counterclockwise from the vector
+  // <-1, 0>, from blob using two normalizations defined by bl_denorm and
+  // cn_denorm. See SetupBLCNDenorms for definitions.
+  // If outline_cn_counts is not NULL, on return it contains the cumulative
+  // number of cn features generated for each outline in the blob (in order).
+  // Thus after the first outline, there were (*outline_cn_counts)[0] features,
+  // after the second outline, there were (*outline_cn_counts)[1] features etc.
+  static void ExtractFeatures(const TBLOB& blob,
+                              bool nonlinear_norm,
+                              GenericVector<INT_FEATURE_STRUCT>* bl_features,
+                              GenericVector<INT_FEATURE_STRUCT>* cn_features,
+                              INT_FX_RESULT_STRUCT* results,
+                              GenericVector<int>* outline_cn_counts);
  /* float2int.cpp ************************************************************/
  void ClearCharNormArray(uinT8* char_norm_array);
  void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
@ -336,6 +360,9 @@ class Classify : public CCStruct {
  UnicityTable<FontInfo>& get_fontinfo_table() {
    return fontinfo_table_;
  }
+  const UnicityTable<FontInfo>& get_fontinfo_table() const {
+    return fontinfo_table_;
+  }
  UnicityTable<FontSet>& get_fontset_table() {
    return fontset_table_;
  }
@ -365,6 +392,10 @@ class Classify : public CCStruct {
  double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
  double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
  double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
+  double_VAR_H(classify_max_rating_ratio, 1.5,
+               "Veto ratio between classifier ratings");
+  double_VAR_H(classify_max_certainty_margin, 5.5,
+               "Veto difference between classifier certainties");

  /* adaptmatch.cpp ***********************************************************/
  BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
@ -375,6 +406,8 @@ class Classify : public CCStruct {
  BOOL_VAR_H(classify_save_adapted_templates, 0,
             "Save adapted templates to a file");
  BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
+  BOOL_VAR_H(classify_nonlinear_norm, 0,
+             "Non-linear stroke-density normalization");
  INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
  INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
  INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
@ -398,6 +431,10 @@ class Classify : public CCStruct {
  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
  double_VAR_H(tessedit_class_miss_scale, 0.00390625,
               "Scale factor for features not used");
+  double_VAR_H(classify_adapted_pruning_factor, 2.5,
+               "Prune poor adapted results this much worse than best result");
+  double_VAR_H(classify_adapted_pruning_threshold, -1.0,
+               "Threshold at which classify_adapted_pruning_factor starts");
  INT_VAR_H(classify_adapt_proto_threshold, 230,
            "Threshold for good protos during adaptive 0-255");
  INT_VAR_H(classify_adapt_feature_threshold, 230,
@ -418,11 +455,11 @@ class Classify : public CCStruct {
  /* intmatcher.cpp **********************************************************/
  INT_VAR_H(classify_class_pruner_threshold, 229,
            "Class Pruner Threshold 0-255");
-  INT_VAR_H(classify_class_pruner_multiplier, 30,
+  INT_VAR_H(classify_class_pruner_multiplier, 15,
            "Class Pruner Multiplier 0-255:       ");
  INT_VAR_H(classify_cp_cutoff_strength, 7,
            "Class Pruner CutoffStrength:         ");
-  INT_VAR_H(classify_integer_matcher_multiplier, 14,
+  INT_VAR_H(classify_integer_matcher_multiplier, 10,
            "Integer Matcher Multiplier  0-255:   ");

  // Use class variables to hold onto built-in templates and adapted templates.
@ -453,6 +490,9 @@ class Classify : public CCStruct {
  INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
  BOOL_VAR_H(classify_bln_numeric_mode, 0,
             "Assume the input is numbers [0-9].");
+  double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size");
+  double_VAR_H(speckle_rating_penalty, 10.0,
+               "Penalty to add to worst rating for noise");

 protected:
  IntegerMatcher im_;
@ -466,6 +506,8 @@ class Classify : public CCStruct {
 private:

  Dict dict_;
+  // The currently active static classifier.
+  ShapeClassifier* static_classifier_;

  /* variables used to hold performance statistics */
  int AdaptiveMatcherCalls;
--- a/classify/cluster.cpp
+++ b/classify/cluster.cpp
@ -15,11 +15,12 @@
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
-#include "oldheap.h"
 #include "const.h"
 #include "cluster.h"
 #include "emalloc.h"
+#include "genericheap.h"
 #include "helpers.h"
+#include "kdpair.h"
 #include "matrix.h"
 #include "tprintf.h"
 #include "danerror.h"
@ -164,6 +165,9 @@ struct TEMPCLUSTER {
  CLUSTER *Neighbor;
 };

+typedef tesseract::KDPairInc<float, TEMPCLUSTER*> ClusterPair;
+typedef tesseract::GenericHeap<ClusterPair> ClusterHeap;
+
 struct STATISTICS {
  FLOAT32 AvgVariance;
  FLOAT32 *CoVariance;
@ -190,7 +194,7 @@ struct CHISTRUCT{

 // For use with KDWalk / MakePotentialClusters
 struct ClusteringContext {
-  HEAP *heap;  // heap used to hold temp clusters, "best" on top
+  ClusterHeap *heap;  // heap used to hold temp clusters, "best" on top
  TEMPCLUSTER *candidates;  // array of potential clusters
  KDTREE *tree;  // kd-tree to be searched for neighbors
  inT32 next;  // next candidate to be used
@ -693,7 +697,7 @@ History:	5/29/89, DSJ, Created.
 ******************************************************************************/
 void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
-  HEAPENTRY HeapEntry;
+  ClusterPair HeapEntry;
  TEMPCLUSTER *PotentialCluster;

  // each sample and its nearest neighbor form a "potential" cluster
@ -702,12 +706,12 @@ void CreateClusterTree(CLUSTERER *Clusterer) {
  context.candidates = (TEMPCLUSTER *)
    Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
  context.next = 0;
-  context.heap = MakeHeap(Clusterer->NumberOfSamples);
+  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
-  while (GetTopOfHeap(context.heap, &HeapEntry) != EMPTY) {
-    PotentialCluster = (TEMPCLUSTER *)HeapEntry.Data;
+  while (context.heap->Pop(&HeapEntry)) {
+    PotentialCluster = HeapEntry.data;

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
@ -720,9 +724,9 @@ void CreateClusterTree(CLUSTERER *Clusterer) {
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
        FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
-                            &HeapEntry.Key);
+                            &HeapEntry.key);
      if (PotentialCluster->Neighbor != NULL) {
-        HeapStore(context.heap, &HeapEntry);
+        context.heap->Push(&HeapEntry);
      }
    }

@ -732,9 +736,9 @@ void CreateClusterTree(CLUSTERER *Clusterer) {
          MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
-                              &HeapEntry.Key);
+                              &HeapEntry.key);
      if (PotentialCluster->Neighbor != NULL) {
-        HeapStore(context.heap, &HeapEntry);
+        context.heap->Push(&HeapEntry);
      }
    }
  }
@ -745,7 +749,7 @@ void CreateClusterTree(CLUSTERER *Clusterer) {
  // free up the memory used by the K-D tree, heap, and temp clusters
  FreeKDTree(context.tree);
  Clusterer->KDTree = NULL;
-  FreeHeap(context.heap);
+  delete context.heap;
  memfree(context.candidates);
 }                                // CreateClusterTree

@ -763,16 +767,16 @@ void CreateClusterTree(CLUSTERER *Clusterer) {
 ******************************************************************************/
 void MakePotentialClusters(ClusteringContext *context,
                           CLUSTER *Cluster, inT32 Level) {
-  HEAPENTRY HeapEntry;
+  ClusterPair HeapEntry;
  int next = context->next;
  context->candidates[next].Cluster = Cluster;
-  HeapEntry.Data = (char *) &(context->candidates[next]);
+  HeapEntry.data = &(context->candidates[next]);
  context->candidates[next].Neighbor =
      FindNearestNeighbor(context->tree,
                          context->candidates[next].Cluster,
-                          &HeapEntry.Key);
+                          &HeapEntry.key);
  if (context->candidates[next].Neighbor != NULL) {
-    HeapStore(context->heap, &HeapEntry);
+    context->heap->Push(&HeapEntry);
    context->next++;
  }
 }                                // MakePotentialClusters
--- a/classify/errorcounter.cpp
+++ b/classify/errorcounter.cpp
@ -27,6 +27,9 @@

 namespace tesseract {

+// Difference in result rating to be thought of as an "equal" choice.
+const double kRatingEpsilon = 1.0 / 32;
+
 // Tests a classifier, computing its error rate.
 // See errorcounter.h for description of arguments.
 // Iterates over the samples, calling the classifier in normal/silent mode.
@ -35,14 +38,12 @@ namespace tesseract {
 // with a debug flag and a keep_this argument to find out what is going on.
 double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier,
    int report_level, CountTypes boosting_mode,
-    const UnicityTable<FontInfo>& fontinfo_table,
+    const FontInfoTable& fontinfo_table,
    const GenericVector<Pix*>& page_images, SampleIterator* it,
    double* unichar_error,  double* scaled_error, STRING* fonts_report) {
-  int charsetsize = it->shape_table()->unicharset().size();
-  int shapesize = it->CompactCharsetSize();
  int fontsize = it->sample_set()->NumFonts();
-  ErrorCounter counter(charsetsize, shapesize, fontsize);
-  GenericVector<ShapeRating> results;
+  ErrorCounter counter(classifier->GetUnicharset(), fontsize);
+  GenericVector<UnicharRating> results;

  clock_t start = clock();
  int total_samples = 0;
@ -56,21 +57,28 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier,
    Pix* page_pix = 0 <= page_index && page_index < page_images.size()
                  ? page_images[page_index] : NULL;
    // No debug, no keep this.
-    classifier->ClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID,
-                               &results);
-    if (mutable_sample->class_id() == 0) {
+    classifier->UnicharClassifySample(*mutable_sample, page_pix, 0,
+                                      INVALID_UNICHAR_ID, &results);
+    bool debug_it = false;
+    int correct_id = mutable_sample->class_id();
+    if (counter.unicharset_.has_special_codes() &&
+        (correct_id == UNICHAR_SPACE || correct_id == UNICHAR_JOINED ||
+         correct_id == UNICHAR_BROKEN)) {
      // This is junk so use the special counter.
-      counter.AccumulateJunk(*it->shape_table(), results, mutable_sample);
-    } else if (counter.AccumulateErrors(report_level > 3, boosting_mode,
-                                        fontinfo_table, *it->shape_table(),
-                                        results, mutable_sample) &&
-               error_samples > 0) {
+      debug_it = counter.AccumulateJunk(report_level > 3,
+                                        results,
+                                        mutable_sample);
+    } else {
+      debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode,
+                                          fontinfo_table,
+                                          results, mutable_sample);
+    }
+    if (debug_it && error_samples > 0) {
      // Running debug, keep the correct answer, and debug the classifier.
-      tprintf("Error on sample %d: Classifier debug output:\n",
-              it->GlobalSampleIndex());
-      int keep_this = it->GetSparseClassID();
-      classifier->ClassifySample(*mutable_sample, page_pix, 1, keep_this,
-                                 &results);
+      tprintf("Error on sample %d: %s Classifier debug output:\n",
+              it->GlobalSampleIndex(),
+              it->sample_set()->SampleToString(*mutable_sample).string());
+      classifier->DebugDisplay(*mutable_sample, page_pix, correct_id);
      --error_samples;
    }
    ++total_samples;
@ -89,12 +97,70 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier,
  return unscaled_error;
 }

+// Tests a pair of classifiers, debugging errors of the new against the old.
+// See errorcounter.h for description of arguments.
+// Iterates over the samples, calling the classifiers in normal/silent mode.
+// If the new_classifier makes a boosting_mode error that the old_classifier
+// does not, it will then call the new_classifier again with a debug flag
+// and a keep_this argument to find out what is going on.
+void ErrorCounter::DebugNewErrors(
+    ShapeClassifier* new_classifier, ShapeClassifier* old_classifier,
+    CountTypes boosting_mode,
+    const FontInfoTable& fontinfo_table,
+    const GenericVector<Pix*>& page_images, SampleIterator* it) {
+  int fontsize = it->sample_set()->NumFonts();
+  ErrorCounter old_counter(old_classifier->GetUnicharset(), fontsize);
+  ErrorCounter new_counter(new_classifier->GetUnicharset(), fontsize);
+  GenericVector<UnicharRating> results;
+
+  int total_samples = 0;
+  int error_samples = 25;
+  int total_new_errors = 0;
+  // Iterate over all the samples, accumulating errors.
+  for (it->Begin(); !it->AtEnd(); it->Next()) {
+    TrainingSample* mutable_sample = it->MutableSample();
+    int page_index = mutable_sample->page_num();
+    Pix* page_pix = 0 <= page_index && page_index < page_images.size()
+                  ? page_images[page_index] : NULL;
+    // No debug, no keep this.
+    old_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0,
+                                          INVALID_UNICHAR_ID, &results);
+    int correct_id = mutable_sample->class_id();
+    if (correct_id != 0 &&
+        !old_counter.AccumulateErrors(true, boosting_mode, fontinfo_table,
+                                      results, mutable_sample)) {
+      // old classifier was correct, check the new one.
+      new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0,
+                                            INVALID_UNICHAR_ID, &results);
+      if (correct_id != 0 &&
+          new_counter.AccumulateErrors(true, boosting_mode, fontinfo_table,
+                                        results, mutable_sample)) {
+        tprintf("New Error on sample %d: Classifier debug output:\n",
+                it->GlobalSampleIndex());
+        ++total_new_errors;
+        new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 1,
+                                              correct_id, &results);
+        if (results.size() > 0 && error_samples > 0) {
+          new_classifier->DebugDisplay(*mutable_sample, page_pix, correct_id);
+          --error_samples;
+        }
+      }
+    }
+    ++total_samples;
+  }
+  tprintf("Total new errors = %d\n", total_new_errors);
+}
+
 // Constructor is private. Only anticipated use of ErrorCounter is via
 // the static ComputeErrorRate.
-ErrorCounter::ErrorCounter(int charsetsize, int shapesize, int fontsize)
-  : scaled_error_(0.0), unichar_counts_(charsetsize, shapesize, 0) {
+ErrorCounter::ErrorCounter(const UNICHARSET& unicharset, int fontsize)
+  : scaled_error_(0.0), rating_epsilon_(kRatingEpsilon),
+    unichar_counts_(unicharset.size(), unicharset.size(), 0),
+    ok_score_hist_(0, 101), bad_score_hist_(0, 101),
+    unicharset_(unicharset) {
  Counts empty_counts;
  font_counts_.init_to_size(fontsize, empty_counts);
+  multi_unichar_counts_.init_to_size(unicharset.size(), 0);
 }
 ErrorCounter::~ErrorCounter() {
 }
@ -107,13 +173,11 @@ ErrorCounter::~ErrorCounter() {
 // for error counting and shape_table is used to understand the relationship
 // between unichar_ids and shape_ids in the results
 bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
-                                    const UnicityTable<FontInfo>& font_table,
-                                    const ShapeTable& shape_table,
-                                    const GenericVector<ShapeRating>& results,
+                                    const FontInfoTable& font_table,
+                                    const GenericVector<UnicharRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
-  int res_index = 0;
-  bool debug_it = false;
+  int answer_actual_rank = -1;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
@ -123,107 +187,143 @@ bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
-  } else if (shape_table.GetShape(results[0].shape_id).
-          ContainsUnicharAndFont(unichar_id, font_id)) {
-    ++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT];
-    // Unichar and font OK, but count if multiple unichars.
-    if (shape_table.GetShape(results[0].shape_id).size() > 1)
-      ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
  } else {
-    // This is a top shape error.
-    ++font_counts_[font_id].n[CT_SHAPE_TOP_ERR];
-    // Check to see if any font in the top choice has attributes that match.
-    bool attributes_match = false;
-    uinT32 font_props = font_table.get(font_id).properties;
-    const Shape& shape = shape_table.GetShape(results[0].shape_id);
-    for (int c = 0; c < shape.size() && !attributes_match; ++c) {
-      for (int f = 0; f < shape[c].font_ids.size(); ++f) {
-        if (font_table.get(shape[c].font_ids[f]).properties == font_props) {
-          attributes_match = true;
-          break;
-        }
+    // Find rank of correct unichar answer, using rating_epsilon_ to allow
+    // different answers to score as equal. (Ignoring the font.)
+    int epsilon_rank = 0;
+    int answer_epsilon_rank = -1;
+    int num_top_answers = 0;
+    double prev_rating = results[0].rating;
+    bool joined = false;
+    bool broken = false;
+    int res_index = 0;
+    while (res_index < num_results) {
+      if (results[res_index].rating < prev_rating - rating_epsilon_) {
+        ++epsilon_rank;
+        prev_rating = results[res_index].rating;
      }
-    }
-    // TODO(rays) It is easy to add counters for individual font attributes
-    // here if we want them.
-    if (!attributes_match)
-      ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
-    if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true);
-    // Find rank of correct unichar answer. (Ignoring the font.)
-    while (res_index < num_results &&
-           !shape_table.GetShape(results[res_index].shape_id).
-                ContainsUnichar(unichar_id)) {
+      if (results[res_index].unichar_id == unichar_id &&
+          answer_epsilon_rank < 0) {
+        answer_epsilon_rank = epsilon_rank;
+        answer_actual_rank = res_index;
+      }
+      if (results[res_index].unichar_id == UNICHAR_JOINED &&
+          unicharset_.has_special_codes())
+        joined = true;
+      else if (results[res_index].unichar_id == UNICHAR_BROKEN &&
+               unicharset_.has_special_codes())
+        broken = true;
+      else if (epsilon_rank == 0)
+        ++num_top_answers;
      ++res_index;
    }
-    if (res_index == 0) {
+    if (answer_actual_rank != 0) {
+      // Correct result is not absolute top.
+      ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR];
+      if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) sample->set_is_error(true);
+    }
+    if (answer_epsilon_rank == 0) {
+      ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK];
      // Unichar OK, but count if multiple unichars.
-      if (shape_table.GetShape(results[res_index].shape_id).size() > 1) {
+      if (num_top_answers > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
+        ++multi_unichar_counts_[unichar_id];
+      }
+      // Check to see if any font in the top choice has attributes that match.
+      // TODO(rays) It is easy to add counters for individual font attributes
+      // here if we want them.
+      if (font_table.SetContainsFontProperties(
+          font_id, results[answer_actual_rank].fonts)) {
+        // Font attributes were matched.
+        // Check for multiple properties.
+        if (font_table.SetContainsMultipleFontProperties(
+            results[answer_actual_rank].fonts))
+          ++font_counts_[font_id].n[CT_OK_MULTI_FONT];
+      } else {
+        // Font attributes weren't matched.
+        ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
      }
    } else {
-      // Count maps from unichar id to shape id.
-      if (num_results > 0)
-        ++unichar_counts_(unichar_id, results[0].shape_id);
-      // This is a unichar error.
+      // This is a top unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
-      if (res_index >= MIN(2, num_results)) {
+      // Count maps from unichar id to wrong unichar id.
+      ++unichar_counts_(unichar_id, results[0].unichar_id);
+      if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
-      if (res_index >= num_results) {
+      if (answer_epsilon_rank < 0) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
-        debug_it = debug;
+        answer_epsilon_rank = epsilon_rank;
      }
    }
+    // Compute mean number of return values and mean rank of correct answer.
+    font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
+    font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank;
+    if (joined)
+      ++font_counts_[font_id].n[CT_OK_JOINED];
+    if (broken)
+      ++font_counts_[font_id].n[CT_OK_BROKEN];
  }
-  // Compute mean number of return values and mean rank of correct answer.
-  font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
-  font_counts_[font_id].n[CT_RANK] += res_index;
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
-  }
-  if (debug_it) {
-    tprintf("%d results for char %s font %d :",
-            num_results, shape_table.unicharset().id_to_unichar(unichar_id),
-            font_id);
-    for (int i = 0; i < num_results; ++i) {
-      tprintf(" %.3f/%.3f:%s",
-              results[i].rating, results[i].font,
-              shape_table.DebugStr(results[i].shape_id).string());
+    if (debug) {
+      tprintf("%d results for char %s font %d :",
+              num_results, unicharset_.id_to_unichar(unichar_id),
+              font_id);
+      for (int i = 0; i < num_results; ++i) {
+        tprintf(" %.3f : %s\n",
+                results[i].rating,
+                unicharset_.id_to_unichar(results[i].unichar_id));
+      }
+      return true;
    }
-    tprintf("\n");
-    return true;
+    int percent = 0;
+    if (num_results > 0)
+      percent = IntCastRounded(results[0].rating * 100);
+    bad_score_hist_.add(percent, 1);
+  } else {
+    int percent = 0;
+    if (answer_actual_rank >= 0)
+      percent = IntCastRounded(results[answer_actual_rank].rating * 100);
+    ok_score_hist_.add(percent, 1);
  }
  return false;
 }

 // Accumulates counts for junk. Counts only whether the junk was correctly
 // rejected or not.
-void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table,
-                                  const GenericVector<ShapeRating>& results,
+bool ErrorCounter::AccumulateJunk(bool debug,
+                                  const GenericVector<UnicharRating>& results,
                                  TrainingSample* sample) {
  // For junk we accept no answer, or an explicit shape answer matching the
  // class id of the sample.
  int num_results = results.size();
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
-  if (num_results > 0 &&
-      !shape_table.GetShape(results[0].shape_id).ContainsUnichar(unichar_id)) {
+  int percent = 0;
+  if (num_results > 0)
+    percent = IntCastRounded(results[0].rating * 100);
+  if (num_results > 0 && results[0].unichar_id != unichar_id) {
    // This is a junk error.
    ++font_counts_[font_id].n[CT_ACCEPTED_JUNK];
    sample->set_is_error(true);
    // It counts as an error for boosting too so sum the weight.
    scaled_error_ += sample->weight();
+    bad_score_hist_.add(percent, 1);
+    return debug;
  } else {
    // Correctly rejected.
    ++font_counts_[font_id].n[CT_REJECTED_JUNK];
    sample->set_is_error(false);
+    ok_score_hist_.add(percent, 1);
  }
+  return false;
 }

 // Creates a report of the error rate. The report_level controls the detail
@ -239,7 +339,7 @@ void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table,
 // If not NULL, the report string is saved in fonts_report.
 // (Ignoring report_level).
 double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
-                                  const UnicityTable<FontInfo>& fontinfo_table,
+                                  const FontInfoTable& fontinfo_table,
                                  const SampleIterator& it,
                                  double* unichar_error,
                                  STRING* fonts_report) {
@ -251,7 +351,7 @@ double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
    // Accumulate counts over fonts.
    totals += font_counts_[f];
    STRING font_report;
-    if (ReportString(font_counts_[f], &font_report)) {
+    if (ReportString(false, font_counts_[f], &font_report)) {
      if (fonts_report != NULL) {
        *fonts_report += fontinfo_table.get(f).name;
        *fonts_report += ": ";
@ -264,39 +364,59 @@ double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
      }
    }
  }
+  // Report the totals.
+  STRING total_report;
+  bool any_results = ReportString(true, totals, &total_report);
+  if (fonts_report != NULL && fonts_report->length() == 0) {
+    // Make sure we return something even if there were no samples.
+    *fonts_report = "NoSamplesFound: ";
+    *fonts_report += total_report;
+    *fonts_report += "\n";
+  }
  if (report_level > 0) {
-    // Report the totals.
-    STRING total_report;
-    if (ReportString(totals, &total_report)) {
+    // Print the totals from the total_report filled in above. Redeclaring
+    // total_report here would shadow it and print an empty report.
+    if (any_results) {
      tprintf("TOTAL Scaled Err=%.4g%%, %s\n",
              scaled_error_ * 100.0, total_report.string());
    }
    // Report the worst substitution error only for now.
    if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) {
-      const UNICHARSET& unicharset = it.shape_table()->unicharset();
-      int charsetsize = unicharset.size();
-      int shapesize = it.CompactCharsetSize();
+      int charsetsize = unicharset_.size();
      int worst_uni_id = 0;
-      int worst_shape_id = 0;
+      int worst_result_id = 0;
      int worst_err = 0;
      for (int u = 0; u < charsetsize; ++u) {
-        for (int s = 0; s < shapesize; ++s) {
-          if (unichar_counts_(u, s) > worst_err) {
-            worst_err = unichar_counts_(u, s);
+        for (int v = 0; v < charsetsize; ++v) {
+          if (unichar_counts_(u, v) > worst_err) {
+            worst_err = unichar_counts_(u, v);
            worst_uni_id = u;
-            worst_shape_id = s;
+            worst_result_id = v;
          }
        }
      }
      if (worst_err > 0) {
        tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
-                worst_uni_id, unicharset.id_to_unichar(worst_uni_id),
-                it.shape_table()->DebugStr(worst_shape_id).string(),
+                worst_uni_id, unicharset_.id_to_unichar(worst_uni_id),
+                unicharset_.id_to_unichar(worst_result_id),
                worst_err, totals.n[CT_UNICHAR_TOP1_ERR],
                100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]);
      }
    }
+    tprintf("Multi-unichar shape use:\n");
+    for (int u = 0; u < multi_unichar_counts_.size(); ++u) {
+      if (multi_unichar_counts_[u] > 0) {
+        tprintf("%d multiple answers for unichar: %s\n",
+                multi_unichar_counts_[u],
+                unicharset_.id_to_unichar(u));
+      }
+    }
+    tprintf("OK Score histogram:\n");
+    ok_score_hist_.print();
+    tprintf("ERROR Score histogram:\n");
+    bad_score_hist_.print();
  }
+
  double rates[CT_SIZE];
  if (!ComputeRates(totals, rates))
    return 0.0;
@ -308,32 +428,37 @@ double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,

 // Sets the report string to a combined human and machine-readable report
 // string of the error rates.
-// Returns false if there is no data, leaving report unchanged.
-bool ErrorCounter::ReportString(const Counts& counts, STRING* report) {
+// Returns false if there is no data, leaving report unchanged, unless
+// even_if_empty is true.
+bool ErrorCounter::ReportString(bool even_if_empty, const Counts& counts,
+                                STRING* report) {
  // Compute the error rates.
  double rates[CT_SIZE];
-  if (!ComputeRates(counts, rates))
+  if (!ComputeRates(counts, rates) && !even_if_empty)
    return false;
  // Using %.4g%%, the length of the output string should exactly match the
  // length of the format string, but in case of overflow, allow for +eddd
  // on each number.
  const int kMaxExtraLength = 5;  // Length of +eddd.
  // Keep this format string and the snprintf in sync with the CountTypes enum.
-  const char* format_str = "ShapeErr=%.4g%%, FontAttr=%.4g%%, "
-                           "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], "
-                           "Multi=%.4g%%, Rej=%.4g%%, "
+  const char* format_str = "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] "
+                           "Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, "
+                           "FontAttr=%.4g%%, Multi=%.4g%%, "
                           "Answers=%.3g, Rank=%.3g, "
                           "OKjunk=%.4g%%, Badjunk=%.4g%%";
  int max_str_len = strlen(format_str) + kMaxExtraLength * (CT_SIZE - 1) + 1;
  char* formatted_str = new char[max_str_len];
  snprintf(formatted_str, max_str_len, format_str,
-           rates[CT_SHAPE_TOP_ERR] * 100.0,
-           rates[CT_FONT_ATTR_ERR] * 100.0,
           rates[CT_UNICHAR_TOP1_ERR] * 100.0,
           rates[CT_UNICHAR_TOP2_ERR] * 100.0,
           rates[CT_UNICHAR_TOPN_ERR] * 100.0,
+           rates[CT_UNICHAR_TOPTOP_ERR] * 100.0,
           rates[CT_OK_MULTI_UNICHAR] * 100.0,
+           rates[CT_OK_JOINED] * 100.0,
+           rates[CT_OK_BROKEN] * 100.0,
           rates[CT_REJECT] * 100.0,
+           rates[CT_FONT_ATTR_ERR] * 100.0,
+           rates[CT_OK_MULTI_FONT] * 100.0,
           rates[CT_NUM_RESULTS],
           rates[CT_RANK],
           100.0 * rates[CT_REJECTED_JUNK],
@ -350,13 +475,9 @@ bool ErrorCounter::ReportString(const Counts& counts, STRING* report) {
 // Computes the error rates and returns in rates which is an array of size
 // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
 bool ErrorCounter::ComputeRates(const Counts& counts, double rates[CT_SIZE]) {
-  int ok_samples = counts.n[CT_SHAPE_TOP_CORRECT] + counts.n[CT_SHAPE_TOP_ERR] +
+  int ok_samples = counts.n[CT_UNICHAR_TOP_OK] + counts.n[CT_UNICHAR_TOP1_ERR] +
      counts.n[CT_REJECT];
  int junk_samples = counts.n[CT_REJECTED_JUNK] + counts.n[CT_ACCEPTED_JUNK];
-  if (ok_samples == 0 && junk_samples == 0) {
-    // There is no data.
-    return false;
-  }
  // Compute rates for normal chars.
  double denominator = static_cast<double>(MAX(ok_samples, 1));
  for (int ct = 0; ct <= CT_RANK; ++ct)
@ -365,7 +486,7 @@ bool ErrorCounter::ComputeRates(const Counts& counts, double rates[CT_SIZE]) {
  denominator = static_cast<double>(MAX(junk_samples, 1));
  for (int ct = CT_REJECTED_JUNK; ct <= CT_ACCEPTED_JUNK; ++ct)
    rates[ct] = counts.n[ct] / denominator;
-  return true;
+  return ok_samples != 0 || junk_samples != 0;
 }

 ErrorCounter::Counts::Counts() {
--- a/classify/errorcounter.h
+++ b/classify/errorcounter.h
@ -18,6 +18,7 @@

 #include "genericvector.h"
 #include "matrix.h"
+#include "statistc.h"

 struct Pix;
 template <typename T> class UnicityTable;
@ -25,11 +26,11 @@ template <typename T> class UnicityTable;
 namespace tesseract {

 struct FontInfo;
+class FontInfoTable;
 class SampleIterator;
 class ShapeClassifier;
-class ShapeRating;
-class ShapeTable;
 class TrainingSample;
+class UnicharRating;

 // Enumeration of the different types of error count.
 // Error counts work as follows:
@ -37,22 +38,21 @@ class TrainingSample;
 // Ground truth is a valid unichar-id / font-id pair:
 //        Number of classifier answers?
 //          0                       >0
-//     CT_REJECT     BOTH unichar-id and font-id match top shape?
-//     __________             yes!              no
-//                   CT_SHAPE_TOP_CORRECT  CT_SHAPE_TOP_ERR
-//                           |            Font attributes match?
-//                           |               yes!        no
-//                           |                 |     CT_FONT_ATTR_ERROR
-//                           |         Top unichar-id matches?
-//                           |         yes!          no
-//       Top shape-id has multiple unichars?    CT_UNICHAR_TOP1_ERR
-//               yes!            no           2nd shape unichar id matches?
-//        CT_OK_MULTI_UNICHAR   ________        yes!              no
-//        ___________________                  _____  CT_UNICHAR_TOP2_ERR
-//                                                    Any unichar-id matches?
-//                                                    yes!        no
-//                                                   ______ CT_UNICHAR_TOPN_ERR
-//                                                           _________________
+//     CT_REJECT          unichar-id matches top shape?
+//     __________             yes!                      no
+//                   CT_UNICHAR_TOP_OK           CT_UNICHAR_TOP1_ERR
+//      Top shape-id has multiple unichars?   2nd shape unichar id matches?
+//            yes!              no              yes!              no
+//      CT_OK_MULTI_UNICHAR     |              _____    CT_UNICHAR_TOP2_ERR
+//             Font attributes match?                 Any unichar-id matches?
+//              yes!              no                  yes!        no
+//      CT_FONT_ATTR_OK   CT_FONT_ATTR_ERR          ______  CT_UNICHAR_TOPN_ERR
+//                |       __________________                 _________________
+//      Top shape-id has multiple font attrs?
+//            yes!              no
+//      CT_OK_MULTI_FONT
+//      _____________________________
+//
 // Note that multiple counts may be activated for a single sample!
 //
 // Ground truth is for a fragment/n-gram that is NOT in the unicharset.
@ -67,14 +67,20 @@ class TrainingSample;
 //
 // Keep in sync with the ReportString function.
 enum CountTypes {
-  CT_SHAPE_TOP_CORRECT,  // Top shape id is actually correct.
-  CT_SHAPE_TOP_ERR,      // Top shape id is not correct.
-  CT_FONT_ATTR_ERR,      // Font attributes incorrect, ignoring unichar.
+  CT_UNICHAR_TOP_OK,     // Top shape contains correct unichar id.
+  // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
+  // kRatingEpsilon from the first result in each group. The real top choice
+  // is measured using TOPTOP.
  CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.
  CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.
  CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.
+  CT_UNICHAR_TOPTOP_ERR,   // Very top choice not correct.
  CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.
+  CT_OK_JOINED,          // Top shape id is correct but marked joined.
+  CT_OK_BROKEN,          // Top shape id is correct but marked broken.
  CT_REJECT,             // Classifier hates this.
+  CT_FONT_ATTR_ERR,      // Top unichar OK, but font attributes incorrect.
+  CT_OK_MULTI_FONT,      // CT_FONT_ATTR_OK but there are multiple font attrs.
  CT_NUM_RESULTS,        // Number of answers produced.
  CT_RANK,               // Rank of correct answer.
  CT_REJECTED_JUNK,      // Junk that was correctly rejected.
@ -115,12 +121,24 @@ class ErrorCounter {
  // * The return value is the un-weighted version of the scaled_error.
  static double ComputeErrorRate(ShapeClassifier* classifier,
                                 int report_level, CountTypes boosting_mode,
-                                 const UnicityTable<FontInfo>& fontinfo_table,
+                                 const FontInfoTable& fontinfo_table,
                                 const GenericVector<Pix*>& page_images,
                                 SampleIterator* it,
                                 double* unichar_error,
                                 double* scaled_error,
                                 STRING* fonts_report);
+  // Tests a pair of classifiers, debugging errors of the new against the old.
+  // See ComputeErrorRate above for a description of the shared arguments.
+  // Iterates over the samples, calling the classifiers in normal/silent mode.
+  // If the new_classifier makes a boosting_mode error that the old_classifier
+  // does not, it will then call the new_classifier again with a debug flag
+  // and a keep_this argument to find out what is going on.
+  static void DebugNewErrors(ShapeClassifier* new_classifier,
+                             ShapeClassifier* old_classifier,
+                             CountTypes boosting_mode,
+                             const FontInfoTable& fontinfo_table,
+                             const GenericVector<Pix*>& page_images,
+                             SampleIterator* it);

 private:
  // Simple struct to hold an array of counts.
@ -134,7 +152,7 @@ class ErrorCounter {

  // Constructor is private. Only anticipated use of ErrorCounter is via
  // the static ComputeErrorRate.
-  ErrorCounter(int charsetsize, int shapesize, int fontsize);
+  ErrorCounter(const UNICHARSET& unicharset, int fontsize);
  ~ErrorCounter();

  // Accumulates the errors from the classifier results on a single sample.
@ -145,15 +163,13 @@ class ErrorCounter {
  // for error counting and shape_table is used to understand the relationship
  // between unichar_ids and shape_ids in the results
  bool AccumulateErrors(bool debug, CountTypes boosting_mode,
-                        const UnicityTable<FontInfo>& font_table,
-                        const ShapeTable& shape_table,
-                        const GenericVector<ShapeRating>& results,
+                        const FontInfoTable& font_table,
+                        const GenericVector<UnicharRating>& results,
                        TrainingSample* sample);

  // Accumulates counts for junk. Counts only whether the junk was correctly
  // rejected or not.
-  void AccumulateJunk(const ShapeTable& shape_table,
-                      const GenericVector<ShapeRating>& results,
+  bool AccumulateJunk(bool debug, const GenericVector<UnicharRating>& results,
                      TrainingSample* sample);

  // Creates a report of the error rate. The report_level controls the detail
@ -169,15 +185,17 @@ class ErrorCounter {
  // If not NULL, the report string is saved in fonts_report.
  // (Ignoring report_level).
  double ReportErrors(int report_level, CountTypes boosting_mode,
-                      const UnicityTable<FontInfo>& fontinfo_table,
+                      const FontInfoTable& fontinfo_table,
                      const SampleIterator& it,
                      double* unichar_error,
                      STRING* fonts_report);

  // Sets the report string to a combined human and machine-readable report
  // string of the error rates.
-  // Returns false if there is no data, leaving report unchanged.
-  static bool ReportString(const Counts& counts, STRING* report);
+  // Returns false if there is no data, leaving report unchanged, unless
+  // even_if_empty is true.
+  static bool ReportString(bool even_if_empty, const Counts& counts,
+                           STRING* report);

  // Computes the error rates and returns in rates which is an array of size
  // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
@ -186,11 +204,22 @@ class ErrorCounter {

  // Total scaled error used by boosting algorithms.
  double scaled_error_;
+  // Difference in result rating to be thought of as an "equal" choice.
+  double rating_epsilon_;
  // Vector indexed by font_id from the samples of error accumulators.
  GenericVector<Counts> font_counts_;
  // Counts of the results that map each unichar_id (from samples) to an
  // incorrect shape_id.
  GENERIC_2D_ARRAY<int> unichar_counts_;
+  // Count, indexed by unichar_id, of the number of times the correct unichar
+  // was in the top-rated group but shared it with other (multi-unichar) answers.
+  GenericVector<int> multi_unichar_counts_;
+  // Histogram of scores (as percent) for correct answers.
+  STATS ok_score_hist_;
+  // Histogram of scores (as percent) for incorrect answers.
+  STATS bad_score_hist_;
+  // Unicharset for printing character ids in results.
+  const UNICHARSET& unicharset_;
 };

 }  // namespace tesseract.
--- a/classify/extract.cpp
+++ b/classify/extract.cpp
@ -49,8 +49,10 @@ void ExtractorStub();
 * @note History: Sun Jan 21 10:07:28 1990, DSJ, Created.
 */
 CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
-                              const DENORM& denorm, TBLOB *Blob) {
-  return (ExtractFlexFeatures(FeatureDefs, Blob, denorm));
+                              const DENORM& bl_denorm, const DENORM& cn_denorm,
+                              const INT_FX_RESULT_STRUCT& fx_info,
+                              TBLOB *Blob) {
+  return ExtractFlexFeatures(FeatureDefs, Blob, bl_denorm, cn_denorm, fx_info);
 }                                /* ExtractBlobFeatures */

 /*-----------------------------------------------------------------------------
--- a/classify/extract.h
+++ b/classify/extract.h
@ -26,8 +26,12 @@ class DENORM;
 /*-----------------------------------------------------------------------------
          Public Function Prototypes
 -----------------------------------------------------------------------------*/
+// Deprecated! Will be deleted soon!
+// In the meantime, as with all TBLOBs, Blob is in baseline normalized coords.
+// See SetupBLCNDenorms in intfx.cpp for other args.
 CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
-                              const DENORM& denorm, TBLOB *Blob);
+                              const DENORM& bl_denorm, const DENORM& cn_denorm,
+                              const INT_FX_RESULT_STRUCT& fx_info, TBLOB *Blob);

 /*---------------------------------------------------------------------------
          Private Function Prototypes
--- a/classify/featdefs.cpp
+++ b/classify/featdefs.cpp
@ -19,7 +19,7 @@
          Include Files and Type Defines
 -----------------------------------------------------------------------------*/
 #ifdef _MSC_VER
-#include "mathfix.h"
+#include <mathfix.h>
 #endif

 #include "featdefs.h"
--- a/classify/flexfx.cpp
+++ b/classify/flexfx.cpp
@ -28,8 +28,13 @@
              Public Code
 ----------------------------------------------------------------------------**/
 /*---------------------------------------------------------------------------*/
+// Deprecated! Will be deleted soon!
+// In the meantime, as with all TBLOBs, Blob is in baseline normalized coords.
+// See SetupBLCNDenorms in intfx.cpp for other args.
 CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
-                              TBLOB *Blob, const DENORM& denorm) {
+                              TBLOB *Blob, const DENORM& bl_denorm,
+                              const DENORM& cn_denorm,
+                              const INT_FX_RESULT_STRUCT& fx_info) {
 /*
 **	Parameters:
 **		Blob		blob to extract features from
@ -50,8 +55,13 @@ CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
    if (FeatureDefs.FeatureExtractors[Type] != NULL &&
        FeatureDefs.FeatureExtractors[Type]->Extractor != NULL) {
      CharDesc->FeatureSets[Type] =
-        (FeatureDefs.FeatureExtractors[Type])->Extractor(Blob, denorm);
+        (FeatureDefs.FeatureExtractors[Type])->Extractor(Blob,
+                                                         bl_denorm,
+                                                         cn_denorm,
+                                                         fx_info);
      if (CharDesc->FeatureSets[Type] == NULL) {
+        tprintf("Feature extractor for type %d = %s returned NULL!\n",
+                Type, FeatureDefs.FeatureDesc[Type]->ShortName);
        FreeCharDescription(CharDesc);
        return NULL;
      }
--- a/classify/flexfx.h
+++ b/classify/flexfx.h
@ -27,7 +27,10 @@
 /**----------------------------------------------------------------------------
          Public Function Prototypes
 ----------------------------------------------------------------------------**/
+// As with all TBLOBs, this one is also baseline normalized.
 CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
-                              TBLOB *Blob, const DENORM& denorm);
+                              TBLOB *Blob, const DENORM& bl_denorm,
+                              const DENORM& cn_denorm,
+                              const INT_FX_RESULT_STRUCT& fx_info);

 #endif
--- a/classify/intfeaturespace.cpp
+++ b/classify/intfeaturespace.cpp
@ -90,8 +90,7 @@ void IntFeatureSpace::IndexAndSortFeatures(
 // window, or -1 if the feature is a miss.
 int IntFeatureSpace::XYToFeatureIndex(int x, int y) const {
  // Round the x,y position to a feature. Search for a valid theta.
-  INT_FEATURE_STRUCT feature = {static_cast<uinT8>(x), static_cast<uinT8>(y),
-                                0, 0};
+  INT_FEATURE_STRUCT feature(x, y, 0);
  int index = -1;
  for (int theta = 0; theta <= MAX_UINT8 && index < 0; ++theta) {
    feature.Theta = theta;
@ -127,16 +126,10 @@ int IntFeatureSpace::XYToFeatureIndex(int x, int y) const {
 INT_FEATURE_STRUCT IntFeatureSpace::PositionFromBuckets(int x,
                                                        int y,
                                                        int theta) const {
-  INT_FEATURE_STRUCT pos = {
-      static_cast<uinT8>(ClipToRange(
-          (x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_,
-          0, MAX_UINT8)),
-      static_cast<uinT8>(ClipToRange(
-          (y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_,
-          0, MAX_UINT8)),
-      static_cast<uinT8>(ClipToRange(
-          DivRounded(theta * kIntFeatureExtent, theta_buckets_),
-          0, MAX_UINT8))};
+  INT_FEATURE_STRUCT pos(
+      (x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_,
+      (y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_,
+      DivRounded(theta * kIntFeatureExtent, theta_buckets_));
  return pos;
 }

--- a/classify/intfx.cpp
+++ b/classify/intfx.cpp
--- a/classify/intfx.h
+++ b/classify/intfx.h
@ -1,10 +1,10 @@
 /******************************************************************************
- **	Filename:    intfx.h
- **	Purpose:     Interface to high level integer feature extractor.
- **	Author:      Robert Moss
- **	History:     Tue May 21 15:51:57 MDT 1991, RWM, Created.
+ **  Filename:    intfx.h
+ **  Purpose:     Interface to high level integer feature extractor.
+ **  Author:      Robert Moss
+ **  History:     Tue May 21 15:51:57 MDT 1991, RWM, Created.
 **
- **	(c) Copyright Hewlett-Packard Company, 1988.
+ **  (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
@ -42,6 +42,9 @@ struct INT_FX_RESULT_STRUCT {
  uinT8 YTop;                    // Top of blob in BLN coords.
 };

+// The standard feature length
+const double kStandardFeatureLength = 64.0 / 5;
+
 /**----------------------------------------------------------------------------
          Public Function Prototypes
 ----------------------------------------------------------------------------**/
@ -51,28 +54,22 @@ void InitIntegerFX();
 // theta direction in an INT_FEATURE_STRUCT.
 FCOORD FeatureDirection(uinT8 theta);

-tesseract::TrainingSample* GetIntFeatures(
-    tesseract::NormalizationMode mode, TBLOB *blob,
-    const DENORM& denorm);
+namespace tesseract {
+  // Generates a TrainingSample from a TBLOB. Extracts features and sets
+  // the bounding box, so classifiers that operate on the image can work.
+  // TODO(rays) BlobToTrainingSample must remain a global function until
+  // the FlexFx and FeatureDescription code can be removed and LearnBlob
+  // made a member of Classify.
+  TrainingSample* BlobToTrainingSample(const TBLOB& blob,
+                                       tesseract::NormalizationMode mode,
+                                       bool nonlinear_norm);
+}

-int ExtractIntFeat(TBLOB *Blob,
-                   const DENORM& denorm,
-                   INT_FEATURE_ARRAY BLFeat,
-                   INT_FEATURE_ARRAY CNFeat,
-                   INT_FX_RESULT_STRUCT* Results,
-                   inT32 *FeatureOutlineArray = 0);
+// Deprecated! Prefer tesseract::Classify::ExtractFeatures instead.
+bool ExtractIntFeat(const TBLOB& blob,
+                    bool nonlinear_norm,
+                    INT_FEATURE_ARRAY BLFeat,
+                    INT_FEATURE_ARRAY CNFeat,
+                    INT_FX_RESULT_STRUCT* Results);

-uinT8 BinaryAnglePlusPi(inT32 Y, inT32 X);
-
-int SaveFeature(INT_FEATURE_ARRAY FeatureArray,
-                uinT16 FeatureNum,
-                inT16 X,
-                inT16 Y,
-                uinT8 Theta);
-
-uinT16 MySqrt(inT32 X, inT32 Y);
-
-uinT8 MySqrt2(uinT16 N, uinT32 I, uinT8 *Exp);
-
-void ClipRadius(uinT8 *RxInv, uinT8 *RxExp, uinT8 *RyInv, uinT8 *RyExp);
 #endif
--- a/classify/intmatcher.h
+++ b/classify/intmatcher.h
@ -28,7 +28,7 @@ extern BOOL_VAR_H(disable_character_fragments, FALSE,
                  "Do not include character fragments in the"
                  " results of the classifier");

-extern INT_VAR_H(classify_integer_matcher_multiplier, 14,
+extern INT_VAR_H(classify_integer_matcher_multiplier, 10,
                 "Integer Matcher Multiplier  0-255:   ");


--- a/classify/intproto.cpp
+++ b/classify/intproto.cpp
@ -37,6 +37,7 @@
 #include "mfoutline.h"
 #include "ndminx.h"
 #include "picofeat.h"
+#include "points.h"
 #include "shapetable.h"
 #include "svmnode.h"

@ -206,6 +207,22 @@ double_VAR(classify_pp_side_pad, 2.5, "Proto Pruner Side Pad");
 /*-----------------------------------------------------------------------------
              Public Code
 -----------------------------------------------------------------------------*/
+// Builds a feature from an FCOORD for position with all the necessary
+// clipping and rounding.
+INT_FEATURE_STRUCT::INT_FEATURE_STRUCT(const FCOORD& pos, uinT8 theta)
+  : X(ClipToRange<inT16>(static_cast<inT16>(pos.x() + 0.5), 0, 255)),
+    Y(ClipToRange<inT16>(static_cast<inT16>(pos.y() + 0.5), 0, 255)),
+    Theta(theta),
+    CP_misses(0) {
+}
+// Builds a feature from ints with all the necessary clipping and casting.
+INT_FEATURE_STRUCT::INT_FEATURE_STRUCT(int x, int y, int theta)
+  : X(static_cast<uinT8>(ClipToRange(x, 0, MAX_UINT8))),
+    Y(static_cast<uinT8>(ClipToRange(y, 0, MAX_UINT8))),
+    Theta(static_cast<uinT8>(ClipToRange(theta, 0, MAX_UINT8))),
+    CP_misses(0) {
+}
+
 /*---------------------------------------------------------------------------*/
 /**
 * This routine adds a new class structure to a set of
--- a/classify/intproto.h
+++ b/classify/intproto.h
@ -28,6 +28,8 @@
 #include "scrollview.h"
 #include "unicharset.h"

+class FCOORD;
+
 /* define order of params in pruners */
 #define PRUNER_X      0
 #define PRUNER_Y      1
@ -130,8 +132,14 @@ INT_TEMPLATES_STRUCT, *INT_TEMPLATES;
 #define MAX_NUM_INT_FEATURES 512
 #define INT_CHAR_NORM_RANGE  256

-struct INT_FEATURE_STRUCT
-{
+struct INT_FEATURE_STRUCT {
+  INT_FEATURE_STRUCT() : X(0), Y(0), Theta(0), CP_misses(0) { }
+  // Builds a feature from an FCOORD for position with all the necessary
+  // clipping and rounding.
+  INT_FEATURE_STRUCT(const FCOORD& pos, uinT8 theta);
+  // Builds a feature from ints with all the necessary clipping and casting.
+  INT_FEATURE_STRUCT(int x, int y, int theta);
+
  uinT8 X;
  uinT8 Y;
  uinT8 Theta;
--- a/classify/mastertrainer.cpp
+++ b/classify/mastertrainer.cpp
@ -30,6 +30,7 @@
 #include "allheaders.h"
 #include "boxread.h"
 #include "classify.h"
+#include "efio.h"
 #include "errorcounter.h"
 #include "featdefs.h"
 #include "sampleiterator.h"
@ -58,10 +59,6 @@ MasterTrainer::MasterTrainer(NormalizationMode norm_mode,
    enable_shape_anaylsis_(shape_analysis),
    enable_replication_(replicate_samples),
    fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
-  fontinfo_table_.set_compare_callback(
-      NewPermanentTessCallback(CompareFontInfo));
-  fontinfo_table_.set_clear_callback(
-      NewPermanentTessCallback(FontInfoDeleteCallback));
 }

 MasterTrainer::~MasterTrainer() {
@ -82,10 +79,7 @@ bool MasterTrainer::Serialize(FILE* fp) const {
  if (!verify_samples_.Serialize(fp)) return false;
  if (!master_shapes_.Serialize(fp)) return false;
  if (!flat_shapes_.Serialize(fp)) return false;
-  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_info)))
-    return false;
-  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_spacing_info)))
-    return false;
+  if (!fontinfo_table_.Serialize(fp)) return false;
  if (!xheights_.Serialize(fp)) return false;
  return true;
 }
@ -106,11 +100,7 @@ bool MasterTrainer::DeSerialize(bool swap, FILE* fp) {
  if (!verify_samples_.DeSerialize(swap, fp)) return false;
  if (!master_shapes_.DeSerialize(swap, fp)) return false;
  if (!flat_shapes_.DeSerialize(swap, fp)) return false;
-  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap))
-    return false;
-  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_spacing_info),
-                            swap))
-    return false;
+  if (!fontinfo_table_.DeSerialize(swap, fp)) return false;
  if (!xheights_.DeSerialize(swap, fp)) return false;
  return true;
 }
@ -122,8 +112,10 @@ void MasterTrainer::LoadUnicharset(const char* filename) {
            "Building unicharset for training from scratch...\n",
            filename);
    unicharset_.clear();
-    // Space character needed to represent NIL_LIST classification.
-    unicharset_.unichar_insert(" ");
+    UNICHARSET initialized;
+    // Add special characters, as they were removed by the clear, but the
+    // default constructor puts them in.
+    unicharset_.AppendOtherUnicharset(initialized);
  }
  charsetsize_ = unicharset_.size();
  delete [] fragments_;
@ -138,7 +130,7 @@ void MasterTrainer::LoadUnicharset(const char* filename) {
 // adding them to the trainer with the font_id from the content of the file.
 // See mftraining.cpp for a description of the file format.
 // If verification, then these are verification samples, not training.
-void MasterTrainer::ReadTrainingSamples(FILE  *fp,
+void MasterTrainer::ReadTrainingSamples(const char* page_name,
                                        const FEATURE_DEFS_STRUCT& feature_defs,
                                        bool verification) {
  char buffer[2048];
@ -148,6 +140,12 @@ void MasterTrainer::ReadTrainingSamples(FILE  *fp,
  int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
  int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);

+  FILE* fp = Efopen(page_name, "rb");
+  if (fp == NULL) {
+    tprintf("Failed to open tr file: %s\n", page_name);
+    return;
+  }
+  tr_filenames_.push_back(STRING(page_name));
  while (fgets(buffer, sizeof(buffer), fp) != NULL) {
    if (buffer[0] == '\n')
      continue;
@ -159,6 +157,7 @@ void MasterTrainer::ReadTrainingSamples(FILE  *fp,
    }
    *space++ = '\0';
    int font_id = GetFontInfoId(buffer);
+    if (font_id < 0) font_id = 0;
    int page_number;
    STRING unichar;
    TBOX bounding_box;
@ -177,6 +176,7 @@ void MasterTrainer::ReadTrainingSamples(FILE  *fp,
    FreeCharDescription(char_desc);
  }
  charsetsize_ = unicharset_.size();
+  fclose(fp);
 }

 // Adds the given single sample to the trainer, setting the classid
@ -278,23 +278,23 @@ void MasterTrainer::SetupMasterShapes() {
    const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);

    if (fragment == NULL)
-      char_shapes.AppendMasterShapes(shapes);
+      char_shapes.AppendMasterShapes(shapes, NULL);
    else if (fragment->is_beginning())
-      char_shapes_begin_fragment.AppendMasterShapes(shapes);
+      char_shapes_begin_fragment.AppendMasterShapes(shapes, NULL);
    else if (fragment->is_ending())
-      char_shapes_end_fragment.AppendMasterShapes(shapes);
+      char_shapes_end_fragment.AppendMasterShapes(shapes, NULL);
    else
-      char_shapes.AppendMasterShapes(shapes);
+      char_shapes.AppendMasterShapes(shapes, NULL);
  }
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes_begin_fragment);
-  char_shapes.AppendMasterShapes(char_shapes_begin_fragment);
+  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, NULL);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes_end_fragment);
-  char_shapes.AppendMasterShapes(char_shapes_end_fragment);
+  char_shapes.AppendMasterShapes(char_shapes_end_fragment, NULL);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes);
-  master_shapes_.AppendMasterShapes(char_shapes);
+  master_shapes_.AppendMasterShapes(char_shapes, NULL);
  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
 }

@ -401,7 +401,7 @@ bool MasterTrainer::LoadXHeights(const char* filename) {
      continue;
    fontinfo.name = buffer;
    if (!fontinfo_table_.contains(fontinfo)) continue;
-    int fontinfo_id = fontinfo_table_.get_id(fontinfo);
+    int fontinfo_id = fontinfo_table_.get_index(fontinfo);
    xheights_[fontinfo_id] = xht;
    total_xheight += xht;
    ++xheight_count;
@ -439,7 +439,7 @@ bool MasterTrainer::AddSpacingInfo(const char *filename) {
  char kerned_uch[UNICHAR_LEN];
  int x_gap, x_gap_before, x_gap_after, num_kerned;
  ASSERT_HOST(fscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
-  FontInfo *fi = fontinfo_table_.get_mutable(fontinfo_id);
+  FontInfo *fi = &fontinfo_table_.get(fontinfo_id);
  fi->init_spacing(unicharset_.size());
  FontSpacingInfo *spacing = NULL;
  for (int l = 0; l < num_unichars; ++l) {
@ -480,11 +480,7 @@ int MasterTrainer::GetFontInfoId(const char* font_name) {
  fontinfo.name = const_cast<char*>(font_name);
  fontinfo.properties = 0;  // Not used to lookup in the table
  fontinfo.universal_id = 0;
-  if (!fontinfo_table_.contains(fontinfo)) {
-    return -1;
-  } else {
-    return fontinfo_table_.get_id(fontinfo);
-  }
+  return fontinfo_table_.get_index(fontinfo);
 }
 // Returns the font_id of the closest matching font name to the given
 // filename. It is assumed that a substring of the filename will match
@ -585,7 +581,7 @@ void MasterTrainer::WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
                                             const char* pffmtable_file) {
  tesseract::Classify *classify = new tesseract::Classify();
  // Move the fontinfo table to classify.
-  classify->get_fontinfo_table().move(&fontinfo_table_);
+  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
  INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
                                                             shape_set);
  FILE* fp = fopen(inttemp_file, "wb");
@ -750,17 +746,29 @@ void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
 }
 #endif  // GRAPHICS_DISABLED

+void MasterTrainer::TestClassifierVOld(bool replicate_samples,
+                                       ShapeClassifier* test_classifier,
+                                       ShapeClassifier* old_classifier) {
+  SampleIterator sample_it;
+  sample_it.Init(NULL, NULL, replicate_samples, &samples_);
+  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
+                               CT_UNICHAR_TOPN_ERR, fontinfo_table_,
+                               page_images_, &sample_it);
+}
+
 // Tests the given test_classifier on the internal samples.
 // See TestClassifier for details.
-void MasterTrainer::TestClassifierOnSamples(int report_level,
+void MasterTrainer::TestClassifierOnSamples(CountTypes error_mode,
+                                            int report_level,
                                            bool replicate_samples,
                                            ShapeClassifier* test_classifier,
                                            STRING* report_string) {
-  TestClassifier(report_level, replicate_samples, &samples_,
+  TestClassifier(error_mode, report_level, replicate_samples, &samples_,
                 test_classifier, report_string);
 }

-// Tests the given test_classifier on the given samples
+// Tests the given test_classifier on the given samples.
+// error_mode indicates what counts as an error.
 // report_levels:
 // 0 = no output.
 // 1 = bottom-line error rate.
@ -772,14 +780,14 @@ void MasterTrainer::TestClassifierOnSamples(int report_level,
 // sample including replicated and systematically perturbed samples.
 // If report_string is non-NULL, a summary of the results for each font
 // is appended to the report_string.
-double MasterTrainer::TestClassifier(int report_level,
+double MasterTrainer::TestClassifier(CountTypes error_mode,
+                                     int report_level,
                                     bool replicate_samples,
                                     TrainingSampleSet* samples,
                                     ShapeClassifier* test_classifier,
                                     STRING* report_string) {
  SampleIterator sample_it;
-  sample_it.Init(NULL, test_classifier->GetShapeTable(), replicate_samples,
-                 samples);
+  sample_it.Init(NULL, NULL, replicate_samples, samples);
  if (report_level > 0) {
    int num_samples = 0;
    for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
@ -791,7 +799,7 @@ double MasterTrainer::TestClassifier(int report_level,
  }
  double unichar_error = 0.0;
  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
-                                 CT_SHAPE_TOP_ERR, fontinfo_table_,
+                                 error_mode, fontinfo_table_,
                                 page_images_, &sample_it, &unichar_error,
                                 NULL, report_string);
  return unichar_error;
--- a/classify/mastertrainer.h
+++ b/classify/mastertrainer.h
@ -29,6 +29,7 @@
 #include "cluster.h"
 #include "intfx.h"
 #include "elst.h"
+#include "errorcounter.h"
 #include "featdefs.h"
 #include "fontinfo.h"
 #include "indexmapbidi.h"
@ -89,7 +90,7 @@ class MasterTrainer {
  // Reads the samples and their features from the given file,
  // adding them to the trainer with the font_id from the content of the file.
  // If verification, then these are verification samples, not training.
-  void ReadTrainingSamples(FILE  *fp,
+  void ReadTrainingSamples(const char* page_name,
                           const FEATURE_DEFS_STRUCT& feature_defs,
                           bool verification);

@ -159,6 +160,12 @@ class MasterTrainer {
  // one of the fonts. If more than one is matched, the longest is returned.
  int GetBestMatchingFontInfoId(const char* filename);

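+  // Returns the filename of the tr file with the given index, in the order
+  // the files were loaded by ReadTrainingSamples. This matches the
+  // command-line argument index only if every preceding file was opened.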
+  const STRING& GetTRFileName(int index) const {
+    return tr_filenames_[index];
+  }
+
  // Sets up a flat shapetable with one shape per class/font combination.
  void SetupFlatShapeTable(ShapeTable* shape_table);

@ -207,13 +214,19 @@ class MasterTrainer {
                      const char* unichar_str2, int canonical_font);
  #endif  // GRAPHICS_DISABLED

+  void TestClassifierVOld(bool replicate_samples,
+                          ShapeClassifier* test_classifier,
+                          ShapeClassifier* old_classifier);
+
  // Tests the given test_classifier on the internal samples.
  // See TestClassifier for details.
-  void TestClassifierOnSamples(int report_level,
+  void TestClassifierOnSamples(CountTypes error_mode,
+                               int report_level,
                               bool replicate_samples,
                               ShapeClassifier* test_classifier,
                               STRING* report_string);
   // Tests the given test_classifier on the given samples.
+  // error_mode indicates what counts as an error.
  // report_levels:
  // 0 = no output.
  // 1 = bottom-line error rate.
@ -225,7 +238,8 @@ class MasterTrainer {
  // sample including replicated and systematically perturbed samples.
  // If report_string is non-NULL, a summary of the results for each font
  // is appended to the report_string.
-  double TestClassifier(int report_level,
+  double TestClassifier(CountTypes error_mode,
+                        int report_level,
                        bool replicate_samples,
                        TrainingSampleSet* samples,
                        ShapeClassifier* test_classifier,
@ -263,9 +277,9 @@ class MasterTrainer {
  // Flat shape table has each unichar/font id pair in a separate shape.
  ShapeTable flat_shapes_;
  // Font metrics gathered from multiple files.
-  UnicityTable<FontInfo> fontinfo_table_;
+  FontInfoTable fontinfo_table_;
  // Array of xheights indexed by font ids in fontinfo_table_;
-  GenericVector<int> xheights_;
+  GenericVector<inT32> xheights_;

  // Non-serialized data initialized by other means or used temporarily
  // during loading of training samples.
@ -291,6 +305,8 @@ class MasterTrainer {
  // Indexed by page_num_ in the samples.
  // These images are owned by the trainer and need to be pixDestroyed.
  GenericVector<Pix*> page_images_;
+  // Vector of filenames of loaded tr files.
+  GenericVector<STRING> tr_filenames_;
 };

 }  // namespace tesseract.
--- a/classify/mf.cpp
+++ b/classify/mf.cpp
@ -33,7 +33,9 @@
              Private Code
 ----------------------------------------------------------------------------**/
 /*---------------------------------------------------------------------------*/
-FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& denorm) {
+FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm,
+                          const DENORM& cn_denorm,
+                          const INT_FX_RESULT_STRUCT& fx_info) {
 /*
 **	Parameters:
 **		Blob		blob to extract micro-features from
@ -52,7 +54,8 @@ FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& denorm) {
  FEATURE Feature;
  MICROFEATURE OldFeature;

-  OldFeatures = (MICROFEATURES)BlobMicroFeatures(Blob, denorm);
+  OldFeatures = (MICROFEATURES)BlobMicroFeatures(Blob, bl_denorm, cn_denorm,
+                                                 fx_info);
  if (OldFeatures == NULL)
    return NULL;
  NumFeatures = count (OldFeatures);
--- a/classify/mf.h
+++ b/classify/mf.h
@ -34,6 +34,8 @@ typedef float MicroFeature[MFCount];
 /*----------------------------------------------------------------------------
          Private Function Prototypes
 -----------------------------------------------------------------------------*/
-FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& denorm);
+FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm,
+                          const DENORM& cn_denorm,
+                          const INT_FX_RESULT_STRUCT& fx_info);

 #endif
--- a/classify/mfoutline.cpp
+++ b/classify/mfoutline.cpp
@ -103,56 +103,6 @@ LIST ConvertOutlines(TESSLINE *outline,
  return mf_outlines;
 }

-
-/*---------------------------------------------------------------------------*/
-void ComputeOutlineStats(LIST Outlines, OUTLINE_STATS *OutlineStats) {
-/*
- ** Parameters:
- **   Outlines  list of outlines to compute stats for
- **   OutlineStats  place to put results
- ** Globals: none
- ** Operation: This routine computes several statistics about the outlines
- **   in Outlines.  These statistics are usually used to perform
- **   anistropic normalization of all of the outlines.  The
- **   statistics generated are:
- **     first moments about x and y axes
- **     total length of all outlines
- **     center of mass of all outlines
- **     second moments about center of mass axes
- **     radius of gyration about center of mass axes
- ** Return: none (results are returned in OutlineStats)
- ** Exceptions: none
- ** History: Fri Dec 14 08:32:03 1990, DSJ, Created.
- */
-  MFOUTLINE Outline;
-  MFOUTLINE EdgePoint;
-  MFEDGEPT *Current;
-  MFEDGEPT *Last;
-
-  InitOutlineStats(OutlineStats);
-  iterate(Outlines) {
-    Outline = (MFOUTLINE) first_node (Outlines);
-
-    Last = PointAt (Outline);
-    Outline = NextPointAfter (Outline);
-    EdgePoint = Outline;
-    do {
-      Current = PointAt (EdgePoint);
-
-      UpdateOutlineStats (OutlineStats,
-        Last->Point.x, Last->Point.y,
-        Current->Point.x, Current->Point.y);
-
-      Last = Current;
-      EdgePoint = NextPointAfter (EdgePoint);
-    }
-    while (EdgePoint != Outline);
-  }
-  FinishOutlineStats(OutlineStats);
-
-}                                /* ComputeOutlineStats */
-
-
 /*---------------------------------------------------------------------------*/
 void FindDirectionChanges(MFOUTLINE Outline,
                          FLOAT32 MinSlope,
@ -334,7 +284,8 @@ void NormalizeOutline(MFOUTLINE Outline,
  MFOUTLINE EdgePoint = Outline;
  do {
    MFEDGEPT *Current = PointAt(EdgePoint);
-    Current->Point.y = MF_SCALE_FACTOR * (Current->Point.y - BASELINE_OFFSET);
+    Current->Point.y = MF_SCALE_FACTOR *
+        (Current->Point.y - kBlnBaselineOffset);
    Current->Point.x = MF_SCALE_FACTOR * (Current->Point.x - XOrigin);
    EdgePoint = NextPointAfter(EdgePoint);
  } while (EdgePoint != Outline);
@ -365,34 +316,10 @@ void Classify::NormalizeOutlines(LIST Outlines,
 ** History: Fri Dec 14 08:14:55 1990, DSJ, Created.
 */
  MFOUTLINE Outline;
-  OUTLINE_STATS OutlineStats;
-  FLOAT32 BaselineScale;

  switch (classify_norm_method) {
    case character:
-      ComputeOutlineStats(Outlines, &OutlineStats);
-
-      /* limit scale factor to avoid overscaling small blobs (.,`'),
-         thin blobs (l1ift), and merged blobs */
-      *XScale = *YScale = BaselineScale = MF_SCALE_FACTOR;
-      *XScale *= OutlineStats.Ry;
-      *YScale *= OutlineStats.Rx;
-      if (*XScale < classify_min_norm_scale_x)
-        *XScale = classify_min_norm_scale_x;
-      if (*YScale < classify_min_norm_scale_y)
-        *YScale = classify_min_norm_scale_y;
-      if (*XScale > classify_max_norm_scale_x &&
-          *YScale <= classify_max_norm_scale_y)
-        *XScale = classify_max_norm_scale_x;
-      *XScale = classify_char_norm_range * BaselineScale / *XScale;
-      *YScale = classify_char_norm_range * BaselineScale / *YScale;
-
-      iterate(Outlines) {
-        Outline = (MFOUTLINE) first_node (Outlines);
-        CharNormalizeOutline (Outline,
-          OutlineStats.x, OutlineStats.y,
-          *XScale, *YScale);
-      }
+      ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?");
      break;

    case baseline:
@ -436,11 +363,7 @@ void ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction) {


 /*---------------------------------------------------------------------------*/
-void CharNormalizeOutline(MFOUTLINE Outline,
-                          FLOAT32 XCenter,
-                          FLOAT32 YCenter,
-                          FLOAT32 XScale,
-                          FLOAT32 YScale) {
+void CharNormalizeOutline(MFOUTLINE Outline, const DENORM& cn_denorm) {
 /*
 ** Parameters:
 **   Outline     outline to be character normalized
@ -463,13 +386,13 @@ void CharNormalizeOutline(MFOUTLINE Outline,
  First = Outline;
  Current = First;
  do {
-    CurrentPoint = PointAt (Current);
-    CurrentPoint->Point.x =
-      (CurrentPoint->Point.x - XCenter) * XScale;
-    CurrentPoint->Point.y =
-      (CurrentPoint->Point.y - YCenter) * YScale;
+    CurrentPoint = PointAt(Current);
+    FCOORD pos(CurrentPoint->Point.x, CurrentPoint->Point.y);
+    cn_denorm.LocalNormTransform(pos, &pos);
+    CurrentPoint->Point.x = (pos.x() - MAX_UINT8 / 2) * MF_SCALE_FACTOR;
+    CurrentPoint->Point.y = (pos.y() - MAX_UINT8 / 2) * MF_SCALE_FACTOR;

-    Current = NextPointAfter (Current);
+    Current = NextPointAfter(Current);
  }
  while (Current != First);

--- a/classify/mfoutline.h
+++ b/classify/mfoutline.h
@ -21,10 +21,10 @@
 /**----------------------------------------------------------------------------
          Include Files and Type Defines
 ----------------------------------------------------------------------------**/
+#include "blobs.h"
 #include "host.h"
 #include "oldlist.h"
 #include "fpoint.h"
-#include "baseline.h"
 #include "params.h"

 #define NORMAL_X_HEIGHT   (0.5)
@ -68,7 +68,7 @@ typedef enum {
 #define AverageOf(A,B)    (((A) + (B)) / 2)

 /* macro for computing the scale factor to use to normalize characters */
-#define MF_SCALE_FACTOR  (NORMAL_X_HEIGHT / BASELINE_SCALE)
+#define MF_SCALE_FACTOR  (NORMAL_X_HEIGHT / kBlnXHeight)

 /* macros for manipulating micro-feature outlines */
 #define DegenerateOutline(O)  (((O) == NIL_LIST) || ((O) == list_rest(O)))
@ -93,8 +93,6 @@ LIST ConvertOutlines(TESSLINE *Outline,
                     LIST ConvertedOutlines,
                     OUTLINETYPE OutlineType);

-void ComputeOutlineStats(LIST Outlines, OUTLINE_STATS *OutlineStats);
-
 void FilterEdgeNoise(MFOUTLINE Outline, FLOAT32 NoiseSegmentLength);

 void FindDirectionChanges(MFOUTLINE Outline,
@ -119,11 +117,10 @@ void NormalizeOutline(MFOUTLINE Outline,
 -----------------------------------------------------------------------------*/
 void ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction);

-void CharNormalizeOutline(MFOUTLINE Outline,
-                          FLOAT32 XCenter,
-                          FLOAT32 YCenter,
-                          FLOAT32 XScale,
-                          FLOAT32 YScale);
+// Normalizes the Outline in-place using cn_denorm's local transformation,
+// then converts from the integer feature range [0,255] to the clusterer
+// feature range of [-0.5, 0.5].
+void CharNormalizeOutline(MFOUTLINE Outline, const DENORM& cn_denorm);

 void ComputeDirection(MFEDGEPT *Start,
                      MFEDGEPT *Finish,
--- a/classify/mfx.cpp
+++ b/classify/mfx.cpp
@ -59,7 +59,9 @@ MICROFEATURE ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End);
 ----------------------------------------------------------------------------**/

 /*---------------------------------------------------------------------------*/
-CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm) {
+CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm,
+                                const DENORM& cn_denorm,
+                                const INT_FX_RESULT_STRUCT& fx_info) {
 /*
 **      Parameters:
 **              Blob            blob to extract micro-features from
@ -74,35 +76,25 @@ CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm) {
 **      History: 7/21/89, DSJ, Created.
 */
  MICROFEATURES MicroFeatures = NIL_LIST;
-  FLOAT32 XScale, YScale;
  LIST Outlines;
  LIST RemainingOutlines;
  MFOUTLINE Outline;
-  INT_FEATURE_ARRAY blfeatures;
-  INT_FEATURE_ARRAY cnfeatures;
-  INT_FX_RESULT_STRUCT results;

  if (Blob != NULL) {
-    Outlines = ConvertBlob (Blob);
-    if (!ExtractIntFeat(Blob, denorm, blfeatures, cnfeatures, &results))
-      return NULL;
-    XScale = 0.2f / results.Ry;
-    YScale = 0.2f / results.Rx;
+    Outlines = ConvertBlob(Blob);

    RemainingOutlines = Outlines;
    iterate(RemainingOutlines) {
      Outline = (MFOUTLINE) first_node (RemainingOutlines);
-      CharNormalizeOutline (Outline,
-        results.Xmean, results.Ymean,
-        XScale, YScale);
+      CharNormalizeOutline(Outline, cn_denorm);
    }

    RemainingOutlines = Outlines;
    iterate(RemainingOutlines) {
-      Outline = (MFOUTLINE) first_node (RemainingOutlines);
+      Outline = (MFOUTLINE) first_node(RemainingOutlines);
      FindDirectionChanges(Outline, classify_min_slope, classify_max_slope);
      MarkDirectionChanges(Outline);
-      MicroFeatures = ConvertToMicroFeatures (Outline, MicroFeatures);
+      MicroFeatures = ConvertToMicroFeatures(Outline, MicroFeatures);
    }
    FreeOutlines(Outlines);
  }
--- a/classify/mfx.h
+++ b/classify/mfx.h
@ -35,6 +35,8 @@ extern double_VAR_H(classify_max_slope, 2.414213562,
 /**----------------------------------------------------------------------------
          Public Function Prototypes
 ----------------------------------------------------------------------------**/
-CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm);
+CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm,
+                                const DENORM& cn_denorm,
+                                const INT_FX_RESULT_STRUCT& fx_info);

 #endif
--- a/classify/normfeat.cpp
+++ b/classify/normfeat.cpp
@ -59,22 +59,18 @@ FLOAT32 ActualOutlineLength(FEATURE Feature) {
 //     the x center of the grapheme's bounding box.
 //     English: [0.011, 0.31]
 //
-FEATURE_SET ExtractCharNormFeatures(TBLOB *blob, const DENORM& denorm) {
+FEATURE_SET ExtractCharNormFeatures(TBLOB *blob, const DENORM& bl_denorm,
+                                    const DENORM& cn_denorm,
+                                    const INT_FX_RESULT_STRUCT& fx_info) {
  FEATURE_SET feature_set = NewFeatureSet(1);
  FEATURE feature = NewFeature(&CharNormDesc);

-  INT_FEATURE_ARRAY blfeatures;
-  INT_FEATURE_ARRAY cnfeatures;
-  INT_FX_RESULT_STRUCT FXInfo;
-
-  ExtractIntFeat(blob, denorm, blfeatures, cnfeatures, &FXInfo);
-
  feature->Params[CharNormY] =
-      MF_SCALE_FACTOR * (FXInfo.Ymean - BASELINE_OFFSET);
+      MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);
  feature->Params[CharNormLength] =
-      MF_SCALE_FACTOR * FXInfo.Length / LENGTH_COMPRESSION;
-  feature->Params[CharNormRx] = MF_SCALE_FACTOR * FXInfo.Rx;
-  feature->Params[CharNormRy] = MF_SCALE_FACTOR * FXInfo.Ry;
+      MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;
+  feature->Params[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;
+  feature->Params[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;

  AddFeature(feature_set, feature);

--- a/classify/normfeat.h
+++ b/classify/normfeat.h
@ -34,6 +34,8 @@ typedef enum {
 ----------------------------------------------------------------------------**/
 FLOAT32 ActualOutlineLength(FEATURE Feature);

-FEATURE_SET ExtractCharNormFeatures(TBLOB *Blob, const DENORM& denorm);
+FEATURE_SET ExtractCharNormFeatures(TBLOB *Blob, const DENORM& bl_denorm,
+                                    const DENORM& cn_denorm,
+                                    const INT_FX_RESULT_STRUCT& fx_info);

 #endif
--- a/classify/normmatch.cpp
+++ b/classify/normmatch.cpp
@ -94,7 +94,7 @@ FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId,
  PROTOTYPE *Proto;
  int ProtoId;

-  if(ClassId > NormProtos->NumProtos) {
+  if (ClassId > NormProtos->NumProtos) {
    ClassId = NO_CLASS;
  }

--- a/classify/ocrfeatures.cpp
+++ b/classify/ocrfeatures.cpp
@ -230,7 +230,7 @@ void WriteFeature(FILE *File, FEATURE Feature) {
  int i;

  for (i = 0; i < Feature->Type->NumParams; i++) {
-#ifndef _WIN32
+#ifndef WIN32
    assert(!isnan(Feature->Params[i]));
 #endif
    fprintf(File, " %g", Feature->Params[i]);
--- a/classify/ocrfeatures.h
+++ b/classify/ocrfeatures.h
@ -26,6 +26,7 @@
 #include <stdio.h>

 class DENORM;
+struct INT_FX_RESULT_STRUCT;

 #undef Min
 #undef Max
@ -78,7 +79,8 @@ typedef FEATURE_SET_STRUCT *FEATURE_SET;
 // classifier does not need to know the details of this data structure.
 typedef char *CHAR_FEATURES;

-typedef FEATURE_SET (*FX_FUNC) (TBLOB *, const DENORM&);
+typedef FEATURE_SET (*FX_FUNC)(TBLOB *, const DENORM&, const DENORM&,
+                               const INT_FX_RESULT_STRUCT&);

 struct FEATURE_EXT_STRUCT {
  FX_FUNC Extractor;             // func to extract features
--- a/classify/picofeat.cpp
+++ b/classify/picofeat.cpp
@ -224,7 +224,9 @@ void NormalizePicoX(FEATURE_SET FeatureSet) {
 }                                /* NormalizePicoX */

 /*---------------------------------------------------------------------------*/
-FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) {
+FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& bl_denorm,
+                                 const DENORM& cn_denorm,
+                                 const INT_FX_RESULT_STRUCT& fx_info) {
 /*
 ** Parameters:
 **   blob    blob to extract features from
@ -233,8 +235,8 @@ FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) {
 ** Exceptions: none
 ** History: 8/8/2011, rays, Created.
 */
-  tesseract::TrainingSample* sample = GetIntFeatures(
-      tesseract::NM_CHAR_ANISOTROPIC, blob, denorm);
+  tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample(
+      *blob, tesseract::NM_CHAR_ANISOTROPIC, false);
  if (sample == NULL) return NULL;

  int num_features = sample->num_features();
@ -254,7 +256,9 @@ FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) {
 }                                /* ExtractIntCNFeatures */

 /*---------------------------------------------------------------------------*/
-FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& denorm) {
+FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& bl_denorm,
+                                  const DENORM& cn_denorm,
+                                  const INT_FX_RESULT_STRUCT& fx_info) {
 /*
 ** Parameters:
 **   blob    blob to extract features from
@ -263,8 +267,8 @@ FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& denorm) {
 ** Exceptions: none
 ** History: 8/8/2011, rays, Created.
 */
-  tesseract::TrainingSample* sample = GetIntFeatures(
-      tesseract::NM_CHAR_ANISOTROPIC, blob, denorm);
+  tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample(
+      *blob, tesseract::NM_CHAR_ANISOTROPIC, false);
  if (sample == NULL) return NULL;

  FEATURE_SET feature_set = NewFeatureSet(1);
--- a/classify/picofeat.h
+++ b/classify/picofeat.h
@ -58,8 +58,12 @@ extern double_VAR_H(classify_pico_feature_length, 0.05, "Pico Feature Length");
 ----------------------------------------------------------------------------**/
 #define GetPicoFeatureLength()  (PicoFeatureLength)

-FEATURE_SET ExtractIntCNFeatures(TBLOB *Blob, const DENORM& denorm);
-FEATURE_SET ExtractIntGeoFeatures(TBLOB *Blob, const DENORM& denorm);
+FEATURE_SET ExtractIntCNFeatures(TBLOB *Blob, const DENORM& bl_denorm,
+                                 const DENORM& cn_denorm,
+                                 const INT_FX_RESULT_STRUCT& fx_info);
+FEATURE_SET ExtractIntGeoFeatures(TBLOB *Blob, const DENORM& bl_denorm,
+                                  const DENORM& cn_denorm,
+                                  const INT_FX_RESULT_STRUCT& fx_info);

 /**----------------------------------------------------------------------------
        Global Data Definitions and Declarations
--- a/classify/shapeclassifier.cpp
+++ b/classify/shapeclassifier.cpp
@ -0,0 +1,230 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File:        shapeclassifier.cpp
+// Description: Base interface class for classifiers that return a
+//              shape index.
+// Author:      Ray Smith
+// Created:     Thu Dec 15 15:24:27 PST 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "shapeclassifier.h"
+#include "genericvector.h"
+#include "scrollview.h"
+#include "shapetable.h"
+#include "svmnode.h"
+#include "trainingsample.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Classifies the given [training] sample and writes per-unichar ratings to
+// results. See shapeclassifier.h for a full description.
+// This default implementation clears results, runs the ShapeRating overload
+// (ClassifySample) and feeds each shape result to
+// ShapeTable::AddShapeToResults. Returns the final size of results.
+int ShapeClassifier::UnicharClassifySample(
+    const TrainingSample& sample, Pix* page_pix, int debug,
+    UNICHAR_ID keep_this, GenericVector<UnicharRating>* results) {
+  results->truncate(0);
+  GenericVector<ShapeRating> shape_ratings;
+  const int shape_count = ClassifySample(sample, page_pix, debug, keep_this,
+                                         &shape_ratings);
+  const ShapeTable* shape_table = GetShapeTable();
+  // Scratch map handed to AddShapeToResults: one slot per unichar, all -1.
+  GenericVector<int> unichar_map;
+  unichar_map.init_to_size(shape_table->unicharset().size(), -1);
+  int r = 0;
+  while (r < shape_count) {
+    shape_table->AddShapeToResults(shape_ratings[r], &unichar_map, results);
+    ++r;
+  }
+  return results->size();
+}
+
+// Classifies the given [training] sample, writing ShapeRating results.
+// See shapeclassifier.h for a full description.
+// Default implementation aborts via ASSERT_HOST: per the header, neither
+// classify overload is pure but at least one must be overridden, so reaching
+// this body means the subclass overrode neither.
+// keep_this is spelled UNICHAR_ID to match the declaration in
+// shapeclassifier.h.
+int ShapeClassifier::ClassifySample(const TrainingSample& sample, Pix* page_pix,
+                                    int debug, UNICHAR_ID keep_this,
+                                    GenericVector<ShapeRating>* results) {
+  ASSERT_HOST("Must implement ClassifySample!" == NULL);
+  return 0;
+}
+
+// Classifies the sample with unichar_id as the keep_this value and returns
+// the shape_id of the first result whose shape contains unichar_id (the best
+// one, provided ClassifySample returns results sorted by descending rating).
+// If result is not NULL, it receives a copy of that ShapeRating.
+// Returns -1 if no result contains unichar_id.
+// Does not need to be overridden if ClassifySample respects the keep_this
+// rule.
+int ShapeClassifier::BestShapeForUnichar(const TrainingSample& sample,
+                                         Pix* page_pix, UNICHAR_ID unichar_id,
+                                         ShapeRating* result) {
+  GenericVector<ShapeRating> ratings;
+  const ShapeTable* shape_table = GetShapeTable();
+  const int rating_count =
+      ClassifySample(sample, page_pix, 0, unichar_id, &ratings);
+  for (int i = 0; i < rating_count; ++i) {
+    const ShapeRating& candidate = ratings[i];
+    const Shape& shape = shape_table->GetShape(candidate.shape_id);
+    if (!shape.ContainsUnichar(unichar_id))
+      continue;
+    if (result != NULL)
+      *result = candidate;
+    return candidate.shape_id;
+  }
+  return -1;
+}
+
+// Returns the UNICHARSET of the ShapeTable that this classifier works with.
+// Dereferences GetShapeTable() unconditionally, so it only needs to be
+// overridden if GetShapeTable() can return NULL.
+const UNICHARSET& ShapeClassifier::GetUnicharset() const {
+  const ShapeTable* shape_table = GetShapeTable();
+  return shape_table->unicharset();
+}
+
+// Visual debugger classifies the given sample, displays the results and
+// solicits user input to display other classifications. Returns when
+// the user has finished with debugging the sample (a click in, or the
+// destruction of, the debug window).
+// Probably doesn't need to be overridden if the subclass provides
+// DisplayClassifyAs.
+//   sample:     sample whose features are rendered and classified.
+//   page_pix:   passed through unchanged to the classify/display calls.
+//   unichar_id: initial class to debug; a negative value classifies with no
+//               class to keep until the user picks one from the popup menu.
+void ShapeClassifier::DebugDisplay(const TrainingSample& sample,
+                                   Pix* page_pix,
+                                   UNICHAR_ID unichar_id) {
+  // Created once on first use and never deleted by this function.
+  static ScrollView* terminator = NULL;
+  if (terminator == NULL) {
+    terminator = new ScrollView("XIT", 0, 0, 50, 50, 50, 50, true);
+  }
+  ScrollView* debug_win = CreateFeatureSpaceWindow("ClassifierDebug", 0, 0);
+  // Provide a right-click menu to choose the class.
+  // NOTE(review): popup_menu is not deleted here - confirm whether BuildMenu
+  // takes ownership of the root node.
+  SVMenuNode* popup_menu = new SVMenuNode();
+  popup_menu->AddChild("Choose class to debug", 0, "x", "Class to debug");
+  popup_menu->BuildMenu(debug_win, false);
+  // Display the features in green.
+  const INT_FEATURE_STRUCT* features = sample.features();
+  int num_features = sample.num_features();
+  for (int f = 0; f < num_features; ++f) {
+    RenderIntFeature(debug_win, &features[f], ScrollView::GREEN);
+  }
+  debug_win->Update();
+  GenericVector<UnicharRating> results;
+  // Debug classification until the user quits.
+  const UNICHARSET& unicharset = GetUnicharset();
+  SVEvent* ev;
+  SVEventType ev_type;
+  do {
+    // Scoped to one pass of the outer loop, so windows created for the
+    // current class go away before the next class is displayed.
+    PointerVector<ScrollView> windows;
+    if (unichar_id >= 0) {
+      tprintf("Debugging class %d = %s\n",
+              unichar_id, unicharset.id_to_unichar(unichar_id));
+      UnicharClassifySample(sample, page_pix, 1, unichar_id, &results);
+      DisplayClassifyAs(sample, page_pix, unichar_id, 1, &windows);
+      tprintf("Debugged class %d = %s\n",
+              unichar_id, unicharset.id_to_unichar(unichar_id));
+    } else {
+      tprintf("Invalid unichar_id: %d\n", unichar_id);
+      UnicharClassifySample(sample, page_pix, 1, -1, &results);
+    }
+    tprintf("Right-click in ClassifierDebug window to choose debug class,");
+    tprintf(" Left-click or close window to quit...\n");
+    UNICHAR_ID old_unichar_id;
+    // Wait until the user either selects a different class from the popup
+    // menu or asks to quit (click / window destroyed).
+    do {
+      old_unichar_id = unichar_id;
+      ev = debug_win->AwaitEvent(SVET_ANY);
+      ev_type = ev->type;
+      if (ev_type == SVET_POPUP) {
+        if (unicharset.contains_unichar(ev->parameter)) {
+          unichar_id = unicharset.unichar_to_id(ev->parameter);
+        } else {
+          // Terminate the line so later debug output starts on a fresh line.
+          tprintf("Char class '%s' not found in unicharset\n", ev->parameter);
+        }
+      }
+      delete ev;
+    } while (unichar_id == old_unichar_id &&
+             ev_type != SVET_CLICK && ev_type != SVET_DESTROY);
+  } while (ev_type != SVET_CLICK && ev_type != SVET_DESTROY);
+  delete debug_win;
+}
+
+// Displays classification as the given unichar_id. Creates as many windows
+// as it feels fit, using index as a guide for placement. Adds any created
+// windows to the windows output and returns a new index that may be used
+// by any subsequent classifiers. Caller waits for the user to view and
+// then destroys the windows by clearing the vector.
+// This default implementation creates no windows and returns index unchanged.
+int ShapeClassifier::DisplayClassifyAs(
+    const TrainingSample& sample, Pix* page_pix,
+    UNICHAR_ID unichar_id, int index,
+    PointerVector<ScrollView>* windows) {
+  // Does nothing in the default implementation.
+  return index;
+}
+
+// Prints debug information on the unichar results: context as a title line,
+// then one line per result giving its rating, unichar id, unichar string
+// and, when the result's fonts vector is non-empty, every entry of it.
+void ShapeClassifier::UnicharPrintResults(
+    const char* context, const GenericVector<UnicharRating>& results) const {
+  tprintf("%s\n", context);
+  const int num_results = results.size();
+  for (int r = 0; r < num_results; ++r) {
+    const UnicharRating& entry = results[r];
+    tprintf("%g: c_id=%d=%s", entry.rating, entry.unichar_id,
+            GetUnicharset().id_to_unichar(entry.unichar_id));
+    const int num_fonts = entry.fonts.size();
+    if (num_fonts > 0) {
+      tprintf(" Font Vector:");
+      for (int f = 0; f < num_fonts; ++f) {
+        tprintf(" %d", entry.fonts[f]);
+      }
+    }
+    tprintf("\n");
+  }
+}
+// Prints debug information on the shape results: context as a title line,
+// then one line per result giving its rating, a [J] and/or [B] marker when
+// the result's joined/broken flag is set, and the ShapeTable debug string
+// for its shape_id.
+void ShapeClassifier::PrintResults(
+    const char* context, const GenericVector<ShapeRating>& results) const {
+  tprintf("%s\n", context);
+  const int num_results = results.size();
+  for (int r = 0; r < num_results; ++r) {
+    const ShapeRating& entry = results[r];
+    tprintf("%g:", entry.rating);
+    if (entry.joined) {
+      tprintf("[J]");
+    }
+    if (entry.broken) {
+      tprintf("[B]");
+    }
+    tprintf(" %s\n", GetShapeTable()->DebugStr(entry.shape_id).string());
+  }
+}
+
+// Removes any result that has all its unichars covered by a better choice,
+// regardless of font. "Better" means earlier in results: a result is dropped
+// when every unichar of its shape appears in the shape of at least one
+// earlier entry of the (unfiltered) input. The first result is always kept.
+void ShapeClassifier::FilterDuplicateUnichars(
+    GenericVector<ShapeRating>* results) const {
+  GenericVector<ShapeRating> kept;
+  const ShapeTable* shape_table = GetShapeTable();
+  for (int r = 0; r < results->size(); ++r) {
+    // Only results after the first can be covered by an earlier one.
+    bool covered = r > 0;
+    if (covered) {
+      const Shape& candidate = shape_table->GetShape((*results)[r].shape_id);
+      // Stop at the first unichar that no earlier result contains.
+      for (int c = 0; covered && c < candidate.size(); ++c) {
+        const int unichar_id = candidate[c].unichar_id;
+        bool seen_earlier = false;
+        for (int s = 0; !seen_earlier && s < r; ++s) {
+          const Shape& earlier = shape_table->GetShape((*results)[s].shape_id);
+          seen_earlier = earlier.ContainsUnichar(unichar_id);
+        }
+        covered = seen_earlier;
+      }
+    }
+    if (!covered)
+      kept.push_back((*results)[r]);
+  }
+  *results = kept;
+}
+
+}  // namespace tesseract.
+
+
+
+
+
--- a/classify/shapeclassifier.h
+++ b/classify/shapeclassifier.h
@ -23,44 +23,21 @@
 #ifndef TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_
 #define TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_

+#include "unichar.h"
+
 template <typename T> class GenericVector;
 struct Pix;
+class ScrollView;
+class UNICHARSET;

 namespace tesseract {

+template <typename T> class PointerVector;
+struct ShapeRating;
 class ShapeTable;
 class TrainingSample;
-
-// Classifier result from a low-level classification is an index into some
-// ShapeTable and a rating.
-struct ShapeRating {
-  ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f) {}
-  ShapeRating(int s, float r)
-    : shape_id(s), rating(r), raw(1.0f), font(0.0f) {}
-
-  // Sort function to sort ratings appropriately by descending rating.
-  static int SortDescendingRating(const void* t1, const void* t2) {
-    const ShapeRating* a = reinterpret_cast<const ShapeRating *>(t1);
-    const ShapeRating* b = reinterpret_cast<const ShapeRating *>(t2);
-    if (a->rating > b->rating) {
-      return -1;
-    } else if (a->rating < b->rating) {
-      return 1;
-    } else {
-      return a->shape_id - b->shape_id;
-    }
-  }
-
-  // Index into some shape table indicates the class of the answer.
-  int shape_id;
-  // Rating from classifier with 1.0 perfect and 0.0 impossible.
-  // Call it a probability if you must.
-  float rating;
-  // Subsidiary rating that a classifier may use internally.
-  float raw;
-  // Subsidiary rating that a classifier may use internally.
-  float font;
-};
+class TrainingSampleSet;
+struct UnicharRating;

 // Interface base class for classifiers that produce ShapeRating results.
 class ShapeClassifier {
@ -76,18 +53,70 @@ class ShapeClassifier {
  // to get the appropriate tesseract features.
  // If debug is non-zero, then various degrees of classifier dependent debug
  // information is provided.
-  // If keep_this (a shape index) is >= 0, then the results should always
+  // If keep_this (a UNICHAR_ID) is >= 0, then the results should always
  // contain keep_this, and (if possible) anything of intermediate confidence.
-  // (Used for answering "Why didn't it get that right?" questions.)
+  // (Used for answering "Why didn't it get that right?" questions.) It must
+  // be a UNICHAR_ID as the callers have no clue how to choose the best shape
+  // that may contain a desired answer.
  // The return value is the number of classes saved in results.
-  // NOTE that overriding functions MUST clear results unless the classifier
-  // is working with a team of such classifiers.
+  // NOTE that overriding functions MUST clear and sort the results by
+  // descending rating unless the classifier is working with a team of such
+  // classifiers.
+  // NOTE: Neither overload of ClassifySample is pure, but at least one must
+  // be overridden by a classifier in order for it to do anything.
+  virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix,
+                                    int debug, UNICHAR_ID keep_this,
+                                    GenericVector<UnicharRating>* results);
+
+ protected:
  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
-                             int debug, int keep_this,
-                             GenericVector<ShapeRating>* results) = 0;
+                             int debug, UNICHAR_ID keep_this,
+                             GenericVector<ShapeRating>* results);
+
+ public:
+  // Returns the shape that contains unichar_id that has the best result.
+  // If result is not NULL, it is set with the shape_id and rating.
+  // Returns -1 if ClassifySample fails to provide any result containing
+  // unichar_id. BestShapeForUnichar does not need to be overridden if
+  // ClassifySample respects the keep_this rule.
+  virtual int BestShapeForUnichar(const TrainingSample& sample, Pix* page_pix,
+                                  UNICHAR_ID unichar_id, ShapeRating* result);

  // Provides access to the ShapeTable that this classifier works with.
  virtual const ShapeTable* GetShapeTable() const = 0;
+  // Provides access to the UNICHARSET that this classifier works with.
+  // Must be overridden IFF GetShapeTable() returns NULL.
+  virtual const UNICHARSET& GetUnicharset() const;
+
+  // Visual debugger classifies the given sample, displays the results and
+  // solicits user input to display other classifications. Returns when
+  // the user has finished with debugging the sample.
+  // Probably doesn't need to be overridden if the subclass provides
+  // DisplayClassifyAs.
+  virtual void DebugDisplay(const TrainingSample& sample, Pix* page_pix,
+                            UNICHAR_ID unichar_id);
+
+
+  // Displays classification as the given unichar_id. Creates as many windows
+  // as it feels fit, using index as a guide for placement. Adds any created
+  // windows to the windows output and returns a new index that may be used
+  // by any subsequent classifiers. Caller waits for the user to view and
+  // then destroys the windows by clearing the vector.
+  virtual int DisplayClassifyAs(const TrainingSample& sample,  Pix* page_pix,
+                                UNICHAR_ID unichar_id, int index,
+                                PointerVector<ScrollView>* windows);
+
+  // Prints debug information on the results. context is some introductory/title
+  // message.
+  virtual void UnicharPrintResults(
+      const char* context, const GenericVector<UnicharRating>& results) const;
+  virtual void PrintResults(const char* context,
+                            const GenericVector<ShapeRating>& results) const;
+
+ protected:
+  // Removes any result that has all its unichars covered by a better choice,
+  // regardless of font.
+  void FilterDuplicateUnichars(GenericVector<ShapeRating>* results) const;
 };

 }  // namespace tesseract.
--- a/classify/shapetable.cpp
+++ b/classify/shapetable.cpp
@ -22,12 +22,47 @@

 #include "shapetable.h"

+#include "bitvector.h"
+#include "fontinfo.h"
 #include "intfeaturespace.h"
 #include "strngs.h"
 #include "unicharset.h"
+#include "unicity_table.h"

 namespace tesseract {

+// Scans results in order and returns the index of the first entry whose
+// shape (looked up in shape_table by shape_id) contains unichar_id. When the
+// results are sorted by rating this is also the best such result.
+// Returns -1 if no result's shape contains unichar_id.
+int ShapeRating::FirstResultWithUnichar(
+    const GenericVector<ShapeRating>& results,
+    const ShapeTable& shape_table,
+    UNICHAR_ID unichar_id) {
+  const int num_results = results.size();
+  int index = 0;
+  while (index < num_results &&
+         !shape_table.GetShape(results[index].shape_id)
+              .ContainsUnichar(unichar_id)) {
+    ++index;
+  }
+  return index < num_results ? index : -1;
+}
+
+// Scans results in order and returns the index of the first entry whose
+// unichar_id equals the requested one. When the results are sorted by rating
+// this is also the best such result.
+// Returns -1 if the unichar_id is not found.
+int UnicharRating::FirstResultWithUnichar(
+    const GenericVector<UnicharRating>& results,
+    UNICHAR_ID unichar_id) {
+  int found = -1;
+  for (int r = 0; found < 0 && r < results.size(); ++r) {
+    if (results[r].unichar_id == unichar_id)
+      found = r;
+  }
+  return found;
+}
+
 // Writes to the given file. Returns false in case of error.
 bool UnicharAndFonts::Serialize(FILE* fp) const {
  if (fwrite(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
@ -138,6 +173,39 @@ bool Shape::ContainsFont(int font_id) const {
  }
  return false;
 }
+// Returns true if the shape contains the given font properties, ignoring
+// unichar_id: i.e. if any font id listed for any unichar of this shape maps,
+// via font_table, to a FontInfo whose properties equal the given value.
+bool Shape::ContainsFontProperties(const FontInfoTable& font_table,
+                                   uinT32 properties) const {
+  for (int c = 0; c < unichars_.size(); ++c) {
+    // Read-only view: this is a const method and must not expose the font
+    // list for modification.
+    const GenericVector<int>& font_list = unichars_[c].font_ids;
+    for (int f = 0; f < font_list.size(); ++f) {
+      if (font_table.get(font_list[f]).properties == properties)
+        return true;
+    }
+  }
+  return false;
+}
+// Returns true if the shape contains multiple different font properties,
+// ignoring unichar_id. The properties of the first font encountered serve as
+// the reference; any later font whose properties differ makes the result
+// true. A shape with no unichars, or with no fonts at all, returns false
+// rather than indexing an empty vector.
+bool Shape::ContainsMultipleFontProperties(
+    const FontInfoTable& font_table) const {
+  bool have_reference = false;
+  uinT32 reference = 0;
+  for (int c = 0; c < unichars_.size(); ++c) {
+    const GenericVector<int>& font_list = unichars_[c].font_ids;
+    for (int f = 0; f < font_list.size(); ++f) {
+      const uinT32 properties = font_table.get(font_list[f]).properties;
+      if (!have_reference) {
+        reference = properties;
+        have_reference = true;
+      } else if (properties != reference) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Returns true if this shape is equal to other (ignoring order of unichars
+// and fonts), established as mutual inclusion: each shape must be a subset
+// of the other.
+bool Shape::operator==(const Shape& other) const {
+  if (!IsSubsetOf(other))
+    return false;
+  return other.IsSubsetOf(*this);
+}

 // Returns true if this is a subset (including equal) of other.
 bool Shape::IsSubsetOf(const Shape& other) const {
@ -172,10 +240,10 @@ void Shape::SortUnichars() {
  unichars_sorted_ = true;
 }

-ShapeTable::ShapeTable() : unicharset_(NULL) {
+ShapeTable::ShapeTable() : unicharset_(NULL), num_fonts_(0) {
 }
 ShapeTable::ShapeTable(const UNICHARSET& unicharset)
-  : unicharset_(&unicharset) {
+  : unicharset_(&unicharset), num_fonts_(0) {
 }

 // Writes to the given file. Returns false in case of error.
@ -187,9 +255,38 @@ bool ShapeTable::Serialize(FILE* fp) const {
 // If swap is true, assumes a big/little-endian swap is needed.
 bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
  if (!shape_table_.DeSerialize(swap, fp)) return false;
+  num_fonts_ = 0;
  return true;
 }

+// Returns the number of fonts used in this ShapeTable, i.e. one more than
+// the largest font id found in any shape. A positive num_fonts_ is treated
+// as a valid cached answer; otherwise every shape is scanned and num_fonts_
+// is updated with the result before it is returned.
+int ShapeTable::NumFonts() const {
+  if (num_fonts_ > 0)
+    return num_fonts_;
+  const int num_shapes = shape_table_.size();
+  for (int s = 0; s < num_shapes; ++s) {
+    const Shape& shape = *shape_table_[s];
+    for (int c = 0; c < shape.size(); ++c) {
+      const GenericVector<int>& fonts = shape[c].font_ids;
+      for (int f = 0; f < fonts.size(); ++f) {
+        const int needed = fonts[f] + 1;
+        if (needed > num_fonts_)
+          num_fonts_ = needed;
+      }
+    }
+  }
+  return num_fonts_;
+}
+
+// Re-indexes the class_ids in the shapetable according to the given map:
+// every unichar_id in every shape is replaced by unicharset_map[unichar_id].
+// Useful in conjunction with set_unicharset.
+void ShapeTable::ReMapClassIds(const GenericVector<int>& unicharset_map) {
+  const int num_shapes = shape_table_.size();
+  for (int s = 0; s < num_shapes; ++s) {
+    Shape* shape = shape_table_[s];
+    for (int c = 0; c < shape->size(); ++c) {
+      const int old_id = (*shape)[c].unichar_id;
+      shape->SetUnicharId(c, unicharset_map[old_id]);
+    }
+  }
+}
+
 // Returns a string listing the classes/fonts in a shape.
 STRING ShapeTable::DebugStr(int shape_id) const {
  if (shape_id < 0 || shape_id >= shape_table_.size())
@ -251,15 +348,22 @@ int ShapeTable::AddShape(int unichar_id, int font_id) {
  Shape* shape = new Shape;
  shape->AddToShape(unichar_id, font_id);
  shape_table_.push_back(shape);
+  num_fonts_ = MAX(num_fonts_, font_id + 1);
  return index;
 }

-// Adds a copy of the given shape.
-// Returns the assigned index.
+// Adds a copy of the given shape unless it is already present.
+// Returns the assigned index or index of existing shape if already present.
 int ShapeTable::AddShape(const Shape& other) {
-  int index = shape_table_.size();
-  Shape* shape = new Shape(other);
-  shape_table_.push_back(shape);
+  int index;
+  for (index = 0; index < shape_table_.size() &&
+       !(other == *shape_table_[index]); ++index)
+    continue;
+  if (index == shape_table_.size()) {
+    Shape* shape = new Shape(other);
+    shape_table_.push_back(shape);
+  }
+  num_fonts_ = 0;
  return index;
 }

@ -275,12 +379,14 @@ void ShapeTable::DeleteShape(int shape_id) {
 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
  Shape& shape = *shape_table_[shape_id];
  shape.AddToShape(unichar_id, font_id);
+  num_fonts_ = MAX(num_fonts_, font_id + 1);
 }

 // Adds the given shape to the existing shape with the given index.
 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
  Shape& shape = *shape_table_[shape_id];
  shape.AddShape(other);
+  num_fonts_ = 0;
 }

 // Returns the id of the shape that contains the given unichar and font.
@ -316,25 +422,26 @@ void ShapeTable::GetFirstUnicharAndFont(int shape_id,
 // a ShapeTable.
 int ShapeTable::BuildFromShape(const Shape& shape,
                               const ShapeTable& master_shapes) {
-  int num_masters = 0;
+  BitVector shape_map(master_shapes.NumShapes());
  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
    for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
      int c = shape[u_ind].unichar_id;
      int f = shape[u_ind].font_ids[f_ind];
-      if (FindShape(c, f) < 0) {
-        int shape_id = AddShape(c, f);
-        int master_id = master_shapes.FindShape(c, f);
-        if (master_id >= 0 && shape.size() > 1) {
-          const Shape& master = master_shapes.GetShape(master_id);
-          if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) {
-            // Add everything else from the master shape.
-            shape_table_[shape_id]->AddShape(master);
-            ++num_masters;
-          }
-        }
+      int master_id = master_shapes.FindShape(c, f);
+      if (master_id >= 0) {
+        shape_map.SetBit(master_id);
+      } else if (FindShape(c, f) < 0) {
+        AddShape(c, f);
      }
    }
  }
+  int num_masters = 0;
+  for (int s = 0; s < master_shapes.NumShapes(); ++s) {
+    if (shape_map[s]) {
+      AddShape(master_shapes.GetShape(s));
+      ++num_masters;
+    }
+  }
  return num_masters;
 }

@ -381,7 +488,7 @@ void ShapeTable::ForceFontMerges(int start, int end) {
    }
  }
  ShapeTable compacted(*unicharset_);
-  compacted.AppendMasterShapes(*this);
+  compacted.AppendMasterShapes(*this, NULL);
  *this = compacted;
 }

@ -422,6 +529,13 @@ void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
 }

+// Exchanges the shapes stored at the two given shape_ids.
+void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
+  Shape* second = shape_table_[shape_id2];
+  shape_table_[shape_id2] = shape_table_[shape_id1];
+  shape_table_[shape_id1] = second;
+}
+
 // Returns the destination of this shape, (if merged), taking into account
 // the fact that the destination may itself have been merged.
 int ShapeTable::MasterDestinationIndex(int shape_id) const {
@ -435,11 +549,129 @@ int ShapeTable::MasterDestinationIndex(int shape_id) const {
  return master_id;
 }

+// Returns true if the unichars of one of the two shapes are a subset of the
+// unichars of the other; returns false if neither is a subset of the other.
+bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
+  const Shape& first = GetShape(shape_id1);
+  const Shape& second = GetShape(shape_id2);
+  // Is every unichar of the first shape also present in the second?
+  bool first_in_second = true;
+  for (int i = 0; first_in_second && i < first.size(); ++i) {
+    first_in_second = second.ContainsUnichar(first[i].unichar_id);
+  }
+  if (first_in_second) return true;
+  // Otherwise the answer depends on whether every unichar of the second
+  // shape is present in the first.
+  for (int i = 0; i < second.size(); ++i) {
+    if (!first.ContainsUnichar(second[i].unichar_id)) return false;
+  }
+  return true;
+}
+
+// Returns true if every unichar of shape_id appears in at least one of
+// merge_id1 and merge_id2, or if every unichar of both merge_id1 and
+// merge_id2 appears in shape_id. Returns false otherwise.
+bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2,
+                                    int shape_id) const {
+  const Shape& merge1 = GetShape(merge_id1);
+  const Shape& merge2 = GetShape(merge_id2);
+  const Shape& shape = GetShape(shape_id);
+  // Test whether shape is a subset of the two merge shapes combined.
+  bool shape_in_merge = true;
+  for (int s = 0; shape_in_merge && s < shape.size(); ++s) {
+    int unichar_id = shape[s].unichar_id;
+    shape_in_merge = merge1.ContainsUnichar(unichar_id) ||
+                     merge2.ContainsUnichar(unichar_id);
+  }
+  if (shape_in_merge) return true;
+  // Otherwise both merge shapes must be subsets of shape.
+  for (int m = 0; m < merge1.size(); ++m) {
+    if (!shape.ContainsUnichar(merge1[m].unichar_id)) return false;
+  }
+  for (int m = 0; m < merge2.size(); ++m) {
+    if (!shape.ContainsUnichar(merge2[m].unichar_id)) return false;
+  }
+  return true;
+}
+
+// Returns true if the two shapes contain exactly the same set of unichars,
+// i.e. each unichar of either shape is also contained in the other.
+bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
+  const Shape& first = GetShape(shape_id1);
+  const Shape& second = GetShape(shape_id2);
+  bool equal = true;
+  // Each loop stops at the first unichar missing from the other shape.
+  for (int i = 0; equal && i < first.size(); ++i) {
+    equal = second.ContainsUnichar(first[i].unichar_id);
+  }
+  for (int i = 0; equal && i < second.size(); ++i) {
+    equal = first.ContainsUnichar(second[i].unichar_id);
+  }
+  return equal;
+}
+
+// Returns true if the unichars of shape_id are exactly the combined unichars
+// of merge_id1 and merge_id2: every unichar of shape_id appears in at least
+// one of the merge shapes, and every unichar of each merge shape appears in
+// shape_id.
+bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2,
+                                    int shape_id) const {
+  const Shape& merge1 = GetShape(merge_id1);
+  const Shape& merge2 = GetShape(merge_id2);
+  const Shape& shape = GetShape(shape_id);
+  bool equal = true;
+  // Fails if shape has a unichar that appears in neither merge shape.
+  for (int s = 0; equal && s < shape.size(); ++s) {
+    int unichar_id = shape[s].unichar_id;
+    equal = merge1.ContainsUnichar(unichar_id) ||
+            merge2.ContainsUnichar(unichar_id);
+  }
+  // Fails if either merge shape has a unichar that is not in shape.
+  for (int m = 0; equal && m < merge1.size(); ++m) {
+    equal = shape.ContainsUnichar(merge1[m].unichar_id);
+  }
+  for (int m = 0; equal && m < merge2.size(); ++m) {
+    equal = shape.ContainsUnichar(merge2[m].unichar_id);
+  }
+  return equal;
+}
+
+// Returns true if at least one unichar of shape_id1 is also contained in
+// shape_id2.
+bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
+  const Shape& first = GetShape(shape_id1);
+  const Shape& second = GetShape(shape_id2);
+  // Advance until a shared unichar is found or the first shape is exhausted.
+  int i = 0;
+  while (i < first.size() && !second.ContainsUnichar(first[i].unichar_id)) {
+    ++i;
+  }
+  return i < first.size();
+}
+
+// Returns true if any font id listed for any unichar of shape_id1 is also
+// contained in shape_id2.
+bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const {
+  const Shape& first = GetShape(shape_id1);
+  const Shape& second = GetShape(shape_id2);
+  for (int u = 0; u < first.size(); ++u) {
+    const GenericVector<int>& fonts = first[u].font_ids;
+    int f = 0;
+    while (f < fonts.size() && !second.ContainsFont(fonts[f])) {
+      ++f;
+    }
+    if (f < fonts.size()) return true;  // Stopped early on a shared font.
+  }
+  return false;
+}
+
 // Appends the master shapes from other to this.
-void ShapeTable::AppendMasterShapes(const ShapeTable& other) {
+// If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
+void ShapeTable::AppendMasterShapes(const ShapeTable& other,
+                                    GenericVector<int>* shape_map) {
+  if (shape_map != NULL)
+    shape_map->init_to_size(other.NumShapes(), -1);
  for (int s = 0; s < other.shape_table_.size(); ++s) {
    if (other.shape_table_[s]->destination_index() < 0) {
-      AddShape(*other.shape_table_[s]);
+      int index = AddShape(*other.shape_table_[s]);
+      if (shape_map != NULL)
+        (*shape_map)[s] = index;
    }
  }
 }
@ -455,6 +687,46 @@ int ShapeTable::NumMasterShapes() const {
 }


+// Adds each unichar of the shape named by shape_rating to results, using the
+// rating of shape_rating. A unichar_id that already has an entry in results
+// gets no new entry; the fonts of the shape are appended to the font list of
+// the existing entry instead.
+// If shape_rating is flagged joined and/or broken, UNICHAR_JOINED and/or
+// UNICHAR_BROKEN are added first, in that order.
+// NOTE: it is assumed that the results are given to this function in order
+// of decreasing rating.
+// unichar_map holds, for each unichar, the index of its entry in results, or
+// -1 if the unichar is not yet in results.
+void ShapeTable::AddShapeToResults(const ShapeRating& shape_rating,
+                                   GenericVector<int>* unichar_map,
+                                   GenericVector<UnicharRating>* results)const {
+  const float rating = shape_rating.rating;
+  if (shape_rating.joined)
+    AddUnicharToResults(UNICHAR_JOINED, rating, unichar_map, results);
+  if (shape_rating.broken)
+    AddUnicharToResults(UNICHAR_BROKEN, rating, unichar_map, results);
+  const Shape& shape = GetShape(shape_rating.shape_id);
+  for (int u = 0; u < shape.size(); ++u) {
+    int slot = AddUnicharToResults(shape[u].unichar_id, rating,
+                                   unichar_map, results);
+    (*results)[slot].fonts += shape[u].font_ids;
+  }
+}
+
+// Returns the index in results of the entry for unichar_id. If unichar_map
+// has no entry for it yet (negative value), a new UnicharRating with the
+// given rating is appended to results and its index recorded in unichar_map.
+int ShapeTable::AddUnicharToResults(
+    int unichar_id, float rating, GenericVector<int>* unichar_map,
+    GenericVector<UnicharRating>* results) const {
+  int existing_index = unichar_map->get(unichar_id);
+  if (existing_index >= 0) return existing_index;
+  // push_back reports the index at which the new entry was stored.
+  int new_index = results->push_back(UnicharRating(unichar_id, rating));
+  (*unichar_map)[unichar_id] = new_index;
+  return new_index;
+}
+
+
 }  // namespace tesseract
-
-
--- a/classify/shapetable.h
+++ b/classify/shapetable.h
@ -23,6 +23,8 @@
 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
 #define TESSERACT_CLASSIFY_SHAPETABLE_H_

+#include "bitvector.h"
+#include "genericheap.h"
 #include "genericvector.h"
 #include "intmatcher.h"

@ -31,6 +33,113 @@ class UNICHARSET;

 namespace tesseract {

+struct FontInfo;
+class FontInfoTable;
+class ShapeTable;
+
+// One classifier answer expressed as a unichar: the chosen unichar_id, the
+// rating given to it, and the fonts that go with it.
+struct UnicharRating {
+  UnicharRating() : unichar_id(0), rating(0.0f) {}
+  UnicharRating(int u, float r) : unichar_id(u), rating(r) {}
+
+  // Comparison function over untyped pointers to UnicharRating that orders
+  // higher ratings first, breaking ties by increasing unichar_id.
+  static int SortDescendingRating(const void* t1, const void* t2) {
+    const UnicharRating* lhs = static_cast<const UnicharRating*>(t1);
+    const UnicharRating* rhs = static_cast<const UnicharRating*>(t2);
+    if (lhs->rating > rhs->rating) return -1;
+    if (lhs->rating < rhs->rating) return 1;
+    return lhs->unichar_id - rhs->unichar_id;
+  }
+  // Helper function to get the index of the first result with the required
+  // unichar_id. If the results are sorted by rating, this will also be the
+  // best result with the required unichar_id.
+  // Returns -1 if the unichar_id is not found
+  static int FirstResultWithUnichar(const GenericVector<UnicharRating>& results,
+                                    UNICHAR_ID unichar_id);
+
+  // Index into some UNICHARSET table indicates the class of the answer.
+  UNICHAR_ID unichar_id;
+  // Rating from classifier with 1.0 perfect and 0.0 impossible.
+  // Call it a probability if you must.
+  float rating;
+  // Set of fonts for this shape in order of decreasing preference.
+  // (There is no mechanism for storing scores for fonts as yet.)
+  GenericVector<int> fonts;
+};
+
+// One low-level classifier answer: an index into some ShapeTable together
+// with a rating and a few subsidiary values and flags.
+struct ShapeRating {
+  ShapeRating()
+    : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f),
+      joined(false), broken(false) {}
+  // Note that this constructor starts raw at 1.0, unlike the default one.
+  ShapeRating(int s, float r)
+    : shape_id(s), rating(r), raw(1.0f), font(0.0f),
+      joined(false), broken(false) {}
+
+  // Comparison function over untyped pointers to ShapeRating that orders
+  // higher ratings first, breaking ties by increasing shape_id.
+  static int SortDescendingRating(const void* t1, const void* t2) {
+    const ShapeRating* lhs = static_cast<const ShapeRating*>(t1);
+    const ShapeRating* rhs = static_cast<const ShapeRating*>(t2);
+    if (lhs->rating > rhs->rating) return -1;
+    if (lhs->rating < rhs->rating) return 1;
+    return lhs->shape_id - rhs->shape_id;
+  }
+  // Helper function to get the index of the first result with the required
+  // unichar_id. If the results are sorted by rating, this will also be the
+  // best result with the required unichar_id.
+  // Returns -1 if the unichar_id is not found
+  static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results,
+                                    const ShapeTable& shape_table,
+                                    UNICHAR_ID unichar_id);
+
+  // Index into some shape table indicates the class of the answer.
+  int shape_id;
+  // Rating from classifier with 1.0 perfect and 0.0 impossible.
+  // Call it a probability if you must.
+  float rating;
+  // Subsidiary rating that a classifier may use internally.
+  float raw;
+  // Subsidiary rating that a classifier may use internally.
+  float font;
+  // Flag indicating that the input may be joined.
+  bool joined;
+  // Flag indicating that the input may be broken (a fragment).
+  bool broken;
+};
+
+// Entry type for a heap-based priority queue of ShapeRating: a classifier
+// result plus the tree level it came from.
+struct ShapeQueueEntry {
+  ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {}
+  ShapeQueueEntry(const ShapeRating& rating, int level0)
+    : result(rating), level(level0) {}
+
+  // An entry sorts before another if its rating is higher, or if the ratings
+  // are equal and its level is higher.
+  bool operator<(const ShapeQueueEntry& other) const {
+    const float mine = result.rating;
+    const float theirs = other.result.rating;
+    if (mine > theirs) return true;
+    return mine == theirs && level > other.level;
+  }
+
+  // Output from classifier.
+  ShapeRating result;
+  // Which level in the tree did this come from?
+  int level;
+};
+typedef GenericHeap<ShapeQueueEntry> ShapeQueue;
+
 // Simple struct to hold a set of fonts associated with a single unichar-id.
 // A vector of UnicharAndFonts makes a shape.
 struct UnicharAndFonts {
@ -83,6 +192,10 @@ class Shape {
  const UnicharAndFonts& operator[](int index) const {
    return unichars_[index];
  }
+  // Sets the unichar_id of the given index to the new unichar_id.
+  void SetUnicharId(int index, int unichar_id) {
+    unichars_[index].unichar_id = unichar_id;
+  }
  // Adds a font_id for the given unichar_id. If the unichar_id is not
  // in the shape, it is added.
  void AddToShape(int unichar_id, int font_id);
@ -94,6 +207,16 @@ class Shape {
  bool ContainsUnichar(int unichar_id) const;
  // Returns true if the shape contains the given font, ignoring unichar_id.
  bool ContainsFont(int font_id) const;
+  // Returns true if the shape contains the given font properties, ignoring
+  // unichar_id.
+  bool ContainsFontProperties(const FontInfoTable& font_table,
+                              uinT32 properties) const;
+  // Returns true if the shape contains multiple different font properties,
+  // ignoring unichar_id.
+  bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const;
+  // Returns true if this shape is equal to other (ignoring order of unichars
+  // and fonts).
+  bool operator==(const Shape& other) const;
  // Returns true if this is a subset (including equal) of other.
  bool IsSubsetOf(const Shape& other) const;
  // Returns true if the lists of unichar ids are the same in this and other,
@ -143,11 +266,17 @@ class ShapeTable {
  const UNICHARSET& unicharset() const {
    return *unicharset_;
  }
+  // Returns the number of fonts used in this ShapeTable, computing it if
+  // necessary.
+  int NumFonts() const;
  // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
  // entire life of the ShapeTable.
  void set_unicharset(const UNICHARSET& unicharset) {
    unicharset_ = &unicharset;
  }
+  // Re-indexes the class_ids in the shapetable according to the given map.
+  // Useful in conjunction with set_unicharset.
+  void ReMapClassIds(const GenericVector<int>& unicharset_map);
  // Returns a string listing the classes/fonts in a shape.
  STRING DebugStr(int shape_id) const;
  // Returns a debug string summarizing the table.
@ -156,8 +285,8 @@ class ShapeTable {
  // Adds a new shape starting with the given unichar_id and font_id.
  // Returns the assigned index.
  int AddShape(int unichar_id, int font_id);
-  // Adds a copy of the given shape.
-  // Returns the assigned index.
+  // Adds a copy of the given shape unless it is already present.
+  // Returns the assigned index or index of existing shape if already present.
  int AddShape(const Shape& other);
  // Removes the shape given by the shape index. All indices above are changed!
  void DeleteShape(int shape_id);
@ -204,10 +333,14 @@ class ShapeTable {
  int MergedUnicharCount(int shape_id1, int shape_id2) const;
  // Merges two shape_ids, leaving shape_id2 marked as merged.
  void MergeShapes(int shape_id1, int shape_id2);
+  // Swaps two shape_ids.
+  void SwapShapes(int shape_id1, int shape_id2);
  // Appends the master shapes from other to this.
  // Used to create a clean ShapeTable from a merged one, or to create a
  // copy of a ShapeTable.
-  void AppendMasterShapes(const ShapeTable& other);
+  // If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
+  void AppendMasterShapes(const ShapeTable& other,
+                          GenericVector<int>* shape_map);
  // Returns the number of master shapes remaining after merging.
  int NumMasterShapes() const;
  // Returns the destination of this shape, (if merged), taking into account
@ -215,11 +348,43 @@ class ShapeTable {
  // For a non-merged shape, returns the input shape_id.
  int MasterDestinationIndex(int shape_id) const;

+  // Returns false if the unichars of neither shape are a subset of the other.
+  bool SubsetUnichar(int shape_id1, int shape_id2) const;
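+  // Returns false unless the unichars of shape_id are a subset of the combined
+  // unichars of merge_id1 and merge_id2, or the unichars of both merge shapes
+  // are subsets of the unichars of shape_id.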
+  bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const;
+  // Returns true if the unichar sets are equal between the shapes.
+  bool EqualUnichars(int shape_id1, int shape_id2) const;
+  bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const;
+  // Returns true if there is a common unichar between the shapes.
+  bool CommonUnichars(int shape_id1, int shape_id2) const;
+  // Returns true if there is a common font id between the shapes.
+  bool CommonFont(int shape_id1, int shape_id2) const;
+
+  // Adds the unichars of the given shape_id to the vector of results. Any
+  // unichar_id that is already present just has the fonts added to the
+  // font set for that result without adding a new entry in the vector.
+  // NOTE: it is assumed that the results are given to this function in order
+  // of decreasing rating.
+  // The unichar_map vector indicates the index of the results entry containing
+  // each unichar, or -1 if the unichar is not yet included in results.
+  void AddShapeToResults(const ShapeRating& shape_rating,
+                         GenericVector<int>* unichar_map,
+                         GenericVector<UnicharRating>* results) const;
+
 private:
+  // Adds the given unichar_id to the results if needed, updating unichar_map
+  // and returning the index of unichar in results.
+  int AddUnicharToResults(int unichar_id, float rating,
+                          GenericVector<int>* unichar_map,
+                          GenericVector<UnicharRating>* results) const;
+
  // Pointer to a provided unicharset used only by the Debugstr member.
  const UNICHARSET* unicharset_;
  // Vector of pointers to the Shapes in this ShapeTable.
  PointerVector<Shape> shape_table_;
+
+  // Cached data calculated on demand.
+  mutable int num_fonts_;
 };

 }  // namespace tesseract.
--- a/classify/speckle.cpp
+++ b/classify/speckle.cpp
@ -1,107 +0,0 @@
-/******************************************************************************
- **	Filename:    speckle.c
- **	Purpose:     Routines used by classifier to filter out speckle.
- **	Author:      Dan Johnson
- **	History:     Mon Mar 11 10:06:14 1991, DSJ, Created.
- **
- **	(c) Copyright Hewlett-Packard Company, 1988.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- ******************************************************************************/
-/*-----------------------------------------------------------------------------
-          Include Files and Type Defines
-----------------------------------------------------------------------------*/
-#include "speckle.h"
-
-#include "blobs.h"
-#include "ratngs.h"
-#include "params.h"
-
-/*-----------------------------------------------------------------------------
-        Global Data Definitions and Declarations
-----------------------------------------------------------------------------*/
-/** define control knobs for adjusting definition of speckle*/
-double_VAR(speckle_large_max_size, 0.30, "Max large speckle size");
-
-double_VAR(speckle_small_penalty, 10.0, "Small speckle penalty");
-
-double_VAR(speckle_large_penalty, 10.0, "Large speckle penalty");
-
-double_VAR(speckle_small_certainty, -1.0, "Small speckle certainty");
-
-/*-----------------------------------------------------------------------------
-              Public Code
-----------------------------------------------------------------------------*/
-/*---------------------------------------------------------------------------*/
-/**
- * This routine adds a null choice to Choices with a
- * rating equal to the worst rating in Choices plus a pad.
- * The certainty of the new choice is the same as the
- * certainty of the worst choice in Choices.  The new choice
- * is added to the end of Choices.
- *
- * Globals:
- * - #speckle_small_penalty rating for a small speckle
- * - #speckle_large_penalty rating penalty for a large speckle
- * - #speckle_small_certainty certainty for a small speckle
- *
- * @param Choices choices to add a speckle choice to
- *
- * @return New Choices list with null choice added to end.
- *
- * Exceptions: none
- * History: Mon Mar 11 11:08:11 1991, DSJ, Created.
- */
-void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) {
-  assert(Choices != NULL);
-  BLOB_CHOICE *blob_choice;
-  BLOB_CHOICE_IT temp_it;
-  temp_it.set_to_list(Choices);
-
-  // If there are no other choices, use the small speckle penalty plus
-  // the large speckle penalty.
-  if (Choices->length() == 0) {
-    blob_choice =
-      new BLOB_CHOICE(0, speckle_small_certainty + speckle_large_penalty,
-                      speckle_small_certainty, -1, -1, NULL, 0, 0, false);
-    temp_it.add_to_end(blob_choice);
-    return;
-  }
-
-  // If there are other choices,  add a null choice that is slightly worse
-  // than the worst choice so far.
-  temp_it.move_to_last();
-  blob_choice = temp_it.data();  // pick the worst choice
-  temp_it.add_to_end(
-      new BLOB_CHOICE(0, blob_choice->rating() + speckle_large_penalty,
-                      blob_choice->certainty(), -1, -1, NULL, 0, 0, false));
-}                                /* AddLargeSpeckleTo */
-
-
-/*---------------------------------------------------------------------------*/
-/**
- * This routine returns TRUE if both the width of height
- * of Blob are less than the MaxLargeSpeckleSize.
- *
- * Globals:
- * - #speckle_large_max_size largest allowed speckle
- *
- * Exceptions: none
- * History: Mon Mar 11 10:06:49 1991, DSJ, Created.
- *
- * @param blob blob to test against speckle criteria
- *
- * @return TRUE if blob is speckle, FALSE otherwise.
- */
-BOOL8 LargeSpeckle(TBLOB *blob) {
-  double speckle_size = BASELINE_SCALE * speckle_large_max_size;
-  TBOX bbox = blob->bounding_box();
-  return (bbox.width() < speckle_size && bbox.height() < speckle_size);
-}                                /* LargeSpeckle */
--- a/classify/speckle.h
+++ b/classify/speckle.h
@ -1,35 +0,0 @@
-/******************************************************************************
- **	Filename:    speckle.h
- **	Purpose:     Interface to classifier speckle filtering routines.
- **	Author:      Dan Johnson
- **	History:     Mon Mar 11 10:14:16 1991, DSJ, Created.
- **
- **	(c) Copyright Hewlett-Packard Company, 1988.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- ******************************************************************************/
-#ifndef SPECKLE_H
-#define SPECKLE_H
-
-/*-----------------------------------------------------------------------------
-          Include Files and Type Defines
-----------------------------------------------------------------------------*/
-
-#include "baseline.h"
-#include "ratngs.h"
-
-/*-----------------------------------------------------------------------------
-          Public Function Prototypes
-----------------------------------------------------------------------------*/
-void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices);
-
-BOOL8 LargeSpeckle(TBLOB *Blob);
-
-#endif
--- a/classify/tessclassifier.cpp
+++ b/classify/tessclassifier.cpp
@ -28,17 +28,25 @@ namespace tesseract {

 // Classifies the given [training] sample, writing to results.
 // See ShapeClassifier for a full description.
-int TessClassifier::ClassifySample(const TrainingSample& sample,
-                                   Pix* page_pix, int debug, int keep_this,
-                                   GenericVector<ShapeRating>* results) {
+int TessClassifier::UnicharClassifySample(
+    const TrainingSample& sample, Pix* page_pix, int debug,
+    UNICHAR_ID keep_this, GenericVector<UnicharRating>* results) {
+  int old_matcher_level = classify_->matcher_debug_level;
+  int old_matcher_flags = classify_->matcher_debug_flags;
+  int old_classify_level = classify_->classify_debug_level;
  if (debug) {
-    classify_->matcher_debug_level.set_value(debug ? 2 : 0);
-    classify_->matcher_debug_flags.set_value(debug ? 25 : 0);
-    classify_->classify_debug_level.set_value(debug ? 3 : 0);
-  } else {
-    classify_->classify_debug_level.set_value(debug ? 2 : 0);
+    // Explicitly set values of various control parameters to generate debug
+    // output if required, restoring the old values after classifying.
+    classify_->matcher_debug_level.set_value(2);
+    classify_->matcher_debug_flags.set_value(25);
+    classify_->classify_debug_level.set_value(3);
+  }
+  classify_->CharNormTrainingSample(pruner_only_, keep_this, sample, results);
+  if (debug) {
+    classify_->matcher_debug_level.set_value(old_matcher_level);
+    classify_->matcher_debug_flags.set_value(old_matcher_flags);
+    classify_->classify_debug_level.set_value(old_classify_level);
  }
-  classify_->CharNormTrainingSample(pruner_only_, sample, results);
  return results->size();
 }

@ -46,6 +54,32 @@ int TessClassifier::ClassifySample(const TrainingSample& sample,
 const ShapeTable* TessClassifier::GetShapeTable() const {
  return classify_->shape_table();
 }
+// Returns the UNICHARSET of the underlying classify_ object.
+// Only needs to be overridden if GetShapeTable() can return NULL.
+const UNICHARSET& TessClassifier::GetUnicharset() const {
+  const UNICHARSET& charset = classify_->unicharset;
+  return charset;
+}
+
+// Displays classification of the sample as the given unichar_id. When a
+// ShapeTable is available, the unichar_id is first converted to a shape_id
+// with BestShapeForUnichar; otherwise it is used as the class id directly.
+// Nothing is shown if that shape_id is negative or has no built-in
+// templates. This implementation does not itself add anything to windows
+// and always returns index unchanged.
+int TessClassifier::DisplayClassifyAs(
+    const TrainingSample& sample, Pix* page_pix, int unichar_id, int index,
+    PointerVector<ScrollView>* windows) {
+  const int shape_id = GetShapeTable() == NULL
+      ? unichar_id
+      : BestShapeForUnichar(sample, page_pix, unichar_id, NULL);
+  if (shape_id >= 0) {
+    if (UnusedClassIdIn(classify_->PreTrainedTemplates, shape_id)) {
+      tprintf("No built-in templates for class/shape %d\n", shape_id);
+    } else {
+      classify_->ShowBestMatchFor(shape_id, sample.features(),
+                                  sample.num_features());
+    }
+  }
+  return index;
+}

 }  // namespace tesseract

--- a/classify/tessclassifier.h
+++ b/classify/tessclassifier.h
@ -41,11 +41,23 @@ class TessClassifier : public ShapeClassifier {

  // Classifies the given [training] sample, writing to results.
  // See ShapeClassifier for a full description.
-  virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix,
-                             int debug, int keep_this,
-                             GenericVector<ShapeRating>* results);
+  virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix,
+                                    int debug, UNICHAR_ID keep_this,
+                                    GenericVector<UnicharRating>* results);
  // Provides access to the ShapeTable that this classifier works with.
  virtual const ShapeTable* GetShapeTable() const;
+  // Provides access to the UNICHARSET that this classifier works with.
+  // Only needs to be overridden if GetShapeTable() can return NULL.
+  virtual const UNICHARSET& GetUnicharset() const;
+
+  // Displays classification as the given shape_id. Creates as many windows
+  // as it feels fit, using index as a guide for placement. Adds any created
+  // windows to the windows output and returns a new index that may be used
+  // by any subsequent classifiers. Caller waits for the user to view and
+  // then destroys the windows by clearing the vector.
+  virtual int DisplayClassifyAs(const TrainingSample& sample, Pix* page_pix,
+                                int unichar_id, int index,
+                                PointerVector<ScrollView>* windows);

 private:
  // Indicates that this classifier is to use just the ClassPruner, or the
--- a/classify/trainingsample.cpp
+++ b/classify/trainingsample.cpp
@ -59,6 +59,8 @@ bool TrainingSample::Serialize(FILE* fp) const {
  if (fwrite(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
  if (fwrite(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
    return false;
+  if (fwrite(&outline_length_, sizeof(outline_length_), 1, fp) != 1)
+    return false;
  if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_)
    return false;
  if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_,
@ -90,10 +92,13 @@ bool TrainingSample::DeSerialize(bool swap, FILE* fp) {
  if (fread(&num_features_, sizeof(num_features_), 1, fp) != 1) return false;
  if (fread(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1)
    return false;
+  if (fread(&outline_length_, sizeof(outline_length_), 1, fp) != 1)
+    return false;
  if (swap) {
    ReverseN(&class_id_, sizeof(class_id_));
    ReverseN(&num_features_, sizeof(num_features_));
    ReverseN(&num_micro_features_, sizeof(num_micro_features_));
+    ReverseN(&outline_length_, sizeof(outline_length_));
  }
  delete [] features_;
  features_ = new INT_FEATURE_STRUCT[num_features_];
@ -113,20 +118,40 @@ bool TrainingSample::DeSerialize(bool swap, FILE* fp) {

 // Saves the given features into a TrainingSample.
 // The sample and its features_ array are allocated with new, and
 // num_features entries are memcpy'd from features into that array.
 // The geo features (bottom, top, width) are read from bounding_box; the
 // outline length and the cn_feature_ values are derived from fx_info.
 // Returns the new sample with both features_are_indexed_ and
 // features_are_mapped_ set to false.
 TrainingSample* TrainingSample::CopyFromFeatures(
-    const INT_FX_RESULT_STRUCT& fx_info, const INT_FEATURE_STRUCT* features,
+    const INT_FX_RESULT_STRUCT& fx_info,
+    const TBOX& bounding_box,
+    const INT_FEATURE_STRUCT* features,
    int num_features) {
  TrainingSample* sample = new TrainingSample;
  sample->num_features_ = num_features;
  sample->features_ = new INT_FEATURE_STRUCT[num_features];
+  sample->outline_length_ = fx_info.Length;
  memcpy(sample->features_, features, num_features * sizeof(features[0]));
-  sample->geo_feature_[GeoBottom] = fx_info.YBottom;
-  sample->geo_feature_[GeoTop] = fx_info.YTop;
-  sample->geo_feature_[GeoWidth] = fx_info.Width;
+  sample->geo_feature_[GeoBottom] = bounding_box.bottom();
+  sample->geo_feature_[GeoTop] = bounding_box.top();
+  sample->geo_feature_[GeoWidth] = bounding_box.width();
+
+  // Generate the cn_feature_ from the fx_info, scaled by MF_SCALE_FACTOR.
+  sample->cn_feature_[CharNormY] =
+      MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);
+  sample->cn_feature_[CharNormLength] =
+      MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;
+  sample->cn_feature_[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;
+  sample->cn_feature_[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;
+
  sample->features_are_indexed_ = false;
  sample->features_are_mapped_ = false;
  return sample;
 }

+// Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining.
+FEATURE_STRUCT* TrainingSample::GetCNFeature() const {
+  FEATURE feature = NewFeature(&CharNormDesc);  // NOTE(review): presumably allocates; caller likely frees - confirm.
+  for (int i = 0; i < kNumCNParams; ++i)  // Copy each of the kNumCNParams values.
+    feature->Params[i] = cn_feature_[i];
+  return feature;
+}
+
 // Constructs and returns a copy randomized by the method given by
 // the randomizer index. If index is out of [0, kSampleRandomSize) then
 // an exact copy is returned.
--- a/classify/trainingsample.h
+++ b/classify/trainingsample.h
@ -54,7 +54,7 @@ class TrainingSample : public ELIST_LINK {
 public:
  TrainingSample()
    : class_id_(INVALID_UNICHAR_ID), font_id_(0), page_num_(0),
-      num_features_(0), num_micro_features_(0),
+      num_features_(0), num_micro_features_(0), outline_length_(0),
      features_(NULL), micro_features_(NULL), weight_(1.0),
      max_dist_(0.0), sample_index_(0),
      features_are_indexed_(false), features_are_mapped_(false),
@ -65,8 +65,11 @@ class TrainingSample : public ELIST_LINK {
  // Saves the given features into a TrainingSample. The features are copied,
  // so may be deleted afterwards. Delete the return value after use.
  // bounding_box supplies the bottom, top and width geometry features.
  static TrainingSample* CopyFromFeatures(const INT_FX_RESULT_STRUCT& fx_info,
+                                          const TBOX& bounding_box,
                                          const INT_FEATURE_STRUCT* features,
                                          int num_features);
+  // Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining.
+  FEATURE_STRUCT* GetCNFeature() const;
  // Constructs and returns a copy "randomized" by the method given by
  // the randomizer index. If index is out of [0, kSampleRandomSize) then
  // an exact copy is returned.
@ -146,6 +149,9 @@ class TrainingSample : public ELIST_LINK {
  const MicroFeature* micro_features() const {
    return micro_features_;
  }
+  int outline_length() const {  // Read-only access to outline_length_.
+    return outline_length_;
+  }
  float cn_feature(int index) const {
    return cn_feature_[index];
  }
@ -203,6 +209,10 @@ class TrainingSample : public ELIST_LINK {
  int num_features_;
  // Number of MicroFeature in micro_features_ array.
  int num_micro_features_;
+  // Total length of outline in the baseline normalized coordinate space.
+  // See comment in WERD_RES class definition for a discussion of coordinate
+  // spaces.
+  int outline_length_;
  // Array of features.
  INT_FEATURE_STRUCT* features_;
  // Array of features.
--- a/classify/trainingsampleset.cpp
+++ b/classify/trainingsampleset.cpp
@ -67,7 +67,7 @@ bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
  return true;
 }

 // Constructs a set with zero raw samples and no font_class_array_.
 // fontinfo_table_ is a reference member bound to font_table; the table is
 // not copied here.
-TrainingSampleSet::TrainingSampleSet(const UnicityTable<FontInfo>& font_table)
+TrainingSampleSet::TrainingSampleSet(const FontInfoTable& font_table)
  : num_raw_samples_(0), unicharset_size_(0),
    font_class_array_(NULL), fontinfo_table_(font_table) {
 }
@ -115,11 +115,12 @@ bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
 // Loads unicharset_ from filename. If the load fails, the failure is
 // reported via tprintf and unicharset_ is cleared and rebuilt from scratch.
 // In both cases unicharset_size_ is then refreshed from unicharset_.size().
 void TrainingSampleSet::LoadUnicharset(const char* filename) {
  if (!unicharset_.load_from_file(filename)) {
    tprintf("Failed to load unicharset from file %s\n"
-            "Building unicharset for boosting from scratch...\n",
+            "Building unicharset from scratch...\n",
            filename);
    unicharset_.clear();
-    // Space character needed to represent NIL_LIST classification.
-    unicharset_.unichar_insert(" ");
+    // Add special characters as they were removed by the clear.
+    UNICHARSET empty;  // NOTE(review): presumably holds only the special characters - confirm.
+    unicharset_.AppendOtherUnicharset(empty);
  }
  unicharset_size_ = unicharset_.size();
 }
@ -708,14 +709,6 @@ void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map,
            continue;
          GenericVector<int> features2 = samples_[s2]->indexed_features();
          double dist = f_table.FeatureDistance(features2);
-          int height = samples_[s2]->geo_feature(GeoTop) -
-              samples_[s2]->geo_feature(GeoBottom);
-          if (dist == 1.0 && height > 64) {
-            // TODO(rays) rethink this when the polygonal approximation goes.
-            // Currently it is possible for dots and other small characters
-            // to be completely different, even within the same class.
-            f_table.DebugFeatureDistance(features2);
-          }
          if (dist > max_dist) {
            max_dist = dist;
            if (dist > max_max_dist) {
--- a/classify/trainingsampleset.h
+++ b/classify/trainingsampleset.h
@ -24,11 +24,11 @@
 #include "trainingsample.h"

 class UNICHARSET;
-template <typename T> class UnicityTable;

 namespace tesseract {

 struct FontInfo;
+class FontInfoTable;
 class IntFeatureMap;
 class IntFeatureSpace;
 class TrainingSample;
@ -42,7 +42,7 @@ class UnicharAndFonts;
 // metrics.
 class TrainingSampleSet {
 public:
-  explicit TrainingSampleSet(const UnicityTable<FontInfo>& fontinfo_table);
+  explicit TrainingSampleSet(const FontInfoTable& fontinfo_table);
  ~TrainingSampleSet();

  // Writes to the given file. Returns false in case of error.
@ -67,6 +67,9 @@ class TrainingSampleSet {
  int charsetsize() const {
    return unicharset_size_;
  }
+  const FontInfoTable& fontinfo_table() const {  // Returns the bound reference, not a copy.
+    return fontinfo_table_;
+  }

  // Loads an initial unicharset, or sets one up if the file cannot be read.
  void LoadUnicharset(const char* filename);
@ -281,7 +284,7 @@ class TrainingSampleSet {

  // Reference to the fontinfo_table_ in MasterTrainer. Provides names
  // for font_ids in the samples. Not serialized!
-  const UnicityTable<FontInfo>& fontinfo_table_;
+  const FontInfoTable& fontinfo_table_;
 };

 }  // namespace tesseract.