From 99edf4ccbd97b1b925a8481f8974b9d4a7fa9615 Mon Sep 17 00:00:00 2001 From: "theraysmith@gmail.com" Date: Mon, 23 Sep 2013 15:15:06 +0000 Subject: [PATCH] Refactored classifier to make it easier to add new ones and generalized feature extractor to allow fx from grey git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@873 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- classify/Makefile.am | 6 +- classify/adaptmatch.cpp | 616 ++++++------------ classify/baseline.h | 41 -- classify/blobclass.cpp | 15 +- classify/blobclass.h | 9 +- classify/classify.cpp | 66 +- classify/classify.h | 134 ++-- classify/cluster.cpp | 34 +- classify/errorcounter.cpp | 335 ++++++---- classify/errorcounter.h | 91 ++- classify/extract.cpp | 6 +- classify/extract.h | 6 +- classify/featdefs.cpp | 2 +- classify/flexfx.cpp | 14 +- classify/flexfx.h | 5 +- classify/intfeaturespace.cpp | 17 +- classify/intfx.cpp | 1090 +++++++++++++------------------- classify/intfx.h | 51 +- classify/intmatcher.h | 2 +- classify/intproto.cpp | 17 + classify/intproto.h | 12 +- classify/mastertrainer.cpp | 84 +-- classify/mastertrainer.h | 26 +- classify/mf.cpp | 7 +- classify/mf.h | 4 +- classify/mfoutline.cpp | 97 +-- classify/mfoutline.h | 15 +- classify/mfx.cpp | 22 +- classify/mfx.h | 4 +- classify/normfeat.cpp | 18 +- classify/normfeat.h | 4 +- classify/normmatch.cpp | 2 +- classify/ocrfeatures.cpp | 2 +- classify/ocrfeatures.h | 4 +- classify/picofeat.cpp | 16 +- classify/picofeat.h | 8 +- classify/shapeclassifier.cpp | 230 +++++++ classify/shapeclassifier.h | 103 +-- classify/shapetable.cpp | 320 +++++++++- classify/shapetable.h | 171 ++++- classify/speckle.cpp | 107 ---- classify/speckle.h | 35 - classify/tessclassifier.cpp | 52 +- classify/tessclassifier.h | 18 +- classify/trainingsample.cpp | 33 +- classify/trainingsample.h | 12 +- classify/trainingsampleset.cpp | 17 +- classify/trainingsampleset.h | 9 +- 48 files changed, 2192 insertions(+), 1797 deletions(-) delete mode 100644 classify/baseline.h 
create mode 100644 classify/shapeclassifier.cpp delete mode 100644 classify/speckle.cpp delete mode 100644 classify/speckle.h diff --git a/classify/Makefile.am b/classify/Makefile.am index 8b352a99f..1dc99ec8a 100644 --- a/classify/Makefile.am +++ b/classify/Makefile.am @@ -9,7 +9,7 @@ AM_CPPFLAGS += -DTESS_EXPORTS \ endif noinst_HEADERS = \ - adaptive.h baseline.h blobclass.h chartoname.h \ + adaptive.h blobclass.h chartoname.h \ classify.h cluster.h clusttool.h cutoffs.h \ errorcounter.h extern.h extract.h \ featdefs.h flexfx.h float2int.h fpoint.h fxdefs.h \ @@ -19,7 +19,7 @@ noinst_HEADERS = \ normfeat.h normmatch.h \ ocrfeatures.h outfeat.h picofeat.h protos.h \ sampleiterator.h shapeclassifier.h shapetable.h \ - speckle.h tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h + tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h if !USING_MULTIPLELIBS noinst_LTLIBRARIES = libtesseract_classify.la @@ -45,7 +45,7 @@ libtesseract_classify_la_SOURCES = \ mastertrainer.cpp mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \ normfeat.cpp normmatch.cpp \ ocrfeatures.cpp outfeat.cpp picofeat.cpp protos.cpp \ - sampleiterator.cpp shapetable.cpp speckle.cpp \ + sampleiterator.cpp shapeclassifier.cpp shapetable.cpp \ tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp xform2d.cpp diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index dcdbfbd20..1ef606e3b 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -31,10 +31,8 @@ #include "outfeat.h" #include "emalloc.h" #include "intfx.h" -#include "speckle.h" #include "efio.h" #include "normmatch.h" -#include "permute.h" #include "ndminx.h" #include "intproto.h" #include "const.h" @@ -167,7 +165,6 @@ namespace tesseract { * @note History: Mon Mar 11 10:00:58 1991, DSJ, Created. * * @param Blob blob to be classified - * @param denorm normalization/denormalization parameters * @param[out] Choices List of choices found by adaptive matcher. 
* @param[out] CPResults Array of CPResultStruct of size MAX_NUM_CLASSES is * filled on return with the choices found by the @@ -176,7 +173,6 @@ namespace tesseract { * */ void Classify::AdaptiveClassifier(TBLOB *Blob, - const DENORM& denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS CPResults) { assert(Choices != NULL); @@ -185,7 +181,8 @@ void Classify::AdaptiveClassifier(TBLOB *Blob, if (AdaptedTemplates == NULL) AdaptedTemplates = NewAdaptedTemplates (true); - DoAdaptiveMatch(Blob, denorm, Results); + + DoAdaptiveMatch(Blob, Results); if (CPResults != NULL) memcpy(CPResults, Results->CPResults, sizeof(CPResults[0]) * Results->NumMatches); @@ -194,32 +191,23 @@ void Classify::AdaptiveClassifier(TBLOB *Blob, qsort((void *)Results->match, Results->NumMatches, sizeof(ScoredClass), CompareByRating); RemoveExtraPuncs(Results); - ConvertMatchesToChoices(denorm, Blob->bounding_box(), Results, Choices); + ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, + Choices); if (matcher_debug_level >= 1) { cprintf ("AD Matches = "); PrintAdaptiveMatchResults(stdout, Results); } - if (LargeSpeckle(Blob)) - AddLargeSpeckleTo(Choices); + if (LargeSpeckle(*Blob) || Choices->length() == 0) + AddLargeSpeckleTo(Results->BlobLength, Choices); #ifndef GRAPHICS_DISABLED if (classify_enable_adaptive_debugger) - DebugAdaptiveClassifier(Blob, denorm, Results); + DebugAdaptiveClassifier(Blob, Results); #endif NumClassesOutput += Choices->length(); - if (Choices->length() == 0) { - if (!classify_bln_numeric_mode) - tprintf ("Empty classification!\n"); // Should never normally happen. - Choices = new BLOB_CHOICE_LIST(); - BLOB_CHOICE_IT temp_it; - temp_it.set_to_list(Choices); - temp_it.add_to_end( - new BLOB_CHOICE(0, 50.0f, -20.0f, -1, -1, NULL, 0, 0, false)); - } - delete Results; } /* AdaptiveClassifier */ @@ -251,19 +239,14 @@ void Classify::RefreshDebugWindow(ScrollView **win, const char *msg, // Otherwise AdaptToBlob is called for adaption within a document. 
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will // be learned, otherwise all chars with good correct_text are learned. -void Classify::LearnWord(const char* filename, const char *rejmap, - WERD_RES *word) { +void Classify::LearnWord(const char* filename, WERD_RES *word) { int word_len = word->correct_text.size(); if (word_len == 0) return; float* thresholds = NULL; if (filename == NULL) { // Adaption mode. - if (!EnableLearning || word->best_choice == NULL || - // If word->best_choice is not recorded at the top of accumulator's - // best choices (which could happen for choices that are - // altered with ReplaceAmbig()) we skip the adaption. - !getDict().CurrentBestChoiceIs(*(word->best_choice))) + if (!EnableLearning || word->best_choice == NULL) return; // Can't or won't adapt. NumWordsAdaptedTo++; @@ -271,11 +254,12 @@ void Classify::LearnWord(const char* filename, const char *rejmap, tprintf("\n\nAdapting to word = %s\n", word->best_choice->debug_string().string()); thresholds = new float[word_len]; - GetAdaptThresholds(word->rebuild_word, word->denorm, *word->best_choice, - *word->raw_choice, thresholds); + word->ComputeAdaptionThresholds(certainty_scale, + matcher_perfect_threshold, + matcher_good_threshold, + matcher_rating_margin, thresholds); } int start_blob = 0; - char prev_map_char = '0'; #ifndef GRAPHICS_DISABLED if (classify_debug_character_fragments) { @@ -295,9 +279,7 @@ void Classify::LearnWord(const char* filename, const char *rejmap, if (classify_debug_character_fragments) { tprintf("\nLearning %s\n", word->correct_text[ch].string()); } - char rej_map_char = rejmap != NULL ? *rejmap++ : '1'; - - if (word->correct_text[ch].length() > 0 && rej_map_char == '1') { + if (word->correct_text[ch].length() > 0) { float threshold = thresholds != NULL ? 
thresholds[ch] : 0.0f; LearnPieces(filename, start_blob, word->best_state[ch], @@ -308,14 +290,12 @@ void Classify::LearnWord(const char* filename, const char *rejmap, // that each match a whole character with at least // classify_character_fragments_garbage_certainty_threshold bool garbage = false; - TBLOB* frag_blob = word->chopped_word->blobs; - for (int i = 0; i < start_blob; ++i) frag_blob = frag_blob->next; int frag; for (frag = 0; frag < word->best_state[ch]; ++frag) { + TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag]; if (classify_character_fragments_garbage_certainty_threshold < 0) { - garbage |= LooksLikeGarbage(word->denorm, frag_blob); + garbage |= LooksLikeGarbage(frag_blob); } - frag_blob = frag_blob->next; } // Learn the fragments. if (!garbage) { @@ -346,28 +326,22 @@ void Classify::LearnWord(const char* filename, const char *rejmap, // TODO(rays): re-enable this part of the code when we switch to the // new classifier that needs to see examples of garbage. /* - char next_map_char = ch + 1 < word_len - ? (rejmap != NULL ? *rejmap : '1') - : '0'; if (word->best_state[ch] > 1) { // If the next blob is good, make junk with the rightmost fragment. - if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 && - next_map_char == '1') { + if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) { LearnPieces(filename, start_blob + word->best_state[ch] - 1, word->best_state[ch + 1] + 1, threshold, CST_IMPROPER, INVALID_UNICHAR, word); } // If the previous blob is good, make junk with the leftmost fragment. - if (ch > 0 && word->correct_text[ch - 1].length() > 0 && - prev_map_char == '1') { + if (ch > 0 && word->correct_text[ch - 1].length() > 0) { LearnPieces(filename, start_blob - word->best_state[ch - 1], word->best_state[ch - 1] + 1, threshold, CST_IMPROPER, INVALID_UNICHAR, word); } } // If the next blob is good, make a join with it. 
- if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 && - next_map_char == '1') { + if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) { STRING joined_text = word->correct_text[ch]; joined_text += word->correct_text[ch + 1]; LearnPieces(filename, start_blob, @@ -377,7 +351,6 @@ void Classify::LearnWord(const char* filename, const char *rejmap, */ } start_blob += word->best_state[ch]; - prev_map_char = rej_map_char; } delete [] thresholds; } // LearnWord. @@ -388,7 +361,7 @@ void Classify::LearnWord(const char* filename, const char *rejmap, // is called and the data will be written to a file for static training. // Otherwise AdaptToBlob is called for adaption within a document. // threshold is a magic number required by AdaptToChar and generated by -// GetAdaptThresholds. +// ComputeAdaptionThresholds. // Although it can be partly inferred from the string, segmentation is // provided to explicitly clarify the character segmentation. void Classify::LearnPieces(const char* filename, int start, int length, @@ -401,15 +374,12 @@ void Classify::LearnPieces(const char* filename, int start, int length, return; if (length > 1) { - join_pieces(word->chopped_word->blobs, word->seam_array, - start, start + length - 1); + join_pieces(word->seam_array, start, start + length - 1, + word->chopped_word); } - TBLOB* blob = word->chopped_word->blobs; - for (int i = 0; i < start; ++i) - blob = blob->next; + TBLOB* blob = word->chopped_word->blobs[start]; // Rotate the blob if needed for classification. 
- const DENORM* denorm = &word->denorm; - TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded(&denorm); + TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded(); if (rotated_blob == NULL) rotated_blob = blob; @@ -434,8 +404,12 @@ void Classify::LearnPieces(const char* filename, int start, int length, classify_norm_method.set_value(character); // force char norm spc 30/11/93 tess_bn_matching.set_value(false); // turn it off tess_cn_matching.set_value(false); - LearnBlob(feature_defs_, filename, rotated_blob, *denorm, - correct_text); + DENORM bl_denorm, cn_denorm; + INT_FX_RESULT_STRUCT fx_info; + SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, + &bl_denorm, &cn_denorm, &fx_info); + LearnBlob(feature_defs_, filename, rotated_blob, bl_denorm, cn_denorm, + fx_info, correct_text); } else if (unicharset.contains_unichar(correct_text)) { UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text); int font_id = word->fontinfo != NULL @@ -446,16 +420,15 @@ void Classify::LearnPieces(const char* filename, int start, int length, unicharset.id_to_unichar(class_id), threshold, font_id); // If filename is not NULL we are doing recognition // (as opposed to training), so we must have already set word fonts. - AdaptToChar(rotated_blob, *denorm, class_id, font_id, threshold); + AdaptToChar(rotated_blob, class_id, font_id, threshold); } else if (classify_debug_level >= 1) { tprintf("Can't adapt to %s not in unicharset\n", correct_text); } if (rotated_blob != blob) { delete rotated_blob; - delete denorm; } - break_pieces(blob, word->seam_array, start, start + length - 1); + break_pieces(word->seam_array, start, start + length - 1, word->chopped_word); } // LearnPieces. 
/*---------------------------------------------------------------------------*/ @@ -521,6 +494,10 @@ void Classify::EndAdaptiveClassifier() { } delete shape_table_; shape_table_ = NULL; + if (static_classifier_ != NULL) { + delete static_classifier_; + static_classifier_ = NULL; + } } /* EndAdaptiveClassifier */ @@ -581,6 +558,7 @@ void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) { ReadNormProtos(tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO)); if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n"); + static_classifier_ = new TessClassifier(false, this); } im_.Init(&classify_debug_level, classify_integer_matcher_multiplier); @@ -741,7 +719,6 @@ void Classify::SettupPass2() { * config in that class. * * @param Blob blob to model new class after - * @param denorm normalization/denormalization parameters * @param ClassId id of the class to be initialized * @param FontinfoId font information inferred from pre-trained templates * @param Class adapted class to be initialized @@ -756,7 +733,6 @@ void Classify::SettupPass2() { * @note History: Thu Mar 14 12:49:39 1991, DSJ, Created. */ void Classify::InitAdaptedClass(TBLOB *Blob, - const DENORM& denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, @@ -822,7 +798,7 @@ void Classify::InitAdaptedClass(TBLOB *Blob, cprintf ("Added new class '%s' with class id %d and %d protos.\n", unicharset.id_to_unichar(ClassId), ClassId, NumFeatures); if (classify_learning_debug_level > 1) - DisplayAdaptedChar(Blob, denorm, IClass); + DisplayAdaptedChar(Blob, IClass); } if (IsEmptyAdaptedClass(Class)) @@ -885,21 +861,19 @@ int Classify::GetAdaptiveFeatures(TBLOB *Blob, * * @param Word current word * @param BestChoiceWord best overall choice for word with context - * @param RawChoiceWord best choice for word without context * * @return TRUE or FALSE * @note Exceptions: none * @note History: Thu May 30 14:25:06 1991, DSJ, Created. 
*/ -int Classify::AdaptableWord(TWERD *Word, - const WERD_CHOICE &BestChoiceWord, - const WERD_CHOICE &RawChoiceWord) { - int BestChoiceLength = BestChoiceWord.length(); +bool Classify::AdaptableWord(WERD_RES* word) { + if (word->best_choice == NULL) return false; + int BestChoiceLength = word->best_choice->length(); float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT; return // rules that apply in general - simplest to compute first BestChoiceLength > 0 && - BestChoiceLength == Word->NumBlobs() && + BestChoiceLength == word->rebuild_word->NumBlobs() && BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE && // This basically ensures that the word is at least a dictionary match // (freq word, user word, system dawg word, etc). @@ -907,16 +881,14 @@ int Classify::AdaptableWord(TWERD *Word, // than higher than adaptable_score=1.1+0.05=1.15 // Since these are other flags that ensure that the word is dict word, // this check could be at times redundant. - getDict().CurrentBestChoiceAdjustFactor() <= adaptable_score && + word->best_choice->adjust_factor() <= adaptable_score && // Make sure that alternative choices are not dictionary words. - getDict().AlternativeChoicesWorseThan(adaptable_score) && - getDict().CurrentBestChoiceIs(BestChoiceWord); + word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score); } /*---------------------------------------------------------------------------*/ /** * @param Blob blob to add to templates for ClassId - * @param denorm normalization/denormalization parameters * @param ClassId class to add blob to * @param FontinfoId font information from pre-trained templates * @param Threshold minimum match rating to existing template @@ -931,7 +903,6 @@ int Classify::AdaptableWord(TWERD *Word, * @note History: Thu Mar 14 09:36:03 1991, DSJ, Created. 
*/ void Classify::AdaptToChar(TBLOB *Blob, - const DENORM& denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold) { @@ -952,8 +923,7 @@ void Classify::AdaptToChar(TBLOB *Blob, Class = AdaptedTemplates->Class[ClassId]; assert(Class != NULL); if (IsEmptyAdaptedClass(Class)) { - InitAdaptedClass(Blob, denorm, ClassId, FontinfoId, Class, - AdaptedTemplates); + InitAdaptedClass(Blob, ClassId, FontinfoId, Class, AdaptedTemplates); } else { IClass = ClassForClassId (AdaptedTemplates->Templates, ClassId); @@ -999,9 +969,8 @@ void Classify::AdaptToChar(TBLOB *Blob, IntResult.Config, TempConfig->NumTimesSeen); if (TempConfigReliable(ClassId, TempConfig)) { - MakePermanent(AdaptedTemplates, ClassId, IntResult.Config, denorm, - Blob); - UpdateAmbigsGroup(ClassId, denorm, Blob); + MakePermanent(AdaptedTemplates, ClassId, IntResult.Config, Blob); + UpdateAmbigsGroup(ClassId, Blob); } } else { @@ -1009,7 +978,7 @@ void Classify::AdaptToChar(TBLOB *Blob, cprintf ("Found poor match to temp config %d = %4.1f%%.\n", IntResult.Config, (1.0 - IntResult.Rating) * 100.0); if (classify_learning_debug_level > 2) - DisplayAdaptedChar(Blob, denorm, IClass); + DisplayAdaptedChar(Blob, IClass); } NewTempConfigId = MakeNewTemporaryConfig(AdaptedTemplates, ClassId, @@ -1019,13 +988,13 @@ void Classify::AdaptToChar(TBLOB *Blob, FloatFeatures); if (NewTempConfigId >= 0 && TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) { - MakePermanent(AdaptedTemplates, ClassId, NewTempConfigId, denorm, Blob); - UpdateAmbigsGroup(ClassId, denorm, Blob); + MakePermanent(AdaptedTemplates, ClassId, NewTempConfigId, Blob); + UpdateAmbigsGroup(ClassId, Blob); } #ifndef GRAPHICS_DISABLED if (classify_learning_debug_level > 1) { - DisplayAdaptedChar(Blob, denorm, IClass); + DisplayAdaptedChar(Blob, IClass); } #endif } @@ -1033,13 +1002,12 @@ void Classify::AdaptToChar(TBLOB *Blob, } } /* AdaptToChar */ -void Classify::DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm, - INT_CLASS_STRUCT* 
int_class) { +void Classify::DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class) { #ifndef GRAPHICS_DISABLED int bloblength = 0; INT_FEATURE_ARRAY features; uinT8* norm_array = new uinT8[unicharset.size()]; - int num_features = GetBaselineFeatures(blob, denorm, PreTrainedTemplates, + int num_features = GetBaselineFeatures(blob, PreTrainedTemplates, features, norm_array, &bloblength); delete [] norm_array; @@ -1068,7 +1036,6 @@ void Classify::DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm, /*---------------------------------------------------------------------------*/ /** * @param Blob blob to add to templates for ClassId - * @param denorm normalization/denormalization parameters * @param ClassId class to add blob to * @param FontinfoId font information from pre-trained teamples * @param Threshold minimum match rating to existing template @@ -1080,7 +1047,6 @@ void Classify::DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm, * @note History: Thu Mar 14 09:36:03 1991, DSJ, Created. 
*/ void Classify::AdaptToPunc(TBLOB *Blob, - const DENORM& denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold) { @@ -1088,7 +1054,7 @@ void Classify::AdaptToPunc(TBLOB *Blob, int i; Results->Initialize(); - CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results); + CharNormClassifier(Blob, PreTrainedTemplates, Results); RemoveBadMatches(Results); if (Results->NumMatches != 1) { @@ -1106,7 +1072,7 @@ void Classify::AdaptToPunc(TBLOB *Blob, cprintf ("Adapting to punc = %s, thr= %g\n", unicharset.id_to_unichar(ClassId), Threshold); #endif - AdaptToChar(Blob, denorm, ClassId, FontinfoId, Threshold); + AdaptToChar(Blob, ClassId, FontinfoId, Threshold); } delete Results; } /* AdaptToPunc */ @@ -1193,7 +1159,6 @@ void Classify::AddNewResult(ADAPT_RESULTS *results, * - #AllConfigsOn mask that enables all configs * * @param Blob blob to be classified - * @param denorm normalization/denormalization parameters * @param Templates built-in templates to classify against * @param Classes adapted class templates * @param Ambiguities array of class id's to match against @@ -1203,7 +1168,6 @@ void Classify::AddNewResult(ADAPT_RESULTS *results, * @note History: Tue Mar 12 19:40:36 1991, DSJ, Created. 
*/ void Classify::AmbigClassifier(TBLOB *Blob, - const DENORM& denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, @@ -1216,9 +1180,9 @@ void Classify::AmbigClassifier(TBLOB *Blob, AmbigClassifierCalls++; - NumFeatures = GetCharNormFeatures(Blob, denorm, Templates, IntFeatures, + NumFeatures = GetCharNormFeatures(Blob, Templates, IntFeatures, NULL, CharNormArray, - &(Results->BlobLength), NULL); + &(Results->BlobLength)); if (NumFeatures <= 0) { delete [] CharNormArray; return; @@ -1412,7 +1376,6 @@ double Classify::ComputeCorrectedRating(bool debug, int unichar_id, * - BaselineCutoffs expected num features for each class * * @param Blob blob to be classified - * @param denorm normalization/denormalization parameters * @param Templates current set of adapted templates * @param Results place to put match results * @@ -1421,7 +1384,6 @@ double Classify::ComputeCorrectedRating(bool debug, int unichar_id, * @note History: Tue Mar 12 19:38:03 1991, DSJ, Created. */ UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob, - const DENORM& denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) { int NumFeatures; @@ -1432,9 +1394,8 @@ UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob, BaselineClassifierCalls++; - NumFeatures = GetBaselineFeatures( - Blob, denorm, Templates->Templates, IntFeatures, CharNormArray, - &(Results->BlobLength)); + NumFeatures = GetBaselineFeatures(Blob, Templates->Templates, IntFeatures, + CharNormArray, &Results->BlobLength); if (NumFeatures <= 0) { delete [] CharNormArray; return NULL; @@ -1472,7 +1433,6 @@ UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob, * are added to Results. 
* * @param Blob blob to be classified - * @param denorm normalization/denormalization parameters * @param Templates templates to classify unknown against * @param Results place to put match results * @@ -1484,70 +1444,52 @@ UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob, * @note Exceptions: none * @note History: Tue Mar 12 16:02:52 1991, DSJ, Created. */ -int Classify::CharNormClassifier(TBLOB *Blob, - const DENORM& denorm, +int Classify::CharNormClassifier(TBLOB *blob, INT_TEMPLATES Templates, - ADAPT_RESULTS *Results) { - int NumFeatures; - int NumClasses; - INT_FEATURE_ARRAY IntFeatures; - + ADAPT_RESULTS *adapt_results) { CharNormClassifierCalls++; - - uinT8* CharNormArray = new uinT8[unicharset.size()]; - int num_pruner_classes = MAX(unicharset.size(), - PreTrainedTemplates->NumClasses); - uinT8* PrunerNormArray = new uinT8[num_pruner_classes]; - NumFeatures = GetCharNormFeatures(Blob, denorm, Templates, IntFeatures, - PrunerNormArray, CharNormArray, - &(Results->BlobLength), NULL); - if (NumFeatures <= 0) { - delete [] CharNormArray; - delete [] PrunerNormArray; - return 0; + TrainingSample* sample = BlobToTrainingSample(*blob, NM_CHAR_ANISOTROPIC, + classify_nonlinear_norm); + if (sample == NULL) return 0; + // This is the length that is used for scaling ratings vs certainty. + adapt_results->BlobLength = + IntCastRounded(sample->outline_length() / kStandardFeatureLength); + GenericVector unichar_results; + static_classifier_->UnicharClassifySample(*sample, blob->denorm().pix(), 0, + -1, &unichar_results); + // Convert results to the format used internally by AdaptiveClassifier. + for (int r = 0; r < unichar_results.size(); ++r) { + int unichar_id = unichar_results[r].unichar_id; + // Fonts are listed in order of preference. + int font1 = unichar_results[r].fonts.size() >= 1 + ? unichar_results[r].fonts[0] : kBlankFontinfoId; + int font2 = unichar_results[r].fonts.size() >= 2 + ? 
unichar_results[r].fonts[1] : kBlankFontinfoId; + float rating = 1.0f - unichar_results[r].rating; + AddNewResult(adapt_results, unichar_id, -1, rating, false, 0, font1, font2); } - - NumClasses = PruneClasses(Templates, NumFeatures, IntFeatures, - PrunerNormArray, - shape_table_ != NULL ? &shapetable_cutoffs_[0] - : CharNormCutoffs, - Results->CPResults); - - if (tessedit_single_match && NumClasses > 1) - NumClasses = 1; - NumCharNormClassesTried += NumClasses; - - im_.SetCharNormMatch(classify_integer_matcher_multiplier); - MasterMatcher(Templates, NumFeatures, IntFeatures, CharNormArray, - NULL, matcher_debug_flags, NumClasses, - Blob->bounding_box(), Results->CPResults, Results); - delete [] CharNormArray; - delete [] PrunerNormArray; - return NumFeatures; + int num_features = sample->num_features(); + delete sample; + return num_features; } /* CharNormClassifier */ // As CharNormClassifier, but operates on a TrainingSample and outputs to // a GenericVector of ShapeRating without conversion to classes. int Classify::CharNormTrainingSample(bool pruner_only, + int keep_this, const TrainingSample& sample, - GenericVector* results) { + GenericVector* results) { results->clear(); ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS(); adapt_results->Initialize(); // Compute the bounding box of the features. int num_features = sample.num_features(); - TBOX blob_box; - for (int f = 0; f < num_features; ++f) { - const INT_FEATURE_STRUCT feature = sample.features()[f]; - TBOX fbox(feature.X, feature.Y, feature.X, feature.Y); - blob_box += fbox; - } + // Only the top and bottom of the blob_box are used by MasterMatcher, so + // fabricate right and left using top and bottom. + TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom), + sample.geo_feature(GeoTop), sample.geo_feature(GeoTop)); // Compute the char_norm_array from the saved cn_feature. 
- FEATURE norm_feature = NewFeature(&CharNormDesc); - norm_feature->Params[CharNormY] = sample.cn_feature(CharNormY); - norm_feature->Params[CharNormLength] = sample.cn_feature(CharNormLength); - norm_feature->Params[CharNormRx] = sample.cn_feature(CharNormRx); - norm_feature->Params[CharNormRy] = sample.cn_feature(CharNormRy); + FEATURE norm_feature = sample.GetCNFeature(); uinT8* char_norm_array = new uinT8[unicharset.size()]; int num_pruner_classes = MAX(unicharset.size(), PreTrainedTemplates->NumClasses); @@ -1564,19 +1506,16 @@ int Classify::CharNormTrainingSample(bool pruner_only, : CharNormCutoffs, adapt_results->CPResults); delete [] pruner_norm_array; + if (keep_this >= 0) { + num_classes = 1; + adapt_results->CPResults[0].Class = keep_this; + } if (pruner_only) { // Convert pruner results to output format. for (int i = 0; i < num_classes; ++i) { int class_id = adapt_results->CPResults[i].Class; - int shape_id = class_id; - if (shape_table_ != NULL) { - // All shapes in a class have the same combination of unichars, so - // it doesn't really matter which config we give it, as we aren't - // trying to get the font here. - shape_id = ClassAndConfigIDToFontOrShapeID(class_id, 0); - } results->push_back( - ShapeRating(shape_id, 1.0f - adapt_results->CPResults[i].Rating)); + UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating)); } } else { im_.SetCharNormMatch(classify_integer_matcher_multiplier); @@ -1587,9 +1526,15 @@ int Classify::CharNormTrainingSample(bool pruner_only, // Convert master matcher results to output format. 
for (int i = 0; i < adapt_results->NumMatches; i++) { ScoredClass next = adapt_results->match[i]; - results->push_back(ShapeRating(next.shape_id, 1.0f - next.rating)); + UnicharRating rating(next.unichar_id, 1.0f - next.rating); + if (next.fontinfo_id >= 0) { + rating.fonts.push_back(next.fontinfo_id); + if (next.fontinfo_id2 >= 0) + rating.fonts.push_back(next.fontinfo_id2); + } + results->push_back(rating); } - results->sort(&ShapeRating::SortDescendingRating); + results->sort(&UnicharRating::SortDescendingRating); } delete [] char_norm_array; delete adapt_results; @@ -1694,6 +1639,7 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, max_matches = MAX_MATCHES; } + float best_certainty = -MAX_FLOAT32; for (int i = 0; i < Results->NumMatches; i++) { ScoredClass next = Results->match[i]; int fontinfo_id = next.fontinfo_id; @@ -1717,13 +1663,27 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, Rating *= rating_scale * Results->BlobLength; Certainty *= -(getDict().certainty_scale); } - inT16 min_xheight, max_xheight; + // Adapted results, by their very nature, should have good certainty. + // Those that don't are at best misleading, and often lead to errors, + // so don't accept adapted results that are too far behind the best result, + // whether adapted or static. + // TODO(rays) find some way of automatically tuning these constants. + if (Certainty > best_certainty) { + best_certainty = MIN(Certainty, classify_adapted_pruning_threshold); + } else if (adapted && + Certainty / classify_adapted_pruning_factor < best_certainty) { + continue; // Don't accept bad adapted results. 
+ } + + float min_xheight, max_xheight, yshift; denorm.XHeightRange(next.unichar_id, unicharset, box, - &min_xheight, &max_xheight); + &min_xheight, &max_xheight, &yshift); temp_it.add_to_end(new BLOB_CHOICE(next.unichar_id, Rating, Certainty, fontinfo_id, fontinfo_id2, unicharset.get_script(next.unichar_id), - min_xheight, max_xheight, adapted)); + min_xheight, max_xheight, yshift, + adapted ? BCC_ADAPTED_CLASSIFIER + : BCC_STATIC_CLASSIFIER)); contains_nonfrag |= !current_is_frag; // update contains_nonfrag choices_length++; if (choices_length >= max_matches) break; @@ -1737,7 +1697,6 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, /** * * @param Blob blob whose classification is being debugged - * @param denorm normalization/denormalization parameters * @param Results results of match being debugged * * Globals: none @@ -1745,39 +1704,18 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, * @note Exceptions: none * @note History: Wed Mar 13 16:44:41 1991, DSJ, Created. 
*/ -void Classify::DebugAdaptiveClassifier(TBLOB *Blob, - const DENORM& denorm, +void Classify::DebugAdaptiveClassifier(TBLOB *blob, ADAPT_RESULTS *Results) { + if (static_classifier_ == NULL) return; for (int i = 0; i < Results->NumMatches; i++) { - if (Results->match[i].rating < Results->best_match.rating) + if (i == 0 || Results->match[i].rating < Results->best_match.rating) Results->best_match = Results->match[i]; } - const char *Prompt = - "Left-click in IntegerMatch Window to continue or right click to debug..."; - CLASS_ID unichar_id = Results->best_match.unichar_id; - int shape_id = Results->best_match.shape_id; - bool adaptive_on = true; - bool pretrained_on = true; - - const char* debug_mode; - do { - if (!pretrained_on) - debug_mode = "Adaptive Templates Only"; - else if (!adaptive_on) - debug_mode = "PreTrained Templates Only"; - else - debug_mode = "All Templates"; - ShowMatchDisplay(); - tprintf("Debugging class %d = %s in mode %s ...", - unichar_id, unicharset.id_to_unichar(unichar_id), debug_mode); - if (shape_id >= 0 && shape_table_ != NULL) { - tprintf(" from shape %s\n", shape_table_->DebugStr(shape_id).string()); - } - ShowBestMatchFor(Blob, denorm, unichar_id, shape_id, adaptive_on, - pretrained_on, Results); - UpdateMatchDisplay(); - } while ((unichar_id = GetClassToDebug(Prompt, &adaptive_on, - &pretrained_on, &shape_id)) != 0); + TrainingSample* sample = BlobToTrainingSample(*blob, NM_CHAR_ANISOTROPIC, + classify_nonlinear_norm); + if (sample == NULL) return; + static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), + Results->best_match.unichar_id); } /* DebugAdaptiveClassifier */ #endif @@ -1794,7 +1732,6 @@ void Classify::DebugAdaptiveClassifier(TBLOB *Blob, * of these classifications are merged together into Results. 
* * @param Blob blob to be classified - * @param denorm normalization/denormalization parameters * @param Results place to put match results * * Globals: @@ -1805,9 +1742,7 @@ void Classify::DebugAdaptiveClassifier(TBLOB *Blob, * @note Exceptions: none * @note History: Tue Mar 12 08:50:11 1991, DSJ, Created. */ -void Classify::DoAdaptiveMatch(TBLOB *Blob, - const DENORM& denorm, - ADAPT_RESULTS *Results) { +void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) { UNICHAR_ID *Ambiguities; AdaptiveMatcherCalls++; @@ -1815,16 +1750,16 @@ void Classify::DoAdaptiveMatch(TBLOB *Blob, if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min || tess_cn_matching) { - CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results); + CharNormClassifier(Blob, PreTrainedTemplates, Results); } else { - Ambiguities = BaselineClassifier(Blob, denorm, AdaptedTemplates, Results); + Ambiguities = BaselineClassifier(Blob, AdaptedTemplates, Results); if ((Results->NumMatches > 0 && MarginalMatch (Results->best_match.rating) && !tess_bn_matching) || Results->NumMatches == 0) { - CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results); + CharNormClassifier(Blob, PreTrainedTemplates, Results); } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) { - AmbigClassifier(Blob, denorm, + AmbigClassifier(Blob, PreTrainedTemplates, AdaptedTemplates->Class, Ambiguities, @@ -1840,43 +1775,6 @@ void Classify::DoAdaptiveMatch(TBLOB *Blob, ClassifyAsNoise(Results); } /* DoAdaptiveMatch */ -/*---------------------------------------------------------------------------*/ -/** - * This routine tries to estimate how tight the adaptation - * threshold should be set for each character in the current - * word. In general, the routine tries to set tighter - * thresholds for a character when the current set of templates - * would have made an error on that character. It tries - * to set a threshold tight enough to eliminate the error. 
- * Two different sets of rules can be used to determine the - * desired thresholds. - * - * @param Word current word - * @param denorm normalization/denormalization parameters - * @param BestChoice best choice for current word with context - * @param BestRawChoice best choice for current word without context - * @param[out] Thresholds array of thresholds to be filled in - * - * Globals: - * - matcher_good_threshold - * - matcher_perfect_threshold - * - matcher_rating_margin - * - * @return none (results are returned in Thresholds) - * @note Exceptions: none - * @note History: Fri May 31 09:22:08 1991, DSJ, Created. - */ -void Classify::GetAdaptThresholds(TWERD * Word, - const DENORM& denorm, - const WERD_CHOICE& BestChoice, - const WERD_CHOICE& BestRawChoice, - FLOAT32 Thresholds[]) { - getDict().FindClassifierErrors(matcher_perfect_threshold, - matcher_good_threshold, - matcher_rating_margin, - Thresholds); -} /* GetAdaptThresholds */ - /*---------------------------------------------------------------------------*/ /** * This routine matches blob to the built-in templates @@ -1884,7 +1782,6 @@ void Classify::GetAdaptThresholds(TWERD * Word, * class which are potential ambiguities. * * @param Blob blob to get classification ambiguities for - * @param denorm normalization/denormalization parameters * @param CorrectClass correct class for Blob * * Globals: @@ -1896,7 +1793,6 @@ void Classify::GetAdaptThresholds(TWERD * Word, * @note History: Fri Mar 15 08:08:22 1991, DSJ, Created. 
*/ UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, - const DENORM& denorm, CLASS_ID CorrectClass) { ADAPT_RESULTS *Results = new ADAPT_RESULTS(); UNICHAR_ID *Ambiguities; @@ -1904,7 +1800,7 @@ UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, Results->Initialize(); - CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results); + CharNormClassifier(Blob, PreTrainedTemplates, Results); RemoveBadMatches(Results); qsort((void *)Results->match, Results->NumMatches, sizeof(ScoredClass), CompareByRating); @@ -1938,7 +1834,6 @@ UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, * array provided by the caller. * * @param Blob blob to extract features from - * @param denorm normalization/denormalization parameters * @param Templates used to compute char norm adjustments * @param IntFeatures array to fill with integer features * @param CharNormArray array to fill with dummy char norm adjustments @@ -1955,30 +1850,24 @@ UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, * @note History: Tue May 28 10:40:52 1991, DSJ, Created. 
*/ int Classify::GetBaselineFeatures(TBLOB *Blob, - const DENORM& denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8* CharNormArray, inT32 *BlobLength) { - register INT_FEATURE Src, Dest, End; - if (!FeaturesHaveBeenExtracted) { - FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures, - CharNormFeatures, &FXInfo, NULL); + FeaturesOK = ExtractIntFeat(*Blob, classify_nonlinear_norm, + BaselineFeatures, CharNormFeatures, &FXInfo); FeaturesHaveBeenExtracted = TRUE; } + *BlobLength = IntCastRounded(FXInfo.Length / kStandardFeatureLength); if (!FeaturesOK) { - *BlobLength = FXInfo.NumBL; return 0; } - for (Src = BaselineFeatures, End = Src + FXInfo.NumBL, Dest = IntFeatures; - Src < End; - *Dest++ = *Src++); + memcpy(IntFeatures, BaselineFeatures, FXInfo.NumBL * sizeof(IntFeatures[0])); ClearCharNormArray(CharNormArray); - *BlobLength = FXInfo.NumBL; return FXInfo.NumBL; } /* GetBaselineFeatures */ @@ -1988,9 +1877,9 @@ void Classify::ResetFeaturesHaveBeenExtracted() { // Returns true if the given blob looks too dissimilar to any character // present in the classifier templates. 
-bool Classify::LooksLikeGarbage(const DENORM& denorm, TBLOB *blob) { +bool Classify::LooksLikeGarbage(TBLOB *blob) { BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); - AdaptiveClassifier(blob, denorm, ratings, NULL); + AdaptiveClassifier(blob, ratings, NULL); BLOB_CHOICE_IT ratings_it(ratings); const UNICHARSET &unicharset = getDict().getUnicharset(); if (classify_debug_character_fragments) { @@ -2002,9 +1891,10 @@ bool Classify::LooksLikeGarbage(const DENORM& denorm, TBLOB *blob) { if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != NULL) { continue; } + float certainty = ratings_it.data()->certainty(); delete ratings; - return (ratings_it.data()->certainty() < - classify_character_fragments_garbage_certainty_threshold); + return certainty < + classify_character_fragments_garbage_certainty_threshold; } delete ratings; return true; // no whole characters in ratings @@ -2023,14 +1913,12 @@ bool Classify::LooksLikeGarbage(const DENORM& denorm, TBLOB *blob) { * array provided by the caller. * * @param Blob blob to extract features from - * @param denorm normalization/denormalization parameters * @param Templates used to compute char norm adjustments * @param IntFeatures array to fill with integer features * @param PrunerNormArray Array of factors from blob normalization * process * @param CharNormArray array to fill with dummy char norm adjustments * @param BlobLength length of blob in baseline-normalized units - * @param FeatureOutlineArray * * Globals: * - FeaturesHaveBeenExtracted TRUE if fx has been done @@ -2043,39 +1931,29 @@ bool Classify::LooksLikeGarbage(const DENORM& denorm, TBLOB *blob) { * @note History: Tue May 28 10:40:52 1991, DSJ, Created. 
*/ int Classify::GetCharNormFeatures(TBLOB *Blob, - const DENORM& denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8* PrunerNormArray, uinT8* CharNormArray, - inT32 *BlobLength, - inT32 *FeatureOutlineArray) { - register INT_FEATURE Src, Dest, End; + inT32 *BlobLength) { FEATURE NormFeature; FLOAT32 Baseline, Scale; - inT32 FeatureOutlineIndex[MAX_NUM_INT_FEATURES]; if (!FeaturesHaveBeenExtracted) { - FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures, - CharNormFeatures, &FXInfo, - FeatureOutlineIndex); + FeaturesOK = ExtractIntFeat(*Blob, classify_nonlinear_norm, + BaselineFeatures, CharNormFeatures, &FXInfo); FeaturesHaveBeenExtracted = TRUE; } + *BlobLength = IntCastRounded(FXInfo.Length / kStandardFeatureLength); if (!FeaturesOK) { - *BlobLength = FXInfo.NumBL; - return (0); + return 0; } - for (Src = CharNormFeatures, End = Src + FXInfo.NumCN, Dest = IntFeatures; - Src < End; - *Dest++ = *Src++); - for (int i = 0; FeatureOutlineArray && i < FXInfo.NumCN; ++i) { - FeatureOutlineArray[i] = FeatureOutlineIndex[i]; - } + memcpy(IntFeatures, CharNormFeatures, FXInfo.NumCN * sizeof(IntFeatures[0])); NormFeature = NewFeature(&CharNormDesc); - Baseline = BASELINE_OFFSET; + Baseline = kBlnBaselineOffset; Scale = MF_SCALE_FACTOR; NormFeature->Params[CharNormY] = (FXInfo.Ymean - Baseline) * Scale; NormFeature->Params[CharNormLength] = @@ -2083,8 +1961,7 @@ int Classify::GetCharNormFeatures(TBLOB *Blob, NormFeature->Params[CharNormRx] = FXInfo.Rx * Scale; NormFeature->Params[CharNormRy] = FXInfo.Ry * Scale; ComputeCharNormArrays(NormFeature, Templates, CharNormArray, PrunerNormArray); - *BlobLength = FXInfo.NumBL; - return (FXInfo.NumCN); + return FXInfo.NumCN; } /* GetCharNormFeatures */ // Computes the char_norm_array for the unicharset and, if not NULL, the @@ -2312,7 +2189,6 @@ PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features, * @param Templates current set of adaptive templates * @param ClassId class containing config to be made 
permanent * @param ConfigId config to be made permanent - * @param denorm normalization/denormalization parameters * @param Blob current blob being adapted to * * Globals: none @@ -2323,7 +2199,6 @@ PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features, void Classify::MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, - const DENORM& denorm, TBLOB *Blob) { UNICHAR_ID *Ambigs; TEMP_CONFIG Config; @@ -2339,7 +2214,7 @@ void Classify::MakePermanent(ADAPT_TEMPLATES Templates, Class->NumPermConfigs++; // Initialize permanent config. - Ambigs = GetAmbiguities(Blob, denorm, ClassId); + Ambigs = GetAmbiguities(Blob, ClassId); PERM_CONFIG Perm = (PERM_CONFIG) alloc_struct(sizeof(PERM_CONFIG_STRUCT), "PERM_CONFIG_STRUCT"); Perm->Ambigs = Ambigs; @@ -2555,164 +2430,48 @@ void Classify::SetAdaptiveThreshold(FLOAT32 Threshold) { /*---------------------------------------------------------------------------*/ /** - * This routine compares Blob to both sets of templates - * (adaptive and pre-trained) and then displays debug - * information for the config which matched best. + * This routine displays debug information for the best config + * of the given shape_id for the given set of features. * - * @param Blob blob to show best matching config for - * @param denorm normalization/denormalization parameters - * @param ClassId class whose configs are to be searched - * @param shape_id shape index - * @param AdaptiveOn TRUE if adaptive configs are enabled - * @param PreTrainedOn TRUE if pretrained configs are enabled - * @param Results results of match being debugged - * - * Globals: - * - PreTrainedTemplates built-in training - * - AdaptedTemplates adaptive templates - * - AllProtosOn dummy proto mask - * - AllConfigsOn dummy config mask + * @param shape_id classifier id to work with + * @param features features of the unknown character + * @param num_features Number of features in the features array. 
* * @note Exceptions: none * @note History: Fri Mar 22 08:43:52 1991, DSJ, Created. */ -void Classify::ShowBestMatchFor(TBLOB *Blob, - const DENORM& denorm, - CLASS_ID ClassId, - int shape_id, - BOOL8 AdaptiveOn, - BOOL8 PreTrainedOn, - ADAPT_RESULTS *Results) { - int NumCNFeatures = 0, NumBLFeatures = 0; - INT_FEATURE_ARRAY CNFeatures, BLFeatures; - INT_RESULT_STRUCT CNResult, BLResult; - inT32 BlobLength; - uinT32 ConfigMask; - static int next_config = -1; - - if (PreTrainedOn) next_config = -1; - - CNResult.Rating = BLResult.Rating = 2.0; - - if (!LegalClassId (ClassId)) { - cprintf ("%d is not a legal class id!!\n", ClassId); +void Classify::ShowBestMatchFor(int shape_id, + const INT_FEATURE_STRUCT* features, + int num_features) { + uinT32 config_mask; + if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) { + tprintf("No built-in templates for class/shape %d\n", shape_id); return; } - - uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES]; - uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES]; - - if (shape_table_ == NULL) - shape_id = ClassId; - else - shape_id = ShapeIDToClassID(shape_id); - if (PreTrainedOn && shape_id >= 0) { - if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) { - tprintf("No built-in templates for class/shape %d\n", shape_id); - } else { - NumCNFeatures = GetCharNormFeatures(Blob, denorm, PreTrainedTemplates, - CNFeatures, NULL, CNAdjust, - &BlobLength, NULL); - if (NumCNFeatures <= 0) { - tprintf("Illegal blob (char norm features)!\n"); - } else { - im_.SetCharNormMatch(classify_integer_matcher_multiplier); - im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), - AllProtosOn, AllConfigsOn, - NumCNFeatures, CNFeatures, - &CNResult, - classify_adapt_feature_threshold, NO_DEBUG, - matcher_debug_separate_windows); - ExpandShapesAndApplyCorrections(NULL, false, shape_id, - Blob->bounding_box().bottom(), - Blob->bounding_box().top(), - 0, BlobLength, CNAdjust, - CNResult, Results); - } - } + if (num_features <= 0) { + tprintf("Illegal blob (char norm 
features)!\n"); + return; } - - if (AdaptiveOn) { - if (ClassId < 0 || ClassId >= AdaptedTemplates->Templates->NumClasses) { - tprintf("Invalid adapted class id: %d\n", ClassId); - } else if (UnusedClassIdIn(AdaptedTemplates->Templates, ClassId) || - AdaptedTemplates->Class[ClassId] == NULL || - IsEmptyAdaptedClass(AdaptedTemplates->Class[ClassId])) { - tprintf("No AD templates for class %d = %s\n", - ClassId, unicharset.id_to_unichar(ClassId)); - } else { - NumBLFeatures = GetBaselineFeatures(Blob, - denorm, - AdaptedTemplates->Templates, - BLFeatures, BLAdjust, - &BlobLength); - if (NumBLFeatures <= 0) - tprintf("Illegal blob (baseline features)!\n"); - else { - im_.SetBaseLineMatch(); - im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId), - AllProtosOn, AllConfigsOn, - NumBLFeatures, BLFeatures, - &BLResult, - classify_adapt_feature_threshold, NO_DEBUG, - matcher_debug_separate_windows); - ExpandShapesAndApplyCorrections( - AdaptedTemplates->Class, false, - ClassId, Blob->bounding_box().bottom(), - Blob->bounding_box().top(), 0, BlobLength, CNAdjust, - BLResult, Results); - } - } - } - + INT_RESULT_STRUCT cn_result; + classify_norm_method.set_value(character); + im_.SetCharNormMatch(classify_integer_matcher_multiplier); + im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), + AllProtosOn, AllConfigsOn, + num_features, features, &cn_result, + classify_adapt_feature_threshold, NO_DEBUG, + matcher_debug_separate_windows); tprintf("\n"); - if (BLResult.Rating < CNResult.Rating) { - if (next_config < 0) { - ConfigMask = 1 << BLResult.Config; - next_config = 0; - } else { - ConfigMask = 1 << next_config; - ++next_config; - } - classify_norm_method.set_value(baseline); + config_mask = 1 << cn_result.Config; - im_.SetBaseLineMatch(); - tprintf("Adaptive Class ID: %d\n", ClassId); - im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId), - AllProtosOn, (BIT_VECTOR) &ConfigMask, - NumBLFeatures, BLFeatures, - &BLResult, - 
classify_adapt_feature_threshold, - matcher_debug_flags, - matcher_debug_separate_windows); - ExpandShapesAndApplyCorrections( - AdaptedTemplates->Class, true, - ClassId, Blob->bounding_box().bottom(), - Blob->bounding_box().top(), 0, BlobLength, CNAdjust, - BLResult, Results); - } else if (shape_id >= 0) { - ConfigMask = 1 << CNResult.Config; - classify_norm_method.set_value(character); - - tprintf("Static Shape ID: %d\n", shape_id); - im_.SetCharNormMatch(classify_integer_matcher_multiplier); - im_.Match(ClassForClassId (PreTrainedTemplates, shape_id), - AllProtosOn, (BIT_VECTOR) & ConfigMask, - NumCNFeatures, CNFeatures, - &CNResult, - classify_adapt_feature_threshold, - matcher_debug_flags, - matcher_debug_separate_windows); - ExpandShapesAndApplyCorrections(NULL, true, shape_id, - Blob->bounding_box().bottom(), - Blob->bounding_box().top(), - 0, BlobLength, CNAdjust, - CNResult, Results); - } - - // Clean up. - delete[] CNAdjust; - delete[] BLAdjust; + tprintf("Static Shape ID: %d\n", shape_id); + ShowMatchDisplay(); + im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), + AllProtosOn, reinterpret_cast(&config_mask), + num_features, features, &cn_result, + classify_adapt_feature_threshold, + matcher_debug_flags, + matcher_debug_separate_windows); + UpdateMatchDisplay(); } /* ShowBestMatchFor */ // Returns a string for the classifier class_id: either the corresponding @@ -2796,8 +2555,7 @@ bool Classify::TempConfigReliable(CLASS_ID class_id, return true; } -void Classify::UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, - TBLOB *Blob) { +void Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob) { const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id); int ambigs_size = (ambigs == NULL) ? 
0 : ambigs->size(); @@ -2818,7 +2576,7 @@ void Classify::UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, getDict().getUnicharset().debug_str( ambig_class_id).string()); } - MakePermanent(AdaptedTemplates, ambig_class_id, cfg, denorm, Blob); + MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob); } } } diff --git a/classify/baseline.h b/classify/baseline.h deleted file mode 100644 index e4addca8b..000000000 --- a/classify/baseline.h +++ /dev/null @@ -1,41 +0,0 @@ -/* -*-C-*- - ******************************************************************************** - * - * File: baseline.h (Formerly baseline.h) - * Description: - * Author: Mark Seaman, SW Productivity - * Created: Fri Oct 16 14:37:00 1987 - * Modified: Wed Feb 27 13:39:35 1991 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Reusable Software Component - * - * (c) Copyright 1987, Hewlett-Packard Company. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - *************************************************************************/ -#ifndef BASELINE_H -#define BASELINE_H - -/*---------------------------------------------------------------------- - I n c l u d e s -----------------------------------------------------------------------*/ -#include "host.h" -#include "blobs.h" -#include "params.h" - -/*---------------------------------------------------------------------- - T y p e s -----------------------------------------------------------------------*/ -#define BASELINE_OFFSET 64 -#define BASELINE_SCALE 128 - -#endif diff --git a/classify/blobclass.cpp b/classify/blobclass.cpp index 93ea3fc03..cac3b409f 100644 --- a/classify/blobclass.cpp +++ b/classify/blobclass.cpp @@ -49,8 +49,11 @@ extern char imagefile[]; ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ +// As all TBLOBs, Blob is in baseline normalized coords. +// See SetupBLCNDenorms in intfx.cpp for other args. void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, - TBLOB * Blob, const DENORM& denorm, const char* BlobText) { + TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) { /* ** Parameters: ** Blob blob whose micro-features are to be learned @@ -95,18 +98,20 @@ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, cprintf("TRAINING ... 
Font name = %s\n", CurrFontName.string()); } - LearnBlob(FeatureDefs, FeatureFile, Blob, denorm, BlobText, - CurrFontName.string()); + LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info, + BlobText, CurrFontName.string()); } // LearnBlob void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile, - TBLOB* Blob, const DENORM& denorm, + TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText, const char* FontName) { CHAR_DESC CharDesc; ASSERT_HOST(FeatureFile != NULL); - CharDesc = ExtractBlobFeatures(FeatureDefs, denorm, Blob); + CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info, + Blob); if (CharDesc == NULL) { cprintf("LearnBLob: CharDesc was NULL. Aborting.\n"); return; diff --git a/classify/blobclass.h b/classify/blobclass.h index 57d27a0da..95510a2f9 100644 --- a/classify/blobclass.h +++ b/classify/blobclass.h @@ -40,11 +40,14 @@ Public Function Prototypes ----------------------------------------------------------------------------**/ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, - TBLOB * Blob, const DENORM& denorm, const char* BlobText); + TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, + const char* BlobText); void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* File, TBLOB* Blob, - const DENORM& denorm, const char* BlobText, - const char* FontName); + const DENORM& bl_denorm, const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, + const char* BlobText, const char* FontName); /**---------------------------------------------------------------------------- Global Data Definitions and Declarations diff --git a/classify/classify.cpp b/classify/classify.cpp index b88c1510f..1eca2e9c2 100644 --- a/classify/classify.cpp +++ b/classify/classify.cpp @@ -26,6 +26,7 @@ #include "intproto.h" #include "mfoutline.h" #include "scrollview.h" 
+#include "shapeclassifier.h" #include "shapetable.h" #include "unicity_table.h" #include @@ -52,6 +53,11 @@ Classify::Classify() this->params()), /* PREV DEFAULT 0.1 */ double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...", this->params()), /* PREV DEFAULT 0.3 */ + double_MEMBER(classify_max_rating_ratio, 1.5, + "Veto ratio between classifier ratings", this->params()), + double_MEMBER(classify_max_certainty_margin, 5.5, + "Veto difference between classifier certainties", + this->params()), BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching", this->params()), BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching", @@ -65,6 +71,8 @@ Classify::Classify() "Save adapted templates to a file", this->params()), BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger", this->params()), + BOOL_MEMBER(classify_nonlinear_norm, 0, + "Non-linear stroke-density normalization", this->params()), INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()), INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()), INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ", @@ -100,6 +108,12 @@ Classify::Classify() this->params()), double_MEMBER(tessedit_class_miss_scale, 0.00390625, "Scale factor for features not used", this->params()), + double_MEMBER(classify_adapted_pruning_factor, 2.5, + "Prune poor adapted results this much worse than best result", + this->params()), + double_MEMBER(classify_adapted_pruning_threshold, -1.0, + "Threshold at which classify_adapted_pruning_factor starts", + this->params()), INT_MEMBER(classify_adapt_proto_threshold, 230, "Threshold for good protos during adaptive 0-255", this->params()), @@ -122,19 +136,24 @@ Classify::Classify() this->params()), INT_MEMBER(classify_class_pruner_threshold, 229, "Class Pruner Threshold 0-255", this->params()), - INT_MEMBER(classify_class_pruner_multiplier, 30, + INT_MEMBER(classify_class_pruner_multiplier, 15, 
"Class Pruner Multiplier 0-255: ", this->params()), INT_MEMBER(classify_cp_cutoff_strength, 7, "Class Pruner CutoffStrength: ", this->params()), - INT_MEMBER(classify_integer_matcher_multiplier, 14, + INT_MEMBER(classify_integer_matcher_multiplier, 10, "Integer Matcher Multiplier 0-255: ", this->params()), EnableLearning(true), INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word", this->params()), BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", this->params()), + double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size", + this->params()), + double_MEMBER(speckle_rating_penalty, 10.0, + "Penalty to add to worst rating for noise", this->params()), shape_table_(NULL), - dict_(&image_) { + dict_(&image_), + static_classifier_(NULL) { fontinfo_table_.set_compare_callback( NewPermanentTessCallback(CompareFontInfo)); fontinfo_table_.set_clear_callback( @@ -184,4 +203,45 @@ Classify::~Classify() { delete[] BaselineCutoffs; } + +// Takes ownership of the given classifier, and uses it for future calls +// to CharNormClassifier. +void Classify::SetStaticClassifier(ShapeClassifier* static_classifier) { + delete static_classifier_; + static_classifier_ = static_classifier; +} + +// Moved from speckle.cpp +// Adds a noise classification result that is a bit worse than the worst +// current result, or the worst possible result if no current results. +void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) { + BLOB_CHOICE_IT bc_it(choices); + // If there is no classifier result, we will use the worst possible certainty + // and corresponding rating. + float certainty = -getDict().certainty_scale; + float rating = rating_scale * blob_length; + if (!choices->empty() && blob_length > 0) { + bc_it.move_to_last(); + BLOB_CHOICE* worst_choice = bc_it.data(); + // Add speckle_rating_penalty to worst rating, matching old value. 
+ rating = worst_choice->rating() + speckle_rating_penalty; + // Compute the rating to correspond to the certainty. (Used to be kept + // the same, but that messes up the language model search.) + certainty = -rating * getDict().certainty_scale / + (rating_scale * blob_length); + } + BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty, + -1, -1, 0, 0, MAX_FLOAT32, 0, + BCC_SPECKLE_CLASSIFIER); + bc_it.add_to_end(blob_choice); +} + +// Returns true if the blob is small enough to be a large speckle. +bool Classify::LargeSpeckle(const TBLOB &blob) { + double speckle_size = kBlnXHeight * speckle_large_max_size; + TBOX bbox = blob.bounding_box(); + return bbox.width() < speckle_size && bbox.height() < speckle_size; +} + + } // namespace tesseract diff --git a/classify/classify.h b/classify/classify.h index abdceef2d..92629da71 100644 --- a/classify/classify.h +++ b/classify/classify.h @@ -43,8 +43,10 @@ static const int kBlankFontinfoId = -2; namespace tesseract { +class ShapeClassifier; struct ShapeRating; class ShapeTable; +struct UnicharRating; // How segmented is a blob. In this enum, character refers to a classifiable // unit, but that is too long and character is usually easier to understand. @@ -67,6 +69,17 @@ class Classify : public CCStruct { return shape_table_; } + // Takes ownership of the given classifier, and uses it for future calls + // to CharNormClassifier. + void SetStaticClassifier(ShapeClassifier* static_classifier); + + // Adds a noise classification result that is a bit worse than the worst + // current result, or the worst possible result if no current results. + void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices); + + // Returns true if the blob is small enough to be a large speckle. 
+ bool LargeSpeckle(const TBLOB &blob); + /* adaptive.cpp ************************************************************/ ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset); int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId); @@ -112,9 +125,7 @@ class Classify : public CCStruct { // incorrectly segmented blobs. If filename is not NULL, then LearnBlob // is called and the data will be written to a file for static training. // Otherwise AdaptToBlob is called for adaption within a document. - // If rejmap is not NULL, then only chars with a rejmap entry of '1' will - // be learned, otherwise all chars with good correct_text are learned. - void LearnWord(const char* filename, const char *rejmap, WERD_RES *word); + void LearnWord(const char* filename, WERD_RES *word); // Builds a blob of length fragments, from the word, starting at start, // and then learn it, as having the given correct_text. @@ -130,18 +141,15 @@ class Classify : public CCStruct { const char* correct_text, WERD_RES *word); void InitAdaptiveClassifier(bool load_pre_trained_templates); void InitAdaptedClass(TBLOB *Blob, - const DENORM& denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates); void AdaptToPunc(TBLOB *Blob, - const DENORM& denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold); void AmbigClassifier(TBLOB *Blob, - const DENORM& denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, @@ -194,15 +202,8 @@ class Classify : public CCStruct { #ifndef GRAPHICS_DISABLED void DebugAdaptiveClassifier(TBLOB *Blob, - const DENORM& denorm, ADAPT_RESULTS *Results); #endif - void GetAdaptThresholds (TWERD * Word, - const DENORM& denorm, - const WERD_CHOICE& BestChoice, - const WERD_CHOICE& BestRawChoice, - FLOAT32 Thresholds[]); - PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], @@ -218,19 +219,14 @@ class Classify : public CCStruct { void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID 
ClassId, int ConfigId, - const DENORM& denorm, TBLOB *Blob); void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results); void RemoveExtraPuncs(ADAPT_RESULTS *Results); void RemoveBadMatches(ADAPT_RESULTS *Results); void SetAdaptiveThreshold(FLOAT32 Threshold); - void ShowBestMatchFor(TBLOB *Blob, - const DENORM& denorm, - CLASS_ID ClassId, - int shape_id, - BOOL8 AdaptiveOn, - BOOL8 PreTrainedOn, - ADAPT_RESULTS *Results); + void ShowBestMatchFor(int shape_id, + const INT_FEATURE_STRUCT* features, + int num_features); // Returns a string for the classifier class_id: either the corresponding // unicharset debug_str or the shape_table_ debug str. STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates, @@ -251,59 +247,46 @@ class Classify : public CCStruct { // unichar-id!). Uses a search, so not fast. int ShapeIDToClassID(int shape_id) const; UNICHAR_ID *BaselineClassifier(TBLOB *Blob, - const DENORM& denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results); int CharNormClassifier(TBLOB *Blob, - const DENORM& denorm, INT_TEMPLATES Templates, ADAPT_RESULTS *Results); // As CharNormClassifier, but operates on a TrainingSample and outputs to // a GenericVector of ShapeRating without conversion to classes. 
- int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample, - GenericVector* results); - UNICHAR_ID *GetAmbiguities(TBLOB *Blob, - const DENORM& denorm, - CLASS_ID CorrectClass); - void DoAdaptiveMatch(TBLOB *Blob, - const DENORM& denorm, - ADAPT_RESULTS *Results); + int CharNormTrainingSample(bool pruner_only, int keep_this, + const TrainingSample& sample, + GenericVector* results); + UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass); + void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results); void AdaptToChar(TBLOB *Blob, - const DENORM& denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold); - void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm, - INT_CLASS_STRUCT* int_class); - int AdaptableWord(TWERD *Word, - const WERD_CHOICE &BestChoiceWord, - const WERD_CHOICE &RawChoiceWord); + void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class); + bool AdaptableWord(WERD_RES* word); void EndAdaptiveClassifier(); void PrintAdaptiveStatistics(FILE *File); void SettupPass1(); void SettupPass2(); void AdaptiveClassifier(TBLOB *Blob, - const DENORM& denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results); void ClassifyAsNoise(ADAPT_RESULTS *Results); void ResetAdaptiveClassifierInternal(); int GetBaselineFeatures(TBLOB *Blob, - const DENORM& denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8* CharNormArray, inT32 *BlobLength); int GetCharNormFeatures(TBLOB *Blob, - const DENORM& denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8* PrunerNormArray, uinT8* CharNormArray, - inT32 *BlobLength, - inT32 *FeatureOutlineIndex); + inT32 *BlobLength); // Computes the char_norm_array for the unicharset and, if not NULL, the // pruner_array as appropriate according to the existence of the shape_table. // The norm_feature is deleted as it is almost certainly no longer needed. 
@@ -313,13 +296,54 @@ class Classify : public CCStruct { uinT8* pruner_array); bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config); - void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob); + void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob); void ResetFeaturesHaveBeenExtracted(); bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; } - bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob); + bool LooksLikeGarbage(TBLOB *blob); void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox); + // intfx.cpp + // Computes the DENORMS for bl(baseline) and cn(character) normalization + // during feature extraction. The input denorm describes the current state + // of the blob, which is usually a baseline-normalized word. + // The Transforms setup are as follows: + // Baseline Normalized (bl) Output: + // We center the grapheme by aligning the x-coordinate of its centroid with + // x=128 and leaving the already-baseline-normalized y as-is. + // + // Character Normalized (cn) Output: + // We align the grapheme's centroid at the origin and scale it + // asymmetrically in x and y so that the 2nd moments are a standard value + // (51.2) ie the result is vaguely square. + // If classify_nonlinear_norm is true: + // A non-linear normalization is setup that attempts to evenly distribute + // edges across x and y. + // + // Some of the fields of fx_info are also setup: + // Length: Total length of outline. + // Rx: Rounded y second moment. (Reversed by convention.) + // Ry: rounded x second moment. + // Xmean: Rounded x center of mass of the blob. + // Ymean: Rounded y center of mass of the blob. 
+ static void SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm, + DENORM* bl_denorm, DENORM* cn_denorm, + INT_FX_RESULT_STRUCT* fx_info); + + // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as + // (x,y) position and angle as measured counterclockwise from the vector + // <-1, 0>, from blob using two normalizations defined by bl_denorm and + // cn_denorm. See SetpuBLCNDenorms for definitions. + // If outline_cn_counts is not NULL, on return it contains the cumulative + // number of cn features generated for each outline in the blob (in order). + // Thus after the first outline, there were (*outline_cn_counts)[0] features, + // after the second outline, there were (*outline_cn_counts)[1] features etc. + static void ExtractFeatures(const TBLOB& blob, + bool nonlinear_norm, + GenericVector* bl_features, + GenericVector* cn_features, + INT_FX_RESULT_STRUCT* results, + GenericVector* outline_cn_counts); /* float2int.cpp ************************************************************/ void ClearCharNormArray(uinT8* char_norm_array); void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature, @@ -336,6 +360,9 @@ class Classify : public CCStruct { UnicityTable& get_fontinfo_table() { return fontinfo_table_; } + const UnicityTable& get_fontinfo_table() const { + return fontinfo_table_; + } UnicityTable& get_fontset_table() { return fontset_table_; } @@ -365,6 +392,10 @@ class Classify : public CCStruct { double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ..."); double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ..."); double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ..."); + double_VAR_H(classify_max_rating_ratio, 1.5, + "Veto ratio between classifier ratings"); + double_VAR_H(classify_max_certainty_margin, 5.5, + "Veto difference between classifier certainties"); /* adaptmatch.cpp ***********************************************************/ BOOL_VAR_H(tess_cn_matching, 0, 
"Character Normalized Matching"); @@ -375,6 +406,8 @@ class Classify : public CCStruct { BOOL_VAR_H(classify_save_adapted_templates, 0, "Save adapted templates to a file"); BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger"); + BOOL_VAR_H(classify_nonlinear_norm, 0, + "Non-linear stroke-density normalization"); INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level"); INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags"); INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: "); @@ -398,6 +431,10 @@ class Classify : public CCStruct { double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); double_VAR_H(tessedit_class_miss_scale, 0.00390625, "Scale factor for features not used"); + double_VAR_H(classify_adapted_pruning_factor, 2.5, + "Prune poor adapted results this much worse than best result"); + double_VAR_H(classify_adapted_pruning_threshold, -1.0, + "Threshold at which classify_adapted_pruning_factor starts"); INT_VAR_H(classify_adapt_proto_threshold, 230, "Threshold for good protos during adaptive 0-255"); INT_VAR_H(classify_adapt_feature_threshold, 230, @@ -418,11 +455,11 @@ class Classify : public CCStruct { /* intmatcher.cpp **********************************************************/ INT_VAR_H(classify_class_pruner_threshold, 229, "Class Pruner Threshold 0-255"); - INT_VAR_H(classify_class_pruner_multiplier, 30, + INT_VAR_H(classify_class_pruner_multiplier, 15, "Class Pruner Multiplier 0-255: "); INT_VAR_H(classify_cp_cutoff_strength, 7, "Class Pruner CutoffStrength: "); - INT_VAR_H(classify_integer_matcher_multiplier, 14, + INT_VAR_H(classify_integer_matcher_multiplier, 10, "Integer Matcher Multiplier 0-255: "); // Use class variables to hold onto built-in templates and adapted templates. 
@@ -453,6 +490,9 @@ class Classify : public CCStruct { INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word"); BOOL_VAR_H(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9]."); + double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size"); + double_VAR_H(speckle_rating_penalty, 10.0, + "Penalty to add to worst rating for noise"); protected: IntegerMatcher im_; @@ -466,6 +506,8 @@ class Classify : public CCStruct { private: Dict dict_; + // The currently active static classifier. + ShapeClassifier* static_classifier_; /* variables used to hold performance statistics */ int AdaptiveMatcherCalls; diff --git a/classify/cluster.cpp b/classify/cluster.cpp index 964ab2b7b..6c78c6f14 100644 --- a/classify/cluster.cpp +++ b/classify/cluster.cpp @@ -15,11 +15,12 @@ ** See the License for the specific language governing permissions and ** limitations under the License. ******************************************************************************/ -#include "oldheap.h" #include "const.h" #include "cluster.h" #include "emalloc.h" +#include "genericheap.h" #include "helpers.h" +#include "kdpair.h" #include "matrix.h" #include "tprintf.h" #include "danerror.h" @@ -164,6 +165,9 @@ struct TEMPCLUSTER { CLUSTER *Neighbor; }; +typedef tesseract::KDPairInc ClusterPair; +typedef tesseract::GenericHeap ClusterHeap; + struct STATISTICS { FLOAT32 AvgVariance; FLOAT32 *CoVariance; @@ -190,7 +194,7 @@ struct CHISTRUCT{ // For use with KDWalk / MakePotentialClusters struct ClusteringContext { - HEAP *heap; // heap used to hold temp clusters, "best" on top + ClusterHeap *heap; // heap used to hold temp clusters, "best" on top TEMPCLUSTER *candidates; // array of potential clusters KDTREE *tree; // kd-tree to be searched for neighbors inT32 next; // next candidate to be used @@ -693,7 +697,7 @@ History: 5/29/89, DSJ, Created. 
******************************************************************************/ void CreateClusterTree(CLUSTERER *Clusterer) { ClusteringContext context; - HEAPENTRY HeapEntry; + ClusterPair HeapEntry; TEMPCLUSTER *PotentialCluster; // each sample and its nearest neighbor form a "potential" cluster @@ -702,12 +706,12 @@ void CreateClusterTree(CLUSTERER *Clusterer) { context.candidates = (TEMPCLUSTER *) Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER)); context.next = 0; - context.heap = MakeHeap(Clusterer->NumberOfSamples); + context.heap = new ClusterHeap(Clusterer->NumberOfSamples); KDWalk(context.tree, (void_proc)MakePotentialClusters, &context); // form potential clusters into actual clusters - always do "best" first - while (GetTopOfHeap(context.heap, &HeapEntry) != EMPTY) { - PotentialCluster = (TEMPCLUSTER *)HeapEntry.Data; + while (context.heap->Pop(&HeapEntry)) { + PotentialCluster = HeapEntry.data; // if main cluster of potential cluster is already in another cluster // then we don't need to worry about it @@ -720,9 +724,9 @@ void CreateClusterTree(CLUSTERER *Clusterer) { else if (PotentialCluster->Neighbor->Clustered) { PotentialCluster->Neighbor = FindNearestNeighbor(context.tree, PotentialCluster->Cluster, - &HeapEntry.Key); + &HeapEntry.key); if (PotentialCluster->Neighbor != NULL) { - HeapStore(context.heap, &HeapEntry); + context.heap->Push(&HeapEntry); } } @@ -732,9 +736,9 @@ void CreateClusterTree(CLUSTERER *Clusterer) { MakeNewCluster(Clusterer, PotentialCluster); PotentialCluster->Neighbor = FindNearestNeighbor(context.tree, PotentialCluster->Cluster, - &HeapEntry.Key); + &HeapEntry.key); if (PotentialCluster->Neighbor != NULL) { - HeapStore(context.heap, &HeapEntry); + context.heap->Push(&HeapEntry); } } } @@ -745,7 +749,7 @@ void CreateClusterTree(CLUSTERER *Clusterer) { // free up the memory used by the K-D tree, heap, and temp clusters FreeKDTree(context.tree); Clusterer->KDTree = NULL; - FreeHeap(context.heap); + delete 
context.heap; memfree(context.candidates); } // CreateClusterTree @@ -763,16 +767,16 @@ void CreateClusterTree(CLUSTERER *Clusterer) { ******************************************************************************/ void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level) { - HEAPENTRY HeapEntry; + ClusterPair HeapEntry; int next = context->next; context->candidates[next].Cluster = Cluster; - HeapEntry.Data = (char *) &(context->candidates[next]); + HeapEntry.data = &(context->candidates[next]); context->candidates[next].Neighbor = FindNearestNeighbor(context->tree, context->candidates[next].Cluster, - &HeapEntry.Key); + &HeapEntry.key); if (context->candidates[next].Neighbor != NULL) { - HeapStore(context->heap, &HeapEntry); + context->heap->Push(&HeapEntry); context->next++; } } // MakePotentialClusters diff --git a/classify/errorcounter.cpp b/classify/errorcounter.cpp index 06d973546..706e534a2 100644 --- a/classify/errorcounter.cpp +++ b/classify/errorcounter.cpp @@ -27,6 +27,9 @@ namespace tesseract { +// Difference in result rating to be thought of as an "equal" choice. +const double kRatingEpsilon = 1.0 / 32; + // Tests a classifier, computing its error rate. // See errorcounter.h for description of arguments. // Iterates over the samples, calling the classifier in normal/silent mode. @@ -35,14 +38,12 @@ namespace tesseract { // with a debug flag and a keep_this argument to find out what is going on. 
double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier, int report_level, CountTypes boosting_mode, - const UnicityTable& fontinfo_table, + const FontInfoTable& fontinfo_table, const GenericVector& page_images, SampleIterator* it, double* unichar_error, double* scaled_error, STRING* fonts_report) { - int charsetsize = it->shape_table()->unicharset().size(); - int shapesize = it->CompactCharsetSize(); int fontsize = it->sample_set()->NumFonts(); - ErrorCounter counter(charsetsize, shapesize, fontsize); - GenericVector results; + ErrorCounter counter(classifier->GetUnicharset(), fontsize); + GenericVector results; clock_t start = clock(); int total_samples = 0; @@ -56,21 +57,28 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier, Pix* page_pix = 0 <= page_index && page_index < page_images.size() ? page_images[page_index] : NULL; // No debug, no keep this. - classifier->ClassifySample(*mutable_sample, page_pix, 0, INVALID_UNICHAR_ID, - &results); - if (mutable_sample->class_id() == 0) { + classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, + INVALID_UNICHAR_ID, &results); + bool debug_it = false; + int correct_id = mutable_sample->class_id(); + if (counter.unicharset_.has_special_codes() && + (correct_id == UNICHAR_SPACE || correct_id == UNICHAR_JOINED || + correct_id == UNICHAR_BROKEN)) { // This is junk so use the special counter. - counter.AccumulateJunk(*it->shape_table(), results, mutable_sample); - } else if (counter.AccumulateErrors(report_level > 3, boosting_mode, - fontinfo_table, *it->shape_table(), - results, mutable_sample) && - error_samples > 0) { + debug_it = counter.AccumulateJunk(report_level > 3, + results, + mutable_sample); + } else { + debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode, + fontinfo_table, + results, mutable_sample); + } + if (debug_it && error_samples > 0) { // Running debug, keep the correct answer, and debug the classifier. 
- tprintf("Error on sample %d: Classifier debug output:\n", - it->GlobalSampleIndex()); - int keep_this = it->GetSparseClassID(); - classifier->ClassifySample(*mutable_sample, page_pix, 1, keep_this, - &results); + tprintf("Error on sample %d: %s Classifier debug output:\n", + it->GlobalSampleIndex(), + it->sample_set()->SampleToString(*mutable_sample).string()); + classifier->DebugDisplay(*mutable_sample, page_pix, correct_id); --error_samples; } ++total_samples; @@ -89,12 +97,70 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier* classifier, return unscaled_error; } +// Tests a pair of classifiers, debugging errors of the new against the old. +// See errorcounter.h for description of arguments. +// Iterates over the samples, calling the classifiers in normal/silent mode. +// If the new_classifier makes a boosting_mode error that the old_classifier +// does not, it will then call the new_classifier again with a debug flag +// and a keep_this argument to find out what is going on. +void ErrorCounter::DebugNewErrors( + ShapeClassifier* new_classifier, ShapeClassifier* old_classifier, + CountTypes boosting_mode, + const FontInfoTable& fontinfo_table, + const GenericVector& page_images, SampleIterator* it) { + int fontsize = it->sample_set()->NumFonts(); + ErrorCounter old_counter(old_classifier->GetUnicharset(), fontsize); + ErrorCounter new_counter(new_classifier->GetUnicharset(), fontsize); + GenericVector results; + + int total_samples = 0; + int error_samples = 25; + int total_new_errors = 0; + // Iterate over all the samples, accumulating errors. + for (it->Begin(); !it->AtEnd(); it->Next()) { + TrainingSample* mutable_sample = it->MutableSample(); + int page_index = mutable_sample->page_num(); + Pix* page_pix = 0 <= page_index && page_index < page_images.size() + ? page_images[page_index] : NULL; + // No debug, no keep this. 
+ old_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, + INVALID_UNICHAR_ID, &results); + int correct_id = mutable_sample->class_id(); + if (correct_id != 0 && + !old_counter.AccumulateErrors(true, boosting_mode, fontinfo_table, + results, mutable_sample)) { + // old classifier was correct, check the new one. + new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 0, + INVALID_UNICHAR_ID, &results); + if (correct_id != 0 && + new_counter.AccumulateErrors(true, boosting_mode, fontinfo_table, + results, mutable_sample)) { + tprintf("New Error on sample %d: Classifier debug output:\n", + it->GlobalSampleIndex()); + ++total_new_errors; + new_classifier->UnicharClassifySample(*mutable_sample, page_pix, 1, + correct_id, &results); + if (results.size() > 0 && error_samples > 0) { + new_classifier->DebugDisplay(*mutable_sample, page_pix, correct_id); + --error_samples; + } + } + } + ++total_samples; + } + tprintf("Total new errors = %d\n", total_new_errors); +} + // Constructor is private. Only anticipated use of ErrorCounter is via // the static ComputeErrorRate. 
-ErrorCounter::ErrorCounter(int charsetsize, int shapesize, int fontsize) - : scaled_error_(0.0), unichar_counts_(charsetsize, shapesize, 0) { +ErrorCounter::ErrorCounter(const UNICHARSET& unicharset, int fontsize) + : scaled_error_(0.0), rating_epsilon_(kRatingEpsilon), + unichar_counts_(unicharset.size(), unicharset.size(), 0), + ok_score_hist_(0, 101), bad_score_hist_(0, 101), + unicharset_(unicharset) { Counts empty_counts; font_counts_.init_to_size(fontsize, empty_counts); + multi_unichar_counts_.init_to_size(unicharset.size(), 0); } ErrorCounter::~ErrorCounter() { } @@ -107,13 +173,11 @@ ErrorCounter::~ErrorCounter() { // for error counting and shape_table is used to understand the relationship // between unichar_ids and shape_ids in the results bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode, - const UnicityTable& font_table, - const ShapeTable& shape_table, - const GenericVector& results, + const FontInfoTable& font_table, + const GenericVector& results, TrainingSample* sample) { int num_results = results.size(); - int res_index = 0; - bool debug_it = false; + int answer_actual_rank = -1; int font_id = sample->font_id(); int unichar_id = sample->class_id(); sample->set_is_error(false); @@ -123,107 +187,143 @@ bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode, // improve the classifier. sample->set_is_error(true); ++font_counts_[font_id].n[CT_REJECT]; - } else if (shape_table.GetShape(results[0].shape_id). - ContainsUnicharAndFont(unichar_id, font_id)) { - ++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT]; - // Unichar and font OK, but count if multiple unichars. - if (shape_table.GetShape(results[0].shape_id).size() > 1) - ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR]; } else { - // This is a top shape error. - ++font_counts_[font_id].n[CT_SHAPE_TOP_ERR]; - // Check to see if any font in the top choice has attributes that match. 
- bool attributes_match = false; - uinT32 font_props = font_table.get(font_id).properties; - const Shape& shape = shape_table.GetShape(results[0].shape_id); - for (int c = 0; c < shape.size() && !attributes_match; ++c) { - for (int f = 0; f < shape[c].font_ids.size(); ++f) { - if (font_table.get(shape[c].font_ids[f]).properties == font_props) { - attributes_match = true; - break; - } + // Find rank of correct unichar answer, using rating_epsilon_ to allow + // different answers to score as equal. (Ignoring the font.) + int epsilon_rank = 0; + int answer_epsilon_rank = -1; + int num_top_answers = 0; + double prev_rating = results[0].rating; + bool joined = false; + bool broken = false; + int res_index = 0; + while (res_index < num_results) { + if (results[res_index].rating < prev_rating - rating_epsilon_) { + ++epsilon_rank; + prev_rating = results[res_index].rating; } - } - // TODO(rays) It is easy to add counters for individual font attributes - // here if we want them. - if (!attributes_match) - ++font_counts_[font_id].n[CT_FONT_ATTR_ERR]; - if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true); - // Find rank of correct unichar answer. (Ignoring the font.) - while (res_index < num_results && - !shape_table.GetShape(results[res_index].shape_id). - ContainsUnichar(unichar_id)) { + if (results[res_index].unichar_id == unichar_id && + answer_epsilon_rank < 0) { + answer_epsilon_rank = epsilon_rank; + answer_actual_rank = res_index; + } + if (results[res_index].unichar_id == UNICHAR_JOINED && + unicharset_.has_special_codes()) + joined = true; + else if (results[res_index].unichar_id == UNICHAR_BROKEN && + unicharset_.has_special_codes()) + broken = true; + else if (epsilon_rank == 0) + ++num_top_answers; ++res_index; } - if (res_index == 0) { + if (answer_actual_rank != 0) { + // Correct result is not absolute top. 
+ ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR]; + if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) sample->set_is_error(true); + } + if (answer_epsilon_rank == 0) { + ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK]; // Unichar OK, but count if multiple unichars. - if (shape_table.GetShape(results[res_index].shape_id).size() > 1) { + if (num_top_answers > 1) { ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR]; + ++multi_unichar_counts_[unichar_id]; + } + // Check to see if any font in the top choice has attributes that match. + // TODO(rays) It is easy to add counters for individual font attributes + // here if we want them. + if (font_table.SetContainsFontProperties( + font_id, results[answer_actual_rank].fonts)) { + // Font attributes were matched. + // Check for multiple properties. + if (font_table.SetContainsMultipleFontProperties( + results[answer_actual_rank].fonts)) + ++font_counts_[font_id].n[CT_OK_MULTI_FONT]; + } else { + // Font attributes weren't matched. + ++font_counts_[font_id].n[CT_FONT_ATTR_ERR]; } } else { - // Count maps from unichar id to shape id. - if (num_results > 0) - ++unichar_counts_(unichar_id, results[0].shape_id); - // This is a unichar error. + // This is a top unichar error. ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR]; if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true); - if (res_index >= MIN(2, num_results)) { + // Count maps from unichar id to wrong unichar id. + ++unichar_counts_(unichar_id, results[0].unichar_id); + if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) { // It is also a 2nd choice unichar error. ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR]; if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true); } - if (res_index >= num_results) { + if (answer_epsilon_rank < 0) { // It is also a top-n choice unichar error. 
++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR]; if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true); - debug_it = debug; + answer_epsilon_rank = epsilon_rank; } } + // Compute mean number of return values and mean rank of correct answer. + font_counts_[font_id].n[CT_NUM_RESULTS] += num_results; + font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank; + if (joined) + ++font_counts_[font_id].n[CT_OK_JOINED]; + if (broken) + ++font_counts_[font_id].n[CT_OK_BROKEN]; } - // Compute mean number of return values and mean rank of correct answer. - font_counts_[font_id].n[CT_NUM_RESULTS] += num_results; - font_counts_[font_id].n[CT_RANK] += res_index; // If it was an error for boosting then sum the weight. if (sample->is_error()) { scaled_error_ += sample->weight(); - } - if (debug_it) { - tprintf("%d results for char %s font %d :", - num_results, shape_table.unicharset().id_to_unichar(unichar_id), - font_id); - for (int i = 0; i < num_results; ++i) { - tprintf(" %.3f/%.3f:%s", - results[i].rating, results[i].font, - shape_table.DebugStr(results[i].shape_id).string()); + if (debug) { + tprintf("%d results for char %s font %d :", + num_results, unicharset_.id_to_unichar(unichar_id), + font_id); + for (int i = 0; i < num_results; ++i) { + tprintf(" %.3f : %s\n", + results[i].rating, + unicharset_.id_to_unichar(results[i].unichar_id)); + } + return true; } - tprintf("\n"); - return true; + int percent = 0; + if (num_results > 0) + percent = IntCastRounded(results[0].rating * 100); + bad_score_hist_.add(percent, 1); + } else { + int percent = 0; + if (answer_actual_rank >= 0) + percent = IntCastRounded(results[answer_actual_rank].rating * 100); + ok_score_hist_.add(percent, 1); } return false; } // Accumulates counts for junk. Counts only whether the junk was correctly // rejected or not. 
-void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table, - const GenericVector& results, +bool ErrorCounter::AccumulateJunk(bool debug, + const GenericVector& results, TrainingSample* sample) { // For junk we accept no answer, or an explicit shape answer matching the // class id of the sample. int num_results = results.size(); int font_id = sample->font_id(); int unichar_id = sample->class_id(); - if (num_results > 0 && - !shape_table.GetShape(results[0].shape_id).ContainsUnichar(unichar_id)) { + int percent = 0; + if (num_results > 0) + percent = IntCastRounded(results[0].rating * 100); + if (num_results > 0 && results[0].unichar_id != unichar_id) { // This is a junk error. ++font_counts_[font_id].n[CT_ACCEPTED_JUNK]; sample->set_is_error(true); // It counts as an error for boosting too so sum the weight. scaled_error_ += sample->weight(); + bad_score_hist_.add(percent, 1); + return debug; } else { // Correctly rejected. ++font_counts_[font_id].n[CT_REJECTED_JUNK]; sample->set_is_error(false); + ok_score_hist_.add(percent, 1); } + return false; } // Creates a report of the error rate. The report_level controls the detail @@ -239,7 +339,7 @@ void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table, // If not NULL, the report string is saved in fonts_report. // (Ignoring report_level). double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, - const UnicityTable& fontinfo_table, + const FontInfoTable& fontinfo_table, const SampleIterator& it, double* unichar_error, STRING* fonts_report) { @@ -251,7 +351,7 @@ double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, // Accumulate counts over fonts. 
totals += font_counts_[f]; STRING font_report; - if (ReportString(font_counts_[f], &font_report)) { + if (ReportString(false, font_counts_[f], &font_report)) { if (fonts_report != NULL) { *fonts_report += fontinfo_table.get(f).name; *fonts_report += ": "; @@ -264,39 +364,59 @@ double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, } } } + // Report the totals. + STRING total_report; + bool any_results = ReportString(true, totals, &total_report); + if (fonts_report != NULL && fonts_report->length() == 0) { + // Make sure we return something even if there were no samples. + *fonts_report = "NoSamplesFound: "; + *fonts_report += total_report; + *fonts_report += "\n"; + } if (report_level > 0) { // Report the totals. STRING total_report; - if (ReportString(totals, &total_report)) { + if (any_results) { tprintf("TOTAL Scaled Err=%.4g%%, %s\n", scaled_error_ * 100.0, total_report.string()); } // Report the worst substitution error only for now. if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) { - const UNICHARSET& unicharset = it.shape_table()->unicharset(); - int charsetsize = unicharset.size(); - int shapesize = it.CompactCharsetSize(); + int charsetsize = unicharset_.size(); int worst_uni_id = 0; - int worst_shape_id = 0; + int worst_result_id = 0; int worst_err = 0; for (int u = 0; u < charsetsize; ++u) { - for (int s = 0; s < shapesize; ++s) { - if (unichar_counts_(u, s) > worst_err) { - worst_err = unichar_counts_(u, s); + for (int v = 0; v < charsetsize; ++v) { + if (unichar_counts_(u, v) > worst_err) { + worst_err = unichar_counts_(u, v); worst_uni_id = u; - worst_shape_id = s; + worst_result_id = v; } } } if (worst_err > 0) { tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n", - worst_uni_id, unicharset.id_to_unichar(worst_uni_id), - it.shape_table()->DebugStr(worst_shape_id).string(), + worst_uni_id, unicharset_.id_to_unichar(worst_uni_id), + unicharset_.id_to_unichar(worst_result_id), worst_err, totals.n[CT_UNICHAR_TOP1_ERR], 100.0 * 
worst_err / totals.n[CT_UNICHAR_TOP1_ERR]); } } + tprintf("Multi-unichar shape use:\n"); + for (int u = 0; u < multi_unichar_counts_.size(); ++u) { + if (multi_unichar_counts_[u] > 0) { + tprintf("%d multiple answers for unichar: %s\n", + multi_unichar_counts_[u], + unicharset_.id_to_unichar(u)); + } + } + tprintf("OK Score histogram:\n"); + ok_score_hist_.print(); + tprintf("ERROR Score histogram:\n"); + bad_score_hist_.print(); } + double rates[CT_SIZE]; if (!ComputeRates(totals, rates)) return 0.0; @@ -308,32 +428,37 @@ double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, // Sets the report string to a combined human and machine-readable report // string of the error rates. -// Returns false if there is no data, leaving report unchanged. -bool ErrorCounter::ReportString(const Counts& counts, STRING* report) { +// Returns false if there is no data, leaving report unchanged, unless +// even_if_empty is true. +bool ErrorCounter::ReportString(bool even_if_empty, const Counts& counts, + STRING* report) { // Compute the error rates. double rates[CT_SIZE]; - if (!ComputeRates(counts, rates)) + if (!ComputeRates(counts, rates) && !even_if_empty) return false; // Using %.4g%%, the length of the output string should exactly match the // length of the format string, but in case of overflow, allow for +eddd // on each number. const int kMaxExtraLength = 5; // Length of +eddd. // Keep this format string and the snprintf in sync with the CountTypes enum. 
- const char* format_str = "ShapeErr=%.4g%%, FontAttr=%.4g%%, " - "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], " - "Multi=%.4g%%, Rej=%.4g%%, " + const char* format_str = "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] " + "Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, " + "FontAttr=%.4g%%, Multi=%.4g%%, " "Answers=%.3g, Rank=%.3g, " "OKjunk=%.4g%%, Badjunk=%.4g%%"; int max_str_len = strlen(format_str) + kMaxExtraLength * (CT_SIZE - 1) + 1; char* formatted_str = new char[max_str_len]; snprintf(formatted_str, max_str_len, format_str, - rates[CT_SHAPE_TOP_ERR] * 100.0, - rates[CT_FONT_ATTR_ERR] * 100.0, rates[CT_UNICHAR_TOP1_ERR] * 100.0, rates[CT_UNICHAR_TOP2_ERR] * 100.0, rates[CT_UNICHAR_TOPN_ERR] * 100.0, + rates[CT_UNICHAR_TOPTOP_ERR] * 100.0, rates[CT_OK_MULTI_UNICHAR] * 100.0, + rates[CT_OK_JOINED] * 100.0, + rates[CT_OK_BROKEN] * 100.0, rates[CT_REJECT] * 100.0, + rates[CT_FONT_ATTR_ERR] * 100.0, + rates[CT_OK_MULTI_FONT] * 100.0, rates[CT_NUM_RESULTS], rates[CT_RANK], 100.0 * rates[CT_REJECTED_JUNK], @@ -350,13 +475,9 @@ bool ErrorCounter::ReportString(const Counts& counts, STRING* report) { // Computes the error rates and returns in rates which is an array of size // CT_SIZE. Returns false if there is no data, leaving rates unchanged. bool ErrorCounter::ComputeRates(const Counts& counts, double rates[CT_SIZE]) { - int ok_samples = counts.n[CT_SHAPE_TOP_CORRECT] + counts.n[CT_SHAPE_TOP_ERR] + + int ok_samples = counts.n[CT_UNICHAR_TOP_OK] + counts.n[CT_UNICHAR_TOP1_ERR] + counts.n[CT_REJECT]; int junk_samples = counts.n[CT_REJECTED_JUNK] + counts.n[CT_ACCEPTED_JUNK]; - if (ok_samples == 0 && junk_samples == 0) { - // There is no data. - return false; - } // Compute rates for normal chars. 
double denominator = static_cast(MAX(ok_samples, 1)); for (int ct = 0; ct <= CT_RANK; ++ct) @@ -365,7 +486,7 @@ bool ErrorCounter::ComputeRates(const Counts& counts, double rates[CT_SIZE]) { denominator = static_cast(MAX(junk_samples, 1)); for (int ct = CT_REJECTED_JUNK; ct <= CT_ACCEPTED_JUNK; ++ct) rates[ct] = counts.n[ct] / denominator; - return true; + return ok_samples != 0 || junk_samples != 0; } ErrorCounter::Counts::Counts() { diff --git a/classify/errorcounter.h b/classify/errorcounter.h index 618d56878..61af5014c 100644 --- a/classify/errorcounter.h +++ b/classify/errorcounter.h @@ -18,6 +18,7 @@ #include "genericvector.h" #include "matrix.h" +#include "statistc.h" struct Pix; template class UnicityTable; @@ -25,11 +26,11 @@ template class UnicityTable; namespace tesseract { struct FontInfo; +class FontInfoTable; class SampleIterator; class ShapeClassifier; -class ShapeRating; -class ShapeTable; class TrainingSample; +class UnicharRating; // Enumeration of the different types of error count. // Error counts work as follows: @@ -37,22 +38,21 @@ class TrainingSample; // Ground truth is a valid unichar-id / font-id pair: // Number of classifier answers? // 0 >0 -// CT_REJECT BOTH unichar-id and font-id match top shape? -// __________ yes! no -// CT_SHAPE_TOP_CORRECT CT_SHAPE_TOP_ERR -// | Font attributes match? -// | yes! no -// | | CT_FONT_ATTR_ERROR -// | Top unichar-id matches? -// | yes! no -// Top shape-id has multiple unichars? CT_UNICHAR_TOP1_ERR -// yes! no 2nd shape unichar id matches? -// CT_OK_MULTI_UNICHAR ________ yes! no -// ___________________ _____ CT_UNICHAR_TOP2_ERR -// Any unichar-id matches? -// yes! no -// ______ CT_UNICHAR_TOPN_ERR -// _________________ +// CT_REJECT unichar-id matches top shape? +// __________ yes! no +// CT_UNICHAR_TOP_OK CT_UNICHAR_TOP1_ERR +// Top shape-id has multiple unichars? 2nd shape unichar id matches? +// yes! no yes! no +// CT_OK_MULTI_UNICHAR | _____ CT_UNICHAR_TOP2_ERR +// Font attributes match? 
Any unichar-id matches? +// yes! no yes! no +// CT_FONT_ATTR_OK CT_FONT_ATTR_ERR ______ CT_UNICHAR_TOPN_ERR +// | __________________ _________________ +// Top shape-id has multiple font attrs? +// yes! no +// CT_OK_MULTI_FONT +// _____________________________ +// // Note that multiple counts may be activated for a single sample! // // Ground truth is for a fragment/n-gram that is NOT in the unicharset. @@ -67,14 +67,20 @@ class TrainingSample; // // Keep in sync with the ReportString function. enum CountTypes { - CT_SHAPE_TOP_CORRECT, // Top shape id is actually correct. - CT_SHAPE_TOP_ERR, // Top shape id is not correct. - CT_FONT_ATTR_ERR, // Font attributes incorrect, ignoring unichar. + CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id. + // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of + // kRatingEpsilon from the first result in each group. The real top choice + // is measured using TOPTOP. CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id. CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id. CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id. + CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct. CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others. + CT_OK_JOINED, // Top shape id is correct but marked joined. + CT_OK_BROKEN, // Top shape id is correct but marked broken. CT_REJECT, // Classifier hates this. + CT_FONT_ATTR_ERR, // Top unichar OK, but font attributes incorrect. + CT_OK_MULTI_FONT, // CT_FONT_ATTR_OK but there are multiple font attrs. CT_NUM_RESULTS, // Number of answers produced. CT_RANK, // Rank of correct answer. CT_REJECTED_JUNK, // Junk that was correctly rejected. @@ -115,12 +121,24 @@ class ErrorCounter { // * The return value is the un-weighted version of the scaled_error. 
static double ComputeErrorRate(ShapeClassifier* classifier, int report_level, CountTypes boosting_mode, - const UnicityTable& fontinfo_table, + const FontInfoTable& fontinfo_table, const GenericVector& page_images, SampleIterator* it, double* unichar_error, double* scaled_error, STRING* fonts_report); + // Tests a pair of classifiers, debugging errors of the new against the old. + // See errorcounter.h for description of arguments. + // Iterates over the samples, calling the classifiers in normal/silent mode. + // If the new_classifier makes a boosting_mode error that the old_classifier + // does not, and the appropriate, it will then call the new_classifier again + // with a debug flag and a keep_this argument to find out what is going on. + static void DebugNewErrors(ShapeClassifier* new_classifier, + ShapeClassifier* old_classifier, + CountTypes boosting_mode, + const FontInfoTable& fontinfo_table, + const GenericVector& page_images, + SampleIterator* it); private: // Simple struct to hold an array of counts. @@ -134,7 +152,7 @@ class ErrorCounter { // Constructor is private. Only anticipated use of ErrorCounter is via // the static ComputeErrorRate. - ErrorCounter(int charsetsize, int shapesize, int fontsize); + ErrorCounter(const UNICHARSET& unicharset, int fontsize); ~ErrorCounter(); // Accumulates the errors from the classifier results on a single sample. @@ -145,15 +163,13 @@ class ErrorCounter { // for error counting and shape_table is used to understand the relationship // between unichar_ids and shape_ids in the results bool AccumulateErrors(bool debug, CountTypes boosting_mode, - const UnicityTable& font_table, - const ShapeTable& shape_table, - const GenericVector& results, + const FontInfoTable& font_table, + const GenericVector& results, TrainingSample* sample); // Accumulates counts for junk. Counts only whether the junk was correctly // rejected or not. 
- void AccumulateJunk(const ShapeTable& shape_table, - const GenericVector& results, + bool AccumulateJunk(bool debug, const GenericVector& results, TrainingSample* sample); // Creates a report of the error rate. The report_level controls the detail @@ -169,15 +185,17 @@ class ErrorCounter { // If not NULL, the report string is saved in fonts_report. // (Ignoring report_level). double ReportErrors(int report_level, CountTypes boosting_mode, - const UnicityTable& fontinfo_table, + const FontInfoTable& fontinfo_table, const SampleIterator& it, double* unichar_error, STRING* fonts_report); // Sets the report string to a combined human and machine-readable report // string of the error rates. - // Returns false if there is no data, leaving report unchanged. - static bool ReportString(const Counts& counts, STRING* report); + // Returns false if there is no data, leaving report unchanged, unless + // even_if_empty is true. + static bool ReportString(bool even_if_empty, const Counts& counts, + STRING* report); // Computes the error rates and returns in rates which is an array of size // CT_SIZE. Returns false if there is no data, leaving rates unchanged. @@ -186,11 +204,22 @@ class ErrorCounter { // Total scaled error used by boosting algorithms. double scaled_error_; + // Difference in result rating to be thought of as an "equal" choice. + double rating_epsilon_; // Vector indexed by font_id from the samples of error accumulators. GenericVector font_counts_; // Counts of the results that map each unichar_id (from samples) to an // incorrect shape_id. GENERIC_2D_ARRAY unichar_counts_; + // Count of the number of times each shape_id occurs, is correct, and multi- + // unichar. + GenericVector multi_unichar_counts_; + // Histogram of scores (as percent) for correct answers. + STATS ok_score_hist_; + // Histogram of scores (as percent) for incorrect answers. + STATS bad_score_hist_; + // Unicharset for printing character ids in results. 
+ const UNICHARSET& unicharset_; }; } // namespace tesseract. diff --git a/classify/extract.cpp b/classify/extract.cpp index b8e595a7c..822c733e4 100644 --- a/classify/extract.cpp +++ b/classify/extract.cpp @@ -49,8 +49,10 @@ void ExtractorStub(); * @note History: Sun Jan 21 10:07:28 1990, DSJ, Created. */ CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - const DENORM& denorm, TBLOB *Blob) { - return (ExtractFlexFeatures(FeatureDefs, Blob, denorm)); + const DENORM& bl_denorm, const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, + TBLOB *Blob) { + return ExtractFlexFeatures(FeatureDefs, Blob, bl_denorm, cn_denorm, fx_info); } /* ExtractBlobFeatures */ /*----------------------------------------------------------------------------- diff --git a/classify/extract.h b/classify/extract.h index 844393157..1f80c20e4 100644 --- a/classify/extract.h +++ b/classify/extract.h @@ -26,8 +26,12 @@ class DENORM; /*----------------------------------------------------------------------------- Public Function Prototypes -----------------------------------------------------------------------------*/ +// Deprecated! Will be deleted soon! +// In the meantime, as all TBLOBs, Blob is in baseline normalized coords. +// See SetupBLCNDenorms in intfx.cpp for other args. 
CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - const DENORM& denorm, TBLOB *Blob); + const DENORM& bl_denorm, const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, TBLOB *Blob); /*--------------------------------------------------------------------------- Private Function Prototypes diff --git a/classify/featdefs.cpp b/classify/featdefs.cpp index 95d8cd8c3..df03bf7f5 100644 --- a/classify/featdefs.cpp +++ b/classify/featdefs.cpp @@ -19,7 +19,7 @@ Include Files and Type Defines -----------------------------------------------------------------------------*/ #ifdef _MSC_VER -#include "mathfix.h" +#include #endif #include "featdefs.h" diff --git a/classify/flexfx.cpp b/classify/flexfx.cpp index 44f975e7e..2ddbe3a02 100644 --- a/classify/flexfx.cpp +++ b/classify/flexfx.cpp @@ -28,8 +28,13 @@ Public Code ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ +// Deprecated! Will be deleted soon! +// In the meantime, as all TBLOBs, Blob is in baseline normalized coords. +// See SetupBLCNDenorms in intfx.cpp for other args. 
CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - TBLOB *Blob, const DENORM& denorm) { + TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info) { /* ** Parameters: ** Blob blob to extract features from @@ -50,8 +55,13 @@ CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, if (FeatureDefs.FeatureExtractors[Type] != NULL && FeatureDefs.FeatureExtractors[Type]->Extractor != NULL) { CharDesc->FeatureSets[Type] = - (FeatureDefs.FeatureExtractors[Type])->Extractor(Blob, denorm); + (FeatureDefs.FeatureExtractors[Type])->Extractor(Blob, + bl_denorm, + cn_denorm, + fx_info); if (CharDesc->FeatureSets[Type] == NULL) { + tprintf("Feature extractor for type %d = %s returned NULL!\n", + Type, FeatureDefs.FeatureDesc[Type]->ShortName); FreeCharDescription(CharDesc); return NULL; } diff --git a/classify/flexfx.h b/classify/flexfx.h index 52e45a6a3..21c4fa261 100644 --- a/classify/flexfx.h +++ b/classify/flexfx.h @@ -27,7 +27,10 @@ /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ +// As with all TBLOBs this one is also baseline normalized. CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - TBLOB *Blob, const DENORM& denorm); + TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info); #endif diff --git a/classify/intfeaturespace.cpp b/classify/intfeaturespace.cpp index 5f911dcee..866a539e7 100644 --- a/classify/intfeaturespace.cpp +++ b/classify/intfeaturespace.cpp @@ -90,8 +90,7 @@ void IntFeatureSpace::IndexAndSortFeatures( // window, or -1 if the feature is a miss. int IntFeatureSpace::XYToFeatureIndex(int x, int y) const { // Round the x,y position to a feature. Search for a valid theta. 
- INT_FEATURE_STRUCT feature = {static_cast(x), static_cast(y), - 0, 0}; + INT_FEATURE_STRUCT feature(x, y, 0); int index = -1; for (int theta = 0; theta <= MAX_UINT8 && index < 0; ++theta) { feature.Theta = theta; @@ -127,16 +126,10 @@ int IntFeatureSpace::XYToFeatureIndex(int x, int y) const { INT_FEATURE_STRUCT IntFeatureSpace::PositionFromBuckets(int x, int y, int theta) const { - INT_FEATURE_STRUCT pos = { - static_cast(ClipToRange( - (x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_, - 0, MAX_UINT8)), - static_cast(ClipToRange( - (y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_, - 0, MAX_UINT8)), - static_cast(ClipToRange( - DivRounded(theta * kIntFeatureExtent, theta_buckets_), - 0, MAX_UINT8))}; + INT_FEATURE_STRUCT pos( + (x * kIntFeatureExtent + kIntFeatureExtent / 2) / x_buckets_, + (y * kIntFeatureExtent + kIntFeatureExtent / 2) / y_buckets_, + DivRounded(theta * kIntFeatureExtent, theta_buckets_)); return pos; } diff --git a/classify/intfx.cpp b/classify/intfx.cpp index 0763b7afb..63d6ddb1e 100644 --- a/classify/intfx.cpp +++ b/classify/intfx.cpp @@ -1,8 +1,11 @@ /****************************************************************************** ** Filename: intfx.c ** Purpose: Integer character normalization & feature extraction - ** Author: Robert Moss + ** Author: Robert Moss, rays@google.com (Ray Smith) ** History: Tue May 21 15:51:57 MDT 1991, RWM, Created. + ** Tue Feb 28 10:42:00 PST 2012, vastly rewritten to allow + greyscale fx and non-linear + normalization. ** ** (c) Copyright Hewlett-Packard Company, 1988. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,48 +22,26 @@ Include Files and Type Defines ----------------------------------------------------------------------------**/ #include "intfx.h" -#include "intmatcher.h" +#include "allheaders.h" +#include "ccutil.h" +#include "classify.h" #include "const.h" #include "helpers.h" -#include "ccutil.h" +#include "intmatcher.h" +#include "linlsq.h" +#include "ndminx.h" +#include "normalis.h" #include "statistc.h" #include "trainingsample.h" -#ifdef __UNIX__ -#endif using tesseract::TrainingSample; -/**---------------------------------------------------------------------------- - Private Function Prototypes -----------------------------------------------------------------------------**/ -int SaveFeature(); -uinT8 BinaryAnglePlusPi(inT32 Y, inT32 X); -uinT8 MySqrt2(); -void ClipRadius(); - -INT_VAR(classify_radius_gyr_min_man, 255, - "Minimum Radius of Gyration Mantissa 0-255: "); - -INT_VAR(classify_radius_gyr_min_exp, 0, - "Minimum Radius of Gyration Exponent 0-255: "); - -INT_VAR(classify_radius_gyr_max_man, 158, - "Maximum Radius of Gyration Mantissa 0-255: "); - -INT_VAR(classify_radius_gyr_max_exp, 8, - "Maximum Radius of Gyration Exponent 0-255: "); - /**---------------------------------------------------------------------------- Global Data Definitions and Declarations ----------------------------------------------------------------------------**/ -#define ATAN_TABLE_SIZE 64 - -// Look up table for arc tangent containing: -// atan(0.0) ... atan(ATAN_TABLE_SIZE - 1 / ATAN_TABLE_SIZE) -// The entries are in binary degrees where a full circle is 256 binary degrees. -static uinT8 AtanTable[ATAN_TABLE_SIZE]; // Look up table for cos and sin to turn the intfx feature angle to a vector. -// Also protected by atan_table_mutex. +// Protected by atan_table_mutex. +// The entries are in binary degrees where a full circle is 256 binary degrees. 
static float cos_table[INT_CHAR_NORM_RANGE]; static float sin_table[INT_CHAR_NORM_RANGE]; // Guards write access to AtanTable so we dont create it more than once. @@ -75,10 +56,6 @@ void InitIntegerFX() { static bool atan_table_init = false; atan_table_mutex.Lock(); if (!atan_table_init) { - for (int i = 0; i < ATAN_TABLE_SIZE; i++) { - AtanTable[i] = - (uinT8) (atan ((i / (float) ATAN_TABLE_SIZE)) * 128.0 / PI + 0.5); - } for (int i = 0; i < INT_CHAR_NORM_RANGE; ++i) { cos_table[i] = cos(i * 2 * PI / INT_CHAR_NORM_RANGE + PI); sin_table[i] = sin(i * 2 * PI / INT_CHAR_NORM_RANGE + PI); @@ -94,31 +71,435 @@ FCOORD FeatureDirection(uinT8 theta) { return FCOORD(cos_table[theta], sin_table[theta]); } -TrainingSample* GetIntFeatures(tesseract::NormalizationMode mode, - TBLOB *blob, const DENORM& denorm) { - INT_FEATURE_ARRAY blfeatures; - INT_FEATURE_ARRAY cnfeatures; +namespace tesseract { + +// Generates a TrainingSample from a TBLOB. Extracts features and sets +// the bounding box, so classifiers that operate on the image can work. +// TODO(rays) BlobToTrainingSample must remain a global function until +// the FlexFx and FeatureDescription code can be removed and LearnBlob +// made a member of Classify. +TrainingSample* BlobToTrainingSample(const TBLOB& blob, + tesseract::NormalizationMode mode, + bool nonlinear_norm) { INT_FX_RESULT_STRUCT fx_info; - ExtractIntFeat(blob, denorm, blfeatures, cnfeatures, &fx_info, NULL); + GenericVector bl_features; + GenericVector cn_features; + Classify::ExtractFeatures(blob, nonlinear_norm, &bl_features, + &cn_features, &fx_info, NULL); + // TODO(rays) Use blob->PreciseBoundingBox() instead. 
+ TBOX box = blob.bounding_box(); TrainingSample* sample = NULL; if (mode == tesseract::NM_CHAR_ANISOTROPIC) { int num_features = fx_info.NumCN; if (num_features > 0) { - sample = TrainingSample::CopyFromFeatures(fx_info, cnfeatures, + sample = TrainingSample::CopyFromFeatures(fx_info, box, &cn_features[0], num_features); } } else if (mode == tesseract::NM_BASELINE) { int num_features = fx_info.NumBL; if (num_features > 0) { - sample = TrainingSample::CopyFromFeatures(fx_info, blfeatures, + sample = TrainingSample::CopyFromFeatures(fx_info, box, &bl_features[0], num_features); } } else { ASSERT_HOST(!"Unsupported normalization mode!"); } + if (sample != NULL) { + // Set the bounding box (in original image coordinates) in the sample. + TPOINT topleft, botright; + topleft.x = box.left(); + topleft.y = box.top(); + botright.x = box.right(); + botright.y = box.bottom(); + TPOINT original_topleft, original_botright; + blob.denorm().DenormTransform(NULL, topleft, &original_topleft); + blob.denorm().DenormTransform(NULL, botright, &original_botright); + sample->set_bounding_box(TBOX(original_topleft.x, original_botright.y, + original_botright.x, original_topleft.y)); + } return sample; } +// Computes the DENORMS for bl(baseline) and cn(character) normalization +// during feature extraction. The input denorm describes the current state +// of the blob, which is usually a baseline-normalized word. +// The Transforms setup are as follows: +// Baseline Normalized (bl) Output: +// We center the grapheme by aligning the x-coordinate of its centroid with +// x=128 and leaving the already-baseline-normalized y as-is. +// +// Character Normalized (cn) Output: +// We align the grapheme's centroid at the origin and scale it +// asymmetrically in x and y so that the 2nd moments are a standard value +// (51.2) ie the result is vaguely square. +// If classify_nonlinear_norm is true: +// A non-linear normalization is setup that attempts to evenly distribute +// edges across x and y. 
+// +// Some of the fields of fx_info are also setup: +// Length: Total length of outline. +// Rx: Rounded y second moment. (Reversed by convention.) +// Ry: rounded x second moment. +// Xmean: Rounded x center of mass of the blob. +// Ymean: Rounded y center of mass of the blob. +void Classify::SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm, + DENORM* bl_denorm, DENORM* cn_denorm, + INT_FX_RESULT_STRUCT* fx_info) { + // Compute 1st and 2nd moments of the original outline. + FCOORD center, second_moments; + int length = blob.ComputeMoments(¢er, &second_moments); + if (fx_info != NULL) { + fx_info->Length = length; + fx_info->Rx = IntCastRounded(second_moments.y()); + fx_info->Ry = IntCastRounded(second_moments.x()); + + fx_info->Xmean = IntCastRounded(center.x()); + fx_info->Ymean = IntCastRounded(center.y()); + } + // Setup the denorm for Baseline normalization. + bl_denorm->SetupNormalization(NULL, NULL, &blob.denorm(), center.x(), 128.0f, + 1.0f, 1.0f, 128.0f, 128.0f); + // Setup the denorm for character normalization. + if (nonlinear_norm) { + GenericVector > x_coords; + GenericVector > y_coords; + TBOX box; + blob.GetPreciseBoundingBox(&box); + box.pad(1, 1); + blob.GetEdgeCoords(box, &x_coords, &y_coords); + cn_denorm->SetupNonLinear(&blob.denorm(), box, MAX_UINT8, MAX_UINT8, + 0.0f, 0.0f, x_coords, y_coords); + } else { + cn_denorm->SetupNormalization(NULL, NULL, &blob.denorm(), + center.x(), center.y(), + 51.2f / second_moments.x(), + 51.2f / second_moments.y(), + 128.0f, 128.0f); + } +} + +// Helper normalizes the direction, assuming that it is at the given +// unnormed_pos, using the given denorm, starting at the root_denorm. +uinT8 NormalizeDirection(uinT8 dir, const FCOORD& unnormed_pos, + const DENORM& denorm, const DENORM* root_denorm) { + // Convert direction to a vector. 
+ FCOORD unnormed_end; + unnormed_end.from_direction(dir); + unnormed_end += unnormed_pos; + FCOORD normed_pos, normed_end; + denorm.NormTransform(root_denorm, unnormed_pos, &normed_pos); + denorm.NormTransform(root_denorm, unnormed_end, &normed_end); + normed_end -= normed_pos; + return normed_end.to_direction(); +} + +// Helper returns the mean direction vector from the given stats. Use the +// mean direction from dirs if there is information available, otherwise, use +// the fit_vector from point_diffs. +static FCOORD MeanDirectionVector(const LLSQ& point_diffs, const LLSQ& dirs, + const FCOORD& start_pt, + const FCOORD& end_pt) { + FCOORD fit_vector; + if (dirs.count() > 0) { + // There were directions, so use them. To avoid wrap-around problems, we + // have 2 accumulators in dirs: x for normal directions and y for + // directions offset by 128. We will use the one with the least variance. + FCOORD mean_pt = dirs.mean_point(); + double mean_dir = 0.0; + if (dirs.x_variance() <= dirs.y_variance()) { + mean_dir = mean_pt.x(); + } else { + mean_dir = mean_pt.y() + 128; + } + fit_vector.from_direction(Modulo(IntCastRounded(mean_dir), 256)); + } else { + // There were no directions, so we rely on the vector_fit to the points. + // Since the vector_fit is 180 degrees ambiguous, we align with the + // supplied feature_dir by making the scalar product non-negative. + FCOORD feature_dir(end_pt - start_pt); + fit_vector = point_diffs.vector_fit(); + if (fit_vector.x() == 0.0f && fit_vector.y() == 0.0f) { + // There was only a single point. Use feature_dir directly. + fit_vector = feature_dir; + } else { + // Sometimes the least mean squares fit is wrong, due to the small sample + // of points and scaling. Use a 90 degree rotated vector if that matches + // feature_dir better. + FCOORD fit_vector2 = !fit_vector; + // The fit_vector is 180 degrees ambiguous, so resolve the ambiguity by + // insisting that the scalar product with the feature_dir should be +ve. 
+ if (fit_vector % feature_dir < 0.0) + fit_vector = -fit_vector; + if (fit_vector2 % feature_dir < 0.0) + fit_vector2 = -fit_vector2; + // Even though fit_vector2 has a higher mean squared error, it might be + // a better fit, so use it if the dot product with feature_dir is bigger. + if (fit_vector2 % feature_dir > fit_vector % feature_dir) + fit_vector = fit_vector2; + } + } + return fit_vector; +} + +// Helper computes one or more features corresponding to the given points. +// Emitted features are on the line defined by: +// start_pt + lambda * (end_pt - start_pt) for scalar lambda. +// Features are spaced at feature_length intervals. +static int ComputeFeatures(const FCOORD& start_pt, const FCOORD& end_pt, + double feature_length, + GenericVector* features) { + FCOORD feature_vector(end_pt - start_pt); + if (feature_vector.x() == 0.0f && feature_vector.y() == 0.0f) return 0; + // Compute theta for the feature based on its direction. + uinT8 theta = feature_vector.to_direction(); + // Compute the number of features and lambda_step. + double target_length = feature_vector.length(); + int num_features = IntCastRounded(target_length / feature_length); + if (num_features == 0) return 0; + // Divide the length evenly into num_features pieces. + double lambda_step = 1.0 / num_features; + double lambda = lambda_step / 2.0; + for (int f = 0; f < num_features; ++f, lambda += lambda_step) { + FCOORD feature_pt(start_pt); + feature_pt += feature_vector * lambda; + INT_FEATURE_STRUCT feature(feature_pt, theta); + features->push_back(feature); + } + return num_features; +} + +// Gathers outline points and their directions from start_index into dirs by +// stepping along the outline and normalizing the coordinates until the +// required feature_length has been collected or end_index is reached. 
+// On input pos must point to the position corresponding to start_index and on +// return pos is updated to the current raw position, and pos_normed is set to +// the normed version of pos. +// Since directions wrap-around, they need special treatment to get the mean. +// Provided the cluster of directions doesn't straddle the wrap-around point, +// the simple mean works. If they do, then, unless the directions are wildly +// varying, the cluster rotated by 180 degrees will not straddle the wrap- +// around point, so mean(dir + 180 degrees) - 180 degrees will work. Since +// LLSQ conveniently stores the mean of 2 variables, we use it to store +// dir and dir+128 (128 is 180 degrees) and then use the resulting mean +// with the least variance. +static int GatherPoints(const C_OUTLINE* outline, double feature_length, + const DENORM& denorm, const DENORM* root_denorm, + int start_index, int end_index, + ICOORD* pos, FCOORD* pos_normed, + LLSQ* points, LLSQ* dirs) { + int step_length = outline->pathlength(); + ICOORD step = outline->step(start_index % step_length); + // Prev_normed is the start point of this collection and will be set on the + // first iteration, and on later iterations used to determine the length + // that has been collected. + FCOORD prev_normed; + points->clear(); + dirs->clear(); + int num_points = 0; + int index; + for (index = start_index; index <= end_index; ++index, *pos += step) { + step = outline->step(index % step_length); + int edge_weight = outline->edge_strength_at_index(index % step_length); + if (edge_weight == 0) { + // This point has conflicting gradient and step direction, so ignore it. + continue; + } + // Get the sub-pixel precise location and normalize. + FCOORD f_pos = outline->sub_pixel_pos_at_index(*pos, index % step_length); + denorm.NormTransform(root_denorm, f_pos, pos_normed); + if (num_points == 0) { + // The start of this segment. 
+ prev_normed = *pos_normed; + } else { + FCOORD offset = *pos_normed - prev_normed; + float length = offset.length(); + if (length > feature_length) { + // We have gone far enough from the start. We will use this point in + // the next set so return what we have so far. + return index; + } + } + points->add(pos_normed->x(), pos_normed->y(), edge_weight); + int direction = outline->direction_at_index(index % step_length); + if (direction >= 0) { + direction = NormalizeDirection(direction, f_pos, denorm, root_denorm); + // Use both the direction and direction +128 so we are not trying to + // take the mean of something straddling the wrap-around point. + dirs->add(direction, Modulo(direction + 128, 256)); + } + ++num_points; + } + return index; +} + +// Extracts Tesseract features and appends them to the features vector. +// Startpt to lastpt, inclusive, MUST have the same src_outline member, +// which may be NULL. The vector from lastpt to its next is included in +// the feature extraction. Hidden edges should be excluded by the caller. +// If force_poly is true, the features will be extracted from the polygonal +// approximation even if more accurate data is available. +static void ExtractFeaturesFromRun( + const EDGEPT* startpt, const EDGEPT* lastpt, + const DENORM& denorm, double feature_length, bool force_poly, + GenericVector* features) { + const EDGEPT* endpt = lastpt->next; + const C_OUTLINE* outline = startpt->src_outline; + if (outline != NULL && !force_poly) { + // Detailed information is available. We have to normalize only from + // the root_denorm to denorm. + const DENORM* root_denorm = denorm.RootDenorm(); + int total_features = 0; + // Get the features from the outline. + int step_length = outline->pathlength(); + int start_index = startpt->start_step; + // pos is the integer coordinates of the binary image steps. 
+ ICOORD pos = outline->position_at_index(start_index); + // We use an end_index that allows us to use a positive increment, but that + // may be beyond the bounds of the outline steps/ due to wrap-around, to + // so we use % step_length everywhere, except for start_index. + int end_index = lastpt->start_step + lastpt->step_count; + if (end_index <= start_index) + end_index += step_length; + LLSQ prev_points; + LLSQ prev_dirs; + FCOORD prev_normed_pos = outline->sub_pixel_pos_at_index(pos, start_index); + denorm.NormTransform(root_denorm, prev_normed_pos, &prev_normed_pos); + LLSQ points; + LLSQ dirs; + FCOORD normed_pos; + int index = GatherPoints(outline, feature_length, denorm, root_denorm, + start_index, end_index, &pos, &normed_pos, + &points, &dirs); + while (index <= end_index) { + // At each iteration we nominally have 3 accumulated sets of points and + // dirs: prev_points/dirs, points/dirs, next_points/dirs and sum them + // into sum_points/dirs, but we don't necessarily get any features out, + // so if that is the case, we keep accumulating instead of rotating the + // accumulators. + LLSQ next_points; + LLSQ next_dirs; + FCOORD next_normed_pos; + index = GatherPoints(outline, feature_length, denorm, root_denorm, + index, end_index, &pos, &next_normed_pos, + &next_points, &next_dirs); + LLSQ sum_points(prev_points); + // TODO(rays) find out why it is better to use just dirs and next_dirs + // in sum_dirs, instead of using prev_dirs as well. + LLSQ sum_dirs(dirs); + sum_points.add(points); + sum_points.add(next_points); + sum_dirs.add(next_dirs); + bool made_features = false; + // If we have some points, we can try making some features. + if (sum_points.count() > 0) { + // We have gone far enough from the start. Make a feature and restart. 
+ FCOORD fit_pt = sum_points.mean_point(); + FCOORD fit_vector = MeanDirectionVector(sum_points, sum_dirs, + prev_normed_pos, normed_pos); + // The segment to which we fit features is the line passing through + // fit_pt in direction of fit_vector that starts nearest to + // prev_normed_pos and ends nearest to normed_pos. + FCOORD start_pos = prev_normed_pos.nearest_pt_on_line(fit_pt, + fit_vector); + FCOORD end_pos = normed_pos.nearest_pt_on_line(fit_pt, fit_vector); + // Possible correction to match the adjacent polygon segment. + if (total_features == 0 && startpt != endpt) { + FCOORD poly_pos(startpt->pos.x, startpt->pos.y); + denorm.LocalNormTransform(poly_pos, &start_pos); + } + if (index > end_index && startpt != endpt) { + FCOORD poly_pos(endpt->pos.x, endpt->pos.y); + denorm.LocalNormTransform(poly_pos, &end_pos); + } + int num_features = ComputeFeatures(start_pos, end_pos, feature_length, + features); + if (num_features > 0) { + // We made some features so shuffle the accumulators. + prev_points = points; + prev_dirs = dirs; + prev_normed_pos = normed_pos; + points = next_points; + dirs = next_dirs; + made_features = true; + total_features += num_features; + } + // The end of the next set becomes the end next time around. + normed_pos = next_normed_pos; + } + if (!made_features) { + // We didn't make any features, so keep the prev accumulators and + // add the next ones into the current. + points.add(next_points); + dirs.add(next_dirs); + } + } + } else { + // There is no outline, so we are forced to use the polygonal approximation. 
+ const EDGEPT* pt = startpt; + do { + FCOORD start_pos(pt->pos.x, pt->pos.y); + FCOORD end_pos(pt->next->pos.x, pt->next->pos.y); + denorm.LocalNormTransform(start_pos, &start_pos); + denorm.LocalNormTransform(end_pos, &end_pos); + ComputeFeatures(start_pos, end_pos, feature_length, features); + } while ((pt = pt->next) != endpt); + } +} + +// Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as +// (x,y) position and angle as measured counterclockwise from the vector +// <-1, 0>, from blob using two normalizations defined by bl_denorm and +// cn_denorm. See SetpuBLCNDenorms for definitions. +// If outline_cn_counts is not NULL, on return it contains the cumulative +// number of cn features generated for each outline in the blob (in order). +// Thus after the first outline, there were (*outline_cn_counts)[0] features, +// after the second outline, there were (*outline_cn_counts)[1] features etc. +void Classify::ExtractFeatures(const TBLOB& blob, + bool nonlinear_norm, + GenericVector* bl_features, + GenericVector* cn_features, + INT_FX_RESULT_STRUCT* results, + GenericVector* outline_cn_counts) { + DENORM bl_denorm, cn_denorm; + tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm, + &bl_denorm, &cn_denorm, results); + if (outline_cn_counts != NULL) + outline_cn_counts->truncate(0); + // Iterate the outlines. + for (TESSLINE* ol = blob.outlines; ol != NULL; ol = ol->next) { + // Iterate the polygon. + EDGEPT* loop_pt = ol->FindBestStartPt(); + EDGEPT* pt = loop_pt; + if (pt == NULL) continue; + do { + if (pt->IsHidden()) continue; + // Find a run of equal src_outline. + EDGEPT* last_pt = pt; + do { + last_pt = last_pt->next; + } while (last_pt != loop_pt && !last_pt->IsHidden() && + last_pt->src_outline == pt->src_outline); + last_pt = last_pt->prev; + // Until the adaptive classifier can be weaned off polygon segments, + // we have to force extraction from the polygon for the bl_features. 
+ ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength, + true, bl_features); + ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength, + false, cn_features); + pt = last_pt; + } while ((pt = pt->next) != loop_pt); + if (outline_cn_counts != NULL) + outline_cn_counts->push_back(cn_features->size()); + } + results->NumBL = bl_features->size(); + results->NumCN = cn_features->size(); + results->YBottom = blob.bounding_box().bottom(); + results->YTop = blob.bounding_box().top(); + results->Width = blob.bounding_box().width(); +} + +} // namespace tesseract + /*--------------------------------------------------------------------------*/ // Extract a set of standard-sized features from Blobs and write them out in @@ -140,608 +521,25 @@ TrainingSample* GetIntFeatures(tesseract::NormalizationMode mode, // We align the grapheme's centroid at the origin and scale it asymmetrically // in x and y so that the result is vaguely square. // -int ExtractIntFeat(TBLOB *Blob, - const DENORM& denorm, - INT_FEATURE_ARRAY BLFeat, - INT_FEATURE_ARRAY CNFeat, - INT_FX_RESULT_STRUCT* Results, - inT32 *FeatureOutlineArray) { - - TESSLINE *OutLine; - EDGEPT *Loop, *LoopStart, *Segment; - inT16 LastX, LastY, Xmean, Ymean; - inT32 NormX, NormY, DeltaX, DeltaY; - inT32 Xsum, Ysum; - uinT32 Ix, Iy, LengthSum; - uinT16 n; - // n - the number of features to extract from a given outline segment. - // We extract features from every outline segment longer than ~6 units. - // We chop these long segments into standard-sized features approximately - // 13 (= 64 / 5) units in length. 
- uinT8 Theta; - uinT16 NumBLFeatures, NumCNFeatures; - uinT8 RxInv, RyInv; /* x.xxxxxxx * 2^Exp */ - uinT8 RxExp, RyExp; - /* sxxxxxxxxxxxxxxxxxxxxxxx.xxxxxxxx */ - register inT32 pfX, pfY, dX, dY; - uinT16 Length; - register int i; - - Results->Length = 0; - Results->Xmean = 0; - Results->Ymean = 0; - Results->Rx = 0; - Results->Ry = 0; - Results->NumBL = 0; - Results->NumCN = 0; - Results->YBottom = MAX_UINT8; - Results->YTop = 0; - - // Calculate the centroid (Xmean, Ymean) for the blob. - // We use centroid (instead of center of bounding box or center of smallest - // enclosing circle) so the algorithm will not be too greatly influenced by - // small amounts of information at the edge of a character's bounding box. - NumBLFeatures = 0; - NumCNFeatures = 0; - OutLine = Blob->outlines; - Xsum = 0; - Ysum = 0; - LengthSum = 0; - while (OutLine != NULL) { - LoopStart = OutLine->loop; - Loop = LoopStart; - LastX = Loop->pos.x; - LastY = Loop->pos.y; - /* Check for bad loops */ - if ((Loop == NULL) || (Loop->next == NULL) || (Loop->next == LoopStart)) - return FALSE; - do { - Segment = Loop; - Loop = Loop->next; - NormX = Loop->pos.x; - NormY = Loop->pos.y; - - n = 1; - if (!Segment->IsHidden()) { - DeltaX = NormX - LastX; - DeltaY = NormY - LastY; - Length = MySqrt(DeltaX, DeltaY); - n = ((Length << 2) + Length + 32) >> 6; - if (n != 0) { - Xsum += ((LastX << 1) + DeltaX) * (int) Length; - Ysum += ((LastY << 1) + DeltaY) * (int) Length; - LengthSum += Length; - } - } - if (n != 0) { /* Throw away a point that is too close */ - LastX = NormX; - LastY = NormY; - } - } - while (Loop != LoopStart); - OutLine = OutLine->next; +// Deprecated! Prefer tesseract::Classify::ExtractFeatures instead. 
+bool ExtractIntFeat(const TBLOB& blob,
+ bool nonlinear_norm,
+ INT_FEATURE_ARRAY baseline_features,
+ INT_FEATURE_ARRAY charnorm_features,
+ INT_FX_RESULT_STRUCT* results) {
+ GenericVector<INT_FEATURE_STRUCT> bl_features;
+ GenericVector<INT_FEATURE_STRUCT> cn_features;
+ tesseract::Classify::ExtractFeatures(blob, nonlinear_norm,
+ &bl_features, &cn_features, results,
+ NULL);
+ if (bl_features.size() == 0 || cn_features.size() == 0 ||
+ bl_features.size() > MAX_NUM_INT_FEATURES ||
+ cn_features.size() > MAX_NUM_INT_FEATURES) {
+ return false; // Feature extraction failed.
+ }
- if (LengthSum == 0)
- return FALSE;
- Xmean = (Xsum / (inT32) LengthSum) >> 1;
- Ymean = (Ysum / (inT32) LengthSum) >> 1;
-
- Results->Length = LengthSum;
- Results->Xmean = Xmean;
- Results->Ymean = Ymean;
-
- // Extract Baseline normalized features,
- // and find 2nd moments (Ix, Iy) & radius of gyration (Rx, Ry).
- //
- // Ix = Sum y^2 dA, where:
- // Ix: the second moment of area about the axis x
- // dA = 1 for our standard-sized piece of outline
- // y: the perependicular distance to the x axis
- // Rx = sqrt(Ix / A)
- // Note: 1 <= Rx <= height of blob / 2
- // Ry = sqrt(Iy / A)
- // Note: 1 <= Ry <= width of blob / 2
- Ix = 0;
- Iy = 0;
- NumBLFeatures = 0;
- OutLine = Blob->outlines;
- int min_x = 0;
- int max_x = 0;
- while (OutLine != NULL) {
- LoopStart = OutLine->loop;
- Loop = LoopStart;
- LastX = Loop->pos.x - Xmean;
- LastY = Loop->pos.y;
- /* Check for bad loops */
- if ((Loop == NULL) || (Loop->next == NULL) || (Loop->next == LoopStart))
- return FALSE;
- do {
- Segment = Loop;
- Loop = Loop->next;
- NormX = Loop->pos.x - Xmean;
- NormY = Loop->pos.y;
- if (NormY < Results->YBottom)
- Results->YBottom = ClipToRange(NormY, 0, MAX_UINT8);
- if (NormY > Results->YTop)
- Results->YTop = ClipToRange(NormY, 0, MAX_UINT8);
- UpdateRange(NormX, &min_x, &max_x);
-
- n = 1;
- if (!Segment->IsHidden()) {
- DeltaX = NormX - LastX;
- DeltaY = NormY - LastY;
- Length = MySqrt(DeltaX, DeltaY);
- n = ((Length << 2) + Length + 
32) >> 6; - if (n != 0) { - Theta = BinaryAnglePlusPi(DeltaY, DeltaX); - dX = (DeltaX << 8) / n; - dY = (DeltaY << 8) / n; - pfX = (LastX << 8) + (dX >> 1); - pfY = (LastY << 8) + (dY >> 1); - Ix += ((pfY >> 8) - Ymean) * ((pfY >> 8) - Ymean); - // TODO(eger): Hmmm... Xmean is not necessarily 0. - // Figure out if we should center against Xmean for these - // features, and if so fix Iy & SaveFeature(). - Iy += (pfX >> 8) * (pfX >> 8); - if (SaveFeature(BLFeat, - NumBLFeatures, - (inT16) (pfX >> 8), - (inT16) ((pfY >> 8) - 128), - Theta) == FALSE) - return FALSE; - NumBLFeatures++; - for (i = 1; i < n; i++) { - pfX += dX; - pfY += dY; - Ix += ((pfY >> 8) - Ymean) * ((pfY >> 8) - Ymean); - Iy += (pfX >> 8) * (pfX >> 8); - if (SaveFeature(BLFeat, - NumBLFeatures, - (inT16) (pfX >> 8), - (inT16) ((pfY >> 8) - 128), - Theta) == FALSE) - return FALSE; - NumBLFeatures++; - } - } - } - if (n != 0) { /* Throw away a point that is too close */ - LastX = NormX; - LastY = NormY; - } - } - while (Loop != LoopStart); - OutLine = OutLine->next; - } - Results->Width = max_x - min_x; - if (Ix == 0) - Ix = 1; - if (Iy == 0) - Iy = 1; - RxInv = MySqrt2 (NumBLFeatures, Ix, &RxExp); - RyInv = MySqrt2 (NumBLFeatures, Iy, &RyExp); - ClipRadius(&RxInv, &RxExp, &RyInv, &RyExp); - - Results->Rx = (inT16) (51.2 / (double) RxInv * pow (2.0, (double) RxExp)); - Results->Ry = (inT16) (51.2 / (double) RyInv * pow (2.0, (double) RyExp)); - if (Results->Ry == 0) { - /* - This would result in features having 'nan' values. - Since the expression is always > 0, assign a value of 1. 
- */ - Results->Ry = 1; - } - if (Results->Rx == 0) { - Results->Rx = 1; - } - Results->NumBL = NumBLFeatures; - - // Extract character normalized features - // - // Rescale the co-ordinates to "equalize" distribution in X and Y, making - // all of the following unichars be sized to look similar: , ' 1 i - // - // We calculate co-ordinates relative to the centroid, and then scale them - // as follows (accomplishing a scale of up to 102.4 / dimension): - // y *= 51.2 / Rx [ y scaled by 0.0 ... 102.4 / height of glyph ] - // x *= 51.2 / Ry [ x scaled by 0.0 ... 102.4 / width of glyph ] - // Although tempting to think so, this does not guarantee that our range - // is within [-102.4...102.4] x [-102.4...102.4] because (Xmean, Ymean) - // is the centroid, not the center of the bounding box. Instead, we can - // only bound the result to [-204 ... 204] x [-204 ... 204] - // - NumCNFeatures = 0; - OutLine = Blob->outlines; - int OutLineIndex = -1; - while (OutLine != NULL) { - LoopStart = OutLine->loop; - Loop = LoopStart; - LastX = (Loop->pos.x - Xmean) * RyInv; - LastY = (Loop->pos.y - Ymean) * RxInv; - LastX >>= (inT8) RyExp; - LastY >>= (inT8) RxExp; - OutLineIndex++; - - /* Check for bad loops */ - if ((Loop == NULL) || (Loop->next == NULL) || (Loop->next == LoopStart)) - return FALSE; - do { - Segment = Loop; - Loop = Loop->next; - NormX = (Loop->pos.x - Xmean) * RyInv; - NormY = (Loop->pos.y - Ymean) * RxInv; - NormX >>= (inT8) RyExp; - NormY >>= (inT8) RxExp; - - n = 1; - if (!Segment->IsHidden()) { - DeltaX = NormX - LastX; - DeltaY = NormY - LastY; - Length = MySqrt(DeltaX, DeltaY); - n = ((Length << 2) + Length + 32) >> 6; - if (n != 0) { - Theta = BinaryAnglePlusPi(DeltaY, DeltaX); - dX = (DeltaX << 8) / n; - dY = (DeltaY << 8) / n; - pfX = (LastX << 8) + (dX >> 1); - pfY = (LastY << 8) + (dY >> 1); - if (SaveFeature(CNFeat, - NumCNFeatures, - (inT16) (pfX >> 8), - (inT16) (pfY >> 8), - Theta) == FALSE) - return FALSE; - if (FeatureOutlineArray) { - 
FeatureOutlineArray[NumCNFeatures] = OutLineIndex; - } - NumCNFeatures++; - for (i = 1; i < n; i++) { - pfX += dX; - pfY += dY; - if (SaveFeature(CNFeat, - NumCNFeatures, - (inT16) (pfX >> 8), - (inT16) (pfY >> 8), - Theta) == FALSE) - return FALSE; - if (FeatureOutlineArray) { - FeatureOutlineArray[NumCNFeatures] = OutLineIndex; - } - NumCNFeatures++; - } - } - } - if (n != 0) { /* Throw away a point that is too close */ - LastX = NormX; - LastY = NormY; - } - } - while (Loop != LoopStart); - OutLine = OutLine->next; - } - - Results->NumCN = NumCNFeatures; - return TRUE; -} - - -/*--------------------------------------------------------------------------*/ -// Return the "binary angle" [0..255] -// made by vector as measured counterclockwise from <-1, 0> -// The order of the arguments follows the convention of atan2(3) -uinT8 BinaryAnglePlusPi(inT32 Y, inT32 X) { - inT16 Angle, Atan; - uinT16 Ratio; - uinT32 AbsX, AbsY; - - assert ((X != 0) || (Y != 0)); - if (X < 0) - AbsX = -X; - else - AbsX = X; - if (Y < 0) - AbsY = -Y; - else - AbsY = Y; - if (AbsX > AbsY) - Ratio = AbsY * ATAN_TABLE_SIZE / AbsX; - else - Ratio = AbsX * ATAN_TABLE_SIZE / AbsY; - if (Ratio >= ATAN_TABLE_SIZE) - Ratio = ATAN_TABLE_SIZE - 1; - Atan = AtanTable[Ratio]; - if (X >= 0) - if (Y >= 0) - if (AbsX > AbsY) - Angle = Atan; - else - Angle = 64 - Atan; - else if (AbsX > AbsY) - Angle = 256 - Atan; - else - Angle = 192 + Atan; - else if (Y >= 0) - if (AbsX > AbsY) - Angle = 128 - Atan; - else - Angle = 64 + Atan; - else if (AbsX > AbsY) - Angle = 128 + Atan; - else - Angle = 192 - Atan; - - /* reverse angles to match old feature extractor: Angle += PI */ - Angle += 128; - Angle &= 255; - return (uinT8) Angle; -} - - -/*--------------------------------------------------------------------------*/ -int SaveFeature(INT_FEATURE_ARRAY FeatureArray, - uinT16 FeatureNum, - inT16 X, - inT16 Y, - uinT8 Theta) { - INT_FEATURE Feature; - - if (FeatureNum >= MAX_NUM_INT_FEATURES) - return FALSE; - - 
Feature = &(FeatureArray[FeatureNum]); - - X = X + 128; - Y = Y + 128; - - Feature->X = ClipToRange(X, 0, 255); - Feature->Y = ClipToRange(Y, 0, 255); - Feature->Theta = Theta; - Feature->CP_misses = 0; - - return TRUE; -} - - -/*---------------------------------------------------------------------------*/ -// Return floor(sqrt(min(emm, x)^2 + min(emm, y)^2)) -// where emm = EvidenceMultMask. -uinT16 MySqrt(inT32 X, inT32 Y) { - register uinT16 SqRoot; - register uinT32 Square; - register uinT16 BitLocation; - register uinT32 Sum; - const uinT32 EvidenceMultMask = - ((1 << IntegerMatcher::kIntEvidenceTruncBits) - 1); - - if (X < 0) - X = -X; - if (Y < 0) - Y = -Y; - - if (X > EvidenceMultMask) - X = EvidenceMultMask; - if (Y > EvidenceMultMask) - Y = EvidenceMultMask; - - Sum = X * X + Y * Y; - - BitLocation = (EvidenceMultMask + 1) << 1; - SqRoot = 0; - do { - Square = (SqRoot | BitLocation) * (SqRoot | BitLocation); - if (Square <= Sum) - SqRoot |= BitLocation; - BitLocation >>= 1; - } - while (BitLocation); - - return SqRoot; -} - - -/*--------------------------------------------------------------------------*/ -// Return two integers which can be used to express the sqrt(I/N): -// sqrt(I/N) = 51.2 * 2^(*Exp) / retval -uinT8 MySqrt2(uinT16 N, uinT32 I, uinT8 *Exp) { - register inT8 k; - register uinT32 N2; - register uinT8 SqRoot; - register uinT16 Square; - register uinT8 BitLocation; - register uinT16 Ratio; - - N2 = N * 41943; - - k = 9; - while ((N2 & 0xc0000000) == 0) { - N2 <<= 2; - k += 1; - } - - while ((I & 0xc0000000) == 0) { - I <<= 2; - k -= 1; - } - - if (((N2 & 0x80000000) == 0) && ((I & 0x80000000) == 0)) { - N2 <<= 1; - I <<= 1; - } - - N2 &= 0xffff0000; - I >>= 14; - Ratio = N2 / I; - - BitLocation = 128; - SqRoot = 0; - do { - Square = (SqRoot | BitLocation) * (SqRoot | BitLocation); - if (Square <= Ratio) - SqRoot |= BitLocation; - BitLocation >>= 1; - } - while (BitLocation); - - if (k < 0) { - *Exp = 0; - return 255; - } - else { - *Exp = k; 
- return SqRoot; - } -} - - -/*-------------------------------------------------------------------------*/ -void ClipRadius(uinT8 *RxInv, uinT8 *RxExp, uinT8 *RyInv, uinT8 *RyExp) { - register uinT8 AM, BM, AE, BE; - register uinT8 BitN, LastCarry; - int RxInvLarge, RyInvSmall; - - AM = classify_radius_gyr_min_man; - AE = classify_radius_gyr_min_exp; - BM = *RxInv; - BE = *RxExp; - LastCarry = 1; - while ((AM != 0) || (BM != 0)) { - if (AE > BE) { - BitN = LastCarry + (AM & 1) + 1; - AM >>= 1; - AE--; - } - else if (AE < BE) { - BitN = LastCarry + (!(BM & 1)); - BM >>= 1; - BE--; - } - else { /* AE == BE */ - BitN = LastCarry + (AM & 1) + (!(BM & 1)); - AM >>= 1; - BM >>= 1; - AE--; - BE--; - } - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - } - BitN = LastCarry + 1; - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - - if (BitN == 1) { - *RxInv = classify_radius_gyr_min_man; - *RxExp = classify_radius_gyr_min_exp; - } - - AM = classify_radius_gyr_min_man; - AE = classify_radius_gyr_min_exp; - BM = *RyInv; - BE = *RyExp; - LastCarry = 1; - while ((AM != 0) || (BM != 0)) { - if (AE > BE) { - BitN = LastCarry + (AM & 1) + 1; - AM >>= 1; - AE--; - } - else if (AE < BE) { - BitN = LastCarry + (!(BM & 1)); - BM >>= 1; - BE--; - } - else { /* AE == BE */ - BitN = LastCarry + (AM & 1) + (!(BM & 1)); - AM >>= 1; - BM >>= 1; - AE--; - BE--; - } - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - } - BitN = LastCarry + 1; - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - - if (BitN == 1) { - *RyInv = classify_radius_gyr_min_man; - *RyExp = classify_radius_gyr_min_exp; - } - - AM = classify_radius_gyr_max_man; - AE = classify_radius_gyr_max_exp; - BM = *RxInv; - BE = *RxExp; - LastCarry = 1; - while ((AM != 0) || (BM != 0)) { - if (AE > BE) { - BitN = LastCarry + (AM & 1) + 1; - AM >>= 1; - AE--; - } - else if (AE < BE) { - BitN = LastCarry + (!(BM & 1)); - BM >>= 1; - BE--; - } - else { /* AE == BE */ - BitN = LastCarry + (AM & 1) + (!(BM & 1)); - AM >>= 1; - BM >>= 1; - 
AE--; - BE--; - } - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - } - BitN = LastCarry + 1; - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - - if (BitN == 1) - RxInvLarge = 1; - else - RxInvLarge = 0; - - AM = *RyInv; - AE = *RyExp; - BM = classify_radius_gyr_max_man; - BE = classify_radius_gyr_max_exp; - LastCarry = 1; - while ((AM != 0) || (BM != 0)) { - if (AE > BE) { - BitN = LastCarry + (AM & 1) + 1; - AM >>= 1; - AE--; - } - else if (AE < BE) { - BitN = LastCarry + (!(BM & 1)); - BM >>= 1; - BE--; - } - else { /* AE == BE */ - BitN = LastCarry + (AM & 1) + (!(BM & 1)); - AM >>= 1; - BM >>= 1; - AE--; - BE--; - } - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - } - BitN = LastCarry + 1; - LastCarry = (BitN & 2) > 1; - BitN = BitN & 1; - - if (BitN == 1) - RyInvSmall = 1; - else - RyInvSmall = 0; - - if (RxInvLarge && RyInvSmall) { - *RyInv = classify_radius_gyr_max_man; - *RyExp = classify_radius_gyr_max_exp; - } - + memcpy(baseline_features, &bl_features[0], + bl_features.size() * sizeof(bl_features[0])); + memcpy(charnorm_features, &cn_features[0], + cn_features.size() * sizeof(cn_features[0])); + return true; } diff --git a/classify/intfx.h b/classify/intfx.h index 30150947d..11a68377c 100644 --- a/classify/intfx.h +++ b/classify/intfx.h @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: intfx.h - ** Purpose: Interface to high level integer feature extractor. - ** Author: Robert Moss - ** History: Tue May 21 15:51:57 MDT 1991, RWM, Created. + ** Filename: intfx.h + ** Purpose: Interface to high level integer feature extractor. + ** Author: Robert Moss + ** History: Tue May 21 15:51:57 MDT 1991, RWM, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. 
** You may obtain a copy of the License at @@ -42,6 +42,9 @@ struct INT_FX_RESULT_STRUCT { uinT8 YTop; // Top of blob in BLN coords. }; +// The standard feature length +const double kStandardFeatureLength = 64.0 / 5; + /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ @@ -51,28 +54,22 @@ void InitIntegerFX(); // theta direction in an INT_FEATURE_STRUCT. FCOORD FeatureDirection(uinT8 theta); -tesseract::TrainingSample* GetIntFeatures( - tesseract::NormalizationMode mode, TBLOB *blob, - const DENORM& denorm); +namespace tesseract { + // Generates a TrainingSample from a TBLOB. Extracts features and sets + // the bounding box, so classifiers that operate on the image can work. + // TODO(rays) BlobToTrainingSample must remain a global function until + // the FlexFx and FeatureDescription code can be removed and LearnBlob + // made a member of Classify. + TrainingSample* BlobToTrainingSample(const TBLOB& blob, + tesseract::NormalizationMode mode, + bool nonlinear_norm); +} -int ExtractIntFeat(TBLOB *Blob, - const DENORM& denorm, - INT_FEATURE_ARRAY BLFeat, - INT_FEATURE_ARRAY CNFeat, - INT_FX_RESULT_STRUCT* Results, - inT32 *FeatureOutlineArray = 0); +// Deprecated! Prefer tesseract::Classify::ExtractFeatures instead. 
+bool ExtractIntFeat(const TBLOB& blob,
+ bool nonlinear_norm,
+ INT_FEATURE_ARRAY BLFeat,
+ INT_FEATURE_ARRAY CNFeat,
+ INT_FX_RESULT_STRUCT* Results);
-uinT8 BinaryAnglePlusPi(inT32 Y, inT32 X);
-
-int SaveFeature(INT_FEATURE_ARRAY FeatureArray,
- uinT16 FeatureNum,
- inT16 X,
- inT16 Y,
- uinT8 Theta);
-
-uinT16 MySqrt(inT32 X, inT32 Y);
-
-uinT8 MySqrt2(uinT16 N, uinT32 I, uinT8 *Exp);
-
-void ClipRadius(uinT8 *RxInv, uinT8 *RxExp, uinT8 *RyInv, uinT8 *RyExp);
 #endif
diff --git a/classify/intmatcher.h b/classify/intmatcher.h
index 50dbd9796..5598d273a 100644
--- a/classify/intmatcher.h
+++ b/classify/intmatcher.h
@@ -28,7 +28,7 @@ extern BOOL_VAR_H(disable_character_fragments, FALSE,
 "Do not include character fragments in the"
 " results of the classifier");
-extern INT_VAR_H(classify_integer_matcher_multiplier, 14,
+extern INT_VAR_H(classify_integer_matcher_multiplier, 10,
 "Integer Matcher Multiplier 0-255: ");
diff --git a/classify/intproto.cpp b/classify/intproto.cpp
index 800e673d3..e5621b1d4 100644
--- a/classify/intproto.cpp
+++ b/classify/intproto.cpp
@@ -37,6 +37,7 @@
 #include "mfoutline.h"
 #include "ndminx.h"
 #include "picofeat.h"
+#include "points.h"
 #include "shapetable.h"
 #include "svmnode.h"
@@ -206,6 +207,22 @@ double_VAR(classify_pp_side_pad, 2.5, "Proto Pruner Side Pad");
 /*-----------------------------------------------------------------------------
 Public Code
-----------------------------------------------------------------------------*/
+// Builds a feature from an FCOORD for position with all the necessary
+// clipping and rounding.
+INT_FEATURE_STRUCT::INT_FEATURE_STRUCT(const FCOORD& pos, uinT8 theta)
+ : X(ClipToRange(static_cast<int>(pos.x() + 0.5), 0, 255)),
+ Y(ClipToRange(static_cast<int>(pos.y() + 0.5), 0, 255)),
+ Theta(theta),
+ CP_misses(0) {
+}
+// Builds a feature from ints with all the necessary clipping and casting.
+INT_FEATURE_STRUCT::INT_FEATURE_STRUCT(int x, int y, int theta)
+ : X(static_cast<uinT8>(ClipToRange(x, 0, MAX_UINT8))),
+ Y(static_cast<uinT8>(ClipToRange(y, 0, MAX_UINT8))),
+ Theta(static_cast<uinT8>(ClipToRange(theta, 0, MAX_UINT8))),
+ CP_misses(0) {
+}
+
 /*---------------------------------------------------------------------------*/
 /**
 * This routine adds a new class structure to a set of
diff --git a/classify/intproto.h b/classify/intproto.h
index 718689d67..302048757 100644
--- a/classify/intproto.h
+++ b/classify/intproto.h
@@ -28,6 +28,8 @@
 #include "scrollview.h"
 #include "unicharset.h"
+class FCOORD;
+
 /* define order of params in pruners */
 #define PRUNER_X 0
 #define PRUNER_Y 1
@@ -130,8 +132,14 @@ INT_TEMPLATES_STRUCT, *INT_TEMPLATES;
 #define MAX_NUM_INT_FEATURES 512
 #define INT_CHAR_NORM_RANGE 256
-struct INT_FEATURE_STRUCT
-{
+struct INT_FEATURE_STRUCT {
+ INT_FEATURE_STRUCT() : X(0), Y(0), Theta(0), CP_misses(0) { }
+ // Builds a feature from an FCOORD for position with all the necessary
+ // clipping and rounding.
+ INT_FEATURE_STRUCT(const FCOORD& pos, uinT8 theta);
+ // Builds a feature from ints with all the necessary clipping and casting.
+ INT_FEATURE_STRUCT(int x, int y, int theta); + uinT8 X; uinT8 Y; uinT8 Theta; diff --git a/classify/mastertrainer.cpp b/classify/mastertrainer.cpp index 3da91e6b1..1e69da14b 100644 --- a/classify/mastertrainer.cpp +++ b/classify/mastertrainer.cpp @@ -30,6 +30,7 @@ #include "allheaders.h" #include "boxread.h" #include "classify.h" +#include "efio.h" #include "errorcounter.h" #include "featdefs.h" #include "sampleiterator.h" @@ -58,10 +59,6 @@ MasterTrainer::MasterTrainer(NormalizationMode norm_mode, enable_shape_anaylsis_(shape_analysis), enable_replication_(replicate_samples), fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) { - fontinfo_table_.set_compare_callback( - NewPermanentTessCallback(CompareFontInfo)); - fontinfo_table_.set_clear_callback( - NewPermanentTessCallback(FontInfoDeleteCallback)); } MasterTrainer::~MasterTrainer() { @@ -82,10 +79,7 @@ bool MasterTrainer::Serialize(FILE* fp) const { if (!verify_samples_.Serialize(fp)) return false; if (!master_shapes_.Serialize(fp)) return false; if (!flat_shapes_.Serialize(fp)) return false; - if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_info))) - return false; - if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_spacing_info))) - return false; + if (!fontinfo_table_.Serialize(fp)) return false; if (!xheights_.Serialize(fp)) return false; return true; } @@ -106,11 +100,7 @@ bool MasterTrainer::DeSerialize(bool swap, FILE* fp) { if (!verify_samples_.DeSerialize(swap, fp)) return false; if (!master_shapes_.DeSerialize(swap, fp)) return false; if (!flat_shapes_.DeSerialize(swap, fp)) return false; - if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap)) - return false; - if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_spacing_info), - swap)) - return false; + if (!fontinfo_table_.DeSerialize(swap, fp)) return false; if (!xheights_.DeSerialize(swap, fp)) return false; return true; } @@ -122,8 +112,10 @@ void 
MasterTrainer::LoadUnicharset(const char* filename) { "Building unicharset for training from scratch...\n", filename); unicharset_.clear(); - // Space character needed to represent NIL_LIST classification. - unicharset_.unichar_insert(" "); + UNICHARSET initialized; + // Add special characters, as they were removed by the clear, but the + // default constructor puts them in. + unicharset_.AppendOtherUnicharset(initialized); } charsetsize_ = unicharset_.size(); delete [] fragments_; @@ -138,7 +130,7 @@ void MasterTrainer::LoadUnicharset(const char* filename) { // adding them to the trainer with the font_id from the content of the file. // See mftraining.cpp for a description of the file format. // If verification, then these are verification samples, not training. -void MasterTrainer::ReadTrainingSamples(FILE *fp, +void MasterTrainer::ReadTrainingSamples(const char* page_name, const FEATURE_DEFS_STRUCT& feature_defs, bool verification) { char buffer[2048]; @@ -148,6 +140,12 @@ void MasterTrainer::ReadTrainingSamples(FILE *fp, int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType); int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType); + FILE* fp = Efopen(page_name, "rb"); + if (fp == NULL) { + tprintf("Failed to open tr file: %s\n", page_name); + return; + } + tr_filenames_.push_back(STRING(page_name)); while (fgets(buffer, sizeof(buffer), fp) != NULL) { if (buffer[0] == '\n') continue; @@ -159,6 +157,7 @@ void MasterTrainer::ReadTrainingSamples(FILE *fp, } *space++ = '\0'; int font_id = GetFontInfoId(buffer); + if (font_id < 0) font_id = 0; int page_number; STRING unichar; TBOX bounding_box; @@ -177,6 +176,7 @@ void MasterTrainer::ReadTrainingSamples(FILE *fp, FreeCharDescription(char_desc); } charsetsize_ = unicharset_.size(); + fclose(fp); } // Adds the given single sample to the trainer, setting the classid @@ -278,23 +278,23 @@ void MasterTrainer::SetupMasterShapes() { const CHAR_FRAGMENT *fragment = 
samples_.unicharset().get_fragment(c); if (fragment == NULL) - char_shapes.AppendMasterShapes(shapes); + char_shapes.AppendMasterShapes(shapes, NULL); else if (fragment->is_beginning()) - char_shapes_begin_fragment.AppendMasterShapes(shapes); + char_shapes_begin_fragment.AppendMasterShapes(shapes, NULL); else if (fragment->is_ending()) - char_shapes_end_fragment.AppendMasterShapes(shapes); + char_shapes_end_fragment.AppendMasterShapes(shapes, NULL); else - char_shapes.AppendMasterShapes(shapes); + char_shapes.AppendMasterShapes(shapes, NULL); } ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance, &char_shapes_begin_fragment); - char_shapes.AppendMasterShapes(char_shapes_begin_fragment); + char_shapes.AppendMasterShapes(char_shapes_begin_fragment, NULL); ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance, &char_shapes_end_fragment); - char_shapes.AppendMasterShapes(char_shapes_end_fragment); + char_shapes.AppendMasterShapes(char_shapes_end_fragment, NULL); ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance, &char_shapes); - master_shapes_.AppendMasterShapes(char_shapes); + master_shapes_.AppendMasterShapes(char_shapes, NULL); tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string()); } @@ -401,7 +401,7 @@ bool MasterTrainer::LoadXHeights(const char* filename) { continue; fontinfo.name = buffer; if (!fontinfo_table_.contains(fontinfo)) continue; - int fontinfo_id = fontinfo_table_.get_id(fontinfo); + int fontinfo_id = fontinfo_table_.get_index(fontinfo); xheights_[fontinfo_id] = xht; total_xheight += xht; ++xheight_count; @@ -439,7 +439,7 @@ bool MasterTrainer::AddSpacingInfo(const char *filename) { char kerned_uch[UNICHAR_LEN]; int x_gap, x_gap_before, x_gap_after, num_kerned; ASSERT_HOST(fscanf(fontinfo_file, "%d\n", &num_unichars) == 1); - FontInfo *fi = fontinfo_table_.get_mutable(fontinfo_id); + FontInfo *fi = &fontinfo_table_.get(fontinfo_id); 
fi->init_spacing(unicharset_.size());
 FontSpacingInfo *spacing = NULL;
 for (int l = 0; l < num_unichars; ++l) {
@@ -480,11 +480,7 @@ int MasterTrainer::GetFontInfoId(const char* font_name) {
 fontinfo.name = const_cast<char*>(font_name);
 fontinfo.properties = 0; // Not used to lookup in the table
 fontinfo.universal_id = 0;
- if (!fontinfo_table_.contains(fontinfo)) {
- return -1;
- } else {
- return fontinfo_table_.get_id(fontinfo);
- }
+ return fontinfo_table_.get_index(fontinfo);
 }
 // Returns the font_id of the closest matching font name to the given
 // filename. It is assumed that a substring of the filename will match
 // one of the fonts. If more than one is matched, the longest is returned.
@@ -585,7 +581,7 @@ void MasterTrainer::WriteInttempAndPFFMTable(const UNICHARSET& unicharset,
 const char* pffmtable_file) {
 tesseract::Classify *classify = new tesseract::Classify();
 // Move the fontinfo table to classify.
- classify->get_fontinfo_table().move(&fontinfo_table_);
+ fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
 INT_TEMPLATES int_templates =
 classify->CreateIntTemplates(float_classes, shape_set);
 FILE* fp = fopen(inttemp_file, "wb");
@@ -750,17 +746,29 @@ void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
 }
 #endif // GRAPHICS_DISABLED
+void MasterTrainer::TestClassifierVOld(bool replicate_samples,
+ ShapeClassifier* test_classifier,
+ ShapeClassifier* old_classifier) {
+ SampleIterator sample_it;
+ sample_it.Init(NULL, NULL, replicate_samples, &samples_);
+ ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
+ CT_UNICHAR_TOPN_ERR, fontinfo_table_,
+ page_images_, &sample_it);
+}
+
 // Tests the given test_classifier on the internal samples.
 // See TestClassifier for details.
-void MasterTrainer::TestClassifierOnSamples(int report_level, +void MasterTrainer::TestClassifierOnSamples(CountTypes error_mode, + int report_level, bool replicate_samples, ShapeClassifier* test_classifier, STRING* report_string) { - TestClassifier(report_level, replicate_samples, &samples_, + TestClassifier(error_mode, report_level, replicate_samples, &samples_, test_classifier, report_string); } -// Tests the given test_classifier on the given samples +// Tests the given test_classifier on the given samples. +// error_mode indicates what counts as an error. // report_levels: // 0 = no output. // 1 = bottom-line error rate. @@ -772,14 +780,14 @@ void MasterTrainer::TestClassifierOnSamples(int report_level, // sample including replicated and systematically perturbed samples. // If report_string is non-NULL, a summary of the results for each font // is appended to the report_string. -double MasterTrainer::TestClassifier(int report_level, +double MasterTrainer::TestClassifier(CountTypes error_mode, + int report_level, bool replicate_samples, TrainingSampleSet* samples, ShapeClassifier* test_classifier, STRING* report_string) { SampleIterator sample_it; - sample_it.Init(NULL, test_classifier->GetShapeTable(), replicate_samples, - samples); + sample_it.Init(NULL, NULL, replicate_samples, samples); if (report_level > 0) { int num_samples = 0; for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next()) @@ -791,7 +799,7 @@ double MasterTrainer::TestClassifier(int report_level, } double unichar_error = 0.0; ErrorCounter::ComputeErrorRate(test_classifier, report_level, - CT_SHAPE_TOP_ERR, fontinfo_table_, + error_mode, fontinfo_table_, page_images_, &sample_it, &unichar_error, NULL, report_string); return unichar_error; diff --git a/classify/mastertrainer.h b/classify/mastertrainer.h index 633c39750..0cc2ea11c 100644 --- a/classify/mastertrainer.h +++ b/classify/mastertrainer.h @@ -29,6 +29,7 @@ #include "cluster.h" #include "intfx.h" #include "elst.h" +#include 
"errorcounter.h" #include "featdefs.h" #include "fontinfo.h" #include "indexmapbidi.h" @@ -89,7 +90,7 @@ class MasterTrainer { // Reads the samples and their features from the given file, // adding them to the trainer with the font_id from the content of the file. // If verification, then these are verification samples, not training. - void ReadTrainingSamples(FILE *fp, + void ReadTrainingSamples(const char* page_name, const FEATURE_DEFS_STRUCT& feature_defs, bool verification); @@ -159,6 +160,12 @@ class MasterTrainer { // one of the fonts. If more than one is matched, the longest is returned. int GetBestMatchingFontInfoId(const char* filename); + // Returns the filename of the tr file corresponding to the command-line + // argument with the given index. + const STRING& GetTRFileName(int index) const { + return tr_filenames_[index]; + } + // Sets up a flat shapetable with one shape per class/font combination. void SetupFlatShapeTable(ShapeTable* shape_table); @@ -207,13 +214,19 @@ class MasterTrainer { const char* unichar_str2, int canonical_font); #endif // GRAPHICS_DISABLED + void TestClassifierVOld(bool replicate_samples, + ShapeClassifier* test_classifier, + ShapeClassifier* old_classifier); + // Tests the given test_classifier on the internal samples. // See TestClassifier for details. - void TestClassifierOnSamples(int report_level, + void TestClassifierOnSamples(CountTypes error_mode, + int report_level, bool replicate_samples, ShapeClassifier* test_classifier, STRING* report_string); // Tests the given test_classifier on the given samples + // error_mode indicates what counts as an error. // report_levels: // 0 = no output. // 1 = bottom-line error rate. @@ -225,7 +238,8 @@ class MasterTrainer { // sample including replicated and systematically perturbed samples. // If report_string is non-NULL, a summary of the results for each font // is appended to the report_string. 
- double TestClassifier(int report_level, + double TestClassifier(CountTypes error_mode, + int report_level, bool replicate_samples, TrainingSampleSet* samples, ShapeClassifier* test_classifier, @@ -263,9 +277,9 @@ class MasterTrainer { // Flat shape table has each unichar/font id pair in a separate shape. ShapeTable flat_shapes_; // Font metrics gathered from multiple files. - UnicityTable fontinfo_table_; + FontInfoTable fontinfo_table_; // Array of xheights indexed by font ids in fontinfo_table_; - GenericVector xheights_; + GenericVector xheights_; // Non-serialized data initialized by other means or used temporarily // during loading of training samples. @@ -291,6 +305,8 @@ class MasterTrainer { // Indexed by page_num_ in the samples. // These images are owned by the trainer and need to be pixDestroyed. GenericVector page_images_; + // Vector of filenames of loaded tr files. + GenericVector tr_filenames_; }; } // namespace tesseract. diff --git a/classify/mf.cpp b/classify/mf.cpp index 714f04083..ad1ba285f 100644 --- a/classify/mf.cpp +++ b/classify/mf.cpp @@ -33,7 +33,9 @@ Private Code ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ -FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& denorm) { +FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info) { /* ** Parameters: ** Blob blob to extract micro-features from @@ -52,7 +54,8 @@ FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& denorm) { FEATURE Feature; MICROFEATURE OldFeature; - OldFeatures = (MICROFEATURES)BlobMicroFeatures(Blob, denorm); + OldFeatures = (MICROFEATURES)BlobMicroFeatures(Blob, bl_denorm, cn_denorm, + fx_info); if (OldFeatures == NULL) return NULL; NumFeatures = count (OldFeatures); diff --git a/classify/mf.h b/classify/mf.h index 0f5e3f64b..716f5b8c0 100644 --- a/classify/mf.h +++ b/classify/mf.h @@ 
-34,6 +34,8 @@ typedef float MicroFeature[MFCount]; /*---------------------------------------------------------------------------- Private Function Prototypes -----------------------------------------------------------------------------*/ -FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& denorm); +FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info); #endif diff --git a/classify/mfoutline.cpp b/classify/mfoutline.cpp index 5903f5d3a..f39ec0ec5 100644 --- a/classify/mfoutline.cpp +++ b/classify/mfoutline.cpp @@ -103,56 +103,6 @@ LIST ConvertOutlines(TESSLINE *outline, return mf_outlines; } - -/*---------------------------------------------------------------------------*/ -void ComputeOutlineStats(LIST Outlines, OUTLINE_STATS *OutlineStats) { -/* - ** Parameters: - ** Outlines list of outlines to compute stats for - ** OutlineStats place to put results - ** Globals: none - ** Operation: This routine computes several statistics about the outlines - ** in Outlines. These statistics are usually used to perform - ** anistropic normalization of all of the outlines. The - ** statistics generated are: - ** first moments about x and y axes - ** total length of all outlines - ** center of mass of all outlines - ** second moments about center of mass axes - ** radius of gyration about center of mass axes - ** Return: none (results are returned in OutlineStats) - ** Exceptions: none - ** History: Fri Dec 14 08:32:03 1990, DSJ, Created. 
- */ - MFOUTLINE Outline; - MFOUTLINE EdgePoint; - MFEDGEPT *Current; - MFEDGEPT *Last; - - InitOutlineStats(OutlineStats); - iterate(Outlines) { - Outline = (MFOUTLINE) first_node (Outlines); - - Last = PointAt (Outline); - Outline = NextPointAfter (Outline); - EdgePoint = Outline; - do { - Current = PointAt (EdgePoint); - - UpdateOutlineStats (OutlineStats, - Last->Point.x, Last->Point.y, - Current->Point.x, Current->Point.y); - - Last = Current; - EdgePoint = NextPointAfter (EdgePoint); - } - while (EdgePoint != Outline); - } - FinishOutlineStats(OutlineStats); - -} /* ComputeOutlineStats */ - - /*---------------------------------------------------------------------------*/ void FindDirectionChanges(MFOUTLINE Outline, FLOAT32 MinSlope, @@ -334,7 +284,8 @@ void NormalizeOutline(MFOUTLINE Outline, MFOUTLINE EdgePoint = Outline; do { MFEDGEPT *Current = PointAt(EdgePoint); - Current->Point.y = MF_SCALE_FACTOR * (Current->Point.y - BASELINE_OFFSET); + Current->Point.y = MF_SCALE_FACTOR * + (Current->Point.y - kBlnBaselineOffset); Current->Point.x = MF_SCALE_FACTOR * (Current->Point.x - XOrigin); EdgePoint = NextPointAfter(EdgePoint); } while (EdgePoint != Outline); @@ -365,34 +316,10 @@ void Classify::NormalizeOutlines(LIST Outlines, ** History: Fri Dec 14 08:14:55 1990, DSJ, Created. 
*/ MFOUTLINE Outline; - OUTLINE_STATS OutlineStats; - FLOAT32 BaselineScale; switch (classify_norm_method) { case character: - ComputeOutlineStats(Outlines, &OutlineStats); - - /* limit scale factor to avoid overscaling small blobs (.,`'), - thin blobs (l1ift), and merged blobs */ - *XScale = *YScale = BaselineScale = MF_SCALE_FACTOR; - *XScale *= OutlineStats.Ry; - *YScale *= OutlineStats.Rx; - if (*XScale < classify_min_norm_scale_x) - *XScale = classify_min_norm_scale_x; - if (*YScale < classify_min_norm_scale_y) - *YScale = classify_min_norm_scale_y; - if (*XScale > classify_max_norm_scale_x && - *YScale <= classify_max_norm_scale_y) - *XScale = classify_max_norm_scale_x; - *XScale = classify_char_norm_range * BaselineScale / *XScale; - *YScale = classify_char_norm_range * BaselineScale / *YScale; - - iterate(Outlines) { - Outline = (MFOUTLINE) first_node (Outlines); - CharNormalizeOutline (Outline, - OutlineStats.x, OutlineStats.y, - *XScale, *YScale); - } + ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?"); break; case baseline: @@ -436,11 +363,7 @@ void ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction) { /*---------------------------------------------------------------------------*/ -void CharNormalizeOutline(MFOUTLINE Outline, - FLOAT32 XCenter, - FLOAT32 YCenter, - FLOAT32 XScale, - FLOAT32 YScale) { +void CharNormalizeOutline(MFOUTLINE Outline, const DENORM& cn_denorm) { /* ** Parameters: ** Outline outline to be character normalized @@ -463,13 +386,13 @@ void CharNormalizeOutline(MFOUTLINE Outline, First = Outline; Current = First; do { - CurrentPoint = PointAt (Current); - CurrentPoint->Point.x = - (CurrentPoint->Point.x - XCenter) * XScale; - CurrentPoint->Point.y = - (CurrentPoint->Point.y - YCenter) * YScale; + CurrentPoint = PointAt(Current); + FCOORD pos(CurrentPoint->Point.x, CurrentPoint->Point.y); + cn_denorm.LocalNormTransform(pos, &pos); + CurrentPoint->Point.x = (pos.x() - MAX_UINT8 / 2) * 
MF_SCALE_FACTOR; + CurrentPoint->Point.y = (pos.y() - MAX_UINT8 / 2) * MF_SCALE_FACTOR; - Current = NextPointAfter (Current); + Current = NextPointAfter(Current); } while (Current != First); diff --git a/classify/mfoutline.h b/classify/mfoutline.h index 71dd310b8..bdf20f372 100644 --- a/classify/mfoutline.h +++ b/classify/mfoutline.h @@ -21,10 +21,10 @@ /**---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------**/ +#include "blobs.h" #include "host.h" #include "oldlist.h" #include "fpoint.h" -#include "baseline.h" #include "params.h" #define NORMAL_X_HEIGHT (0.5) @@ -68,7 +68,7 @@ typedef enum { #define AverageOf(A,B) (((A) + (B)) / 2) /* macro for computing the scale factor to use to normalize characters */ -#define MF_SCALE_FACTOR (NORMAL_X_HEIGHT / BASELINE_SCALE) +#define MF_SCALE_FACTOR (NORMAL_X_HEIGHT / kBlnXHeight) /* macros for manipulating micro-feature outlines */ #define DegenerateOutline(O) (((O) == NIL_LIST) || ((O) == list_rest(O))) @@ -93,8 +93,6 @@ LIST ConvertOutlines(TESSLINE *Outline, LIST ConvertedOutlines, OUTLINETYPE OutlineType); -void ComputeOutlineStats(LIST Outlines, OUTLINE_STATS *OutlineStats); - void FilterEdgeNoise(MFOUTLINE Outline, FLOAT32 NoiseSegmentLength); void FindDirectionChanges(MFOUTLINE Outline, @@ -119,11 +117,10 @@ void NormalizeOutline(MFOUTLINE Outline, -----------------------------------------------------------------------------*/ void ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction); -void CharNormalizeOutline(MFOUTLINE Outline, - FLOAT32 XCenter, - FLOAT32 YCenter, - FLOAT32 XScale, - FLOAT32 YScale); +// Normalizes the Outline in-place using cn_denorm's local transformation, +// then converts from the integer feature range [0,255] to the clusterer +// feature range of [-0.5, 0.5]. 
+void CharNormalizeOutline(MFOUTLINE Outline, const DENORM& cn_denorm); void ComputeDirection(MFEDGEPT *Start, MFEDGEPT *Finish, diff --git a/classify/mfx.cpp b/classify/mfx.cpp index a053a051f..9f3e3d242 100644 --- a/classify/mfx.cpp +++ b/classify/mfx.cpp @@ -59,7 +59,9 @@ MICROFEATURE ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End); ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ -CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm) { +CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info) { /* ** Parameters: ** Blob blob to extract micro-features from @@ -74,35 +76,25 @@ CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm) { ** History: 7/21/89, DSJ, Created. */ MICROFEATURES MicroFeatures = NIL_LIST; - FLOAT32 XScale, YScale; LIST Outlines; LIST RemainingOutlines; MFOUTLINE Outline; - INT_FEATURE_ARRAY blfeatures; - INT_FEATURE_ARRAY cnfeatures; - INT_FX_RESULT_STRUCT results; if (Blob != NULL) { - Outlines = ConvertBlob (Blob); - if (!ExtractIntFeat(Blob, denorm, blfeatures, cnfeatures, &results)) - return NULL; - XScale = 0.2f / results.Ry; - YScale = 0.2f / results.Rx; + Outlines = ConvertBlob(Blob); RemainingOutlines = Outlines; iterate(RemainingOutlines) { Outline = (MFOUTLINE) first_node (RemainingOutlines); - CharNormalizeOutline (Outline, - results.Xmean, results.Ymean, - XScale, YScale); + CharNormalizeOutline(Outline, cn_denorm); } RemainingOutlines = Outlines; iterate(RemainingOutlines) { - Outline = (MFOUTLINE) first_node (RemainingOutlines); + Outline = (MFOUTLINE) first_node(RemainingOutlines); FindDirectionChanges(Outline, classify_min_slope, classify_max_slope); MarkDirectionChanges(Outline); - MicroFeatures = ConvertToMicroFeatures (Outline, MicroFeatures); + MicroFeatures = ConvertToMicroFeatures(Outline, MicroFeatures); 
} FreeOutlines(Outlines); } diff --git a/classify/mfx.h b/classify/mfx.h index bd3139967..7e7fe1cfb 100644 --- a/classify/mfx.h +++ b/classify/mfx.h @@ -35,6 +35,8 @@ extern double_VAR_H(classify_max_slope, 2.414213562, /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ -CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& denorm); +CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info); #endif diff --git a/classify/normfeat.cpp b/classify/normfeat.cpp index a377a6b38..3f8013aa3 100644 --- a/classify/normfeat.cpp +++ b/classify/normfeat.cpp @@ -59,22 +59,18 @@ FLOAT32 ActualOutlineLength(FEATURE Feature) { // the x center of the grapheme's bounding box. // English: [0.011, 0.31] // -FEATURE_SET ExtractCharNormFeatures(TBLOB *blob, const DENORM& denorm) { +FEATURE_SET ExtractCharNormFeatures(TBLOB *blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info) { FEATURE_SET feature_set = NewFeatureSet(1); FEATURE feature = NewFeature(&CharNormDesc); - INT_FEATURE_ARRAY blfeatures; - INT_FEATURE_ARRAY cnfeatures; - INT_FX_RESULT_STRUCT FXInfo; - - ExtractIntFeat(blob, denorm, blfeatures, cnfeatures, &FXInfo); - feature->Params[CharNormY] = - MF_SCALE_FACTOR * (FXInfo.Ymean - BASELINE_OFFSET); + MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset); feature->Params[CharNormLength] = - MF_SCALE_FACTOR * FXInfo.Length / LENGTH_COMPRESSION; - feature->Params[CharNormRx] = MF_SCALE_FACTOR * FXInfo.Rx; - feature->Params[CharNormRy] = MF_SCALE_FACTOR * FXInfo.Ry; + MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION; + feature->Params[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx; + feature->Params[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry; AddFeature(feature_set, feature); diff --git a/classify/normfeat.h 
b/classify/normfeat.h index 54bf6ae57..59703a517 100644 --- a/classify/normfeat.h +++ b/classify/normfeat.h @@ -34,6 +34,8 @@ typedef enum { ----------------------------------------------------------------------------**/ FLOAT32 ActualOutlineLength(FEATURE Feature); -FEATURE_SET ExtractCharNormFeatures(TBLOB *Blob, const DENORM& denorm); +FEATURE_SET ExtractCharNormFeatures(TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info); #endif diff --git a/classify/normmatch.cpp b/classify/normmatch.cpp index 830181e85..9dfe5a76d 100644 --- a/classify/normmatch.cpp +++ b/classify/normmatch.cpp @@ -94,7 +94,7 @@ FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, PROTOTYPE *Proto; int ProtoId; - if(ClassId > NormProtos->NumProtos) { + if (ClassId > NormProtos->NumProtos) { ClassId = NO_CLASS; } diff --git a/classify/ocrfeatures.cpp b/classify/ocrfeatures.cpp index 3685c5c6b..7a791338e 100644 --- a/classify/ocrfeatures.cpp +++ b/classify/ocrfeatures.cpp @@ -230,7 +230,7 @@ void WriteFeature(FILE *File, FEATURE Feature) { int i; for (i = 0; i < Feature->Type->NumParams; i++) { -#ifndef _WIN32 +#ifndef WIN32 assert(!isnan(Feature->Params[i])); #endif fprintf(File, " %g", Feature->Params[i]); diff --git a/classify/ocrfeatures.h b/classify/ocrfeatures.h index 8ca9e5975..734b4ff07 100644 --- a/classify/ocrfeatures.h +++ b/classify/ocrfeatures.h @@ -26,6 +26,7 @@ #include class DENORM; +struct INT_FX_RESULT_STRUCT; #undef Min #undef Max @@ -78,7 +79,8 @@ typedef FEATURE_SET_STRUCT *FEATURE_SET; // classifier does not need to know the details of this data structure. 
typedef char *CHAR_FEATURES; -typedef FEATURE_SET (*FX_FUNC) (TBLOB *, const DENORM&); +typedef FEATURE_SET (*FX_FUNC)(TBLOB *, const DENORM&, const DENORM&, + const INT_FX_RESULT_STRUCT&); struct FEATURE_EXT_STRUCT { FX_FUNC Extractor; // func to extract features diff --git a/classify/picofeat.cpp b/classify/picofeat.cpp index 9f2a4ead3..ba19fb1ca 100644 --- a/classify/picofeat.cpp +++ b/classify/picofeat.cpp @@ -224,7 +224,9 @@ void NormalizePicoX(FEATURE_SET FeatureSet) { } /* NormalizePicoX */ /*---------------------------------------------------------------------------*/ -FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) { +FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info) { /* ** Parameters: ** blob blob to extract features from @@ -233,8 +235,8 @@ FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) { ** Exceptions: none ** History: 8/8/2011, rays, Created. */ - tesseract::TrainingSample* sample = GetIntFeatures( - tesseract::NM_CHAR_ANISOTROPIC, blob, denorm); + tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample( + *blob, tesseract::NM_CHAR_ANISOTROPIC, false); if (sample == NULL) return NULL; int num_features = sample->num_features(); @@ -254,7 +256,9 @@ FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& denorm) { } /* ExtractIntCNFeatures */ /*---------------------------------------------------------------------------*/ -FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& denorm) { +FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info) { /* ** Parameters: ** blob blob to extract features from @@ -263,8 +267,8 @@ FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& denorm) { ** Exceptions: none ** History: 8/8/2011, rays, Created. 
*/ - tesseract::TrainingSample* sample = GetIntFeatures( - tesseract::NM_CHAR_ANISOTROPIC, blob, denorm); + tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample( + *blob, tesseract::NM_CHAR_ANISOTROPIC, false); if (sample == NULL) return NULL; FEATURE_SET feature_set = NewFeatureSet(1); diff --git a/classify/picofeat.h b/classify/picofeat.h index db63d2119..ab37ba038 100644 --- a/classify/picofeat.h +++ b/classify/picofeat.h @@ -58,8 +58,12 @@ extern double_VAR_H(classify_pico_feature_length, 0.05, "Pico Feature Length"); ----------------------------------------------------------------------------**/ #define GetPicoFeatureLength() (PicoFeatureLength) -FEATURE_SET ExtractIntCNFeatures(TBLOB *Blob, const DENORM& denorm); -FEATURE_SET ExtractIntGeoFeatures(TBLOB *Blob, const DENORM& denorm); +FEATURE_SET ExtractIntCNFeatures(TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info); +FEATURE_SET ExtractIntGeoFeatures(TBLOB *Blob, const DENORM& bl_denorm, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info); /**---------------------------------------------------------------------------- Global Data Definitions and Declarations diff --git a/classify/shapeclassifier.cpp b/classify/shapeclassifier.cpp new file mode 100644 index 000000000..e357f66fc --- /dev/null +++ b/classify/shapeclassifier.cpp @@ -0,0 +1,230 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +/////////////////////////////////////////////////////////////////////// +// File: shapeclassifier.h +// Description: Base interface class for classifiers that return a +// shape index. +// Author: Ray Smith +// Created: Thu Dec 15 15:24:27 PST 2011 +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "shapeclassifier.h" +#include "genericvector.h" +#include "scrollview.h" +#include "shapetable.h" +#include "svmnode.h" +#include "trainingsample.h" +#include "tprintf.h" + +namespace tesseract { + +// Classifies the given [training] sample, writing to results. +// See shapeclassifier.h for a full description. +// Default implementation calls the ShapeRating version. +int ShapeClassifier::UnicharClassifySample( + const TrainingSample& sample, Pix* page_pix, int debug, + UNICHAR_ID keep_this, GenericVector* results) { + results->truncate(0); + GenericVector shape_results; + int num_shape_results = ClassifySample(sample, page_pix, debug, keep_this, + &shape_results); + const ShapeTable* shapes = GetShapeTable(); + GenericVector unichar_map; + unichar_map.init_to_size(shapes->unicharset().size(), -1); + for (int r = 0; r < num_shape_results; ++r) { + shapes->AddShapeToResults(shape_results[r], &unichar_map, results); + } + return results->size(); +} + +// Classifies the given [training] sample, writing to results. +// See shapeclassifier.h for a full description. +// Default implementation aborts. +int ShapeClassifier::ClassifySample(const TrainingSample& sample, Pix* page_pix, + int debug, int keep_this, + GenericVector* results) { + ASSERT_HOST("Must implement ClassifySample!" == NULL); + return 0; +} + +// Returns the shape that contains unichar_id that has the best result. +// If result is not NULL, it is set with the shape_id and rating. 
+// Does not need to be overridden if ClassifySample respects the keep_this +// rule. +int ShapeClassifier::BestShapeForUnichar(const TrainingSample& sample, + Pix* page_pix, UNICHAR_ID unichar_id, + ShapeRating* result) { + GenericVector results; + const ShapeTable* shapes = GetShapeTable(); + int num_results = ClassifySample(sample, page_pix, 0, unichar_id, &results); + for (int r = 0; r < num_results; ++r) { + if (shapes->GetShape(results[r].shape_id).ContainsUnichar(unichar_id)) { + if (result != NULL) + *result = results[r]; + return results[r].shape_id; + } + } + return -1; +} + +// Provides access to the UNICHARSET that this classifier works with. +// Only needs to be overridden if GetShapeTable() can return NULL. +const UNICHARSET& ShapeClassifier::GetUnicharset() const { + return GetShapeTable()->unicharset(); +} + +// Visual debugger classifies the given sample, displays the results and +// solicits user input to display other classifications. Returns when +// the user has finished with debugging the sample. +// Probably doesn't need to be overridden if the subclass provides +// DisplayClassifyAs. +void ShapeClassifier::DebugDisplay(const TrainingSample& sample, + Pix* page_pix, + UNICHAR_ID unichar_id) { + static ScrollView* terminator = NULL; + if (terminator == NULL) { + terminator = new ScrollView("XIT", 0, 0, 50, 50, 50, 50, true); + } + ScrollView* debug_win = CreateFeatureSpaceWindow("ClassifierDebug", 0, 0); + // Provide a right-click menu to choose the class. + SVMenuNode* popup_menu = new SVMenuNode(); + popup_menu->AddChild("Choose class to debug", 0, "x", "Class to debug"); + popup_menu->BuildMenu(debug_win, false); + // Display the features in green. 
+ const INT_FEATURE_STRUCT* features = sample.features(); + int num_features = sample.num_features(); + for (int f = 0; f < num_features; ++f) { + RenderIntFeature(debug_win, &features[f], ScrollView::GREEN); + } + debug_win->Update(); + GenericVector results; + // Debug classification until the user quits. + const UNICHARSET& unicharset = GetUnicharset(); + SVEvent* ev; + SVEventType ev_type; + do { + PointerVector windows; + if (unichar_id >= 0) { + tprintf("Debugging class %d = %s\n", + unichar_id, unicharset.id_to_unichar(unichar_id)); + UnicharClassifySample(sample, page_pix, 1, unichar_id, &results); + DisplayClassifyAs(sample, page_pix, unichar_id, 1, &windows); + } else { + tprintf("Invalid unichar_id: %d\n", unichar_id); + UnicharClassifySample(sample, page_pix, 1, -1, &results); + } + if (unichar_id >= 0) { + tprintf("Debugged class %d = %s\n", + unichar_id, unicharset.id_to_unichar(unichar_id)); + } + tprintf("Right-click in ClassifierDebug window to choose debug class,"); + tprintf(" Left-click or close window to quit...\n"); + UNICHAR_ID old_unichar_id; + do { + old_unichar_id = unichar_id; + ev = debug_win->AwaitEvent(SVET_ANY); + ev_type = ev->type; + if (ev_type == SVET_POPUP) { + if (unicharset.contains_unichar(ev->parameter)) { + unichar_id = unicharset.unichar_to_id(ev->parameter); + } else { + tprintf("Char class '%s' not found in unicharset", ev->parameter); + } + } + delete ev; + } while (unichar_id == old_unichar_id && + ev_type != SVET_CLICK && ev_type != SVET_DESTROY); + } while (ev_type != SVET_CLICK && ev_type != SVET_DESTROY); + delete debug_win; +} + +// Displays classification as the given shape_id. Creates as many windows +// as it feels fit, using index as a guide for placement. Adds any created +// windows to the windows output and returns a new index that may be used +// by any subsequent classifiers. Caller waits for the user to view and +// then destroys the windows by clearing the vector. 
+int ShapeClassifier::DisplayClassifyAs( + const TrainingSample& sample, Pix* page_pix, + UNICHAR_ID unichar_id, int index, + PointerVector* windows) { + // Does nothing in the default implementation. + return index; +} + +// Prints debug information on the results. +void ShapeClassifier::UnicharPrintResults( + const char* context, const GenericVector& results) const { + tprintf("%s\n", context); + for (int i = 0; i < results.size(); ++i) { + tprintf("%g: c_id=%d=%s", results[i].rating, results[i].unichar_id, + GetUnicharset().id_to_unichar(results[i].unichar_id)); + if (results[i].fonts.size() != 0) { + tprintf(" Font Vector:"); + for (int f = 0; f < results[i].fonts.size(); ++f) { + tprintf(" %d", results[i].fonts[f]); + } + } + tprintf("\n"); + } +} +void ShapeClassifier::PrintResults( + const char* context, const GenericVector& results) const { + tprintf("%s\n", context); + for (int i = 0; i < results.size(); ++i) { + tprintf("%g:", results[i].rating); + if (results[i].joined) + tprintf("[J]"); + if (results[i].broken) + tprintf("[B]"); + tprintf(" %s\n", GetShapeTable()->DebugStr(results[i].shape_id).string()); + } +} + +// Removes any result that has all its unichars covered by a better choice, +// regardless of font. +void ShapeClassifier::FilterDuplicateUnichars( + GenericVector* results) const { + GenericVector filtered_results; + // Copy results to filtered results and knock out duplicate unichars. + const ShapeTable* shapes = GetShapeTable(); + for (int r = 0; r < results->size(); ++r) { + if (r > 0) { + const Shape& shape_r = shapes->GetShape((*results)[r].shape_id); + int c; + for (c = 0; c < shape_r.size(); ++c) { + int unichar_id = shape_r[c].unichar_id; + int s; + for (s = 0; s < r; ++s) { + const Shape& shape_s = shapes->GetShape((*results)[s].shape_id); + if (shape_s.ContainsUnichar(unichar_id)) + break; // We found unichar_id. + } + if (s == r) + break; // We didn't find unichar_id. 
+ } + if (c == shape_r.size()) + continue; // We found all the unichar ids in previous answers. + } + filtered_results.push_back((*results)[r]); + } + *results = filtered_results; +} + +} // namespace tesseract. + + + + + diff --git a/classify/shapeclassifier.h b/classify/shapeclassifier.h index 08808127f..5069f375b 100644 --- a/classify/shapeclassifier.h +++ b/classify/shapeclassifier.h @@ -23,44 +23,21 @@ #ifndef TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_ #define TESSERACT_CLASSIFY_SHAPECLASSIFIER_H_ +#include "unichar.h" + template class GenericVector; struct Pix; +class ScrollView; +class UNICHARSET; namespace tesseract { +template class PointerVector; +struct ShapeRating; class ShapeTable; class TrainingSample; - -// Classifier result from a low-level classification is an index into some -// ShapeTable and a rating. -struct ShapeRating { - ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f) {} - ShapeRating(int s, float r) - : shape_id(s), rating(r), raw(1.0f), font(0.0f) {} - - // Sort function to sort ratings appropriately by descending rating. - static int SortDescendingRating(const void* t1, const void* t2) { - const ShapeRating* a = reinterpret_cast(t1); - const ShapeRating* b = reinterpret_cast(t2); - if (a->rating > b->rating) { - return -1; - } else if (a->rating < b->rating) { - return 1; - } else { - return a->shape_id - b->shape_id; - } - } - - // Index into some shape table indicates the class of the answer. - int shape_id; - // Rating from classifier with 1.0 perfect and 0.0 impossible. - // Call it a probability if you must. - float rating; - // Subsidiary rating that a classifier may use internally. - float raw; - // Subsidiary rating that a classifier may use internally. - float font; -}; +class TrainingSampleSet; +struct UnicharRating; // Interface base class for classifiers that produce ShapeRating results. class ShapeClassifier { @@ -76,18 +53,70 @@ class ShapeClassifier { // to get the appropriate tesseract features. 
// If debug is non-zero, then various degrees of classifier dependent debug // information is provided. - // If keep_this (a shape index) is >= 0, then the results should always + // If keep_this (a UNICHAR_ID) is >= 0, then the results should always // contain keep_this, and (if possible) anything of intermediate confidence. - // (Used for answering "Why didn't it get that right?" questions.) + // (Used for answering "Why didn't it get that right?" questions.) It must + // be a UNICHAR_ID as the callers have no clue how to choose the best shape + // that may contain a desired answer. // The return value is the number of classes saved in results. - // NOTE that overriding functions MUST clear results unless the classifier - // is working with a team of such classifiers. + // NOTE that overriding functions MUST clear and sort the results by + // descending rating unless the classifier is working with a team of such + // classifiers. + // NOTE: Neither overload of ClassifySample is pure, but at least one must + // be overridden by a classifier in order for it to do anything. + virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix, + int debug, UNICHAR_ID keep_this, + GenericVector* results); + + protected: virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix, - int debug, int keep_this, - GenericVector* results) = 0; + int debug, UNICHAR_ID keep_this, + GenericVector* results); + + public: + // Returns the shape that contains unichar_id that has the best result. + // If result is not NULL, it is set with the shape_id and rating. + // Returns -1 if ClassifySample fails to provide any result containing + // unichar_id. BestShapeForUnichar does not need to be overridden if + // ClassifySample respects the keep_this rule. + virtual int BestShapeForUnichar(const TrainingSample& sample, Pix* page_pix, + UNICHAR_ID unichar_id, ShapeRating* result); // Provides access to the ShapeTable that this classifier works with. 
virtual const ShapeTable* GetShapeTable() const = 0; + // Provides access to the UNICHARSET that this classifier works with. + // Must be overridden IFF GetShapeTable() returns NULL. + virtual const UNICHARSET& GetUnicharset() const; + + // Visual debugger classifies the given sample, displays the results and + // solicits user input to display other classifications. Returns when + // the user has finished with debugging the sample. + // Probably doesn't need to be overridden if the subclass provides + // DisplayClassifyAs. + virtual void DebugDisplay(const TrainingSample& sample, Pix* page_pix, + UNICHAR_ID unichar_id); + + + // Displays classification as the given unichar_id. Creates as many windows + // as it feels fit, using index as a guide for placement. Adds any created + // windows to the windows output and returns a new index that may be used + // by any subsequent classifiers. Caller waits for the user to view and + // then destroys the windows by clearing the vector. + virtual int DisplayClassifyAs(const TrainingSample& sample, Pix* page_pix, + UNICHAR_ID unichar_id, int index, + PointerVector* windows); + + // Prints debug information on the results. context is some introductory/title + // message. + virtual void UnicharPrintResults( + const char* context, const GenericVector& results) const; + virtual void PrintResults(const char* context, + const GenericVector& results) const; + + protected: + // Removes any result that has all its unichars covered by a better choice, + // regardless of font. + void FilterDuplicateUnichars(GenericVector* results) const; }; } // namespace tesseract. 
diff --git a/classify/shapetable.cpp b/classify/shapetable.cpp index dd0e4772c..325a0e283 100644 --- a/classify/shapetable.cpp +++ b/classify/shapetable.cpp @@ -22,12 +22,47 @@ #include "shapetable.h" +#include "bitvector.h" +#include "fontinfo.h" #include "intfeaturespace.h" #include "strngs.h" #include "unicharset.h" +#include "unicity_table.h" namespace tesseract { +// Helper function to get the index of the first result with the required +// unichar_id. If the results are sorted by rating, this will also be the +// best result with the required unichar_id. +// Returns -1 if the unichar_id is not found +int ShapeRating::FirstResultWithUnichar( + const GenericVector& results, + const ShapeTable& shape_table, + UNICHAR_ID unichar_id) { + for (int r = 0; r < results.size(); ++r) { + int shape_id = results[r].shape_id; + const Shape& shape = shape_table.GetShape(shape_id); + if (shape.ContainsUnichar(unichar_id)) { + return r; + } + } + return -1; +} + +// Helper function to get the index of the first result with the required +// unichar_id. If the results are sorted by rating, this will also be the +// best result with the required unichar_id. +// Returns -1 if the unichar_id is not found +int UnicharRating::FirstResultWithUnichar( + const GenericVector& results, + UNICHAR_ID unichar_id) { + for (int r = 0; r < results.size(); ++r) { + if (results[r].unichar_id == unichar_id) + return r; + } + return -1; +} + // Writes to the given file. Returns false in case of error. bool UnicharAndFonts::Serialize(FILE* fp) const { if (fwrite(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false; @@ -138,6 +173,39 @@ bool Shape::ContainsFont(int font_id) const { } return false; } +// Returns true if the shape contains the given font properties, ignoring +// unichar_id. 
+bool Shape::ContainsFontProperties(const FontInfoTable& font_table, + uinT32 properties) const { + for (int c = 0; c < unichars_.size(); ++c) { + GenericVector& font_list = unichars_[c].font_ids; + for (int f = 0; f < font_list.size(); ++f) { + if (font_table.get(font_list[f]).properties == properties) + return true; + } + } + return false; +} +// Returns true if the shape contains multiple different font properties, +// ignoring unichar_id. +bool Shape::ContainsMultipleFontProperties( + const FontInfoTable& font_table) const { + uinT32 properties = font_table.get(unichars_[0].font_ids[0]).properties; + for (int c = 0; c < unichars_.size(); ++c) { + GenericVector& font_list = unichars_[c].font_ids; + for (int f = 0; f < font_list.size(); ++f) { + if (font_table.get(font_list[f]).properties != properties) + return true; + } + } + return false; +} + +// Returns true if this shape is equal to other (ignoring order of unichars +// and fonts). +bool Shape::operator==(const Shape& other) const { + return IsSubsetOf(other) && other.IsSubsetOf(*this); +} // Returns true if this is a subset (including equal) of other. bool Shape::IsSubsetOf(const Shape& other) const { @@ -172,10 +240,10 @@ void Shape::SortUnichars() { unichars_sorted_ = true; } -ShapeTable::ShapeTable() : unicharset_(NULL) { +ShapeTable::ShapeTable() : unicharset_(NULL), num_fonts_(0) { } ShapeTable::ShapeTable(const UNICHARSET& unicharset) - : unicharset_(&unicharset) { + : unicharset_(&unicharset), num_fonts_(0) { } // Writes to the given file. Returns false in case of error. @@ -187,9 +255,38 @@ bool ShapeTable::Serialize(FILE* fp) const { // If swap is true, assumes a big/little-endian swap is needed. bool ShapeTable::DeSerialize(bool swap, FILE* fp) { if (!shape_table_.DeSerialize(swap, fp)) return false; + num_fonts_ = 0; return true; } +// Returns the number of fonts used in this ShapeTable, computing it if +// necessary. 
+int ShapeTable::NumFonts() const { + if (num_fonts_ <= 0) { + for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) { + const Shape& shape = *shape_table_[shape_id]; + for (int c = 0; c < shape.size(); ++c) { + for (int f = 0; f < shape[c].font_ids.size(); ++f) { + if (shape[c].font_ids[f] >= num_fonts_) + num_fonts_ = shape[c].font_ids[f] + 1; + } + } + } + } + return num_fonts_; +} + +// Re-indexes the class_ids in the shapetable according to the given map. +// Useful in conjunction with set_unicharset. +void ShapeTable::ReMapClassIds(const GenericVector& unicharset_map) { + for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) { + Shape* shape = shape_table_[shape_id]; + for (int c = 0; c < shape->size(); ++c) { + shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]); + } + } +} + // Returns a string listing the classes/fonts in a shape. STRING ShapeTable::DebugStr(int shape_id) const { if (shape_id < 0 || shape_id >= shape_table_.size()) @@ -251,15 +348,22 @@ int ShapeTable::AddShape(int unichar_id, int font_id) { Shape* shape = new Shape; shape->AddToShape(unichar_id, font_id); shape_table_.push_back(shape); + num_fonts_ = MAX(num_fonts_, font_id + 1); return index; } -// Adds a copy of the given shape. -// Returns the assigned index. +// Adds a copy of the given shape unless it is already present. +// Returns the assigned index or index of existing shape if already present. 
int ShapeTable::AddShape(const Shape& other) { - int index = shape_table_.size(); - Shape* shape = new Shape(other); - shape_table_.push_back(shape); + int index; + for (index = 0; index < shape_table_.size() && + !(other == *shape_table_[index]); ++index) + continue; + if (index == shape_table_.size()) { + Shape* shape = new Shape(other); + shape_table_.push_back(shape); + } + num_fonts_ = 0; return index; } @@ -275,12 +379,14 @@ void ShapeTable::DeleteShape(int shape_id) { void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) { Shape& shape = *shape_table_[shape_id]; shape.AddToShape(unichar_id, font_id); + num_fonts_ = MAX(num_fonts_, font_id + 1); } // Adds the given shape to the existing shape with the given index. void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) { Shape& shape = *shape_table_[shape_id]; shape.AddShape(other); + num_fonts_ = 0; } // Returns the id of the shape that contains the given unichar and font. @@ -316,25 +422,26 @@ void ShapeTable::GetFirstUnicharAndFont(int shape_id, // a ShapeTable. int ShapeTable::BuildFromShape(const Shape& shape, const ShapeTable& master_shapes) { - int num_masters = 0; + BitVector shape_map(master_shapes.NumShapes()); for (int u_ind = 0; u_ind < shape.size(); ++u_ind) { for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) { int c = shape[u_ind].unichar_id; int f = shape[u_ind].font_ids[f_ind]; - if (FindShape(c, f) < 0) { - int shape_id = AddShape(c, f); - int master_id = master_shapes.FindShape(c, f); - if (master_id >= 0 && shape.size() > 1) { - const Shape& master = master_shapes.GetShape(master_id); - if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) { - // Add everything else from the master shape. 
- shape_table_[shape_id]->AddShape(master); - ++num_masters; - } - } + int master_id = master_shapes.FindShape(c, f); + if (master_id >= 0) { + shape_map.SetBit(master_id); + } else if (FindShape(c, f) < 0) { + AddShape(c, f); } } } + int num_masters = 0; + for (int s = 0; s < master_shapes.NumShapes(); ++s) { + if (shape_map[s]) { + AddShape(master_shapes.GetShape(s)); + ++num_masters; + } + } return num_masters; } @@ -381,7 +488,7 @@ void ShapeTable::ForceFontMerges(int start, int end) { } } ShapeTable compacted(*unicharset_); - compacted.AppendMasterShapes(*this); + compacted.AppendMasterShapes(*this, NULL); *this = compacted; } @@ -422,6 +529,13 @@ void ShapeTable::MergeShapes(int shape_id1, int shape_id2) { shape_table_[master_id1]->AddShape(*shape_table_[master_id2]); } +// Swaps two shape_ids. +void ShapeTable::SwapShapes(int shape_id1, int shape_id2) { + Shape* tmp = shape_table_[shape_id1]; + shape_table_[shape_id1] = shape_table_[shape_id2]; + shape_table_[shape_id2] = tmp; +} + // Returns the destination of this shape, (if merged), taking into account // the fact that the destination may itself have been merged. int ShapeTable::MasterDestinationIndex(int shape_id) const { @@ -435,11 +549,129 @@ int ShapeTable::MasterDestinationIndex(int shape_id) const { return master_id; } +// Returns false if the unichars in neither shape is a subset of the other. +bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const { + const Shape& shape1 = GetShape(shape_id1); + const Shape& shape2 = GetShape(shape_id2); + int c1, c2; + for (c1 = 0; c1 < shape1.size(); ++c1) { + int unichar_id1 = shape1[c1].unichar_id; + if (!shape2.ContainsUnichar(unichar_id1)) + break; + } + for (c2 = 0; c2 < shape2.size(); ++c2) { + int unichar_id2 = shape2[c2].unichar_id; + if (!shape1.ContainsUnichar(unichar_id2)) + break; + } + return c1 == shape1.size() || c2 == shape2.size(); +} + +// Returns false if the unichars in neither shape is a subset of the other. 
+bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, + int shape_id) const { + const Shape& merge1 = GetShape(merge_id1); + const Shape& merge2 = GetShape(merge_id2); + const Shape& shape = GetShape(shape_id); + int cm1, cm2, cs; + for (cs = 0; cs < shape.size(); ++cs) { + int unichar_id = shape[cs].unichar_id; + if (!merge1.ContainsUnichar(unichar_id) && + !merge2.ContainsUnichar(unichar_id)) + break; // Shape is not a subset of the merge. + } + for (cm1 = 0; cm1 < merge1.size(); ++cm1) { + int unichar_id1 = merge1[cm1].unichar_id; + if (!shape.ContainsUnichar(unichar_id1)) + break; // Merge is not a subset of shape + } + for (cm2 = 0; cm2 < merge2.size(); ++cm2) { + int unichar_id2 = merge2[cm2].unichar_id; + if (!shape.ContainsUnichar(unichar_id2)) + break; // Merge is not a subset of shape + } + return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size()); +} + +// Returns true if the unichar sets are equal between the shapes. +bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const { + const Shape& shape1 = GetShape(shape_id1); + const Shape& shape2 = GetShape(shape_id2); + for (int c1 = 0; c1 < shape1.size(); ++c1) { + int unichar_id1 = shape1[c1].unichar_id; + if (!shape2.ContainsUnichar(unichar_id1)) + return false; + } + for (int c2 = 0; c2 < shape2.size(); ++c2) { + int unichar_id2 = shape2[c2].unichar_id; + if (!shape1.ContainsUnichar(unichar_id2)) + return false; + } + return true; +} + +// Returns true if the unichar sets are equal between the shapes. +bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, + int shape_id) const { + const Shape& merge1 = GetShape(merge_id1); + const Shape& merge2 = GetShape(merge_id2); + const Shape& shape = GetShape(shape_id); + for (int cs = 0; cs < shape.size(); ++cs) { + int unichar_id = shape[cs].unichar_id; + if (!merge1.ContainsUnichar(unichar_id) && + !merge2.ContainsUnichar(unichar_id)) + return false; // Shape has a unichar that appears in neither merge. 
+ } + for (int cm1 = 0; cm1 < merge1.size(); ++cm1) { + int unichar_id1 = merge1[cm1].unichar_id; + if (!shape.ContainsUnichar(unichar_id1)) + return false; // Merge has a unichar that is not in shape. + } + for (int cm2 = 0; cm2 < merge2.size(); ++cm2) { + int unichar_id2 = merge2[cm2].unichar_id; + if (!shape.ContainsUnichar(unichar_id2)) + return false; // Merge has a unichar that is not in shape. + } + return true; +} + +// Returns true if there is a common unichar between the shapes. +bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const { + const Shape& shape1 = GetShape(shape_id1); + const Shape& shape2 = GetShape(shape_id2); + for (int c1 = 0; c1 < shape1.size(); ++c1) { + int unichar_id1 = shape1[c1].unichar_id; + if (shape2.ContainsUnichar(unichar_id1)) + return true; + } + return false; +} + +// Returns true if there is a common font id between the shapes. +bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const { + const Shape& shape1 = GetShape(shape_id1); + const Shape& shape2 = GetShape(shape_id2); + for (int c1 = 0; c1 < shape1.size(); ++c1) { + const GenericVector& font_list1 = shape1[c1].font_ids; + for (int f = 0; f < font_list1.size(); ++f) { + if (shape2.ContainsFont(font_list1[f])) + return true; + } + } + return false; +} + // Appends the master shapes from other to this. -void ShapeTable::AppendMasterShapes(const ShapeTable& other) { +// If not NULL, shape_map is set to map other shape_ids to this's shape_ids. 
+void ShapeTable::AppendMasterShapes(const ShapeTable& other, + GenericVector* shape_map) { + if (shape_map != NULL) + shape_map->init_to_size(other.NumShapes(), -1); for (int s = 0; s < other.shape_table_.size(); ++s) { if (other.shape_table_[s]->destination_index() < 0) { - AddShape(*other.shape_table_[s]); + int index = AddShape(*other.shape_table_[s]); + if (shape_map != NULL) + (*shape_map)[s] = index; } } } @@ -455,6 +687,46 @@ int ShapeTable::NumMasterShapes() const { } +// Adds the unichars of the given shape_id to the vector of results. Any +// unichar_id that is already present just has the fonts added to the +// font set for that result without adding a new entry in the vector. +// NOTE: it is assumed that the results are given to this function in order +// of decreasing rating. +// The unichar_map vector indicates the index of the results entry containing +// each unichar, or -1 if the unichar is not yet included in results. +void ShapeTable::AddShapeToResults(const ShapeRating& shape_rating, + GenericVector* unichar_map, + GenericVector* results)const { + if (shape_rating.joined) { + AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, + results); + } + if (shape_rating.broken) { + AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, + results); + } + const Shape& shape = GetShape(shape_rating.shape_id); + for (int u = 0; u < shape.size(); ++u) { + int result_index = AddUnicharToResults(shape[u].unichar_id, + shape_rating.rating, + unichar_map, results); + (*results)[result_index].fonts += shape[u].font_ids; + } +} + +// Adds the given unichar_id to the results if needed, updating unichar_map +// and returning the index of unichar in results. 
+int ShapeTable::AddUnicharToResults( + int unichar_id, float rating, GenericVector* unichar_map, + GenericVector* results) const { + int result_index = unichar_map->get(unichar_id); + if (result_index < 0) { + UnicharRating result(unichar_id, rating); + result_index = results->push_back(result); + (*unichar_map)[unichar_id] = result_index; + } + return result_index; +} + + } // namespace tesseract - - diff --git a/classify/shapetable.h b/classify/shapetable.h index 0992fbcb6..87f4245fd 100644 --- a/classify/shapetable.h +++ b/classify/shapetable.h @@ -23,6 +23,8 @@ #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_ #define TESSERACT_CLASSIFY_SHAPETABLE_H_ +#include "bitvector.h" +#include "genericheap.h" #include "genericvector.h" #include "intmatcher.h" @@ -31,6 +33,113 @@ class UNICHARSET; namespace tesseract { +struct FontInfo; +class FontInfoTable; +class ShapeTable; + +// Simple struct to hold a single classifier unichar selection, a corresponding +// rating, and a list of appropriate fonts. +struct UnicharRating { + UnicharRating() : unichar_id(0), rating(0.0f) {} + UnicharRating(int u, float r) + : unichar_id(u), rating(r) {} + + // Sort function to sort ratings appropriately by descending rating. + static int SortDescendingRating(const void* t1, const void* t2) { + const UnicharRating* a = reinterpret_cast(t1); + const UnicharRating* b = reinterpret_cast(t2); + if (a->rating > b->rating) { + return -1; + } else if (a->rating < b->rating) { + return 1; + } else { + return a->unichar_id - b->unichar_id; + } + } + // Helper function to get the index of the first result with the required + // unichar_id. If the results are sorted by rating, this will also be the + // best result with the required unichar_id. + // Returns -1 if the unichar_id is not found + static int FirstResultWithUnichar(const GenericVector& results, + UNICHAR_ID unichar_id); + + // Index into some UNICHARSET table indicates the class of the answer. 
+ UNICHAR_ID unichar_id; + // Rating from classifier with 1.0 perfect and 0.0 impossible. + // Call it a probability if you must. + float rating; + // Set of fonts for this shape in order of decreasing preference. + // (There is no mechanism for storing scores for fonts as yet.) + GenericVector fonts; +}; + +// Classifier result from a low-level classification is an index into some +// ShapeTable and a rating. +struct ShapeRating { + ShapeRating() + : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), + joined(false), broken(false) {} + ShapeRating(int s, float r) + : shape_id(s), rating(r), raw(1.0f), font(0.0f), + joined(false), broken(false) {} + + // Sort function to sort ratings appropriately by descending rating. + static int SortDescendingRating(const void* t1, const void* t2) { + const ShapeRating* a = reinterpret_cast(t1); + const ShapeRating* b = reinterpret_cast(t2); + if (a->rating > b->rating) { + return -1; + } else if (a->rating < b->rating) { + return 1; + } else { + return a->shape_id - b->shape_id; + } + } + // Helper function to get the index of the first result with the required + // unichar_id. If the results are sorted by rating, this will also be the + // best result with the required unichar_id. + // Returns -1 if the unichar_id is not found + static int FirstResultWithUnichar(const GenericVector& results, + const ShapeTable& shape_table, + UNICHAR_ID unichar_id); + + // Index into some shape table indicates the class of the answer. + int shape_id; + // Rating from classifier with 1.0 perfect and 0.0 impossible. + // Call it a probability if you must. + float rating; + // Subsidiary rating that a classifier may use internally. + float raw; + // Subsidiary rating that a classifier may use internally. + float font; + // Flag indicating that the input may be joined. + bool joined; + // Flag indicating that the input may be broken (a fragment). 
+ bool broken; +}; + +// Simple struct to hold an entry for a heap-based priority queue of +// ShapeRating. +struct ShapeQueueEntry { + ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {} + ShapeQueueEntry(const ShapeRating& rating, int level0) + : result(rating), level(level0) {} + + // Sort by decreasing rating and decreasing level for equal rating. + bool operator<(const ShapeQueueEntry& other) const { + if (result.rating > other.result.rating) return true; + if (result.rating == other.result.rating) + return level > other.level; + return false; + } + + // Output from classifier. + ShapeRating result; + // Which level in the tree did this come from? + int level; +}; +typedef GenericHeap ShapeQueue; + // Simple struct to hold a set of fonts associated with a single unichar-id. // A vector of UnicharAndFonts makes a shape. struct UnicharAndFonts { @@ -83,6 +192,10 @@ class Shape { const UnicharAndFonts& operator[](int index) const { return unichars_[index]; } + // Sets the unichar_id of the given index to the new unichar_id. + void SetUnicharId(int index, int unichar_id) { + unichars_[index].unichar_id = unichar_id; + } // Adds a font_id for the given unichar_id. If the unichar_id is not // in the shape, it is added. void AddToShape(int unichar_id, int font_id); @@ -94,6 +207,16 @@ class Shape { bool ContainsUnichar(int unichar_id) const; // Returns true if the shape contains the given font, ignoring unichar_id. bool ContainsFont(int font_id) const; + // Returns true if the shape contains the given font properties, ignoring + // unichar_id. + bool ContainsFontProperties(const FontInfoTable& font_table, + uinT32 properties) const; + // Returns true if the shape contains multiple different font properties, + // ignoring unichar_id. + bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const; + // Returns true if this shape is equal to other (ignoring order of unichars + // and fonts). 
+ bool operator==(const Shape& other) const; // Returns true if this is a subset (including equal) of other. bool IsSubsetOf(const Shape& other) const; // Returns true if the lists of unichar ids are the same in this and other, @@ -143,11 +266,17 @@ class ShapeTable { const UNICHARSET& unicharset() const { return *unicharset_; } + // Returns the number of fonts used in this ShapeTable, computing it if + // necessary. + int NumFonts() const; // Shapetable takes a pointer to the UNICHARSET, so it must persist for the // entire life of the ShapeTable. void set_unicharset(const UNICHARSET& unicharset) { unicharset_ = &unicharset; } + // Re-indexes the class_ids in the shapetable according to the given map. + // Useful in conjunction with set_unicharset. + void ReMapClassIds(const GenericVector& unicharset_map); // Returns a string listing the classes/fonts in a shape. STRING DebugStr(int shape_id) const; // Returns a debug string summarizing the table. @@ -156,8 +285,8 @@ class ShapeTable { // Adds a new shape starting with the given unichar_id and font_id. // Returns the assigned index. int AddShape(int unichar_id, int font_id); - // Adds a copy of the given shape. - // Returns the assigned index. + // Adds a copy of the given shape unless it is already present. + // Returns the assigned index or index of existing shape if already present. int AddShape(const Shape& other); // Removes the shape given by the shape index. All indices above are changed! void DeleteShape(int shape_id); @@ -204,10 +333,14 @@ class ShapeTable { int MergedUnicharCount(int shape_id1, int shape_id2) const; // Merges two shape_ids, leaving shape_id2 marked as merged. void MergeShapes(int shape_id1, int shape_id2); + // Swaps two shape_ids. + void SwapShapes(int shape_id1, int shape_id2); // Appends the master shapes from other to this. // Used to create a clean ShapeTable from a merged one, or to create a // copy of a ShapeTable. 
- void AppendMasterShapes(const ShapeTable& other); + // If not NULL, shape_map is set to map other shape_ids to this's shape_ids. + void AppendMasterShapes(const ShapeTable& other, + GenericVector* shape_map); // Returns the number of master shapes remaining after merging. int NumMasterShapes() const; // Returns the destination of this shape, (if merged), taking into account @@ -215,11 +348,43 @@ class ShapeTable { // For a non-merged shape, returns the input shape_id. int MasterDestinationIndex(int shape_id) const; + // Returns false if the unichars in neither shape is a subset of the other.. + bool SubsetUnichar(int shape_id1, int shape_id2) const; + // Returns false if the unichars in neither shape is a subset of the other.. + bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const; + // Returns true if the unichar sets are equal between the shapes. + bool EqualUnichars(int shape_id1, int shape_id2) const; + bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const; + // Returns true if there is a common unichar between the shapes. + bool CommonUnichars(int shape_id1, int shape_id2) const; + // Returns true if there is a common font id between the shapes. + bool CommonFont(int shape_id1, int shape_id2) const; + + // Adds the unichars of the given shape_id to the vector of results. Any + // unichar_id that is already present just has the fonts added to the + // font set for that result without adding a new entry in the vector. + // NOTE: it is assumed that the results are given to this function in order + // of decreasing rating. + // The unichar_map vector indicates the index of the results entry containing + // each unichar, or -1 if the unichar is not yet included in results. 
+ void AddShapeToResults(const ShapeRating& shape_rating, + GenericVector* unichar_map, + GenericVector* results) const; + private: + // Adds the given unichar_id to the results if needed, updating unichar_map + // and returning the index of unichar in results. + int AddUnicharToResults(int unichar_id, float rating, + GenericVector* unichar_map, + GenericVector* results) const; + // Pointer to a provided unicharset used only by the Debugstr member. const UNICHARSET* unicharset_; // Vector of pointers to the Shapes in this ShapeTable. PointerVector shape_table_; + + // Cached data calculated on demand. + mutable int num_fonts_; }; } // namespace tesseract. diff --git a/classify/speckle.cpp b/classify/speckle.cpp deleted file mode 100644 index e33ce5f7c..000000000 --- a/classify/speckle.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/****************************************************************************** - ** Filename: speckle.c - ** Purpose: Routines used by classifier to filter out speckle. - ** Author: Dan Johnson - ** History: Mon Mar 11 10:06:14 1991, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -/*----------------------------------------------------------------------------- - Include Files and Type Defines ------------------------------------------------------------------------------*/ -#include "speckle.h" - -#include "blobs.h" -#include "ratngs.h" -#include "params.h" - -/*----------------------------------------------------------------------------- - Global Data Definitions and Declarations ------------------------------------------------------------------------------*/ -/** define control knobs for adjusting definition of speckle*/ -double_VAR(speckle_large_max_size, 0.30, "Max large speckle size"); - -double_VAR(speckle_small_penalty, 10.0, "Small speckle penalty"); - -double_VAR(speckle_large_penalty, 10.0, "Large speckle penalty"); - -double_VAR(speckle_small_certainty, -1.0, "Small speckle certainty"); - -/*----------------------------------------------------------------------------- - Public Code ------------------------------------------------------------------------------*/ -/*---------------------------------------------------------------------------*/ -/** - * This routine adds a null choice to Choices with a - * rating equal to the worst rating in Choices plus a pad. - * The certainty of the new choice is the same as the - * certainty of the worst choice in Choices. The new choice - * is added to the end of Choices. - * - * Globals: - * - #speckle_small_penalty rating for a small speckle - * - #speckle_large_penalty rating penalty for a large speckle - * - #speckle_small_certainty certainty for a small speckle - * - * @param Choices choices to add a speckle choice to - * - * @return New Choices list with null choice added to end. - * - * Exceptions: none - * History: Mon Mar 11 11:08:11 1991, DSJ, Created. 
- */ -void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices) { - assert(Choices != NULL); - BLOB_CHOICE *blob_choice; - BLOB_CHOICE_IT temp_it; - temp_it.set_to_list(Choices); - - // If there are no other choices, use the small speckle penalty plus - // the large speckle penalty. - if (Choices->length() == 0) { - blob_choice = - new BLOB_CHOICE(0, speckle_small_certainty + speckle_large_penalty, - speckle_small_certainty, -1, -1, NULL, 0, 0, false); - temp_it.add_to_end(blob_choice); - return; - } - - // If there are other choices, add a null choice that is slightly worse - // than the worst choice so far. - temp_it.move_to_last(); - blob_choice = temp_it.data(); // pick the worst choice - temp_it.add_to_end( - new BLOB_CHOICE(0, blob_choice->rating() + speckle_large_penalty, - blob_choice->certainty(), -1, -1, NULL, 0, 0, false)); -} /* AddLargeSpeckleTo */ - - -/*---------------------------------------------------------------------------*/ -/** - * This routine returns TRUE if both the width of height - * of Blob are less than the MaxLargeSpeckleSize. - * - * Globals: - * - #speckle_large_max_size largest allowed speckle - * - * Exceptions: none - * History: Mon Mar 11 10:06:49 1991, DSJ, Created. - * - * @param blob blob to test against speckle criteria - * - * @return TRUE if blob is speckle, FALSE otherwise. - */ -BOOL8 LargeSpeckle(TBLOB *blob) { - double speckle_size = BASELINE_SCALE * speckle_large_max_size; - TBOX bbox = blob->bounding_box(); - return (bbox.width() < speckle_size && bbox.height() < speckle_size); -} /* LargeSpeckle */ diff --git a/classify/speckle.h b/classify/speckle.h deleted file mode 100644 index 9676dc0a4..000000000 --- a/classify/speckle.h +++ /dev/null @@ -1,35 +0,0 @@ -/****************************************************************************** - ** Filename: speckle.h - ** Purpose: Interface to classifier speckle filtering routines. - ** Author: Dan Johnson - ** History: Mon Mar 11 10:14:16 1991, DSJ, Created. 
- ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - ******************************************************************************/ -#ifndef SPECKLE_H -#define SPECKLE_H - -/*----------------------------------------------------------------------------- - Include Files and Type Defines ------------------------------------------------------------------------------*/ - -#include "baseline.h" -#include "ratngs.h" - -/*----------------------------------------------------------------------------- - Public Function Prototypes ------------------------------------------------------------------------------*/ -void AddLargeSpeckleTo(BLOB_CHOICE_LIST *Choices); - -BOOL8 LargeSpeckle(TBLOB *Blob); - -#endif diff --git a/classify/tessclassifier.cpp b/classify/tessclassifier.cpp index f7735d8a2..4b6cad019 100644 --- a/classify/tessclassifier.cpp +++ b/classify/tessclassifier.cpp @@ -28,17 +28,25 @@ namespace tesseract { // Classifies the given [training] sample, writing to results. // See ShapeClassifier for a full description. 
-int TessClassifier::ClassifySample(const TrainingSample& sample, - Pix* page_pix, int debug, int keep_this, - GenericVector* results) { +int TessClassifier::UnicharClassifySample( + const TrainingSample& sample, Pix* page_pix, int debug, + UNICHAR_ID keep_this, GenericVector* results) { + int old_matcher_level = classify_->matcher_debug_level; + int old_matcher_flags = classify_->matcher_debug_flags; + int old_classify_level = classify_->classify_debug_level; if (debug) { - classify_->matcher_debug_level.set_value(debug ? 2 : 0); - classify_->matcher_debug_flags.set_value(debug ? 25 : 0); - classify_->classify_debug_level.set_value(debug ? 3 : 0); - } else { - classify_->classify_debug_level.set_value(debug ? 2 : 0); + // Explicitly set values of various control parameters to generate debug + // output if required, restoring the old values after classifying. + classify_->matcher_debug_level.set_value(2); + classify_->matcher_debug_flags.set_value(25); + classify_->classify_debug_level.set_value(3); + } + classify_->CharNormTrainingSample(pruner_only_, keep_this, sample, results); + if (debug) { + classify_->matcher_debug_level.set_value(old_matcher_level); + classify_->matcher_debug_flags.set_value(old_matcher_flags); + classify_->classify_debug_level.set_value(old_classify_level); } - classify_->CharNormTrainingSample(pruner_only_, sample, results); return results->size(); } @@ -46,6 +54,32 @@ int TessClassifier::ClassifySample(const TrainingSample& sample, const ShapeTable* TessClassifier::GetShapeTable() const { return classify_->shape_table(); } +// Provides access to the UNICHARSET that this classifier works with. +// Only needs to be overridden if GetShapeTable() can return NULL. +const UNICHARSET& TessClassifier::GetUnicharset() const { + return classify_->unicharset; +} + +// Displays classification as the given shape_id. Creates as many windows +// as it feels fit, using index as a guide for placement. 
Adds any created +// windows to the windows output and returns a new index that may be used +// by any subsequent classifiers. Caller waits for the user to view and +// then destroys the windows by clearing the vector. +int TessClassifier::DisplayClassifyAs( + const TrainingSample& sample, Pix* page_pix, int unichar_id, int index, + PointerVector* windows) { + int shape_id = unichar_id; + if (GetShapeTable() != NULL) + shape_id = BestShapeForUnichar(sample, page_pix, unichar_id, NULL); + if (shape_id < 0) return index; + if (UnusedClassIdIn(classify_->PreTrainedTemplates, shape_id)) { + tprintf("No built-in templates for class/shape %d\n", shape_id); + return index; + } + classify_->ShowBestMatchFor(shape_id, sample.features(), + sample.num_features()); + return index; +} } // namespace tesseract diff --git a/classify/tessclassifier.h b/classify/tessclassifier.h index f2483b7a0..57a04861e 100644 --- a/classify/tessclassifier.h +++ b/classify/tessclassifier.h @@ -41,11 +41,23 @@ class TessClassifier : public ShapeClassifier { // Classifies the given [training] sample, writing to results. // See ShapeClassifier for a full description. - virtual int ClassifySample(const TrainingSample& sample, Pix* page_pix, - int debug, int keep_this, - GenericVector* results); + virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix, + int debug, UNICHAR_ID keep_this, + GenericVector* results); // Provides access to the ShapeTable that this classifier works with. virtual const ShapeTable* GetShapeTable() const; + // Provides access to the UNICHARSET that this classifier works with. + // Only needs to be overridden if GetShapeTable() can return NULL. + virtual const UNICHARSET& GetUnicharset() const; + + // Displays classification as the given shape_id. Creates as many windows + // as it feels fit, using index as a guide for placement. 
Adds any created + // windows to the windows output and returns a new index that may be used + // by any subsequent classifiers. Caller waits for the user to view and + // then destroys the windows by clearing the vector. + virtual int DisplayClassifyAs(const TrainingSample& sample, Pix* page_pix, + int unichar_id, int index, + PointerVector* windows); private: // Indicates that this classifier is to use just the ClassPruner, or the diff --git a/classify/trainingsample.cpp b/classify/trainingsample.cpp index 450b925f3..4557da489 100644 --- a/classify/trainingsample.cpp +++ b/classify/trainingsample.cpp @@ -59,6 +59,8 @@ bool TrainingSample::Serialize(FILE* fp) const { if (fwrite(&num_features_, sizeof(num_features_), 1, fp) != 1) return false; if (fwrite(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1) return false; + if (fwrite(&outline_length_, sizeof(outline_length_), 1, fp) != 1) + return false; if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_) return false; if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_, @@ -90,10 +92,13 @@ bool TrainingSample::DeSerialize(bool swap, FILE* fp) { if (fread(&num_features_, sizeof(num_features_), 1, fp) != 1) return false; if (fread(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1) return false; + if (fread(&outline_length_, sizeof(outline_length_), 1, fp) != 1) + return false; if (swap) { ReverseN(&class_id_, sizeof(class_id_)); ReverseN(&num_features_, sizeof(num_features_)); ReverseN(&num_micro_features_, sizeof(num_micro_features_)); + ReverseN(&outline_length_, sizeof(outline_length_)); } delete [] features_; features_ = new INT_FEATURE_STRUCT[num_features_]; @@ -113,20 +118,40 @@ bool TrainingSample::DeSerialize(bool swap, FILE* fp) { // Saves the given features into a TrainingSample. 
TrainingSample* TrainingSample::CopyFromFeatures( - const INT_FX_RESULT_STRUCT& fx_info, const INT_FEATURE_STRUCT* features, + const INT_FX_RESULT_STRUCT& fx_info, + const TBOX& bounding_box, + const INT_FEATURE_STRUCT* features, int num_features) { TrainingSample* sample = new TrainingSample; sample->num_features_ = num_features; sample->features_ = new INT_FEATURE_STRUCT[num_features]; + sample->outline_length_ = fx_info.Length; memcpy(sample->features_, features, num_features * sizeof(features[0])); - sample->geo_feature_[GeoBottom] = fx_info.YBottom; - sample->geo_feature_[GeoTop] = fx_info.YTop; - sample->geo_feature_[GeoWidth] = fx_info.Width; + sample->geo_feature_[GeoBottom] = bounding_box.bottom(); + sample->geo_feature_[GeoTop] = bounding_box.top(); + sample->geo_feature_[GeoWidth] = bounding_box.width(); + + // Generate the cn_feature_ from the fx_info. + sample->cn_feature_[CharNormY] = + MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset); + sample->cn_feature_[CharNormLength] = + MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION; + sample->cn_feature_[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx; + sample->cn_feature_[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry; + sample->features_are_indexed_ = false; sample->features_are_mapped_ = false; return sample; } +// Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining. +FEATURE_STRUCT* TrainingSample::GetCNFeature() const { + FEATURE feature = NewFeature(&CharNormDesc); + for (int i = 0; i < kNumCNParams; ++i) + feature->Params[i] = cn_feature_[i]; + return feature; +} + // Constructs and returns a copy randomized by the method given by // the randomizer index. If index is out of [0, kSampleRandomSize) then // an exact copy is returned. 
diff --git a/classify/trainingsample.h b/classify/trainingsample.h index 821bbed99..6df1ce824 100644 --- a/classify/trainingsample.h +++ b/classify/trainingsample.h @@ -54,7 +54,7 @@ class TrainingSample : public ELIST_LINK { public: TrainingSample() : class_id_(INVALID_UNICHAR_ID), font_id_(0), page_num_(0), - num_features_(0), num_micro_features_(0), + num_features_(0), num_micro_features_(0), outline_length_(0), features_(NULL), micro_features_(NULL), weight_(1.0), max_dist_(0.0), sample_index_(0), features_are_indexed_(false), features_are_mapped_(false), @@ -65,8 +65,11 @@ class TrainingSample : public ELIST_LINK { // Saves the given features into a TrainingSample. The features are copied, // so may be deleted afterwards. Delete the return value after use. static TrainingSample* CopyFromFeatures(const INT_FX_RESULT_STRUCT& fx_info, + const TBOX& bounding_box, const INT_FEATURE_STRUCT* features, int num_features); + // Returns the cn_feature as a FEATURE_STRUCT* needed by cntraining. + FEATURE_STRUCT* GetCNFeature() const; // Constructs and returns a copy "randomized" by the method given by // the randomizer index. If index is out of [0, kSampleRandomSize) then // an exact copy is returned. @@ -146,6 +149,9 @@ class TrainingSample : public ELIST_LINK { const MicroFeature* micro_features() const { return micro_features_; } + int outline_length() const { + return outline_length_; + } float cn_feature(int index) const { return cn_feature_[index]; } @@ -203,6 +209,10 @@ class TrainingSample : public ELIST_LINK { int num_features_; // Number of MicroFeature in micro_features_ array. int num_micro_features_; + // Total length of outline in the baseline normalized coordinate space. + // See comment in WERD_RES class definition for a discussion of coordinate + // spaces. + int outline_length_; // Array of features. INT_FEATURE_STRUCT* features_; // Array of features. 
diff --git a/classify/trainingsampleset.cpp b/classify/trainingsampleset.cpp index 2e7f77da4..afbf3f420 100644 --- a/classify/trainingsampleset.cpp +++ b/classify/trainingsampleset.cpp @@ -67,7 +67,7 @@ bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) { return true; } -TrainingSampleSet::TrainingSampleSet(const UnicityTable& font_table) +TrainingSampleSet::TrainingSampleSet(const FontInfoTable& font_table) : num_raw_samples_(0), unicharset_size_(0), font_class_array_(NULL), fontinfo_table_(font_table) { } @@ -115,11 +115,12 @@ bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) { void TrainingSampleSet::LoadUnicharset(const char* filename) { if (!unicharset_.load_from_file(filename)) { tprintf("Failed to load unicharset from file %s\n" - "Building unicharset for boosting from scratch...\n", + "Building unicharset from scratch...\n", filename); unicharset_.clear(); - // Space character needed to represent NIL_LIST classification. - unicharset_.unichar_insert(" "); + // Add special characters as they were removed by the clear. + UNICHARSET empty; + unicharset_.AppendOtherUnicharset(empty); } unicharset_size_ = unicharset_.size(); } @@ -708,14 +709,6 @@ void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map, continue; GenericVector features2 = samples_[s2]->indexed_features(); double dist = f_table.FeatureDistance(features2); - int height = samples_[s2]->geo_feature(GeoTop) - - samples_[s2]->geo_feature(GeoBottom); - if (dist == 1.0 && height > 64) { - // TODO(rays) rethink this when the polygonal approximation goes. - // Currently it is possible for dots and other small characters - // to be completely different, even within the same class. 
- f_table.DebugFeatureDistance(features2); - } if (dist > max_dist) { max_dist = dist; if (dist > max_max_dist) { diff --git a/classify/trainingsampleset.h b/classify/trainingsampleset.h index 4ff4e86e0..4c843f41c 100644 --- a/classify/trainingsampleset.h +++ b/classify/trainingsampleset.h @@ -24,11 +24,11 @@ #include "trainingsample.h" class UNICHARSET; -template class UnicityTable; namespace tesseract { struct FontInfo; +class FontInfoTable; class IntFeatureMap; class IntFeatureSpace; class TrainingSample; @@ -42,7 +42,7 @@ class UnicharAndFonts; // metrics. class TrainingSampleSet { public: - explicit TrainingSampleSet(const UnicityTable& fontinfo_table); + explicit TrainingSampleSet(const FontInfoTable& fontinfo_table); ~TrainingSampleSet(); // Writes to the given file. Returns false in case of error. @@ -67,6 +67,9 @@ class TrainingSampleSet { int charsetsize() const { return unicharset_size_; } + const FontInfoTable& fontinfo_table() const { + return fontinfo_table_; + } // Loads an initial unicharset, or sets one up if the file cannot be read. void LoadUnicharset(const char* filename); @@ -281,7 +284,7 @@ class TrainingSampleSet { // Reference to the fontinfo_table_ in MasterTrainer. Provides names // for font_ids in the samples. Not serialized! - const UnicityTable& fontinfo_table_; + const FontInfoTable& fontinfo_table_; }; } // namespace tesseract.