Merge branch 'old_doxygen_merge' into more-doxygen

This commit is contained in:
Jim O'Regan 2015-05-18 15:15:35 +01:00
commit 06190bad64
185 changed files with 12650 additions and 9704 deletions

View File

@ -1,6 +1,7 @@
Note that this is a text-only and possibly out-of-date version of the
wiki ReadMe, which is located at:
http://code.google.com/p/tesseract-ocr/wiki/ReadMe
https://github.com/tesseract-ocr/tesseract/blob/master/README
Introduction
============
@ -10,15 +11,15 @@ Originally developed at Hewlett Packard Laboratories Bristol and
at Hewlett Packard Co, Greeley Colorado, all the code
in this distribution is now licensed under the Apache License:
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
Dependencies and Licenses
@ -56,7 +57,7 @@ those that want to do their own training. Most users should NOT download
these files.
Instructions for using the training tools are documented separately at
Tesseract wiki http://code.google.com/p/tesseract-ocr/w/list
Tesseract wiki https://github.com/tesseract-ocr/tesseract/wiki
Windows
@ -64,6 +65,9 @@ Windows
Please use installer (for 3.00 and above). Tesseract is library with
command line interface. If you need GUI, please check AddOns wiki page
TODO-UPDATE-WIKI-LINKS
http://code.google.com/p/tesseract-ocr/wiki/AddOns#GUI
If you are building from the sources, the recommended build platform is
@ -82,6 +86,9 @@ tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfiles...]
If you need interface to other applications, please check wrapper section
on AddOns wiki page:
TODO-UPDATE-WIKI-LINKS
http://code.google.com/p/tesseract-ocr/wiki/AddOns#Tesseract_3.0x
@ -112,6 +119,10 @@ If you are linking to the libraries, as Ocropus does, please link to
libtesseract_api.
If you get `leptonica not found` and you've installed it with e.g. homebrew, you
can run `CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib" ./configure`
instead of `./configure` above.
History
=======

View File

@ -0,0 +1,4 @@
<!--
This file is needed by the android_native_library rule to determine the
project directory for ndk-build.
-->

1
android/Makefile.am Normal file
View File

@ -0,0 +1 @@
EXTRA_DIST = AndroidManifest.xml jni/Android.mk jni/Application.mk

57
android/jni/Android.mk Normal file
View File

@ -0,0 +1,57 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := tesseract-$(APP_ABI)
LOCAL_STATIC_LIBRARIES := \
mobile_base \
leptonica-$(APP_ABI)
LOCAL_C_INCLUDES := $(APP_C_INCLUDES)
LOCAL_C_INCLUDES += \
$(LOCAL_PATH)/../../api \
$(LOCAL_PATH)/../../ccmain\
$(LOCAL_PATH)/../../ccstruct\
$(LOCAL_PATH)/../../ccutil\
$(LOCAL_PATH)/../../classify\
$(LOCAL_PATH)/../../cutil\
$(LOCAL_PATH)/../../dict\
$(LOCAL_PATH)/../../image\
$(LOCAL_PATH)/../../textord\
$(LOCAL_PATH)/../../third_party\
$(LOCAL_PATH)/../../wordrec\
$(LOCAL_PATH)/../../opencl\
$(LOCAL_PATH)/../../viewer\
$(LOCAL_PATH)/../../../leptonica/include
$(info local c includes=$(LOCAL_C_INCLUDES))
$(info local path=$(LOCAL_PATH))
LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../ccmain/*.cpp $(LOCAL_PATH)/../../ccstruct/*.cpp $(LOCAL_PATH)/../../ccutil/*.cpp $(LOCAL_PATH)/../../classify/*.cpp $(LOCAL_PATH)/../../cutil/*.cpp $(LOCAL_PATH)/../../dict/*.cpp $(LOCAL_PATH)/../../image/*.cpp $(LOCAL_PATH)/../../textord/*.cpp $(LOCAL_PATH)/../../viewer/*.cpp $(LOCAL_PATH)/../../wordrec/*.cpp)
EXPLICIT_SRC_EXCLUDES := \
$(LOCAL_PATH)/../../ccmain/cubeclassifier.cpp \
$(LOCAL_PATH)/../../ccmain/cubeclassifier.h \
$(LOCAL_PATH)/../../ccmain/cube_control.cpp \
$(LOCAL_PATH)/../../ccmain/cube_reco_context.cpp \
$(LOCAL_PATH)/../../ccmain/cube_reco_context.h \
$(LOCAL_PATH)/../../ccmain/tesseract_cube_combiner.cpp \
$(LOCAL_PATH)/../../ccmain/tesseract_cube_combiner.h \
$(LOCAL_PATH)/../../api/pdfrenderer.cpp \
$(LOCAL_PATH)/../../api/tesseractmain.cpp \
LOCAL_SRC_FILES := $(filter-out $(EXPLICIT_SRC_EXCLUDES), $(LOCAL_SRC_FILES))
LOCAL_SRC_FILES := $(LOCAL_SRC_FILES:$(LOCAL_PATH)/%=%)
$(info local src files = $(LOCAL_SRC_FILES))
LOCAL_LDLIBS := -ldl -llog -ljnigraphics
LOCAL_CFLAGS := -DANDROID_BUILD -DGRAPHICS_DISABLED
include $(BUILD_SHARED_LIBRARY)
$(call import-module,mobile/base)
$(call import-module,mobile/base)
$(call import-module,mobile/util/hash)
$(call import-module,third_party/leptonica/android/jni)

View File

@ -0,0 +1,13 @@
# Include common.mk for building google3 native code.
DEPOT_PATH := $(firstword $(subst /google3, ,$(abspath $(call my-dir))))
ifneq ($(wildcard $(DEPOT_PATH)/google3/mobile/build/common.mk),)
include $(DEPOT_PATH)/google3/mobile/build/common.mk
else
include $(DEPOT_PATH)/READONLY/google3/mobile/build/common.mk
endif
# Specify the hash namespace that we're using, based on the APP_STL we're using.
APP_CFLAGS += -Werror -DHASH_NAMESPACE=__gnu_cxx -Wno-error=deprecated-register
APP_PLATFORM := android-16
APP_STL := gnustl_static
NDK_TOOLCHAIN_VERSION := clang

View File

@ -66,7 +66,7 @@ libtesseract_la_LIBADD = \
libtesseract_la_LDFLAGS += -version-info $(GENERIC_LIBRARY_VERSION)
bin_PROGRAMS = tesseract
tesseract_SOURCES = $(top_srcdir)/api/tesseractmain.cpp
tesseract_SOURCES = tesseractmain.cpp
tesseract_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
tesseract_CPPFLAGS += -DTESS_IMPORTS
@ -78,7 +78,7 @@ if USE_OPENCL
tesseract_LDADD += $(OPENCL_LIB)
endif
if MINGW
if T_WIN
tesseract_LDADD += -lws2_32
libtesseract_la_LDFLAGS += -no-undefined -Wl,--as-needed -lws2_32
endif

View File

@ -28,6 +28,7 @@
#if defined(_WIN32)
#ifdef _MSC_VER
#include "vcsversion.h"
#include "mathfix.h"
#elif MINGW
// workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME
@ -51,6 +52,7 @@
#include "allheaders.h"
#include "baseapi.h"
#include "blobclass.h"
#include "resultiterator.h"
#include "mutableiterator.h"
#include "thresholder.h"
@ -138,7 +140,11 @@ TessBaseAPI::~TessBaseAPI() {
* Returns the version identifier as a static string. Do not delete.
*/
const char* TessBaseAPI::Version() {
#if defined(GIT_REV) && (defined(DEBUG) || defined(_DEBUG))
return GIT_REV;
#else
return TESSERACT_VERSION_STR;
#endif
}
/**
@ -741,6 +747,7 @@ void TessBaseAPI::DumpPGM(const char* filename) {
fclose(fp);
}
#ifndef ANDROID_BUILD
/**
* Placeholder for call to Cube and test that the input data is correct.
* reskew is the direction of baselines in the skewed image in
@ -785,6 +792,7 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
ASSERT_HOST(pr_word == word_count);
return 0;
}
#endif
/**
* Runs page layout analysis in the mode set by SetPageSegMode.
@ -870,7 +878,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
page_res_ = NULL;
return -1;
} else if (tesseract_->tessedit_train_from_boxes) {
tesseract_->ApplyBoxTraining(*output_file_, page_res_);
STRING fontname;
ExtractFontName(*output_file_, &fontname);
tesseract_->ApplyBoxTraining(fontname, page_res_);
} else if (tesseract_->tessedit_ambigs_training) {
FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
// OCR the page segmented into words by tesseract.
@ -1019,6 +1029,7 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
int timeout_millisec,
TessResultRenderer* renderer,
int tessedit_page_number) {
#ifndef ANDROID_BUILD
Pix *pix = NULL;
#ifdef USE_OPENCL
OpenclDevice od;
@ -1049,6 +1060,26 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
if (tessedit_page_number >= 0) break;
}
return true;
#else
return false;
#endif
}
// Master ProcessPages calls ProcessPagesInternal and then does any post-
// processing required due to being in a training mode.
bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
int timeout_millisec,
TessResultRenderer* renderer) {
bool result =
ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
if (result) {
if (tesseract_->tessedit_train_from_boxes &&
!tesseract_->WriteTRFile(*output_file_)) {
tprintf("Write of TR file failed: %s\n", output_file_->string());
return false;
}
}
return result;
}
// In the ideal scenario, Tesseract will start working on data as soon
@ -1063,9 +1094,11 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
// identify the scenario that really matters: filelists on
// stdin. We'll still do our best if the user likes pipes. That means
// piling up any data coming into stdin into a memory buffer.
bool TessBaseAPI::ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
bool TessBaseAPI::ProcessPagesInternal(const char* filename,
const char* retry_config,
int timeout_millisec,
TessResultRenderer* renderer) {
#ifndef ANDROID_BUILD
PERF_COUNT_START("ProcessPages")
bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
if (stdInput) {
@ -1153,6 +1186,9 @@ bool TessBaseAPI::ProcessPages(const char* filename,
}
PERF_COUNT_END
return true;
#else
return false;
#endif
}
bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
@ -1186,8 +1222,10 @@ bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
failed = Recognize(NULL) < 0;
}
if (tesseract_->tessedit_write_images) {
#ifndef ANDROID_BUILD
Pix* page_pix = GetThresholdedImage();
pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
#endif
}
if (failed && retry_config != NULL && retry_config[0] != '\0') {
// Save current config variables before switching modes.
@ -1477,11 +1515,7 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) {
if (grapheme[1] == 0) {
hocr_str += HOcrEscape(grapheme);
} else {
hocr_str += grapheme;
}
}
delete []grapheme;
res_it->Next(RIL_SYMBOL);
@ -1892,6 +1926,10 @@ void TessBaseAPI::ClearPersistentCache() {
int TessBaseAPI::IsValidWord(const char *word) {
return tesseract_->getDict().valid_word(word);
}
// Returns true if utf8_character is defined in the UniCharset.
bool TessBaseAPI::IsValidCharacter(const char *utf8_character) {
return tesseract_->unicharset.contains_unichar(utf8_character);
}
// TODO(rays) Obsolete this function and replace with a more aptly named
@ -2592,10 +2630,12 @@ int TessBaseAPI::NumDawgs() const {
return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
}
#ifndef ANDROID_BUILD
/** Return a pointer to underlying CubeRecoContext object if present. */
CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const {
return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext();
}
#endif
/** Escape a char string - remove <>&"' with HTML codes. */
STRING HOcrEscape(const char* text) {

View File

@ -538,9 +538,11 @@ class TESS_API TessBaseAPI {
*
* Returns true if successful, false on error.
*/
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
bool ProcessPages(const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer);
// Does the real work of ProcessPages.
bool ProcessPagesInternal(const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer);
/**
* Turn a single image into symbolic text.
@ -656,6 +658,9 @@ class TESS_API TessBaseAPI {
* in a separate API at some future time.
*/
int IsValidWord(const char *word);
// Returns true if utf8_character is defined in the UniCharset.
bool IsValidCharacter(const char *utf8_character);
bool GetTextDirection(int* out_offset, float* out_slope);

View File

@ -667,6 +667,18 @@ TESS_API void TESS_CALL TessPageIteratorOrientation(TessPageIterator* handle, Te
handle->Orientation(orientation, writing_direction, textline_order, deskew_angle);
}
TESS_API void TESS_CALL TessPageIteratorParagraphInfo(TessPageIterator* handle, TessParagraphJustification* justification,
BOOL *is_list_item, BOOL *is_crown, int *first_line_indent)
{
bool bool_is_list_item, bool_is_crown;
handle->ParagraphInfo(justification, &bool_is_list_item, &bool_is_crown, first_line_indent);
if (is_list_item)
*is_list_item = bool_is_list_item ? TRUE : FALSE;
if (is_crown)
*is_crown = bool_is_crown ? TRUE : FALSE;
}
TESS_API void TESS_CALL TessResultIteratorDelete(TessResultIterator* handle)
{
delete handle;
@ -687,7 +699,7 @@ TESS_API const TessPageIterator* TESS_CALL TessResultIteratorGetPageIteratorCons
return handle;
}
TESS_API const TessChoiceIterator* TESS_CALL TessResultIteratorGetChoiceIterator(const TessResultIterator* handle)
TESS_API TessChoiceIterator* TESS_CALL TessResultIteratorGetChoiceIterator(const TessResultIterator* handle)
{
return new TessChoiceIterator(*handle);
}

View File

@ -53,6 +53,7 @@ typedef tesseract::Dawg TessDawg;
typedef tesseract::TruthCallback TessTruthCallback;
typedef tesseract::CubeRecoContext TessCubeRecoContext;
typedef tesseract::Orientation TessOrientation;
typedef tesseract::ParagraphJustification TessParagraphJustification;
typedef tesseract::WritingDirection TessWritingDirection;
typedef tesseract::TextlineOrder TessTextlineOrder;
typedef PolyBlockType TessPolyBlockType;
@ -77,6 +78,7 @@ typedef enum TessPolyBlockType { PT_UNKNOWN, PT_FLOWING_TEXT, PT_HEADING_TEX
PT_TABLE, PT_VERTICAL_TEXT, PT_CAPTION_TEXT, PT_FLOWING_IMAGE, PT_HEADING_IMAGE,
PT_PULLOUT_IMAGE, PT_HORZ_LINE, PT_VERT_LINE, PT_NOISE, PT_COUNT } TessPolyBlockType;
typedef enum TessOrientation { ORIENTATION_PAGE_UP, ORIENTATION_PAGE_RIGHT, ORIENTATION_PAGE_DOWN, ORIENTATION_PAGE_LEFT } TessOrientation;
typedef enum TessParagraphJustification { JUSTIFICATION_UNKNOWN, JUSTIFICATION_LEFT, JUSTIFICATION_CENTER, JUSTIFICATION_RIGHT } TessParagraphJustification;
typedef enum TessWritingDirection { WRITING_DIRECTION_LEFT_TO_RIGHT, WRITING_DIRECTION_RIGHT_TO_LEFT, WRITING_DIRECTION_TOP_TO_BOTTOM } TessWritingDirection;
typedef enum TessTextlineOrder { TEXTLINE_ORDER_LEFT_TO_RIGHT, TEXTLINE_ORDER_RIGHT_TO_LEFT, TEXTLINE_ORDER_TOP_TO_BOTTOM } TessTextlineOrder;
typedef struct ETEXT_DESC ETEXT_DESC;
@ -299,7 +301,7 @@ TESS_API TessCubeRecoContext*
TESS_API void TESS_CALL TessBaseAPISetMinOrientationMargin(TessBaseAPI* handle, double margin);
#ifdef TESS_CAPI_INCLUDE_BASEAPI
TESS_API void TESS_CALL TessBaseGetBlockTextOrientations(TessBaseAPI* handle, int** block_orientation, bool** vertical_writing);
TESS_API void TESS_CALL TessBaseGetBlockTextOrientations(TessBaseAPI* handle, int** block_orientation, BOOL** vertical_writing);
TESS_API BLOCK_LIST*
TESS_CALL TessBaseAPIFindLinesCreateBlockList(TessBaseAPI* handle);
@ -335,6 +337,9 @@ TESS_API void TESS_CALL TessPageIteratorOrientation(TessPageIterator* handle, T
TessWritingDirection* writing_direction, TessTextlineOrder* textline_order,
float* deskew_angle);
TESS_API void TESS_CALL TessPageIteratorParagraphInfo(TessPageIterator* handle, TessParagraphJustification* justification,
BOOL *is_list_item, BOOL *is_crown, int *first_line_indent);
/* Result iterator */
TESS_API void TESS_CALL TessResultIteratorDelete(TessResultIterator* handle);
@ -344,7 +349,7 @@ TESS_API TessPageIterator*
TESS_CALL TessResultIteratorGetPageIterator(TessResultIterator* handle);
TESS_API const TessPageIterator*
TESS_CALL TessResultIteratorGetPageIteratorConst(const TessResultIterator* handle);
TESS_API const TessChoiceIterator*
TESS_API TessChoiceIterator*
TESS_CALL TessResultIteratorGetChoiceIterator(const TessResultIterator* handle);
TESS_API BOOL TESS_CALL TessResultIteratorNext(TessResultIterator* handle, TessPageIteratorLevel level);

View File

@ -14,6 +14,139 @@
#include "mathfix.h"
#endif
/*
Design notes from Ken Sharp, with light editing.
We think one solution is a font with a single glyph (.notdef) and a
CIDToGIDMap which maps all the CIDs to 0. That map would then be
stored as a stream in the PDF file, and when flate compressed should
be pretty small. The font, of course, will be approximately the same
size as the one you currently use.
I'm working on such a font now, the CIDToGIDMap is trivial, you just
create a stream object which contains 128k bytes (2 bytes per possible
CID and your CIDs range from 0 to 65535) and where you currently have
"/CIDToGIDMap /Identity" you would have "/CIDToGIDMap <object> 0 R".
Note that if, in future, you were to use a different (ie not 2 byte)
CMap for character codes you could trivially extend the CIDToGIDMap.
The following is an explanation of how some of the font stuff works,
this may be too simple for you in which case please accept my
apologies, its hard to know how much knowledge someone has. You can
skip all this anyway, its just for information.
The font embedded in a PDF file is usually intended just to be
rendered, but extensions allow for at least some ability to locate (or
copy) text from a document. This isn't something which was an original
goal of the PDF format, but its been retro-fitted, presumably due to
popular demand.
To do this reliably the PDF file must contain a ToUnicode CMap, a
device for mapping character codes to Unicode code points. If one of
these is present, then this will be used to convert the character
codes into Unicode values. If its not present then the reader will
fall back through a series of heuristics to try and guess the
result. This is, as you would expect, prone to failure.
This doesn't concern you of course, since you always write a ToUnicode
CMap, so because you are writing the text in text rendering mode 3 it
would seem that you don't really need to worry about this, but in the
PDF spec you cannot have an isolated ToUnicode CMap, it has to be
attached to a font, so in order to get even copy/paste to work you
need to define a font.
This is what leads to problems, tools like pdfwrite assume that they
are going to be able to (or even have to) modify the font entries, so
they require that the font being embedded be valid, and to be honest
the font Tesseract embeds isn't valid (for this purpose).
To see why lets look at how text is specified in a PDF file:
(Test) Tj
Now that looks like text but actually it isn't. Each of those bytes is
a 'character code'. When it comes to rendering the text a complex
sequence of events takes place, which converts the character code into
'something' which the font understands. Its entirely possible via
character mappings to have that text render as 'Sftu'
For simple fonts (PostScript type 1), we use the character code as the
index into an Encoding array (256 elements), each element of which is
a glyph name, so this gives us a glyph name. We then consult the
CharStrings dictionary in the font, that's a complex object which
contains pairs of keys and values, you can use the key to retrieve a
given value. So we have a glyph name, we then use that as the key to
the dictionary and retrieve the associated value. For a type 1 font,
the value is a glyph program that describes how to draw the glyph.
For CIDFonts, its a little more complicated. Because CIDFonts can be
large, using a glyph name as the key is unreasonable (it would also
lead to unfeasibly large Encoding arrays), so instead we use a 'CID'
as the key. CIDs are just numbers.
But.... We don't use the character code as the CID. What we do is use
a CMap to convert the character code into a CID. We then use the CID
to key the CharStrings dictionary and proceed as before. So the 'CMap'
is the equivalent of the Encoding array, but its a more compact and
flexible representation.
Note that you have to use the CMap just to find out how many bytes
constitute a character code, and it can be variable. For example you
can say if the first byte is 0x00->0x7f then its just one byte, if its
0x80->0xf0 then its 2 bytes and if its 0xf0->0xff then its 3 bytes. I
have seen CMaps defining character codes up to 5 bytes wide.
Now that's fine for 'PostScript' CIDFonts, but its not sufficient for
TrueType CIDFonts. The thing is that TrueType fonts are accessed using
a Glyph ID (GID) (and the LOCA table) which may well not be anything
like the CID. So for this case PDF includes a CIDToGIDMap. That maps
the CIDs to GIDs, and we can then use the GID to get the glyph
description from the GLYF table of the font.
So for a TrueType CIDFont, character-code->CID->GID->glyf-program.
Looking at the PDF file I was supplied with we see that it contains
text like :
<0x0075> Tj
So we start by taking the character code (117) and look it up in the
CMap. Well you don't supply a CMap, you just use the Identity-H one
which is predefined. So character code 117 maps to CID 117. Then we
use the CIDToGIDMap, again you don't supply one, you just use the
predefined 'Identity' map. So CID 117 maps to GID 117. But the font we
were supplied with only contains 116 glyphs.
Now for Latin that's not a huge problem, you can just supply a bigger
font. But for more complex languages that *is* going to be more of a
problem. Either you need to supply a font which contains glyphs for
all the possible CID->GID mappings, or we need to think laterally.
Our solution using a TrueType CIDFont is to intervene at the
CIDToGIDMap stage and convert all the CIDs to GID 0. Then we have a
font with just one glyph, the .notdef glyph at GID 0. This is what I'm
looking into now.
It would also be possible to have a 'PostScript' (ie type 1 outlines)
CIDFont which contained 1 glyph, and a CMap which mapped all character
codes to CID 0. The effect would be the same.
Its possible (I haven't checked) that the PostScript CIDFont and
associated CMap would be smaller than the TrueType font and associated
CIDToGIDMap.
--- in a followup ---
OK there is a small problem there, if I use GID 0 then Acrobat gets
upset about it and complains it cannot extract the font. If I set the
CIDToGIDMap so that all the entries are 1 instead, its happy. Totally
mad......
*/
namespace tesseract {
// Use for PDF object fragments. Must be large enough
@ -60,74 +193,22 @@ long dist2(int x1, int y1, int x2, int y2) {
return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
}
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
double width, double height) {
double ppi = api->GetSourceYResolution();
STRING pdf_str("");
double old_x = 0.0, old_y = 0.0;
int old_pointsize = 0;
// TODO(jbreiden) Slightly cleaner from an abstraction standpoint
// if this were to live inside a separate text object.
pdf_str += "q ";
pdf_str.add_str_double("", prec(width));
pdf_str += " 0 0 ";
pdf_str.add_str_double("", prec(height));
pdf_str += " 0 0 cm /Im1 Do Q\n";
ResultIterator *res_it = api->GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink
old_pointsize = 0.0; // Every block will declare its font
}
int line_x1, line_y1, line_x2, line_y2;
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
res_it->Baseline(RIL_TEXTLINE,
&line_x1, &line_y1, &line_x2, &line_y2);
double rise = abs(line_y2 - line_y1) * 72 / ppi;
double run = abs(line_x2 - line_x1) * 72 / ppi;
// There are some really stupid PDF viewers in the wild, such as
// 'Preview' which ships with the Mac. They might do a better
// job with text selection and highlighting when given perfectly
// straight text instead of very slightly tilted text. I chose
// this threshold large enough to absorb noise, but small enough
// that lines probably won't cross each other if the whole page
// is tilted at almost exactly the clipping threshold.
if (rise < 2.0 && 2.0 < run)
line_y1 = line_y2 = (line_y1 + line_y2) / 2;
}
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}
int word_x1, word_y1, word_x2, word_y2;
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
// The critical one is writing_direction
tesseract::Orientation orientation;
tesseract::WritingDirection writing_direction;
tesseract::TextlineOrder textline_order;
float deskew_angle;
res_it->Orientation(&orientation, &writing_direction,
&textline_order, &deskew_angle);
// Unlike Tesseract, we always want the word baseline in reading order.
// Viewers like evince can get really confused during copy-paste when
// the baseline wanders around. So I've decided to project every word
// onto the (straight) line baseline. All numbers are in the native
// PDF coordinate system, which has the origin in the bottom left and
// the unit is points, which is 1/72 inch. Tesseract reports baselines
// left-to-right no matter what the reading order is. We need the
// word baseline in reading order, so we do that conversion here. Returns
// the word's baseline origin and length.
void GetWordBaseline(int writing_direction, int ppi, int height,
int word_x1, int word_y1, int word_x2, int word_y2,
int line_x1, int line_y1, int line_x2, int line_y2,
double *x0, double *y0, double *length) {
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
Swap(&word_x1, &word_x2);
Swap(&word_y1, &word_y2);
}
// Viewers like evince can get really confused during copy-paste
// when the baseline wanders around. I've decided to force every
// word to match the (straight) baseline. The math below is just
// projecting the word origin onto the baseline. All numbers are
// in the native PDF coordinate system, which has the origin in
// the bottom left and the unit is points, which is 1/72 inch.
double word_length;
double x, y;
{
@ -149,76 +230,179 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
x = x * 72 / ppi;
y = height - (y * 72.0 / ppi);
}
*x0 = x;
*y0 = y;
*length = word_length;
}
int pointsize = 0;
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
// Calculate the rotation angle in the PDF cooordinate system,
// which has the origin in the bottom left. The Tesseract
// coordinate system has the origin in the upper left.
//
// PDF is kind of a like turtle graphics, and we orient the
// turtle (errr... initial cursor position) with an affine
// transformation.
//
// Rotate RTL Translate
//
// [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ]
// [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ]
// [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ]
//
// Compute coefficients for an affine matrix describing the rotation
// of the text. If the text is right-to-left such as Arabic or Hebrew,
// we reflect over the Y-axis. This matrix will set the coordinate
// system for placing text in the PDF file.
//
// RTL
// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
// [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ]
void AffineMatrix(int writing_direction,
int line_x1, int line_y1, int line_x2, int line_y2,
double *a, double *b, double *c, double *d) {
double theta = atan2(static_cast<double>(line_y1 - line_y2),
static_cast<double>(line_x2 - line_x1));
double a, b, c, d;
a = cos(theta);
b = sin(theta);
c = -sin(theta);
d = cos(theta);
*a = cos(theta);
*b = sin(theta);
*c = -sin(theta);
*d = cos(theta);
switch(writing_direction) {
case WRITING_DIRECTION_RIGHT_TO_LEFT:
a = -a;
b = -b;
c = -c;
*a = -*a;
*b = -*b;
break;
case WRITING_DIRECTION_TOP_TO_BOTTOM:
// TODO(jbreiden) Consider switching PDF writing mode to vertical.
// TODO(jbreiden) Consider using the vertical PDF writing mode.
break;
default:
break;
}
}
pdf_str.add_str_double("", prec(a)); // . This affine matrix
// There are some really stupid PDF viewers in the wild, such as
// 'Preview' which ships with the Mac. They do a better job with text
// selection and highlighting when given perfectly flat baseline
// instead of very slightly tilted. We clip small tilts to appease
// these viewers. I chose this threshold large enough to absorb noise,
// but small enough that lines probably won't cross each other if the
// whole page is tilted at almost exactly the clipping threshold.
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
int *line_x1, int *line_y1,
int *line_x2, int *line_y2) {
*line_x1 = x1;
*line_y1 = y1;
*line_x2 = x2;
*line_y2 = y2;
double rise = abs(y2 - y1) * 72 / ppi;
double run = abs(x2 - x1) * 72 / ppi;
if (rise < 2.0 && 2.0 < run)
*line_y1 = *line_y2 = (y1 + y2) / 2;
}
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
double width, double height) {
STRING pdf_str("");
double ppi = api->GetSourceYResolution();
// These initial conditions are all arbitrary and will be overwritten
double old_x = 0.0, old_y = 0.0;
int old_fontsize = 0;
tesseract::WritingDirection old_writing_direction =
WRITING_DIRECTION_LEFT_TO_RIGHT;
bool new_block = true;
int fontsize = 0;
double a = 1;
double b = 0;
double c = 0;
double d = 1;
// TODO(jbreiden) This marries the text and image together.
// Slightly cleaner from an abstraction standpoint if this were to
// live inside a separate text object.
pdf_str += "q ";
pdf_str.add_str_double("", prec(width));
pdf_str += " 0 0 ";
pdf_str.add_str_double("", prec(height));
pdf_str += " 0 0 cm /Im1 Do Q\n";
ResultIterator *res_it = api->GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
pdf_str += "BT\n3 Tr"; // Begin text object, use invisible ink
old_fontsize = 0; // Every block will declare its fontsize
new_block = true; // Every block will declare its affine matrix
}
int line_x1, line_y1, line_x2, line_y2;
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
int x1, y1, x2, y2;
res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
}
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}
// Writing direction changes at a per-word granularity
tesseract::WritingDirection writing_direction;
{
tesseract::Orientation orientation;
tesseract::TextlineOrder textline_order;
float deskew_angle;
res_it->Orientation(&orientation, &writing_direction,
&textline_order, &deskew_angle);
if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
switch (res_it->WordDirection()) {
case DIR_LEFT_TO_RIGHT:
writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
break;
case DIR_RIGHT_TO_LEFT:
writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
break;
default:
writing_direction = old_writing_direction;
}
}
}
// Where is word origin and how long is it?
double x, y, word_length;
{
int word_x1, word_y1, word_x2, word_y2;
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
GetWordBaseline(writing_direction, ppi, height,
word_x1, word_y1, word_x2, word_y2,
line_x1, line_y1, line_x2, line_y2,
&x, &y, &word_length);
}
if (writing_direction != old_writing_direction || new_block) {
AffineMatrix(writing_direction,
line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
pdf_str.add_str_double(" ", prec(a)); // . This affine matrix
pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate
pdf_str.add_str_double(" ", prec(c)); // . system for all
pdf_str.add_str_double(" ", prec(d)); // . text in the entire
pdf_str.add_str_double(" ", prec(x)); // . line.
pdf_str.add_str_double(" ", prec(d)); // . text that follows.
pdf_str.add_str_double(" ", prec(x)); // .
pdf_str.add_str_double(" ", prec(y)); // .
pdf_str += (" Tm "); // Place cursor absolutely
new_block = false;
} else {
double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y)));
pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts
pdf_str.add_str_double(" ", 0); // Delta y in pts
double dx = x - old_x;
double dy = y - old_y;
pdf_str.add_str_double(" ", prec(dx * a + dy * b));
pdf_str.add_str_double(" ", prec(dx * c + dy * d));
pdf_str += (" Td "); // Relative moveto
}
old_x = x;
old_y = y;
old_writing_direction = writing_direction;
// Adjust font size on a per word granularity. Pay attention to
// pointsize, old_pointsize, and pdf_str. We've found that for
// in Arabic, Tesseract will happily return a pointsize of zero,
// fontsize, old_fontsize, and pdf_str. We've found that for
// in Arabic, Tesseract will happily return a fontsize of zero,
// so we make up a default number to protect ourselves.
{
bool bold, italic, underlined, monospace, serif, smallcaps;
int font_id;
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
&serif, &smallcaps, &pointsize, &font_id);
const int kDefaultPointSize = 8;
if (pointsize <= 0)
pointsize = kDefaultPointSize;
if (pointsize != old_pointsize) {
&serif, &smallcaps, &fontsize, &font_id);
const int kDefaultFontsize = 8;
if (fontsize <= 0)
fontsize = kDefaultFontsize;
if (fontsize != old_fontsize) {
char textfont[20];
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
pdf_str += textfont;
old_pointsize = pointsize;
old_fontsize = fontsize;
}
}
@ -243,9 +427,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
delete []grapheme;
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
double h_stretch =
kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len));
kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
pdf_str.add_str_double("", h_stretch);
pdf_str += " Tz"; // horizontal stretch
pdf_str += " [ ";
@ -267,21 +451,25 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
bool TessPDFRenderer::BeginDocumentHandler() {
char buf[kBasicBufSize];
size_t n;
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"%%PDF-1.5\n"
"%%%c%c%c%c\n",
0xDE, 0xAD, 0xBE, 0xEB);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
// CATALOG
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"1 0 obj\n"
"<<\n"
" /Type /Catalog\n"
" /Pages %ld 0 R\n"
">>\n"
"endobj\n", 2L);
"endobj\n",
2L);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
// We are reserving object #2 for the /Pages
@ -290,7 +478,7 @@ bool TessPDFRenderer::BeginDocumentHandler() {
AppendPDFObject("");
// TYPE0 FONT
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"3 0 obj\n"
"<<\n"
" /BaseFont /GlyphLessFont\n"
@ -302,16 +490,17 @@ bool TessPDFRenderer::BeginDocumentHandler() {
">>\n"
"endobj\n",
4L, // CIDFontType2 font
5L // ToUnicode
6L // ToUnicode
);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
// CIDFONTTYPE2
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"4 0 obj\n"
"<<\n"
" /BaseFont /GlyphLessFont\n"
" /CIDToGIDMap /Identity\n"
" /CIDToGIDMap %ld 0 R\n"
" /CIDSystemInfo\n"
" <<\n"
" /Ordering (Identity)\n"
@ -324,10 +513,44 @@ bool TessPDFRenderer::BeginDocumentHandler() {
" /DW %d\n"
">>\n"
"endobj\n",
6L, // Font descriptor
5L, // CIDToGIDMap
7L, // Font descriptor
1000 / kCharWidth);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
// CIDTOGIDMAP
const int kCIDToGIDMapSize = 2 * (1 << 16);
unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize];
for (int i = 0; i < kCIDToGIDMapSize; i++) {
cidtogidmap[i] = (i % 2) ? 1 : 0;
}
size_t len;
unsigned char *comp =
zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
delete[] cidtogidmap;
n = snprintf(buf, sizeof(buf),
"5 0 obj\n"
"<<\n"
" /Length %ld /Filter /FlateDecode\n"
">>\n"
"stream\n", len);
if (n >= sizeof(buf)) {
lept_free(comp);
return false;
}
AppendString(buf);
long objsize = strlen(buf);
AppendData(reinterpret_cast<char *>(comp), len);
objsize += len;
lept_free(comp);
const char *endstream_endobj =
"endstream\n"
"endobj\n";
AppendString(endstream_endobj);
objsize += strlen(endstream_endobj);
AppendPDFObjectDIY(objsize);
const char *stream =
"/CIDInit /ProcSet findresource begin\n"
"12 dict begin\n"
@ -352,19 +575,20 @@ bool TessPDFRenderer::BeginDocumentHandler() {
"end\n";
// TOUNICODE
snprintf(buf, sizeof(buf),
"5 0 obj\n"
n = snprintf(buf, sizeof(buf),
"6 0 obj\n"
"<< /Length %lu >>\n"
"stream\n"
"%s"
"endstream\n"
"endobj\n", (unsigned long) strlen(stream), stream);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
// FONT DESCRIPTOR
const int kCharHeight = 2; // Effect: highlights are half height
snprintf(buf, sizeof(buf),
"6 0 obj\n"
n = snprintf(buf, sizeof(buf),
"7 0 obj\n"
"<<\n"
" /Ascent %d\n"
" /CapHeight %d\n"
@ -382,14 +606,18 @@ bool TessPDFRenderer::BeginDocumentHandler() {
1000 / kCharHeight,
1000 / kCharWidth,
1000 / kCharHeight,
7L // Font data
8L // Font data
);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
if (n >= sizeof(buf)) return false;
FILE *fp = fopen(buf, "rb");
if (!fp)
if (!fp) {
fprintf(stderr, "Can not open file \"%s\"!\n", buf);
return false;
}
fseek(fp, 0, SEEK_END);
long int size = ftell(fp);
fseek(fp, 0, SEEK_SET);
@ -401,33 +629,31 @@ bool TessPDFRenderer::BeginDocumentHandler() {
}
fclose(fp);
// FONTFILE2
snprintf(buf, sizeof(buf),
"7 0 obj\n"
n = snprintf(buf, sizeof(buf),
"8 0 obj\n"
"<<\n"
" /Length %ld\n"
" /Length1 %ld\n"
">>\n"
"stream\n", size, size);
if (n >= sizeof(buf)) return false;
AppendString(buf);
size_t objsize = strlen(buf);
objsize = strlen(buf);
AppendData(buffer, size);
delete[] buffer;
objsize += size;
snprintf(buf, sizeof(buf),
"endstream\n"
"endobj\n");
AppendString(buf);
objsize += strlen(buf);
AppendString(endstream_endobj);
objsize += strlen(endstream_endobj);
AppendPDFObjectDIY(objsize);
return true;
}
bool TessPDFRenderer::imageToPDFObj(TessBaseAPI* api,
Pix *pix,
bool TessPDFRenderer::imageToPDFObj(Pix *pix,
char *filename,
long int objnum,
char **pdf_object,
long int *pdf_object_size) {
size_t n;
char b0[kBasicBufSize];
char b1[kBasicBufSize];
char b2[kBasicBufSize];
@ -439,18 +665,26 @@ bool TessPDFRenderer::imageToPDFObj(TessBaseAPI* api,
return false;
L_COMP_DATA *cid = NULL;
int kJpegQuality;
int encoding_type;
api->GetIntVariable("tessedit_pdf_jpg_quality", &kJpegQuality);
api->GetIntVariable("tessedit_pdf_compression", &encoding_type);
if (encoding_type > 0 && encoding_type < 4) {
if (pixGenerateCIData(pix, encoding_type, kJpegQuality, 0, &cid) != 0)
return false;
const int kJpegQuality = 85;
// TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain
// types of PNG files, especially if there are 2 samples per pixel.
// We can get rid of this logic after Leptonica 1.72 is released and
// has propagated everywhere. Bug discussion as follows.
// https://code.google.com/p/tesseract-ocr/issues/detail?id=1300
int format, sad;
findFileFormat(filename, &format);
if (pixGetSpp(pix) == 4 && format == IFF_PNG) {
pixSetSpp(pix, 3);
sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
} else {
l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
sad = l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
}
if (!cid)
if (sad || !cid) {
l_CIDataDestroy(&cid);
return false;
}
const char *group4 = "";
const char *filter;
@ -473,28 +707,26 @@ bool TessPDFRenderer::imageToPDFObj(TessBaseAPI* api,
return false;
}
// Prevent data corruption. Otherwise we'll end up clipping the
// PDF representation of the colormap.
if (cid->ncolors > 256) {
l_CIDataDestroy(&cid);
return false;
}
// Maybe someday we will accept RGBA but today is not that day.
// It requires creating an /SMask for the alpha channel.
// http://stackoverflow.com/questions/14220221
const char *colorspace;
if (cid->ncolors > 0) {
snprintf(b0, sizeof(b0), "[ /Indexed /DeviceRGB %d %s ]",
n = snprintf(b0, sizeof(b0),
" /ColorSpace [ /Indexed /DeviceRGB %d %s ]\n",
cid->ncolors - 1, cid->cmapdatahex);
if (n >= sizeof(b0)) {
l_CIDataDestroy(&cid);
return false;
}
colorspace = b0;
} else {
switch (cid->spp) {
case 1:
colorspace = "/DeviceGray";
colorspace = " /ColorSpace /DeviceGray\n";
break;
case 3:
colorspace = "/DeviceRGB";
colorspace = " /ColorSpace /DeviceRGB\n";
break;
default:
l_CIDataDestroy(&cid);
@ -502,52 +734,75 @@ bool TessPDFRenderer::imageToPDFObj(TessBaseAPI* api,
}
}
const char *predictor = (cid->predictor) ? " /Predictor 14\n" : "";
int predictor = (cid->predictor) ? 14 : 1;
// IMAGE
snprintf(b1, sizeof(b1),
n = snprintf(b1, sizeof(b1),
"%ld 0 obj\n"
"<<\n"
" /Length %ld\n"
" /Subtype /Image\n"
" /ColorSpace %s\n"
" /Subtype /Image\n",
objnum, (unsigned long) cid->nbytescomp);
if (n >= sizeof(b1)) {
l_CIDataDestroy(&cid);
return false;
}
n = snprintf(b2, sizeof(b2),
" /Width %d\n"
" /Height %d\n"
" /BitsPerComponent %d\n"
" /Filter %s\n"
" /DecodeParms\n"
" <<\n"
"%s"
" /Predictor %d\n"
" /Colors %d\n"
"%s"
" /Columns %d\n"
" /BitsPerComponent %d\n"
" >>\n"
">>\n"
"stream\n",
objnum, (unsigned long) cid->nbytescomp, colorspace,
cid->w, cid->h, cid->bps, filter, predictor, group4,
cid->w, cid->bps);
size_t b1_len = strlen(b1);
snprintf(b2, sizeof(b2),
"\n"
"endstream\n"
"endobj\n");
size_t b2_len = strlen(b2);
cid->w, cid->h, cid->bps, filter, predictor, cid->spp,
group4, cid->w, cid->bps);
if (n >= sizeof(b2)) {
l_CIDataDestroy(&cid);
return false;
}
*pdf_object_size = b1_len + cid->nbytescomp + b2_len;
const char *b3 =
"endstream\n"
"endobj\n";
size_t b1_len = strlen(b1);
size_t b2_len = strlen(b2);
size_t b3_len = strlen(b3);
size_t colorspace_len = strlen(colorspace);
*pdf_object_size =
b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
*pdf_object = new char[*pdf_object_size];
if (!pdf_object) {
l_CIDataDestroy(&cid);
return false;
}
memcpy(*pdf_object, b1, b1_len);
memcpy(*pdf_object + b1_len, cid->datacomp, cid->nbytescomp);
memcpy(*pdf_object + b1_len + cid->nbytescomp, b2, b2_len);
char *p = *pdf_object;
memcpy(p, b1, b1_len);
p += b1_len;
memcpy(p, colorspace, colorspace_len);
p += colorspace_len;
memcpy(p, b2, b2_len);
p += b2_len;
memcpy(p, cid->datacomp, cid->nbytescomp);
p += cid->nbytescomp;
memcpy(p, b3, b3_len);
l_CIDataDestroy(&cid);
return true;
}
bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
size_t n;
char buf[kBasicBufSize];
Pix *pix = api->GetInputImage();
char *filename = (char *)api->GetInputName();
@ -558,7 +813,7 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
double height = pixGetHeight(pix) * 72.0 / ppi;
// PAGE
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"%ld 0 obj\n"
"<<\n"
" /Type /Page\n"
@ -580,6 +835,7 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
obj_ + 1, // Contents object
obj_ + 2, // Image object
3L); // Type0 Font
if (n >= sizeof(buf)) return false;
pages_.push_back(obj_);
AppendPDFObject(buf);
@ -589,32 +845,34 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
size_t len;
unsigned char *comp_pdftext =
zlibCompress(pdftext_casted,
pdftext_len,
&len);
zlibCompress(pdftext_casted, pdftext_len, &len);
long comp_pdftext_len = len;
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"%ld 0 obj\n"
"<<\n"
" /Length %ld /Filter /FlateDecode\n"
">>\n"
"stream\n", obj_, comp_pdftext_len);
if (n >= sizeof(buf)) {
delete[] pdftext;
lept_free(comp_pdftext);
return false;
}
AppendString(buf);
long objsize = strlen(buf);
AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
objsize += comp_pdftext_len;
lept_free(comp_pdftext);
delete[] pdftext;
snprintf(buf, sizeof(buf),
const char *b2 =
"endstream\n"
"endobj\n");
AppendString(buf);
objsize += strlen(buf);
"endobj\n";
AppendString(b2);
objsize += strlen(b2);
AppendPDFObjectDIY(objsize);
char *pdf_object;
if (!imageToPDFObj(api, pix, filename, obj_, &pdf_object, &objsize)) {
if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
return false;
}
AppendData(pdf_object, objsize);
@ -625,6 +883,7 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
bool TessPDFRenderer::EndDocumentHandler() {
size_t n;
char buf[kBasicBufSize];
// We reserved the /Pages object number early, so that the /Page
@ -636,31 +895,34 @@ bool TessPDFRenderer::EndDocumentHandler() {
// PAGES
const long int kPagesObjectNumber = 2;
offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"%ld 0 obj\n"
"<<\n"
" /Type /Pages\n"
" /Kids [ ", kPagesObjectNumber);
if (n >= sizeof(buf)) return false;
AppendString(buf);
size_t pages_objsize = strlen(buf);
for (size_t i = 0; i < pages_.size(); i++) {
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"%ld 0 R ", pages_[i]);
if (n >= sizeof(buf)) return false;
AppendString(buf);
pages_objsize += strlen(buf);
}
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"]\n"
" /Count %d\n"
">>\n"
"endobj\n", pages_.size());
if (n >= sizeof(buf)) return false;
AppendString(buf);
pages_objsize += strlen(buf);
offsets_.back() += pages_objsize; // manipulation #2
// INFO
char* datestr = l_getFormattedDate();
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"%ld 0 obj\n"
"<<\n"
" /Producer (Tesseract %s)\n"
@ -669,18 +931,20 @@ bool TessPDFRenderer::EndDocumentHandler() {
">>\n"
"endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
lept_free(datestr);
if (n >= sizeof(buf)) return false;
AppendPDFObject(buf);
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"xref\n"
"0 %ld\n"
"0000000000 65535 f \n", obj_);
if (n >= sizeof(buf)) return false;
AppendString(buf);
for (int i = 1; i < obj_; i++) {
snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
if (n >= sizeof(buf)) return false;
AppendString(buf);
}
snprintf(buf, sizeof(buf),
n = snprintf(buf, sizeof(buf),
"trailer\n"
"<<\n"
" /Size %ld\n"
@ -694,9 +958,8 @@ bool TessPDFRenderer::EndDocumentHandler() {
1L, // catalog
obj_ - 1, // info
offsets_.back());
if (n >= sizeof(buf)) return false;
AppendString(buf);
return true;
}
} // namespace tesseract

View File

@ -114,6 +114,13 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
AppendString(utf8);
delete[] utf8;
bool pageBreak = false;
api->GetBoolVariable("include_page_breaks", &pageBreak);
const char* pageSeparator = api->GetStringVariable("page_separator");
if (pageBreak) {
AppendString(pageSeparator);
}
return true;
}

View File

@ -194,9 +194,8 @@ private:
static char* GetPDFTextObjects(TessBaseAPI* api,
double width, double height);
// Turn an image into a PDF object. Only transcode if we have to.
static bool imageToPDFObj(tesseract::TessBaseAPI *api, Pix *pix,
char *filename, long int objnum, char **pdf_object,
long int *pdf_object_size);
static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
char **pdf_object, long int *pdf_object_size);
};

View File

@ -287,36 +287,38 @@ int main(int argc, char **argv) {
exit(ret_val);
}
tesseract::TessResultRenderer* renderer = NULL;
bool b;
tesseract::PointerVector<tesseract::TessResultRenderer> renderers;
api.GetBoolVariable("tessedit_create_hocr", &b);
if (b) {
bool font_info;
api.GetBoolVariable("hocr_font_info", &font_info);
renderer = new tesseract::TessHOcrRenderer(outputbase, font_info);
renderers.push_back(new tesseract::TessHOcrRenderer(outputbase, font_info));
}
api.GetBoolVariable("tessedit_create_pdf", &b);
if (b && renderer == NULL)
renderer = new tesseract::TessPDFRenderer(outputbase, api.GetDatapath());
if (b) {
renderers.push_back(new tesseract::TessPDFRenderer(outputbase,
api.GetDatapath()));
}
api.GetBoolVariable("tessedit_write_unlv", &b);
if (b && renderer == NULL)
renderer = new tesseract::TessUnlvRenderer(outputbase);
if (b) renderers.push_back(new tesseract::TessUnlvRenderer(outputbase));
api.GetBoolVariable("tessedit_create_boxfile", &b);
if (b && renderer == NULL)
renderer = new tesseract::TessBoxTextRenderer(outputbase);
if (renderer == NULL)
renderer = new tesseract::TessTextRenderer(outputbase);
if (!api.ProcessPages(image, NULL, 0, renderer)) {
delete renderer;
if (b) renderers.push_back(new tesseract::TessBoxTextRenderer(outputbase));
api.GetBoolVariable("tessedit_create_txt", &b);
if (b) renderers.push_back(new tesseract::TessTextRenderer(outputbase));
if (!renderers.empty()) {
// Since the PointerVector auto-deletes, null-out the renderers that are
// added to the root, and leave the root in the vector.
for (int r = 1; r < renderers.size(); ++r) {
renderers[0]->insert(renderers[r]);
renderers[r] = NULL;
}
if (!api.ProcessPages(image, NULL, 0, renderers[0])) {
fprintf(stderr, "Error during processing.\n");
exit(1);
}
delete renderer;
}
PERF_COUNT_END
return 0; // Normal exit
}

View File

@ -272,7 +272,7 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
BLOB_CHOICE* choice =
new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
blob_choices.push_back(choice);
rating -= 0.125f;
}
@ -291,8 +291,8 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
left_choice->set_certainty(-rating);
// combine confidence w/ serial #
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
rating - 0.125f, -rating,
-1, -1, 0, 0, 0, 0, BCC_FAKE);
rating - 0.125f, -rating, -1,
0.0f, 0.0f, 0.0f, BCC_FAKE);
blob_choices.insert(right_choice, blob_number + 1);
}
}
@ -582,7 +582,7 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
int blob_count = 1;
for (int s = 0; s < word_res->seam_array.size(); ++s) {
SEAM* seam = word_res->seam_array[s];
if (seam->split1 == NULL) {
if (!seam->HasAnySplits()) {
word_res->best_state.push_back(blob_count);
blob_count = 1;
} else {
@ -775,13 +775,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
}
// Calls LearnWord to extract features for labelled blobs within each word.
// Features are written to the given filename.
void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
// Features are stored in an internal buffer.
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
int word_count = 0;
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
LearnWord(filename.string(), word_res);
LearnWord(fontname.string(), word_res);
++word_count;
}
tprintf("Generated training data for %d words\n", word_count);

View File

@ -93,8 +93,7 @@ BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
WordData word_data(*pr_it);
SetupWordPassN(2, &word_data);
classify_word_and_language(&Tesseract::classify_word_pass2, pr_it,
&word_data);
classify_word_and_language(2, pr_it, &word_data);
if (tessedit_debug_quality_metrics) {
WERD_RES* word_res = pr_it->word();
word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
@ -190,6 +189,7 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
if (word->word->x_height == 0.0f)
word->word->x_height = word->row->x_height();
}
word->lang_words.truncate(0);
for (int s = 0; s <= sub_langs_.size(); ++s) {
// The sub_langs_.size() entry is for the master language.
Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
@ -249,15 +249,22 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
while (pr_it->word() != NULL && pr_it->word() != word->word)
pr_it->forward();
ASSERT_HOST(pr_it->word() != NULL);
WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
: &Tesseract::classify_word_pass2;
classify_word_and_language(recognizer, pr_it, word);
if (tessedit_dump_choices) {
bool make_next_word_fuzzy = false;
if (ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
// Needs to be setup again to see the new outlines in the chopped_word.
SetupWordPassN(pass_n, word);
}
classify_word_and_language(pass_n, pr_it, word);
if (tessedit_dump_choices || debug_noise_removal) {
tprintf("Pass%d: %s [%s]\n", pass_n,
word->word->best_choice->unichar_string().string(),
word->word->best_choice->debug_string().string());
}
pr_it->forward();
if (make_next_word_fuzzy && pr_it->word() != NULL) {
pr_it->MakeCurrentWordFuzzy();
}
}
return true;
}
@ -357,7 +364,7 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
// ****************** Pass 2 *******************
if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
tessedit_ocr_engine_mode != OEM_CUBE_ONLY ) {
AnyTessLang()) {
page_res_it.restart_page();
GenericVector<WordData> words;
SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
@ -371,8 +378,7 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
// The next passes can only be run if tesseract has been used, as cube
// doesn't set all the necessary outputs in WERD_RES.
if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
if (AnyTessLang()) {
// ****************** Pass 3 *******************
// Fix fuzzy spaces.
set_global_loc_code(LOC_FUZZY_SPACE);
@ -388,12 +394,14 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
// ****************** Pass 5,6 *******************
rejection_passes(page_res, monitor, target_word_box, word_config);
#ifndef ANDROID_BUILD
// ****************** Pass 7 *******************
// Cube combiner.
// If cube is loaded and its combiner is present, run it.
if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
run_cube_combiner(page_res);
}
#endif
// ****************** Pass 8 *******************
font_recognition_pass(page_res);
@ -897,6 +905,359 @@ static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
return true;
}
// Moves good-looking "noise"/diacritics from the reject list to the main
// blob list on the current word. Returns true if anything was done, and
// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
                                   bool* make_next_word_fuzzy) {
  *make_next_word_fuzzy = false;
  WERD* real_word = pr_it->word()->word;
  // Nothing to do if there is no rejected noise, nowhere to put it, or too
  // much noise to be worth processing.
  if (real_word->rej_cblob_list()->empty() ||
      real_word->cblob_list()->empty() ||
      real_word->rej_cblob_list()->length() > noise_maxperword)
    return false;
  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
  // Get the noise outlines into a vector with matching bool map.
  GenericVector<C_OUTLINE*> outlines;
  real_word->GetNoiseOutlines(&outlines);
  GenericVector<bool> word_wanted;
  GenericVector<bool> overlapped_any_blob;
  GenericVector<C_BLOB*> target_blobs;
  // Phase 1: try to attach each noise outline to a real blob it overlaps.
  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
                                     &word_wanted, &overlapped_any_blob,
                                     &target_blobs);
  // Filter the outlines that overlapped any blob and put them into the word
  // now. This simplifies the remaining task and also makes it more accurate
  // as it has more completed blobs to work on.
  GenericVector<bool> wanted;
  GenericVector<C_BLOB*> wanted_blobs;
  GenericVector<C_OUTLINE*> wanted_outlines;
  int num_overlapped = 0;
  int num_overlapped_used = 0;
  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
    if (overlapped_any_blob[i]) {
      ++num_overlapped;
      if (word_wanted[i]) ++num_overlapped_used;
      wanted.push_back(word_wanted[i]);
      wanted_blobs.push_back(target_blobs[i]);
      wanted_outlines.push_back(outlines[i]);
      outlines[i] = NULL;  // Claimed: ownership passes to wanted_outlines.
    }
  }
  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
  // Phase 2: assign the remaining (non-overlapping) outlines to nearby blobs
  // or to new stand-alone blobs.
  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
                             &target_blobs);
  int non_overlapped = 0;
  int non_overlapped_used = 0;
  for (int i = 0; i < word_wanted.size(); ++i) {
    if (word_wanted[i]) ++non_overlapped_used;
    // Bug fix: count remaining candidates in non_overlapped. Previously this
    // incremented non_overlapped_used a second time, so the "%d/%d" debug
    // totals below were wrong and non_overlapped was always printed as 0.
    if (outlines[i] != NULL) ++non_overlapped;
  }
  if (debug_noise_removal) {
    tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
            num_overlapped_used, num_overlapped, non_overlapped_used,
            non_overlapped);
    real_word->bounding_box().print();
  }
  // Now we have decided which outlines we want, put them into the real_word.
  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
                                     make_next_word_fuzzy)) {
    pr_it->MakeCurrentWordFuzzy();
  }
  // TODO(rays) Parts of combos have a deep copy of the real word, and need
  // to have their noise outlines moved/assigned in the same way!!
  return num_overlapped_used != 0 || non_overlapped_used != 0;
}
// Attempts to put noise/diacritic outlines into the blobs that they overlap.
// Input: a set of noisy outlines that probably belong to the real_word.
// Output: word_wanted indicates which outlines are to be assigned to a blob,
// target_blobs indicates which to assign to, and overlapped_any_blob is
// true for all outlines that overlapped a blob.
void Tesseract::AssignDiacriticsToOverlappingBlobs(
    const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
    PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
    GenericVector<bool>* overlapped_any_blob,
    GenericVector<C_BLOB*>* target_blobs) {
  GenericVector<bool> blob_wanted;
  // All output vectors are sized to match outlines, one entry per outline.
  word_wanted->init_to_size(outlines.size(), false);
  overlapped_any_blob->init_to_size(outlines.size(), false);
  target_blobs->init_to_size(outlines.size(), NULL);
  // For each real blob, find the outlines that seriously overlap it.
  // A single blob could be several merged characters, so there can be quite
  // a few outlines overlapping, and the full engine needs to be used to chop
  // and join to get a sensible result.
  C_BLOB_IT blob_it(real_word->cblob_list());
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB* blob = blob_it.data();
    TBOX blob_box = blob->bounding_box();
    blob_wanted.init_to_size(outlines.size(), false);
    int num_blob_outlines = 0;
    // Mark as candidates all outlines with major x-overlap of this blob
    // that have not already been claimed by an earlier blob.
    for (int i = 0; i < outlines.size(); ++i) {
      if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
          !(*word_wanted)[i]) {
        blob_wanted[i] = true;
        (*overlapped_any_blob)[i] = true;
        ++num_blob_outlines;
      }
    }
    if (debug_noise_removal) {
      tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
      blob_box.print();
    }
    // If any outlines overlap the blob, and not too many, classify the blob
    // (using the full engine, languages and all), and choose the maximal
    // combination of outlines that doesn't hurt the end-result classification
    // by too much. Mark them as wanted.
    if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
      if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
                                      outlines, num_blob_outlines,
                                      &blob_wanted)) {
        for (int i = 0; i < blob_wanted.size(); ++i) {
          if (blob_wanted[i]) {
            // Claim the outline and record where it is going.
            (*word_wanted)[i] = true;
            (*target_blobs)[i] = blob;
          }
        }
      }
    }
  }
}
// Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them. Entries in outlines that are NULL have already
// been claimed by AssignDiacriticsToOverlappingBlobs and are skipped here.
void Tesseract::AssignDiacriticsToNewBlobs(
    const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
    PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
    GenericVector<C_BLOB*>* target_blobs) {
  GenericVector<bool> blob_wanted;
  word_wanted->init_to_size(outlines.size(), false);
  target_blobs->init_to_size(outlines.size(), NULL);
  // Check for outlines that need to be turned into stand-alone blobs.
  for (int i = 0; i < outlines.size(); ++i) {
    if (outlines[i] == NULL) continue;
    // Get a set of adjacent outlines that don't overlap any existing blob.
    blob_wanted.init_to_size(outlines.size(), false);
    int num_blob_outlines = 0;
    TBOX total_ol_box(outlines[i]->bounding_box());
    // NOTE: this inner loop advances the outer index i, consuming a run of
    // consecutive unclaimed outlines as one group; the outer ++i then steps
    // past the terminating NULL (or end), which is intentional.
    while (i < outlines.size() && outlines[i] != NULL) {
      blob_wanted[i] = true;
      total_ol_box += outlines[i]->bounding_box();
      ++i;
      ++num_blob_outlines;
    }
    // Find the insertion point: the last blob whose left edge is at or
    // before the group's left edge.
    C_BLOB_IT blob_it(real_word->cblob_list());
    while (!blob_it.at_last() &&
           blob_it.data_relative(1)->bounding_box().left() <=
               total_ol_box.left()) {
      blob_it.forward();
    }
    // Choose which combination of them we actually want and where to put
    // them.
    if (debug_noise_removal)
      tprintf("Num blobless outlines = %d\n", num_blob_outlines);
    C_BLOB* left_blob = blob_it.data();
    TBOX left_box = left_blob->bounding_box();
    C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
    // Try, in order: attach to the left neighbor, attach to the right
    // neighbor, or classify the group alone as punctuation between blobs.
    if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
         !right_blob->bounding_box().x_overlap(total_ol_box)) &&
        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
                                    outlines, num_blob_outlines,
                                    &blob_wanted)) {
      if (debug_noise_removal) tprintf("Added to left blob\n");
      for (int j = 0; j < blob_wanted.size(); ++j) {
        if (blob_wanted[j]) {
          (*word_wanted)[j] = true;
          (*target_blobs)[j] = left_blob;
        }
      }
    } else if (right_blob != NULL &&
               (!left_box.x_overlap(total_ol_box) ||
                right_blob->bounding_box().x_overlap(total_ol_box)) &&
               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
                                           right_blob, outlines,
                                           num_blob_outlines, &blob_wanted)) {
      if (debug_noise_removal) tprintf("Added to right blob\n");
      for (int j = 0; j < blob_wanted.size(); ++j) {
        if (blob_wanted[j]) {
          (*word_wanted)[j] = true;
          (*target_blobs)[j] = right_blob;
        }
      }
    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
                                           outlines, num_blob_outlines,
                                           &blob_wanted)) {
      if (debug_noise_removal) tprintf("Fitted between blobs\n");
      for (int j = 0; j < blob_wanted.size(); ++j) {
        if (blob_wanted[j]) {
          (*word_wanted)[j] = true;
          // NULL target means "make a new stand-alone blob" for this group.
          (*target_blobs)[j] = NULL;
        }
      }
    }
  }
}
// Starting with ok_outlines set to indicate which outlines overlap the blob,
// chooses the optimal set (approximately) and returns true if any outlines
// are desired, in which case ok_outlines indicates which ones.
// blob may be NULL, in which case the candidate outlines are classified on
// their own and certainty_threshold is used directly as the target.
bool Tesseract::SelectGoodDiacriticOutlines(
    int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
    const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
    GenericVector<bool>* ok_outlines) {
  STRING best_str;
  float target_cert = certainty_threshold;
  if (blob != NULL) {
    // Classify the bare blob first to establish a baseline certainty that
    // the outline additions must (approximately) beat.
    float target_c2;
    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
    if (debug_noise_removal) {
      tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
              target_cert, target_c2);
      blob->bounding_box().print();
    }
    // Relax the target towards certainty_threshold by noise_cert_factor.
    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
  }
  GenericVector<bool> test_outlines = *ok_outlines;
  // Start with all the outlines in.
  STRING all_str;
  GenericVector<bool> best_outlines = *ok_outlines;
  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
                                             pr_it, blob, &all_str);
  if (debug_noise_removal) {
    TBOX ol_box;
    for (int i = 0; i < test_outlines.size(); ++i) {
      if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
    }
    tprintf("All Noise blob classified as %s=%g, delta=%g at:",
            all_str.string(), best_cert, best_cert - target_cert);
    ol_box.print();
  }
  // Iteratively zero out the bit that improves the certainty the most, until
  // we get past the threshold, have zero bits, or fail to improve.
  int best_index = 0;  // To zero out.
  // NOTE(review): (blob == NULL || best_cert < target_cert || blob != NULL)
  // is a tautology, so the loop is bounded only by num_outlines > 1 and
  // best_index >= 0. Presumably the intent was to stop once best_cert
  // reaches target_cert when blob != NULL — confirm before changing, as
  // "fixing" it alters which combinations get evaluated.
  while (num_outlines > 1 && best_index >= 0 &&
         (blob == NULL || best_cert < target_cert || blob != NULL)) {
    // Find the best bit to zero out.
    best_index = -1;
    for (int i = 0; i < outlines.size(); ++i) {
      if (test_outlines[i]) {
        // Temporarily drop outline i and re-classify.
        test_outlines[i] = false;
        STRING str;
        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
                                              pr_it, blob, &str);
        if (debug_noise_removal) {
          TBOX ol_box;
          for (int j = 0; j < outlines.size(); ++j) {
            if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
            tprintf("%d", test_outlines[j]);
          }
          tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
                  cert, cert - target_cert);
          ol_box.print();
        }
        if (cert > best_cert) {
          best_cert = cert;
          best_index = i;
          best_outlines = test_outlines;
        }
        // Restore outline i for the next trial.
        test_outlines[i] = true;
      }
    }
    if (best_index >= 0) {
      // Commit the single best removal and iterate on the reduced set.
      test_outlines[best_index] = false;
      --num_outlines;
    }
  }
  if (best_cert >= target_cert) {
    // Save the best combination.
    *ok_outlines = best_outlines;
    if (debug_noise_removal) {
      tprintf("%s noise combination ", blob ? "Adding" : "New");
      for (int i = 0; i < best_outlines.size(); ++i) {
        tprintf("%d", best_outlines[i]);
      }
      tprintf(" yields certainty %g, beating target of %g\n", best_cert,
              target_cert);
    }
    return true;
  }
  return false;
}
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
//
// If blob is non-NULL, the flagged outlines are temporarily spliced into the
// blob ahead of its existing outlines; if blob is NULL, a temporary C_BLOB is
// built from the first flagged outline and the rest are added to it. Either
// way the assembled blob is classified as a stand-alone word via
// ClassifyBlobAsWord, and the splice is undone before returning, so the
// caller's data is left unchanged.
// NOTE(review): when blob is NULL this assumes at least one entry of
// ok_outlines is true; otherwise ClassifyBlobAsWord would be called with a
// NULL blob — confirm with callers.
float Tesseract::ClassifyBlobPlusOutlines(
    const GenericVector<bool>& ok_outlines,
    const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
    C_BLOB* blob, STRING* best_str) {
  C_OUTLINE_IT ol_it;
  // Remembers the first pre-existing outline of the input blob; the
  // temporary outlines are inserted before it, so everything ahead of
  // first_to_keep can be stripped out again after classification.
  C_OUTLINE* first_to_keep = NULL;
  if (blob != NULL) {
    // Add the required outlines to the blob.
    ol_it.set_to_list(blob->out_list());
    first_to_keep = ol_it.data();
  }
  for (int i = 0; i < ok_outlines.size(); ++i) {
    if (ok_outlines[i]) {
      // This outline is to be added.
      if (blob == NULL) {
        // No input blob: build a temporary blob around the first flagged
        // outline. It is emptied and deleted again below.
        blob = new C_BLOB(outlines[i]);
        ol_it.set_to_list(blob->out_list());
      } else {
        ol_it.add_before_stay_put(outlines[i]);
      }
    }
  }
  float c2;
  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
  ol_it.move_to_first();
  if (first_to_keep == NULL) {
    // We created blob. Empty its outlines and delete it.
    // extract() (not delete) is used because the outlines are still owned
    // by the caller through the outlines vector.
    for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
    delete blob;
    // NOTE(review): for a created (pure-outline) blob the negated squashed
    // certainty c2 is returned instead of cert — presumably to penalize
    // noise-only candidates; confirm intent against callers.
    cert = -c2;
  } else {
    // Remove the outlines that we put in.
    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
      ol_it.extract();
    }
  }
  return cert;
}
// Classifies the given blob (part of word_data->word->word) as an individual
// word, using languages, chopper etc, returning only the certainty of the
// best raw choice, and undoing all the work done to fake out the word.
//
// Works by deep-copying the blob into a single-blob WERD, inserting it into
// the page results as a temporary clone word, running the full recognizer on
// it (pass_n selects pass 1 or pass 2 inside classify_word_and_language),
// then deleting the temporary word so the page results are left unchanged.
// On return *best_str holds the raw choice's text and *c2 a squashed
// secondary certainty (certainty^2 / rating).
float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
                                    C_BLOB* blob, STRING* best_str, float* c2) {
  // The word under the caller's iterator supplies the begin/end-of-line
  // flags for the faked single-blob word.
  WERD* real_word = pr_it->word()->word;
  WERD* word = real_word->ConstructFromSingleBlob(
      real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
  // Get a new iterator that points to the new word.
  PAGE_RES_IT it(pr_it->page_res);
  while (it.word() != word_res && it.word() != NULL) it.forward();
  ASSERT_HOST(it.word() == word_res);
  WordData wd(it);
  // Force full initialization.
  SetupWordPassN(1, &wd);
  classify_word_and_language(pass_n, &it, &wd);
  if (debug_noise_removal) {
    tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
            wd.row->x_height(), wd.word->raw_choice->min_x_height(),
            wd.word->raw_choice->max_x_height());
  }
  float cert = wd.word->raw_choice->certainty();
  float rat = wd.word->raw_choice->rating();
  // Squash the certainty by the rating; guard against rating <= 0 to avoid
  // division by zero.
  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
  *best_str = wd.word->raw_choice->unichar_string();
  // Remove the temporary word again and resynchronize the caller's iterator,
  // which the insertion/deletion may have invalidated.
  it.DeleteCurrentWord();
  pr_it->ResetWordIterator();
  return cert;
}
// Generic function for classifying a word. Can be used either for pass1 or
// pass2 according to the function passed to recognizer.
// word_data holds the word to be recognized, and its block and row, and
@ -905,9 +1266,10 @@ static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
// Recognizes in the current language, and if successful that is all.
// If recognition was not successful, tries all available languages until
// it gets a successful result or runs out of languages. Keeps the best result.
void Tesseract::classify_word_and_language(WordRecognizer recognizer,
PAGE_RES_IT* pr_it,
void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
WordData* word_data) {
WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
: &Tesseract::classify_word_pass2;
// Best result so far.
PointerVector<WERD_RES> best_words;
// Points to the best result. May be word or in lang_words.
@ -987,11 +1349,13 @@ void Tesseract::classify_word_pass1(const WordData& word_data,
BLOCK* block = word_data.block;
prev_word_best_choice_ = word_data.prev_word != NULL
? word_data.prev_word->word->best_choice : NULL;
#ifndef ANDROID_BUILD
// If we only intend to run cube - run it and return.
if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
cube_word_pass1(block, row, *in_word);
return;
}
#endif
WERD_RES* word = *in_word;
match_word_pass_n(1, word, row, block);
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
@ -1041,14 +1405,47 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
int original_misfits = CountMisfitTops(word);
if (original_misfits == 0)
return false;
float new_x_ht = ComputeCompatibleXheight(word);
float baseline_shift = 0.0f;
float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
if (baseline_shift != 0.0f) {
// Try the shift on its own first.
if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
word, block, row))
return false;
original_misfits = CountMisfitTops(word);
if (original_misfits > 0) {
float new_baseline_shift;
// Now recompute the new x_height.
new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
// No test of return value here, as we are definitely making a change
// to the word by shifting the baseline.
TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
word, block, row);
}
}
return true;
} else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
word, block, row);
} else {
return false;
}
}
// Runs recognition with the test baseline shift and x-height and returns true
// if there was an improvement in recognition result.
bool Tesseract::TestNewNormalization(int original_misfits,
float baseline_shift, float new_x_ht,
WERD_RES *word, BLOCK* block, ROW *row) {
bool accept_new_x_ht = false;
WERD_RES new_x_ht_word(word->word);
if (word->blamer_bundle != NULL) {
new_x_ht_word.blamer_bundle = new BlamerBundle();
new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
}
new_x_ht_word.x_height = new_x_ht;
new_x_ht_word.baseline_shift = baseline_shift;
new_x_ht_word.caps_height = 0.0;
new_x_ht_word.SetupForRecognition(
unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
@ -1080,7 +1477,6 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
word->ConsumeWordResults(&new_x_ht_word);
return true;
}
}
return false;
}
@ -1098,6 +1494,9 @@ void Tesseract::classify_word_pass2(const WordData& word_data,
tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED &&
word_data.word->best_choice != NULL)
return;
if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
return;
}
ROW* row = word_data.row;
BLOCK* block = word_data.block;
WERD_RES* word = *in_word;
@ -1246,7 +1645,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
word_res->done = TRUE;
// Measure the mean space.
int total_gap = 0;
int gap_count = 0;
WERD* werd = word_res->word;
C_BLOB_IT blob_it(werd->cblob_list());
@ -1255,7 +1653,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
C_BLOB* blob = blob_it.data();
int gap = blob->bounding_box().left();
gap -= prev_blob->bounding_box().right();
total_gap += gap;
++gap_count;
prev_blob = blob;
}
@ -1376,13 +1773,13 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
return FALSE;
tessedit_rejection_debug.set_value (FALSE);
debug_x_ht_level.set_value (0);
debug_x_ht_level.set_value(0);
if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
if (location < 0)
return TRUE; // For breakpoint use
tessedit_rejection_debug.set_value (TRUE);
debug_x_ht_level.set_value (20);
debug_x_ht_level.set_value(2);
tprintf ("\n\nTESTWD::");
switch (location) {
case 0:
@ -1487,62 +1884,54 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
if (word->chopped_word == NULL) return;
ASSERT_HOST(word->best_choice != NULL);
inT32 index; // char id index
// character iterator
BLOB_CHOICE_IT choice_it; // choice iterator
int fontinfo_size = get_fontinfo_table().size();
int fontset_size = get_fontset_table().size();
if (fontinfo_size == 0 || fontset_size == 0) return;
STATS fonts(0, fontinfo_size); // font counters
if (fontinfo_size == 0) return;
GenericVector<int> font_total_score;
font_total_score.init_to_size(fontinfo_size, 0);
word->italic = 0;
word->bold = 0;
if (!word->best_choice_fontinfo_ids.empty()) {
word->best_choice_fontinfo_ids.clear();
}
// Compute the modal font for the word
for (index = 0; index < word->best_choice->length(); ++index) {
UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
choice_it.set_to_list(word->GetBlobChoices(index));
// Compute the font scores for the word
if (tessedit_debug_fonts) {
tprintf("Examining fonts in %s\n",
word->best_choice->debug_string().string());
}
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
choice_it.forward()) {
UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
if (blob_ch_id == word_ch_id) {
if (tessedit_debug_fonts) {
tprintf("%s font %s (%d) font2 %s (%d)\n",
word->uch_set->id_to_unichar(blob_ch_id),
choice_it.data()->fontinfo_id() < 0 ? "unknown" :
fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
choice_it.data()->fontinfo_id(),
choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
choice_it.data()->fontinfo_id2());
}
// 1st choice font gets 2 pts, 2nd choice 1 pt.
if (choice_it.data()->fontinfo_id() >= 0) {
fonts.add(choice_it.data()->fontinfo_id(), 2);
}
if (choice_it.data()->fontinfo_id2() >= 0) {
fonts.add(choice_it.data()->fontinfo_id2(), 1);
}
break;
for (int b = 0; b < word->best_choice->length(); ++b) {
BLOB_CHOICE* choice = word->GetBlobChoice(b);
if (choice == NULL) continue;
const GenericVector<ScoredFont>& fonts = choice->fonts();
for (int f = 0; f < fonts.size(); ++f) {
int fontinfo_id = fonts[f].fontinfo_id;
if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
font_total_score[fontinfo_id] += fonts[f].score;
}
}
}
inT16 font_id1, font_id2;
find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count);
find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count);
// Find the top and 2nd choice for the word.
int score1 = 0, score2 = 0;
inT16 font_id1 = -1, font_id2 = -1;
for (int f = 0; f < fontinfo_size; ++f) {
if (tessedit_debug_fonts && font_total_score[f] > 0) {
tprintf("Font %s, total score = %d\n",
fontinfo_table_.get(f).name, font_total_score[f]);
}
if (font_total_score[f] > score1) {
score2 = score1;
font_id2 = font_id1;
score1 = font_total_score[f];
font_id1 = f;
} else if (font_total_score[f] > score2) {
score2 = font_total_score[f];
font_id2 = f;
}
}
word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
// All the blobs get the word's best choice font.
for (int i = 0; i < word->best_choice->length(); ++i) {
word->best_choice_fontinfo_ids.push_back(font_id1);
}
if (word->fontinfo_id_count > 0) {
// Each score has a limit of MAX_UINT16, so divide by that to get the number
// of "votes" for that font, ie number of perfect scores.
word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8);
word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
if (score1 > 0) {
FontInfo fi = fontinfo_table_.get(font_id1);
if (tessedit_debug_fonts) {
if (word->fontinfo_id2_count > 0) {
@ -1555,9 +1944,8 @@ void Tesseract::set_word_fonts(WERD_RES *word) {
fi.name, word->fontinfo_id_count);
}
}
// 1st choices got 2 pts, so we need to halve the score for the mode.
word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
}
}
@ -1611,8 +1999,7 @@ void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
word = page_res_it.word();
int length = word->best_choice->length();
// 1st choices got 2 pts, so we need to halve the score for the mode.
int count = (word->fontinfo_id_count + 1) / 2;
int count = word->fontinfo_id_count;
if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
word->fontinfo = modal_font;
// Counts only get 1 as it came from the doc.

View File

@ -383,8 +383,8 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
for (int i = 0; i < num_chars; ++i) {
UNICHAR_ID uch_id =
cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
choices[i] = new BLOB_CHOICE(uch_id, 0.0, cube_certainty, -1, -1,
0, 0, 0, 0, BCC_STATIC_CLASSIFIER);
choices[i] = new BLOB_CHOICE(uch_id, -cube_certainty, cube_certainty,
-1, 0.0f, 0.0f, 0.0f, BCC_STATIC_CLASSIFIER);
}
word->FakeClassifyWord(num_chars, choices);
// within a word, cube recognizes the word in reading order.

View File

@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
if ((!word->part_of_combo) && (word->box_word == NULL)) {
WordData word_data(block, row, word);
SetupWordPassN(2, &word_data);
classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
&word_data);
classify_word_and_language(2, NULL, &word_data);
}
prev_word_best_choice_ = word->best_choice;
}

View File

@ -35,6 +35,8 @@ namespace tesseract {
// guessed that the blob tops are caps and will have placed the xheight too low.
// 3. Noise/logos beside words, or changes in font size on a line. Such
// things can blow the statistics and cause an incorrect estimate.
// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
// In this case the x-height is often still correct.
//
// Algorithm.
// Compare the vertical position (top only) of alphanumerics in a word with
@ -54,6 +56,10 @@ namespace tesseract {
// even if the x-height is incorrect. This is not a terrible assumption, but
// it is not great. An improvement would be to use a classifier that does
// not care about vertical position or scaling at all.
// Separately collect stats on shifted baselines and apply the same logic to
// computing a best-fit shift to fix the error. If the baseline needs to be
// shifted, but the x-height is OK, returns the original x-height along with
// the baseline shift to indicate that recognition needs to re-run.
// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
// then the char top cannot be used to judge misfits or suggest a new top.
@ -92,18 +98,25 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) {
// Returns a new x-height maximally compatible with the result in word_res.
// See comment above for overall algorithm.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
float* baseline_shift) {
STATS top_stats(0, MAX_UINT8);
STATS shift_stats(-MAX_UINT8, MAX_UINT8);
int bottom_shift = 0;
int num_blobs = word_res->rebuild_word->NumBlobs();
do {
top_stats.clear();
shift_stats.clear();
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
int top = blob->bounding_box().top();
if (unicharset.get_isalpha(class_id) ||
unicharset.get_isdigit(class_id)) {
int top = blob->bounding_box().top() + bottom_shift;
// Clip the top to the limit of normalized feature space.
if (top >= INT_FEAT_RANGE)
top = INT_FEAT_RANGE - 1;
int bottom = blob->bounding_box().bottom();
int bottom = blob->bounding_box().bottom() + bottom_shift;
int min_bottom, max_bottom, min_top, max_top;
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
&min_top, &max_top);
@ -113,8 +126,8 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
top - (max_top + x_ht_acceptance_tolerance));
int height = top - kBlnBaselineOffset;
if (debug_x_ht_level >= 20) {
tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
if (debug_x_ht_level >= 2) {
tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
unicharset.id_to_unichar(class_id),
height, min_bottom, max_bottom, min_top, max_top,
bottom, top);
@ -132,25 +145,61 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
max_top - kBlnBaselineOffset);
int max_xht = DivRounded(height * kBlnXHeight,
min_top - kBlnBaselineOffset);
if (debug_x_ht_level >= 20) {
tprintf(" xht range min=%d, max=%d\n",
min_xht, max_xht);
if (debug_x_ht_level >= 2) {
tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
}
// The range of expected heights gets a vote equal to the distance
// of the actual top from the expected top.
for (int y = min_xht; y <= max_xht; ++y)
top_stats.add(y, misfit_dist);
} else if (debug_x_ht_level >= 20) {
} else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
bottom - x_ht_acceptance_tolerance > max_bottom) &&
bottom_shift == 0) {
// Get the range of required bottom shift.
int min_shift = min_bottom - bottom;
int max_shift = max_bottom - bottom;
if (debug_x_ht_level >= 2) {
tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
}
// The range of expected shifts gets a vote equal to the min distance
// of the actual bottom from the expected bottom, spread over the
// range of its acceptance.
int misfit_weight = abs(min_shift);
if (max_shift > min_shift)
misfit_weight /= max_shift - min_shift;
for (int y = min_shift; y <= max_shift; ++y)
shift_stats.add(y, misfit_weight);
} else {
if (bottom_shift == 0) {
// Things with bottoms that are already ok need to say so, on the
// 1st iteration only.
shift_stats.add(0, kBlnBaselineOffset);
}
if (debug_x_ht_level >= 2) {
tprintf(" already OK\n");
}
}
}
}
if (shift_stats.get_total() > top_stats.get_total()) {
bottom_shift = IntCastRounded(shift_stats.median());
if (debug_x_ht_level >= 2) {
tprintf("Applying bottom shift=%d\n", bottom_shift);
}
}
} while (bottom_shift != 0 &&
top_stats.get_total() < shift_stats.get_total());
// Baseline shift is opposite sign to the bottom shift.
*baseline_shift = -bottom_shift / word_res->denorm.y_scale();
if (debug_x_ht_level >= 2) {
tprintf("baseline shift=%g\n", *baseline_shift);
}
if (top_stats.get_total() == 0)
return 0.0f;
return bottom_shift != 0 ? word_res->x_height : 0.0f;
// The new xheight is just the median vote, which is then scaled out
// of BLN space back to pixel space to get the x-height in pixel space.
float new_xht = top_stats.median();
if (debug_x_ht_level >= 20) {
if (debug_x_ht_level >= 2) {
tprintf("Median xht=%f\n", new_xht);
tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
new_xht, new_xht / word_res->denorm.y_scale());
@ -159,7 +208,7 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
return new_xht / word_res->denorm.y_scale();
else
return 0.0f;
return bottom_shift != 0 ? word_res->x_height : 0.0f;
}
} // namespace tesseract

View File

@ -26,15 +26,23 @@
namespace tesseract {
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
int scale, int scaled_yres,
int rect_left, int rect_top,
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
int scaled_yres, int rect_left, int rect_top,
int rect_width, int rect_height)
: page_res_(page_res), tesseract_(tesseract),
word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL),
scale_(scale), scaled_yres_(scaled_yres),
rect_left_(rect_left), rect_top_(rect_top),
rect_width_(rect_width), rect_height_(rect_height) {
: page_res_(page_res),
tesseract_(tesseract),
word_(NULL),
word_length_(0),
blob_index_(0),
cblob_it_(NULL),
include_upper_dots_(false),
include_lower_dots_(false),
scale_(scale),
scaled_yres_(scaled_yres),
rect_left_(rect_left),
rect_top_(rect_top),
rect_width_(rect_width),
rect_height_(rect_height) {
it_ = new PAGE_RES_IT(page_res);
PageIterator::Begin();
}
@ -50,12 +58,20 @@ PageIterator::~PageIterator() {
* objects at a higher level.
*/
PageIterator::PageIterator(const PageIterator& src)
: page_res_(src.page_res_), tesseract_(src.tesseract_),
word_(NULL), word_length_(src.word_length_),
blob_index_(src.blob_index_), cblob_it_(NULL),
scale_(src.scale_), scaled_yres_(src.scaled_yres_),
rect_left_(src.rect_left_), rect_top_(src.rect_top_),
rect_width_(src.rect_width_), rect_height_(src.rect_height_) {
: page_res_(src.page_res_),
tesseract_(src.tesseract_),
word_(NULL),
word_length_(src.word_length_),
blob_index_(src.blob_index_),
cblob_it_(NULL),
include_upper_dots_(src.include_upper_dots_),
include_lower_dots_(src.include_lower_dots_),
scale_(src.scale_),
scaled_yres_(src.scaled_yres_),
rect_left_(src.rect_left_),
rect_top_(src.rect_top_),
rect_width_(src.rect_width_),
rect_height_(src.rect_height_) {
it_ = new PAGE_RES_IT(*src.it_);
BeginWord(src.blob_index_);
}
@ -63,6 +79,8 @@ PageIterator::PageIterator(const PageIterator& src)
const PageIterator& PageIterator::operator=(const PageIterator& src) {
page_res_ = src.page_res_;
tesseract_ = src.tesseract_;
include_upper_dots_ = src.include_upper_dots_;
include_lower_dots_ = src.include_lower_dots_;
scale_ = src.scale_;
scaled_yres_ = src.scaled_yres_;
rect_left_ = src.rect_left_;
@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
PARA *para = NULL;
switch (level) {
case RIL_BLOCK:
box = it_->block()->block->bounding_box();
box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_PARA:
para = it_->row()->row->para();
// explicit fall-through.
case RIL_TEXTLINE:
box = it_->row()->row->bounding_box();
box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_WORD:
box = it_->word()->word->bounding_box();
box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
include_lower_dots_);
break;
case RIL_SYMBOL:
if (cblob_it_ == NULL)
@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
int left, top, right, bottom;
if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
return NULL;
Pix* pix = NULL;
switch (level) {
case RIL_BLOCK:
case RIL_PARA:
int bleft, btop, bright, bbottom;
BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom);
pix = it_->block()->block->render_mask();
// AND the mask and the image.
pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
PIX_SRC & PIX_DST, tesseract_->pix_binary(),
bleft, btop);
if (level == RIL_PARA) {
// RIL_PARA needs further attention:
// clip the paragraph from the block mask.
Box* box = boxCreate(left - bleft, top - btop,
right - left, bottom - top);
Pix* pix2 = pixClipRectangle(pix, box, NULL);
boxDestroy(&box);
pixDestroy(&pix);
pix = pix2;
}
break;
case RIL_TEXTLINE:
case RIL_WORD:
case RIL_SYMBOL:
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
cblob_it_->data()->area() != 0)
return cblob_it_->data()->render();
// Just clip from the bounding box.
Box* box = boxCreate(left, top, right - left, bottom - top);
pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
boxDestroy(&box);
break;
if (level == RIL_BLOCK || level == RIL_PARA) {
// Clip to the block polygon as well.
TBOX mask_box;
Pix* mask = it_->block()->block->render_mask(&mask_box);
int mask_x = left - mask_box.left();
int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
// AND the mask and pix, putting the result in pix.
pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix),
pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x),
MAX(0, mask_y));
pixDestroy(&mask);
}
return pix;
}
@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
Pix* grey_pix = pixClipRectangle(original_img, box, NULL);
boxDestroy(&box);
if (level == RIL_BLOCK) {
Pix* mask = it_->block()->block->render_mask();
Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
pixRasterop(expanded_mask, padding, padding,
pixGetWidth(mask), pixGetHeight(mask),
PIX_SRC, mask, 0, 0);
if (level == RIL_BLOCK || level == RIL_PARA) {
// Clip to the block polygon as well.
TBOX mask_box;
Pix* mask = it_->block()->block->render_mask(&mask_box);
// Copy the mask registered correctly into an image the size of grey_pix.
int mask_x = *left - mask_box.left();
int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
int width = pixGetWidth(grey_pix);
int height = pixGetHeight(grey_pix);
Pix* resized_mask = pixCreate(width, height, 1);
pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height,
PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y));
pixDestroy(&mask);
pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1);
pixInvert(expanded_mask, expanded_mask);
pixSetMasked(grey_pix, expanded_mask, MAX_UINT32);
pixDestroy(&expanded_mask);
pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
2 * padding + 1);
pixInvert(resized_mask, resized_mask);
pixSetMasked(grey_pix, resized_mask, MAX_UINT32);
pixDestroy(&resized_mask);
}
return grey_pix;
}

View File

@ -179,6 +179,21 @@ class TESS_API PageIterator {
// If an image rectangle has been set in the API, then returned coordinates
// relate to the original (full) image, rather than the rectangle.
/**
* Controls what to include in a bounding box. Bounding boxes of all levels
* between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
* Between layout analysis and recognition, it isn't known where all
* diacritics belong, so this control is used to include or exclude some
* diacritics that are above or below the main body of the word. In most cases
* where the placement is obvious, and after recognition, it doesn't make as
* much difference, as the diacritics will already be included in the word.
*/
  // Stores the two flags; they are read by BoundingBoxInternal (and thus the
  // bounding-box and image accessors) when computing boxes for RIL_BLOCK,
  // RIL_PARA/RIL_TEXTLINE and RIL_WORD levels.
  void SetBoundingBoxComponents(bool include_upper_dots,
                                bool include_lower_dots) {
    include_upper_dots_ = include_upper_dots;
    include_lower_dots_ = include_lower_dots;
  }
/**
* Returns the bounding rectangle of the current object at the given level.
* See comment on coordinate system above.
@ -332,6 +347,9 @@ class TESS_API PageIterator {
* Owned by this ResultIterator.
*/
C_BLOB_IT* cblob_it_;
/** Control over what to include in bounding boxes. */
bool include_upper_dots_;
bool include_lower_dots_;
/** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
int scale_;
int scaled_yres_;

View File

@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
// UNLV file present. Use PSM_SINGLE_BLOCK.
pageseg_mode = PSM_SINGLE_BLOCK;
}
// The diacritic_blobs holds noise blobs that may be diacritics. They
// are separated out on areas of the image that seem noisy and short-circuit
// the layout process, going straight from the initial partition creation
// right through to after word segmentation, where they are added to the
// rej_cblobs list of the most appropriate word. From there classification
// will determine whether they are used.
BLOBNBOX_LIST diacritic_blobs;
int auto_page_seg_ret_val = 0;
TO_BLOCK_LIST to_blocks;
if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
PSM_SPARSE(pageseg_mode)) {
auto_page_seg_ret_val =
AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
auto_page_seg_ret_val = AutoPageSeg(
pageseg_mode, blocks, &to_blocks,
enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
if (pageseg_mode == PSM_OSD_ONLY)
return auto_page_seg_ret_val;
// To create blobs from the image region bounds uncomment this line:
@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
pix_thresholds_, pix_grey_, splitting || cjk_mode,
blocks, &to_blocks);
&diacritic_blobs, blocks, &to_blocks);
return auto_page_seg_ret_val;
}
@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
pixDestroy(&grey_pix);
}
/**
* Auto page segmentation. Divide the page image into blocks of uniform
* text linespacing and images.
@ -207,9 +214,14 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
* The output goes in the blocks list with corresponding TO_BLOCKs in the
* to_blocks list.
*
* If single_column is true, then no attempt is made to divide the image
* into columns, but multiple blocks are still made if the text is of
* non-uniform linespacing.
* If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
* the image into columns, but multiple blocks are still made if the text is
* of non-uniform linespacing.
*
* If diacritic_blobs is non-null, then diacritics/noise blobs, that would
 * confuse layout analysis by causing textline overlap, are placed there,
* with the expectation that they will be reassigned to words later and
* noise/diacriticness determined via classification.
*
* If osd (orientation and script detection) is true then that is performed
* as well. If only_osd is true, then only orientation and script detection is
@ -217,9 +229,10 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
* another Tesseract that was initialized especially for osd, and the results
* will be output into osr (orientation and script result).
*/
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
Tesseract* osd_tess, OSResults* osr) {
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
TO_BLOCK_LIST* to_blocks,
BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
OSResults* osr) {
if (textord_debug_images) {
WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
}
@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
if (equ_detect_) {
finder->SetEquationDetect(equ_detect_);
}
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
to_block, photomask_pix,
pix_thresholds_, pix_grey_,
&found_blocks, to_blocks);
result = finder->FindBlocks(
pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
if (result >= 0)
finder->GetDeskewVectors(&deskew_, &reskew_);
delete finder;
@ -340,6 +352,7 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
finder = new ColumnFinder(static_cast<int>(to_block->line_size),
blkbox.botleft(), blkbox.topright(),
source_resolution_, textord_use_cjk_fp_model,
textord_tabfind_aligned_gap_fraction,
&v_lines, &h_lines, vertical_x, vertical_y);
finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
@ -354,7 +367,12 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
// We want the text lines horizontal, (vertical text indicates vertical
  // textlines) which may conflict (e.g. vertically written CJK).
int osd_orientation = 0;
bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
bool vertical_text = textord_tabfind_force_vertical_text;
if (!vertical_text && textord_tabfind_vertical_text) {
vertical_text =
finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
to_block, &osd_blobs);
}
if (osd && osd_tess != NULL && osr != NULL) {
GenericVector<int> osd_scripts;
if (osd_tess != this) {

View File

@ -24,7 +24,9 @@
#define VARABLED_H
#include "elst.h"
#ifndef ANDROID_BUILD
#include "scrollview.h"
#endif
#include "params.h"
#include "tesseractclass.h"

View File

@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) {
FCOORD pt(x, y);
PAGE_RES_IT pr_it(page_res);
char msg[160];
const int kBufsize = 512;
char msg[kBufsize];
char *msg_ptr = msg;
msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);

View File

@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label,
fflush(stdout);
WordData word_data(*pr_it);
SetupWordPassN(1, &word_data);
classify_word_and_language(&Tesseract::classify_word_pass1,
pr_it, &word_data);
classify_word_and_language(1, pr_it, &word_data);
WERD_RES* werd_res = word_data.word;
WERD_CHOICE *best_choice = werd_res->best_choice;
ASSERT_HOST(best_choice != NULL);

View File

@ -34,6 +34,13 @@ ResultIterator::ResultIterator(const LTRResultIterator &resit)
: LTRResultIterator(resit) {
in_minor_direction_ = false;
at_beginning_of_minor_run_ = false;
preserve_interword_spaces_ = false;
BoolParam *p = ParamUtils::FindParam<BoolParam>(
"preserve_interword_spaces", GlobalParams()->bool_params,
tesseract_->params()->bool_params);
if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
MoveToLogicalStartOfTextline();
}
@ -629,14 +636,17 @@ void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
int words_appended = 0;
do {
int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space()
: (words_appended > 0);
for (int i = 0; i < numSpaces; ++i) {
*text += " ";
}
AppendUTF8WordText(text);
words_appended++;
*text += " ";
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
if (BidiDebug(1)) {
tprintf("%d words printed\n", words_appended);
}
text->truncate_at(text->length() - 1);
*text += line_separator_;
// If we just finished a paragraph, add an extra newline.
if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))

View File

@ -231,6 +231,12 @@ class TESS_API ResultIterator : public LTRResultIterator {
/** Is the currently pointed-at character in a minor-direction sequence? */
bool in_minor_direction_;
/**
* Should detected inter-word spaces be preserved, or "compressed" to a single
* space character (default behavior).
*/
bool preserve_interword_spaces_;
};
} // namespace tesseract.

View File

@ -194,7 +194,11 @@ bool Tesseract::init_tesseract_lang_data(
if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
}
// Load Cube objects if necessary.
// The various OcrEngineMode settings (see publictypes.h) determine which
// engine-specific data files need to be loaded. Currently everything needs
// the base tesseract data, which supplies other useful information, but
// alternative engines, such as cube and LSTM are optional.
#ifndef ANDROID_BUILD
if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
if (tessdata_manager_debug_level)
@ -204,7 +208,7 @@ bool Tesseract::init_tesseract_lang_data(
if (tessdata_manager_debug_level)
tprintf("Loaded Cube with combiner\n");
}
#endif
// Init ParamsModel.
// Load pass1 and pass2 weights (for now these two sets are the same, but in
// the future separate sets of weights can be generated).
@ -475,5 +479,4 @@ enum CMD_EVENTS
RECOG_PSEUDO,
ACTION_2_CMD_EVENT
};
} // namespace tesseract

View File

@ -1,7 +1,23 @@
///////////////////////////////////////////////////////////////////////
// File: tesseractclass.cpp
// Description: An instance of Tesseract. For thread safety, *every*
// global variable goes in here, directly, or indirectly.
// Description: The Tesseract class. It holds/owns everything needed
// to run Tesseract on a single language, and also a set of
// sub-Tesseracts to run sub-languages. For thread safety, *every*
// variable that was previously global or static (except for
// constant data, and some visual debugging flags) has been moved
// in here, directly, or indirectly.
// This makes it safe to run multiple Tesseracts in different
// threads in parallel, and keeps the different language
// instances separate.
// Some global functions remain, but they are isolated re-entrant
// functions that operate on their arguments. Functions that work
// on variable data have been moved to an appropriate class based
// mostly on the directory hierarchy. For more information see
// slide 6 of "2ArchitectureAndDataStructures" in
// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
// Some global data and related functions still exist in the
// training-related code, but they don't interfere with normal
// recognition operation.
// Author: Ray Smith
// Created: Fri Mar 07 08:17:01 PST 2008
//
@ -21,11 +37,15 @@
#include "tesseractclass.h"
#include "allheaders.h"
#ifndef ANDROID_BUILD
#include "cube_reco_context.h"
#endif
#include "edgblob.h"
#include "equationdetect.h"
#include "globals.h"
#ifndef ANDROID_BUILD
#include "tesseract_cube_combiner.h"
#endif
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
@ -50,7 +70,8 @@ Tesseract::Tesseract()
this->params()),
// The default for pageseg_mode is the old behaviour, so as not to
// upset anything that relies on that.
INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
INT_MEMBER(
tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
" 5=line, 6=word, 7=char"
" (Values from PageSegMode enum in publictypes.h)",
@ -65,55 +86,67 @@ Tesseract::Tesseract()
"Blacklist of chars not to recognize", this->params()),
STRING_MEMBER(tessedit_char_whitelist, "",
"Whitelist of chars to recognize", this->params()),
STRING_MEMBER(tessedit_char_unblacklist, "",
"List of chars to override tessedit_char_blacklist",
this->params()),
BOOL_MEMBER(tessedit_ambigs_training, false,
"Perform training for ambiguities", this->params()),
INT_MEMBER(pageseg_devanagari_split_strategy,
tesseract::ShiroRekhaSplitter::NO_SPLIT,
"Whether to use the top-line splitting process for Devanagari "
"documents while performing page-segmentation.", this->params()),
"documents while performing page-segmentation.",
this->params()),
INT_MEMBER(ocr_devanagari_split_strategy,
tesseract::ShiroRekhaSplitter::NO_SPLIT,
"Whether to use the top-line splitting process for Devanagari "
"documents while performing ocr.", this->params()),
"documents while performing ocr.",
this->params()),
STRING_MEMBER(tessedit_write_params_to_file, "",
"Write all parameters to the given file.", this->params()),
BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug"
" information for adaption", this->params()),
BOOL_MEMBER(tessedit_adaption_debug, false,
"Generate and print debug"
" information for adaption",
this->params()),
INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
INT_MEMBER(applybox_page, 0,
"Page number to apply boxes from", this->params()),
STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows"
INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
this->params()),
STRING_MEMBER(applybox_exposure_pattern, ".exp",
"Exposure value follows"
" this pattern in the image filename. The name of the image"
" files are expected to be in the form"
" [lang].[fontname].exp[num].tif", this->params()),
" [lang].[fontname].exp[num].tif",
this->params()),
BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
"Learn both character fragments (as is done in the"
" special low exposure mode) as well as unfragmented"
" characters.", this->params()),
BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box"
" characters.",
this->params()),
BOOL_MEMBER(applybox_learn_ngrams_mode, false,
"Each bounding box"
" is assumed to contain ngrams. Only learn the ngrams"
" whose outlines overlap horizontally.", this->params()),
BOOL_MEMBER(tessedit_display_outwords, false,
"Draw output words", this->params()),
BOOL_MEMBER(tessedit_dump_choices, false,
"Dump char choices", this->params()),
" whose outlines overlap horizontally.",
this->params()),
BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
this->params()),
BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
this->params()),
BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
this->params()),
BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
"Try to improve fuzzy spaces", this->params()),
BOOL_MEMBER(tessedit_unrej_any_wd, false,
"Dont bother with word plausibility", this->params()),
BOOL_MEMBER(tessedit_fix_hyphens, true,
"Crunch double hyphens?", this->params()),
BOOL_MEMBER(tessedit_redo_xheight, true,
"Check/Correct x-height", this->params()),
BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
this->params()),
BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
this->params()),
BOOL_MEMBER(tessedit_enable_doc_dict, true,
"Add words to the document dictionary", this->params()),
BOOL_MEMBER(tessedit_debug_fonts, false,
"Output font info per char", this->params()),
BOOL_MEMBER(tessedit_debug_block_rejection, false,
"Block and Row stats", this->params()),
BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
this->params()),
BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
this->params()),
BOOL_MEMBER(tessedit_enable_bigram_correction, true,
"Enable correction based on the word bigram dictionary.",
this->params()),
@ -123,15 +156,42 @@ Tesseract::Tesseract()
INT_MEMBER(tessedit_bigram_debug, 0,
"Amount of debug output for bigram correction.",
this->params()),
BOOL_MEMBER(enable_noise_removal, true,
"Remove and conditionally reassign small outlines when they"
" confuse layout analysis, determining diacritics vs noise",
this->params()),
INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
this->params()),
// Worst (min) certainty, for which a diacritic is allowed to make the
// base
// character worse and still be included.
double_MEMBER(noise_cert_basechar, -8.0,
"Hingepoint for base char certainty", this->params()),
// Worst (min) certainty, for which a non-overlapping diacritic is allowed
// to make the base character worse and still be included.
double_MEMBER(noise_cert_disjoint, -1.0,
"Hingepoint for disjoint certainty", this->params()),
// Worst (min) certainty, for which a diacritic is allowed to make a new
// stand-alone blob.
double_MEMBER(noise_cert_punc, -3.0,
"Threshold for new punc char certainty", this->params()),
// Factor of certainty margin for adding diacritics to not count as worse.
double_MEMBER(noise_cert_factor, 0.375,
"Scaling on certainty diff from Hingepoint",
this->params()),
INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
this->params()),
INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
this->params()),
INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
BOOL_MEMBER(debug_acceptable_wds, false,
"Dump word pass/fail chk", this->params()),
STRING_MEMBER(chs_leading_punct, "('`\"",
"Leading punctuation", this->params()),
STRING_MEMBER(chs_trailing_punct1, ").,;:?!",
"1st Trailing punctuation", this->params()),
STRING_MEMBER(chs_trailing_punct2, ")'`\"",
"2nd Trailing punctuation", this->params()),
BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
this->params()),
STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
this->params()),
STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
this->params()),
STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
this->params()),
double_MEMBER(quality_rej_pc, 0.08,
"good_quality_doc lte rejection limit", this->params()),
double_MEMBER(quality_blob_pc, 0.0,
@ -140,16 +200,16 @@ Tesseract::Tesseract()
"good_quality_doc lte outline error limit", this->params()),
double_MEMBER(quality_char_pc, 0.95,
"good_quality_doc gte good char limit", this->params()),
INT_MEMBER(quality_min_initial_alphas_reqd, 2,
"alphas in a good word", this->params()),
INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
this->params()),
INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
"Adaptation decision algorithm for tess", this->params()),
BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
"Do minimal rejection on pass 1 output", this->params()),
BOOL_MEMBER(tessedit_test_adaption, false,
"Test adaption criteria", this->params()),
BOOL_MEMBER(tessedit_matcher_log, false,
"Log matcher activity", this->params()),
BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
this->params()),
BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
this->params()),
INT_MEMBER(tessedit_test_adaption_mode, 3,
"Adaptation decision algorithm for tess", this->params()),
BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
@ -159,18 +219,19 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(paragraph_text_based, true,
"Run paragraph detection on the post-text-recognition "
"(more accurate)", this->params()),
"(more accurate)",
this->params()),
INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
this->params()),
STRING_MEMBER(outlines_2, "ij!?%\":;",
"Non standard number of outlines", this->params()),
STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
this->params()),
BOOL_MEMBER(docqual_excuse_outline_errs, false,
"Allow outline errs in unrejection?", this->params()),
BOOL_MEMBER(tessedit_good_quality_unrej, true,
"Reduce rejection on good docs", this->params()),
BOOL_MEMBER(tessedit_use_reject_spaces, true,
"Reject spaces?", this->params()),
BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
this->params()),
double_MEMBER(tessedit_reject_doc_percent, 65.00,
"%rej allowed before rej whole doc", this->params()),
double_MEMBER(tessedit_reject_block_percent, 45.00,
@ -179,7 +240,8 @@ Tesseract::Tesseract()
"%rej allowed before rej whole row", this->params()),
double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
"Number of row rejects in whole word rejects"
"which prevents whole row rejection", this->params()),
"which prevents whole row rejection",
this->params()),
BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
"Only rej partially rejected words in block rejection",
this->params()),
@ -199,63 +261,63 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
"Reject all bad quality wds", this->params()),
BOOL_MEMBER(tessedit_debug_doc_rejection, false,
"Page stats", this->params()),
BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
this->params()),
BOOL_MEMBER(tessedit_debug_quality_metrics, false,
"Output data to debug file", this->params()),
BOOL_MEMBER(bland_unrej, false,
"unrej potential with no chekcs", this->params()),
BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs",
this->params()),
double_MEMBER(quality_rowrej_pc, 1.1,
"good_quality_doc gte good char limit", this->params()),
BOOL_MEMBER(unlv_tilde_crunching, true,
"Mark v.bad words for tilde crunch", this->params()),
BOOL_MEMBER(hocr_font_info, false,
"Add font info to hocr output", this->params()),
BOOL_MEMBER(crunch_early_merge_tess_fails, true,
"Before word crunch?", this->params()),
BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
this->params()),
BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
this->params()),
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
"Take out ~^ early?", this->params()),
double_MEMBER(crunch_terrible_rating, 80.0,
"crunch rating lt this", this->params()),
double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
this->params()),
BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
double_MEMBER(crunch_poor_garbage_cert, -9.0,
"crunch garbage cert lt this", this->params()),
double_MEMBER(crunch_poor_garbage_rate, 60,
"crunch garbage rating lt this", this->params()),
double_MEMBER(crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this", this->params()),
double_MEMBER(crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this", this->params()),
BOOL_MEMBER(crunch_pot_garbage, true,
"POTENTIAL crunch garbage", this->params()),
double_MEMBER(crunch_del_rating, 60,
"POTENTIAL crunch rating lt this", this->params()),
double_MEMBER(crunch_del_cert, -10.0,
"POTENTIAL crunch cert lt this", this->params()),
double_MEMBER(crunch_del_min_ht, 0.7,
"Del if word ht lt xht x this", this->params()),
double_MEMBER(crunch_del_max_ht, 3.0,
"Del if word ht gt xht x this", this->params()),
double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
this->params()),
double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
this->params()),
BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
this->params()),
double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
this->params()),
double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
this->params()),
double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
this->params()),
double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
this->params()),
double_MEMBER(crunch_del_min_width, 3.0,
"Del if word width lt xht x this", this->params()),
double_MEMBER(crunch_del_high_word, 1.5,
"Del if word gt xht x this above bl", this->params()),
double_MEMBER(crunch_del_low_word, 0.5,
"Del if word gt xht x this below bl", this->params()),
double_MEMBER(crunch_small_outlines_size, 0.6,
"Small if lt xht x this", this->params()),
INT_MEMBER(crunch_rating_max, 10,
"For adj length in rating per ch", this->params()),
double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
this->params()),
INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
this->params()),
INT_MEMBER(crunch_pot_indicators, 1,
"How many potential indicators needed", this->params()),
BOOL_MEMBER(crunch_leave_ok_strings, true,
"Dont touch sensible strings", this->params()),
BOOL_MEMBER(crunch_accept_ok, true,
"Use acceptability in okstring", this->params()),
BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
this->params()),
BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
this->params()),
BOOL_MEMBER(crunch_leave_accept_strings, false,
"Dont pot crunch sensible strings", this->params()),
BOOL_MEMBER(crunch_include_numerals, false,
"Fiddle alpha figures", this->params()),
BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
this->params()),
INT_MEMBER(crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings",
this->params()),
@ -267,14 +329,14 @@ Tesseract::Tesseract()
INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
INT_MEMBER(fixsp_non_noise_limit, 1,
"How many non-noise blbs either side?", this->params()),
double_MEMBER(fixsp_small_outlines_size, 0.28,
"Small if lt xht x this", this->params()),
double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
this->params()),
BOOL_MEMBER(tessedit_prefer_joined_punct, false,
"Reward punctation joins", this->params()),
INT_MEMBER(fixsp_done_mode, 1,
"What constitues done for spacing", this->params()),
INT_MEMBER(debug_fix_space_level, 0,
"Contextual fixspace debug", this->params()),
INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
this->params()),
INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
this->params()),
STRING_MEMBER(numeric_punctuation, ".,",
"Punct. chs expected WITHIN numbers", this->params()),
INT_MEMBER(x_ht_acceptance_tolerance, 8,
@ -282,13 +344,18 @@ Tesseract::Tesseract()
this->params()),
INT_MEMBER(x_ht_min_change, 8,
"Min change in xht before actually trying it", this->params()),
INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer",
this->params()),
double_MEMBER(superscript_worse_certainty, 2.0, "How many times worse "
INT_MEMBER(superscript_debug, 0,
"Debug level for sub & superscript fixer", this->params()),
double_MEMBER(
superscript_worse_certainty, 2.0,
"How many times worse "
"certainty does a superscript position glyph need to be for "
"us to try classifying it as a char with a different "
"baseline?", this->params()),
double_MEMBER(superscript_bettered_certainty, 0.97, "What reduction in "
"baseline?",
this->params()),
double_MEMBER(
superscript_bettered_certainty, 0.97,
"What reduction in "
"badness do we think sufficient to choose a superscript "
"over what we'd thought. For example, a value of 0.6 means "
"we want to reduce badness of certainty by at least 40%",
@ -301,30 +368,24 @@ Tesseract::Tesseract()
double_MEMBER(subscript_max_y_top, 0.5,
"Maximum top of a character measured as a multiple of "
"x-height above the baseline for us to reconsider whether "
"it's a subscript.", this->params()),
"it's a subscript.",
this->params()),
double_MEMBER(superscript_min_y_bottom, 0.3,
"Minimum bottom of a character measured as a multiple of "
"x-height above the baseline for us to reconsider whether "
"it's a superscript.", this->params()),
"it's a superscript.",
this->params()),
BOOL_MEMBER(tessedit_write_block_separators, false,
"Write block separators in output", this->params()),
BOOL_MEMBER(tessedit_write_rep_codes, false,
"Write repetition char code", this->params()),
BOOL_MEMBER(tessedit_write_unlv, false,
"Write .unlv output file", this->params()),
BOOL_MEMBER(tessedit_create_hocr, false,
"Write .html hOCR output file", this->params()),
BOOL_MEMBER(tessedit_create_pdf, false,
"Write .pdf output file", this->params()),
INT_MEMBER(tessedit_pdf_compression, 0,
"Type of image compression in pdf output: "
"0 - autoselection (default); "
"1 - jpeg; "
"2 - G4; "
"3 - flate",
BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
this->params()),
INT_MEMBER(tessedit_pdf_jpg_quality, 85,
"Quality level of jpeg image compression in pdf output",
BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
this->params()),
BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
this->params()),
STRING_MEMBER(unrecognised_char, "|",
"Output char for unidentified blobs", this->params()),
@ -333,75 +394,81 @@ Tesseract::Tesseract()
"Min suspect level for rejecting spaces", this->params()),
INT_MEMBER(suspect_short_words, 2,
"Dont Suspect dict wds longer than this", this->params()),
BOOL_MEMBER(suspect_constrain_1Il, false,
"UNLV keep 1Il chars rejected", this->params()),
double_MEMBER(suspect_rating_per_ch, 999.9,
"Dont touch bad rating limit", this->params()),
double_MEMBER(suspect_accept_rating, -999.9,
"Accept good rating limit", this->params()),
BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
this->params()),
double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
this->params()),
double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
this->params()),
BOOL_MEMBER(tessedit_minimal_rejection, false,
"Only reject tess failures", this->params()),
BOOL_MEMBER(tessedit_zero_rejection, false,
"Dont reject ANYTHING", this->params()),
BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
this->params()),
BOOL_MEMBER(tessedit_word_for_word, false,
"Make output have exactly one word per WERD", this->params()),
BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
"Dont reject ANYTHING AT ALL", this->params()),
BOOL_MEMBER(tessedit_consistent_reps, true,
"Force all rep chars the same", this->params()),
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()),
BOOL_MEMBER(tessedit_rejection_debug, false,
"Adaption debug", this->params()),
BOOL_MEMBER(tessedit_flip_0O, true,
"Contextual 0O O0 flips", this->params()),
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
this->params()),
BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
this->params()),
BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
this->params()),
double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
"Aspect ratio dot/hyphen test", this->params()),
double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
"Aspect ratio dot/hyphen test", this->params()),
BOOL_MEMBER(rej_trust_doc_dawg, false,
"Use DOC dawg in 11l conf. detector", this->params()),
BOOL_MEMBER(rej_1Il_use_dict_word, false,
"Use dictword test", this->params()),
BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
"Dont double check", this->params()),
BOOL_MEMBER(rej_use_tess_accepted, true,
"Individual rejection control", this->params()),
BOOL_MEMBER(rej_use_tess_blanks, true,
"Individual rejection control", this->params()),
BOOL_MEMBER(rej_use_good_perm, true,
"Individual rejection control", this->params()),
BOOL_MEMBER(rej_use_sensible_wd, false,
"Extend permuter check", this->params()),
BOOL_MEMBER(rej_alphas_in_number_perm, false,
"Extend permuter check", this->params()),
BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
this->params()),
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
this->params()),
BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
this->params()),
BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
this->params()),
BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
this->params()),
BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
this->params()),
BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
this->params()),
double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
"if >this fract", this->params()),
INT_MEMBER(tessedit_image_border, 2,
"Rej blbs near image edge limit", this->params()),
INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
this->params()),
STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
"Allow NN to unrej", this->params()),
STRING_MEMBER(conflict_set_I_l_1, "Il1[]",
"Il1 conflict set", this->params()),
INT_MEMBER(min_sane_x_ht_pixels, 8,
"Reject any x-ht lt or eq than this", this->params()),
BOOL_MEMBER(tessedit_create_boxfile, false,
"Output text with boxes", this->params()),
INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages"
" , else specifc page to process", this->params()),
STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
this->params()),
INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
this->params()),
BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
this->params()),
INT_MEMBER(tessedit_page_number, -1,
"-1 -> All pages"
" , else specifc page to process",
this->params()),
BOOL_MEMBER(tessedit_write_images, false,
"Capture the image from the IPE", this->params()),
BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
this->params()),
STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
BOOL_MEMBER(tessedit_override_permuter, true,
"According to dict_word", this->params()),
INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for"
" TessdataManager functions.", this->params()),
BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
this->params()),
INT_MEMBER(tessdata_manager_debug_level, 0,
"Debug level for"
" TessdataManager functions.",
this->params()),
STRING_MEMBER(tessedit_load_sublangs, "",
"List of languages to load with this one", this->params()),
BOOL_MEMBER(tessedit_use_primary_params_model, false,
"In multilingual mode use params model of the"
" primary language", this->params()),
" primary language",
this->params()),
double_MEMBER(min_orientation_margin, 7.0,
"Min acceptable orientation margin", this->params()),
BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
@ -414,13 +481,37 @@ Tesseract::Tesseract()
BOOL_INIT_MEMBER(tessedit_init_config_only, false,
"Only initialize with the config file. Useful if the "
"instance is not going to be used for OCR but say only "
"for layout analysis.", this->params()),
"for layout analysis.",
this->params()),
BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
this->params()),
BOOL_MEMBER(textord_tabfind_vertical_text, true,
"Enable vertical detection", this->params()),
BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
"Force using vertical text page mode", this->params()),
double_MEMBER(
textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page "
"mode",
this->params()),
double_MEMBER(
textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.",
this->params()),
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
this->params()),
BOOL_MEMBER(preserve_interword_spaces, false,
"Preserve multiple interword spaces", this->params()),
BOOL_MEMBER(include_page_breaks, FALSE,
"Include page separator string in output text after each "
"image/page.",
this->params()),
STRING_MEMBER(page_separator, "\f",
"Page separator (default is form feed control character)",
this->params()),
// The following parameters were deprecated and removed from their original
// The following parameters were deprecated and removed from their
// original
// locations. The parameters are temporarily kept here to give Tesseract
// users a chance to updated their [lang].traineddata and config files
// without introducing failures during Tesseract initialization.
@ -428,21 +519,25 @@ Tesseract::Tesseract()
// reasonably sure that Tesseract users have updated their data files.
//
// BEGIN DEPRECATED PARAMETERS
INT_MEMBER(tessedit_ok_mode, 5,
"Acceptance decision algorithm", this->params()),
BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
"find horizontal lines such as headers in vertical page mode",
this->params()),
INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm",
this->params()),
BOOL_INIT_MEMBER(load_fixed_length_dawgs, true,
"Load fixed length dawgs"
" (e.g. for non-space delimited languages)",
this->params()),
INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
this->params()),
BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
this->params()),
double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of"
double_MEMBER(bestrate_pruning_factor, 2.0,
"Multiplying factor of"
" current best rate to prune other hypotheses",
this->params()),
BOOL_MEMBER(permute_script_word, 0,
"Turn on word script consistency permuter",
this->params()),
"Turn on word script consistency permuter", this->params()),
BOOL_MEMBER(segment_segcost_rating, 0,
"incorporate segmentation cost in word rating?",
this->params()),
@ -475,7 +570,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(use_new_state_cost, FALSE,
"use new state cost heuristics for segmentation state"
" evaluation", this->params()),
" evaluation",
this->params()),
double_MEMBER(heuristic_segcost_rating_base, 1.25,
"base factor for adding segmentation cost into word rating."
"It's a multiplying factor, the larger the value above 1, "
@ -483,10 +579,12 @@ Tesseract::Tesseract()
this->params()),
double_MEMBER(heuristic_weight_rating, 1.0,
"weight associated with char rating in combined cost of"
"state", this->params()),
"state",
this->params()),
double_MEMBER(heuristic_weight_width, 1000.0,
"weight associated with width evidence in combined cost of"
" state", this->params()),
" state",
this->params()),
double_MEMBER(heuristic_weight_seamcut, 0.0,
"weight associated with seam cut in combined cost of state",
this->params()),
@ -515,8 +613,10 @@ Tesseract::Tesseract()
reskew_(1.0f, 0.0f),
most_recently_used_(this),
font_table_size_(0),
#ifndef ANDROID_BUILD
cube_cntxt_(NULL),
tess_cube_combiner_(NULL),
#endif
equ_detect_(NULL) {
}
@ -524,6 +624,7 @@ Tesseract::~Tesseract() {
Clear();
end_tesseract();
sub_langs_.delete_data_pointers();
#ifndef ANDROID_BUILD
// Delete cube objects.
if (cube_cntxt_ != NULL) {
delete cube_cntxt_;
@ -533,6 +634,7 @@ Tesseract::~Tesseract() {
delete tess_cube_combiner_;
tess_cube_combiner_ = NULL;
}
#endif
}
void Tesseract::Clear() {
@ -573,11 +675,13 @@ void Tesseract::ResetDocumentDictionary() {
void Tesseract::SetBlackAndWhitelist() {
// Set the white and blacklists (if any)
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string());
tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
// Black and white lists should apply to all loaded classifiers.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->unicharset.set_black_and_whitelist(
tessedit_char_blacklist.string(), tessedit_char_whitelist.string());
tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
}
}

View File

@ -1,7 +1,12 @@
///////////////////////////////////////////////////////////////////////
// File: tesseractclass.h
// Description: An instance of Tesseract. For thread safety, *every*
// Description: The Tesseract class. It holds/owns everything needed
// to run Tesseract on a single language, and also a set of
// sub-Tesseracts to run sub-languages. For thread safety, *every*
// global variable goes in here, directly, or indirectly.
// This makes it safe to run multiple Tesseracts in different
// threads in parallel, and keeps the different language
// instances separate.
// Author: Ray Smith
// Created: Fri Mar 07 08:17:01 PST 2008
//
@ -92,12 +97,16 @@ class WERD_RES;
namespace tesseract {
class ColumnFinder;
#ifndef ANDROID_BUILD
class CubeLineObject;
class CubeObject;
class CubeRecoContext;
#endif
class EquationDetect;
class Tesseract;
#ifndef ANDROID_BUILD
class TesseractCubeCombiner;
#endif
// A collection of various variables for statistics and debugging.
struct TesseractStats {
@ -245,6 +254,15 @@ class Tesseract : public Wordrec {
Tesseract* get_sub_lang(int index) const {
return sub_langs_[index];
}
// Returns true if any language uses Tesseract (as opposed to cube).
bool AnyTessLang() const {
// The main language counts if its engine mode is anything but cube-only.
if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true;
// Otherwise, any loaded sub-language with a non-cube engine mode counts.
for (int i = 0; i < sub_langs_.size(); ++i) {
if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY)
return true;
}
return false;
}
void SetBlackAndWhitelist();
@ -265,8 +283,8 @@ class Tesseract : public Wordrec {
int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
Tesseract* osd_tess, OSResults* osr);
void SetupWordScripts(BLOCK_LIST* blocks);
int AutoPageSeg(PageSegMode pageseg_mode,
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
Tesseract* osd_tess, OSResults* osr);
ColumnFinder* SetupPageSegAndDetectOrientation(
bool single_column, bool osd, bool only_osd,
@ -310,8 +328,46 @@ class Tesseract : public Wordrec {
WordRecognizer recognizer,
WERD_RES** in_word,
PointerVector<WERD_RES>* best_words);
void classify_word_and_language(WordRecognizer recognizer,
PAGE_RES_IT* pr_it,
// Moves good-looking "noise"/diacritics from the reject list to the main
// blob list on the current word. Returns true if anything was done, and
// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
bool* make_next_word_fuzzy);
// Attempts to put noise/diacritic outlines into the blobs that they overlap.
// Input: a set of noisy outlines that probably belong to the real_word.
// Output: outlines that overlapped blobs are set to NULL and put back into
// the word, either in the blobs or in the reject list.
void AssignDiacriticsToOverlappingBlobs(
const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
GenericVector<bool>* overlapped_any_blob,
GenericVector<C_BLOB*>* target_blobs);
// Attempts to assign non-overlapping outlines to their nearest blobs or
// make new blobs out of them.
void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
int pass, WERD* real_word, PAGE_RES_IT* pr_it,
GenericVector<bool>* word_wanted,
GenericVector<C_BLOB*>* target_blobs);
// Starting with ok_outlines set to indicate which outlines overlap the blob,
// chooses the optimal set (approximately) and returns true if any outlines
// are desired, in which case ok_outlines indicates which ones.
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
PAGE_RES_IT* pr_it, C_BLOB* blob,
const GenericVector<C_OUTLINE*>& outlines,
int num_outlines,
GenericVector<bool>* ok_outlines);
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
const GenericVector<C_OUTLINE*>& outlines,
int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
STRING* best_str);
// Classifies the given blob (part of word_data->word->word) as an individual
// word, using languages, chopper etc, returning only the certainty of the
// best raw choice, and undoing all the work done to fake out the word.
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
STRING* best_str, float* c2);
void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
WordData* word_data);
void classify_word_pass1(const WordData& word_data,
WERD_RES** in_word,
@ -332,6 +388,11 @@ class Tesseract : public Wordrec {
WERD_RES* word, WERD_RES* new_word);
bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
// Runs recognition with the test baseline shift and x-height and returns true
// if there was an improvement in recognition result.
bool TestNewNormalization(int original_misfits, float baseline_shift,
float new_x_ht, WERD_RES *word, BLOCK* block,
ROW *row);
BOOL8 recog_interactive(PAGE_RES_IT* pr_it);
// Set fonts of this word.
@ -368,6 +429,7 @@ class Tesseract : public Wordrec {
int *right_ok) const;
//// cube_control.cpp ///////////////////////////////////////////////////
#ifndef ANDROID_BUILD
bool init_cube_objects(bool load_combiner,
TessdataManager *tessdata_manager);
// Iterates through tesseract's results and calls cube on each word,
@ -393,6 +455,7 @@ class Tesseract : public Wordrec {
Boxa** char_boxes, CharSamp*** char_samples);
bool create_cube_box_word(Boxa *char_boxes, int num_chars,
TBOX word_box, BoxWord* box_word);
#endif
//// output.h //////////////////////////////////////////////////////////
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
@ -699,8 +762,8 @@ class Tesseract : public Wordrec {
// Creates a fake best_choice entry in each WERD_RES with the correct text.
void CorrectClassifyWords(PAGE_RES* page_res);
// Call LearnWord to extract features for labelled blobs within each word.
// Features are written to the given filename.
void ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res);
// Features are stored in an internal buffer.
void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
//// fixxht.cpp ///////////////////////////////////////////////////////
// Returns the number of misfit blob tops in this word.
@ -709,7 +772,7 @@ class Tesseract : public Wordrec {
// maximally compatible with the result in word_res.
// Returns 0.0f if no x-height is found that is better than the current
// estimate.
float ComputeCompatibleXheight(WERD_RES *word_res);
float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift);
//// Data members ///////////////////////////////////////////////////////
// TODO(ocr-team): Find and remove obsolete parameters.
BOOL_VAR_H(tessedit_resegment_from_boxes, false,
@ -734,6 +797,8 @@ class Tesseract : public Wordrec {
"Blacklist of chars not to recognize");
STRING_VAR_H(tessedit_char_whitelist, "",
"Whitelist of chars to recognize");
STRING_VAR_H(tessedit_char_unblacklist, "",
"List of chars to override tessedit_char_blacklist");
BOOL_VAR_H(tessedit_ambigs_training, false,
"Perform training for ambiguities");
INT_VAR_H(pageseg_devanagari_split_strategy,
@ -781,6 +846,24 @@ class Tesseract : public Wordrec {
"Enable single word correction based on the dictionary.");
INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
"correction.");
BOOL_VAR_H(enable_noise_removal, true,
"Remove and conditionally reassign small outlines when they"
" confuse layout analysis, determining diacritics vs noise");
INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
// Worst (min) certainty, for which a diacritic is allowed to make the base
// character worse and still be included.
double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
// Worst (min) certainty, for which a non-overlapping diacritic is allowed to
// make the base character worse and still be included.
double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
// Worst (min) certainty, for which a diacritic is allowed to make a new
// stand-alone blob.
double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
// Factor of certainty margin for adding diacritics to not count as worse.
double_VAR_H(noise_cert_factor, 0.375,
"Scaling on certainty diff from Hingepoint");
INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
@ -918,15 +1001,9 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_write_rep_codes, false,
"Write repetition char code");
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
BOOL_VAR_H(tessedit_create_txt, true, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
INT_VAR_H(tessedit_pdf_compression, 0, "Type of image encoding in pdf output:"
"0 - autoselection (default); "
"1 - jpeg; "
"2 - G4; "
"3 - flate");
INT_VAR_H(tessedit_pdf_jpg_quality, 85, "Quality level of jpeg image "
"compression in pdf output");
STRING_VAR_H(unrecognised_char, "|",
"Output char for unidentified blobs");
INT_VAR_H(suspect_level, 99, "Suspect marker level");
@ -990,7 +1067,22 @@ class Tesseract : public Wordrec {
"Only initialize with the config file. Useful if the instance is "
"not going to be used for OCR but say only for layout analysis.");
BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
"Force using vertical text page mode");
double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
"Fraction of textlines deemed vertical to use vertical page "
"mode");
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs.");
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
BOOL_VAR_H(preserve_interword_spaces, false,
"Preserve multiple interword spaces");
BOOL_VAR_H(include_page_breaks, false,
"Include page separator string in output text after each "
"image/page.");
STRING_VAR_H(page_separator, "\f",
"Page separator (default is form feed control character)");
// The following parameters were deprecated and removed from their original
// locations. The parameters are temporarily kept here to give Tesseract
@ -1000,6 +1092,8 @@ class Tesseract : public Wordrec {
// reasonably sure that Tesseract users have updated their data files.
//
// BEGIN DEPRECATED PARAMETERS
BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true,
"find horizontal lines such as headers in vertical page mode");
INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
" dawgs (e.g. for non-space delimited languages)");
@ -1062,7 +1156,9 @@ class Tesseract : public Wordrec {
PAGE_RES_IT* pr_it,
FILE *output_file);
#ifndef ANDROID_BUILD
inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }
#endif
private:
// The filename of a backup config file. If not null, then we currently
@ -1102,9 +1198,11 @@ class Tesseract : public Wordrec {
Tesseract* most_recently_used_;
// The size of the font table, ie max possible font id + 1.
int font_table_size_;
#ifndef ANDROID_BUILD
// Cube objects.
CubeRecoContext* cube_cntxt_;
TesseractCubeCombiner *tess_cube_combiner_;
#endif
// Equation detector. Note: this pointer is NOT owned by the class.
EquationDetect* equ_detect_;
};

View File

@ -254,7 +254,7 @@ void Tesseract::join_words(WERD_RES *word,
// Move the word2 seams onto the end of the word1 seam_array.
// Since the seam list is one element short, an empty seam marking the
// end of the last blob in the first word is needed first.
word->seam_array.push_back(new SEAM(0.0f, split_pt, NULL, NULL, NULL));
word->seam_array.push_back(new SEAM(0.0f, split_pt));
word->seam_array += word2->seam_array;
word2->seam_array.truncate(0);
// Fix widths and gaps.

View File

@ -137,6 +137,9 @@ class BLOBNBOX:public ELIST_LINK
cblob_ptr = srcblob;
area = static_cast<int>(srcblob->area());
}
// Destructor: frees the wrapped C_BLOB only when ownership has been
// transferred to this BLOBNBOX via set_owns_cblob(true); see owns_cblob_.
~BLOBNBOX() {
if (owns_cblob_) delete cblob_ptr;
}
static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
C_BLOB* blob = new C_BLOB(outline);
return new BLOBNBOX(blob);
@ -387,6 +390,7 @@ class BLOBNBOX:public ELIST_LINK
void set_base_char_blob(BLOBNBOX* blob) {
base_char_blob_ = blob;
}
void set_owns_cblob(bool value) { owns_cblob_ = value; }
bool UniquelyVertical() const {
return vert_possible_ && !horz_possible_;
@ -450,6 +454,7 @@ class BLOBNBOX:public ELIST_LINK
// construction time.
void ConstructionInit() {
cblob_ptr = NULL;
owns_cblob_ = false;
area = 0;
area_stroke_width_ = 0.0f;
horz_stroke_width_ = 0.0f;
@ -525,6 +530,10 @@ class BLOBNBOX:public ELIST_LINK
bool vert_possible_; // Could be part of vertical flow.
bool leader_on_left_; // There is a leader to the left.
bool leader_on_right_; // There is a leader to the right.
// Iff true, then the destructor should delete the cblob_ptr.
// TODO(rays) migrate all uses to correctly setting this flag instead of
// deleting the C_BLOB before deleting the BLOBNBOX.
bool owns_cblob_;
};
class TO_ROW: public ELIST2_LINK

View File

@ -64,6 +64,42 @@ const TPOINT kDivisibleVerticalItalic(1, 5);
CLISTIZE(EDGEPT);
// Returns true when the two line segments cross each other.
// (Moved from outlines.cpp).
// Finds where the projected lines would cross and then checks to see if the
// point of intersection lies on both of the line segments. If it does
// then these two segments cross.
/* static */
// Segments (a0,a1) and (b0,b1) strictly cross iff the endpoints of each
// segment lie on opposite sides of the line through the other segment.
// Each side test is the sign of a 2D cross product of difference vectors.
bool TPOINT::IsCrossed(const TPOINT& a0, const TPOINT& a1, const TPOINT& b0,
const TPOINT& b1) {
int b0a1xb0b1, b0b1xb0a0;
int a1b1xa1a0, a1a0xa1b0;
// Difference vectors, named source-to-target: b0a1 = a1 - b0, etc.
TPOINT b0a1, b0a0, a1b1, b0b1, a1a0;
b0a1.x = a1.x - b0.x;
b0a0.x = a0.x - b0.x;
a1b1.x = b1.x - a1.x;
b0b1.x = b1.x - b0.x;
a1a0.x = a0.x - a1.x;
b0a1.y = a1.y - b0.y;
b0a0.y = a0.y - b0.y;
a1b1.y = b1.y - a1.y;
b0b1.y = b1.y - b0.y;
a1a0.y = a0.y - a1.y;
b0a1xb0b1 = CROSS(b0a1, b0b1);
b0b1xb0a0 = CROSS(b0b1, b0a0);
a1b1xa1a0 = CROSS(a1b1, a1a0);
// For clarity we want CROSS(a1a0, a1b0) here, but a1b0 == -b0a1, so
// -CROSS(a1a0, b0a1) yields the same value.
a1a0xa1b0 = -CROSS(a1a0, b0a1);
// First clause: a0 and a1 straddle line (b0,b1); second clause: b0 and b1
// straddle line (a0,a1). Zero cross products (collinear touching) are
// strictly excluded, so they do not count as crossing.
return ((b0a1xb0b1 > 0 && b0b1xb0a0 > 0) ||
(b0a1xb0b1 < 0 && b0b1xb0a0 < 0)) &&
((a1b1xa1a0 > 0 && a1a0xa1b0 > 0) || (a1b1xa1a0 < 0 && a1a0xa1b0 < 0));
}
// Consume the circular list of EDGEPTs to make a TESSLINE.
TESSLINE* TESSLINE::BuildFromOutlineList(EDGEPT* outline) {
TESSLINE* result = new TESSLINE;
@ -454,6 +490,36 @@ TBOX TBLOB::bounding_box() const {
return box;
}
// Finds and deletes any duplicate outlines in this blob, without deleting
// their EDGEPTs.
void TBLOB::EliminateDuplicateOutlines() {
// Duplicates are detected purely by identical bounding boxes (SameBox).
for (TESSLINE* outline = outlines; outline != NULL; outline = outline->next) {
// last_outline trails other_outline so the singly-linked list can be
// re-linked around a deleted node.
TESSLINE* last_outline = outline;
for (TESSLINE* other_outline = outline->next; other_outline != NULL;
last_outline = other_outline, other_outline = other_outline->next) {
if (outline->SameBox(*other_outline)) {
// Unlink the duplicate from the list.
last_outline->next = other_outline->next;
// This doesn't leak - the outlines share the EDGEPTs.
other_outline->loop = NULL;
delete other_outline;
// Step back to the surviving node so the for-update advances from it.
other_outline = last_outline;
// If it is part of a cut, then it can't be a hole any more.
outline->is_hole = false;
}
}
}
}
// Keeps outline centers in increasing x: if this blob's bounding-box
// center lies to the right of next's, the two blobs' outline lists are
// exchanged.
void TBLOB::CorrectBlobOrder(TBLOB* next) {
  const TBOX my_box = bounding_box();
  const TBOX their_box = next->bounding_box();
  if (my_box.x_middle() > their_box.x_middle()) {
    Swap(&outlines, &next->outlines);
  }
}
#ifndef GRAPHICS_DISABLED
void TBLOB::plot(ScrollView* window, ScrollView::Color color,
ScrollView::Color child_color) {
@ -739,8 +805,8 @@ TWERD* TWERD::PolygonalCopy(bool allow_detailed_fx, WERD* src) {
// Baseline normalizes the blobs in-place, recording the normalization in the
// DENORMs in the blobs.
void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix,
bool inverse, float x_height, bool numeric_mode,
tesseract::OcrEngineMode hint,
bool inverse, float x_height, float baseline_shift,
bool numeric_mode, tesseract::OcrEngineMode hint,
const TBOX* norm_box,
DENORM* word_denorm) {
TBOX word_box = bounding_box();
@ -756,7 +822,7 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix,
if (hint == tesseract::OEM_CUBE_ONLY)
scale = 1.0f;
} else {
input_y_offset = row->base_line(word_middle);
input_y_offset = row->base_line(word_middle) + baseline_shift;
}
for (int b = 0; b < blobs.size(); ++b) {
TBLOB* blob = blobs[b];
@ -769,7 +835,7 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix,
blob_scale = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()),
scale, scale * 1.5f);
} else if (row != NULL && hint != tesseract::OEM_CUBE_ONLY) {
baseline = row->base_line(mid_x);
baseline = row->base_line(mid_x) + baseline_shift;
}
// The image will be 8-bit grey if the input was grey or color. Note that in
// a grey image 0 is black and 255 is white. If the input was binary, then
@ -858,18 +924,6 @@ void TWERD::plot(ScrollView* window) {
}
#endif // GRAPHICS_DISABLED
/**********************************************************************
 * blob_origin
 *
 * Return in *origin the centre of the blob's bounding box.
 **********************************************************************/
void blob_origin(TBLOB *blob,       // blob to compute on
                 TPOINT *origin) {  // return value
  const TBOX box = blob->bounding_box();
  *origin = (box.topleft() + box.botright()) / 2;
}
/**********************************************************************
* divisible_blob
*

View File

@ -60,6 +60,13 @@ struct TPOINT {
x /= divisor;
y /= divisor;
}
// Returns true if both coordinates match exactly.
bool operator==(const TPOINT& other) const {
return x == other.x && y == other.y;
}
// Returns true when the two line segments (a0,a1) and (b0,b1) strictly
// cross each other; collinear touching does not count.
// (Moved from outlines.cpp).
static bool IsCrossed(const TPOINT& a0, const TPOINT& a1, const TPOINT& b0,
const TPOINT& b1);
inT16 x; // absolute x coord.
inT16 y; // absolute y coord.
@ -87,6 +94,55 @@ struct EDGEPT {
start_step = src.start_step;
step_count = src.step_count;
}
// Squared distance between this point's position and other's, with the
// x component of the distance scaled by x_factor.
int WeightedDistance(const EDGEPT& other, int x_factor) const {
  const int dx = pos.x - other.pos.x;
  const int dy = pos.y - other.pos.y;
  return dy * dy + dx * dx * x_factor;
}
// Returns true if the positions are equal; no other EDGEPT fields are
// compared.
bool EqualPos(const EDGEPT& other) const { return pos == other.pos; }
// Bounding box of the outline chain from *this to *end, ignoring hidden
// edge flags. Stops early if the chain wraps back around to *this.
TBOX SegmentBox(const EDGEPT* end) const {
  int left = pos.x, right = pos.x;
  int bottom = pos.y, top = pos.y;
  const EDGEPT* pt = this;
  do {
    pt = pt->next;
    if (pt->pos.x < left) left = pt->pos.x;
    if (pt->pos.x > right) right = pt->pos.x;
    if (pt->pos.y < bottom) bottom = pt->pos.y;
    if (pt->pos.y > top) top = pt->pos.y;
  } while (pt != end && pt != this);
  return TBOX(left, bottom, right, top);
}
// Returns the area of the outline segment from *this to *end.
// Ignores hidden edge flags.
// NOTE(review): the value is a sum of cross products of each vertex's
// offset (relative to *this) with that vertex's step vector; for a closed
// loop this is proportional to the signed area - confirm the expected
// scale and sign convention at call sites.
int SegmentArea(const EDGEPT* end) const {
int area = 0;
const EDGEPT* pt = this->next;
do {
TPOINT origin_vec(pt->pos.x - pos.x, pt->pos.y - pos.y);
area += CROSS(origin_vec, pt->vec);
pt = pt->next;
} while (pt != end && pt != this);
return area;
}
// Returns true if the chain from *this reaches *end within min_points
// steps, and false if it either loops back to *this or exhausts the step
// budget first. Ignores hidden edge flags.
bool ShortNonCircularSegment(int min_points, const EDGEPT* end) const {
  const EDGEPT* pt = this;
  for (int count = 0; count <= min_points; ++count) {
    if (pt == end) return true;
    pt = pt->next;
    if (pt == this) return false;
  }
  return false;
}
// Accessors to hide or reveal a cut edge from feature extractors.
void Hide() {
flags[0] = true;
@ -100,9 +156,6 @@ struct EDGEPT {
void MarkChop() {
flags[2] = true;
}
void UnmarkChop() {
flags[2] = false;
}
bool IsChopPt() const {
return flags[2] != 0;
}
@ -162,8 +215,23 @@ struct TESSLINE {
void MinMaxCrossProduct(const TPOINT vec, int* min_xp, int* max_xp) const;
TBOX bounding_box() const;
// Returns true if *this and other have equal bounding boxes; used as the
// duplicate test in TBLOB::EliminateDuplicateOutlines.
bool SameBox(const TESSLINE& other) const {
return topleft == other.topleft && botright == other.botright;
}
// Returns true if the given line segment crosses any outline of this blob.
bool SegmentCrosses(const TPOINT& pt1, const TPOINT& pt2) const {
if (Contains(pt1) && Contains(pt2)) {
EDGEPT* pt = loop;
do {
if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) return true;
pt = pt->next;
} while (pt != loop);
}
return false;
}
// Returns true if the point is contained within the outline box.
bool Contains(const TPOINT& pt) {
bool Contains(const TPOINT& pt) const {
return topleft.x <= pt.x && pt.x <= botright.x &&
botright.y <= pt.y && pt.y <= topleft.y;
}
@ -244,6 +312,31 @@ struct TBLOB {
TBOX bounding_box() const;
// Returns true if the given line segment crosses any outline of this blob.
bool SegmentCrossesOutline(const TPOINT& pt1, const TPOINT& pt2) const {
for (const TESSLINE* outline = outlines; outline != NULL;
outline = outline->next) {
if (outline->SegmentCrosses(pt1, pt2)) return true;
}
return false;
}
// Returns true if the point is contained within any of the outline boxes.
bool Contains(const TPOINT& pt) const {
for (const TESSLINE* outline = outlines; outline != NULL;
outline = outline->next) {
if (outline->Contains(pt)) return true;
}
return false;
}
// Finds and deletes any duplicate outlines in this blob, without deleting
// their EDGEPTs.
void EliminateDuplicateOutlines();
// Swaps the outlines of *this and next if needed to keep the centers in
// increasing x.
void CorrectBlobOrder(TBLOB* next);
const DENORM& denorm() const {
return denorm_;
}
@ -317,7 +410,7 @@ struct TWERD {
// Baseline normalizes the blobs in-place, recording the normalization in the
// DENORMs in the blobs.
void BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, bool inverse,
float x_height, bool numeric_mode,
float x_height, float baseline_shift, bool numeric_mode,
tesseract::OcrEngineMode hint,
const TBOX* norm_box,
DENORM* word_denorm);
@ -358,12 +451,7 @@ if (w) memfree (w)
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
// TODO(rays) This will become a member of TBLOB when TBLOB's definition
// moves to blobs.h
// Returns the center of blob's bounding box in origin.
void blob_origin(TBLOB *blob, TPOINT *origin);
// TODO(rays) Make divisible_blob and divide_blobs members of TBLOB.
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location);
void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob,

View File

@ -78,7 +78,7 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
if (!ParseBoxFileStr(lines[i].string(), &page, &utf8_str, &box)) {
continue;
}
if (skip_blanks && utf8_str == " ") continue;
if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) continue;
if (target_page >= 0 && page != target_page) continue;
if (boxes != NULL) boxes->push_back(box);
if (texts != NULL) texts->push_back(utf8_str);

View File

@ -59,10 +59,10 @@ bool FontInfoTable::DeSerialize(bool swap, FILE* fp) {
// Returns true if the given set of fonts includes one with the same
// properties as font_id.
bool FontInfoTable::SetContainsFontProperties(
int font_id, const GenericVector<int>& font_set) const {
int font_id, const GenericVector<ScoredFont>& font_set) const {
uinT32 properties = get(font_id).properties;
for (int f = 0; f < font_set.size(); ++f) {
if (get(font_set[f]).properties == properties)
if (get(font_set[f].fontinfo_id).properties == properties)
return true;
}
return false;
@ -70,12 +70,12 @@ bool FontInfoTable::SetContainsFontProperties(
// Returns true if the given set of fonts includes multiple properties.
bool FontInfoTable::SetContainsMultipleFontProperties(
const GenericVector<int>& font_set) const {
const GenericVector<ScoredFont>& font_set) const {
if (font_set.empty()) return false;
int first_font = font_set[0];
int first_font = font_set[0].fontinfo_id;
uinT32 properties = get(first_font).properties;
for (int f = 1; f < font_set.size(); ++f) {
if (get(font_set[f]).properties != properties)
if (get(font_set[f].fontinfo_id).properties != properties)
return true;
}
return false;

View File

@ -31,6 +31,22 @@ namespace tesseract {
class BitVector;
// Simple struct to hold a font and a score. The scores come from the low-level
// integer matcher, so they are in the uinT16 range. Fonts are an index to
// fontinfo_table.
// These get copied around a lot, so best to keep them small.
struct ScoredFont {
// Default-constructs an invalid font (id -1) with a zero score.
ScoredFont() : fontinfo_id(-1), score(0) {}
// Wraps a font index and its raw classifier score.
ScoredFont(int font_id, uinT16 classifier_score)
: fontinfo_id(font_id), score(classifier_score) {}
// Index into fontinfo table, but inside the classifier, may be a shapetable
// index.
inT32 fontinfo_id;
// Raw score from the low-level classifier.
uinT16 score;
};
// Struct for information about spacing between characters in a particular font.
struct FontSpacingInfo {
inT16 x_gap_before;
@ -140,11 +156,11 @@ class FontInfoTable : public GenericVector<FontInfo> {
// Returns true if the given set of fonts includes one with the same
// properties as font_id.
bool SetContainsFontProperties(int font_id,
const GenericVector<int>& font_set) const;
bool SetContainsFontProperties(
int font_id, const GenericVector<ScoredFont>& font_set) const;
// Returns true if the given set of fonts includes multiple properties.
bool SetContainsMultipleFontProperties(
const GenericVector<int>& font_set) const;
const GenericVector<ScoredFont>& font_set) const;
// Moves any non-empty FontSpacingInfo entries from other to this.
void MoveSpacingInfoFrom(FontInfoTable* other);

View File

@ -51,6 +51,7 @@ void WordFeature::ComputeSize(const GenericVector<WordFeature>& features,
// Draws the features in the given window.
void WordFeature::Draw(const GenericVector<WordFeature>& features,
ScrollView* window) {
#ifndef GRAPHICS_DISABLED
for (int f = 0; f < features.size(); ++f) {
FCOORD pos(features[f].x_, features[f].y_);
FCOORD dir;
@ -61,6 +62,7 @@ void WordFeature::Draw(const GenericVector<WordFeature>& features,
window->DrawTo(IntCastRounded(pos.x() + dir.x()),
IntCastRounded(pos.y() + dir.y()));
}
#endif
}
// Writes to the given file. Returns false in case of error.
@ -244,6 +246,7 @@ int ImageData::MemoryUsed() const {
// Draws the data in a new window.
void ImageData::Display() const {
#ifndef GRAPHICS_DISABLED
const int kTextSize = 64;
// Draw the image.
Pix* pix = GetPix();
@ -274,6 +277,7 @@ void ImageData::Display() const {
win->Pen(ScrollView::GREEN);
win->Update();
window_wait(win);
#endif
}
// Adds the supplied boxes and transcriptions that correspond to the correct

View File

@ -487,7 +487,7 @@ void DENORM::XHeightRange(int unichar_id, const UNICHARSET& unicharset,
top > kBlnCellHeight - kBlnBaselineOffset / 2)
max_top += kBlnBaselineOffset;
top -= bln_yshift;
int height = top - kBlnBaselineOffset - bottom_shift;
int height = top - kBlnBaselineOffset;
double min_height = min_top - kBlnBaselineOffset - tolerance;
double max_height = max_top - kBlnBaselineOffset + tolerance;

View File

@ -86,6 +86,18 @@ void BLOCK::rotate(const FCOORD& rotation) {
box = *poly_block()->bounding_box();
}
// Returns the block's bounding box restricted to the requested
// combination of upper and lower noise/diacritic elements, as the union
// of the equally-restricted boxes of all rows.
TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
  TBOX result;
  // The row list is only read here; the iterator type requires non-const.
  ROW_IT row_it(const_cast<ROW_LIST*>(&rows));
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    result += row_it.data()->restricted_bounding_box(upper_dots, lower_dots);
  }
  return result;
}
/**
* BLOCK::reflect_polygon_in_y_axis
*

View File

@ -161,10 +161,14 @@ class BLOCK:public ELIST_LINK, public PDBLK
median_size_.set_y(y);
}
Pix* render_mask() {
return PDBLK::render_mask(re_rotation_);
Pix* render_mask(TBOX* mask_box) {
return PDBLK::render_mask(re_rotation_, mask_box);
}
// Returns the bounding box including the desired combination of upper and
// lower noise/diacritic elements.
TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
// Reflects the polygon in the y-axis and recomputes the bounding_box.
// Does nothing to any contained rows/words/blobs etc.
void reflect_polygon_in_y_axis();

View File

@ -80,6 +80,17 @@ ROW::ROW( //constructor
rmargin_ = 0;
}
// Returns the row's bounding box restricted to the requested combination
// of upper and lower noise/diacritic elements, as the union of the
// equally-restricted boxes of all words.
TBOX ROW::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
  TBOX result;
  // The word list is only read here; the iterator type requires non-const.
  WERD_IT word_it(const_cast<WERD_LIST*>(&words));
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    result += word_it.data()->restricted_bounding_box(upper_dots, lower_dots);
  }
  return result;
}
/**********************************************************************
* ROW::recalc_bounding_box

View File

@ -85,6 +85,9 @@ class ROW:public ELIST_LINK
TBOX bounding_box() const { //return bounding box
return bound_box;
}
// Returns the bounding box including the desired combination of upper and
// lower noise/diacritic elements.
TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
void set_lmargin(inT16 lmargin) {
lmargin_ = lmargin;

View File

@ -148,6 +148,7 @@ ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
add_next_word = false;
}
}
next_word->set_flag(W_FUZZY_NON, add_next_word);
} else {
add_next_word = next_word->flag(W_FUZZY_NON);
}
@ -206,12 +207,8 @@ WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
if (!wc_dest_it.empty()) {
wc_dest_it.move_to_first();
best_choice = wc_dest_it.data();
best_choice_fontinfo_ids = source.best_choice_fontinfo_ids;
} else {
best_choice = NULL;
if (!best_choice_fontinfo_ids.empty()) {
best_choice_fontinfo_ids.clear();
}
}
if (source.raw_choice != NULL) {
@ -252,6 +249,7 @@ void WERD_RES::CopySimpleFields(const WERD_RES& source) {
fontinfo_id2_count = source.fontinfo_id2_count;
x_height = source.x_height;
caps_height = source.caps_height;
baseline_shift = source.baseline_shift;
guessed_x_ht = source.guessed_x_ht;
guessed_caps_ht = source.guessed_caps_ht;
reject_spaces = source.reject_spaces;
@ -314,8 +312,8 @@ bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
float word_xheight = use_body_size && row != NULL && row->body_size() > 0.0f
? row->body_size() : x_height;
chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
word_xheight, numeric_mode, norm_mode_hint,
norm_box, &denorm);
word_xheight, baseline_shift, numeric_mode,
norm_mode_hint, norm_box, &denorm);
blob_row = row;
SetupBasicsFromChoppedWord(unicharset_in);
SetupBlamerBundle();
@ -366,6 +364,7 @@ void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
LogNewCookedChoice(1, false, word);
}
tess_failed = true;
done = true;
}
void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
@ -404,7 +403,8 @@ void WERD_RES::SetupBlobWidthsAndGaps() {
// as the blob widths and gaps.
void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
// Insert the seam into the SEAMS array.
insert_seam(chopped_word, blob_number, seam, &seam_array);
seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
seam_array.insert(seam, blob_number);
if (ratings != NULL) {
// Expand the ratings matrix.
ratings = ratings->ConsumeAndMakeBigger(blob_number);
@ -485,6 +485,9 @@ void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
// Prints a one-line summary of the current best choice: the accepted /
// adaptable / done flags, then the best choice itself (or a placeholder
// when no best choice has been set yet).
void WERD_RES::DebugTopChoice(const char* msg) const {
  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
          tess_accepted, tess_would_adapt, done);
  // Guard against printing through a NULL best_choice.
  if (best_choice == NULL)
    tprintf("<Null choice>\n");
  else
    best_choice->print(msg);
}
@ -801,12 +804,16 @@ void WERD_RES::RebuildBestState() {
for (int i = 0; i < best_choice->length(); ++i) {
int length = best_choice->state(i);
best_state.push_back(length);
if (length > 1)
join_pieces(seam_array, start, start + length - 1, chopped_word);
if (length > 1) {
SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
start + length - 1);
}
TBLOB* blob = chopped_word->blobs[start];
rebuild_word->blobs.push_back(new TBLOB(*blob));
if (length > 1)
break_pieces(seam_array, start, start + length - 1, chopped_word);
if (length > 1) {
SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
start + length - 1);
}
start += length;
}
}
@ -1062,8 +1069,7 @@ bool WERD_RES::PiecesAllNatural(int start, int count) const {
for (int index = start; index < start + count - 1; ++index) {
if (index >= 0 && index < seam_array.size()) {
SEAM* seam = seam_array[index];
if (seam != NULL && seam->split1 != NULL)
return false;
if (seam != NULL && seam->HasAnySplits()) return false;
}
}
return true;
@ -1093,6 +1099,7 @@ void WERD_RES::InitNonPointers() {
fontinfo_id2_count = 0;
x_height = 0.0;
caps_height = 0.0;
baseline_shift = 0.0f;
guessed_x_ht = TRUE;
guessed_caps_ht = TRUE;
combination = FALSE;
@ -1249,23 +1256,16 @@ int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
return 0;
}
// Inserts the new_word and a corresponding WERD_RES before the current
// position. The simple fields of the WERD_RES are copied from clone_res and
// the resulting WERD_RES is returned for further setup with best_choice etc.
// Inserts the new_word as a combination owned by a corresponding WERD_RES
// before the current position. The simple fields of the WERD_RES are copied
// from clone_res and the resulting WERD_RES is returned for further setup
// with best_choice etc.
WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
WERD* new_word) {
// Insert new_word into the ROW.
WERD_IT w_it(row()->row->word_list());
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD* word = w_it.data();
if (word == word_res->word)
break;
}
ASSERT_HOST(!w_it.cycled_list());
w_it.add_before_then_move(new_word);
// Make a WERD_RES for the new_word.
WERD_RES* new_res = new WERD_RES(new_word);
new_res->CopySimpleFields(clone_res);
new_res->combination = true;
// Insert into the appropriate place in the ROW_RES.
WERD_RES_IT wr_it(&row()->word_res_list);
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
@ -1315,6 +1315,10 @@ static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
// replaced with real blobs from the current word as much as possible.
void PAGE_RES_IT::ReplaceCurrentWord(
tesseract::PointerVector<WERD_RES>* words) {
if (words->empty()) {
DeleteCurrentWord();
return;
}
WERD_RES* input_word = word();
// Set the BOL/EOL flags on the words from the input word.
if (input_word->word->flag(W_BOL)) {
@ -1468,6 +1472,33 @@ void PAGE_RES_IT::DeleteCurrentWord() {
ResetWordIterator();
}
// Makes the current word a fuzzy space if not already fuzzy. Updates
// corresponding part of combo if required.
void PAGE_RES_IT::MakeCurrentWordFuzzy() {
  WERD* real_word = word_res->word;
  // Only act if the word carries neither fuzzy flag yet.
  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
    real_word->set_flag(W_FUZZY_SP, true);
    tprintf("Made word fuzzy at:");
    real_word->bounding_box().print();
    if (word_res->combination) {
      // The next word should be the corresponding part of combo, but we have
      // already stepped past it, so find it by search.
      WERD_RES_IT wr_it(&row()->word_res_list);
      // Walk the row's WERD_RES list until we re-find the current word_res.
      for (wr_it.mark_cycle_pt();
           !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
      }
      // The element right after the combination is its constituent part.
      wr_it.forward();
      ASSERT_HOST(wr_it.data()->part_of_combo);
      real_word = wr_it.data()->word;
      // The part must not already be fuzzy, or the combo flag would be set.
      ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
                  !real_word->flag(W_FUZZY_NON));
      real_word->set_flag(W_FUZZY_SP, true);
      tprintf("Made part of combo word fuzzy at:");
      real_word->bounding_box().print();
    }
  }
}
/*************************************************************************
* PAGE_RES_IT::restart_page
*
@ -1502,12 +1533,13 @@ void PAGE_RES_IT::ResetWordIterator() {
// Reset the member iterator so it can move forward and detect the
// cycled_list state correctly.
word_res_it.move_to_first();
word_res_it.mark_cycle_pt();
while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) {
if (prev_row_res == row_res)
prev_word_res = word_res;
for (word_res_it.mark_cycle_pt();
!word_res_it.cycled_list() && word_res_it.data() != next_word_res;
word_res_it.forward()) {
if (!word_res_it.data()->part_of_combo) {
if (prev_row_res == row_res) prev_word_res = word_res;
word_res = word_res_it.data();
word_res_it.forward();
}
}
ASSERT_HOST(!word_res_it.cycled_list());
word_res_it.forward();
@ -1515,11 +1547,12 @@ void PAGE_RES_IT::ResetWordIterator() {
// word_res_it is OK, but reset word_res and prev_word_res if needed.
WERD_RES_IT wr_it(&row_res->word_res_list);
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
if (prev_row_res == row_res)
prev_word_res = word_res;
if (!wr_it.data()->part_of_combo) {
if (prev_row_res == row_res) prev_word_res = word_res;
word_res = wr_it.data();
}
}
}
}
/*************************************************************************

View File

@ -294,6 +294,7 @@ class WERD_RES : public ELIST_LINK {
CRUNCH_MODE unlv_crunch_mode;
float x_height; // post match estimate
float caps_height; // post match estimate
float baseline_shift; // post match estimate.
/*
To deal with fuzzy spaces we need to be able to combine "words" to form
@ -314,8 +315,6 @@ class WERD_RES : public ELIST_LINK {
BOOL8 combination; //of two fuzzy gap wds
BOOL8 part_of_combo; //part of a combo
BOOL8 reject_spaces; //Reject spacing?
// FontInfo ids for each unichar in best_choice.
GenericVector<inT8> best_choice_fontinfo_ids;
WERD_RES() {
InitNonPointers();
@ -707,6 +706,10 @@ class PAGE_RES_IT {
// Deletes the current WERD_RES and its underlying WERD.
void DeleteCurrentWord();
// Makes the current word a fuzzy space if not already fuzzy. Updates
// corresponding part of combo if required.
void MakeCurrentWordFuzzy();
WERD_RES *forward() { // Get next word.
return internal_forward(false, false);
}
@ -746,9 +749,9 @@ class PAGE_RES_IT {
return next_block_res;
}
void rej_stat_word(); // for page/block/row
void ResetWordIterator();
private:
void ResetWordIterator();
WERD_RES *internal_forward(bool new_block, bool empty_ok);
WERD_RES * prev_word_res; // previous word

View File

@ -77,7 +77,6 @@ void PDBLK::set_sides( //set vertex lists
right_it.add_list_before (right);
}
/**********************************************************************
* PDBLK::contains
*
@ -126,7 +125,7 @@ void PDBLK::move( // reposition block
// Returns a binary Pix mask with a 1 pixel for every pixel within the
// block. Rotates the coordinate system by rerotation prior to rendering.
Pix* PDBLK::render_mask(const FCOORD& rerotation) {
Pix* PDBLK::render_mask(const FCOORD& rerotation, TBOX* mask_box) {
TBOX rotated_box(box);
rotated_box.rotate(rerotation);
Pix* pix = pixCreate(rotated_box.width(), rotated_box.height(), 1);
@ -163,6 +162,7 @@ Pix* PDBLK::render_mask(const FCOORD& rerotation) {
pixRasterop(pix, 0, 0, rotated_box.width(), rotated_box.height(),
PIX_SET, NULL, 0, 0);
}
if (mask_box != NULL) *mask_box = rotated_box;
return pix;
}

View File

@ -89,7 +89,9 @@ class PDBLK
// Returns a binary Pix mask with a 1 pixel for every pixel within the
// block. Rotates the coordinate system by rerotation prior to rendering.
Pix* render_mask(const FCOORD& rerotation);
// If not NULL, mask_box is filled with the position box of the returned
// mask image.
Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box);
#ifndef GRAPHICS_DISABLED
///draw histogram

View File

@ -90,8 +90,6 @@ static const char * const kPermuterTypeNames[] = {
BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
float src_rating, // rating
float src_cert, // certainty
inT16 src_fontinfo_id, // font
inT16 src_fontinfo_id2, // 2nd choice font
int src_script_id, // script
float min_xheight, // min xheight allowed
float max_xheight, // max xheight by this char
@ -100,8 +98,8 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
unichar_id_ = src_unichar_id;
rating_ = src_rating;
certainty_ = src_cert;
fontinfo_id_ = src_fontinfo_id;
fontinfo_id2_ = src_fontinfo_id2;
fontinfo_id_ = -1;
fontinfo_id2_ = -1;
script_id_ = src_script_id;
min_xheight_ = min_xheight;
max_xheight_ = max_xheight;
@ -126,6 +124,7 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
max_xheight_ = other.max_xheight_;
yshift_ = other.yshift();
classifier_ = other.classifier_;
fonts_ = other.fonts_;
}
// Returns true if *this and other agree on the baseline and x-height

View File

@ -24,6 +24,7 @@
#include "clst.h"
#include "elst.h"
#include "fontinfo.h"
#include "genericvector.h"
#include "matrix.h"
#include "unichar.h"
@ -64,8 +65,6 @@ class BLOB_CHOICE: public ELIST_LINK
BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
float src_rating, // rating
float src_cert, // certainty
inT16 src_fontinfo_id, // font
inT16 src_fontinfo_id2, // 2nd choice font
int script_id, // script
float min_xheight, // min xheight in image pixel units
float max_xheight, // max xheight allowed by this char
@ -89,6 +88,26 @@ class BLOB_CHOICE: public ELIST_LINK
inT16 fontinfo_id2() const {
return fontinfo_id2_;
}
const GenericVector<tesseract::ScoredFont>& fonts() const {
return fonts_;
}
  // Stores the given font list and recomputes fontinfo_id_/fontinfo_id2_
  // as the ids of the two highest-scoring entries (-1 when absent).
  void set_fonts(const GenericVector<tesseract::ScoredFont>& fonts) {
    fonts_ = fonts;
    // Best and second-best scores seen so far.
    int score1 = 0, score2 = 0;
    fontinfo_id_ = -1;
    fontinfo_id2_ = -1;
    for (int f = 0; f < fonts_.size(); ++f) {
      if (fonts_[f].score > score1) {
        // New best: demote the previous best to second place.
        score2 = score1;
        fontinfo_id2_ = fontinfo_id_;
        score1 = fonts_[f].score;
        fontinfo_id_ = fonts_[f].fontinfo_id;
      } else if (fonts_[f].score > score2) {
        score2 = fonts_[f].score;
        fontinfo_id2_ = fonts_[f].fontinfo_id;
      }
    }
  }
int script_id() const {
return script_id_;
}
@ -131,12 +150,6 @@ class BLOB_CHOICE: public ELIST_LINK
void set_certainty(float newrat) {
certainty_ = newrat;
}
void set_fontinfo_id(inT16 newfont) {
fontinfo_id_ = newfont;
}
void set_fontinfo_id2(inT16 newfont) {
fontinfo_id2_ = newfont;
}
void set_script(int newscript_id) {
script_id_ = newscript_id;
}
@ -186,6 +199,8 @@ class BLOB_CHOICE: public ELIST_LINK
private:
UNICHAR_ID unichar_id_; // unichar id
// Fonts and scores. Allowed to be empty.
GenericVector<tesseract::ScoredFont> fonts_;
inT16 fontinfo_id_; // char font information
inT16 fontinfo_id2_; // 2nd choice font information
// Rating is the classifier distance weighted by the length of the outline

View File

@ -27,114 +27,236 @@
----------------------------------------------------------------------*/
#include "seam.h"
#include "blobs.h"
#include "freelist.h"
#include "tprintf.h"
#ifdef __UNIX__
#include <assert.h>
#endif
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
#define NUM_STARTING_SEAMS 20
/*----------------------------------------------------------------------
Public Function Code
----------------------------------------------------------------------*/
/**
 * @name point_in_split
 *
 * Check to see if either of these points are present in the current
 * split.
 * @returns TRUE if one of them is.
 */
bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2) {
  // A NULL split contains no points. The original nested
  // `cond ? TRUE : FALSE` is redundant for a bool return.
  if (split == NULL) return FALSE;
  return exact_point(split->point1, point1) ||
         exact_point(split->point1, point2) ||
         exact_point(split->point2, point1) ||
         exact_point(split->point2, point2);
}
/**
 * @name point_in_seam
 *
 * Check whether either endpoint of the given split appears in any of
 * the three splits of the seam.
 * @returns TRUE if one of them is.
 */
bool point_in_seam(const SEAM *seam, SPLIT *split) {
  if (point_in_split(seam->split1, split->point1, split->point2)) return true;
  if (point_in_split(seam->split2, split->point1, split->point2)) return true;
  return point_in_split(seam->split3, split->point1, split->point2);
}
/**
 * @name point_used_by_split
 *
 * Return whether this particular EDGEPT * is one of the two endpoints
 * of the given split (pointer identity, not coordinates).
 * @returns TRUE if the edgept is used by the split.
 */
bool point_used_by_split(SPLIT *split, EDGEPT *point) {
  return split != NULL && (split->point1 == point || split->point2 == point);
}
/**
 * @name point_used_by_seam
 *
 * Return whether this particular EDGEPT * is used by any of the three
 * splits of the given seam.
 * @returns TRUE if the edgept is used by the seam.
 */
bool point_used_by_seam(SEAM *seam, EDGEPT *point) {
  if (seam == NULL) return false;
  if (point_used_by_split(seam->split1, point)) return true;
  if (point_used_by_split(seam->split2, point)) return true;
  return point_used_by_split(seam->split3, point);
}
/**
* @name combine_seam
*
* Combine two seam records into a single seam. Move the split
* references from the second seam to the first one. The argument
* convention is patterned after strcpy.
*/
void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
dest_seam->priority += source_seam->priority;
dest_seam->location += source_seam->location;
dest_seam->location /= 2;
if (source_seam->split1) {
if (!dest_seam->split1)
dest_seam->split1 = source_seam->split1;
else if (!dest_seam->split2)
dest_seam->split2 = source_seam->split1;
else if (!dest_seam->split3)
dest_seam->split3 = source_seam->split1;
else
delete source_seam->split1; // Wouldn't have fitted.
source_seam->split1 = NULL;
// Returns the bounding box of all the points in the seam: the seam's
// location plus the boxes of every split.
// NOTE(review): the original span contained interleaved remnants of the
// deleted combine_seams() (references to source_seam) that would not
// compile; they are removed here.
TBOX SEAM::bounding_box() const {
  // Start with a degenerate box at the seam location.
  TBOX box(location_.x, location_.y, location_.x, location_.y);
  for (int s = 0; s < num_splits_; ++s) {
    box += splits_[s].bounding_box();
  }
  return box;
}
// Returns true if other can be combined into *this: the seams are
// horizontally close (within max_x_dist), the total number of splits fits,
// the combined priority stays below max_total_priority, and the splits
// neither overlap nor share a position.
// NOTE(review): the original span contained interleaved remnants of the
// deleted combine_seams() that would not compile; they are removed, and
// the `if (cond) return true; else return false;` form is simplified.
bool SEAM::CombineableWith(const SEAM& other, int max_x_dist,
                           float max_total_priority) const {
  int dist = location_.x - other.location_.x;
  return -max_x_dist < dist && dist < max_x_dist &&
         num_splits_ + other.num_splits_ <= kMaxNumSplits &&
         priority_ + other.priority_ < max_total_priority &&
         !OverlappingSplits(other) && !SharesPosition(other);
}
// Combines other into *this. Only works if CombinableWith returned true.
// Priorities add; the location becomes the midpoint of the two seams;
// other's splits are appended while capacity remains.
void SEAM::CombineWith(const SEAM& other) {
  priority_ += other.priority_;
  location_ += other.location_;
  location_ /= 2;
  int src = 0;
  while (src < other.num_splits_ && num_splits_ < kMaxNumSplits) {
    splits_[num_splits_] = other.splits_[src];
    ++num_splits_;
    ++src;
  }
}
// Returns true if the splits in *this SEAM appear OK in the sense that they
// do not cross any outlines and do not chop off any ridiculously small
// pieces.
bool SEAM::IsHealthy(const TBLOB& blob, int min_points, int min_area) const {
  // TODO(rays) Try testing all the splits. Duplicating original code for now,
  // which tested only the first.
  if (num_splits_ == 0) return true;
  return splits_[0].IsHealthy(blob, min_points, min_area);
}
// Computes the widthp_/widthn_ range for all existing SEAMs and for *this
// seam, which is about to be inserted at insert_index. Returns false if
// any of the computations fails, as this indicates an invalid chop.
// widthn_/widthp_ are only changed if modify is true.
// NOTE(review): a stray `delete source_seam;` line (left over from the
// deleted combine_seams() in the merge) referenced an undeclared name and
// is removed here.
bool SEAM::PrepareToInsertSeam(const GenericVector<SEAM*>& seams,
                               const GenericVector<TBLOB*>& blobs,
                               int insert_index, bool modify) {
  // Seams before the insertion point keep their current indices.
  for (int s = 0; s < insert_index; ++s) {
    if (!seams[s]->FindBlobWidth(blobs, s, modify)) return false;
  }
  // The new seam will occupy insert_index itself.
  if (!FindBlobWidth(blobs, insert_index, modify)) return false;
  // Seams at or after the insertion point shift one slot to the right.
  for (int s = insert_index; s < seams.size(); ++s) {
    if (!seams[s]->FindBlobWidth(blobs, s + 1, modify)) return false;
  }
  return true;
}
// Computes the widthp_/widthn_ range. Returns false if not all the splits
// are accounted for. widthn_/widthp_ are only changed if modify is true.
// For each split, searches the blob at index, then right, then left, for a
// blob that contains it; widthp_/widthn_ record the furthest distance at
// which any split was found to the right/left respectively.
bool SEAM::FindBlobWidth(const GenericVector<TBLOB*>& blobs, int index,
                         bool modify) {
  // Count of splits that were located in some blob.
  int num_found = 0;
  if (modify) {
    widthp_ = 0;
    widthn_ = 0;
  }
  for (int s = 0; s < num_splits_; ++s) {
    const SPLIT& split = splits_[s];
    // First try the blob at the seam's own index.
    bool found_split = split.ContainedByBlob(*blobs[index]);
    // Look right.
    for (int b = index + 1; !found_split && b < blobs.size(); ++b) {
      found_split = split.ContainedByBlob(*blobs[b]);
      if (found_split && b - index > widthp_ && modify) widthp_ = b - index;
    }
    // Look left.
    for (int b = index - 1; !found_split && b >= 0; --b) {
      found_split = split.ContainedByBlob(*blobs[b]);
      if (found_split && index - b > widthn_ && modify) widthn_ = index - b;
    }
    if (found_split) ++num_found;
  }
  // Valid only if every split was located.
  return num_found == num_splits_;
}
// Splits this blob into two blobs by applying the splits included in
// *this SEAM. blob keeps one side; other_blob receives the other, as
// partitioned by divide_blobs at the seam location.
void SEAM::ApplySeam(bool italic_blob, TBLOB* blob, TBLOB* other_blob) const {
  // Cut every outline crossed by a split.
  for (int s = 0; s < num_splits_; ++s) {
    splits_[s].SplitOutlineList(blob->outlines);
  }
  blob->ComputeBoundingBoxes();
  divide_blobs(blob, other_blob, italic_blob, location_);
  blob->EliminateDuplicateOutlines();
  other_blob->EliminateDuplicateOutlines();
  blob->CorrectBlobOrder(other_blob);
}
// Undoes ApplySeam by removing the seam between these two blobs.
// Produces one blob as a result, and deletes other_blob.
// Ownership: other_blob's outlines are transferred to blob before
// other_blob is destroyed.
void SEAM::UndoSeam(TBLOB* blob, TBLOB* other_blob) const {
  if (blob->outlines == NULL) {
    // blob has no outlines at all: just take other_blob's list.
    blob->outlines = other_blob->outlines;
    other_blob->outlines = NULL;
  }
  // Append other_blob's outline chain to the end of blob's chain.
  TESSLINE* outline = blob->outlines;
  while (outline->next) outline = outline->next;
  outline->next = other_blob->outlines;
  other_blob->outlines = NULL;
  delete other_blob;
  // Rejoin the outlines that the splits had cut.
  for (int s = 0; s < num_splits_; ++s) {
    splits_[s].UnsplitOutlineList(blob);
  }
  blob->ComputeBoundingBoxes();
  blob->EliminateDuplicateOutlines();
}
// Prints everything in *this SEAM: label, priority, location, width range,
// and each split.
void SEAM::Print(const char* label) const {
  // Print the label through "%s" so a '%' in the label text cannot be
  // misinterpreted as a format directive (format-string safety).
  tprintf("%s", label);
  tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ", priority_, location_.x, location_.y,
          widthp_, widthn_);
  for (int s = 0; s < num_splits_; ++s) {
    splits_[s].Print();
    if (s + 1 < num_splits_) tprintf(", ");
  }
  tprintf("\n");
}
// Prints a collection of SEAMs, one per line, prefixed by its index.
// Prints nothing at all for an empty collection.
/* static */
void SEAM::PrintSeams(const char* label, const GenericVector<SEAM*>& seams) {
  if (seams.empty()) return;
  tprintf("%s\n", label);
  for (int i = 0; i < seams.size(); ++i) {
    tprintf("%2d: ", i);
    seams[i]->Print("");
  }
  tprintf("\n");
}
#ifndef GRAPHICS_DISABLED
// Draws the seam in the given window by marking each of its splits.
void SEAM::Mark(ScrollView* window) const {
  for (int i = 0; i < num_splits_; ++i) {
    splits_[i].Mark(window);
  }
}
#endif
// Break up the blobs in this chain so that they are all independent.
// This operation should undo the effect of JoinPieces: the seams in
// [first, last) are revealed and the shared outline chain is severed at
// each blob boundary.
/* static */
void SEAM::BreakPieces(const GenericVector<SEAM*>& seams,
                       const GenericVector<TBLOB*>& blobs, int first,
                       int last) {
  // Un-hide all the seams in the range first.
  for (int x = first; x < last; ++x) seams[x]->Reveal();
  TESSLINE* outline = blobs[first]->outlines;
  int next_blob = first + 1;
  // Walk the chained outline list, cutting the link wherever the next
  // element is the head of the next blob's outline list.
  while (outline != NULL && next_blob <= last) {
    if (outline->next == blobs[next_blob]->outlines) {
      outline->next = NULL;
      outline = blobs[next_blob]->outlines;
      ++next_blob;
    } else {
      outline = outline->next;
    }
  }
}
// Join a group of base level pieces into a single blob that can then
// be classified. Seams fully inside [first, last] are hidden, and the
// outline lists of blobs first..last are chained together.
/* static */
void SEAM::JoinPieces(const GenericVector<SEAM*>& seams,
                      const GenericVector<TBLOB*>& blobs, int first, int last) {
  TESSLINE* outline = blobs[first]->outlines;
  if (!outline)
    return;
  for (int x = first; x < last; ++x) {
    SEAM *seam = seams[x];
    // Hide only seams whose full width range is inside the joined span.
    if (x - seam->widthn_ >= first && x + seam->widthp_ < last) seam->Hide();
    // Append the next blob's outlines to the end of the chain.
    while (outline->next) outline = outline->next;
    outline->next = blobs[x + 1]->outlines;
  }
}
// Hides the seam so the outlines appear not to be cut by it.
void SEAM::Hide() const {
  for (int i = 0; i < num_splits_; ++i) splits_[i].Hide();
}
// Undoes Hide, so the outlines are cut by the seam again.
void SEAM::Reveal() const {
  for (int i = 0; i < num_splits_; ++i) splits_[i].Reveal();
}
// Computes and returns, but does not set, the full priority of *this SEAM:
// the base priority_ plus the first split's FullPriority evaluated with
// the remaining splits temporarily applied.
float SEAM::FullPriority(int xmin, int xmax, double overlap_knob,
                         int centered_maxwidth, double center_knob,
                         double width_change_knob) const {
  if (num_splits_ == 0) return 0.0f;
  // Temporarily apply splits 1..n-1 so split 0's priority is evaluated in
  // the context of the full seam.
  for (int s = 1; s < num_splits_; ++s) {
    splits_[s].SplitOutline();
  }
  float full_priority =
      priority_ +
      splits_[0].FullPriority(xmin, xmax, overlap_knob, centered_maxwidth,
                              center_knob, width_change_knob);
  // Undo in reverse order to restore the outlines exactly.
  for (int s = num_splits_ - 1; s >= 1; --s) {
    splits_[s].UnsplitOutlines();
  }
  return full_priority;
}
/**
@ -144,7 +266,7 @@ void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
* present in the starting segmentation. Each of the seams created
* by this routine have location information only.
*/
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array) {
void start_seam_list(TWERD* word, GenericVector<SEAM*>* seam_array) {
seam_array->truncate(0);
TPOINT location;
@ -153,381 +275,6 @@ void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array) {
TBOX nbox = word->blobs[b]->bounding_box();
location.x = (bbox.right() + nbox.left()) / 2;
location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4;
seam_array->push_back(new SEAM(0.0f, location, NULL, NULL, NULL));
}
}
/**
 * @name test_insert_seam
 *
 * Dry-run of insert_seam: checks that every existing seam whose width
 * range would need recomputation can still account for all its splits
 * after a seam is inserted at index.
 * @returns true if insert_seam will succeed.
 */
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
                      TWERD *word, int index) {
  SEAM *test_seam;
  int list_length = seam_array.size();
  // Check seams to the left whose right width would need recomputing.
  for (int test_index = 0; test_index < index; ++test_index) {
    test_seam = seam_array[test_index];
    // NOTE(review): the first condition is implied by the second
    // (widthp + test_index == index - 1 forces it) — apparently redundant.
    if (test_index + test_seam->widthp < index &&
        test_seam->widthp + test_index == index - 1 &&
        account_splits(test_seam, word, test_index + 1, 1) < 0)
      return false;
  }
  // Check seams to the right whose left width would need recomputing.
  for (int test_index = index; test_index < list_length; test_index++) {
    test_seam = seam_array[test_index];
    if (test_index - test_seam->widthn >= index &&
        test_index - test_seam->widthn == index &&
        account_splits(test_seam, word, test_index + 1, -1) < 0)
      return false;
  }
  return true;
}
/**
 * @name insert_seam
 *
 * Add another seam to a collection of seams at a particular location
 * in the seam array, updating the widthp/widthn ranges of the existing
 * seams that the insertion displaces.
 * Ownership: seam_array takes ownership of seam.
 */
void insert_seam(const TWERD* word, int index, SEAM *seam,
                 GenericVector<SEAM*>* seam_array) {
  SEAM *test_seam;
  int list_length = seam_array->size();
  // Fix up seams to the left of the insertion point.
  for (int test_index = 0; test_index < index; ++test_index) {
    test_seam = seam_array->get(test_index);
    if (test_index + test_seam->widthp >= index) {
      test_seam->widthp++;       /*got in the way */
    } else if (test_seam->widthp + test_index == index - 1) {
      // Right edge of this seam's range lands exactly at the insertion
      // point: recompute it by searching rightwards.
      test_seam->widthp = account_splits(test_seam, word, test_index + 1, 1);
      if (test_seam->widthp < 0) {
        tprintf("Failed to find any right blob for a split!\n");
        print_seam("New dud seam", seam);
        print_seam("Failed seam", test_seam);
      }
    }
  }
  // Fix up seams at or to the right of the insertion point.
  for (int test_index = index; test_index < list_length; test_index++) {
    test_seam = seam_array->get(test_index);
    if (test_index - test_seam->widthn < index) {
      test_seam->widthn++;       /*got in the way */
    } else if (test_index - test_seam->widthn == index) {
      // Left edge lands exactly at the insertion point: recompute by
      // searching leftwards.
      test_seam->widthn = account_splits(test_seam, word, test_index + 1, -1);
      if (test_seam->widthn < 0) {
        tprintf("Failed to find any left blob for a split!\n");
        print_seam("New dud seam", seam);
        print_seam("Failed seam", test_seam);
      }
    }
  }
  seam_array->insert(seam, index);
}
/**
 * @name account_splits
 *
 * Account for all the splits by looking to the right (blob_direction == 1),
 * or to the left (blob_direction == -1) in the word.
 * @returns the distance (in blobs) at which the last split was found, 0 if
 * the seam has no splits, or -1 if any split could not be located.
 */
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
                   int blob_direction) {
  inT8 found_em[3];
  inT8 width;
  // A NULL split counts as already found.
  found_em[0] = seam->split1 == NULL;
  found_em[1] = seam->split2 == NULL;
  found_em[2] = seam->split3 == NULL;
  if (found_em[0] && found_em[1] && found_em[2])
    return 0;
  width = 0;
  // Scan blobs in the given direction until every split is located or we
  // run off the end of the word.
  do {
    TBLOB* blob = word->blobs[blob_index];
    if (!found_em[0])
      found_em[0] = find_split_in_blob(seam->split1, blob);
    if (!found_em[1])
      found_em[1] = find_split_in_blob(seam->split2, blob);
    if (!found_em[2])
      found_em[2] = find_split_in_blob(seam->split3, blob);
    if (found_em[0] && found_em[1] && found_em[2]) {
      return width;
    }
    width++;
    blob_index += blob_direction;
  } while (0 <= blob_index && blob_index < word->NumBlobs());
  return -1;
}
/**
 * @name find_split_in_blob
 *
 * Determine whether both endpoints of the split fall inside some outline
 * of this blob (each endpoint may be in a different outline).
 * @returns TRUE if the split is somewhere in this blob.
 */
bool find_split_in_blob(SPLIT *split, TBLOB *blob) {
  bool point1_found = false;
  for (TESSLINE *outline = blob->outlines; outline != NULL;
       outline = outline->next) {
    if (outline->Contains(split->point1->pos)) {
      point1_found = true;
      break;
    }
  }
  if (!point1_found)
    return FALSE;
  for (TESSLINE *outline = blob->outlines; outline != NULL;
       outline = outline->next) {
    if (outline->Contains(split->point2->pos))
      return TRUE;
  }
  return FALSE;
}
/**
 * @name join_two_seams
 *
 * Merge these two seams into a new seam. Duplicate the split records
 * in both of the input seams. Return the resultant seam, or NULL when
 * the seams cannot be combined (too many splits or shared points).
 * Ownership: the caller owns the returned SEAM; the inputs are not
 * modified. The temporary copy of seam2 is consumed by combine_seams.
 */
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2) {
  SEAM *result = NULL;
  SEAM *temp;
  assert(seam1 &&seam2);
  // Combine only when the merged seam fits in three splits and the two
  // seams share no split points.
  if (((seam1->split3 == NULL && seam2->split2 == NULL) ||
       (seam1->split2 == NULL && seam2->split3 == NULL) ||
       seam1->split1 == NULL || seam2->split1 == NULL) &&
      (!shared_split_points(seam1, seam2))) {
    result = new SEAM(*seam1);
    temp = new SEAM(*seam2);
    combine_seams(result, temp);
  }
  return (result);
}
/**
 * @name print_seam
 *
 * Print a list of splits. Show the coordinates of both points in
 * each split.
 */
void print_seam(const char *label, SEAM *seam) {
  if (seam) {
    // Print the label through "%s" so a '%' in the label text cannot be
    // misinterpreted as a format directive (format-string safety).
    tprintf("%s", label);
    tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ",
            seam->priority, seam->location.x, seam->location.y,
            seam->widthp, seam->widthn);
    print_split(seam->split1);
    if (seam->split2) {
      tprintf(", ");
      print_split (seam->split2);
      if (seam->split3) {
        tprintf(", ");
        print_split (seam->split3);
      }
    }
    tprintf("\n");
  }
}
/**
 * @name print_seams
 *
 * Print a list of seams, one per line, prefixed by its index.
 */
void print_seams(const char *label, const GenericVector<SEAM*>& seams) {
  char number[CHARS_PER_LINE];
  if (!seams.empty()) {
    tprintf("%s\n", label);
    for (int x = 0; x < seams.size(); ++x) {
      // snprintf bounds the write to the fixed-size buffer (sprintf does
      // not); the formatted prefix is always NUL-terminated.
      snprintf(number, sizeof(number), "%2d: ", x);
      print_seam(number, seams[x]);
    }
    tprintf("\n");
  }
}
/**
* @name shared_split_points
*
* Check these two seams to make sure that neither of them have two
* points in common. Return TRUE if any of the same points are present
* in any of the splits of both seams.
*/
int shared_split_points(const SEAM *seam1, const SEAM *seam2) {
if (seam1 == NULL || seam2 == NULL)
return (FALSE);
if (seam2->split1 == NULL)
return (FALSE);
if (point_in_seam(seam1, seam2->split1))
return (TRUE);
if (seam2->split2 == NULL)
return (FALSE);
if (point_in_seam(seam1, seam2->split2))
return (TRUE);
if (seam2->split3 == NULL)
return (FALSE);
if (point_in_seam(seam1, seam2->split3))
return (TRUE);
return (FALSE);
}
/**********************************************************************
 * break_pieces
 *
 * Break up the blobs in this chain so that they are all independent.
 * This operation should undo the effect of join_pieces: the seams in
 * [first, last) are revealed and the shared outline chain is severed
 * at each blob boundary.
 **********************************************************************/
void break_pieces(const GenericVector<SEAM*>& seams, int first, int last,
                  TWERD *word) {
  // Un-hide all the seams in the range first.
  for (int x = first; x < last; ++x)
    reveal_seam(seams[x]);
  TESSLINE *outline = word->blobs[first]->outlines;
  int next_blob = first + 1;
  // Walk the chained outline list, cutting the link wherever the next
  // element is the head of the next blob's outline list.
  while (outline != NULL && next_blob <= last) {
    if (outline->next == word->blobs[next_blob]->outlines) {
      outline->next = NULL;
      outline = word->blobs[next_blob]->outlines;
      ++next_blob;
    } else {
      outline = outline->next;
    }
  }
}
/**********************************************************************
 * join_pieces
 *
 * Join a group of base level pieces into a single blob that can then
 * be classified. Seams fully inside [first, last] are hidden, and the
 * outline lists of blobs first..last are chained together.
 **********************************************************************/
void join_pieces(const GenericVector<SEAM*>& seams, int first, int last,
                 TWERD *word) {
  TESSLINE *outline = word->blobs[first]->outlines;
  if (!outline)
    return;
  for (int x = first; x < last; ++x) {
    SEAM *seam = seams[x];
    // Hide only seams whose full width range is inside the joined span.
    if (x - seam->widthn >= first && x + seam->widthp < last)
      hide_seam(seam);
    // Append the next blob's outlines to the end of the chain.
    while (outline->next)
      outline = outline->next;
    outline->next = word->blobs[x + 1]->outlines;
  }
}
/**********************************************************************
 * hide_seam
 *
 * Change the edge points that are referenced by this seam to make
 * them hidden edges.
 **********************************************************************/
void hide_seam(SEAM *seam) {
  if (seam == NULL)
    return;
  // Splits are filled in order, so stop at the first NULL one.
  SPLIT *parts[3] = {seam->split1, seam->split2, seam->split3};
  for (int i = 0; i < 3; ++i) {
    if (parts[i] == NULL)
      return;
    hide_edge_pair(parts[i]->point1, parts[i]->point2);
  }
}
/**********************************************************************
 * hide_edge_pair
 *
 * Hide the runs of edge points between pt1 and pt2, walking forward
 * from pt1 until pt2 is reached and forward from pt2 until pt1 is
 * reached (both arcs of the outline between the two split points).
 **********************************************************************/
void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
  EDGEPT *edgept;
  edgept = pt1;
  do {
    edgept->Hide();
    edgept = edgept->next;
  }
  while (!exact_point (edgept, pt2) && edgept != pt1);
  // Wrapping back to pt1 means pt2 was never reached: the whole outline
  // was hidden.
  if (edgept == pt1) {
    /* tprintf("Hid entire outline at (%d,%d)!!\n",
       edgept->pos.x,edgept->pos.y); */
  }
  edgept = pt2;
  do {
    edgept->Hide();
    edgept = edgept->next;
  }
  while (!exact_point (edgept, pt1) && edgept != pt2);
  if (edgept == pt2) {
    /* tprintf("Hid entire outline at (%d,%d)!!\n",
       edgept->pos.x,edgept->pos.y); */
  }
}
/**********************************************************************
 * reveal_seam
 *
 * Change the edge points that are referenced by this seam to make
 * them visible edges again (undoes hide_seam).
 **********************************************************************/
void reveal_seam(SEAM *seam) {
  if (seam == NULL)
    return;
  // Splits are filled in order, so stop at the first NULL one.
  SPLIT *parts[3] = {seam->split1, seam->split2, seam->split3};
  for (int i = 0; i < 3; ++i) {
    if (parts[i] == NULL)
      return;
    reveal_edge_pair(parts[i]->point1, parts[i]->point2);
  }
}
/**********************************************************************
 * reveal_edge_pair
 *
 * Reveal the runs of edge points between pt1 and pt2, walking forward
 * from pt1 until pt2 is reached and forward from pt2 until pt1 is
 * reached (both arcs of the outline between the two split points).
 * NOTE(review): a stray `seam_array->push_back(new SEAM(0.0f, location));`
 * line (left over from the rewritten start_seam_list in the merge)
 * referenced undeclared names and is removed here.
 **********************************************************************/
void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
  EDGEPT *edgept;
  edgept = pt1;
  do {
    edgept->Reveal();
    edgept = edgept->next;
  }
  while (!exact_point (edgept, pt2) && edgept != pt1);
  // Wrapping back to pt1 means pt2 was never reached: the whole outline
  // was revealed.
  if (edgept == pt1) {
    /* tprintf("Hid entire outline at (%d,%d)!!\n",
       edgept->pos.x,edgept->pos.y); */
  }
  edgept = pt2;
  do {
    edgept->Reveal();
    edgept = edgept->next;
  }
  while (!exact_point (edgept, pt1) && edgept != pt2);
  if (edgept == pt2) {
    /* tprintf("Hid entire outline at (%d,%d)!!\n",
       edgept->pos.x,edgept->pos.y); */
  }
}

View File

@ -36,95 +36,163 @@
----------------------------------------------------------------------*/
typedef float PRIORITY; /* PRIORITY */
struct SEAM {
// Constructor that was formerly new_seam.
SEAM(PRIORITY priority0, const TPOINT& location0,
SPLIT *splita, SPLIT *splitb, SPLIT *splitc)
: priority(priority0), widthp(0), widthn(0), location(location0),
split1(splita), split2(splitb), split3(splitc) {}
// Copy constructor that was formerly clone_seam.
SEAM(const SEAM& src)
: priority(src.priority), widthp(src.widthp), widthn(src.widthn),
location(src.location) {
clone_split(split1, src.split1);
clone_split(split2, src.split2);
clone_split(split3, src.split3);
class SEAM {
public:
// A seam with no splits
SEAM(float priority, const TPOINT& location)
: priority_(priority),
location_(location),
widthp_(0),
widthn_(0),
num_splits_(0) {}
// A seam with a single split point.
SEAM(float priority, const TPOINT& location, const SPLIT& split)
: priority_(priority),
location_(location),
widthp_(0),
widthn_(0),
num_splits_(1) {
splits_[0] = split;
}
// Destructor was delete_seam.
~SEAM() {
if (split1)
delete_split(split1);
if (split2)
delete_split(split2);
if (split3)
delete_split(split3);
// Default copy constructor, operator= and destructor are OK!
// Accessors.
float priority() const { return priority_; }
void set_priority(float priority) { priority_ = priority; }
bool HasAnySplits() const { return num_splits_ > 0; }
// Returns the bounding box of all the points in the seam.
TBOX bounding_box() const;
// Returns true if other can be combined into *this.
bool CombineableWith(const SEAM& other, int max_x_dist,
float max_total_priority) const;
// Combines other into *this. Only works if CombinableWith returned true.
void CombineWith(const SEAM& other);
// Returns true if the given blob contains all splits of *this SEAM.
bool ContainedByBlob(const TBLOB& blob) const {
for (int s = 0; s < num_splits_; ++s) {
if (!splits_[s].ContainedByBlob(blob)) return false;
}
return true;
}
PRIORITY priority;
inT8 widthp;
inT8 widthn;
TPOINT location;
SPLIT *split1;
SPLIT *split2;
SPLIT *split3;
// Returns true if the given EDGEPT is used by this SEAM, checking only
// the EDGEPT pointer, not the coordinates.
bool UsesPoint(const EDGEPT* point) const {
for (int s = 0; s < num_splits_; ++s) {
if (splits_[s].UsesPoint(point)) return true;
}
return false;
}
// Returns true if *this and other share any common point, by coordinates.
bool SharesPosition(const SEAM& other) const {
for (int s = 0; s < num_splits_; ++s) {
for (int t = 0; t < other.num_splits_; ++t)
if (splits_[s].SharesPosition(other.splits_[t])) return true;
}
return false;
}
// Returns true if *this and other have any vertically overlapping splits.
bool OverlappingSplits(const SEAM& other) const {
for (int s = 0; s < num_splits_; ++s) {
TBOX split1_box = splits_[s].bounding_box();
for (int t = 0; t < other.num_splits_; ++t) {
TBOX split2_box = other.splits_[t].bounding_box();
if (split1_box.y_overlap(split2_box)) return true;
}
}
return false;
}
// Marks the edgepts used by the seam so the segments made by the cut
// never get split further by another seam in the future.
void Finalize() {
for (int s = 0; s < num_splits_; ++s) {
splits_[s].point1->MarkChop();
splits_[s].point2->MarkChop();
}
}
// Returns true if the splits in *this SEAM appear OK in the sense that they
// do not cross any outlines and do not chop off any ridiculously small
// pieces.
bool IsHealthy(const TBLOB& blob, int min_points, int min_area) const;
// Computes the widthp_/widthn_ range for all existing SEAMs and for *this
// seam, which is about to be inserted at insert_index. Returns false if
// any of the computations fails, as this indicates an invalid chop.
// widthn_/widthp_ are only changed if modify is true.
bool PrepareToInsertSeam(const GenericVector<SEAM*>& seams,
const GenericVector<TBLOB*>& blobs, int insert_index,
bool modify);
// Computes the widthp_/widthn_ range. Returns false if not all the splits
// are accounted for. widthn_/widthp_ are only changed if modify is true.
bool FindBlobWidth(const GenericVector<TBLOB*>& blobs, int index,
bool modify);
// Splits this blob into two blobs by applying the splits included in
// *this SEAM
void ApplySeam(bool italic_blob, TBLOB* blob, TBLOB* other_blob) const;
// Undoes ApplySeam by removing the seam between these two blobs.
// Produces one blob as a result, and deletes other_blob.
void UndoSeam(TBLOB* blob, TBLOB* other_blob) const;
// Prints everything in *this SEAM.
void Print(const char* label) const;
// Prints a collection of SEAMs.
static void PrintSeams(const char* label, const GenericVector<SEAM*>& seams);
#ifndef GRAPHICS_DISABLED
// Draws the seam in the given window.
void Mark(ScrollView* window) const;
#endif
// Break up the blobs in this chain so that they are all independent.
// This operation should undo the affect of join_pieces.
static void BreakPieces(const GenericVector<SEAM*>& seams,
const GenericVector<TBLOB*>& blobs, int first,
int last);
// Join a group of base level pieces into a single blob that can then
// be classified.
static void JoinPieces(const GenericVector<SEAM*>& seams,
const GenericVector<TBLOB*>& blobs, int first,
int last);
// Hides the seam so the outlines appear not to be cut by it.
void Hide() const;
// Undoes hide, so the outlines are cut by the seam.
void Reveal() const;
// Computes and returns, but does not set, the full priority of *this SEAM.
// The arguments here are config parameters defined in Wordrec. Add chop_
// to the beginning of the name.
float FullPriority(int xmin, int xmax, double overlap_knob,
int centered_maxwidth, double center_knob,
double width_change_knob) const;
private:
// Maximum number of splits that a SEAM can hold.
static const int kMaxNumSplits = 3;
// Priority of this split. Lower is better.
float priority_;
// Position of the middle of the seam.
TPOINT location_;
// A range such that all splits in *this SEAM are contained within blobs in
// the range [index - widthn_,index + widthp_] where index is the index of
// this SEAM in the seams vector.
inT8 widthp_;
inT8 widthn_;
// Number of splits_ that are used.
inT8 num_splits_;
// Set of pairs of points that are the ends of each split in the SEAM.
SPLIT splits_[kMaxNumSplits];
};
/**
 * exact_point
 *
 * Return TRUE if the point positions are exactly the same. The
 * parameters must be of type (EDGEPT*). Arguments are fully
 * parenthesized so expressions such as conditionals expand safely
 * (CERT PRE01-C); each argument is still evaluated once per field
 * access, exactly as before.
 */
#define exact_point(p1, p2) \
  ((p1)->pos.x == (p2)->pos.x && (p1)->pos.y == (p2)->pos.y)
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2);
bool point_in_seam(const SEAM *seam, SPLIT *split);
bool point_used_by_split(SPLIT *split, EDGEPT *point);
bool point_used_by_seam(SEAM *seam, EDGEPT *point);
void combine_seams(SEAM *dest_seam, SEAM *source_seam);
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array);
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
TWERD *word, int index);
void insert_seam(const TWERD *word, int index, SEAM *seam,
GenericVector<SEAM*>* seam_array);
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
int blob_direction);
bool find_split_in_blob(SPLIT *split, TBLOB *blob);
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2);
void print_seam(const char *label, SEAM *seam);
void print_seams(const char *label, const GenericVector<SEAM*>& seams);
int shared_split_points(const SEAM *seam1, const SEAM *seam2);
void break_pieces(const GenericVector<SEAM*>& seams,
int first, int last, TWERD *word);
void join_pieces(const GenericVector<SEAM*>& seams,
int first, int last, TWERD *word);
void hide_seam(SEAM *seam);
void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2);
void reveal_seam(SEAM *seam);
void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2);
void start_seam_list(TWERD* word, GenericVector<SEAM*>* seam_array);
#endif

View File

@ -36,23 +36,103 @@
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
// Limit on the amount of penalty for the chop being off-center.
const int kCenterGradeCap = 25;
// Ridiculously large priority for splits that are no use.
const double kBadPriority = 999.0;
BOOL_VAR(wordrec_display_splits, 0, "Display splits");
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
/**********************************************************************
* delete_split
*
* Remove this split from existence.
**********************************************************************/
void delete_split(SPLIT *split) {
if (split) {
delete split;
}
// Returns the bounding box enclosing both end points of the split.
TBOX SPLIT::bounding_box() const {
  int left = MIN(point1->pos.x, point2->pos.x);
  int bottom = MIN(point1->pos.y, point2->pos.y);
  int right = MAX(point1->pos.x, point2->pos.x);
  int top = MAX(point1->pos.y, point2->pos.y);
  return TBOX(left, bottom, right, top);
}
// Hides the SPLIT so the outlines appear not to be cut by it.
void SPLIT::Hide() const {
EDGEPT* edgept = point1;
do {
edgept->Hide();
edgept = edgept->next;
} while (!edgept->EqualPos(*point2) && edgept != point1);
edgept = point2;
do {
edgept->Hide();
edgept = edgept->next;
} while (!edgept->EqualPos(*point1) && edgept != point2);
}
// Undoes hide, so the outlines are cut by the SPLIT.
void SPLIT::Reveal() const {
EDGEPT* edgept = point1;
do {
edgept->Reveal();
edgept = edgept->next;
} while (!edgept->EqualPos(*point2) && edgept != point1);
edgept = point2;
do {
edgept->Reveal();
edgept = edgept->next;
} while (!edgept->EqualPos(*point1) && edgept != point2);
}
// Compute a split priority based on the bounding boxes of the parts.
// The arguments here are config parameters defined in Wordrec. Add chop_
// to the beginning of the name. Lower priorities are better; kBadPriority
// marks a useless split.
float SPLIT::FullPriority(int xmin, int xmax, double overlap_knob,
                          int centered_maxwidth, double center_knob,
                          double width_change_knob) const {
  TBOX box1 = Box12();
  TBOX box2 = Box21();
  int min_left = MIN(box1.left(), box2.left());
  int max_right = MAX(box1.right(), box2.right());
  // Splits whose whole horizontal span falls strictly inside (xmin, xmax)
  // are rejected outright.
  // NOTE(review): confirm intended window semantics of xmin/xmax here.
  if (xmin < min_left && xmax > max_right) return kBadPriority;
  float grade = 0.0f;
  // grade_overlap: penalize horizontal overlap between the two parts.
  int width1 = box1.width();
  int width2 = box2.width();
  int min_width = MIN(width1, width2);
  // x_gap is negative when the boxes overlap, so negate to get overlap.
  int overlap = -box1.x_gap(box2);
  if (overlap == min_width) {
    grade += 100.0f;  // Total overlap.
  } else {
    // Overlap of more than half the narrower part gets an extra penalty.
    if (2 * overlap > min_width) overlap += 2 * overlap - min_width;
    if (overlap > 0) grade += overlap_knob * overlap;
  }
  // grade_center_of_blob: penalize width imbalance between the parts, but
  // only when at least one part is narrow enough to be "centered"; the
  // penalty is capped at kCenterGradeCap.
  if (width1 <= centered_maxwidth || width2 <= centered_maxwidth) {
    grade += MIN(kCenterGradeCap, center_knob * abs(width1 - width2));
  }
  // grade_width_change: penalize when the combined span is not much wider
  // than the wider of the two parts.
  float width_change_grade = 20 - (max_right - min_left - MAX(width1, width2));
  if (width_change_grade > 0.0f)
    grade += width_change_grade * width_change_knob;
  return grade;
}
// Returns true if *this SPLIT appears OK in the sense that it does not cross
// any outlines and does not chop off any ridiculously small pieces.
bool SPLIT::IsHealthy(const TBLOB& blob, int min_points, int min_area) const {
  // Cheap size check first; only then test for outline crossings.
  if (IsLittleChunk(min_points, min_area)) return false;
  return !blob.SegmentCrossesOutline(point1->pos, point2->pos);
}
// Returns true if the split generates a small chunk in terms of either area
// or number of points, checking the outline segment on each side of the cut.
bool SPLIT::IsLittleChunk(int min_points, int min_area) const {
  return (point1->ShortNonCircularSegment(min_points, point2) &&
          point1->SegmentArea(point2) < min_area) ||
         (point2->ShortNonCircularSegment(min_points, point1) &&
          point2->SegmentArea(point1) < min_area);
}
/**********************************************************************
* make_edgept
@ -135,102 +215,113 @@ void remove_edgept(EDGEPT *point) {
}
/**********************************************************************
* new_split
* Print
*
* Create a new split record and initialize it. Put it on the display
* list.
* Shows the coordinates of both points in a split.
**********************************************************************/
SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) {
SPLIT *s = new SPLIT;
s->point1 = point1;
s->point2 = point2;
return (s);
}
/**********************************************************************
* print_split
*
* Print a list of splits. Show the coordinates of both points in
* each split.
**********************************************************************/
void print_split(SPLIT *split) {
if (split) {
tprintf("(%d,%d)--(%d,%d)",
split->point1->pos.x, split->point1->pos.y,
split->point2->pos.x, split->point2->pos.y);
// Shows the coordinates of both end points of the split.
void SPLIT::Print() const {
  // NOTE(review): the `this != NULL` guard relies on undefined behavior
  // (calling a member function through a null pointer); modern compilers
  // may optimize the check away entirely. Prefer a null check at the
  // call site instead — confirm callers before removing.
  if (this != NULL) {
    tprintf("(%d,%d)--(%d,%d)", point1->pos.x, point1->pos.y, point2->pos.x,
            point2->pos.y);
  }
}
#ifndef GRAPHICS_DISABLED
// Draws the split in the given window as a green line between the two
// end points, then refreshes the display.
void SPLIT::Mark(ScrollView* window) const {
  int x1 = point1->pos.x, y1 = point1->pos.y;
  int x2 = point2->pos.x, y2 = point2->pos.y;
  window->Pen(ScrollView::GREEN);
  window->Line(x1, y1, x2, y2);
  window->UpdateWindow();
}
#endif
/**********************************************************************
* split_outline
*
* Split between these two edge points.
**********************************************************************/
void split_outline(EDGEPT *join_point1, EDGEPT *join_point2) {
assert(join_point1 != join_point2);
// Creates two outlines out of one by splitting the original one in half.
// Inserts the resulting outlines into the given list.
void SPLIT::SplitOutlineList(TESSLINE* outlines) const {
SplitOutline();
while (outlines->next != NULL) outlines = outlines->next;
EDGEPT* temp2 = join_point2->next;
EDGEPT* temp1 = join_point1->next;
/* Create two new points */
EDGEPT* new_point1 = make_edgept(join_point1->pos.x, join_point1->pos.y,
temp1, join_point2);
EDGEPT* new_point2 = make_edgept(join_point2->pos.x, join_point2->pos.y,
temp2, join_point1);
// Join_point1 and 2 are now cross-over points, so they must have NULL
// src_outlines and give their src_outline information their new
// replacements.
new_point1->src_outline = join_point1->src_outline;
new_point1->start_step = join_point1->start_step;
new_point1->step_count = join_point1->step_count;
new_point2->src_outline = join_point2->src_outline;
new_point2->start_step = join_point2->start_step;
new_point2->step_count = join_point2->step_count;
join_point1->src_outline = NULL;
join_point1->start_step = 0;
join_point1->step_count = 0;
join_point2->src_outline = NULL;
join_point2->start_step = 0;
join_point2->step_count = 0;
join_point1->MarkChop();
join_point2->MarkChop();
outlines->next = new TESSLINE;
outlines->next->loop = point1;
outlines->next->ComputeBoundingBox();
outlines = outlines->next;
outlines->next = new TESSLINE;
outlines->next->loop = point2;
outlines->next->ComputeBoundingBox();
outlines->next->next = NULL;
}
// Makes a split between these two edge points, but does not affect the
// outlines to which they belong. Two new edge points are created at the
// same coordinates as point1/point2, turning the single loop into two
// separate loops joined at the cut.
void SPLIT::SplitOutline() const {
  // Remember the successors before rewiring.
  EDGEPT* temp2 = point2->next;
  EDGEPT* temp1 = point1->next;
  /* Create two new points */
  // Each new point is a positional copy of one end, spliced so that the
  // cut crosses over: new_point1 links temp1 back to point2, new_point2
  // links temp2 back to point1.
  EDGEPT* new_point1 = make_edgept(point1->pos.x, point1->pos.y, temp1, point2);
  EDGEPT* new_point2 = make_edgept(point2->pos.x, point2->pos.y, temp2, point1);
  // point1 and 2 are now cross-over points, so they must have NULL
  // src_outlines and give their src_outline information their new
  // replacements.
  new_point1->src_outline = point1->src_outline;
  new_point1->start_step = point1->start_step;
  new_point1->step_count = point1->step_count;
  new_point2->src_outline = point2->src_outline;
  new_point2->start_step = point2->start_step;
  new_point2->step_count = point2->step_count;
  point1->src_outline = NULL;
  point1->start_step = 0;
  point1->step_count = 0;
  point2->src_outline = NULL;
  point2->start_step = 0;
  point2->step_count = 0;
}
/**********************************************************************
* unsplit_outlines
*
* Remove the split that was put between these two points.
**********************************************************************/
void unsplit_outlines(EDGEPT *p1, EDGEPT *p2) {
EDGEPT *tmp1 = p1->next;
EDGEPT *tmp2 = p2->next;
// Undoes the effect of SplitOutlineList, correcting the outlines for undoing
// the split, but possibly leaving some duplicate outlines.
// The two half-outlines are rejoined by UnsplitOutlines(), then both loop
// heads are pushed onto the front of the blob's outline list.
void SPLIT::UnsplitOutlineList(TBLOB* blob) const {
  /* Modify edge points */
  UnsplitOutlines();

  TESSLINE* outline1 = new TESSLINE;
  outline1->next = blob->outlines;
  blob->outlines = outline1;
  outline1->loop = point1;

  TESSLINE* outline2 = new TESSLINE;
  outline2->next = blob->outlines;
  blob->outlines = outline2;
  outline2->loop = point2;
}
// tmp2 is coincident with p1. p1 takes tmp2's place as tmp2 is deleted.
p1->next = tmp2->next;
p1->src_outline = tmp2->src_outline;
p1->start_step = tmp2->start_step;
p1->step_count = tmp2->step_count;
// Likewise p2 takes tmp1's place.
p2->next = tmp1->next;
p2->src_outline = tmp1->src_outline;
p2->start_step = tmp1->start_step;
p2->step_count = tmp1->step_count;
p1->UnmarkChop();
p2->UnmarkChop();
// Removes the split that was put between these two points, rejoining the
// two loops into one. The two cross-over copies created by SplitOutline
// (the successors of point1/point2) are unlinked and deleted, and point1
// and point2 reclaim their src_outline bookkeeping from them. Finally the
// direction vectors of both points are recomputed from the new links.
void SPLIT::UnsplitOutlines() const {
  EDGEPT* tmp1 = point1->next;
  EDGEPT* tmp2 = point2->next;

  tmp1->next->prev = point2;
  tmp2->next->prev = point1;

  // tmp2 is coincident with point1. point1 takes tmp2's place as tmp2 is
  // deleted.
  point1->next = tmp2->next;
  point1->src_outline = tmp2->src_outline;
  point1->start_step = tmp2->start_step;
  point1->step_count = tmp2->step_count;
  // Likewise point2 takes tmp1's place.
  point2->next = tmp1->next;
  point2->src_outline = tmp1->src_outline;
  point2->start_step = tmp1->start_step;
  point2->step_count = tmp1->step_count;

  delete tmp1;
  delete tmp2;

  point1->vec.x = point1->next->pos.x - point1->pos.x;
  point1->vec.y = point1->next->pos.y - point1->pos.y;

  point2->vec.x = point2->next->pos.x - point2->pos.x;
  point2->vec.y = point2->next->pos.y - point2->pos.y;
}

View File

@ -29,18 +29,80 @@
I n c l u d e s
----------------------------------------------------------------------*/
#include "blobs.h"
#include "oldlist.h"
#include "scrollview.h"
/*----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------*/
typedef struct split_record
{ /* SPLIT */
struct SPLIT {
SPLIT() : point1(NULL), point2(NULL) {}
SPLIT(EDGEPT* pt1, EDGEPT* pt2) : point1(pt1), point2(pt2) {}
// Returns the bounding box of all the points in the split.
TBOX bounding_box() const;
// Returns the bounding box of the outline from point1 to point2.
TBOX Box12() const { return point1->SegmentBox(point2); }
// Returns the bounding box of the outline from point1 to point1.
TBOX Box21() const { return point2->SegmentBox(point1); }
// Returns the bounding box of the out
// Hides the SPLIT so the outlines appear not to be cut by it.
void Hide() const;
// Undoes hide, so the outlines are cut by the SPLIT.
void Reveal() const;
// Returns true if the given EDGEPT is used by this SPLIT, checking only
// the EDGEPT pointer, not the coordinates.
bool UsesPoint(const EDGEPT* point) const {
return point1 == point || point2 == point;
}
// Returns true if the other SPLIT has any position shared with *this.
bool SharesPosition(const SPLIT& other) const {
return point1->EqualPos(*other.point1) || point1->EqualPos(*other.point2) ||
point2->EqualPos(*other.point1) || point2->EqualPos(*other.point2);
}
// Returns true if both points are contained within the blob.
bool ContainedByBlob(const TBLOB& blob) const {
return blob.Contains(point1->pos) && blob.Contains(point2->pos);
}
// Returns true if both points are contained within the outline.
bool ContainedByOutline(const TESSLINE& outline) const {
return outline.Contains(point1->pos) && outline.Contains(point2->pos);
}
// Compute a split priority based on the bounding boxes of the parts.
// The arguments here are config parameters defined in Wordrec. Add chop_
// to the beginning of the name.
float FullPriority(int xmin, int xmax, double overlap_knob,
int centered_maxwidth, double center_knob,
double width_change_knob) const;
// Returns true if *this SPLIT appears OK in the sense that it does not cross
// any outlines and does not chop off any ridiculously small pieces.
bool IsHealthy(const TBLOB& blob, int min_points, int min_area) const;
// Returns true if the split generates a small chunk in terms of either area
// or number of points.
bool IsLittleChunk(int min_points, int min_area) const;
void Print() const;
#ifndef GRAPHICS_DISABLED
// Draws the split in the given window.
void Mark(ScrollView* window) const;
#endif
// Creates two outlines out of one by splitting the original one in half.
// Inserts the resulting outlines into the given list.
void SplitOutlineList(TESSLINE* outlines) const;
// Makes a split between these two edge points, but does not affect the
// outlines to which they belong.
void SplitOutline() const;
// Undoes the effect of SplitOutlineList, correcting the outlines for undoing
// the split, but possibly leaving some duplicate outlines.
void UnsplitOutlineList(TBLOB* blob) const;
// Removes the split that was put between these two points.
void UnsplitOutlines() const;
EDGEPT *point1;
EDGEPT *point2;
} SPLIT;
typedef LIST SPLITS; /* SPLITS */
};
/*----------------------------------------------------------------------
V a r i a b l e s
@ -48,38 +110,11 @@ typedef LIST SPLITS; /* SPLITS */
extern BOOL_VAR_H(wordrec_display_splits, 0, "Display splits");
/*----------------------------------------------------------------------
M a c r o s
----------------------------------------------------------------------*/
/**********************************************************************
* clone_split
*
* Create a new split record and set the contents equal to the contents
* of this record.
**********************************************************************/
#define clone_split(dest,source) \
if (source) \
(dest) = new_split ((source)->point1, (source)->point2); \
else \
(dest) = (SPLIT*) NULL \
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
void delete_split(SPLIT *split);
EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev);
void remove_edgept(EDGEPT *point);
SPLIT *new_split(EDGEPT *point1, EDGEPT *point2);
void print_split(SPLIT *split);
void split_outline(EDGEPT *join_point1, EDGEPT *join_point2);
void unsplit_outlines(EDGEPT *p1, EDGEPT *p2);
#endif

View File

@ -30,6 +30,7 @@
I n c l u d e s
----------------------------------------------------------------------*/
#include "vecfuncs.h"
#include "blobs.h"
/*----------------------------------------------------------------------
F u n c t i o n s

View File

@ -26,7 +26,6 @@
#define VECFUNCS_H
#include <math.h>
#include "blobs.h"
struct EDGEPT;

View File

@ -160,23 +160,37 @@ WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
* row being marked as FUZZY space.
*/
TBOX WERD::bounding_box() {
TBOX box; // box being built
C_BLOB_IT rej_cblob_it = &rej_cblobs; // rejected blobs
// Returns the default bounding box: all good blobs plus every upper and
// lower noise/diacritic element.
TBOX WERD::bounding_box() const {
  return restricted_bounding_box(true, true);
}
for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
rej_cblob_it.forward()) {
box += rej_cblob_it.data()->bounding_box();
// Returns the bounding box including the desired combination of upper and
// lower noise/diacritic elements. Starts from the box of the good blobs
// and grows it by each rejected blob that passes the upper/lower filter.
TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
  TBOX result = true_bounding_box();
  const int body_bottom = result.bottom();
  const int body_top = result.top();
  // Read-only scan of the rejected (noise) blobs.
  C_BLOB_IT rej_it(const_cast<C_BLOB_LIST*>(&rej_cblobs));
  for (rej_it.mark_cycle_pt(); !rej_it.cycled_list(); rej_it.forward()) {
    TBOX dot_box = rej_it.data()->bounding_box();
    bool ok_above = upper_dots || dot_box.bottom() <= body_top;
    bool ok_below = lower_dots || dot_box.top() >= body_bottom;
    if (ok_above && ok_below) {
      result += dot_box;
    }
  }
  return result;
}
C_BLOB_IT it = &cblobs; // blobs of WERD
// Returns the bounding box of only the good blobs, ignoring rejected
// noise/diacritic blobs entirely.
TBOX WERD::true_bounding_box() const {
  TBOX total;
  // Read-only scan of the good blobs.
  C_BLOB_IT blob_it(const_cast<C_BLOB_LIST*>(&cblobs));
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    total += blob_it.data()->bounding_box();
  }
  return total;
}
/**
* WERD::move
*
@ -489,3 +503,101 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
}
return new_werd;
}
// Removes noise from the word by moving small outlines to the rej_cblobs
// list, based on the size_threshold. An outline counts as "small" when the
// larger of its bounding-box width and height is below size_threshold.
// Blobs left with no outlines are deleted from the word.
void WERD::CleanNoise(float size_threshold) {
  C_BLOB_IT blob_it(&cblobs);     // the word's good blobs
  C_BLOB_IT rej_it(&rej_cblobs);  // destination list for noise
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB* blob = blob_it.data();
    C_OUTLINE_IT ol_it(blob->out_list());
    for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
      C_OUTLINE* outline = ol_it.data();
      TBOX ol_box = outline->bounding_box();
      // Size measure is the larger bounding-box dimension.
      int ol_size =
          ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
      if (ol_size < size_threshold) {
        // This outline is too small. Move it to a separate blob in the
        // reject blobs list.
        C_BLOB* rej_blob = new C_BLOB(ol_it.extract());
        rej_it.add_after_then_move(rej_blob);
      }
    }
    // Drop the blob entirely if all its outlines were moved away.
    if (blob->out_list()->empty()) delete blob_it.extract();
  }
}
// Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers.
// The rej_cblobs list is left empty and the emptied blob shells deleted.
void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) {
  C_BLOB_IT noise_it(&rej_cblobs);
  noise_it.mark_cycle_pt();
  while (!noise_it.empty()) {
    C_BLOB* noise_blob = noise_it.extract();
    C_OUTLINE_IT outline_it(noise_blob->out_list());
    outlines->push_back(outline_it.extract());
    delete noise_blob;
    noise_it.forward();
  }
}
// Adds the selected outlines to the indicated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is
// NULL, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted,
                               const GenericVector<C_BLOB*>& target_blobs,
                               const GenericVector<C_OUTLINE*>& outlines,
                               bool* make_next_word_fuzzy) {
  bool outline_added_to_start = false;
  if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false;
  C_BLOB_IT rej_it(&rej_cblobs);
  for (int i = 0; i < outlines.size(); ++i) {
    C_OUTLINE* outline = outlines[i];
    if (outline == NULL) continue;  // Already used it.
    if (wanted[i]) {
      C_BLOB* target_blob = target_blobs[i];
      TBOX noise_box = outline->bounding_box();
      if (target_blob == NULL) {
        // No existing target: wrap the outline in a brand-new blob and
        // insert it into cblobs in left-to-right x order.
        target_blob = new C_BLOB(outline);
        // Need to find the insertion point.
        C_BLOB_IT blob_it(&cblobs);
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (blob_box.left() > noise_box.left()) {
            if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
              // We might want to join this word to its predecessor.
              outline_added_to_start = true;
            }
            blob_it.add_before_stay_put(target_blob);
            break;
          }
        }
        // Fell off the end of the list: the new blob goes last, so the
        // next word may need to be joined to this one.
        if (blob_it.cycled_list()) {
          blob_it.add_to_end(target_blob);
          if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true;
        }
        // Add all consecutive wanted, but null-blob outlines to same blob.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        while (i + 1 < outlines.size() && wanted[i + 1] &&
               target_blobs[i + 1] == NULL) {
          ++i;
          ol_it.add_to_end(outlines[i]);
        }
      } else {
        // Insert outline into this blob.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        ol_it.add_to_end(outline);
      }
    } else {
      // Put back on noise list.
      rej_it.add_to_end(new C_BLOB(outline));
    }
  }
  return outline_added_to_start;
}

View File

@ -114,7 +114,13 @@ class WERD : public ELIST2_LINK {
script_id_ = id;
}
TBOX bounding_box(); // compute bounding box
// Returns the (default) bounding box including all the dots.
TBOX bounding_box() const; // compute bounding box
// Returns the bounding box including the desired combination of upper and
// lower noise/diacritic elements.
TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
// Returns the bounding box of only the good blobs.
TBOX true_bounding_box() const;
const char *text() const { return correct.string(); }
void set_text(const char *new_text) { correct = new_text; }
@ -155,6 +161,26 @@ class WERD : public ELIST2_LINK {
void plot_rej_blobs(ScrollView *window);
#endif // GRAPHICS_DISABLED
// Removes noise from the word by moving small outlines to the rej_cblobs
// list, based on the size_threshold.
void CleanNoise(float size_threshold);
// Extracts all the noise outlines and stuffs the pointers into the given
// vector of outlines. Afterwards, the outlines vector owns the pointers.
void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
// Adds the selected outlines to the indcated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is
// NULL, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool AddSelectedOutlines(const GenericVector<bool> &wanted,
const GenericVector<C_BLOB *> &target_blobs,
const GenericVector<C_OUTLINE *> &outlines,
bool *make_next_word_fuzzy);
private:
uinT8 blanks; // no of blanks
uinT8 dummy; // padding

View File

@ -1,3 +1,4 @@
AUTOMAKE_OPTIONS = subdir-objects
SUBDIRS =
AM_CXXFLAGS =
@ -40,8 +41,7 @@ libtesseract_ccutil_la_SOURCES = \
unichar.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp universalambigs.cpp
if MINGW
if T_WIN
AM_CPPFLAGS += -I$(top_srcdir)/vs2008/port -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
noinst_HEADERS += ../vs2010/port/strtok_r.h
libtesseract_ccutil_la_SOURCES += ../vs2010/port/strtok_r.cpp

View File

@ -24,13 +24,13 @@
#include "helpers.h"
#include "universalambigs.h"
#ifdef _WIN32
#if defined _WIN32 || defined(__CYGWIN__)
#ifndef __GNUC__
#define strtok_r strtok_s
#else
#include "strtok_r.h"
#endif /* __GNUC__ */
#endif /* _WIN32 */
#endif /* _WIN32 __CYGWIN__*/
namespace tesseract {

View File

@ -445,8 +445,10 @@ class PointerVector : public GenericVector<T*> {
}
PointerVector<T>& operator=(const PointerVector& other) {
if (&other != this) {
this->truncate(0);
this->operator+=(other);
}
return *this;
}
@ -777,8 +779,10 @@ GenericVector<T> &GenericVector<T>::operator+=(const GenericVector& other) {
template <typename T>
GenericVector<T> &GenericVector<T>::operator=(const GenericVector& other) {
if (&other != this) {
this->truncate(0);
this->operator+=(other);
}
return *this;
}

View File

@ -28,10 +28,12 @@
#define ultoa _ultoa
#endif /* __GNUC__ */
#define SIGNED
#if defined(_MSC_VER)
#define snprintf _snprintf
#if (_MSC_VER <= 1400)
#define vsnprintf _vsnprintf
#endif /* _WIN32 */
#endif /* (_MSC_VER <= 1400) */
#endif /* defined(_MSC_VER) */
#else
#define __UNIX__
#include <limits.h>

View File

@ -34,7 +34,7 @@
#include "tprintf.h"
// workaround for "'off_t' was not declared in this scope" with -std=c++11
#if !defined(off_t) && !defined(__APPLE__)
#if !defined(off_t) && !defined(__APPLE__) && !defined(__CYGWIN__)
typedef long off_t;
#endif // off_t

View File

@ -61,9 +61,11 @@ bool TFile::Open(FILE* fp, inT64 end_offset) {
offset_ = 0;
inT64 current_pos = ftell(fp);
if (end_offset < 0) {
fseek(fp, 0, SEEK_END);
if (fseek(fp, 0, SEEK_END))
return false;
end_offset = ftell(fp);
fseek(fp, current_pos, SEEK_SET);
if (fseek(fp, current_pos, SEEK_SET))
return false;
}
int size = end_offset - current_pos;
is_writing_ = false;

View File

@ -95,21 +95,30 @@ void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
delete[] chunk;
}
void TessdataManager::WriteMetadata(inT64 *offset_table,
bool TessdataManager::WriteMetadata(inT64 *offset_table,
const char * language_data_path_prefix,
FILE *output_file) {
fseek(output_file, 0, SEEK_SET);
inT32 num_entries = TESSDATA_NUM_ENTRIES;
fwrite(&num_entries, sizeof(inT32), 1, output_file);
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
bool result = true;
if (fseek(output_file, 0, SEEK_SET) != 0 ||
fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
output_file) != TESSDATA_NUM_ENTRIES) {
fclose(output_file);
result = false;
tprintf("WriteMetadata failed in TessdataManager!\n");
} else if (fclose(output_file)) {
result = false;
tprintf("WriteMetadata failed to close file!\n");
} else {
tprintf("TessdataManager combined tesseract data files.\n");
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
language_data_path_prefix, kTessdataFileSuffixes[i],
offset_table[i]);
}
}
return result;
}
bool TessdataManager::CombineDataFiles(
@ -124,8 +133,11 @@ bool TessdataManager::CombineDataFiles(
return false;
}
// Leave some space for recording the offset_table.
fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
if (fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
tprintf("Error seeking %s\n", output_filename);
return false;
}
TessdataType type = TESSDATA_NUM_ENTRIES;
bool text_file = false;
@ -161,8 +173,7 @@ bool TessdataManager::CombineDataFiles(
return false;
}
WriteMetadata(offset_table, language_data_path_prefix, output_file);
return true;
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
}
bool TessdataManager::OverwriteComponents(
@ -185,8 +196,12 @@ bool TessdataManager::OverwriteComponents(
}
// Leave some space for recording the offset_table.
fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
if (fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
fclose(output_file);
tprintf("Error seeking %s\n", new_traineddata_filename);
return false;
}
// Open the files with the new components.
for (i = 0; i < num_new_components; ++i) {
@ -212,8 +227,7 @@ bool TessdataManager::OverwriteComponents(
}
}
const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
WriteMetadata(offset_table, language_data_path_prefix, output_file);
return true;
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
}
bool TessdataManager::TessdataTypeFromFileSuffix(

View File

@ -199,8 +199,10 @@ class TessdataManager {
return swap_;
}
/** Writes the number of entries and the given offset table to output_file. */
static void WriteMetadata(inT64 *offset_table,
/** Writes the number of entries and the given offset table to output_file.
* Returns false on error.
*/
static bool WriteMetadata(inT64 *offset_table,
const char *language_data_path_prefix,
FILE *output_file);

View File

@ -206,12 +206,20 @@ UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
}
// Converts a utf-8 string to a vector of unicodes.
void UNICHAR::UTF8ToUnicode(const char* utf8_str,
// Returns false if the input contains invalid UTF-8, and replaces
// the rest of the string with a single space.
bool UNICHAR::UTF8ToUnicode(const char* utf8_str,
GenericVector<int>* unicodes) {
const int utf8_length = strlen(utf8_str);
const_iterator end_it(end(utf8_str, utf8_length));
for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
if (it.is_legal()) {
unicodes->push_back(*it);
} else {
unicodes->push_back(' ');
return false;
}
}
return true;
}

View File

@ -151,7 +151,9 @@ class UNICHAR {
static const_iterator end(const char* utf8_str, const int byte_length);
// Converts a utf-8 string to a vector of unicodes.
static void UTF8ToUnicode(const char* utf8_str, GenericVector<int>* unicodes);
// Returns false if the input contains invalid UTF-8, and replaces
// the rest of the string with a single space.
static bool UTF8ToUnicode(const char* utf8_str, GenericVector<int>* unicodes);
private:
// A UTF-8 representation of 1 or more Unicode characters.

View File

@ -867,7 +867,10 @@ bool UNICHARSET::load_via_fgets(
// Skip fragments if needed.
CHAR_FRAGMENT *frag = NULL;
if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
int num_pieces = frag->get_total();
delete frag;
// Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
if (num_pieces > 1)
continue;
}
// Insert unichar into unicharset and set its properties.
@ -982,8 +985,10 @@ bool UNICHARSET::major_right_to_left() const {
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
// An empty or NULL blacklist has no effect.
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
const char* whitelist) {
const char* whitelist,
const char* unblacklist) {
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
// Set everything to default
for (int ch = 0; ch < size_used; ++ch)
@ -1006,6 +1011,15 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
unichars[encoding[i]].properties.enabled = false;
}
}
if (unblacklist != NULL && unblacklist[0] != '\0') {
// Re-enable the unblacklist.
GenericVector<UNICHAR_ID> encoding;
encode_string(unblacklist, false, &encoding, NULL, NULL);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
unichars[encoding[i]].properties.enabled = true;
}
}
}
int UNICHARSET::add_script(const char* script) {

View File

@ -381,11 +381,14 @@ class UNICHARSET {
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
// An empty or NULL unblacklist has no effect.
// The blacklist overrides the whitelist.
// The unblacklist overrides the blacklist.
// Each list is a string of utf8 character strings. Boundaries between
// unicharset units are worked out automatically, and characters not in
// the unicharset are silently ignored.
void set_black_and_whitelist(const char* blacklist, const char* whitelist);
void set_black_and_whitelist(const char* blacklist, const char* whitelist,
const char* unblacklist);
// Set the isalpha property of the given unichar to the given value.
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
@ -614,6 +617,10 @@ class UNICHARSET {
unichars[unichar_id].properties.max_advance =
static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
}
// Returns true if the font metrics properties are empty.
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.AnyRangeEmpty();
}
// Return the script name of the given unichar.
// The returned pointer will always be the same for the same script, it's

View File

@ -11,15 +11,15 @@ endif
noinst_HEADERS = \
adaptive.h blobclass.h \
classify.h cluster.h clusttool.h cutoffs.h \
errorcounter.h extern.h extract.h \
featdefs.h flexfx.h float2int.h fpoint.h fxdefs.h \
errorcounter.h \
featdefs.h float2int.h fpoint.h \
intfeaturedist.h intfeaturemap.h intfeaturespace.h \
intfx.h intmatcher.h intproto.h kdtree.h \
mastertrainer.h mf.h mfdefs.h mfoutline.h mfx.h \
normfeat.h normmatch.h \
ocrfeatures.h outfeat.h picofeat.h protos.h \
sampleiterator.h shapeclassifier.h shapetable.h \
tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h
tessclassifier.h trainingsample.h trainingsampleset.h
if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_classify.la
@ -37,14 +37,14 @@ endif
libtesseract_classify_la_SOURCES = \
adaptive.cpp adaptmatch.cpp blobclass.cpp \
classify.cpp cluster.cpp clusttool.cpp cutoffs.cpp \
errorcounter.cpp extract.cpp \
featdefs.cpp flexfx.cpp float2int.cpp fpoint.cpp fxdefs.cpp \
errorcounter.cpp \
featdefs.cpp float2int.cpp fpoint.cpp \
intfeaturedist.cpp intfeaturemap.cpp intfeaturespace.cpp \
intfx.cpp intmatcher.cpp intproto.cpp kdtree.cpp \
mastertrainer.cpp mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \
normfeat.cpp normmatch.cpp \
ocrfeatures.cpp outfeat.cpp picofeat.cpp protos.cpp \
sampleiterator.cpp shapeclassifier.cpp shapetable.cpp \
tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp xform2d.cpp
tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp

View File

@ -24,6 +24,7 @@
#endif
#include <ctype.h>
#include "shapeclassifier.h"
#include "ambigs.h"
#include "blobclass.h"
#include "blobs.h"
@ -73,23 +74,18 @@
#define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
#define WORST_POSSIBLE_RATING (1.0)
#define WORST_POSSIBLE_RATING (0.0f)
struct ScoredClass {
CLASS_ID unichar_id;
int shape_id;
FLOAT32 rating;
bool adapted;
inT16 config;
inT16 fontinfo_id;
inT16 fontinfo_id2;
};
using tesseract::UnicharRating;
using tesseract::ScoredFont;
struct ADAPT_RESULTS {
inT32 BlobLength;
bool HasNonfragment;
GenericVector<ScoredClass> match;
ScoredClass best_match;
UNICHAR_ID best_unichar_id;
int best_match_index;
FLOAT32 best_rating;
GenericVector<UnicharRating> match;
GenericVector<CP_RESULT_STRUCT> CPResults;
/// Initializes data members to the default values. Sets the initial
@ -97,13 +93,20 @@ struct ADAPT_RESULTS {
inline void Initialize() {
BlobLength = MAX_INT32;
HasNonfragment = false;
best_match.unichar_id = NO_CLASS;
best_match.shape_id = -1;
best_match.rating = WORST_POSSIBLE_RATING;
best_match.adapted = false;
best_match.config = 0;
best_match.fontinfo_id = kBlankFontinfoId;
best_match.fontinfo_id2 = kBlankFontinfoId;
ComputeBest();
}
// Computes best_unichar_id, best_match_index and best_rating.
void ComputeBest() {
best_unichar_id = INVALID_UNICHAR_ID;
best_match_index = -1;
best_rating = WORST_POSSIBLE_RATING;
for (int i = 0; i < match.size(); ++i) {
if (match[i].rating > best_rating) {
best_rating = match[i].rating;
best_unichar_id = match[i].unichar_id;
best_match_index = i;
}
}
}
};
@ -116,17 +119,30 @@ struct PROTO_KEY {
/*-----------------------------------------------------------------------------
Private Macros
-----------------------------------------------------------------------------*/
#define MarginalMatch(Rating) \
((Rating) > matcher_great_threshold)
inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
return (1.0f - confidence) > matcher_great_threshold;
}
/*-----------------------------------------------------------------------------
Private Function Prototypes
-----------------------------------------------------------------------------*/
int CompareByRating(const void *arg1, const void *arg2);
// Returns the index of the given id in results, if present, or the size of the
// vector (index it will go at) if not present.
static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
for (int i = 0; i < results.match.size(); i++) {
if (results.match[i].unichar_id == id)
return i;
}
return results.match.size();
}
ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id);
ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id);
// Returns the current rating for a unichar id if we have rated it, defaulting
// to WORST_POSSIBLE_RATING.
static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
int index = FindScoredUnichar(id, results);
if (index >= results.match.size()) return WORST_POSSIBLE_RATING;
return results.match[index].rating;
}
void InitMatcherRatings(register FLOAT32 *Rating);
@ -176,19 +192,21 @@ void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
DoAdaptiveMatch(Blob, Results);
RemoveBadMatches(Results);
Results->match.sort(CompareByRating);
Results->match.sort(&UnicharRating::SortDescendingRating);
RemoveExtraPuncs(Results);
Results->ComputeBest();
ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
Choices);
if (matcher_debug_level >= 1) {
cprintf ("AD Matches = ");
PrintAdaptiveMatchResults(stdout, Results);
}
// TODO(rays) Move to before ConvertMatchesToChoices!
if (LargeSpeckle(*Blob) || Choices->length() == 0)
AddLargeSpeckleTo(Results->BlobLength, Choices);
if (matcher_debug_level >= 1) {
tprintf("AD Matches = ");
PrintAdaptiveMatchResults(*Results);
}
#ifndef GRAPHICS_DISABLED
if (classify_enable_adaptive_debugger)
DebugAdaptiveClassifier(Blob, Results);
@ -220,17 +238,15 @@ void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
// Learns the given word using its chopped_word, seam_array, denorm,
// box_word, best_state, and correct_text to learn both correctly and
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
// is called and the data will be saved in an internal buffer.
// Otherwise AdaptToBlob is called for adaption within a document.
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
// be learned, otherwise all chars with good correct_text are learned.
void Classify::LearnWord(const char* filename, WERD_RES *word) {
void Classify::LearnWord(const char* fontname, WERD_RES* word) {
int word_len = word->correct_text.size();
if (word_len == 0) return;
float* thresholds = NULL;
if (filename == NULL) {
if (fontname == NULL) {
// Adaption mode.
if (!EnableLearning || word->best_choice == NULL)
return; // Can't or won't adapt.
@ -267,8 +283,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (word->correct_text[ch].length() > 0) {
float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;
LearnPieces(filename, start_blob, word->best_state[ch],
threshold, CST_WHOLE, word->correct_text[ch].string(), word);
LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
CST_WHOLE, word->correct_text[ch].string(), word);
if (word->best_state[ch] > 1 && !disable_character_fragments) {
// Check that the character breaks into meaningful fragments
@ -301,8 +317,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (i != tokens.size() - 1)
full_string += ' ';
}
LearnPieces(filename, start_blob + frag, 1,
threshold, CST_FRAGMENT, full_string.string(), word);
LearnPieces(fontname, start_blob + frag, 1, threshold,
CST_FRAGMENT, full_string.string(), word);
}
}
}
@ -314,13 +330,13 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (word->best_state[ch] > 1) {
// If the next blob is good, make junk with the rightmost fragment.
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
LearnPieces(filename, start_blob + word->best_state[ch] - 1,
LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
word->best_state[ch + 1] + 1,
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
}
// If the previous blob is good, make junk with the leftmost fragment.
if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
LearnPieces(filename, start_blob - word->best_state[ch - 1],
LearnPieces(fontname, start_blob - word->best_state[ch - 1],
word->best_state[ch - 1] + 1,
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
}
@ -329,7 +345,7 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
STRING joined_text = word->correct_text[ch];
joined_text += word->correct_text[ch + 1];
LearnPieces(filename, start_blob,
LearnPieces(fontname, start_blob,
word->best_state[ch] + word->best_state[ch + 1],
threshold, CST_NGRAM, joined_text.string(), word);
}
@ -342,16 +358,16 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
// Builds a blob of length fragments, from the word, starting at start,
// and then learns it, as having the given correct_text.
// If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// If fontname is not NULL, then LearnBlob is called and the data will be
// saved in an internal buffer for static training.
// Otherwise AdaptToBlob is called for adaption within a document.
// threshold is a magic number required by AdaptToChar and generated by
// ComputeAdaptionThresholds.
// Although it can be partly inferred from the string, segmentation is
// provided to explicitly clarify the character segmentation.
void Classify::LearnPieces(const char* filename, int start, int length,
void Classify::LearnPieces(const char* fontname, int start, int length,
float threshold, CharSegmentationType segmentation,
const char* correct_text, WERD_RES *word) {
const char* correct_text, WERD_RES* word) {
// TODO(daria) Remove/modify this if/when we want
// to train and/or adapt to n-grams.
if (segmentation != CST_WHOLE &&
@ -359,8 +375,8 @@ void Classify::LearnPieces(const char* filename, int start, int length,
return;
if (length > 1) {
join_pieces(word->seam_array, start, start + length - 1,
word->chopped_word);
SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
start + length - 1);
}
TBLOB* blob = word->chopped_word->blobs[start];
// Rotate the blob if needed for classification.
@ -385,7 +401,7 @@ void Classify::LearnPieces(const char* filename, int start, int length,
}
#endif // GRAPHICS_DISABLED
if (filename != NULL) {
if (fontname != NULL) {
classify_norm_method.set_value(character); // force char norm spc 30/11/93
tess_bn_matching.set_value(false); // turn it off
tess_cn_matching.set_value(false);
@ -393,8 +409,7 @@ void Classify::LearnPieces(const char* filename, int start, int length,
INT_FX_RESULT_STRUCT fx_info;
SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm,
&bl_denorm, &cn_denorm, &fx_info);
LearnBlob(feature_defs_, filename, rotated_blob, bl_denorm, cn_denorm,
fx_info, correct_text);
LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
} else if (unicharset.contains_unichar(correct_text)) {
UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
int font_id = word->fontinfo != NULL
@ -413,7 +428,8 @@ void Classify::LearnPieces(const char* filename, int start, int length,
delete rotated_blob;
}
break_pieces(word->seam_array, start, start + length - 1, word->chopped_word);
SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
start + length - 1);
} // LearnPieces.
/*---------------------------------------------------------------------------*/
@ -726,7 +742,7 @@ void Classify::InitAdaptedClass(TBLOB *Blob,
ConvertConfig (AllProtosOn, 0, IClass);
if (classify_learning_debug_level >= 1) {
cprintf ("Added new class '%s' with class id %d and %d protos.\n",
tprintf("Added new class '%s' with class id %d and %d protos.\n",
unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
if (classify_learning_debug_level > 1)
DisplayAdaptedChar(Blob, IClass);
@ -839,7 +855,7 @@ void Classify::AdaptToChar(TBLOB *Blob,
FLOAT32 Threshold) {
int NumFeatures;
INT_FEATURE_ARRAY IntFeatures;
INT_RESULT_STRUCT IntResult;
UnicharRating int_result;
INT_CLASS IClass;
ADAPT_CLASS Class;
TEMP_CONFIG TempConfig;
@ -849,13 +865,13 @@ void Classify::AdaptToChar(TBLOB *Blob,
if (!LegalClassId (ClassId))
return;
int_result.unichar_id = ClassId;
Class = AdaptedTemplates->Class[ClassId];
assert(Class != NULL);
if (IsEmptyAdaptedClass(Class)) {
InitAdaptedClass(Blob, ClassId, FontinfoId, Class, AdaptedTemplates);
}
else {
IClass = ClassForClassId (AdaptedTemplates->Templates, ClassId);
} else {
IClass = ClassForClassId(AdaptedTemplates->Templates, ClassId);
NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
if (NumFeatures <= 0)
@ -872,39 +888,38 @@ void Classify::AdaptToChar(TBLOB *Blob,
}
im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
NumFeatures, IntFeatures,
&IntResult, classify_adapt_feature_threshold,
&int_result, classify_adapt_feature_threshold,
NO_DEBUG, matcher_debug_separate_windows);
FreeBitVector(MatchingFontConfigs);
SetAdaptiveThreshold(Threshold);
if (IntResult.Rating <= Threshold) {
if (ConfigIsPermanent (Class, IntResult.Config)) {
if (1.0f - int_result.rating <= Threshold) {
if (ConfigIsPermanent(Class, int_result.config)) {
if (classify_learning_debug_level >= 1)
cprintf ("Found good match to perm config %d = %4.1f%%.\n",
IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
tprintf("Found good match to perm config %d = %4.1f%%.\n",
int_result.config, int_result.rating * 100.0);
FreeFeatureSet(FloatFeatures);
return;
}
TempConfig = TempConfigFor (Class, IntResult.Config);
TempConfig = TempConfigFor(Class, int_result.config);
IncreaseConfidence(TempConfig);
if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
}
if (classify_learning_debug_level >= 1)
cprintf ("Increasing reliability of temp config %d to %d.\n",
IntResult.Config, TempConfig->NumTimesSeen);
tprintf("Increasing reliability of temp config %d to %d.\n",
int_result.config, TempConfig->NumTimesSeen);
if (TempConfigReliable(ClassId, TempConfig)) {
MakePermanent(AdaptedTemplates, ClassId, IntResult.Config, Blob);
MakePermanent(AdaptedTemplates, ClassId, int_result.config, Blob);
UpdateAmbigsGroup(ClassId, Blob);
}
}
else {
} else {
if (classify_learning_debug_level >= 1) {
cprintf ("Found poor match to temp config %d = %4.1f%%.\n",
IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
tprintf("Found poor match to temp config %d = %4.1f%%.\n",
int_result.config, int_result.rating * 100.0);
if (classify_learning_debug_level > 2)
DisplayAdaptedChar(Blob, IClass);
}
@ -939,20 +954,20 @@ void Classify::DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class) {
&bl_features);
if (sample == NULL) return;
INT_RESULT_STRUCT IntResult;
UnicharRating int_result;
im_.Match(int_class, AllProtosOn, AllConfigsOn,
bl_features.size(), &bl_features[0],
&IntResult, classify_adapt_feature_threshold,
&int_result, classify_adapt_feature_threshold,
NO_DEBUG, matcher_debug_separate_windows);
cprintf ("Best match to temp config %d = %4.1f%%.\n",
IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
tprintf("Best match to temp config %d = %4.1f%%.\n",
int_result.config, int_result.rating * 100.0);
if (classify_learning_debug_level >= 2) {
uinT32 ConfigMask;
ConfigMask = 1 << IntResult.Config;
ConfigMask = 1 << int_result.config;
ShowMatchDisplay();
im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask,
bl_features.size(), &bl_features[0],
&IntResult, classify_adapt_feature_threshold,
&int_result, classify_adapt_feature_threshold,
6 | 0x19, matcher_debug_separate_windows);
UpdateMatchDisplay();
}
@ -988,44 +1003,34 @@ void Classify::DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class) {
* @note Exceptions: none
* @note History: Tue Mar 12 18:19:29 1991, DSJ, Created.
*/
void Classify::AddNewResult(ADAPT_RESULTS *results,
CLASS_ID class_id,
int shape_id,
FLOAT32 rating,
bool adapted,
int config,
int fontinfo_id,
int fontinfo_id2) {
ScoredClass *old_match = FindScoredUnichar(results, class_id);
ScoredClass match =
{ class_id,
shape_id,
rating,
adapted,
static_cast<inT16>(config),
static_cast<inT16>(fontinfo_id),
static_cast<inT16>(fontinfo_id2) };
void Classify::AddNewResult(const UnicharRating& new_result,
ADAPT_RESULTS *results) {
int old_match = FindScoredUnichar(new_result.unichar_id, *results);
if (rating > results->best_match.rating + matcher_bad_match_pad ||
(old_match && rating >= old_match->rating))
return;
if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
(old_match < results->match.size() &&
new_result.rating <= results->match[old_match].rating))
return; // New one not good enough.
if (!unicharset.get_fragment(class_id))
if (!unicharset.get_fragment(new_result.unichar_id))
results->HasNonfragment = true;
if (old_match)
old_match->rating = rating;
else
results->match.push_back(match);
if (old_match < results->match.size()) {
results->match[old_match].rating = new_result.rating;
} else {
results->match.push_back(new_result);
}
if (rating < results->best_match.rating &&
if (new_result.rating > results->best_rating &&
// Ensure that fragments do not affect best rating, class and config.
// This is needed so that at least one non-fragmented character is
// always present in the results.
// TODO(daria): verify that this helps accuracy and does not
// hurt performance.
!unicharset.get_fragment(class_id)) {
results->best_match = match;
!unicharset.get_fragment(new_result.unichar_id)) {
results->best_match_index = old_match;
results->best_rating = new_result.rating;
results->best_unichar_id = new_result.unichar_id;
}
} /* AddNewResult */
@ -1060,7 +1065,7 @@ void Classify::AmbigClassifier(
ADAPT_RESULTS *results) {
if (int_features.empty()) return;
uinT8* CharNormArray = new uinT8[unicharset.size()];
INT_RESULT_STRUCT IntResult;
UnicharRating int_result;
results->BlobLength = GetCharNormFeature(fx_info, templates, NULL,
CharNormArray);
@ -1073,17 +1078,18 @@ void Classify::AmbigClassifier(
while (*ambiguities >= 0) {
CLASS_ID class_id = *ambiguities;
int_result.unichar_id = class_id;
im_.Match(ClassForClassId(templates, class_id),
AllProtosOn, AllConfigsOn,
int_features.size(), &int_features[0],
&IntResult,
&int_result,
classify_adapt_feature_threshold, NO_DEBUG,
matcher_debug_separate_windows);
ExpandShapesAndApplyCorrections(NULL, debug, class_id, bottom, top, 0,
results->BlobLength,
classify_integer_matcher_multiplier,
CharNormArray, IntResult, results);
CharNormArray, &int_result, results);
ambiguities++;
}
delete [] CharNormArray;
@ -1104,14 +1110,15 @@ void Classify::MasterMatcher(INT_TEMPLATES templates,
ADAPT_RESULTS* final_results) {
int top = blob_box.top();
int bottom = blob_box.bottom();
UnicharRating int_result;
for (int c = 0; c < results.size(); c++) {
CLASS_ID class_id = results[c].Class;
INT_RESULT_STRUCT& int_result = results[c].IMResult;
BIT_VECTOR protos = classes != NULL ? classes[class_id]->PermProtos
: AllProtosOn;
BIT_VECTOR configs = classes != NULL ? classes[class_id]->PermConfigs
: AllConfigsOn;
int_result.unichar_id = class_id;
im_.Match(ClassForClassId(templates, class_id),
protos, configs,
num_features, features,
@ -1122,7 +1129,7 @@ void Classify::MasterMatcher(INT_TEMPLATES templates,
results[c].Rating,
final_results->BlobLength,
matcher_multiplier, norm_factors,
int_result, final_results);
&int_result, final_results);
}
}
@ -1135,65 +1142,76 @@ void Classify::ExpandShapesAndApplyCorrections(
ADAPT_CLASS* classes, bool debug, int class_id, int bottom, int top,
float cp_rating, int blob_length, int matcher_multiplier,
const uinT8* cn_factors,
INT_RESULT_STRUCT& int_result, ADAPT_RESULTS* final_results) {
// Compute the fontinfo_ids.
int fontinfo_id = kBlankFontinfoId;
int fontinfo_id2 = kBlankFontinfoId;
UnicharRating* int_result, ADAPT_RESULTS* final_results) {
if (classes != NULL) {
// Adapted result.
fontinfo_id = GetFontinfoId(classes[class_id], int_result.Config);
fontinfo_id2 = GetFontinfoId(classes[class_id], int_result.Config2);
// Adapted result. Convert configs to fontinfo_ids.
int_result->adapted = true;
for (int f = 0; f < int_result->fonts.size(); ++f) {
int_result->fonts[f].fontinfo_id =
GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
}
} else {
// Pre-trained result.
fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, int_result.Config);
fontinfo_id2 = ClassAndConfigIDToFontOrShapeID(class_id,
int_result.Config2);
// Pre-trained result. Map fonts using font_sets_.
int_result->adapted = false;
for (int f = 0; f < int_result->fonts.size(); ++f) {
int_result->fonts[f].fontinfo_id =
ClassAndConfigIDToFontOrShapeID(class_id,
int_result->fonts[f].fontinfo_id);
}
if (shape_table_ != NULL) {
// Actually fontinfo_id is an index into the shape_table_ and it
// contains a list of unchar_id/font_id pairs.
int shape_id = fontinfo_id;
const Shape& shape = shape_table_->GetShape(fontinfo_id);
double min_rating = 0.0;
// Two possible cases:
// 1. Flat shapetable. All unichar-ids of the shapes referenced by
// int_result->fonts are the same. In this case build a new vector of
// mapped fonts and replace the fonts in int_result.
// 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
// by int_result. In this case, build a vector of UnicharRating to
// gather together different font-ids for each unichar. Also covers case1.
GenericVector<UnicharRating> mapped_results;
for (int f = 0; f < int_result->fonts.size(); ++f) {
int shape_id = int_result->fonts[f].fontinfo_id;
const Shape& shape = shape_table_->GetShape(shape_id);
for (int c = 0; c < shape.size(); ++c) {
int unichar_id = shape[c].unichar_id;
fontinfo_id = shape[c].font_ids[0];
if (shape[c].font_ids.size() > 1)
fontinfo_id2 = shape[c].font_ids[1];
else if (fontinfo_id2 != kBlankFontinfoId)
fontinfo_id2 = shape_table_->GetShape(fontinfo_id2)[0].font_ids[0];
double rating = ComputeCorrectedRating(debug, unichar_id, cp_rating,
int_result.Rating,
int_result.FeatureMisses,
bottom, top, blob_length,
matcher_multiplier, cn_factors);
if (c == 0 || rating < min_rating)
min_rating = rating;
if (unicharset.get_enabled(unichar_id)) {
AddNewResult(final_results, unichar_id, shape_id, rating,
classes != NULL, int_result.Config,
fontinfo_id, fontinfo_id2);
if (!unicharset.get_enabled(unichar_id)) continue;
// Find the mapped_result for unichar_id.
int r = 0;
for (r = 0; r < mapped_results.size() &&
mapped_results[r].unichar_id != unichar_id; ++r) {}
if (r == mapped_results.size()) {
mapped_results.push_back(*int_result);
mapped_results[r].unichar_id = unichar_id;
mapped_results[r].fonts.truncate(0);
}
for (int i = 0; i < shape[c].font_ids.size(); ++i) {
mapped_results[r].fonts.push_back(
ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
}
}
int_result.Rating = min_rating;
}
for (int m = 0; m < mapped_results.size(); ++m) {
mapped_results[m].rating =
ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
cp_rating, int_result->rating,
int_result->feature_misses, bottom, top,
blob_length, matcher_multiplier, cn_factors);
AddNewResult(mapped_results[m], final_results);
}
return;
}
}
double rating = ComputeCorrectedRating(debug, class_id, cp_rating,
int_result.Rating,
int_result.FeatureMisses,
if (unicharset.get_enabled(class_id)) {
int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
int_result->rating,
int_result->feature_misses,
bottom, top, blob_length,
matcher_multiplier, cn_factors);
if (unicharset.get_enabled(class_id)) {
AddNewResult(final_results, class_id, -1, rating,
classes != NULL, int_result.Config,
fontinfo_id, fontinfo_id2);
AddNewResult(*int_result, final_results);
}
int_result.Rating = rating;
}
// Applies a set of corrections to the distance im_rating,
// Applies a set of corrections to the confidence im_rating,
// including the cn_correction, miss penalty and additional penalty
// for non-alnums being vertical misfits. Returns the corrected distance.
// for non-alnums being vertical misfits. Returns the corrected confidence.
double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
double cp_rating, double im_rating,
int feature_misses,
@ -1201,7 +1219,7 @@ double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
int blob_length, int matcher_multiplier,
const uinT8* cn_factors) {
// Compute class feature corrections.
double cn_corrected = im_.ApplyCNCorrection(im_rating, blob_length,
double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
cn_factors[unichar_id],
matcher_multiplier);
double miss_penalty = tessedit_class_miss_scale * feature_misses;
@ -1222,16 +1240,16 @@ double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
vertical_penalty = classify_misfit_junk_penalty;
}
}
double result =cn_corrected + miss_penalty + vertical_penalty;
if (result > WORST_POSSIBLE_RATING)
double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
if (result < WORST_POSSIBLE_RATING)
result = WORST_POSSIBLE_RATING;
if (debug) {
tprintf("%s: %2.1f(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
unicharset.id_to_unichar(unichar_id),
result * 100.0,
cp_rating * 100.0,
im_rating * 100.0,
(cn_corrected - im_rating) * 100.0,
(1.0 - im_rating) * 100.0,
(cn_corrected - (1.0 - im_rating)) * 100.0,
cn_factors[unichar_id],
miss_penalty * 100.0,
vertical_penalty * 100.0);
@ -1266,11 +1284,11 @@ UNICHAR_ID *Classify::BaselineClassifier(
ClearCharNormArray(CharNormArray);
Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
PruneClasses(Templates->Templates, int_features.size(), &int_features[0],
PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
CharNormArray, BaselineCutoffs, &Results->CPResults);
if (matcher_debug_level >= 2 || classify_debug_level > 1)
cprintf ("BL Matches = ");
tprintf("BL Matches = ");
MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
CharNormArray,
@ -1278,13 +1296,12 @@ UNICHAR_ID *Classify::BaselineClassifier(
Blob->bounding_box(), Results->CPResults, Results);
delete [] CharNormArray;
CLASS_ID ClassId = Results->best_match.unichar_id;
if (ClassId == NO_CLASS)
return (NULL);
/* this is a bug - maybe should return "" */
CLASS_ID ClassId = Results->best_unichar_id;
if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
return NULL;
return Templates->Class[ClassId]->
Config[Results->best_match.config].Perm->Ambigs;
Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
} /* BaselineClassifier */
@ -1318,14 +1335,7 @@ int Classify::CharNormClassifier(TBLOB *blob,
-1, &unichar_results);
// Convert results to the format used internally by AdaptiveClassifier.
for (int r = 0; r < unichar_results.size(); ++r) {
int unichar_id = unichar_results[r].unichar_id;
// Fonts are listed in order of preference.
int font1 = unichar_results[r].fonts.size() >= 1
? unichar_results[r].fonts[0] : kBlankFontinfoId;
int font2 = unichar_results[r].fonts.size() >= 2
? unichar_results[r].fonts[1] : kBlankFontinfoId;
float rating = 1.0f - unichar_results[r].rating;
AddNewResult(adapt_results, unichar_id, -1, rating, false, 0, font1, font2);
AddNewResult(unichar_results[r], adapt_results);
}
return sample.num_features();
} /* CharNormClassifier */
@ -1356,7 +1366,7 @@ int Classify::CharNormTrainingSample(bool pruner_only,
ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
pruner_norm_array);
PruneClasses(PreTrainedTemplates, num_features, sample.features(),
PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
pruner_norm_array,
shape_table_ != NULL ? &shapetable_cutoffs_[0] : CharNormCutoffs,
&adapt_results->CPResults);
@ -1380,14 +1390,7 @@ int Classify::CharNormTrainingSample(bool pruner_only,
blob_box, adapt_results->CPResults, adapt_results);
// Convert master matcher results to output format.
for (int i = 0; i < adapt_results->match.size(); i++) {
ScoredClass next = adapt_results->match[i];
UnicharRating rating(next.unichar_id, 1.0f - next.rating);
if (next.fontinfo_id >= 0) {
rating.fonts.push_back(next.fontinfo_id);
if (next.fontinfo_id2 >= 0)
rating.fonts.push_back(next.fontinfo_id2);
}
results->push_back(rating);
results->push_back(adapt_results->match[i]);
}
results->sort(&UnicharRating::SortDescendingRating);
}
@ -1412,60 +1415,14 @@ int Classify::CharNormTrainingSample(bool pruner_only,
* @note Exceptions: none
* @note History: Tue Mar 12 18:36:52 1991, DSJ, Created.
*/
void Classify::ClassifyAsNoise(ADAPT_RESULTS *Results) {
register FLOAT32 Rating;
void Classify::ClassifyAsNoise(ADAPT_RESULTS *results) {
float rating = results->BlobLength / matcher_avg_noise_size;
rating *= rating;
rating /= 1.0 + rating;
Rating = Results->BlobLength / matcher_avg_noise_size;
Rating *= Rating;
Rating /= 1.0 + Rating;
AddNewResult(Results, NO_CLASS, -1, Rating, false, -1,
kBlankFontinfoId, kBlankFontinfoId);
AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
} /* ClassifyAsNoise */
} // namespace tesseract
/*---------------------------------------------------------------------------*/
// Return a pointer to the scored unichar in results, or NULL if not present.
ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) {
for (int i = 0; i < results->match.size(); i++) {
if (results->match[i].unichar_id == id)
return &results->match[i];
}
return NULL;
}
// Retrieve the current rating for a unichar id if we have rated it, defaulting
// to WORST_POSSIBLE_RATING.
ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) {
ScoredClass poor_result =
{id, -1, WORST_POSSIBLE_RATING, false, -1,
kBlankFontinfoId, kBlankFontinfoId};
ScoredClass *entry = FindScoredUnichar(results, id);
return (entry == NULL) ? poor_result : *entry;
}
// Compare character classes by rating as for qsort(3).
// For repeatability, use character class id as a tie-breaker.
int CompareByRating(const void *arg1, // ScoredClass *class1
const void *arg2) { // ScoredClass *class2
const ScoredClass *class1 = (const ScoredClass *)arg1;
const ScoredClass *class2 = (const ScoredClass *)arg2;
if (class1->rating < class2->rating)
return -1;
else if (class1->rating > class2->rating)
return 1;
if (class1->unichar_id < class2->unichar_id)
return -1;
else if (class1->unichar_id > class2->unichar_id)
return 1;
return 0;
}
/*---------------------------------------------------------------------------*/
namespace tesseract {
/// The function converts the given match ratings to the list of blob
/// choices with ratings and certainties (used by the context checkers).
/// If character fragments are present in the results, this function also makes
@ -1496,11 +1453,9 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
float best_certainty = -MAX_FLOAT32;
for (int i = 0; i < Results->match.size(); i++) {
ScoredClass next = Results->match[i];
int fontinfo_id = next.fontinfo_id;
int fontinfo_id2 = next.fontinfo_id2;
bool adapted = next.adapted;
bool current_is_frag = (unicharset.get_fragment(next.unichar_id) != NULL);
const UnicharRating& result = Results->match[i];
bool adapted = result.adapted;
bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != NULL);
if (temp_it.length()+1 == max_matches &&
!contains_nonfrag && current_is_frag) {
continue; // look for a non-fragmented character to fill the
@ -1514,7 +1469,7 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
Certainty = -20;
Rating = 100; // should be -certainty * real_blob_length
} else {
Rating = Certainty = next.rating;
Rating = Certainty = (1.0f - result.rating);
Rating *= rating_scale * Results->BlobLength;
Certainty *= -(getDict().certainty_scale);
}
@ -1531,14 +1486,16 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
}
float min_xheight, max_xheight, yshift;
denorm.XHeightRange(next.unichar_id, unicharset, box,
denorm.XHeightRange(result.unichar_id, unicharset, box,
&min_xheight, &max_xheight, &yshift);
temp_it.add_to_end(new BLOB_CHOICE(next.unichar_id, Rating, Certainty,
fontinfo_id, fontinfo_id2,
unicharset.get_script(next.unichar_id),
BLOB_CHOICE* choice =
new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
unicharset.get_script(result.unichar_id),
min_xheight, max_xheight, yshift,
adapted ? BCC_ADAPTED_CLASSIFIER
: BCC_STATIC_CLASSIFIER));
: BCC_STATIC_CLASSIFIER);
choice->set_fonts(result.fonts);
temp_it.add_to_end(choice);
contains_nonfrag |= !current_is_frag; // update contains_nonfrag
choices_length++;
if (choices_length >= max_matches) break;
@ -1562,17 +1519,13 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
void Classify::DebugAdaptiveClassifier(TBLOB *blob,
ADAPT_RESULTS *Results) {
if (static_classifier_ == NULL) return;
for (int i = 0; i < Results->match.size(); i++) {
if (i == 0 || Results->match[i].rating < Results->best_match.rating)
Results->best_match = Results->match[i];
}
INT_FX_RESULT_STRUCT fx_info;
GenericVector<INT_FEATURE_STRUCT> bl_features;
TrainingSample* sample =
BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
if (sample == NULL) return;
static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
Results->best_match.unichar_id);
Results->best_unichar_id);
} /* DebugAdaptiveClassifier */
#endif
@ -1615,7 +1568,8 @@ void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) {
} else {
Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
AdaptedTemplates, Results);
if ((!Results->match.empty() && MarginalMatch(Results->best_match.rating) &&
if ((!Results->match.empty() &&
MarginalMatch(Results->best_rating, matcher_great_threshold) &&
!tess_bn_matching) ||
Results->match.empty()) {
CharNormClassifier(Blob, *sample, Results);
@ -1674,7 +1628,7 @@ UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob,
CharNormClassifier(Blob, *sample, Results);
delete sample;
RemoveBadMatches(Results);
Results->match.sort(CompareByRating);
Results->match.sort(&UnicharRating::SortDescendingRating);
/* copy the class id's into an string of ambiguities - don't copy if
the correct class is the only class id matched */
@ -2094,14 +2048,11 @@ namespace tesseract {
* @note Exceptions: none
* @note History: Mon Mar 18 09:24:53 1991, DSJ, Created.
*/
void Classify::PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results) {
for (int i = 0; i < Results->match.size(); ++i) {
tprintf("%s(%d), shape %d, %.2f ",
unicharset.debug_str(Results->match[i].unichar_id).string(),
Results->match[i].unichar_id, Results->match[i].shape_id,
Results->match[i].rating * 100.0);
void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS& results) {
for (int i = 0; i < results.match.size(); ++i) {
tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string());
results.match[i].Print();
}
tprintf("\n");
} /* PrintAdaptiveMatchResults */
/*---------------------------------------------------------------------------*/
@ -2124,42 +2075,51 @@ void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {
int Next, NextGood;
FLOAT32 BadMatchThreshold;
static const char* romans = "i v x I V X";
BadMatchThreshold = Results->best_match.rating + matcher_bad_match_pad;
BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
if (classify_bln_numeric_mode) {
UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
unicharset.unichar_to_id("1") : -1;
UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
unicharset.unichar_to_id("0") : -1;
ScoredClass scored_one = ScoredUnichar(Results, unichar_id_one);
ScoredClass scored_zero = ScoredUnichar(Results, unichar_id_zero);
float scored_one = ScoredUnichar(unichar_id_one, *Results);
float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
if (Results->match[Next].rating <= BadMatchThreshold) {
ScoredClass match = Results->match[Next];
const UnicharRating& match = Results->match[Next];
if (match.rating >= BadMatchThreshold) {
if (!unicharset.get_isalpha(match.unichar_id) ||
strstr(romans,
unicharset.id_to_unichar(match.unichar_id)) != NULL) {
Results->match[NextGood++] = Results->match[Next];
} else if (unicharset.eq(match.unichar_id, "l") &&
scored_one.rating >= BadMatchThreshold) {
Results->match[NextGood] = scored_one;
Results->match[NextGood].rating = match.rating;
NextGood++;
scored_one < BadMatchThreshold) {
Results->match[Next].unichar_id = unichar_id_one;
} else if (unicharset.eq(match.unichar_id, "O") &&
scored_zero.rating >= BadMatchThreshold) {
Results->match[NextGood] = scored_zero;
Results->match[NextGood].rating = match.rating;
NextGood++;
scored_zero < BadMatchThreshold) {
Results->match[Next].unichar_id = unichar_id_zero;
} else {
Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
}
if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
if (NextGood == Next) {
++NextGood;
} else {
Results->match[NextGood++] = Results->match[Next];
}
}
}
}
} else {
for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
if (Results->match[Next].rating <= BadMatchThreshold)
if (Results->match[Next].rating >= BadMatchThreshold) {
if (NextGood == Next) {
++NextGood;
} else {
Results->match[NextGood++] = Results->match[Next];
}
}
}
}
Results->match.truncate(NextGood);
} /* RemoveBadMatches */
@ -2184,18 +2144,24 @@ void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {
punc_count = 0;
digit_count = 0;
for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
ScoredClass match = Results->match[Next];
const UnicharRating& match = Results->match[Next];
bool keep = true;
if (strstr(punc_chars,
unicharset.id_to_unichar(match.unichar_id)) != NULL) {
if (punc_count < 2)
Results->match[NextGood++] = match;
if (punc_count >= 2)
keep = false;
punc_count++;
} else {
if (strstr(digit_chars,
unicharset.id_to_unichar(match.unichar_id)) != NULL) {
if (digit_count < 1)
Results->match[NextGood++] = match;
if (digit_count >= 1)
keep = false;
digit_count++;
}
}
if (keep) {
if (NextGood == Next) {
++NextGood;
} else {
Results->match[NextGood++] = match;
}
@ -2252,7 +2218,7 @@ void Classify::ShowBestMatchFor(int shape_id,
tprintf("Illegal blob (char norm features)!\n");
return;
}
INT_RESULT_STRUCT cn_result;
UnicharRating cn_result;
classify_norm_method.set_value(character);
im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
AllProtosOn, AllConfigsOn,
@ -2260,7 +2226,7 @@ void Classify::ShowBestMatchFor(int shape_id,
classify_adapt_feature_threshold, NO_DEBUG,
matcher_debug_separate_windows);
tprintf("\n");
config_mask = 1 << cn_result.Config;
config_mask = 1 << cn_result.config;
tprintf("Static Shape ID: %d\n", shape_id);
ShowMatchDisplay();

View File

@ -20,63 +20,32 @@
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "blobclass.h"
#include "extract.h"
#include <stdio.h>
#include "classify.h"
#include "efio.h"
#include "featdefs.h"
#include "callcpp.h"
#include <math.h>
#include <stdio.h>
#include <signal.h>
#define MAXFILENAME 80
#define MAXMATCHES 10
#include "mf.h"
#include "normfeat.h"
static const char kUnknownFontName[] = "UnknownFont";
STRING_VAR(classify_font_name, kUnknownFontName,
"Default font name to be used in training");
/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
/* name of current image file being processed */
extern char imagefile[];
namespace tesseract {
/**----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
// As all TBLOBs, Blob is in baseline normalized coords.
// See SetupBLCNDenorms in intfx.cpp for other args.
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
/*
** Parameters:
** Blob blob whose micro-features are to be learned
** Row row of text that blob came from
** BlobText text that corresponds to blob
** TextLength number of characters in blob
** Globals:
** imagefile base filename of the page being learned
** classify_font_name
** name of font currently being trained on
** Operation:
** Extract micro-features from the specified blob and append
** them to the appropriate file.
** Return: none
** Exceptions: none
** History: 7/28/89, DSJ, Created.
*/
#define TRAIN_SUFFIX ".tr"
static FILE *FeatureFile = NULL;
STRING Filename(filename);
// If no fontname was set, try to extract it from the filename
STRING CurrFontName = classify_font_name;
if (CurrFontName == kUnknownFontName) {
// Finds the name of the training font and returns it in fontname, by cutting
// it out based on the expectation that the filename is of the form:
// /path/to/dir/[lang].[fontname].exp[num]
// The [lang], [fontname] and [num] fields should not have '.' characters.
// If the global parameter classify_font_name is set, its value is used instead.
void ExtractFontName(const STRING& filename, STRING* fontname) {
*fontname = classify_font_name;
if (*fontname == kUnknownFontName) {
// filename is expected to be of the form [lang].[fontname].exp[num]
// The [lang], [fontname] and [num] fields should not have '.' characters.
const char *basename = strrchr(filename.string(), '/');
@ -84,47 +53,56 @@ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
const char *lastdot = strrchr(filename.string(), '.');
if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
++firstdot;
CurrFontName = firstdot;
CurrFontName[lastdot - firstdot] = '\0';
*fontname = firstdot;
fontname->truncate_at(lastdot - firstdot);
}
}
}
// if a feature file is not yet open, open it
// the name of the file is the name of the image plus TRAIN_SUFFIX
if (FeatureFile == NULL) {
Filename += TRAIN_SUFFIX;
FeatureFile = Efopen(Filename.string(), "wb");
cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
}
LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
BlobText, CurrFontName.string());
} // LearnBlob
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
/*---------------------------------------------------------------------------*/
// Extracts features from the given blob and saves them in the tr_file_data_
// member variable.
// fontname: Name of font that this blob was printed in.
// cn_denorm: Character normalization transformation to apply to the blob.
// fx_info: Character normalization parameters computed with cn_denorm.
// blob_text: Ground truth text for the blob.
void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* BlobText, const char* FontName) {
CHAR_DESC CharDesc;
const char* blob_text) {
CHAR_DESC CharDesc = NewCharDescription(feature_defs_);
CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
ASSERT_HOST(FeatureFile != NULL);
CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
Blob);
if (CharDesc == NULL) {
cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
return;
}
if (ValidCharDescription(FeatureDefs, CharDesc)) {
// label the features with a class name and font name
fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
if (ValidCharDescription(feature_defs_, CharDesc)) {
// Label the features with a class name and font name.
tr_file_data_ += "\n";
tr_file_data_ += fontname;
tr_file_data_ += " ";
tr_file_data_ += blob_text;
tr_file_data_ += "\n";
// write micro-features to file and clean up
WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
} else {
tprintf("Blob learned was invalid!\n");
}
FreeCharDescription(CharDesc);
} // LearnBlob
// Writes stored training data to a .tr file based on the given filename.
// Returns false on error.
bool Classify::WriteTRFile(const STRING& filename) {
STRING tr_filename = filename + ".tr";
FILE* fp = Efopen(tr_filename.string(), "wb");
int len = tr_file_data_.length();
bool result =
fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;
fclose(fp);
tr_file_data_.truncate_at(0);
return result;
}
} // namespace tesseract.

View File

@ -21,9 +21,7 @@
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "featdefs.h"
#include "oldlist.h"
#include "blobs.h"
#include "strngs.h"
/*---------------------------------------------------------------------------
Macros
@ -39,18 +37,14 @@
/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* BlobText);
namespace tesseract {
// Finds the name of the training font and returns it in fontname, by cutting
// it out based on the expectation that the filename is of the form:
// /path/to/dir/[lang].[fontname].exp[num]
// The [lang], [fontname] and [num] fields should not have '.' characters.
// If the global parameter classify_font_name is set, its value is used instead.
void ExtractFontName(const STRING& filename, STRING* fontname);
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* File, TBLOB* Blob,
const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* BlobText, const char* FontName);
} // namespace tesseract.
/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
/*parameter used to turn on/off output of recognized chars to the screen */
#endif

View File

@ -217,7 +217,7 @@ void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {
(rating_scale * blob_length);
}
BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
-1, -1, 0, 0, MAX_FLOAT32, 0,
-1, 0.0f, MAX_FLOAT32, 0,
BCC_SPECKLE_CLASSIFIER);
bc_it.add_to_end(blob_choice);
}

View File

@ -25,6 +25,7 @@
#include "dict.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "imagedata.h"
#include "intfx.h"
#include "intmatcher.h"
#include "normalis.h"
@ -97,9 +98,8 @@ class Classify : public CCStruct {
// results (output) Sorted Array of pruned classes.
// Array must be sized to take the maximum possible
// number of outputs : int_templates->NumClasses.
int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
int num_features,
const INT_FEATURE_STRUCT* features,
int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates, int num_features,
int keep_this, const INT_FEATURE_STRUCT* features,
const uinT8* normalization_factors,
const uinT16* expected_num_features,
GenericVector<CP_RESULT_STRUCT>* results);
@ -119,25 +119,25 @@ class Classify : public CCStruct {
const UNICHARSET& target_unicharset);
/* adaptmatch.cpp ***********************************************************/
// Learn the given word using its chopped_word, seam_array, denorm,
// Learns the given word using its chopped_word, seam_array, denorm,
// box_word, best_state, and correct_text to learn both correctly and
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
// is called and the data will be saved in an internal buffer.
// Otherwise AdaptToBlob is called for adaption within a document.
void LearnWord(const char* filename, WERD_RES *word);
void LearnWord(const char* fontname, WERD_RES* word);
// Builds a blob of length fragments, from the word, starting at start,
// and then learn it, as having the given correct_text.
// If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// and then learns it, as having the given correct_text.
// If fontname is not NULL, then LearnBlob is called and the data will be
// saved in an internal buffer for static training.
// Otherwise AdaptToBlob is called for adaption within a document.
// threshold is a magic number required by AdaptToChar and generated by
// GetAdaptThresholds.
// ComputeAdaptionThresholds.
// Although it can be partly inferred from the string, segmentation is
// provided to explicitly clarify the character segmentation.
void LearnPieces(const char* filename, int start, int length,
float threshold, CharSegmentationType segmentation,
const char* correct_text, WERD_RES *word);
void LearnPieces(const char* fontname, int start, int length, float threshold,
CharSegmentationType segmentation, const char* correct_text,
WERD_RES* word);
void InitAdaptiveClassifier(bool load_pre_trained_templates);
void InitAdaptedClass(TBLOB *Blob,
CLASS_ID ClassId,
@ -174,7 +174,7 @@ class Classify : public CCStruct {
int blob_length,
int matcher_multiplier,
const uinT8* cn_factors,
INT_RESULT_STRUCT& int_result,
UnicharRating* int_result,
ADAPT_RESULTS* final_results);
// Applies a set of corrections to the distance im_rating,
// including the cn_correction, miss penalty and additional penalty
@ -187,14 +187,7 @@ class Classify : public CCStruct {
void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
ADAPT_RESULTS *Results,
BLOB_CHOICE_LIST *Choices);
void AddNewResult(ADAPT_RESULTS *results,
CLASS_ID class_id,
int shape_id,
FLOAT32 rating,
bool adapted,
int config,
int fontinfo_id,
int fontinfo_id2);
void AddNewResult(const UnicharRating& new_result, ADAPT_RESULTS *results);
int GetAdaptiveFeatures(TBLOB *Blob,
INT_FEATURE_ARRAY IntFeatures,
FEATURE_SET *FloatFeatures);
@ -219,7 +212,7 @@ class Classify : public CCStruct {
CLASS_ID ClassId,
int ConfigId,
TBLOB *Blob);
void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
void PrintAdaptiveMatchResults(const ADAPT_RESULTS& results);
void RemoveExtraPuncs(ADAPT_RESULTS *Results);
void RemoveBadMatches(ADAPT_RESULTS *Results);
void SetAdaptiveThreshold(FLOAT32 Threshold);
@ -361,7 +354,22 @@ class Classify : public CCStruct {
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
/* picofeat.cpp ***********************************************************/
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
FEATURE_SET ExtractIntCNFeatures(const TBLOB& blob,
const INT_FX_RESULT_STRUCT& fx_info);
FEATURE_SET ExtractIntGeoFeatures(const TBLOB& blob,
const INT_FX_RESULT_STRUCT& fx_info);
/* blobclass.cpp ***********************************************************/
// Extracts features from the given blob and saves them in the tr_file_data_
// member variable.
// fontname: Name of font that this blob was printed in.
// cn_denorm: Character normalization transformation to apply to the blob.
// fx_info: Character normalization parameters computed with cn_denorm.
// blob_text: Ground truth text for the blob.
void LearnBlob(const STRING& fontname, TBLOB* Blob, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info, const char* blob_text);
// Writes stored training data to a .tr file based on the given filename.
// Returns false on error.
bool WriteTRFile(const STRING& filename);
// Member variables.
@ -498,6 +506,9 @@ class Classify : public CCStruct {
/* variables used to hold performance statistics */
int NumAdaptationsFailed;
// Training data gathered here for all the images in a document.
STRING tr_file_data_;
// Expected number of features in the class pruner, used to penalize
// unknowns that have too few features (like a c being classified as e) so
// it doesn't recognize everything as '@' or '#'.

File diff suppressed because it is too large Load Diff

View File

@ -1,32 +0,0 @@
#ifndef EXTERN_H
#define EXTERN_H
/* -*-C-*-
********************************************************************************
*
* File: extern.h (Formerly extern.h)
* Description: External definitions for C or C++
* Author: Mark Seaman, OCR Technology
* Created: Tue Mar 20 14:01:22 1990
* Modified: Tue Mar 20 14:02:09 1990 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1990, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
********************************************************************************
*/
#define EXTERN extern
#endif

View File

@ -1,74 +0,0 @@
/******************************************************************************
** Filename: extract.c
** Purpose: Generic high level feature extractor routines.
** Author: Dan Johnson
** History: Sun Jan 21 09:44:08 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
/*-----------------------------------------------------------------------------
Include Files and Type Defines
-----------------------------------------------------------------------------*/
#include "extract.h"
#include "flexfx.h"
#include "danerror.h"
typedef CHAR_FEATURES (*CF_FUNC) ();
/*-----------------------------------------------------------------------------
Private Function Prototypes
-----------------------------------------------------------------------------*/
void ExtractorStub();
/*-----------------------------------------------------------------------------
Public Code
-----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/**
* Extract features from Blob by calling the feature
* extractor which is currently being used. This routine
* simply provides a high level interface to feature
* extraction. The caller can extract any type of features
* from a blob without understanding any lower level details.
*
* @param FeatureDefs definitions of feature types/extractors
* @param denorm Normalize/denormalize to access original image
* @param Blob blob to extract features from
*
* @return The character features extracted from Blob.
* @note Exceptions: none
* @note History: Sun Jan 21 10:07:28 1990, DSJ, Created.
*/
CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
TBLOB *Blob) {
return ExtractFlexFeatures(FeatureDefs, Blob, bl_denorm, cn_denorm, fx_info);
} /* ExtractBlobFeatures */
/*-----------------------------------------------------------------------------
Private Code
-----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
void
ExtractorStub ()
/**
* This routine is used to stub out feature extractors
* that are no longer used. It simply calls DoError.
*
* @note Exceptions: none
* @note History: Wed Jan 2 14:16:49 1991, DSJ, Created.
*/
#define DUMMY_ERROR 1
{
DoError (DUMMY_ERROR, "Selected feature extractor has been stubbed out!");
} /* ExtractorStub */

View File

@ -1,40 +0,0 @@
/******************************************************************************
 ** Filename:    extract.h
 ** Purpose:     Interface to high level generic feature extraction.
 ** Author:      Dan Johnson
 ** History:     1/21/90, DSJ, Created.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
#ifndef EXTRACT_H
#define EXTRACT_H

#include "featdefs.h"
#include <stdio.h>

class DENORM;

/*-----------------------------------------------------------------------------
          Public Function Prototypes
-----------------------------------------------------------------------------*/
// Deprecated! Will be deleted soon!
// In the meantime, as all TBLOBs, Blob is in baseline normalized coords.
// See SetupBLCNDenorms in intfx.cpp for other args.
// Extracts features from Blob using the extractors enabled in FeatureDefs;
// returns NULL if any enabled extractor fails.
CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
                              const DENORM& bl_denorm, const DENORM& cn_denorm,
                              const INT_FX_RESULT_STRUCT& fx_info, TBLOB *Blob);

/*---------------------------------------------------------------------------
        Private Function Prototypes
----------------------------------------------------------------------------*/
// Placeholder that reports an error; used for extractors that were removed.
void ExtractorStub();
#endif

View File

@ -178,7 +178,7 @@ CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs) {
/*---------------------------------------------------------------------------*/
/**
* Write a textual representation of CharDesc to File.
* Appends a textual representation of CharDesc to str.
* The format used is to write out the number of feature
* sets which will be written followed by a representation of
* each feature set.
@ -187,18 +187,15 @@ CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs) {
* by a description of the feature set. Feature sets which are
* not present are not written.
*
* Globals:
* - none
*
* @param FeatureDefs definitions of feature types/extractors
* @param File open text file to write CharDesc to
* @param str string to append CharDesc to
* @param CharDesc character description to write to File
*
* @note Exceptions: none
* @note History: Wed May 23 17:21:18 1990, DSJ, Created.
*/
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
FILE *File, CHAR_DESC CharDesc) {
void WriteCharDescription(const FEATURE_DEFS_STRUCT& FeatureDefs,
CHAR_DESC CharDesc, STRING* str) {
int Type;
int NumSetsToWrite = 0;
@ -206,11 +203,14 @@ void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
if (CharDesc->FeatureSets[Type])
NumSetsToWrite++;
fprintf (File, " %d\n", NumSetsToWrite);
for (Type = 0; Type < CharDesc->NumFeatureSets; Type++)
str->add_str_int(" ", NumSetsToWrite);
*str += "\n";
for (Type = 0; Type < CharDesc->NumFeatureSets; Type++) {
if (CharDesc->FeatureSets[Type]) {
fprintf (File, "%s ", (FeatureDefs.FeatureDesc[Type])->ShortName);
WriteFeatureSet (File, CharDesc->FeatureSets[Type]);
*str += FeatureDefs.FeatureDesc[Type]->ShortName;
*str += " ";
WriteFeatureSet(CharDesc->FeatureSets[Type], str);
}
}
} /* WriteCharDescription */
@ -231,6 +231,8 @@ bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
anything_written = true;
}
}
} else {
return false;
}
}
return anything_written && well_formed;

View File

@ -48,7 +48,6 @@ typedef CHAR_DESC_STRUCT *CHAR_DESC;
struct FEATURE_DEFS_STRUCT {
inT32 NumFeatureTypes;
const FEATURE_DESC_STRUCT* FeatureDesc[NUM_FEATURE_TYPES];
const FEATURE_EXT_STRUCT* FeatureExtractors[NUM_FEATURE_TYPES];
int FeatureEnabled[NUM_FEATURE_TYPES];
};
typedef FEATURE_DEFS_STRUCT *FEATURE_DEFS;
@ -65,8 +64,8 @@ CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs);
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
CHAR_DESC CharDesc);
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
FILE *File, CHAR_DESC CharDesc);
void WriteCharDescription(const FEATURE_DEFS_STRUCT& FeatureDefs,
CHAR_DESC CharDesc, STRING* str);
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
FILE *File);

View File

@ -1,72 +0,0 @@
/******************************************************************************
** Filename: flexfx.c
** Purpose: Interface to flexible feature extractor.
** Author: Dan Johnson
** History: Wed May 23 13:45:10 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "flexfx.h"
#include "featdefs.h"
#include "emalloc.h"
#include <string.h>
#include <stdio.h>
/**----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
// Deprecated! Will be deleted soon!
// In the meantime, as all TBLOBs, Blob is in baseline normalized coords.
// See SetupBLCNDenorms in intfx.cpp for other args.
/**
 * Allocate a new character descriptor and fill it in by calling
 * every feature extractor which is enabled in FeatureDefs.
 *
 * @param FeatureDefs definitions of feature types and their extractors
 * @param Blob        blob to extract features from (baseline normalized)
 * @param bl_denorm   baseline ("bl") normalization parameters
 * @param cn_denorm   character ("cn") normalization parameters
 * @param fx_info     results from the integer feature extractor
 *
 * @return Structure containing the features extracted from Blob, or NULL
 *         if any enabled extractor returns NULL (the partially filled
 *         descriptor is freed before returning).
 * @note Exceptions: none
 * @note History: Wed May 23 13:46:22 1990, DSJ, Created.
 */
CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
                              TBLOB *Blob, const DENORM& bl_denorm,
                              const DENORM& cn_denorm,
                              const INT_FX_RESULT_STRUCT& fx_info) {
  int Type;
  CHAR_DESC CharDesc;

  CharDesc = NewCharDescription(FeatureDefs);

  // Run each registered extractor in turn; a NULL result from any of
  // them aborts the whole extraction.
  for (Type = 0; Type < CharDesc->NumFeatureSets; Type++)
    if (FeatureDefs.FeatureExtractors[Type] != NULL &&
        FeatureDefs.FeatureExtractors[Type]->Extractor != NULL) {
      CharDesc->FeatureSets[Type] =
          (FeatureDefs.FeatureExtractors[Type])->Extractor(Blob,
                                                           bl_denorm,
                                                           cn_denorm,
                                                           fx_info);
      if (CharDesc->FeatureSets[Type] == NULL) {
        tprintf("Feature extractor for type %d = %s returned NULL!\n",
                Type, FeatureDefs.FeatureDesc[Type]->ShortName);
        FreeCharDescription(CharDesc);
        return NULL;
      }
    }

  return (CharDesc);
}                                /* ExtractFlexFeatures */

View File

@ -1,36 +0,0 @@
/******************************************************************************
 ** Filename:    flexfx.h
 ** Purpose:     Interface to flexible feature extractor.
 ** Author:      Dan Johnson
 ** History:     Wed May 23 13:36:58 1990, DSJ, Created.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
#ifndef FLEXFX_H
#define FLEXFX_H

/**----------------------------------------------------------------------------
          Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "featdefs.h"
#include <stdio.h>

/**----------------------------------------------------------------------------
          Public Function Prototypes
----------------------------------------------------------------------------**/
// As with all TBLOBs this one is also baseline normalized.
// Builds a character descriptor by running every enabled feature extractor
// on Blob; returns NULL if any enabled extractor fails.
CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs,
                              TBLOB *Blob, const DENORM& bl_denorm,
                              const DENORM& cn_denorm,
                              const INT_FX_RESULT_STRUCT& fx_info);
#endif

View File

@ -111,11 +111,11 @@ void Classify::ComputeIntFeatures(FEATURE_SET Features,
for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
Feature = Features->Features[Fid];
IntFeatures[Fid].X = BucketFor (Feature->Params[PicoFeatX],
X_SHIFT, INT_FEAT_RANGE);
IntFeatures[Fid].Y = BucketFor (Feature->Params[PicoFeatY],
YShift, INT_FEAT_RANGE);
IntFeatures[Fid].Theta = CircBucketFor (Feature->Params[PicoFeatDir],
IntFeatures[Fid].X =
Bucket8For(Feature->Params[PicoFeatX], X_SHIFT, INT_FEAT_RANGE);
IntFeatures[Fid].Y =
Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);
IntFeatures[Fid].Theta = CircBucketFor(Feature->Params[PicoFeatDir],
ANGLE_SHIFT, INT_FEAT_RANGE);
IntFeatures[Fid].CP_misses = 0;
}

View File

@ -1,45 +0,0 @@
/******************************************************************************
** Filename: fxdefs.c
** Purpose: Utility functions to be used by feature extractors.
** Author: Dan Johnson
** History: Sun Jan 21 15:29:02 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#include "fxdefs.h"
#include "featdefs.h"
#include "mf.h"
#include "outfeat.h"
#include "picofeat.h"
#include "normfeat.h"
/*-----------------------------------------------------------------------------
        Global Data Definitions and Declarations
-----------------------------------------------------------------------------*/
// Definitions of extractors separated from feature definitions.
// Each table entry wraps one extractor callback.
const FEATURE_EXT_STRUCT MicroFeatureExt = { ExtractMicros };
const FEATURE_EXT_STRUCT CharNormExt = { ExtractCharNormFeatures };
const FEATURE_EXT_STRUCT IntFeatExt = { ExtractIntCNFeatures };
const FEATURE_EXT_STRUCT GeoFeatExt = { ExtractIntGeoFeatures };

// MUST be kept in-sync with DescDefs in featdefs.cpp.
const FEATURE_EXT_STRUCT* ExtractorDefs[NUM_FEATURE_TYPES] = {
  &MicroFeatureExt,
  &CharNormExt,
  &IntFeatExt,
  &GeoFeatExt
};

// Installs the extractor table above into FeatureDefs, giving each
// feature type its matching extractor.
void SetupExtractors(FEATURE_DEFS_STRUCT *FeatureDefs) {
  for (int i = 0; i < NUM_FEATURE_TYPES; ++i)
    FeatureDefs->FeatureExtractors[i] = ExtractorDefs[i];
}

View File

@ -1,25 +0,0 @@
/******************************************************************************
 ** Filename:    fxdefs.h
 ** Purpose:     Generic interface definitions for feature extractors
 ** Author:      Dan Johnson
 ** History:     Fri Jan 19 09:04:14 1990, DSJ, Created.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
#ifndef FXDEFS_H
#define FXDEFS_H

#include "featdefs.h"

// Fills in the extractor callbacks of FeatureDefs from the global
// extractor table (see fxdefs.cpp).
void SetupExtractors(FEATURE_DEFS_STRUCT *FeatureDefs);

#endif

View File

@ -75,9 +75,9 @@ namespace tesseract {
// Generates a TrainingSample from a TBLOB. Extracts features and sets
// the bounding box, so classifiers that operate on the image can work.
// TODO(rays) BlobToTrainingSample must remain a global function until
// the FlexFx and FeatureDescription code can be removed and LearnBlob
// made a member of Classify.
// TODO(rays) Make BlobToTrainingSample a member of Classify now that
// the FlexFx and FeatureDescription code have been removed and LearnBlob
// is now a member of Classify.
TrainingSample* BlobToTrainingSample(
const TBLOB& blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT* fx_info,
GenericVector<INT_FEATURE_STRUCT>* bl_features) {

View File

@ -26,6 +26,8 @@
Include Files and Type Defines
----------------------------------------------------------------------------*/
#include "intmatcher.h"
#include "fontinfo.h"
#include "intproto.h"
#include "callcpp.h"
#include "scrollview.h"
@ -36,6 +38,9 @@
#include "shapetable.h"
#include <math.h>
using tesseract::ScoredFont;
using tesseract::UnicharRating;
/*----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------*/
@ -45,58 +50,51 @@
const float IntegerMatcher::kSEExponentialMultiplier = 0.0;
const float IntegerMatcher::kSimilarityCenter = 0.0075;
static const uinT8 offset_table[256] = {
255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
};
#define offset_table_entries \
255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, \
0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, \
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, \
0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, \
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, \
0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, \
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, \
0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, \
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, \
0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, \
0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
static const uinT8 next_table[256] = {
0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18,
0x1c, 0x1c, 0x1e,
0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28,
0x2c, 0x2c, 0x2e,
0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a,
0x38, 0x3c, 0x3c, 0x3e,
0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48,
0x4c, 0x4c, 0x4e,
0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a,
0x58, 0x5c, 0x5c, 0x5e,
0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a,
0x68, 0x6c, 0x6c, 0x6e,
0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a,
0x78, 0x7c, 0x7c, 0x7e,
0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88,
0x8c, 0x8c, 0x8e,
0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a,
0x98, 0x9c, 0x9c, 0x9e,
0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa,
0xa8, 0xac, 0xac, 0xae,
0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba,
0xb8, 0xbc, 0xbc, 0xbe,
0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca,
0xc8, 0xcc, 0xcc, 0xce,
0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda,
0xd8, 0xdc, 0xdc, 0xde,
0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea,
0xe8, 0xec, 0xec, 0xee,
0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa,
0xf8, 0xfc, 0xfc, 0xfe
};
#define INTMATCHER_OFFSET_TABLE_SIZE 256
#define next_table_entries \
0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e, \
0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, \
0x18, 0x1c, 0x1c, 0x1e, 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, \
0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e, 0x20, 0x30, 0x30, 0x32, \
0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e, \
0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, \
0x48, 0x4c, 0x4c, 0x4e, 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, \
0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e, 0x40, 0x60, 0x60, 0x62, \
0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e, \
0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, \
0x78, 0x7c, 0x7c, 0x7e, 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, \
0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e, 0x80, 0x90, 0x90, 0x92, \
0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e, \
0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, \
0xa8, 0xac, 0xac, 0xae, 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, \
0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe, 0x80, 0xc0, 0xc0, 0xc2, \
0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce, \
0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, \
0xd8, 0xdc, 0xdc, 0xde, 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, \
0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee, 0xe0, 0xf0, 0xf0, 0xf2, \
0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe
// See http://b/19318793 (#6) for a complete discussion. Merging arrays
// offset_table and next_table helps improve performance of PIE code.
static const uinT8 data_table[512] = {offset_table_entries, next_table_entries};
static const uinT8* const offset_table = &data_table[0];
static const uinT8* const next_table =
&data_table[INTMATCHER_OFFSET_TABLE_SIZE];
namespace tesseract {
@ -263,8 +261,8 @@ class ClassPruner {
// Prunes the classes using <the maximum count> * pruning_factor/256 as a
// threshold for keeping classes. If max_of_non_fragments, then ignore
// fragments in computing the maximum count.
void PruneAndSort(int pruning_factor, bool max_of_non_fragments,
const UNICHARSET& unicharset) {
void PruneAndSort(int pruning_factor, int keep_this,
bool max_of_non_fragments, const UNICHARSET& unicharset) {
int max_count = 0;
for (int c = 0; c < max_classes_; ++c) {
if (norm_count_[c] > max_count &&
@ -284,7 +282,8 @@ class ClassPruner {
pruning_threshold_ = 1;
num_classes_ = 0;
for (int class_id = 0; class_id < max_classes_; class_id++) {
if (norm_count_[class_id] >= pruning_threshold_) {
if (norm_count_[class_id] >= pruning_threshold_ ||
class_id == keep_this) {
++num_classes_;
sort_index_[num_classes_] = class_id;
sort_key_[num_classes_] = norm_count_[class_id];
@ -406,7 +405,7 @@ class ClassPruner {
// results Sorted Array of pruned classes. Must be an array
// of size at least int_templates->NumClasses.
int Classify::PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
int num_features,
int num_features, int keep_this,
const INT_FEATURE_STRUCT* features,
const uinT8* normalization_factors,
const uinT16* expected_num_features,
@ -441,7 +440,7 @@ int Classify::PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
pruner.NoNormalization();
}
// Do the actual pruning and sort the short-list.
pruner.PruneAndSort(classify_class_pruner_threshold,
pruner.PruneAndSort(classify_class_pruner_threshold, keep_this,
shape_table_ == NULL, unicharset);
if (classify_debug_level > 2) {
@ -464,7 +463,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate,
BIT_VECTOR ConfigMask,
inT16 NumFeatures,
const INT_FEATURE_STRUCT* Features,
INT_RESULT Result,
UnicharRating* Result,
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows) {
@ -477,7 +476,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate,
** NormalizationFactor Fudge factor from blob
** normalization process
** Result Class rating & configuration:
** (0.0 -> 1.0), 0=good, 1=bad
** (0.0 -> 1.0), 0=bad, 1=good
** Debug Debugger flag: 1=debugger on
** Globals:
** local_matcher_multiplier_ Normalization factor multiplier
@ -498,7 +497,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate,
cprintf ("Integer Matcher -------------------------------------------\n");
tables->Clear(ClassTemplate);
Result->FeatureMisses = 0;
Result->feature_misses = 0;
for (Feature = 0; Feature < NumFeatures; Feature++) {
int csum = UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask,
@ -506,7 +505,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate,
tables, Debug);
// Count features that were missed over all configs.
if (csum == 0)
Result->FeatureMisses++;
++Result->feature_misses;
}
#ifndef GRAPHICS_DISABLED
@ -534,7 +533,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate,
#ifndef GRAPHICS_DISABLED
if (PrintMatchSummaryOn(Debug))
DebugBestMatch(BestMatch, Result);
Result->Print();
if (MatchDebuggingOn(Debug))
cprintf("Match Complete --------------------------------------------\n");
@ -1222,9 +1221,9 @@ void ScratchEvidence::NormalizeSums(
/*---------------------------------------------------------------------------*/
int IntegerMatcher::FindBestMatch(
INT_CLASS ClassTemplate,
INT_CLASS class_template,
const ScratchEvidence &tables,
INT_RESULT Result) {
UnicharRating* result) {
/*
** Parameters:
** Globals:
@ -1236,35 +1235,27 @@ int IntegerMatcher::FindBestMatch(
** Exceptions: none
** History: Wed Feb 27 14:12:28 MST 1991, RWM, Created.
*/
int BestMatch = 0;
int Best2Match = 0;
Result->Config = 0;
Result->Config2 = 0;
int best_match = 0;
result->config = 0;
result->fonts.truncate(0);
result->fonts.reserve(class_template->NumConfigs);
/* Find best match */
for (int ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) {
int rating = tables.sum_feature_evidence_[ConfigNum];
for (int c = 0; c < class_template->NumConfigs; ++c) {
int rating = tables.sum_feature_evidence_[c];
if (*classify_debug_level_ > 2)
cprintf("Config %d, rating=%d\n", ConfigNum, rating);
if (rating > BestMatch) {
if (BestMatch > 0) {
Result->Config2 = Result->Config;
Best2Match = BestMatch;
} else {
Result->Config2 = ConfigNum;
}
Result->Config = ConfigNum;
BestMatch = rating;
} else if (rating > Best2Match) {
Result->Config2 = ConfigNum;
Best2Match = rating;
tprintf("Config %d, rating=%d\n", c, rating);
if (rating > best_match) {
result->config = c;
best_match = rating;
}
result->fonts.push_back(ScoredFont(c, rating));
}
/* Compute Certainty Rating */
Result->Rating = (65536.0 - BestMatch) / 65536.0;
// Compute confidence on a Probability scale.
result->rating = best_match / 65536.0f;
return BestMatch;
return best_match;
}
// Applies the CN normalization factor to the given rating and returns
@ -1277,17 +1268,6 @@ float IntegerMatcher::ApplyCNCorrection(float rating, int blob_length,
(blob_length + matcher_multiplier);
}
/*---------------------------------------------------------------------------*/
#ifndef GRAPHICS_DISABLED
// Print debug information about the best match for the current class.
void IntegerMatcher::DebugBestMatch(
int BestMatch, INT_RESULT Result) {
tprintf("Rating = %5.1f%% Best Config = %3d, Distance = %5.1f\n",
100.0 * Result->Rating, Result->Config,
100.0 * (65536.0 - BestMatch) / 65536.0);
}
#endif
/*---------------------------------------------------------------------------*/
void
HeapSort (int n, register int ra[], register int rb[]) {

View File

@ -38,25 +38,14 @@ extern INT_VAR_H(classify_integer_matcher_multiplier, 10,
#include "intproto.h"
#include "cutoffs.h"
struct INT_RESULT_STRUCT {
INT_RESULT_STRUCT() : Rating(0.0f), Config(0), Config2(0), FeatureMisses(0) {}
FLOAT32 Rating;
// TODO(rays) It might be desirable for these to be able to represent a
// null config.
uinT8 Config;
uinT8 Config2;
uinT16 FeatureMisses;
};
typedef INT_RESULT_STRUCT *INT_RESULT;
namespace tesseract {
class UnicharRating;
}
struct CP_RESULT_STRUCT {
CP_RESULT_STRUCT() : Rating(0.0f), Class(0) {}
FLOAT32 Rating;
INT_RESULT_STRUCT IMResult;
CLASS_ID Class;
};
@ -113,7 +102,7 @@ class IntegerMatcher {
BIT_VECTOR ConfigMask,
inT16 NumFeatures,
const INT_FEATURE_STRUCT* Features,
INT_RESULT Result,
tesseract::UnicharRating* Result,
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows);
@ -155,7 +144,7 @@ class IntegerMatcher {
int FindBestMatch(INT_CLASS ClassTemplate,
const ScratchEvidence &tables,
INT_RESULT Result);
tesseract::UnicharRating* Result);
#ifndef GRAPHICS_DISABLED
void DebugFeatureProtoError(
@ -182,8 +171,6 @@ class IntegerMatcher {
int AdaptFeatureThreshold,
int Debug,
bool SeparateDebugWindows);
void DebugBestMatch(int BestMatch, INT_RESULT Result);
#endif

View File

@ -439,52 +439,25 @@ void AddProtoToProtoPruner(PROTO Proto, int ProtoId,
/*---------------------------------------------------------------------------*/
int BucketFor(FLOAT32 Param, FLOAT32 Offset, int NumBuckets) {
/*
** Parameters:
** Param parameter value to map into a bucket number
** Offset amount to shift param before mapping it
** NumBuckets number of buckets to map param into
** Globals: none
** Operation: This routine maps a parameter value into a bucket between
** 0 and NumBuckets-1. Offset is added to the parameter
** before mapping it. Values which map to buckets outside
** the range are truncated to fit within the range. Mapping
** is done by truncating rather than rounding.
** Return: Bucket number corresponding to Param + Offset.
** Exceptions: none
** History: Thu Feb 14 13:24:33 1991, DSJ, Created.
*/
return ClipToRange(static_cast<int>(MapParam(Param, Offset, NumBuckets)),
0, NumBuckets - 1);
} /* BucketFor */
// Returns a quantized bucket for the given param shifted by offset,
// notionally (param + offset) * num_buckets, but clipped and casted to the
// appropriate type.
uinT8 Bucket8For(FLOAT32 param, FLOAT32 offset, int num_buckets) {
int bucket = IntCastRounded(MapParam(param, offset, num_buckets));
return static_cast<uinT8>(ClipToRange(bucket, 0, num_buckets - 1));
}
uinT16 Bucket16For(FLOAT32 param, FLOAT32 offset, int num_buckets) {
int bucket = IntCastRounded(MapParam(param, offset, num_buckets));
return static_cast<uinT16>(ClipToRange(bucket, 0, num_buckets - 1));
}
/*---------------------------------------------------------------------------*/
int CircBucketFor(FLOAT32 Param, FLOAT32 Offset, int NumBuckets) {
/*
** Parameters:
** Param parameter value to map into a circular bucket
** Offset amount to shift param before mapping it
** NumBuckets number of buckets to map param into
** Globals: none
** Operation: This routine maps a parameter value into a bucket between
** 0 and NumBuckets-1. Offset is added to the parameter
** before mapping it. Values which map to buckets outside
** the range are wrapped to a new value in a circular fashion.
** Mapping is done by truncating rather than rounding.
** Return: Bucket number corresponding to Param + Offset.
** Exceptions: none
** History: Thu Feb 14 13:24:33 1991, DSJ, Created.
*/
int Bucket;
Bucket = static_cast<int>(MapParam(Param, Offset, NumBuckets));
if (Bucket < 0)
Bucket += NumBuckets;
else if (Bucket >= NumBuckets)
Bucket -= NumBuckets;
return Bucket;
// Returns a quantized bucket for the given circular param shifted by offset,
// notionally (param + offset) * num_buckets, but modded and casted to the
// appropriate type.
uinT8 CircBucketFor(FLOAT32 param, FLOAT32 offset, int num_buckets) {
int bucket = IntCastRounded(MapParam(param, offset, num_buckets));
return static_cast<uinT8>(Modulo(bucket, num_buckets));
} /* CircBucketFor */
@ -1694,23 +1667,23 @@ void InitTableFiller (FLOAT32 EndPad, FLOAT32 SidePad,
if (fabs (Angle - 0.0) < HV_TOLERANCE || fabs (Angle - 0.5) < HV_TOLERANCE) {
/* horizontal proto - handle as special case */
Filler->X = BucketFor(X - HalfLength - EndPad, XS, NB);
Filler->YStart = BucketFor(Y - SidePad, YS, NB * 256);
Filler->YEnd = BucketFor(Y + SidePad, YS, NB * 256);
Filler->X = Bucket8For(X - HalfLength - EndPad, XS, NB);
Filler->YStart = Bucket16For(Y - SidePad, YS, NB * 256);
Filler->YEnd = Bucket16For(Y + SidePad, YS, NB * 256);
Filler->StartDelta = 0;
Filler->EndDelta = 0;
Filler->Switch[0].Type = LastSwitch;
Filler->Switch[0].X = BucketFor(X + HalfLength + EndPad, XS, NB);
Filler->Switch[0].X = Bucket8For(X + HalfLength + EndPad, XS, NB);
} else if (fabs(Angle - 0.25) < HV_TOLERANCE ||
fabs(Angle - 0.75) < HV_TOLERANCE) {
/* vertical proto - handle as special case */
Filler->X = BucketFor(X - SidePad, XS, NB);
Filler->YStart = BucketFor(Y - HalfLength - EndPad, YS, NB * 256);
Filler->YEnd = BucketFor(Y + HalfLength + EndPad, YS, NB * 256);
Filler->X = Bucket8For(X - SidePad, XS, NB);
Filler->YStart = Bucket16For(Y - HalfLength - EndPad, YS, NB * 256);
Filler->YEnd = Bucket16For(Y + HalfLength + EndPad, YS, NB * 256);
Filler->StartDelta = 0;
Filler->EndDelta = 0;
Filler->Switch[0].Type = LastSwitch;
Filler->Switch[0].X = BucketFor(X + SidePad, XS, NB);
Filler->Switch[0].X = Bucket8For(X + SidePad, XS, NB);
} else {
/* diagonal proto */
@ -1736,36 +1709,34 @@ void InitTableFiller (FLOAT32 EndPad, FLOAT32 SidePad,
}
/* translate into bucket positions and deltas */
Filler->X = (inT8) MapParam(Start.x, XS, NB);
Filler->X = Bucket8For(Start.x, XS, NB);
Filler->StartDelta = -(inT16) ((Cos / Sin) * 256);
Filler->EndDelta = (inT16) ((Sin / Cos) * 256);
XAdjust = BucketEnd(Filler->X, XS, NB) - Start.x;
YAdjust = XAdjust * Cos / Sin;
Filler->YStart = (inT16) MapParam(Start.y - YAdjust, YS, NB * 256);
Filler->YStart = Bucket16For(Start.y - YAdjust, YS, NB * 256);
YAdjust = XAdjust * Sin / Cos;
Filler->YEnd = (inT16) MapParam(Start.y + YAdjust, YS, NB * 256);
Filler->YEnd = Bucket16For(Start.y + YAdjust, YS, NB * 256);
Filler->Switch[S1].Type = StartSwitch;
Filler->Switch[S1].X = (inT8) MapParam(Switch1.x, XS, NB);
Filler->Switch[S1].Y = (inT8) MapParam(Switch1.y, YS, NB);
Filler->Switch[S1].X = Bucket8For(Switch1.x, XS, NB);
Filler->Switch[S1].Y = Bucket8For(Switch1.y, YS, NB);
XAdjust = Switch1.x - BucketStart(Filler->Switch[S1].X, XS, NB);
YAdjust = XAdjust * Sin / Cos;
Filler->Switch[S1].YInit =
(inT16) MapParam(Switch1.y - YAdjust, YS, NB * 256);
Filler->Switch[S1].YInit = Bucket16For(Switch1.y - YAdjust, YS, NB * 256);
Filler->Switch[S1].Delta = Filler->EndDelta;
Filler->Switch[S2].Type = EndSwitch;
Filler->Switch[S2].X = (inT8) MapParam(Switch2.x, XS, NB);
Filler->Switch[S2].Y = (inT8) MapParam(Switch2.y, YS, NB);
Filler->Switch[S2].X = Bucket8For(Switch2.x, XS, NB);
Filler->Switch[S2].Y = Bucket8For(Switch2.y, YS, NB);
XAdjust = Switch2.x - BucketStart(Filler->Switch[S2].X, XS, NB);
YAdjust = XAdjust * Cos / Sin;
Filler->Switch[S2].YInit =
(inT16) MapParam(Switch2.y + YAdjust, YS, NB * 256);
Filler->Switch[S2].YInit = Bucket16For(Switch2.y + YAdjust, YS, NB * 256);
Filler->Switch[S2].Delta = Filler->StartDelta;
Filler->Switch[2].Type = LastSwitch;
Filler->Switch[2].X = (inT8)MapParam(End.x, XS, NB);
Filler->Switch[2].X = Bucket8For(End.x, XS, NB);
} else {
/* falling diagonal proto */
Angle *= 2.0 * PI;
@ -1788,36 +1759,34 @@ void InitTableFiller (FLOAT32 EndPad, FLOAT32 SidePad,
}
/* translate into bucket positions and deltas */
Filler->X = (inT8) MapParam(Start.x, XS, NB);
Filler->X = Bucket8For(Start.x, XS, NB);
Filler->StartDelta = -(inT16) ((Sin / Cos) * 256);
Filler->EndDelta = (inT16) ((Cos / Sin) * 256);
XAdjust = BucketEnd(Filler->X, XS, NB) - Start.x;
YAdjust = XAdjust * Sin / Cos;
Filler->YStart = (inT16) MapParam(Start.y - YAdjust, YS, NB * 256);
Filler->YStart = Bucket16For(Start.y - YAdjust, YS, NB * 256);
YAdjust = XAdjust * Cos / Sin;
Filler->YEnd = (inT16) MapParam(Start.y + YAdjust, YS, NB * 256);
Filler->YEnd = Bucket16For(Start.y + YAdjust, YS, NB * 256);
Filler->Switch[S1].Type = EndSwitch;
Filler->Switch[S1].X = (inT8) MapParam(Switch1.x, XS, NB);
Filler->Switch[S1].Y = (inT8) MapParam(Switch1.y, YS, NB);
Filler->Switch[S1].X = Bucket8For(Switch1.x, XS, NB);
Filler->Switch[S1].Y = Bucket8For(Switch1.y, YS, NB);
XAdjust = Switch1.x - BucketStart(Filler->Switch[S1].X, XS, NB);
YAdjust = XAdjust * Sin / Cos;
Filler->Switch[S1].YInit =
(inT16) MapParam(Switch1.y + YAdjust, YS, NB * 256);
Filler->Switch[S1].YInit = Bucket16For(Switch1.y + YAdjust, YS, NB * 256);
Filler->Switch[S1].Delta = Filler->StartDelta;
Filler->Switch[S2].Type = StartSwitch;
Filler->Switch[S2].X = (inT8) MapParam(Switch2.x, XS, NB);
Filler->Switch[S2].Y = (inT8) MapParam(Switch2.y, YS, NB);
Filler->Switch[S2].X = Bucket8For(Switch2.x, XS, NB);
Filler->Switch[S2].Y = Bucket8For(Switch2.y, YS, NB);
XAdjust = Switch2.x - BucketStart(Filler->Switch[S2].X, XS, NB);
YAdjust = XAdjust * Cos / Sin;
Filler->Switch[S2].YInit =
(inT16) MapParam(Switch2.y - YAdjust, YS, NB * 256);
Filler->Switch[S2].YInit = Bucket16For(Switch2.y - YAdjust, YS, NB * 256);
Filler->Switch[S2].Delta = Filler->EndDelta;
Filler->Switch[2].Type = LastSwitch;
Filler->Switch[2].X = (inT8) MapParam(End.x, XS, NB);
Filler->Switch[2].X = Bucket8For(End.x, XS, NB);
}
}
} /* InitTableFiller */

View File

@ -218,9 +218,10 @@ void AddProtoToClassPruner(PROTO Proto,
void AddProtoToProtoPruner(PROTO Proto, int ProtoId,
INT_CLASS Class, bool debug);
int BucketFor(FLOAT32 Param, FLOAT32 Offset, int NumBuckets);
uinT8 Bucket8For(FLOAT32 param, FLOAT32 offset, int num_buckets);
uinT16 Bucket16For(FLOAT32 param, FLOAT32 offset, int num_buckets);
int CircBucketFor(FLOAT32 Param, FLOAT32 Offset, int NumBuckets);
uinT8 CircBucketFor(FLOAT32 param, FLOAT32 offset, int num_buckets);
void UpdateMatchDisplay();

View File

@ -26,36 +26,32 @@
#include <math.h>
/**----------------------------------------------------------------------------
/*----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
/**----------------------------------------------------------------------------
/*----------------------------------------------------------------------------
Private Code
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
/**
* Call the old micro-feature extractor and then copy
* the features into the new format. Then deallocate the
* old micro-features.
* @param Blob blob to extract micro-features from
* @param denorm control parameter to feature extractor.
* @return Micro-features for Blob.
* @note Exceptions: none
* @note History: Wed May 23 18:06:38 1990, DSJ, Created.
*/
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info) {
/*
** Parameters:
** Blob blob to extract micro-features from
** denorm control parameter to feature extractor.
** Globals: none
** Operation: Call the old micro-feature extractor and then copy
** the features into the new format. Then deallocate the
** old micro-features.
** Return: Micro-features for Blob.
** Exceptions: none
** History: Wed May 23 18:06:38 1990, DSJ, Created.
*/
int NumFeatures;
MICROFEATURES Features, OldFeatures;
FEATURE_SET FeatureSet;
FEATURE Feature;
MICROFEATURE OldFeature;
OldFeatures = (MICROFEATURES)BlobMicroFeatures(Blob, bl_denorm, cn_denorm,
fx_info);
OldFeatures = BlobMicroFeatures(Blob, cn_denorm);
if (OldFeatures == NULL)
return NULL;
NumFeatures = count (OldFeatures);

View File

@ -34,8 +34,6 @@ typedef float MicroFeature[MFCount];
/*----------------------------------------------------------------------------
Private Function Prototypes
-----------------------------------------------------------------------------*/
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info);
FEATURE_SET ExtractMicros(TBLOB* Blob, const DENORM& cn_denorm);
#endif

View File

@ -23,7 +23,6 @@
----------------------------------------------------------------------------**/
#include "oldlist.h"
#include "matchdefs.h"
#include "xform2d.h"
/* definition of a list of micro-features */
typedef LIST MICROFEATURES;

View File

@ -59,9 +59,7 @@ MICROFEATURE ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End);
----------------------------------------------------------------------------**/
/*---------------------------------------------------------------------------*/
CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info) {
MICROFEATURES BlobMicroFeatures(TBLOB* Blob, const DENORM& cn_denorm) {
/*
** Parameters:
** Blob blob to extract micro-features from
@ -98,7 +96,7 @@ CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm,
}
FreeOutlines(Outlines);
}
return ((CHAR_FEATURES) MicroFeatures);
return MicroFeatures;
} /* BlobMicroFeatures */

View File

@ -21,6 +21,7 @@
/*----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "mfdefs.h"
#include "params.h"
/*----------------------------------------------------------------------------
Variables
@ -35,8 +36,6 @@ extern double_VAR_H(classify_max_slope, 2.414213562,
/*----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info);
MICROFEATURES BlobMicroFeatures(TBLOB* Blob, const DENORM& cn_denorm);
#endif

View File

@ -59,9 +59,7 @@ FLOAT32 ActualOutlineLength(FEATURE Feature) {
// the x center of the grapheme's bounding box.
// English: [0.011, 0.31]
//
FEATURE_SET ExtractCharNormFeatures(TBLOB *blob, const DENORM& bl_denorm,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info) {
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT& fx_info) {
FEATURE_SET feature_set = NewFeatureSet(1);
FEATURE feature = NewFeature(&CharNormDesc);

View File

@ -34,8 +34,6 @@ typedef enum {
----------------------------------------------------------------------------**/
FLOAT32 ActualOutlineLength(FEATURE Feature);
FEATURE_SET ExtractCharNormFeatures(TBLOB *Blob, const DENORM& bl_denorm,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info);
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT& fx_info);
#endif

View File

@ -209,13 +209,11 @@ FEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) {
/*---------------------------------------------------------------------------*/
void WriteFeature(FILE *File, FEATURE Feature) {
/*
** Parameters:
** File open text file to write Feature to
** Feature feature to write out to File
** Globals: none
** Operation: Write a textual representation of Feature to File.
** Feature: feature to write out to str
** str: string to write Feature to
** Operation: Appends a textual representation of Feature to str.
** This representation is simply a list of the N parameters
** of the feature, terminated with a newline. It is assumed
** that the ExtraPenalty field can be reconstructed from the
@ -225,24 +223,22 @@ void WriteFeature(FILE *File, FEATURE Feature) {
** Exceptions: none
** History: Wed May 23 09:28:18 1990, DSJ, Created.
*/
int i;
for (i = 0; i < Feature->Type->NumParams; i++) {
void WriteFeature(FEATURE Feature, STRING* str) {
for (int i = 0; i < Feature->Type->NumParams; i++) {
#ifndef WIN32
assert(!isnan(Feature->Params[i]));
#endif
fprintf(File, " %g", Feature->Params[i]);
str->add_str_double(" ", Feature->Params[i]);
}
fprintf(File, "\n");
*str += "\n";
} /* WriteFeature */
/*---------------------------------------------------------------------------*/
void WriteFeatureSet(FILE *File, FEATURE_SET FeatureSet) {
/*
** Parameters:
** File open text file to write FeatureSet to
** FeatureSet feature set to write to File
** FeatureSet: feature set to write to File
** str: string to write Feature to
** Globals: none
** Operation: Write a textual representation of FeatureSet to File.
** This representation is an integer specifying the number of
@ -252,12 +248,13 @@ void WriteFeatureSet(FILE *File, FEATURE_SET FeatureSet) {
** Exceptions: none
** History: Wed May 23 10:06:03 1990, DSJ, Created.
*/
int i;
void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str) {
if (FeatureSet) {
fprintf (File, "%d\n", FeatureSet->NumFeatures);
for (i = 0; i < FeatureSet->NumFeatures; i++)
WriteFeature (File, FeatureSet->Features[i]);
str->add_str_int("", FeatureSet->NumFeatures);
*str += "\n";
for (int i = 0; i < FeatureSet->NumFeatures; i++) {
WriteFeature(FeatureSet->Features[i], str);
}
}
} /* WriteFeatureSet */

View File

@ -79,13 +79,6 @@ typedef FEATURE_SET_STRUCT *FEATURE_SET;
// classifier does not need to know the details of this data structure.
typedef char *CHAR_FEATURES;
typedef FEATURE_SET (*FX_FUNC)(TBLOB *, const DENORM&, const DENORM&,
const INT_FX_RESULT_STRUCT&);
struct FEATURE_EXT_STRUCT {
FX_FUNC Extractor; // func to extract features
};
/*----------------------------------------------------------------------
Macros for defining the parameters of a new features
----------------------------------------------------------------------*/
@ -125,10 +118,8 @@ FEATURE ReadFeature(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc);
FEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc);
void WriteFeature(FILE *File, FEATURE Feature);
void WriteFeature(FEATURE Feature, STRING* str);
void WriteFeatureSet(FILE *File, FEATURE_SET FeatureSet);
void WriteOldParamDesc(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc);
void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str);
#endif

Some files were not shown because too many files have changed in this diff Show More