3.01 code from http://github.com/jimregan/tesseract-ocr with adaptations related to the Linux and Windows (VC2008) compile process

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-07 01:42:41 +08:00 · 2010-11-23 18:34:14 +00:00 · 2010-11-23 18:34:14 +00:00 · 4523ce9f7d
commit 4523ce9f7d
parent 7511d76315
558 changed files with 51788 additions and 41709 deletions
--- a/35
+++ b/35
@ -1,3 +1,38 @@
+2010-09-22 - V3.01
+  * Thread-safety! Moved all critical globals and statics to
+    members of the appropriate class. Tesseract is now
+    thread-safe (multiple instances can be used in parallel
+    in multiple threads) with the minor exception that some
+    control parameters are still global and affect all threads.
+  * Added Cube, a new recognizer for Arabic. Cube can also be
+    used in combination with normal Tesseract for other languages
+    with an improvement in accuracy at the cost of (much) lower speed.
+    There is no training module for Cube yet.
+  * OcrEngineMode in Init replaces AccuracyVSpeed to control cube.
+  * Greatly improved segmentation search with consequent accuracy and
+    speed improvements, especially for Chinese.
+  * Added PageIterator and ResultIterator as cleaner ways to get the
+    full results out of Tesseract, that are not currently provided
+    by any of the TessBaseAPI::Get* methods.
+    All other methods, such as the ETEXT_STRUCT in particular are
+    deprecated and will be deleted in the future.
+  * ApplyBoxes totally rewritten to make training easier.
+    It can now cope with touching/overlapping training characters,
+    and a new boxfile format allows word boxes instead of character
+    boxes, BUT to use that you have to have already bootstrapped the
+    language with character boxes. "Cyclic dependency" on traineddata.
+  * Auto orientation and script detection added to page layout analysis.
+  * Deleted *lots* of dead code.
+  * Fixxht module replaced with scalable data-driven module.
+  * Output font characteristics accuracy improved.
+  * Removed the double conversion at each classification.
+  * Upgraded oldest structs to be classes and deprecated PBLOB.
+  * Removed non-deterministic baseline fit.
+  * Added fixed length dawgs for Chinese.
+  * Handling of vertical text improved.
+  * Handling of leader dots improved.
+  * Table detection greatly improved.
+
 2010-09-21 - V3.00
  * Preparations for thread safety:
     * Changed TessBaseAPI methods to be non-static
--- a/Makefile.am
+++ b/Makefile.am
@ -1,6 +1,6 @@
 # TODO(luc) Add 'doc' to this list when ready
 ACLOCAL_AMFLAGS = -I m4
-SUBDIRS = ccstruct ccutil classify cutil dict image textord viewer wordrec ccmain training tessdata testing java api vs2008
+SUBDIRS = ccstruct ccutil classify cube cutil dict image neural_networks/runtime textord viewer wordrec ccmain training tessdata testing java api
 #if USING_GETTEXT
 #SUBDIRS += po
 #AM_CPPFLAGS = -DLOCALEDIR=\"$(localedir)\"
--- a/Makefile.in
+++ b/Makefile.in
@ -234,7 +234,7 @@ top_srcdir = @top_srcdir@

 # TODO(luc) Add 'doc' to this list when ready
 ACLOCAL_AMFLAGS = -I m4
-SUBDIRS = ccstruct ccutil classify cutil dict image textord viewer wordrec ccmain training tessdata testing java api vs2008
+SUBDIRS = ccstruct ccutil classify cube cutil dict image neural_networks/runtime textord viewer wordrec ccmain training tessdata testing java api
 #if USING_GETTEXT
 #SUBDIRS += po
 #AM_CPPFLAGS = -DLOCALEDIR=\"$(localedir)\"
--- a/35
+++ b/35
@ -1,3 +1,38 @@
+Tesseract release notes Oct 1 2010 - V3.01
+  * Thread-safety! Moved all critical globals and statics to
+    members of the appropriate class. Tesseract is now
+    thread-safe (multiple instances can be used in parallel
+    in multiple threads) with the minor exception that some
+    control parameters are still global and affect all threads.
+  * Added Cube, a new recognizer for Arabic. Cube can also be
+    used in combination with normal Tesseract for other languages
+    with an improvement in accuracy at the cost of (much) lower speed.
+    There is no training module for Cube yet.
+  * OcrEngineMode in Init replaces AccuracyVSpeed to control cube.
+  * Greatly improved segmentation search with consequent accuracy and
+    speed improvements, especially for Chinese.
+  * Added PageIterator and ResultIterator as cleaner ways to get the
+    full results out of Tesseract, that are not currently provided
+    by any of the TessBaseAPI::Get* methods.
+    All other methods, such as the ETEXT_STRUCT in particular are
+    deprecated and will be deleted in the future.
+  * ApplyBoxes totally rewritten to make training easier.
+    It can now cope with touching/overlapping training characters,
+    and a new boxfile format allows word boxes instead of character
+    boxes, BUT to use that you have to have already bootstrapped the
+    language with character boxes. "Cyclic dependency" on traineddata.
+  * Auto orientation and script detection added to page layout analysis.
+  * Deleted *lots* of dead code.
+  * Fixxht module replaced with scalable data-driven module.
+  * Output font characteristics accuracy improved.
+  * Removed the double conversion at each classification.
+  * Upgraded oldest structs to be classes and deprecated PBLOB.
+  * Removed non-deterministic baseline fit.
+  * Added fixed length dawgs for Chinese.
+  * Handling of vertical text improved.
+  * Handling of leader dots improved.
+  * Table detection greatly improved.
+
 Tesseract release notes Sep 30 2010 - V3.00
  * Preparations for thread safety:
     * Changed TessBaseAPI methods to be non-static
--- a/api/Makefile.am
+++ b/api/Makefile.am
@ -8,13 +8,15 @@ AM_CPPFLAGS = -DLOCALEDIR=\"$(localedir)\"\
    -I$(top_srcdir)/textord 

 include_HEADERS = \
-    baseapi.h tesseractmain.h
+    apitypes.h baseapi.h pageiterator.h resultiterator.h tesseractmain.h

 lib_LTLIBRARIES = libtesseract_api.la
-libtesseract_api_la_SOURCES = baseapi.cpp
+libtesseract_api_la_SOURCES = baseapi.cpp pageiterator.cpp resultiterator.cpp
 libtesseract_api_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 libtesseract_api_la_LIBADD = \
    ../ccmain/libtesseract_main.la \
+    ../cube/libtesseract_cube.la \
+    ../neural_networks/runtime/libtesseract_neural.la \
    ../textord/libtesseract_textord.la \
    ../wordrec/libtesseract_wordrec.la \
    ../classify/libtesseract_classify.la \
--- a/api/Makefile.in
+++ b/api/Makefile.in
@ -74,6 +74,8 @@ am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" \
 	"$(DESTDIR)$(includedir)"
 LTLIBRARIES = $(lib_LTLIBRARIES)
 libtesseract_api_la_DEPENDENCIES = ../ccmain/libtesseract_main.la \
+	../cube/libtesseract_cube.la \
+	../neural_networks/runtime/libtesseract_neural.la \
 	../textord/libtesseract_textord.la \
 	../wordrec/libtesseract_wordrec.la \
 	../classify/libtesseract_classify.la \
@ -82,7 +84,8 @@ libtesseract_api_la_DEPENDENCIES = ../ccmain/libtesseract_main.la \
 	../image/libtesseract_image.la ../cutil/libtesseract_cutil.la \
 	../viewer/libtesseract_viewer.la \
 	../ccutil/libtesseract_ccutil.la
-am_libtesseract_api_la_OBJECTS = baseapi.lo
+am_libtesseract_api_la_OBJECTS = baseapi.lo pageiterator.lo \
+	resultiterator.lo
 libtesseract_api_la_OBJECTS = $(am_libtesseract_api_la_OBJECTS)
 libtesseract_api_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
@ -294,13 +297,15 @@ AM_CPPFLAGS = -DLOCALEDIR=\"$(localedir)\"\
    -I$(top_srcdir)/textord 

 include_HEADERS = \
-    baseapi.h tesseractmain.h
+    apitypes.h baseapi.h pageiterator.h resultiterator.h tesseractmain.h

 lib_LTLIBRARIES = libtesseract_api.la
-libtesseract_api_la_SOURCES = baseapi.cpp
+libtesseract_api_la_SOURCES = baseapi.cpp pageiterator.cpp resultiterator.cpp
 libtesseract_api_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
 libtesseract_api_la_LIBADD = \
    ../ccmain/libtesseract_main.la \
+    ../cube/libtesseract_cube.la \
+    ../neural_networks/runtime/libtesseract_neural.la \
    ../textord/libtesseract_textord.la \
    ../wordrec/libtesseract_wordrec.la \
    ../classify/libtesseract_classify.la \
@ -446,6 +451,8 @@ distclean-compile:
 	-rm -f *.tab.c

@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/baseapi.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pageiterator.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/resultiterator.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tesseractmain.Po@am__quote@

 .cpp.o:
--- a/api/apitypes.h
+++ b/api/apitypes.h
@ -0,0 +1,31 @@
+///////////////////////////////////////////////////////////////////////
+// File:        apitypes.h
+// Description: Types used in both the API and internally
+// Author:      Ray Smith
+// Created:     Wed Mar 03 09:22:53 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_API_APITYPES_H__
+#define TESSERACT_API_APITYPES_H__
+
+#include "publictypes.h"
+
+// The types used by the API and Page/ResultIterator can be found in
+// ccstruct/publictypes.h.
+// API interfaces and API users should be sure to include this file, rather
+// than the lower-level one, and lower-level code should be sure to include
+// only the lower-level file.
+
+#endif  // TESSERACT_API_APITYPES_H__
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
--- a/api/baseapi.h
+++ b/api/baseapi.h
@ -17,28 +17,38 @@
 //
 ///////////////////////////////////////////////////////////////////////

-#ifndef TESSERACT_CCMAIN_BASEAPI_H__
-#define TESSERACT_CCMAIN_BASEAPI_H__
+#ifndef TESSERACT_API_BASEAPI_H__
+#define TESSERACT_API_BASEAPI_H__

+// To avoid collision with other typenames include the ABSOLUTE MINIMUM
+// complexity of includes here. Use forward declarations wherever possible
+// and hide includes of complex types in baseapi.cpp.
+#include "apitypes.h"
 #include "thresholder.h"
+#include "unichar.h"

 class PAGE_RES;
 class PAGE_RES_IT;
 class BLOCK_LIST;
+class DENORM;
 class IMAGE;
+class PBLOB;
+class ROW;
 class STRING;
+class WERD;
 struct Pix;
 struct Box;
 struct Pixa;
 struct Boxa;
-struct ETEXT_STRUCT;
+class ETEXT_DESC;
 struct OSResults;
-struct TBOX;
+class TBOX;

 #define MAX_NUM_INT_FEATURES 512
 struct INT_FEATURE_STRUCT;
 typedef INT_FEATURE_STRUCT *INT_FEATURE;
 typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES];
+struct TBLOB;

 #ifdef TESSDLL_EXPORTS
 #define TESSDLL_API __declspec(dllexport)
@ -51,37 +61,21 @@ typedef INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES];

 namespace tesseract {

+class CubeRecoContext;
+class Dawg;
 class Dict;
+class PageIterator;
+class ResultIterator;
 class Tesseract;
 class Trie;
-class CubeRecoContext;
-class TesseractCubeCombiner;
-class CubeObject;
-class CubeLineObject;
-class Dawg;

-typedef int (Dict::*DictFunc)(void* void_dawg_args, int char_index,
-                              const void *word, bool word_end);
+typedef int (Dict::*DictFunc)(void* void_dawg_args,
+                              UNICHAR_ID unichar_id, bool word_end);
+typedef double (Dict::*ProbabilityInContextFunc)(const char* context,
+                                                 int context_bytes,
+                                                 const char* character,
+                                                 int character_bytes);

-enum PageSegMode {
-  PSM_AUTO,           ///< Fully automatic page segmentation.
-  PSM_SINGLE_COLUMN,  ///< Assume a single column of text of variable sizes.
-  PSM_SINGLE_BLOCK,   ///< Assume a single uniform block of text. (Default.)
-  PSM_SINGLE_LINE,    ///< Treat the image as a single text line.
-  PSM_SINGLE_WORD,    ///< Treat the image as a single word.
-  PSM_SINGLE_CHAR,    ///< Treat the image as a single character.
-
-  PSM_COUNT           ///< Number of enum entries.
-};
-
-/**
- * The values in the AccuracyVSpeed enum provide hints for how the engine
- * should trade speed for accuracy. There is no guarantee of any effect.
- */
-enum AccuracyVSpeed {
-  AVS_FASTEST = 0,         ///< Fastest speed, but lowest accuracy.
-  AVS_MOST_ACCURATE = 100  ///< Greatest accuracy, but slowest speed.
-};

 /**
 * Base class for all tesseract APIs.
@ -106,47 +100,66 @@ class TESSDLL_API TessBaseAPI {
  void SetOutputName(const char* name);

  /**
-   * Set the value of an internal "variable" (of either old or new types).
-   * Supply the name of the variable and the value as a string, just as
+   * Set the value of an internal "parameter."
+   * Supply the name of the parameter and the value as a string, just as
   * you would in a config file.
   * Returns false if the name lookup failed.
   * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
   * Or SetVariable("bln_numericmode", "1"); to set numeric-only mode.
   * SetVariable may be used before Init, but settings will revert to
   * defaults on End().
+   * TODO(rays) Add a command-line option to dump the parameters to stdout
+   * and add a pointer to it in the FAQ
   */
-  bool SetVariable(const char* variable, const char* value);
+  bool SetVariable(const char* name, const char* value);
+  // Same as above, but the parameter is set only if it is one of the "init"
+  // parameters (defined with *_INIT_* macro).
+  bool SetVariableIfInit(const char *name, const char *value);
+
+  // Returns true if the parameter was found among Tesseract parameters.
+  // Fills in value with the value of the parameter.
+  bool GetIntVariable(const char *name, int *value) const;
+  bool GetBoolVariable(const char *name, bool *value) const;
+  bool GetDoubleVariable(const char *name, double *value) const;
+  // Returns the pointer to the string that represents the value of the
+  // parameter if it was found among Tesseract parameters.
+  const char *GetStringVariable(const char *name) const;
+
+  // Print Tesseract parameters to the given file.
+  void PrintVariables(FILE *fp) const;

  /**
-   * Eventually instances will be thread-safe and totally independent,
-   * but for now, they all point to the same underlying engine,
-   * and are NOT RE-ENTRANT OR THREAD-SAFE. For now:
-   * it is safe to Init multiple TessBaseAPIs in the same language, use them
-   * sequentially, and End or delete them all, but once one is Ended, you can't
-   * do anything other than End the others. After End, it is safe to Init
-   * again on the same one.
+   * Instances are now mostly thread-safe and totally independent,
+   * but some global parameters remain. Basically it is safe to use multiple
+   * TessBaseAPIs in different threads in parallel, UNLESS:
+   * you use SetVariable on some of the Params in classify and textord.
+   * If you do, then the effect will be to change it for all your instances.
   *
   * Start tesseract. Returns zero on success and -1 on failure.
   * NOTE that the only members that may be called before Init are those
   * listed above here in the class definition.
   *
-   * The datapath must be the name of the data directory (no ending /) or
-   * some other file in which the data directory resides (for instance argv[0].)
+   * The datapath must be the name of the parent directory of tessdata and
+   * must end in / . Any name after the last / will be stripped.
   * The language is (usually) an ISO 639-3 string or NULL will default to eng.
   * It is entirely safe (and eventually will be efficient too) to call
   * Init multiple times on the same instance to change language, or just
   * to reset the classifier.
-   * WARNING: On changing languages, all Variables are reset back to their
-   * default values. If you have a rare need to set a Variable that controls
+   * WARNING: On changing languages, all Tesseract parameters are reset
+   * back to their default values. (Which may vary between languages.)
+   * If you have a rare need to set a Variable that controls
   * initialization for a second call to Init you should explicitly
   * call End() and then use SetVariable before Init. This is only a very
-   * rare use case, since there are very few uses that require any variables
+   * rare use case, since there are very few uses that require any parameters
   * to be set before Init.
   */
-  int Init(const char* datapath, const char* language,
-           char **configs, int configs_size, bool configs_global_only);
+  int Init(const char* datapath, const char* language, OcrEngineMode mode,
+           char **configs, int configs_size, bool configs_init_only);
+  int Init(const char* datapath, const char* language, OcrEngineMode oem) {
+    return Init(datapath, language, oem, NULL, 0, false);
+  }
  int Init(const char* datapath, const char* language) {
-    return Init(datapath, language, 0, 0, false);
+    return Init(datapath, language, OEM_DEFAULT, NULL, 0, false);
  }

  /**
@ -157,22 +170,24 @@ class TESSDLL_API TessBaseAPI {
   */
  int InitLangMod(const char* datapath, const char* language);

-  /**
-   * Init everything except the language model. Used to allow initialization for
-   * the specified language without any available dawg models.
-   */
-  int InitWithoutLangModel(const char* datapath, const char* language);
+  // Init only for page layout analysis. Use only for calls to SetImage and
+  // AnalysePage. Calls that attempt recognition will generate an error.
+  void InitForAnalysePage();

  /**
   * Read a "config" file containing a set of variable, value pairs.
   * Searches the standard places: tessdata/configs, tessdata/tessconfigs
   * and also accepts a relative or absolute path name.
+   * If init_only is true, only sets the parameters marked with a special
+   * INIT flag, which are typically of functional/algorithmic effect
+   * rather than debug effect. Used to separate debug settings from
+   * working settings.
   */
-  void ReadConfigFile(const char* filename, bool global_only);
+  void ReadConfigFile(const char* filename, bool init_only);

  /**
   * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
-   * The mode is stored as an INT_VARIABLE so it can also be modified by
+   * The mode is stored as an IntParam so it can also be modified by
   * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
   */
  void SetPageSegMode(PageSegMode mode);
@ -180,19 +195,6 @@ class TESSDLL_API TessBaseAPI {
  /** Return the current page segmentation mode. */
  PageSegMode GetPageSegMode() const;

-  /**
-   * Set the hint for trading accuracy against speed.
-   * Default is AVS_FASTEST, which is the old behaviour.
-   * Note that this is only a hint. Depending on the language and/or
-   * build configuration, speed and accuracy may not be tradeable.
-   * Also note that despite being an enum, any value in the range
-   * AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not
-   * have an effect, depending on the implementation.
-   * The mode is stored as an INT_VARIABLE so it can also be modified by
-   * ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string).
-   */
-  void SetAccuracyVSpeed(AccuracyVSpeed mode);
-
  /**
   * Recognize a rectangle from an image and return the result as a string.
   * May be called many times for a single Init.
@ -267,7 +269,7 @@ class TESSDLL_API TessBaseAPI {
   * delete it when it it is replaced or the API is destructed.
   */
  void SetThresholder(ImageThresholder* thresholder) {
-    if (thresholder_ != 0)
+    if (thresholder_ != NULL)
      delete thresholder_;
    thresholder_ = thresholder;
    ClearResults();
@ -291,8 +293,8 @@ class TESSDLL_API TessBaseAPI {
   * Get the textlines as a leptonica-style
   * Boxa, Pixa pair, in reading order.
   * Can be called before or after Recognize.
-   * If blockids is not NULL, the block-id of each line is also returned as an
-   * array of one element per line. delete [] after use.
+   * If blockids is not NULL, the block-id of each line is also returned
+   * as an array of one element per line. delete [] after use.
   */
  Boxa* GetTextlines(Pixa** pixa, int** blockids);

@ -303,6 +305,22 @@ class TESSDLL_API TessBaseAPI {
   */
  Boxa* GetWords(Pixa** pixa);

+  // Gets the individual connected (text) components (created
+  // after the page segmentation step, but before recognition)
+  // as a leptonica-style Boxa, Pixa pair, in reading order.
+  // Can be called before or after Recognize.
+  // Note: the caller is responsible for calling boxaDestroy()
+  // on the returned Boxa array and pixaDestroy() on cc array.
+  Boxa* GetConnectedComponents(Pixa** cc);
+
+  // Get the given level kind of components (block, textline, word etc.) as a
+  // leptonica-style Boxa, Pixa pair, in reading order.
+  // Can be called before or after Recognize.
+  // If blockids is not NULL, the block-id of each component is also returned
+  // as an array of one element per component. delete [] after use.
+  Boxa* GetComponentImages(PageIteratorLevel level,
+                           Pixa** pixa, int** blockids);
+
  /**
   * Dump the internal binary image to a PGM file.
   * @deprecated Use GetThresholdedImage and write the image using pixWrite
@ -310,13 +328,24 @@ class TESSDLL_API TessBaseAPI {
   */
  void DumpPGM(const char* filename);

+  // Runs page layout analysis in the mode set by SetPageSegMode.
+  // May optionally be called prior to Recognize to get access to just
+  // the page layout results. Returns an iterator to the results.
+  // Returns NULL on error.
+  // The returned iterator must be deleted after use.
+  // WARNING! This class points to data held within the TessBaseAPI class, and
+  // therefore can only be used while the TessBaseAPI class still exists and
+  // has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+  // DetectOS, or anything else that changes the internal PAGE_RES.
+  PageIterator* AnalyseLayout();
+
  /**
   * Recognize the image from SetAndThresholdImage, generating Tesseract
   * internal structures. Returns 0 on success.
   * Optional. The Get*Text functions below will call Recognize if needed.
   * After Recognize, the output is kept internally until the next SetImage.
   */
-  int Recognize(ETEXT_STRUCT* monitor);
+  int Recognize(ETEXT_DESC* monitor);

  /**
   * Methods to retrieve information after SetAndThresholdImage(),
@ -324,7 +353,15 @@ class TESSDLL_API TessBaseAPI {
   */

  /** Variant on Recognize used for testing chopper. */
-  int RecognizeForChopTest(struct ETEXT_STRUCT* monitor);
+  int RecognizeForChopTest(ETEXT_DESC* monitor);
+
+  // Get an iterator to the results of LayoutAnalysis and/or Recognize.
+  // The returned iterator must be deleted after use.
+  // WARNING! This class points to data held within the TessBaseAPI class, and
+  // therefore can only be used while the TessBaseAPI class still exists and
+  // has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+  // DetectOS, or anything else that changes the internal PAGE_RES.
+  ResultIterator* GetIterator();

  /**
   * The recognized text is returned as a char* which is coded
@ -334,16 +371,15 @@ class TESSDLL_API TessBaseAPI {
  /**
   * Make a HTML-formatted string with hOCR markup from the internal
   * data structures.
-   * STL removed from original patch submission and refactored by rays.
-   * page_id is 1-based and will appear in the output.
+   * page_number is 0-based but will appear in the output as 1-based.
   */
- char* GetHOCRText(int page_id);
+  char* GetHOCRText(int page_number);
  /**
   * The recognized text is returned as a char* which is coded in the same
   * format as a box file used in training. Returned string must be freed with
   * the delete [] operator.
   * Constructs coordinates in the original image - not just the rectangle.
-   * page_number is a 0-base page index that will appear in the box file.
+   * page_number is a 0-based page index that will appear in the box file.
   */
  char* GetBoxText(int page_number);
  /**
@ -388,9 +424,14 @@ class TESSDLL_API TessBaseAPI {

  bool GetTextDirection(int* out_offset, float* out_slope);

-  /** Set the letter_is_okay function to point somewhere else. */
+  /** Sets Dict::letter_is_okay_ function to point to the given function. */
  void SetDictFunc(DictFunc f);

+  /** Sets Dict::probability_in_context_ function to point to the given
+   * function.
+   */
+  void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
+
  /**
   * Estimates the Orientation And Script of the image.
   * @return true if the image was processed successfully.
@ -398,8 +439,26 @@ class TESSDLL_API TessBaseAPI {
  bool DetectOS(OSResults*);

  /** This method returns the features associated with the input image. */
-  void GetFeatures(INT_FEATURE_ARRAY int_features,
-                   int* num_features);
+  void GetFeaturesForBlob(TBLOB* blob, const DENORM& denorm,
+                          INT_FEATURE_ARRAY int_features,
+                          int* num_features, int* FeatureOutlineIndex);
+
+  // This method returns the row to which a box of specified dimensions would
+  // belong. If no good match is found, it returns NULL.
+  static ROW* FindRowForBox(BLOCK_LIST* blocks, int left, int top,
+                            int right, int bottom);
+
+  // Method to run adaptive classifier on a blob.
+  // It returns at most num_max_matches results.
+  void RunAdaptiveClassifier(TBLOB* blob, const DENORM& denorm,
+                             int num_max_matches,
+                             int* unichar_ids,
+                             char* configs,
+                             float* ratings,
+                             int* num_matches_returned);
+
+  // This method returns the string form of the specified unichar.
+  const char* GetUnichar(int unichar_id);

  /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
  const Dawg *GetDawg(int i) const;
@ -410,6 +469,42 @@ class TESSDLL_API TessBaseAPI {
  /** Return the language used in the last valid initialization. */
  const char* GetLastInitLanguage() const;

+  // Returns a ROW object created from the input row specification.
+  static ROW *MakeTessOCRRow(float baseline, float xheight,
+                             float descender, float ascender);
+
+  // Returns a TBLOB corresponding to the entire input image.
+  static TBLOB *MakeTBLOB(Pix *pix);
+
+  // This method baseline normalizes a TBLOB in-place. The input row is used
+  // for normalization. The denorm is an optional parameter in which the
+  // normalization-antidote is returned.
+  static void NormalizeTBLOB(TBLOB *tblob, ROW *row,
+                             bool numeric_mode, DENORM *denorm);
+
+  Tesseract* const tesseract() const {
+    return tesseract_;
+  }
+
+  // Return a pointer to underlying CubeRecoContext object if present.
+  CubeRecoContext *GetCubeRecoContext() const;
+
+  void set_min_orientation_margin(double margin);
+
+  // Return text orientation of each block as determined by an earlier run
+  // of layout analysis.
+  void GetBlockTextOrientations(int** block_orientation,
+                                bool** vertical_writing);
+
+  /** Find lines from the image making the BLOCK_LIST. */
+  BLOCK_LIST* FindLinesCreateBlockList();
+
+  /**
+   * Delete a block list.
+   * This is to keep BLOCK_LIST pointer opaque
+   * and let go of including the other headers.
+   */
+  static void DeleteBlockList(BLOCK_LIST* block_list);
 /* @} */

 protected:
@ -441,17 +536,7 @@ class TESSDLL_API TessBaseAPI {
  int TextLength(int* blob_count);

  /** @defgroup ocropusAddOns ocropus add-ons */
-
  /* @{ */
-  /** Find lines from the image making the BLOCK_LIST. */
-  BLOCK_LIST* FindLinesCreateBlockList();
-
-  /**
-   * Delete a block list.
-   * This is to keep BLOCK_LIST pointer opaque
-   * and let go of including the other headers.
-   */
-  static void DeleteBlockList(BLOCK_LIST* block_list);

  /**
   * Adapt to recognize the current image as the given character.
@ -465,9 +550,8 @@ class TESSDLL_API TessBaseAPI {
                        float ascender);

  /** Recognize text doing one pass only, using settings for a given pass. */
-  /*static*/ PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list);
-  /*static*/ PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list,
-                                    PAGE_RES* pass1_result);
+  PAGE_RES* RecognitionPass1(BLOCK_LIST* block_list);
+  PAGE_RES* RecognitionPass2(BLOCK_LIST* block_list, PAGE_RES* pass1_result);

  /**
   * Extract the OCR results, costs (penalty points for uncertainty),
@ -482,67 +566,25 @@ class TESSDLL_API TessBaseAPI {
                                    int** y1,
                                    PAGE_RES* page_res);

-  /**
-   * Call the Cube OCR engine. Takes the Region, line and word segmentation
-   * information from Tesseract as inputs. Makes changes or populates the
-   * output PAGE_RES object which contains the recogntion results.
-   * The behavior of this function depends on the
-   * current language and the value of the tessedit_accuracyvspeed:
-   * For English (and other Latin based scripts):
-   *    If the accuracyvspeed flag is set to any value other than AVS_FASTEST,
-   *    Cube uses the word information passed by Tesseract.
-   *    Cube will run on a subset of the words segmented and recognized by
-   *    Tesseract. The value of the accuracyvspeed and the Tesseract
-   *    confidence of a word determines whether Cube runs on it or not and
-   *    whether Cube's results override Tesseract's
-   * For Arabic & Hindi:
-   *    Cube uses the Region information passed by Tesseract. It then performs
-   *    its own line segmentation. This will change once Tesseract's line
-   *    segmentation works for Arabic. Cube then segments each line into
-   *    phrases. Each phrase is then recognized in phrase mode which allows
-   *    spaces in the results.
-   *    Note that at this point, the line segmentation algorithm might have
-   *    some problems with ill spaced Arabic document.
-   */
-  int Cube();
-  /** Run Cube on the lines extracted by Tesseract. */
-  int RunCubeOnLines();
-  /**
-   * Run Cube on a subset of the words already present in the page_res_ object
-   * The subset, and whether Cube overrides the results is determined by
-   * the SpeedVsAccuracy flag
-   */
-  int CubePostProcessWords();
-  /** Create a Cube line object for each line */
-  CubeLineObject **CreateLineObjects(Pixa* pixa_lines);
-  /**
-   * Create a TBox array corresponding to the phrases in the array of
-   * line objects
-   */
-  TBOX *CreatePhraseBoxes(Boxa* boxa_lines, CubeLineObject **line_objs,
-                          int *phrase_cnt);
-  /** Recognize the phrases saving the results to the page_res_ object */
-  bool RecognizePhrases(int line_cnt, int phrase_cnt,
-                        CubeLineObject **line_objs, TBOX *phrase_boxes);
-  /** Recognize a single phrase saving the results to the page_res_ object */
-  bool RecognizePhrase(CubeObject *phrase, PAGE_RES_IT *result);
-  /** Create the necessary Cube Objects */
-  bool CreateCubeObjects();
-  /* @} */
+  const PAGE_RES* GetPageRes() const {
+    return page_res_;
+  };

 protected:
-   Tesseract*        tesseract_;       ///< The underlying data object.
-   ImageThresholder* thresholder_;     ///< Image thresholding module.
-   bool              threshold_done_;  ///< Image has been passed to page_image.
-   BLOCK_LIST*       block_list_;      ///< The page layout.
-   PAGE_RES*         page_res_;        ///< The page-level data.
-   STRING*           input_file_;      ///< Name used by training code.
-   STRING*           output_file_;     ///< Name used by debug code.
-   STRING*           datapath_;        ///< Current location of tessdata.
-   STRING*           language_;        ///< Last initialized language.
+  Tesseract*        tesseract_;       ///< The underlying data object.
+  Tesseract*        osd_tesseract_;   ///< For orientation & script detection.
+  ImageThresholder* thresholder_;     ///< Image thresholding module.
+  BLOCK_LIST*       block_list_;      ///< The page layout.
+  PAGE_RES*         page_res_;        ///< The page-level data.
+  STRING*           input_file_;      ///< Name used by training code.
+  STRING*           output_file_;     ///< Name used by debug code.
+  STRING*           datapath_;        ///< Current location of tessdata.
+  STRING*           language_;        ///< Last initialized language.
+  OcrEngineMode last_oem_requested_;  ///< Last ocr language mode requested.
+  bool          recognition_done_;    ///< page_res_ contains recognition data.

-  /** 
-   * @defgroup ThresholderParams 
+  /**
+   * @defgroup ThresholderParams
   * Parameters saved from the Thresholder. Needed to rebuild coordinates.
   */
  /* @{ */
@ -555,6 +597,6 @@ class TESSDLL_API TessBaseAPI {
  /* @} */
 };

-} // namespace tesseract.
+}  // namespace tesseract.

-#endif // TESSERACT_CCMAIN_BASEAPI_H__
+#endif  // TESSERACT_API_BASEAPI_H__
--- a/api/pageiterator.cpp
+++ b/api/pageiterator.cpp
@ -0,0 +1,388 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pageiterator.cpp
+// Description: Iterator for tesseract page structure that avoids using
+//              tesseract internal data structures.
+// Author:      Ray Smith
+// Created:     Fri Feb 26 14:32:09 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "pageiterator.h"
+#include "allheaders.h"
+#include "helpers.h"
+#include "pageres.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+
+// Builds an iterator over page_res. The scale, resolution and rectangle
+// values are stored as given for later coordinate conversion. A private
+// PAGE_RES_IT is allocated and the iterator is left at the start of the
+// page, as if Begin() had just been called.
+PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
+                           int scale, int scaled_yres,
+                           int rect_left, int rect_top,
+                           int rect_width, int rect_height)
+  : page_res_(page_res),
+    tesseract_(tesseract),
+    it_(new PAGE_RES_IT(page_res)),
+    word_(NULL),
+    word_length_(0),
+    blob_index_(0),
+    cblob_it_(NULL),
+    scale_(scale),
+    scaled_yres_(scaled_yres),
+    rect_left_(rect_left),
+    rect_top_(rect_top),
+    rect_width_(rect_width),
+    rect_height_(rect_height) {
+  Begin();
+}
+
+// Frees the two helper iterators allocated by this object. cblob_it_ may
+// still be NULL here, which delete accepts.
+PageIterator::~PageIterator() {
+  delete cblob_it_;
+  delete it_;
+}
+
+// Copy constructor. Copying lets a caller walk the objects at a lower level
+// while keeping another iterator parked on an object at a higher level.
+// The copy gets its own PAGE_RES_IT duplicated from src, and BeginWord then
+// places it at the same blob offset within the current word. Begin() is
+// not called, so the position of src is preserved.
+PageIterator::PageIterator(const PageIterator& src)
+  : page_res_(src.page_res_),
+    tesseract_(src.tesseract_),
+    it_(new PAGE_RES_IT(*src.it_)),
+    word_(NULL),
+    word_length_(src.word_length_),
+    blob_index_(src.blob_index_),
+    cblob_it_(NULL),
+    scale_(src.scale_),
+    scaled_yres_(src.scaled_yres_),
+    rect_left_(src.rect_left_),
+    rect_top_(src.rect_top_),
+    rect_width_(src.rect_width_),
+    rect_height_(src.rect_height_) {
+  BeginWord(src.blob_index_);
+}
+
+// Assignment. Makes this iterator refer to the same page as src, at the
+// same word and the same blob offset within that word. The page-level
+// iterator is duplicated, never shared, so the two objects can afterwards
+// be advanced independently. Returns *this.
+const PageIterator& PageIterator::operator=(const PageIterator& src) {
+  // Self-assignment must be a no-op: the code below deletes it_ and copies
+  // from src.it_, which would be the just-freed object if &src == this.
+  if (this == &src) return *this;
+  page_res_ = src.page_res_;
+  tesseract_ = src.tesseract_;
+  scale_ = src.scale_;
+  scaled_yres_ = src.scaled_yres_;
+  rect_left_ = src.rect_left_;
+  rect_top_ = src.rect_top_;
+  rect_width_ = src.rect_width_;
+  rect_height_ = src.rect_height_;
+  // Build the replacement before releasing the old iterator, so it_ never
+  // points at freed memory if the copy fails.
+  PAGE_RES_IT* replacement = new PAGE_RES_IT(*src.it_);
+  delete it_;
+  it_ = replacement;
+  // BeginWord recomputes word_, word_length_, blob_index_ and cblob_it_
+  // from the new it_, which is why they are not copied above.
+  BeginWord(src.blob_index_);
+  return *this;
+}
+
+// ============= Moving around within the page ============.
+
+// Resets the iterator to point to the start of the page.
+void PageIterator::Begin() {
+  // Rewind the page-level iterator first: BeginWord reads it_->word() to
+  // set up the per-word state for whatever it_ now points at.
+  it_->restart_page_with_empties();
+  BeginWord(0);
+}
+
+// Advances to the start of the next object at the given level of the page
+// hierarchy. Returns false once the end of the page has been reached.
+// NOTE that RIL_SYMBOL will skip non-text blocks, but all other
+// PageIteratorLevel values will visit each non-text block once.
+// Think of a non-text block as holding a single para, with a single line,
+// with a single imaginary word.
+// Calls to Next with different levels may be freely intermixed.
+// This function iterates words in right-to-left scripts correctly, if
+// the appropriate language has been loaded into Tesseract.
+bool PageIterator::Next(PageIteratorLevel level) {
+  if (it_->block() == NULL) return false;  // Nothing left to visit.
+  // Without a current word the only possible step is a whole block.
+  if (it_->word() == NULL) level = RIL_BLOCK;
+
+  if (level == RIL_SYMBOL) {
+    if (cblob_it_ != NULL) cblob_it_->forward();
+    ++blob_index_;
+    if (blob_index_ < word_length_) return true;  // Still inside the word.
+    // Ran off the end of the word: move on to the next one.
+    it_->forward();
+  } else if (level == RIL_WORD) {
+    it_->forward_with_empties();
+  } else if (level == RIL_TEXTLINE) {
+    // Keep stepping until the row differs from the previous one.
+    do {
+      it_->forward_with_empties();
+    } while (it_->row() == it_->prev_row());
+  } else if (level == RIL_BLOCK || level == RIL_PARA) {
+    it_->forward_block();
+  }
+  // A new word (or none) is now current: reset the per-word state.
+  BeginWord(0);
+  return it_->block() != NULL;
+}
+
+// Reports whether the iterator currently sits on the first element of an
+// object at the given level. One use is to find out whether a call to
+// Next(RIL_WORD) has just moved onto the start of a RIL_PARA.
+// Returns false at the end of the page, and true whenever the current
+// block has no word (an image block).
+bool PageIterator::IsAtBeginningOf(PageIteratorLevel level) const {
+  if (it_->block() == NULL) return false;  // Already at the end!
+  if (it_->word() == NULL) return true;  // In an image block.
+  if (level == RIL_BLOCK || level == RIL_PARA)
+    return it_->block() != it_->prev_block();
+  if (level == RIL_TEXTLINE)
+    return it_->row() != it_->prev_row();
+  if (level == RIL_WORD)
+    return blob_index_ == 0;
+  if (level == RIL_SYMBOL)
+    return true;  // Every position is the start of some symbol.
+  return false;
+}
+
+// Reports whether the iterator is on the last element of a given level,
+// e.g. the last word in a line or the last line in a block.
+// Returns true when there is no current word.
+bool PageIterator::IsAtFinalElement(PageIteratorLevel level,
+                                    PageIteratorLevel element) const {
+  if (it_->word() == NULL) return true;  // Already at the end!
+  // Step a copy forward by one element. The answer is true if that lands
+  // at the end of the page, or at the beginning of *all* levels in
+  // [level, element).
+  // When element and level are more than one level apart, a single step
+  // could, for instance, advance one symbol yet remain on the first word
+  // of a line, so every intermediate level has to be checked as well.
+  PageIterator probe(*this);
+  probe.Next(element);
+  if (probe.it_->word() == NULL) return true;  // Reached the end of the page.
+  for (int lvl = element - 1; lvl >= level; --lvl) {
+    if (!probe.IsAtBeginningOf(static_cast<PageIteratorLevel>(lvl)))
+      return false;
+  }
+  return true;
+}
+
+// ============= Accessing data ==============.
+// Coordinate system:
+// Integer coordinates are at the cracks between the pixels.
+// The top-left corner of the top-left pixel in the image is at (0,0).
+// The bottom-right corner of the bottom-right pixel in the image is at
+// (width, height).
+// Every bounding box goes from the top-left of the top-left contained
+// pixel to the bottom-right of the bottom-right contained pixel, so
+// the bounding box of the single top-left pixel in the image is:
+// (0,0)->(1,1).
+// If an image rectangle has been set in the API, then returned coordinates
+// relate to the original (full) image, rather than the rectangle.
+
+// Returns the bounding rectangle of the current object at the given level.
+// See comment on coordinate system above.
+// Returns false if there is no such object at the current position.
+// The four outputs are only written when the function returns true.
+bool PageIterator::BoundingBox(PageIteratorLevel level,
+                               int* left, int* top,
+                               int* right, int* bottom) const {
+  if (it_->block() == NULL) return false;  // Already at the end!
+  // A block without a word can only be queried at RIL_BLOCK; every other
+  // level (including RIL_PARA) reports "no such object".
+  if (it_->word() == NULL && level != RIL_BLOCK) return false;
+  if (level == RIL_SYMBOL && blob_index_ >= word_length_)
+    return false;  // Zero length word, or already at the end of it.
+  TBOX box;
+  switch (level) {
+    case RIL_BLOCK:
+    case RIL_PARA:
+      box = it_->block()->block->bounding_box();
+      break;
+    case RIL_TEXTLINE:
+      box = it_->row()->row->bounding_box();
+      break;
+    case RIL_WORD:
+      box = it_->word()->word->bounding_box();
+      break;
+    case RIL_SYMBOL:
+      // cblob_it_ == NULL means the symbols come from the box_word (see
+      // BeginWord); otherwise they are the word's cblobs.
+      if (cblob_it_ == NULL)
+        box = it_->word()->box_word->BlobBox(blob_index_);
+      else
+        box = cblob_it_->data()->bounding_box();
+      // Intersect with the word box.
+      const TBOX& word_box = it_->word()->word->bounding_box();
+      if (box.overlap(word_box))
+        box -= word_box;
+      else
+        box = word_box;  // No overlap at all: fall back to the word box.
+  }
+  // Everything except a box_word symbol gets the block's re_rotation.
+  // NOTE(review): presumably box_word boxes are already in image
+  // orientation - confirm against the code that builds box_word.
+  if (level != RIL_SYMBOL || cblob_it_ != NULL)
+    box.rotate(it_->block()->block->re_rotation());
+  // Now we have a box in tesseract coordinates relative to the image rectangle,
+  // we have to convert the coords to global page coords in a top-down system.
+  // left/top round down and right/bottom round up (the "+ scale_ - 1"), and
+  // right/bottom are clipped against *left / *top so the box cannot invert.
+  *left = ClipToRange(box.left() / scale_ + rect_left_,
+                      rect_left_, rect_left_ + rect_width_);
+  *top = ClipToRange((rect_height_ - box.top()) / scale_ + rect_top_,
+                     rect_top_, rect_top_ + rect_height_);
+  *right = ClipToRange((box.right() + scale_ - 1) / scale_ + rect_left_,
+                       *left, rect_left_ + rect_width_);
+  *bottom = ClipToRange((rect_height_ - box.bottom() + scale_ - 1) / scale_
+                           + rect_top_,
+                        *top, rect_top_ + rect_height_);
+  return true;
+}
+
+// Reports the PolyBlockType of the block the iterator is on (see
+// apitypes.h). Yields PT_UNKNOWN when there is no current block, and
+// PT_FLOWING_TEXT when the block carries no poly_block.
+PolyBlockType PageIterator::BlockType() const {
+  if (it_->block() != NULL && it_->block()->block != NULL) {
+    if (it_->block()->block->poly_block() != NULL)
+      return it_->block()->block->poly_block()->isA();
+    return PT_FLOWING_TEXT;  // No layout analysis used - assume text.
+  }
+  return PT_UNKNOWN;  // Already at the end!
+}
+
+// Produces a binary image of the current object at the given level, or
+// NULL if BoundingBox reports that there is no such object.
+// The position and size match the return from BoundingBox.
+// Use pixDestroy to delete the image after use.
+// How the image is produced depends on the level:
+// RIL_BLOCK, RIL_PARA: the block mask is ANDed with the page image.
+// RIL_TEXTLINE: the rectangle of the line box is clipped from the page image.
+// TODO(rays) fix this to generate and use a line polygon.
+// RIL_WORD: the rectangle of the word box is clipped from the page image.
+// RIL_SYMBOL: the symbol outline is rendered to an image for cblobs (prior
+// to recognition); otherwise the bounding box is clipped.
+// A reconstruction of the original image (using xor to check for double
+// representation) should be reasonably accurate, apart from removed noise,
+// at the block level. Below the block level, the reconstruction will be
+// missing images and line separators.
+// At the symbol level, kerned characters will invade the bounding box if
+// rendered after recognition, making an xor reconstruction inaccurate, but
+// an or reconstruction better. Before recognition, symbol-level
+// reconstruction should be good, even with xor, since the images come from
+// the connected components.
+Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
+  int left, top, right, bottom;
+  if (!BoundingBox(level, &left, &top, &right, &bottom))
+    return NULL;
+  Pix* result = NULL;
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    result = it_->block()->block->render_mask();
+    // AND the mask and the image.
+    pixRasterop(result, 0, 0, pixGetWidth(result), pixGetHeight(result),
+                PIX_SRC & PIX_DST, tesseract_->pix_binary(),
+                left, top);
+  } else if (level == RIL_TEXTLINE || level == RIL_WORD ||
+             level == RIL_SYMBOL) {
+    // Before recognition a symbol is a cblob and can render itself.
+    if (level == RIL_SYMBOL && cblob_it_ != NULL)
+      return cblob_it_->data()->render();
+    // Just clip from the bounding box.
+    Box* clip = boxCreate(left, top, right - left, bottom - top);
+    result = pixClipRectangle(tesseract_->pix_binary(), clip, NULL);
+    boxDestroy(&clip);
+  }
+  return result;
+}
+
+// Produces an image of the current object at the given level, in greyscale
+// when the input had a grey image. When there is none, the result of
+// GetBinaryImage is returned instead; call GetBinaryImage directly to
+// guarantee a binary image.
+// NOTE that in order to give the best possible image, the bounds are
+// expanded slightly over the binary connected component, by the supplied
+// padding, so the top-left position of the returned image is returned
+// in (left,top). These will most likely not match the coordinates
+// returned by BoundingBox.
+// Returns NULL if there is no object at the current position.
+// Use pixDestroy to delete the image after use.
+Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
+                            int* left, int* top) const {
+  int right, bottom;
+  if (!BoundingBox(level, left, top, &right, &bottom))
+    return NULL;
+  Pix* source = tesseract_->pix_grey();
+  if (source == NULL)
+    return GetBinaryImage(level);
+
+  // Grow the box by padding on every side, clamped as below.
+  // NOTE(review): BoundingBox adds rect_left_/rect_top_ to its outputs,
+  // while the right/bottom limits here are rect_width_/rect_height_ with
+  // no such offset - confirm this is intended when a rectangle is set.
+  *left = MAX(*left - padding, 0);
+  *top = MAX(*top - padding, 0);
+  right = MIN(right + padding, rect_width_);
+  bottom = MIN(bottom + padding, rect_height_);
+  const int clip_width = right - *left;
+  const int clip_height = bottom - *top;
+  Box* clip_box = boxCreate(*left, *top, clip_width, clip_height);
+  Pix* clipped = pixClipRectangle(source, clip_box, NULL);
+  boxDestroy(&clip_box);
+  if (level != RIL_BLOCK && level != RIL_PARA)
+    return clipped;
+
+  // For blocks, set everything outside the (dilated) block mask to 255.
+  Pix* block_mask = it_->block()->block->render_mask();
+  Pix* padded_mask = pixCreate(clip_width, clip_height, 1);
+  pixRasterop(padded_mask, padding, padding,
+              pixGetWidth(block_mask), pixGetHeight(block_mask),
+              PIX_SRC, block_mask, 0, 0);
+  pixDestroy(&block_mask);
+  const int brick = 2*padding + 1;
+  pixDilateBrick(padded_mask, padded_mask, brick, brick);
+  pixInvert(padded_mask, padded_mask);
+  pixSetMasked(clipped, padded_mask, 255);
+  pixDestroy(&padded_mask);
+  return clipped;
+}
+
+
+// Computes the baseline of the current object at the given level.
+// The baseline is the line that passes through (x1, y1) and (x2, y2).
+// For RIL_WORD and RIL_SYMBOL the end points span the word box; for every
+// other level they span the row box. The row's base_line is used in both
+// cases.
+// WARNING: with vertical text, baselines may be vertical!
+// Returns false, leaving the outputs untouched, if there is no current word.
+bool PageIterator::Baseline(PageIteratorLevel level,
+                            int* x1, int* y1, int* x2, int* y2) const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  ROW* row = it_->row()->row;
+  TBOX extent;
+  if (level == RIL_WORD || level == RIL_SYMBOL)
+    extent = it_->word()->word->bounding_box();
+  else
+    extent = row->bounding_box();
+  const int x_begin = extent.left();
+  const int x_end = extent.right();
+  // Sample the row baseline at both ends, rounding to the nearest integer.
+  ICOORD startpt(x_begin, static_cast<inT16>(row->base_line(x_begin) + 0.5));
+  ICOORD endpt(x_end, static_cast<inT16>(row->base_line(x_end) + 0.5));
+  // Rotate to image coordinates and convert to global image coords.
+  startpt.rotate(it_->block()->block->re_rotation());
+  endpt.rotate(it_->block()->block->re_rotation());
+  *x1 = startpt.x() / scale_ + rect_left_;
+  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
+  *x2 = endpt.x() / scale_ + rect_left_;
+  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
+  return true;
+}
+
+// Prepares the per-word state (word_, word_length_, cblob_it_) for the word
+// that it_ currently points at, then advances blob_index_ (and cblob_it_,
+// when in use) to the given offset within that word.
+void PageIterator::BeginWord(int offset) {
+  WERD_RES* current = it_->word();
+  if (current == NULL) {
+    // This is a non-text block, so there is no word.
+    word_ = NULL;
+    word_length_ = 0;
+    blob_index_ = 0;
+    return;
+  }
+  if (current->best_choice == NULL) {
+    // No recognition yet, so a "symbol" is a cblob.
+    word_ = current->word;
+    ASSERT_HOST(word_->cblob_list() != NULL);
+    word_length_ = word_->cblob_list()->length();
+    if (cblob_it_ == NULL) cblob_it_ = new C_BLOB_IT;
+    cblob_it_->set_to_list(word_->cblob_list());
+  } else {
+    // Recognition has been done, so we are using the box_word, which
+    // is already baseline denormalized.
+    word_length_ = current->best_choice->length();
+    ASSERT_HOST(current->box_word != NULL);
+    ASSERT_HOST(current->box_word->length() == word_length_);
+    word_ = NULL;
+    // We will be iterating the box_word, so no cblob iterator is needed.
+    delete cblob_it_;
+    cblob_it_ = NULL;
+  }
+  blob_index_ = 0;
+  while (blob_index_ < offset) {
+    if (cblob_it_ != NULL)
+      cblob_it_->forward();
+    ++blob_index_;
+  }
+}
+
+}  // namespace tesseract.
--- a/api/pageiterator.h
+++ b/api/pageiterator.h
@ -0,0 +1,184 @@
+///////////////////////////////////////////////////////////////////////
+// File:        pageiterator.h
+// Description: Iterator for tesseract page structure that avoids using
+//              tesseract internal data structures.
+// Author:      Ray Smith
+// Created:     Fri Feb 26 11:01:06 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_API_PAGEITERATOR_H__
+#define TESSERACT_API_PAGEITERATOR_H__
+
+#include "apitypes.h"
+
+class C_BLOB_IT;
+class PBLOB_IT;
+class PAGE_RES;
+class PAGE_RES_IT;
+class WERD;
+struct Pix;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract page structure, providing access to all
+// levels of the page hierarchy, without including any tesseract headers or
+// having to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See apitypes.h for the definition of PageIteratorLevel.
+// See also ResultIterator, derived from PageIterator, which adds in the
+// ability to access OCR output with text-specific methods.
+
+class PageIterator {
+ public:
+  // page_res and tesseract come directly from the BaseAPI.
+  // The rectangle parameters are copied indirectly from the Thresholder,
+  // via the BaseAPI. They represent the coordinates of some rectangle in an
+  // original image (in top-left-origin coordinates) and therefore the top-left
+  // needs to be added to any output boxes in order to specify coordinates
+  // in the original image. See TessBaseAPI::SetRectangle.
+  // The scale and scaled_yres are in case the Thresholder scaled the image
+  // rectangle prior to thresholding. Any coordinates in tesseract's image
+  // must be divided by scale before adding (rect_left, rect_top).
+  // The scaled_yres indicates the effective resolution of the binary image
+  // that tesseract has been given by the Thresholder.
+  // After the constructor, Begin has already been called.
+  PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
+               int scale, int scaled_yres,
+               int rect_left, int rect_top,
+               int rect_width, int rect_height);
+  virtual ~PageIterator();
+
+  // Page/ResultIterators may be copied! This makes it possible to iterate over
+  // all the objects at a lower level, while maintaining an iterator to
+  // objects at a higher level. Neither the copy constructor nor the
+  // assignment operator calls Begin, so iteration continues from the
+  // location of src.
+  PageIterator(const PageIterator& src);
+  const PageIterator& operator=(const PageIterator& src);
+
+  // ============= Moving around within the page ============.
+
+  // Moves the iterator to point to the start of the page to begin an iteration.
+  void Begin();
+
+  // Moves to the start of the next object at the given level in the
+  // page hierarchy, and returns false if the end of the page was reached.
+  // NOTE that RIL_SYMBOL will skip non-text blocks, but all other
+  // PageIteratorLevel level values will visit each non-text block once.
+  // Think of non text blocks as containing a single para, with a single line,
+  // with a single imaginary word.
+  // Calls to Next with different levels may be freely intermixed.
+  // This function iterates words in right-to-left scripts correctly, if
+  // the appropriate language has been loaded into Tesseract.
+  bool Next(PageIteratorLevel level);
+
+  // Returns true if the iterator is at the start of an object at the given
+  // level. Possible uses include determining if a call to Next(RIL_WORD)
+  // moved to the start of a RIL_PARA.
+  bool IsAtBeginningOf(PageIteratorLevel level) const;
+
+  // Returns whether the iterator is positioned at the last element in a
+  // given level. (e.g. the last word in a line, the last line in a block)
+  bool IsAtFinalElement(PageIteratorLevel level,
+                        PageIteratorLevel element) const;
+
+  // ============= Accessing data ==============.
+  // Coordinate system:
+  // Integer coordinates are at the cracks between the pixels.
+  // The top-left corner of the top-left pixel in the image is at (0,0).
+  // The bottom-right corner of the bottom-right pixel in the image is at
+  // (width, height).
+  // Every bounding box goes from the top-left of the top-left contained
+  // pixel to the bottom-right of the bottom-right contained pixel, so
+  // the bounding box of the single top-left pixel in the image is:
+  // (0,0)->(1,1).
+  // If an image rectangle has been set in the API, then returned coordinates
+  // relate to the original (full) image, rather than the rectangle.
+
+  // Returns the bounding rectangle of the current object at the given level.
+  // See comment on coordinate system above.
+  // Returns false if there is no such object at the current position.
+  // The returned bounding box is guaranteed to match the size and position
+  // of the image returned by GetBinaryImage, but may clip foreground pixels
+  // from a grey image. The padding argument to GetImage can be used to expand
+  // the image to include more foreground pixels. See GetImage below.
+  bool BoundingBox(PageIteratorLevel level,
+                   int* left, int* top, int* right, int* bottom) const;
+
+  // Returns the type of the current block. See apitypes.h for PolyBlockType.
+  PolyBlockType BlockType() const;
+
+  // Returns a binary image of the current object at the given level.
+  // The position and size match the return from BoundingBox.
+  // Use pixDestroy to delete the image after use.
+  Pix* GetBinaryImage(PageIteratorLevel level) const;
+
+  // Returns an image of the current object at the given level in greyscale
+  // if available in the input. To guarantee a binary image use
+  // GetBinaryImage.
+  // NOTE that in order to give the best possible image, the bounds are
+  // expanded slightly over the binary connected component, by the supplied
+  // padding, so the top-left position of the returned image is returned
+  // in (left,top). These will most likely not match the coordinates
+  // returned by BoundingBox.
+  // Use pixDestroy to delete the image after use.
+  Pix* GetImage(PageIteratorLevel level, int padding,
+                int* left, int* top) const;
+
+  // Returns the baseline of the current object at the given level.
+  // The baseline is the line that passes through (x1, y1) and (x2, y2).
+  // WARNING: with vertical text, baselines may be vertical!
+  // Returns false if there is no baseline at the current position.
+  bool Baseline(PageIteratorLevel level,
+                int* x1, int* y1, int* x2, int* y2) const;
+
+ protected:
+  // Sets up the internal data for iterating the blobs of a new word, then
+  // moves the iterator to the given offset.
+  void BeginWord(int offset);
+
+  // Pointer to the page_res owned by the API.
+  PAGE_RES* page_res_;
+  // Pointer to the Tesseract object owned by the API.
+  Tesseract* tesseract_;
+  // The iterator to the page_res_. Owned by this PageIterator.
+  // A pointer just to avoid dragging in Tesseract includes.
+  PAGE_RES_IT* it_;
+  // The current input WERD being iterated. If there is an output from OCR,
+  // then word_ is NULL. Owned by the API.
+  WERD* word_;
+  // The number of symbols in the current word: the count of cblobs before
+  // recognition, or the length of the best choice after it.
+  int word_length_;
+  // The current blob index within the word.
+  int blob_index_;
+  // Iterator to the blobs within the word. If NULL, then we are iterating
+  // OCR results in the box_word.
+  // Owned by this PageIterator.
+  C_BLOB_IT* cblob_it_;
+  // Parameters saved from the Thresholder. Needed to rebuild coordinates.
+  int scale_;
+  int scaled_yres_;
+  int rect_left_;
+  int rect_top_;
+  int rect_width_;
+  int rect_height_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_API_PAGEITERATOR_H__
--- a/api/resultiterator.cpp
+++ b/api/resultiterator.cpp
@ -0,0 +1,249 @@
+///////////////////////////////////////////////////////////////////////
+// File:        resultiterator.cpp
+// Description: Iterator for tesseract results that avoids using tesseract
+//              internal data structures
+// Author:      Ray Smith
+// Created:     Fri Feb 26 14:32:09 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "resultiterator.h"
+#include "allheaders.h"
+#include "pageres.h"
+#include "tesseractclass.h"
+
+namespace tesseract {
+
+// Forwards every argument unchanged to the PageIterator constructor; this
+// constructor does no further work of its own.
+ResultIterator::ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
+                               int scale, int scaled_yres,
+                               int rect_left, int rect_top,
+                               int rect_width, int rect_height)
+  : PageIterator(page_res, tesseract,
+                 scale, scaled_yres,
+                 rect_left, rect_top,
+                 rect_width, rect_height) {
+}
+
+// Empty body: this destructor performs no cleanup of its own.
+ResultIterator::~ResultIterator() {
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// object at the given level. Use delete [] to free after use.
+// For RIL_BLOCK/RIL_PARA and RIL_TEXTLINE the words are concatenated, each
+// followed by "\n" if the word carries the W_EOL flag and by " " otherwise.
+// Returns NULL if there is no current word, or if level is RIL_SYMBOL and
+// the symbol index is past the end of the word (the same condition under
+// which BoundingBox reports that there is no object).
+char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  if (level == RIL_SYMBOL && blob_index_ >= word_length_)
+    return NULL;  // Zero length word, or already at the end of it.
+  STRING text;
+  // Work on a copy so that collecting the text does not move this iterator.
+  PAGE_RES_IT res_it(*it_);
+  WERD_CHOICE* best_choice = res_it.word()->best_choice;
+  ASSERT_HOST(best_choice != NULL);
+  switch (level) {
+    case RIL_BLOCK:
+    case RIL_PARA:
+      // Gather words until the copy steps into a different block.
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        text += best_choice->unichar_string();
+        text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
+        res_it.forward();
+      } while (res_it.block() == res_it.prev_block());
+      break;
+    case RIL_TEXTLINE:
+      // Gather words until the copy steps into a different row.
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        text += best_choice->unichar_string();
+        text += res_it.word()->word->flag(W_EOL) ? "\n" : " ";
+        res_it.forward();
+      } while (res_it.row() == res_it.prev_row());
+      break;
+    case RIL_WORD:
+      text = best_choice->unichar_string();
+      break;
+    case RIL_SYMBOL:
+      text = tesseract_->unicharset.id_to_unichar(
+          best_choice->unichar_id(blob_index_));
+  }
+  // Hand back a heap copy (including the terminator) that the caller owns.
+  int length = text.length() + 1;
+  char* result = new char[length];
+  strncpy(result, text.string(), length);
+  return result;
+}
+
+// Returns the mean confidence of the current object at the given level.
+// The number should be interpreted as a percent probability. (0.0f-100.0f)
+// Word certainties are averaged over the words of the block or line, then
+// mapped with 100 + 5 * certainty and clamped to [0, 100].
+// Returns 0.0f if there is no current word, or if level is RIL_SYMBOL and
+// the symbol index is past the end of the word (the same condition under
+// which BoundingBox reports that there is no object).
+float ResultIterator::Confidence(PageIteratorLevel level) const {
+  if (it_->word() == NULL) return 0.0f;  // Already at the end!
+  if (level == RIL_SYMBOL && blob_index_ >= word_length_)
+    return 0.0f;  // Zero length word, or already at the end of it.
+  float mean_certainty = 0.0f;
+  int certainty_count = 0;
+  // Work on a copy so that averaging does not move this iterator.
+  PAGE_RES_IT res_it(*it_);
+  WERD_CHOICE* best_choice = res_it.word()->best_choice;
+  ASSERT_HOST(best_choice != NULL);
+  switch (level) {
+    case RIL_BLOCK:
+    case RIL_PARA:
+      // Accumulate until the copy steps into a different block.
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        mean_certainty += best_choice->certainty();
+        ++certainty_count;
+        res_it.forward();
+      } while (res_it.block() == res_it.prev_block());
+      break;
+    case RIL_TEXTLINE:
+      // Accumulate until the copy steps into a different row.
+      do {
+        best_choice = res_it.word()->best_choice;
+        ASSERT_HOST(best_choice != NULL);
+        mean_certainty += best_choice->certainty();
+        ++certainty_count;
+        res_it.forward();
+      } while (res_it.row() == res_it.prev_row());
+      break;
+    case RIL_WORD:
+      mean_certainty += best_choice->certainty();
+      ++certainty_count;
+      break;
+    case RIL_SYMBOL: {
+      BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
+      if (choices != NULL) {
+        // Step to the choice list belonging to the current symbol.
+        BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
+        for (int blob = 0; blob < blob_index_; ++blob)
+          blob_choices_it.forward();
+        // Look for the entry whose unichar matches the best choice.
+        BLOB_CHOICE_IT choice_it(blob_choices_it.data());
+        for (choice_it.mark_cycle_pt();
+             !choice_it.cycled_list();
+             choice_it.forward()) {
+          if (choice_it.data()->unichar_id() ==
+              best_choice->unichar_id(blob_index_))
+            break;
+        }
+        // NOTE(review): if no entry matched, the loop ended by cycling and
+        // this reads whichever entry the iterator came to rest on - confirm
+        // that is the intended fallback.
+        mean_certainty += choice_it.data()->certainty();
+      } else {
+        // No per-symbol choices kept: use the certainty of the whole word.
+        mean_certainty += best_choice->certainty();
+      }
+      ++certainty_count;
+    }
+  }
+  if (certainty_count > 0) {
+    mean_certainty /= certainty_count;
+    float confidence = 100 + 5 * mean_certainty;
+    if (confidence < 0.0f) confidence = 0.0f;
+    if (confidence > 100.0f) confidence = 100.0f;
+    return confidence;
+  }
+  return 0.0f;
+}
+
+// Reports the font attributes of the current word. When iterating at a
+// level above words, e.g. textlines, the attributes are those of the first
+// word in that textline.
+// The return value is a string holding a font name. It points into an
+// internal table and SHOULD NOT BE DELETED. Its lifespan is that of the
+// iterator itself, i.e. it is invalidated by various members of
+// TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+// Pointsize is returned in printers points (1/72 inch.)
+// Returns NULL if there is no current word (no output is written) or if
+// the word's font id is negative (only *font_id is written).
+const char* ResultIterator::WordFontAttributes(bool* is_bold,
+                                               bool* is_italic,
+                                               bool* is_underlined,
+                                               bool* is_monospace,
+                                               bool* is_serif,
+                                               int* pointsize,
+                                               int* font_id) const {
+  if (it_->word() == NULL) return NULL;  // Already at the end!
+  *font_id = it_->word()->font1;
+  if (*font_id < 0) return NULL;  // No font available.
+
+  const UnicityTable<FontInfo> &font_table = tesseract_->get_fontinfo_table();
+  FontInfo font_info = font_table.get(*font_id);
+
+  // The font size is derived from the row's x-height, scaled by the
+  // block's cell-over-xheight factor.
+  float row_height = it_->row()->row->x_height() *
+      it_->block()->block->cell_over_xheight();
+  // Convert from pixels to printers points; 0 if the resolution is unknown.
+  if (scaled_yres_ > 0) {
+    *pointsize =
+        static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5);
+  } else {
+    *pointsize = 0;
+  }
+
+  *is_serif = font_info.is_serif();
+  *is_monospace = font_info.is_fixed_pitch();
+  *is_underlined = false;  // TODO(rays) fix this!
+  *is_italic = font_info.is_italic();
+  *is_bold = font_info.is_bold();
+  return font_info.name;
+}
+
+// Returns true if the current word was found in a dictionary, i.e. its
+// best choice carries one of the system, frequent-word or user DAWG
+// permuter codes. Returns false if there is no current word or the word
+// has no best choice.
+bool ResultIterator::WordIsFromDictionary() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  // best_choice is NULL for a word that has not been recognized; treat
+  // that as "not from a dictionary" rather than dereferencing it.
+  const WERD_CHOICE* best_choice = it_->word()->best_choice;
+  if (best_choice == NULL) return false;
+  int permuter = best_choice->permuter();
+  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
+         permuter == USER_DAWG_PERM;
+}
+
+// Returns true if the current word is numeric, i.e. its best choice
+// carries the NUMBER_PERM permuter code. Returns false if there is no
+// current word or the word has no best choice.
+bool ResultIterator::WordIsNumeric() const {
+  if (it_->word() == NULL) return false;  // Already at the end!
+  // best_choice is NULL for a word that has not been recognized; treat
+  // that as "not numeric" rather than dereferencing it.
+  const WERD_CHOICE* best_choice = it_->word()->best_choice;
+  if (best_choice == NULL) return false;
+  return best_choice->permuter() == NUMBER_PERM;
+}
+
+// Builds an iterator over the alternative choices for the symbol that
+// result_it is currently on. result_it must be on a word (asserted).
+// If the word has no best choice, or the best choice keeps no per-blob
+// choice lists, choice_it_ stays NULL and the iterator is empty: Next
+// returns false, GetUTF8Text returns NULL and Confidence returns 0.
+ChoiceIterator::ChoiceIterator(const ResultIterator& result_it) {
+  ASSERT_HOST(result_it.it_->word() != NULL);
+  tesseract_ = result_it.tesseract_;
+  choice_it_ = NULL;
+  // Read the word directly; no copy of the page iterator is needed just
+  // to look at the current word.
+  WERD_CHOICE* best_choice = result_it.it_->word()->best_choice;
+  if (best_choice == NULL) return;  // Word has not been recognized.
+  BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
+  if (choices == NULL) return;  // No alternatives were kept.
+  // Step to the choice list belonging to the current symbol.
+  BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
+  for (int blob = 0; blob < result_it.blob_index_; ++blob)
+    blob_choices_it.forward();
+  choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
+  // Mark the start so that Next can tell when the list has been cycled.
+  choice_it_->mark_cycle_pt();
+}
+
+// Frees the list iterator allocated by the constructor. choice_it_ is NULL
+// when no choices were available, which delete accepts.
+ChoiceIterator::~ChoiceIterator() {
+  delete choice_it_;
+}
+
+// Advances to the next choice for the symbol. Returns false when the list
+// has been cycled through, or when there were no choices to begin with.
+bool ChoiceIterator::Next() {
+  if (choice_it_ != NULL) {
+    choice_it_->forward();
+    return !choice_it_->cycled_list();
+  }
+  return false;
+}
+
+// Returns the null terminated UTF-8 encoded text string for the current
+// choice.
+// NOTE: unlike ResultIterator::GetUTF8Text, no copy is made here. The
+// pointer is returned straight from the unicharset, so it must NOT be
+// passed to delete [].
+// Returns NULL if there are no choices or the choice's unichar id is
+// invalid or out of range for the unicharset.
+const char* ChoiceIterator::GetUTF8Text() const {
+  if (choice_it_ == NULL)
+    return NULL;
+  UNICHAR_ID id = choice_it_->data()->unichar_id();
+  if (id < 0 || id >= tesseract_->unicharset.size() ||
+      id == INVALID_UNICHAR_ID)
+    return NULL;
+  return tesseract_->unicharset.id_to_unichar(id);
+}
+
+// Reports the confidence of the current choice, computed as
+// 100 + 5 * certainty and clamped to the range 0.0f-100.0f.
+// The number should be interpreted as a percent probability.
+// Returns 0.0f when there are no choices.
+float ChoiceIterator::Confidence() const {
+  if (choice_it_ == NULL)
+    return 0.0f;
+  const float raw = 100 + 5 * choice_it_->data()->certainty();
+  if (raw < 0.0f) return 0.0f;
+  if (raw > 100.0f) return 100.0f;
+  return raw;
+}
+
+
+}  // namespace tesseract.
--- a/api/resultiterator.h
+++ b/api/resultiterator.h
@ -0,0 +1,144 @@
+///////////////////////////////////////////////////////////////////////
+// File:        resultiterator.h
+// Description: Iterator for tesseract results that avoids using tesseract
+//              internal data structures.
+// Author:      Ray Smith
+// Created:     Fri Feb 26 11:01:06 PST 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_API_RESULTITERATOR_H__
+#define TESSERACT_API_RESULTITERATOR_H__
+
+#include "pageiterator.h"
+
+class BLOB_CHOICE_IT;
+
+namespace tesseract {
+
+class Tesseract;
+
+// Class to iterate over tesseract results, providing access to all levels
+// of the page hierarchy, without including any tesseract headers or having
+// to handle any tesseract structures.
+// WARNING! This class points to data held within the TessBaseAPI class, and
+// therefore can only be used while the TessBaseAPI class still exists and
+// has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
+// DetectOS, or anything else that changes the internal PAGE_RES.
+// See apitypes.h for the definition of PageIteratorLevel.
+// See also base class PageIterator, which contains the bulk of the interface.
+// ResultIterator adds text-specific methods for access to OCR output.
+
+class ResultIterator : public PageIterator {
+  friend class ChoiceIterator;
+ public:
+  // page_res and tesseract come directly from the BaseAPI.
+  // The rectangle parameters are copied indirectly from the Thresholder,
+  // via the BaseAPI. They represent the coordinates of some rectangle in an
+  // original image (in top-left-origin coordinates) and therefore the top-left
+  // needs to be added to any output boxes in order to specify coordinates
+  // in the original image. See TessBaseAPI::SetRectangle.
+  // The scale and scaled_yres are in case the Thresholder scaled the image
+  // rectangle prior to thresholding. Any coordinates in tesseract's image
+  // must be divided by scale before adding (rect_left, rect_top).
+  // The scaled_yres indicates the effective resolution of the binary image
+  // that tesseract has been given by the Thresholder.
+  // After the constructor, Begin has already been called.
+  ResultIterator(PAGE_RES* page_res, Tesseract* tesseract,
+                 int scale, int scaled_yres,
+                 int rect_left, int rect_top,
+                 int rect_width, int rect_height);
+  virtual ~ResultIterator();
+
+  // ResultIterators may be copied! This makes it possible to iterate over
+  // all the objects at a lower level, while maintaining an iterator to
+  // objects at a higher level. These constructors DO NOT CALL Begin, so
+  // iterations will continue from the location of src.
+  // TODO: For now the copy constructor and operator= only need the base class
+  // versions, but if new data members are added, don't forget to add them!
+
+  // ============= Moving around within the page ============.
+
+  // See PageIterator.
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // object at the given level. Use delete [] to free after use.
+  char* GetUTF8Text(PageIteratorLevel level) const;
+
+  // Returns the mean confidence of the current object at the given level.
+  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  float Confidence(PageIteratorLevel level) const;
+
+  // ============= Functions that refer to words only ============.
+
+  // Returns the font attributes of the current word. If iterating at a higher
+  // level object than words, eg textlines, then this will return the
+  // attributes of the first word in that textline.
+  // The actual return value is a string representing a font name. It points
+  // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
+  // the iterator itself, ie rendered invalid by various members of
+  // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
+  // Pointsize is returned in printers points (1/72 inch.)
+  const char* WordFontAttributes(bool* is_bold,
+                                 bool* is_italic,
+                                 bool* is_underlined,
+                                 bool* is_monospace,
+                                 bool* is_serif,
+                                 int* pointsize,
+                                 int* font_id) const;
+
+  // Returns true if the current word was found in a dictionary.
+  bool WordIsFromDictionary() const;
+
+  // Returns true if the current word is numeric.
+  bool WordIsNumeric() const;
+};
+
+// Class to iterate over the classifier choices for a single RIL_SYMBOL.
+// The iterator owns a heap-allocated BLOB_CHOICE_IT, so it cannot be copied.
+class ChoiceIterator {
+ public:
+  // Construction is from a ResultIterator that points to the symbol of
+  // interest. The ChoiceIterator allows a one-shot iteration over the
+  // choices for this symbol and after that it is useless.
+  explicit ChoiceIterator(const ResultIterator& result_it);
+  ~ChoiceIterator();
+
+  // Moves to the next choice for the symbol and returns false if there
+  // are none left.
+  bool Next();
+
+  // ============= Accessing data ==============.
+
+  // Returns the null terminated UTF-8 encoded text string for the current
+  // choice, or NULL if there is no valid choice.
+  // NOTE: Unlike ResultIterator::GetUTF8Text, the return points to an
+  // internal structure and should NOT be delete[]ed to free after use.
+  const char* GetUTF8Text() const;
+
+  // Returns the confidence of the current choice.
+  // The number should be interpreted as a percent probability. (0.0f-100.0f)
+  // Returns 0.0f if there are no choices.
+  float Confidence() const;
+
+ private:
+  // Copying is disabled (declared here, never defined): the destructor
+  // deletes choice_it_, so a member-wise copy would delete it twice.
+  ChoiceIterator(const ChoiceIterator& src);
+  ChoiceIterator& operator=(const ChoiceIterator& src);
+
+  // Pointer to the Tesseract object owned by the API.
+  Tesseract* tesseract_;
+  // Iterator over the blob choices. NULL if the symbol has no choices.
+  BLOB_CHOICE_IT* choice_it_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_API_RESULTITERATOR_H__
--- a/api/tesseractmain.cpp
+++ b/api/tesseractmain.cpp
@ -1,21 +1,21 @@
 /**********************************************************************
- * File:        tessedit.cpp  (Formerly tessedit.c)
- * Description: Main program for merge of tess and editor.
- * Author:                  Ray Smith
- * Created:                 Tue Jan 07 15:21:46 GMT 1992
- *
- * (C) Copyright 1992, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
+* File:        tessedit.cpp  (Formerly tessedit.c)
+* Description: Main program for merge of tess and editor.
+* Author:                  Ray Smith
+* Created:                 Tue Jan 07 15:21:46 GMT 1992
+*
+* (C) Copyright 1992, Hewlett-Packard Ltd.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*
+**********************************************************************/

 #include "mfcpch.h"
 //#define USE_VLD //Uncomment for Visual Leak Detector.
@ -23,7 +23,6 @@
 #include <vld.h>
 #endif
 #include <ctype.h>
-#include "applybox.h"
 #include "control.h"
 #include "tessvars.h"
 #include "tessedit.h"
@ -31,18 +30,16 @@
 #include "thresholder.h"
 #include "pageres.h"
 #include "imgs.h"
-#include "varabled.h"
+#include "params.h"
+#include "paramsd.h"
 #include "tprintf.h"
 #include "tesseractmain.h"
 #include "stderr.h"
 #include "notdll.h"
-#include "mainblk.h"
 #include "output.h"
 #include "globals.h"
-#include "helpers.h"
 #include "blread.h"
 #include "tfacep.h"
-#include "callnet.h"

 // Include automatically generated configuration file if running autoconf
 #ifdef HAVE_CONFIG_H
@ -55,33 +52,15 @@
 #else
 #define _(x) (x)
 #endif
-#ifdef HAVE_LIBTIFF
-#include "tiffio.h"
+#ifndef HAVE_LIBLEPT
+#error "Sorry: Tesseract no longer compiles or runs without Leptonica!";
 #endif
-#ifdef HAVE_LIBLEPT
 #include "allheaders.h"
-#else
-class Pix;
-#endif

-#ifdef _TIFFIO_
-void read_tiff_image(TIFF* tif, IMAGE* image);
-#endif

 #define VARDIR        "configs/" /*variables files */
                                 //config under api
 #define API_CONFIG      "configs/api_config"
-#define EXTERN
-
-BOOL_VAR(tessedit_create_boxfile, FALSE, "Output text with boxes");
-BOOL_VAR(tessedit_create_hocr, FALSE, "Output HTML with hOCR markup");
-BOOL_VAR(tessedit_read_image, TRUE, "Ensure the image is read");
-INT_VAR(tessedit_serial_unlv, 0,
-        "0->Whole page, 1->serial no adapt, 2->serial with adapt");
-INT_VAR(tessedit_page_number, -1,
-        "-1 -> All pages, else specific page to process");
-BOOL_VAR(tessedit_write_images, FALSE, "Capture the image from the IPE");
-BOOL_VAR(tessedit_debug_to_screen, FALSE, "Dont use debug file");

 const int kMaxIntSize = 22;
 char szAppName[] = "Tessedit";   //app name
@ -112,444 +91,373 @@ char szAppName[] = "Tessedit";   //app name
 // the value of input_file is ignored - ugly, but true - a consequence of
 // the way that unlv zone file reading takes the place of a page layout
 // analyzer.
-void TesseractImage(const char* input_file, IMAGE* image, Pix* pix, int page_index,
+void TesseractImage(const char* input_file, Pix* pix, int page_index,
                    tesseract::TessBaseAPI* api, STRING* text_out) {
-  api->SetInputName(input_file);
-#ifdef HAVE_LIBLEPT
-  if (pix != NULL) {
-    api->SetImage(pix);
-  } else {
-#endif
-    int bytes_per_line = check_legal_image_size(image->get_xsize(),
-                                                image->get_ysize(),
-                                                image->get_bpp());
-    api->SetImage(image->get_buffer(), image->get_xsize(), image->get_ysize(),
-                  image->get_bpp() / 8, bytes_per_line);
-#ifdef HAVE_LIBLEPT
-  }
-#endif
-  if (tessedit_serial_unlv == 0) {
-    char* text;
-    if (tessedit_create_boxfile)
-      text = api->GetBoxText(page_index);
-    else if (tessedit_write_unlv)
-      text = api->GetUNLVText();
-    else if (tessedit_create_hocr)
-      text = api->GetHOCRText(page_index + 1);
-    else
-      text = api->GetUTF8Text();
-    *text_out += text;
-    delete [] text;
-  } else {
-    BLOCK_LIST blocks;
-    STRING filename = input_file;
-    const char* lastdot = strrchr(filename.string(), '.');
-    if (lastdot != NULL) {
-      filename[lastdot - filename.string()] = '\0';
-    }
-    if (!read_unlv_file(filename, image->get_xsize(), image->get_ysize(),
-                        &blocks)) {
-      fprintf(stderr, _("Error: Must have a unlv zone file %s to read!\n"),
-              filename.string());
-      return;
-    }
-    BLOCK_IT b_it = &blocks;
-    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
-      BLOCK* block = b_it.data();
-      TBOX box = block->bounding_box();
-      api->SetRectangle(box.left(), image->get_ysize() - box.top(),
-                        box.width(), box.height());
-      char* text = api->GetUNLVText();
-      *text_out += text;
-      delete [] text;
-      if (tessedit_serial_unlv == 1)
-        api->ClearAdaptiveClassifier();
-    }
-  }
-  if (tessedit_write_images) {
-    page_image.write("tessinput.tif");
-  }
+	api->SetInputName(input_file);
+	api->SetImage(pix);
+	int serial_unlv;
+	ASSERT_HOST(api->GetIntVariable("tessedit_serial_unlv", &serial_unlv));
+	if (serial_unlv == 0) {
+		char* text;
+		bool bool_value;
+		if ((api->GetBoolVariable("tessedit_create_boxfile", &bool_value) &&
+		     bool_value) ||
+		    (api->GetBoolVariable("tessedit_make_boxes_from_boxes", &bool_value) &&
+		     bool_value)) {
+			text = api->GetBoxText(page_index);
+		} else if (api->GetBoolVariable("tessedit_write_unlv", &bool_value) &&
+		           bool_value) {
+			text = api->GetUNLVText();
+		} else if (api->GetBoolVariable("tessedit_create_hocr", &bool_value)
+		           && bool_value) {
+			text = api->GetHOCRText(page_index);
+		} else {
+			text = api->GetUTF8Text();
+		}
+		*text_out += text;
+		delete [] text;
+	} else {
+		BLOCK_LIST blocks;
+		STRING filename = input_file;
+		const char* lastdot = strrchr(filename.string(), '.');
+		if (lastdot != NULL) {
+			filename[lastdot - filename.string()] = '\0';
+		}
+		if (!read_unlv_file(filename, pixGetWidth(pix), pixGetHeight(pix),
+		                    &blocks)) {
+			fprintf(stderr, _("Error: Must have a unlv zone file %s to read!\n"),
+			        filename.string());
+			return;
+		}
+		BLOCK_IT b_it = &blocks;
+		for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+			BLOCK* block = b_it.data();
+			TBOX box = block->bounding_box();
+			api->SetRectangle(box.left(), pixGetHeight(pix) - box.top(),
+			                  box.width(), box.height());
+			char* text = api->GetUNLVText();
+			*text_out += text;
+			delete [] text;
+			if (serial_unlv == 1)
+				api->ClearAdaptiveClassifier();
+		}
+	}
+	// Optionally dump the binarized page that tesseract actually worked on.
+	bool bool_value;
+	if (api->GetBoolVariable("tessedit_write_images",
+	                         &bool_value) && bool_value) {
+		// GetThresholdedImage returns a Pix that the caller owns (see
+		// baseapi.h), so it has to be destroyed here or it leaks per page.
+		Pix* page_pix = api->GetThresholdedImage();
+		pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
+		pixDestroy(&page_pix);
+	}
 }

 /**********************************************************************
- *  main()
- *
- **********************************************************************/
+*  main()
+*
+**********************************************************************/

 int main(int argc, char **argv) {
-  STRING outfile;               //output file
+	STRING outfile;         //output file

 #ifdef USING_GETTEXT
-  setlocale (LC_ALL, "");
-  bindtextdomain (PACKAGE, LOCALEDIR);
-  textdomain (PACKAGE);
+	setlocale (LC_ALL, "");
+	bindtextdomain (PACKAGE, LOCALEDIR);
+	textdomain (PACKAGE);
 #endif

-  // Detect incorrectly placed -l option.
-  for (int arg = 0; arg < argc; ++arg) {
-    if (arg != 3 && strcmp(argv[arg], "-l") == 0) {
-      fprintf(stderr, _("Error: -l must be arg3, not %d\n"), arg);
-      argc = 0;
-    }
-  }
+	// Detect incorrectly placed -l option.
+	for (int arg = 0; arg < argc; ++arg) {
+		if (arg != 3 && strcmp(argv[arg], "-l") == 0) {
+			fprintf(stderr, _("Error: -l must be arg3, not %d\n"), arg);
+			argc = 0;
+		}
+	}
 #ifdef HAVE_CONFIG_H /* Assume that only Unix users care about -v */
-  if (argc == 2 && strcmp(argv[1], "-v") == 0) {
-    fprintf(stderr, "tesseract %s\n", PACKAGE_VERSION);
-    exit(1);
-  }
+	if (argc == 2 && strcmp(argv[1], "-v") == 0) {
+		fprintf(stderr, "tesseract %s\n", PACKAGE_VERSION);
+		exit(1);
+	}
 #endif
-  if (argc < 3) {
-    fprintf(stderr, "Usage:%s imagename outputbase [-l lang]"
-            " [configfile [[+|-]varfile]...]\n"
+	if (argc < 3) {
+		fprintf(stderr, "Usage:%s imagename outputbase [-l lang]"
+		        " [configfile [[+|-]varfile]...]\n"
 #if !defined(HAVE_LIBLEPT) && !defined(_TIFFIO_)
-            "Warning - no liblept or libtiff - cannot read compressed"
-            " tiff files.\n"
+		        "Warning - no liblept or libtiff - cannot read compressed"
+		        " tiff files.\n"
 #endif
-      , argv[0]);
-    exit(1);
-  }
-  // Find the required language.
-  const char* lang = "eng";
-  int arg = 3;
-  if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
-    lang = argv[4];
-    arg = 5;
-  }
+		        , argv[0]);
+		exit(1);
+	}
+	// Find the required language.
+	const char* lang = "eng";
+	int arg = 3;
+	if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
+		lang = argv[4];
+		arg = 5;
+	}

-  tesseract::TessBaseAPI  api;
+	tesseract::TessBaseAPI api;

-  api.SetOutputName(argv[2]);
-  api.SetPageSegMode(tesseract::PSM_AUTO);  
-  api.Init(argv[0], lang, &(argv[arg]), argc-arg, false);
+	api.SetOutputName(argv[2]);
+	api.Init(argv[0], lang, tesseract::OEM_DEFAULT, &(argv[arg]), argc-arg, false);

-  tprintf (_("Tesseract Open Source OCR Engine"));
-#if defined(HAVE_LIBLEPT)
-  tprintf (_(" with Leptonica\n"));
-#elif defined(_TIFFIO_)
-  tprintf (_(" with LibTiff\n"));
-#else
-  tprintf ("\n");
-#endif
+	tprintf (_("Tesseract Open Source OCR Engine with Leptonica\n"));

-  IMAGE image;
-  STRING text_out;
-  int page_number = tessedit_page_number;
-  if (page_number < 0)
-    page_number = 0;
-  FILE* fp = fopen(argv[1], "rb");
-  if (fp == NULL) {
-    tprintf(_("Image file %s cannot be opened!\n"), argv[1]);
-    fclose(fp);
-    exit(1);
-  }
-#ifdef HAVE_LIBLEPT
-  int page = page_number;
-  int npages = 0;
-  bool is_tiff = fileFormatIsTiff(fp);
-  if (is_tiff)
-    {
-      int tiffstat = tiffGetCount(fp, &npages);
-      if (tiffstat == 1)
-        {
-          fprintf(stderr, _("Error reading file %s!\n"), argv[1]);
-          fclose(fp);
-          exit(1);
-        }
-      else
-        {
-          fprintf(stderr, _("Number of found pages: %d.\n"), npages);
-        }
-    }
-  fclose(fp);
-  fp = NULL;
+	STRING text_out;
+	int tessedit_page_number;
+	ASSERT_HOST(api.GetIntVariable("tessedit_page_number",
+	                               &tessedit_page_number));
+	int page_number = tessedit_page_number;
+	if (page_number < 0)
+		page_number = 0;
+	FILE* fp = fopen(argv[1], "rb");
+	if (fp == NULL) {
+		// fopen failed, so there is no stream to close: fclose(NULL) is
+		// undefined behavior and must not be called on this path.
+		tprintf(_("Image file %s cannot be opened!\n"), argv[1]);
+		exit(1);
+	}
+	int page = page_number;
+	int npages = 0;
+	bool is_tiff = fileFormatIsTiff(fp);
+	if (is_tiff)
+		{
+		int tiffstat = tiffGetCount(fp, &npages);
+		if (tiffstat == 1)
+			{
+			fprintf (stderr, _("Error reading file %s!\n"), argv[1]);
+			fclose(fp);
+			exit(1);
+			}
+		else
+			fprintf(stderr, _("Number of found pages: %d.\n"), npages);
+		}
+	fclose(fp);
+	fp = NULL;

-  Pix *pix;
-  if (is_tiff)
-    {
-      for (; page < npages; ++page)
-        {
-          pix = pixReadTiff(argv[1], page);
-          if (!pix)
-            continue;
-          if (npages > 1)
-            {
-              tprintf(_("Page %d\n"), page);
-            }
-          char page_str[kMaxIntSize];
-          snprintf(page_str, kMaxIntSize - 1, "%d", page);
-          api.SetVariable("applybox_page", page_str);
-          // Run tesseract on the page!
-          TesseractImage(argv[1], NULL, pix, page, &api, &text_out);
-          pixDestroy(&pix);
-          if (tessedit_page_number >= 0 || npages == 1)
-            {
-              break;
-            }
-        }
-    }
-  else
-    {
-    // The file is not a tiff file, so use the general pixRead function.
-    // If the image fails to read, try it as a list of filenames.
-    PIX* pix = pixRead(argv[1]);
-    if (pix == NULL) {
-      FILE* fimg = fopen(argv[1], "r");
-      if (fimg == NULL) {
-        tprintf(_("File %s cannot be opened!\n"), argv[1]);
-        fclose(fimg);
-        exit(1);
-      }
-      char filename[MAX_PATH];
-      while (fgets(filename, sizeof(filename), fimg) != NULL) {
-        chomp_string(filename);
-        pix = pixRead(filename);
-        if (pix == NULL) {
-          tprintf(_("Image file %s cannot be read!\n"), filename);
-          fclose(fimg);
-          exit(1);
-        }
-        tprintf(_("Page %d : %s\n"), page, filename);
-        TesseractImage(filename, NULL, pix, page, &api, &text_out);
-        pixDestroy(&pix);
-        ++page;
-      }
-      fclose(fimg);
-    } else {
-      TesseractImage(argv[1], NULL, pix, 0, &api, &text_out);
-      pixDestroy(&pix);
-    }
-  }
-#else
-#ifdef _TIFFIO_
-  int len = strlen(argv[1]);
-  char* ext = new char[5];
-  for (int i=4; i>=0; i--)
-    ext[4-i] = (char) tolower((int) argv[1][len - i]);
-  if (len > 3 && (strcmp("tif",  ext + 1) == 0 || strcmp("tiff", ext) == 0)) {
-    // Use libtiff to read a tif file so multi-page can be handled.
-    // The page number so the tiff file can be closed and reopened.
-    TIFF* archive = NULL;
-    do {
-      // Since libtiff keeps all read images in memory we have to close the
-      // file and reopen it for every page, and seek to the appropriate page.
-      if (archive != NULL)
-        TIFFClose(archive);
-      archive = TIFFOpen(argv[1], "r");
-      if (archive == NULL) {
-        tprintf(_("Read of file %s failed.\n"), argv[1]);
-        exit(1);
-      }
-      if (page_number > 0)
-        tprintf(_("Page %d\n"), page_number);
+	Pix *pix;
+	if (is_tiff) {
+		for (; page < npages; ++page)
+		  { 
+		   pix = pixReadTiff(argv[1], page);
+           if (!pix)
+             continue;
+           if (npages > 1)
+				tprintf(_("Page %d\n"), page);
+			char page_str[kMaxIntSize];
+			snprintf(page_str, kMaxIntSize - 1, "%d", page);
+			api.SetVariable("applybox_page", page_str);

-      // Seek to the appropriate page.
-      for (int i = 0; i < page_number; ++i) {
-        TIFFReadDirectory(archive);
-      }
-      char page_str[kMaxIntSize];
-      snprintf(page_str, kMaxIntSize - 1, "%d", page_number);
-      api.SetVariable("applybox_page", page_str);
-      // Read the current page into the Tesseract image.
-      IMAGE image;
-      read_tiff_image(archive, &image);
+			// Run tesseract on the page!
+			TesseractImage(argv[1], pix, page, &api, &text_out);
+			pixDestroy(&pix);
+			if (tessedit_page_number >= 0 || npages == 1)
+			  {
+				break;
+			  }
+		}
+	} else
+		{
+		// The file is not a tiff file, so use the general pixRead function.
+		// If the image fails to read, try it as a list of filenames.
+		pix = pixRead(argv[1]);
+		if (pix == NULL) {
+			FILE* fimg = fopen(argv[1], "r");
+			if (fimg == NULL) {
+				// Nothing was opened, so nothing is closed here:
+				// fclose(NULL) is undefined behavior.
+				tprintf(_("File %s cannot be opened!\n"), argv[1]);
+				exit(1);
+			}
+			char filename[MAX_PATH];
+			while (fgets(filename, sizeof(filename), fimg) != NULL) {
+				chomp_string(filename);
+				pix = pixRead(filename);
+				if (pix == NULL) {
+					tprintf(_("Image file %s cannot be read!\n"), filename);
+					fclose(fimg);
+					exit(1);
+				}
+				tprintf(_("Page %d : %s\n"), page, filename);
+				TesseractImage(filename, pix, page, &api, &text_out);
+				pixDestroy(&pix);
+				++page;
+			}
+			fclose(fimg);
+		} else {
+			TesseractImage(argv[1], pix, 0, &api, &text_out);
+			pixDestroy(&pix);
+		}
+	}

-      // Run tesseract on the page!
-      TesseractImage(argv[1], &image, NULL, page_number, &api, &text_out);
-      ++page_number;
-    // Do this while there are more pages in the tiff file.
-    } while (TIFFReadDirectory(archive) &&
-             (page_number <= tessedit_page_number || tessedit_page_number < 0));
-    TIFFClose(archive);
-  } else {
-#endif
-    // Using built-in image library to read bmp, or tiff without libtiff.
-    if (image.read_header(argv[1]) < 0) {
-      tprintf(_("Read of file %s failed.\n"), argv[1]);
-      exit(1);
-    }
-    if (image.read(image.get_ysize ()) < 0)
-      MEMORY_OUT.error(argv[0], EXIT, _("Read of image %s"), argv[1]);
-    invert_image(&image);
-    TesseractImage(argv[1], &image, NULL, 0, &api, &text_out);
-#ifdef _TIFFIO_
-  }
-  delete[] ext;
-#endif
-#endif  // HAVE_LIBLEPT
+	bool output_hocr = false;
+	api.GetBoolVariable("tessedit_create_hocr", &output_hocr);
+	bool output_box = false;
+	api.GetBoolVariable("tessedit_create_boxfile", &output_box);
+	outfile = argv[2];
+	outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt";
+	FILE* fout = fopen(outfile.string(), "w");
+	if (fout == NULL) {
+		// The output file could not be created, so there is no stream to
+		// close: fclose(NULL) is undefined behavior.
+		tprintf(_("Cannot create output file %s\n"), outfile.string());
+		exit(1);
+	}
+	if (output_hocr) {
+		const char html_header[] =
+		        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
+		        " \"http://www.w3.org/TR/html4/loose.dtd\">\n"
+		        "<html>\n<head>\n<title></title>\n"
+		        "<meta http-equiv=\"Content-Type\" content=\"text/html;"
+		        "charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
+		        "</head>\n<body>\n";
+		fprintf(fout, "%s", html_header);
+	}
+	fwrite(text_out.string(), 1, text_out.length(), fout);
+	if (output_hocr)
+		fprintf(fout, "</body>\n</html>\n");
+	fclose(fout);

-  //no longer using fp
-  if (fp != NULL) fclose(fp);
-
-  bool output_hocr = tessedit_create_hocr;
-  outfile = argv[2];
-  outfile += output_hocr ? ".html" : tessedit_create_boxfile ? ".box" : ".txt";
-  FILE* fout = fopen(outfile.string(), "w");
-  if (fout == NULL) {
-    tprintf(_("Cannot create output file %s\n"), outfile.string());
-    fclose(fout);
-    exit(1);
-  }
-  if (output_hocr) {
-    const char html_header[] =
-        "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
-        "  \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
-        "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n"
-		"  <title>OCR Output</title>\n"
-        "  <meta http-equiv=\"Content-Type\" content=\"text/html;"
-        "charset=utf-8\" />\n  <meta name='ocr-system' "
-        "content='tesseract-ocr 3.00' />\n  <meta name='ocr-capabilities'"
-        " content='ocr_page' />\n</head>\n<body>\n";
-    fprintf(fout, "%s", html_header);
-  } 
-  fwrite(text_out.string(), 1, text_out.length(), fout);
-  if (output_hocr)
-    fprintf(fout, "</body>\n</html>\n");
-  fclose(fout);
-
-  return 0;                      //Normal exit
+	return 0;                //Normal exit
 }

 #ifdef __MSW32__
 int initialized = 0;

 /**********************************************************************
- * WinMain
- *
- * Main function for a windows program.
- **********************************************************************/
+* WinMain
+*
+* Main function for a windows program.
+**********************************************************************/

 int WINAPI WinMain(  //main for windows //command line
-                   HINSTANCE hInstance,
-                   HINSTANCE hPrevInstance,
-                   LPSTR lpszCmdLine,
-                   int nCmdShow) {
-  WNDCLASS wc;
-  HWND hwnd;
-  MSG msg;
+        HINSTANCE hInstance,
+        HINSTANCE hPrevInstance,
+        LPSTR lpszCmdLine,
+        int nCmdShow) {
+	WNDCLASS wc;
+	HWND hwnd;
+	MSG msg;

-  char **argv;
-  char *argsin[2];
-  int argc;
-  int exit_code;
+	char **argv;
+	char *argsin[2];
+	int argc;
+	int exit_code;

-  wc.style = CS_NOCLOSE | CS_OWNDC;
-  wc.lpfnWndProc = (WNDPROC) WndProc;
-  wc.cbClsExtra = 0;
-  wc.cbWndExtra = 0;
-  wc.hInstance = hInstance;
-  wc.hIcon = NULL;               //LoadIcon (NULL, IDI_APPLICATION);
-  wc.hCursor = NULL;             //LoadCursor (NULL, IDC_ARROW);
-  wc.hbrBackground = (HBRUSH) (COLOR_WINDOW + 1);
-  wc.lpszMenuName = NULL;
-  wc.lpszClassName = szAppName;
+	wc.style = CS_NOCLOSE | CS_OWNDC;
+	wc.lpfnWndProc = (WNDPROC) WndProc;
+	wc.cbClsExtra = 0;
+	wc.cbWndExtra = 0;
+	wc.hInstance = hInstance;
+	wc.hIcon = NULL;         //LoadIcon (NULL, IDI_APPLICATION);
+	wc.hCursor = NULL;       //LoadCursor (NULL, IDC_ARROW);
+	wc.hbrBackground = (HBRUSH) (COLOR_WINDOW + 1);
+	wc.lpszMenuName = NULL;
+	wc.lpszClassName = szAppName;

-  RegisterClass(&wc);
+	RegisterClass(&wc);

-  hwnd = CreateWindow (szAppName, szAppName,
-    WS_OVERLAPPEDWINDOW | WS_DISABLED,
-    CW_USEDEFAULT, CW_USEDEFAULT, CW_USEDEFAULT,
-    CW_USEDEFAULT, HWND_DESKTOP, NULL, hInstance, NULL);
+	hwnd = CreateWindow (szAppName, szAppName,
+	                     WS_OVERLAPPEDWINDOW | WS_DISABLED,
+	                     CW_USEDEFAULT, CW_USEDEFAULT, CW_USEDEFAULT,
+	                     CW_USEDEFAULT, HWND_DESKTOP, NULL, hInstance, NULL);

-  argsin[0] = strdup (szAppName);
-  argsin[1] = strdup (lpszCmdLine);
-  /*allocate memory for the args. There can never be more than half*/
-  /*the total number of characters in the arguments.*/
-  argv =
-    (char **) malloc (((strlen (argsin[0]) + strlen (argsin[1])) / 2 + 1) *
-    sizeof (char *));
+	argsin[0] = strdup (szAppName);
+	argsin[1] = strdup (lpszCmdLine);
+	/*allocate memory for the args. There can never be more than half*/
+	/*the total number of characters in the arguments.*/
+	argv =
+	        (char **) malloc (((strlen (argsin[0]) + strlen (argsin[1])) / 2 + 1) *
+	                          sizeof (char *));

-  /*now construct argv as it should be for C.*/
-  argc = parse_args (2, argsin, argv);
+	/*now construct argv as it should be for C.*/
+	argc = parse_args (2, argsin, argv);

-  //  ShowWindow (hwnd, nCmdShow);
-  //  UpdateWindow (hwnd);
+	//  ShowWindow (hwnd, nCmdShow);
+	//  UpdateWindow (hwnd);

-  if (initialized) {
-    exit_code = main (argc, argv);
-    free (argsin[0]);
-    free (argsin[1]);
-    free(argv);
-    return exit_code;
-  }
-  while (GetMessage (&msg, NULL, 0, 0)) {
-    TranslateMessage(&msg);
-    DispatchMessage(&msg);
-    if (initialized) {
-      exit_code = main (argc, argv);
-      break;
-    }
-    else
-      exit_code = msg.wParam;
-  }
-  free (argsin[0]);
-  free (argsin[1]);
-  free(argv);
-  return exit_code;
+	if (initialized) {
+		exit_code = main (argc, argv);
+		free (argsin[0]);
+		free (argsin[1]);
+		free(argv);
+		return exit_code;
+	}
+	while (GetMessage (&msg, NULL, 0, 0)) {
+		TranslateMessage(&msg);
+		DispatchMessage(&msg);
+		if (initialized) {
+			exit_code = main (argc, argv);
+			break;
+		}
+		else
+			exit_code = msg.wParam;
+	}
+	free (argsin[0]);
+	free (argsin[1]);
+	free(argv);
+	return exit_code;
 }


 /**********************************************************************
- * WndProc
- *
- * Function to respond to messages.
- **********************************************************************/
+* WndProc
+*
+* Function to respond to messages.
+**********************************************************************/

 LONG WINAPI WndProc(            //message handler
-                    HWND hwnd,  //window with message
-                    UINT msg,   //message typ
-                    WPARAM wParam,
-                    LPARAM lParam) {
-  HDC hdc;
+        HWND hwnd,              //window with message
+        UINT msg,               //message typ
+        WPARAM wParam,
+        LPARAM lParam) {
+	HDC hdc;

-  if (msg == WM_CREATE) {
-    //
-    // Create a rendering context.
-    //
-    hdc = GetDC (hwnd);
-    ReleaseDC(hwnd, hdc);
-    initialized = 1;
-    return 0;
-  }
-  return DefWindowProc (hwnd, msg, wParam, lParam);
+	if (msg == WM_CREATE) {
+		//
+		// Create a rendering context.
+		//
+		hdc = GetDC (hwnd);
+		ReleaseDC(hwnd, hdc);
+		initialized = 1;
+		return 0;
+	}
+	return DefWindowProc (hwnd, msg, wParam, lParam);
 }


 /**********************************************************************
- * parse_args
- *
- * Turn a list of args into a new list of args with each separate
- * whitespace spaced string being an arg.
- **********************************************************************/
+* parse_args
+*
+* Turn a list of args into a new list of args with each separate
+* whitespace spaced string being an arg.
+**********************************************************************/

 int
 parse_args (                     /*refine arg list */
-int argc,                        /*no of input args */
-char *argv[],                    /*input args */
-char *arglist[]                  /*output args */
-) {
-  int argcount;                  /*converted argc */
-  char *testchar;                /*char in option string */
-  int arg;                       /*current argument */
+        int argc,                /*no of input args */
+        char *argv[],            /*input args */
+        char *arglist[]          /*output args */
+        ) {
+	int argcount;            /*converted argc */
+	char *testchar;          /*char in option string */
+	int arg;                 /*current argument */

-  argcount = 0;                  /*no of options */
-  for (arg = 0; arg < argc; arg++) {
-    testchar = argv[arg];        /*start of arg */
-    do {
-      while (*testchar
-        && (*testchar == ' ' || *testchar == '\n'
-        || *testchar == '\t'))
-        testchar++;              /*skip white space */
-      if (*testchar) {
-                                 /*new arg */
-        arglist[argcount++] = testchar;
-                                 /*skip to white space */
-        for (testchar++; *testchar && *testchar != ' ' && *testchar != '\n' && *testchar != '\t'; testchar++);
-        if (*testchar)
-          *testchar++ = '\0';    /*turn to separate args */
-      }
-    }
-    while (*testchar);
-  }
-  return argcount;               /*new number of args */
+	argcount = 0;            /*no of options */
+	for (arg = 0; arg < argc; arg++) {
+		testchar = argv[arg]; /*start of arg */
+		do {
+			while (*testchar
+			       && (*testchar == ' ' || *testchar == '\n'
+			           || *testchar == '\t'))
+				testchar++; /*skip white space */
+			if (*testchar) {
+				/*new arg */
+				arglist[argcount++] = testchar;
+				/*skip to white space */
+				for (testchar++; *testchar && *testchar != ' ' && *testchar != '\n' && *testchar != '\t'; testchar++) ;
+				if (*testchar)
+					*testchar++ = '\0'; /*turn to separate args */
+			}
+		}
+		while (*testchar);
+	}
+	return argcount;         /*new number of args */
 }
 #endif
--- a/api/tesseractmain.h
+++ b/api/tesseractmain.h
@ -20,41 +20,10 @@
 #ifndef           TESSERACTMAIN_H
 #define           TESSERACTMAIN_H

-#include          "varable.h"
-#include          "tessclas.h"
+#include          "params.h"
+#include          "blobs.h"
 #include          "notdll.h"

-extern BOOL_VAR_H(tessedit_create_boxfile, FALSE, "Output text with boxes");
-extern BOOL_VAR_H(tessedit_read_image, TRUE, "Ensure the image is read");
-extern INT_VAR_H(tessedit_serial_unlv, 0,
-        "0->Whole page, 1->serial no adapt, 2->serial with adapt");
-extern INT_VAR_H(tessedit_page_number, -1,
-        "-1 -> All pages, else specific page to process");
-extern BOOL_VAR_H(tessedit_write_images, FALSE,
-                  "Capture the image from the IPE");
-extern BOOL_VAR_H(tessedit_debug_to_screen, FALSE, "Dont use debug file");
-
-/**
- * run from api
- * @param arg0 program name
- * @param lang language
- */
-inT32 api_main(const char *arg0,
-               uinT16 lang);
-/**
- * setup dummy engine info
- * @param lang user language
- * @param name of engine
- * @param version of engine
- */
-inT16 setup_info(uinT16 lang,
-                 const char *name,
-                 const char *version);
-/**
- * read dummy image info
- * @param im_out read dummy image info
- */
-inT16 read_image(IMAGE *im_out);
 #ifdef __MSW32__
 /**
 * main for windows command line
--- a/ccmain/Makefile.am
+++ b/ccmain/Makefile.am
@ -1,35 +1,37 @@
 SUBDIRS =
 AM_CPPFLAGS = \
+    -DUSE_STD_NAMESPACE \
    -I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct \
    -I$(top_srcdir)/image -I$(top_srcdir)/viewer \
    -I$(top_srcdir)/ccops -I$(top_srcdir)/dict \
    -I$(top_srcdir)/classify \
    -I$(top_srcdir)/wordrec -I$(top_srcdir)/cutil \
+    -I$(top_srcdir)/neural_networks/runtime -I$(top_srcdir)/cube \
    -I$(top_srcdir)/textord

-EXTRA_DIST = tessembedded.cpp ccmain.vcproj
+EXTRA_DIST = tessembedded.cpp

 include_HEADERS = \
-    adaptions.h applybox.h blobcmp.h \
-    callnet.h charcut.h charsample.h control.h \
-    docqual.h expandblob.h fixspace.h fixxht.h \
-    imgscale.h matmatch.h osdetect.h output.h \
-    pagewalk.h paircmp.h pgedit.h reject.h scaleimg.h \
+    charcut.h control.h cube_reco_context.h \
+    docqual.h fixspace.h \
+    imgscale.h osdetect.h output.h \
+    paramsd.h pgedit.h reject.h scaleimg.h \
    tessbox.h tessedit.h tessembedded.h tesseractclass.h \
-    tessio.h tessvars.h tfacep.h tfacepp.h thresholder.h tstruct.h \
-    varabled.h werdit.h
+    tesseract_cube_combiner.h \
+    tessvars.h tfacep.h tfacepp.h thresholder.h tstruct.h \
+    werdit.h

 lib_LTLIBRARIES = libtesseract_main.la
 libtesseract_main_la_SOURCES = \
-    adaptions.cpp ambigsrecog.cpp applybox.cpp \
-    blobcmp.cpp \
-    callnet.cpp charcut.cpp charsample.cpp control.cpp \
-    docqual.cpp expandblob.cpp fixspace.cpp fixxht.cpp \
-    imgscale.cpp matmatch.cpp osdetect.cpp output.cpp \
-    pagewalk.cpp paircmp.cpp pgedit.cpp reject.cpp scaleimg.cpp \
+    adaptions.cpp applybox.cpp \
+    charcut.cpp control.cpp cube_control.cpp cube_reco_context.cpp \
+    docqual.cpp fixspace.cpp fixxht.cpp \
+    imgscale.cpp osdetect.cpp output.cpp pagesegmain.cpp \
+    pagewalk.cpp paramsd.cpp pgedit.cpp reject.cpp scaleimg.cpp \
+    recogtraining.cpp tesseract_cube_combiner.cpp \
    tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
    tfacepp.cpp thresholder.cpp tstruct.cpp \
-    varabled.cpp werdit.cpp
+    werdit.cpp
 libtesseract_main_la_LIBADD = \
    ../wordrec/libtesseract_wordrec.la
 libtesseract_main_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
--- a/ccmain/Makefile.in
+++ b/ccmain/Makefile.in
@ -72,13 +72,13 @@ am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"
 LTLIBRARIES = $(lib_LTLIBRARIES)
 libtesseract_main_la_DEPENDENCIES =  \
 	../wordrec/libtesseract_wordrec.la
-am_libtesseract_main_la_OBJECTS = adaptions.lo ambigsrecog.lo \
-	applybox.lo blobcmp.lo callnet.lo charcut.lo charsample.lo \
-	control.lo docqual.lo expandblob.lo fixspace.lo fixxht.lo \
-	imgscale.lo matmatch.lo osdetect.lo output.lo pagewalk.lo \
-	paircmp.lo pgedit.lo reject.lo scaleimg.lo tessbox.lo \
-	tessedit.lo tesseractclass.lo tessvars.lo tfacepp.lo \
-	thresholder.lo tstruct.lo varabled.lo werdit.lo
+am_libtesseract_main_la_OBJECTS = adaptions.lo applybox.lo charcut.lo \
+	control.lo cube_control.lo cube_reco_context.lo docqual.lo \
+	fixspace.lo fixxht.lo imgscale.lo osdetect.lo output.lo \
+	pagesegmain.lo pagewalk.lo paramsd.lo pgedit.lo reject.lo \
+	scaleimg.lo recogtraining.lo tesseract_cube_combiner.lo \
+	tessbox.lo tessedit.lo tesseractclass.lo tessvars.lo \
+	tfacepp.lo thresholder.lo tstruct.lo werdit.lo
 libtesseract_main_la_OBJECTS = $(am_libtesseract_main_la_OBJECTS)
 libtesseract_main_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \
@ -251,7 +251,6 @@ libdir = @libdir@
 libexecdir = @libexecdir@
 localedir = @localedir@
 localstatedir = @localstatedir@
-lt_ECHO = @lt_ECHO@
 mandir = @mandir@
 mkdir_p = @mkdir_p@
 oldincludedir = @oldincludedir@
@ -269,35 +268,37 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 SUBDIRS = 
 AM_CPPFLAGS = \
+    -DUSE_STD_NAMESPACE \
    -I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct \
    -I$(top_srcdir)/image -I$(top_srcdir)/viewer \
    -I$(top_srcdir)/ccops -I$(top_srcdir)/dict \
    -I$(top_srcdir)/classify \
    -I$(top_srcdir)/wordrec -I$(top_srcdir)/cutil \
+    -I$(top_srcdir)/neural_networks/runtime -I$(top_srcdir)/cube \
    -I$(top_srcdir)/textord

-EXTRA_DIST = tessembedded.cpp ccmain.vcproj
+EXTRA_DIST = tessembedded.cpp
 include_HEADERS = \
-    adaptions.h applybox.h blobcmp.h \
-    callnet.h charcut.h charsample.h control.h \
-    docqual.h expandblob.h fixspace.h fixxht.h \
-    imgscale.h matmatch.h osdetect.h output.h \
-    pagewalk.h paircmp.h pgedit.h reject.h scaleimg.h \
+    charcut.h control.h cube_reco_context.h \
+    docqual.h fixspace.h \
+    imgscale.h osdetect.h output.h \
+    paramsd.h pgedit.h reject.h scaleimg.h \
    tessbox.h tessedit.h tessembedded.h tesseractclass.h \
-    tessio.h tessvars.h tfacep.h tfacepp.h thresholder.h tstruct.h \
-    varabled.h werdit.h
+    tesseract_cube_combiner.h \
+    tessvars.h tfacep.h tfacepp.h thresholder.h tstruct.h \
+    werdit.h

 lib_LTLIBRARIES = libtesseract_main.la
 libtesseract_main_la_SOURCES = \
-    adaptions.cpp ambigsrecog.cpp applybox.cpp \
-    blobcmp.cpp \
-    callnet.cpp charcut.cpp charsample.cpp control.cpp \
-    docqual.cpp expandblob.cpp fixspace.cpp fixxht.cpp \
-    imgscale.cpp matmatch.cpp osdetect.cpp output.cpp \
-    pagewalk.cpp paircmp.cpp pgedit.cpp reject.cpp scaleimg.cpp \
+    adaptions.cpp applybox.cpp \
+    charcut.cpp control.cpp cube_control.cpp cube_reco_context.cpp \
+    docqual.cpp fixspace.cpp fixxht.cpp \
+    imgscale.cpp osdetect.cpp output.cpp pagesegmain.cpp \
+    pagewalk.cpp paramsd.cpp pgedit.cpp reject.cpp scaleimg.cpp \
+    recogtraining.cpp tesseract_cube_combiner.cpp \
    tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
    tfacepp.cpp thresholder.cpp tstruct.cpp \
-    varabled.cpp werdit.cpp
+    werdit.cpp

 libtesseract_main_la_LIBADD = \
    ../wordrec/libtesseract_wordrec.la
@ -378,34 +379,32 @@ distclean-compile:
 	-rm -f *.tab.c

@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/adaptions.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ambigsrecog.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/applybox.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blobcmp.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/callnet.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/charcut.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/charsample.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/control.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cube_control.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cube_reco_context.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/docqual.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/expandblob.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fixspace.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fixxht.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/imgscale.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmatch.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/osdetect.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/output.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pagesegmain.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pagewalk.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/paircmp.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/paramsd.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pgedit.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/recogtraining.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/reject.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scaleimg.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessbox.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessedit.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tesseract_cube_combiner.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tesseractclass.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessvars.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tfacepp.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/thresholder.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tstruct.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/varabled.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/werdit.Plo@am__quote@

 .cpp.o:
--- a/ccmain/adaptions.cpp
+++ b/ccmain/adaptions.cpp
--- a/ccmain/adaptions.h
+++ b/ccmain/adaptions.h
@ -1,89 +0,0 @@
-/**********************************************************************
- * File:        adaptions.h  (Formerly adaptions.h)
- * Description: Functions used to adapt to blobs already confidently
- *					identified
- * Author:		Chris Newton
- * Created:		Thu Oct  7 10:17:28 BST 1993
- *
- * (C) Copyright 1992, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef           ADAPTIONS_H
-#define           ADAPTIONS_H
-
-#include          "charsample.h"
-#include          "charcut.h"
-#include          "notdll.h"
-
-extern BOOL_VAR_H (tessedit_reject_ems, FALSE, "Reject all m's");
-extern BOOL_VAR_H (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");
-extern double_VAR_H (tessedit_cluster_t1, 0.20,
-"t1 threshold for clustering samples");
-extern double_VAR_H (tessedit_cluster_t2, 0.40,
-"t2 threshold for clustering samples");
-extern double_VAR_H (tessedit_cluster_t3, 0.12,
-"Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
-extern double_VAR_H (tessedit_cluster_accept_fraction, 0.80,
-"Largest fraction of characters in cluster for it to be used for adaption");
-extern INT_VAR_H (tessedit_cluster_min_size, 3,
-"Smallest number of samples in a cluster for it to be used for adaption");
-extern BOOL_VAR_H (tessedit_cluster_debug, FALSE,
-"Generate and print debug information for adaption by clustering");
-extern BOOL_VAR_H (tessedit_use_best_sample, FALSE,
-"Use best sample from cluster when adapting");
-extern BOOL_VAR_H (tessedit_test_cluster_input, FALSE,
-"Set reject map to enable cluster input to be measured");
-extern BOOL_VAR_H (tessedit_matrix_match, TRUE, "Use matrix matcher");
-extern BOOL_VAR_H (tessedit_old_matrix_match, FALSE, "Use matrix matcher");
-extern BOOL_VAR_H (tessedit_mm_use_non_adaption_set, FALSE,
-"Don't try to adapt to characters on this list");
-extern STRING_VAR_H (tessedit_non_adaption_set, ",.;:'~@*",
-"Characters to be avoided when adapting");
-extern BOOL_VAR_H (tessedit_mm_adapt_using_prototypes, TRUE,
-"Use prototypes when adapting");
-extern BOOL_VAR_H (tessedit_mm_use_prototypes, TRUE,
-"Use prototypes as clusters are built");
-extern BOOL_VAR_H (tessedit_mm_use_rejmap, FALSE,
-"Adapt to characters using reject map");
-extern BOOL_VAR_H (tessedit_mm_all_rejects, FALSE,
-"Adapt to all characters using, matrix matcher");
-extern BOOL_VAR_H (tessedit_mm_only_match_same_char, FALSE,
-"Only match samples against clusters for the same character");
-extern BOOL_VAR_H (tessedit_process_rns, FALSE, "Handle m - rn ambigs");
-extern BOOL_VAR_H (tessedit_demo_adaption, FALSE,
-"Display cut images and matrix match for demo purposes");
-extern INT_VAR_H (tessedit_demo_word1, 62,
-"Word number of first word to display");
-extern INT_VAR_H (tessedit_demo_word2, 64,
-"Word number of second word to display");
-extern STRING_VAR_H (tessedit_demo_file, "academe",
-"Name of document containing demo words");
-extern BOOL_VAR_H(tessedit_adapt_to_char_fragments, TRUE,
-                  "Adapt to words that contain "
-                  " a character composed form fragments");
-
-void print_em_stats(CHAR_SAMPLES_LIST *char_clusters,
-                    CHAR_SAMPLE_LIST *chars_waiting);
-                                 //lines of the image
-CHAR_SAMPLE *clip_sample(PIXROW *pixrow,
-                         IMAGELINE *imlines,
-                         TBOX pix_box,  //box of imlines extent
-                         BOOL8 white_on_black,
-                         char c);
-void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters);
-void reject_all_ems(WERD_RES *word);
-void reject_all_fullstops(WERD_RES *word);
-void reject_suspect_fullstops(WERD_RES *word);
-BOOL8 suspect_em(WERD_RES *word, inT16 index);
-BOOL8 suspect_fullstop(WERD_RES *word, inT16 i);
-#endif
--- a/ccmain/ambigsrecog.cpp
+++ b/ccmain/ambigsrecog.cpp
@ -1,179 +0,0 @@
-///////////////////////////////////////////////////////////////////////
-// File:        genericvector.h
-// Description: Functions for producing classifications
-//              for the input to ambigstraining.
-// Author:      Daria Antonova
-// Created:     Mon Jun 23 11:26:43 PDT 2008
-//
-// (C) Copyright 2007, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-///////////////////////////////////////////////////////////////////////
-
-#include "ambigs.h"
-
-#include "applybox.h"
-#include "boxread.h"
-#include "control.h"
-#include "permute.h"
-#include "ratngs.h"
-#include "reject.h"
-#include "stopper.h"
-#include "tesseractclass.h"
-
-namespace tesseract {
-
-// Sets flags necessary for ambigs training mode.
-// Opens and returns the pointer to the output file.
-FILE *Tesseract::init_ambigs_training(const STRING &fname) {
-  permute_only_top = 1;                        // use only top choice permuter
-  tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
-  tessedit_ok_mode.set_value(0);               // turn off context checking
-  tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
-  save_best_choices.set_value(1);              // save individual char choices
-  stopper_no_acceptable_choices.set_value(1);  // explore all segmentations
-  save_raw_choices.set_value(1);               // save raw choices
-
-  // Open ambigs output file.
-  STRING output_fname = fname;
-  const char *lastdot = strrchr(output_fname.string(), '.');
-  if (lastdot != NULL) {
-    output_fname[lastdot - output_fname.string()] = '\0';
-  }
-  output_fname += ".txt";
-  FILE *output_file;
-  if (!(output_file = fopen(output_fname.string(), "a+"))) {
-    CANTOPENFILE.error("ambigs_training", EXIT,
-                       "Can't open box file %s\n", output_fname.string());
-  }
-  return output_file;
-}
-
-// This function takes tif/box pair of files and runs recognition on the image,
-// while making sure that the word bounds that tesseract identified roughly
-// match to those specified by the input box file. For each word (ngram in a
-// single bounding box from the input box file) it outputs the ocred result,
-// the correct label, rating and certainty.
-void Tesseract::ambigs_training_segmented(const STRING &fname,
-                                          PAGE_RES *page_res,
-                                          volatile ETEXT_DESC *monitor,
-                                          FILE *output_file) {
-  STRING box_fname = fname;
-  const char *lastdot = strrchr(box_fname.string(), '.');
-  if (lastdot != NULL) {
-    box_fname[lastdot - box_fname.string()] = '\0';
-  }
-  box_fname += ".box";
-  FILE *box_file;
-  if (!(box_file = fopen(box_fname.string(), "r"))) {
-    CANTOPENFILE.error("ambigs_training", EXIT,
-                       "Can't open box file %s\n", box_fname.string());
-  }
-
-  static PAGE_RES_IT page_res_it;
-  page_res_it.page_res = page_res;
-  page_res_it.restart_page();
-  int x_min, y_min, x_max, y_max;
-  char label[UNICHAR_LEN * 10];
-
-  // Process all the words on this page.
-  while (page_res_it.word() != NULL &&
-         read_next_box(applybox_page, box_file, label,
-                       &x_min, &y_min, &x_max, &y_max)) {
-    // Init bounding box of the current word bounding box and from box file.
-    TBOX box = TBOX(ICOORD(x_min, y_min), ICOORD(x_max, y_max));
-    TBOX word_box(page_res_it.word()->word->bounding_box());
-    bool one_word = true;
-    // Check whether the bounding box of the next word overlaps with the
-    // current box from box file.
-    while (page_res_it.next_word() != NULL &&
-           box.x_overlap(page_res_it.next_word()->word->bounding_box())) {
-      word_box = word_box.bounding_union(
-          page_res_it.next_word()->word->bounding_box());
-      page_res_it.forward();
-      one_word = false;
-    }
-    if (!word_box.major_overlap(box)) {
-      if (!word_box.x_overlap(box)) {
-        // We must be looking at the word that belongs in the "next" bounding
-        // box from the box file. The ngram that was supposed to appear in
-        // the current box read from the box file must have been dropped by
-        // tesseract as noise.
-        tprintf("Word %s was dropped as noise.\n", label);
-        continue;  // stay on this blob, but read next box from box file
-      } else {
-        tprintf("Error: Insufficient overlap for word box"
-                " and box from file for %s\n", label);
-        word_box.print();
-        box.print();
-        exit(1);
-      }
-    }
-    // Skip recognizing the ngram if tesseract is sure it's not
-    // one word, otherwise run one recognition pass on this word.
-    if (!one_word) {
-      tprintf("Tesseract segmented %s as multiple words\n", label);
-    } else {
-      ambigs_classify_and_output(&page_res_it, label, output_file);
-    }
-    page_res_it.forward();
-  }
-  fclose(box_file);
-}
-
-// Run classify_word_pass1() on the current word. Output tesseract's raw choice
-// as a result of the classification. For words labeled with a single unichar
-// also output all alternatives from blob_choices of the best choice.
-void Tesseract::ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
-                                           const char *label,
-                                           FILE *output_file) {
-  int offset;
-  // Classify word.
-  classify_word_pass1(page_res_it->word(), page_res_it->row()->row,
-                      page_res_it->block()->block,
-                      FALSE, NULL, NULL);
-  WERD_CHOICE *best_choice = page_res_it->word()->best_choice;
-  ASSERT_HOST(best_choice != NULL);
-  ASSERT_HOST(best_choice->blob_choices() != NULL);
-
-  // Compute the number of unichars in the label.
-  int label_num_unichars = 0;
-  int step = 1;  // should be non-zero on the first iteration
-  for (offset = 0; label[offset] != '\0' && step > 0;
-       step = getDict().getUnicharset().step(label + offset),
-       offset += step, ++label_num_unichars);
-  if (step == 0) {
-    tprintf("Not outputting illegal unichar %s\n", label);
-    return;
-  }
-
-  // Output all classifier choices for the unigrams (1-1 classifications).
-  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
-    BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
-    outer_blob_choice_it.set_to_list(best_choice->blob_choices());
-    BLOB_CHOICE_IT blob_choice_it;
-    blob_choice_it.set_to_list(outer_blob_choice_it.data());
-    for (blob_choice_it.mark_cycle_pt();
-         !blob_choice_it.cycled_list();
-         blob_choice_it.forward()) {
-      BLOB_CHOICE *blob_choice = blob_choice_it.data();
-      if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
-        fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
-               unicharset.id_to_unichar(blob_choice->unichar_id()),
-               label, blob_choice->rating(), blob_choice->certainty());
-      }
-    }
-  }
-  // Output the raw choice for succesful non 1-1 classifications.
-  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
-}
-
-}  // namespace tesseract
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
--- a/ccmain/applybox.h
+++ b/ccmain/applybox.h
@ -1,84 +0,0 @@
-/**********************************************************************
- * File:        applybox.h  (Formerly applybox.h)
- * Description: Re segment rows according to box file data
- * Author:		Phil Cheatle
- * Created:		Wed Nov 24 09:11:23 GMT 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef           APPLYBOX_H
-#define           APPLYBOX_H
-
-#include          "varable.h"
-#include          "ocrblock.h"
-#include          "ocrrow.h"
-#include          "notdll.h"
-#include          "unichar.h"
-
-extern BOOL_VAR_H (applybox_rebalance, TRUE, "Drop dead");
-extern INT_VAR_H (applybox_debug, 0, "Debug level");
-extern INT_VAR_H (applybox_page, 0, "Page number to apply boxes from");
-extern STRING_VAR_H (applybox_test_exclusions, "|",
-                     "Chars ignored for testing");
-extern double_VAR_H (applybox_error_band, 0.15, "Err band as fract of xht");
-extern STRING_VAR_H(exposure_pattern, "exp",
-                    "Exposure value follows this pattern in the image"
-                    " filename. The name of the image files are expected"
-                    " to be in the form [lang].[fontname].exp[num].tif");
-
-static const int kMinFragmentOutlineArea = 10;
-
-void apply_boxes(const STRING& filename,
-                 BLOCK_LIST *block_list    //real blocks
-                );
-
-ROW *find_row_of_box(
-                     BLOCK_LIST *block_list,  //real blocks
-                     const TBOX &box,               //from boxfile
-                     inT16 &block_id,
-                     inT16 &row_id_to_process);
-
-inT16 resegment_box(
-                    ROW *row,
-                    TBOX &box,
-                    UNICHAR_ID uch_id,
-                    inT16 block_id,
-                    inT16 row_id,
-                    inT16 boxfile_lineno,
-                    inT16 boxfile_charno,
-                    inT16 *tgt_char_counts,
-                    bool learn_char_fragments,
-                    bool learning);
-
-void tidy_up(
-             BLOCK_LIST *block_list,  //real blocks
-             inT16 &ok_char_count,
-             inT16 &ok_row_count,
-             inT16 &unlabelled_words,
-             inT16 *tgt_char_counts,
-             inT16 &rebalance_count,
-             UNICHAR_ID *min_uch_id,
-             inT16 &min_samples,
-             inT16 &final_labelled_blob_count,
-             bool learn_character_fragments,
-             bool learning);
-
-void report_failed_box(inT16 boxfile_lineno,
-                       inT16 boxfile_charno,
-                       TBOX box,
-                       const char *box_ch,
-                       const char *err_msg);
-
-void apply_box_training(const STRING& filename, BLOCK_LIST *block_list);
-#endif
--- a/ccmain/blobcmp.cpp
+++ b/ccmain/blobcmp.cpp
@ -1,82 +0,0 @@
-/**********************************************************************
- * File:        blobcmp.c  (Formerly blobcmp.c)
- * Description: Code to compare blobs using the adaptive matcher.
- * Author:		Ray Smith
- * Created:		Wed Apr 21 09:28:51 BST 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#include "mfcpch.h"
-#include "fxdefs.h"
-#include "ocrfeatures.h"
-#include "intmatcher.h"
-#include "intproto.h"
-#include "adaptive.h"
-#include "adaptmatch.h"
-#include "const.h"
-#include "tessvars.h"
-#include "tesseractclass.h"
-
-#define CMP_CLASS       0
-
-/**********************************************************************
- * compare_tess_blobs
- *
- * Match 2 blobs using the adaptive classifier.
- **********************************************************************/
-namespace tesseract {
-float Tesseract::compare_tess_blobs(TBLOB *blob1,
-                                    TEXTROW *row1,
-                                    TBLOB *blob2,
-                                    TEXTROW *row2) {
-  int fcount;                    /*number of features */
-  ADAPT_CLASS adapted_class;
-  ADAPT_TEMPLATES ad_templates;
-  LINE_STATS line_stats1, line_stats2;
-  INT_FEATURE_ARRAY int_features;
-  FEATURE_SET float_features;
-  INT_RESULT_STRUCT int_result;  /*output */
-
-  BIT_VECTOR AllProtosOn = NewBitVector (MAX_NUM_PROTOS);
-  BIT_VECTOR AllConfigsOn = NewBitVector (MAX_NUM_CONFIGS);
-  set_all_bits (AllProtosOn, WordsInVectorOfSize (MAX_NUM_PROTOS));
-  set_all_bits (AllConfigsOn, WordsInVectorOfSize (MAX_NUM_CONFIGS));
-
-  EnterClassifyMode;
-  ad_templates = NewAdaptedTemplates (false);
-  GetLineStatsFromRow(row1, &line_stats1);
-                                 /*copy baseline stuff */
-  GetLineStatsFromRow(row2, &line_stats2);
-  adapted_class = NewAdaptedClass ();
-  AddAdaptedClass (ad_templates, adapted_class, CMP_CLASS);
-  InitAdaptedClass(blob1, &line_stats1, CMP_CLASS, adapted_class, ad_templates);
-  fcount = GetAdaptiveFeatures (blob2, &line_stats2,
-    int_features, &float_features);
-  if (fcount > 0) {
-    SetBaseLineMatch();
-    IntegerMatcher (ClassForClassId (ad_templates->Templates, CMP_CLASS),
-      AllProtosOn, AllConfigsOn, fcount, fcount,
-      int_features, 0, &int_result, testedit_match_debug);
-    FreeFeatureSet(float_features);
-    if (int_result.Rating < 0)
-      int_result.Rating = MAX_FLOAT32;
-  }
-
-  free_adapted_templates(ad_templates);
-  FreeBitVector(AllConfigsOn);
-  FreeBitVector(AllProtosOn);
-
-  return fcount > 0 ? int_result.Rating * fcount : MAX_FLOAT32;
-}
-}  // namespace tesseract
--- a/ccmain/callnet.cpp
+++ b/ccmain/callnet.cpp
@ -1,93 +0,0 @@
-/**********************************************************************
- * File:        callnet.cpp  (Formerly callnet.c)
- * Description: Interface to Neural Net matcher
- * Author:      Phil Cheatle
- * Created:     Wed Nov 18 10:35:00 GMT 1992
- *
- * (C) Copyright 1992, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#include "mfcpch.h"
-#include "errcode.h"
-//#include "nmatch.h"
-#include "globals.h"
-
-#define OUTPUT_NODES 94
-
-const ERRCODE NETINIT = "NN init error";
-
-//extern "C"
-//{
-//extern char*                          demodir;                                        /* where program lives */
-
-void init_net() {  /* Initialise net */
-#ifdef ASPIRIN_INCLUDED
-  char wts_filename[256];
-
-  if (nmatch_init_network () != 0) {
-    NETINIT.error ("Init_net", EXIT, "Errcode %s", nmatch_error_string ());
-  }
-  strcpy(wts_filename, demodir);
-  strcat (wts_filename, "tessdata/netwts");
-
-  if (nmatch_load_network (wts_filename) != 0) {
-    NETINIT.error ("Init_net", EXIT, "Weights failed, Errcode %s",
-      nmatch_error_string ());
-  }
-#endif
-}
-
-
-void callnet(  /* Apply image to net */
-             float *input_vector,
-             char *top,
-             float *top_score,
-             char *next,
-             float *next_score) {
-#ifdef ASPIRIN_INCLUDED
-  float *output_vector;
-  int i;
-  int max_out_i = 0;
-  int next_max_out_i = 0;
-  float max_out = -9;
-  float next_max_out = -9;
-  
-  nmatch_set_input(input_vector);
-  nmatch_propagate_forward();
-  output_vector = nmatch_get_output ();
-  
-  /* Now find top two choices */
-
-  for (i = 0; i < OUTPUT_NODES; i++) {
-    if (output_vector[i] > max_out) {
-      next_max_out = max_out;
-      max_out = output_vector[i];
-      next_max_out_i = max_out_i;
-      max_out_i = i;
-    }
-    else {
-      if (output_vector[i] > next_max_out) {
-        next_max_out = output_vector[i];
-        next_max_out_i = i;
-      }
-    }
-  }
-  *top = max_out_i + '!';
-  *next = next_max_out_i + '!';
-  *top_score = max_out;
-  *next_score = next_max_out;
-#endif
-}
-
-
-//};
--- a/ccmain/charcut.cpp
+++ b/ccmain/charcut.cpp
@ -18,12 +18,12 @@
 **********************************************************************/

 #include "mfcpch.h"
-#include          "charcut.h"
-#include          "imgs.h"
-#include          "svshowim.h"
-//#include          "evnts.h"
-#include          "notdll.h"
-#include	  "scrollview.h"
+#include "charcut.h"
+#include "imgs.h"
+#include "scrollview.h"
+#include "svshowim.h"
+#include "notdll.h"
+#include "helpers.h"

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
@ -35,10 +35,6 @@
 #define BUG_OFFSET 1
 #define EXTERN

-EXTERN INT_VAR (pix_word_margin, 3, "How far outside word BB to grow");
-
-extern IMAGE page_image;
-
 ELISTIZE (PIXROW)
 /*************************************************************************
 * PIXROW::PIXROW()
@ -58,8 +54,8 @@ PIXROW::PIXROW(inT16 pos, inT16 count, PBLOB *blob) {

  row_offset = pos;
  row_count = count;
-  min = (inT16 *) alloc_mem (count * sizeof (inT16));
-  max = (inT16 *) alloc_mem (count * sizeof (inT16));
+  min = (inT16 *) alloc_mem(count * sizeof(inT16));
+  max = (inT16 *) alloc_mem(count * sizeof(inT16));
  outline_list = blob->out_list ();
  outline_it.set_to_list (outline_list);

@ -67,27 +63,21 @@ PIXROW::PIXROW(inT16 pos, inT16 count, PBLOB *blob) {
    min[i] = MAX_INT16 - 1;
    max[i] = -MAX_INT16 + 1;
    y_coord = row_offset + i + 0.5;
-    for (outline_it.mark_cycle_pt ();
-    !outline_it.cycled_list (); outline_it.forward ()) {
-      pts_list = outline_it.data ()->polypts ();
-      pts_it.set_to_list (pts_list);
-      for (pts_it.mark_cycle_pt ();
-      !pts_it.cycled_list (); pts_it.forward ()) {
-        pt = pts_it.data ()->pos;
-        vec = pts_it.data ()->vec;
-        if ((vec.y () != 0) &&
-          (((pt.y () <= y_coord) && (pt.y () + vec.y () >= y_coord))
-          || ((pt.y () >= y_coord)
-        && (pt.y () + vec.y () <= y_coord)))) {
+    for (outline_it.mark_cycle_pt();
+         !outline_it.cycled_list(); outline_it.forward()) {
+      pts_list = outline_it.data()->polypts();
+      pts_it.set_to_list(pts_list);
+      for (pts_it.mark_cycle_pt(); !pts_it.cycled_list(); pts_it.forward()) {
+        pt = pts_it.data()->pos;
+        vec = pts_it.data()->vec;
+        if ((vec.y() != 0) &&
+            (((pt.y() <= y_coord) && (pt.y() + vec.y() >= y_coord))
+             || ((pt.y() >= y_coord) && (pt.y() + vec.y() <= y_coord)))) {
          /* The segment crosses y_coord so find x-point and check for min/max. */
-          x_coord = (inT16) floor ((y_coord -
-            pt.y ()) * vec.x () / vec.y () +
-            pt.x () + 0.5);
-          if (x_coord < min[i])
-            min[i] = x_coord;
-          x_coord--;             //to get pix to left of line
-          if (x_coord > max[i])
-            max[i] = x_coord;
+          x_coord = (inT16) floor((y_coord - pt.y()) * vec.x() / vec.y() +
+                                  pt.x() + 0.5);
+          // Use x_coord - 1 as the max candidate to get the pixel to the left of the line.
+          UpdateRange<inT16>(x_coord, x_coord - 1, &min[i], &max[i]);
        }
      }
    }
@ -154,20 +144,14 @@ TBOX PIXROW::bounding_box() const {
  for (i = 0; i < row_count; i++) {
    y_coord = row_offset + i;
    if (min[i] <= max[i]) {
-      if (y_coord < min_y)
-        min_y = y_coord;
-      if (y_coord + 1 > max_y)
-        max_y = y_coord + 1;
-      if (min[i] < min_x)
-        min_x = min[i];
-      if (max[i] + 1 > max_x)
-        max_x = max[i] + 1;
+      UpdateRange<inT16>(y_coord, y_coord + 1, &min_y, &max_y);
+      UpdateRange<inT16>(min[i], max[i] + 1, &min_x, &max_x);
    }
  }
  if (min_x > max_x || min_y > max_y)
-    return TBOX ();
+    return TBOX();
  else
-    return TBOX (ICOORD (min_x, min_y), ICOORD (max_x, max_y));
+    return TBOX(ICOORD(min_x, min_y), ICOORD(max_x, max_y));
 }


@ -479,10 +463,10 @@ void char_clip_word(                            //

  /* Define region for max pixrow expansion */
  pix_box = word_box;
-  pix_box.move_bottom_edge (-pix_word_margin);
-  pix_box.move_top_edge (pix_word_margin);
-  pix_box.move_left_edge (-pix_word_margin);
-  pix_box.move_right_edge (pix_word_margin);
+  pix_box.move_bottom_edge (-kPixWordMargin);
+  pix_box.move_top_edge (kPixWordMargin);
+  pix_box.move_left_edge (-kPixWordMargin);
+  pix_box.move_right_edge (kPixWordMargin);
  pix_box -= TBOX (ICOORD (0, 0 + BUG_OFFSET),
    ICOORD (bin_image.get_xsize (),
    bin_image.get_ysize () - BUG_OFFSET));
--- a/ccmain/charcut.h
+++ b/ccmain/charcut.h
@ -16,14 +16,6 @@
 ** limitations under the License.
 *
 **********************************************************************/
-/**
- * @file     charcut.h  
- * @note     Formerly charclip.h
- * @brief    Code for character clipping
- * @author   Phil Cheatle
- * @date     Created Wed Nov 11 08:35:15 GMT 1992
- *
- */

 #ifndef           CHARCUT_H
 #define           CHARCUT_H
@ -44,6 +36,8 @@ class ScrollView;
 * the row defined by min[0] and max[0] is held in row_offset.
 */

+const int kPixWordMargin = 3;  // how far outside word BB to grow
+
 class PIXROW:public ELIST_LINK
 {
  public:
@ -126,11 +120,6 @@ class PIXROW:public ELIST_LINK
 };

 ELISTIZEH (PIXROW)
-extern INT_VAR_H (pix_word_margin, 3, "How far outside word BB to grow");
-extern BOOL_VAR_H (show_char_clipping, TRUE, "Show clip image window?");
-extern INT_VAR_H (net_image_width, 40, "NN input image width");
-extern INT_VAR_H (net_image_height, 36, "NN input image height");
-extern INT_VAR_H (net_image_x_height, 22, "NN input image x_height");
 void char_clip_word(
                    WERD *word,                 ///< word to be processed
                    IMAGE &bin_image,           ///< whole image
--- a/ccmain/charsample.cpp
+++ b/ccmain/charsample.cpp
@ -1,709 +0,0 @@
-/**********************************************************************
- * File:        charsample.cpp  (Formerly charsample.c)
- * Description: Class to contain character samples and match scores
- *					to be used for adaption
- * Author:      Chris Newton
- * Created:     Thu Oct  7 13:40:37 BST 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#include "mfcpch.h"
-
-#include <stdio.h>
-#include          <ctype.h>
-#include          <math.h>
-#ifdef __UNIX__
-#include <assert.h>
-#include          <unistd.h>
-#endif
-#include "memry.h"
-#include          "tessvars.h"
-#include "statistc.h"
-#include          "charsample.h"
-#include "paircmp.h"
-#include "matmatch.h"
-#include          "adaptions.h"
-#include          "secname.h"
-#include          "notdll.h"
-#include          "tesseractclass.h"
-
-// Include automatically generated configuration file if running autoconf.
-#ifdef HAVE_CONFIG_H
-#include "config_auto.h"
-#endif
-
-extern inT32 demo_word;          // Hack for demos
-
-ELISTIZE (CHAR_SAMPLE) ELISTIZE (CHAR_SAMPLES) CHAR_SAMPLE::CHAR_SAMPLE () {
-  sample_blob = NULL;
-  sample_denorm = NULL;
-  sample_image = NULL;
-  ch = '\0';
-  n_samples_matched = 0;
-  total_match_scores = 0.0;
-  sumsq_match_scores = 0.0;
-}
-
-
-CHAR_SAMPLE::CHAR_SAMPLE(PBLOB *blob, DENORM *denorm, char c) {
-  sample_blob = blob;
-  sample_denorm = denorm;
-  sample_image = NULL;
-  ch = c;
-  n_samples_matched = 0;
-  total_match_scores = 0.0;
-  sumsq_match_scores = 0.0;
-}
-
-
-CHAR_SAMPLE::CHAR_SAMPLE(IMAGE *image, char c) {
-  sample_blob = NULL;
-  sample_denorm = NULL;
-  sample_image = image;
-  ch = c;
-  n_samples_matched = 0;
-  total_match_scores = 0.0;
-  sumsq_match_scores = 0.0;
-}
-
-
-float CHAR_SAMPLE::match_sample(  // Update match scores
-                                CHAR_SAMPLE *test_sample,
-                                BOOL8 updating,
-                                tesseract::Tesseract* tess) {
-  float score1;
-  float score2;
-  IMAGE *image = test_sample->image ();
-
-  if (sample_blob != NULL && test_sample->blob () != NULL) {
-    PBLOB *blob = test_sample->blob ();
-    DENORM *denorm = test_sample->denorm ();
-
-    score1 = tess->compare_bln_blobs (sample_blob, sample_denorm, blob, denorm);
-    score2 = tess->compare_bln_blobs (blob, denorm, sample_blob, sample_denorm);
-
-    score1 = (score1 > score2) ? score1 : score2;
-  }
-  else if (sample_image != NULL && image != NULL) {
-    CHAR_PROTO *sample = new CHAR_PROTO (this);
-
-    score1 = matrix_match (sample_image, image);
-    delete sample;
-  }
-  else
-    return BAD_SCORE;
-
-  if ((tessedit_use_best_sample || tessedit_cluster_debug) && updating) {
-    n_samples_matched++;
-    total_match_scores += score1;
-    sumsq_match_scores += score1 * score1;
-  }
-  return score1;
-}
-
-
-double CHAR_SAMPLE::mean_score() {
-  if (n_samples_matched > 0)
-    return (total_match_scores / n_samples_matched);
-  else
-    return BAD_SCORE;
-}
-
-
-double CHAR_SAMPLE::variance() {
-  double mean = mean_score ();
-
-  if (n_samples_matched > 0) {
-    return (sumsq_match_scores / n_samples_matched) - mean * mean;
-  }
-  else
-    return BAD_SCORE;
-}
-
-
-void CHAR_SAMPLE::print(FILE *f) {
-  if (!tessedit_cluster_debug)
-    return;
-
-  if (n_samples_matched > 0)
-    fprintf (f,
-      "%c - sample matched against " INT32FORMAT
-      " blobs, mean: %f, var: %f\n", ch, n_samples_matched,
-      mean_score (), variance ());
-  else
-    fprintf (f, "No matches for this sample (%c)\n", ch);
-}
-
-
-void CHAR_SAMPLE::reset_match_statistics() {
-  n_samples_matched = 0;
-  total_match_scores = 0.0;
-  sumsq_match_scores = 0.0;
-}
-
-
-CHAR_SAMPLES::CHAR_SAMPLES() {
-  type = UNKNOWN;
-  samples.clear ();
-  ch = '\0';
-  best_sample = NULL;
-  proto = NULL;
-}
-
-
-CHAR_SAMPLES::CHAR_SAMPLES(CHAR_SAMPLE *sample) {
-  CHAR_SAMPLE_IT sample_it = &samples;
-
-  ASSERT_HOST (sample->image () != NULL || sample->blob () != NULL);
-
-  if (sample->image () != NULL)
-    type = IMAGE_CLUSTER;
-  else if (sample->blob () != NULL)
-    type = BLOB_CLUSTER;
-
-  samples.clear ();
-  sample_it.add_to_end (sample);
-  if (tessedit_mm_only_match_same_char)
-    ch = sample->character ();
-  else
-    ch = '\0';
-  best_sample = NULL;
-  proto = NULL;
-}
-
-
-void CHAR_SAMPLES::add_sample(CHAR_SAMPLE *sample, tesseract::Tesseract* tess) {
-  CHAR_SAMPLE_IT sample_it = &samples;
-
-  if (tessedit_use_best_sample || tessedit_cluster_debug)
-    for (sample_it.mark_cycle_pt ();
-  !sample_it.cycled_list (); sample_it.forward ()) {
-    sample_it.data ()->match_sample (sample, TRUE, tess);
-    sample->match_sample (sample_it.data (), TRUE, tess);
-  }
-
-  sample_it.add_to_end (sample);
-
-  if (tessedit_mm_use_prototypes && type == IMAGE_CLUSTER) {
-    if (samples.length () == tessedit_mm_prototype_min_size)
-      this->build_prototype ();
-    else if (samples.length () > tessedit_mm_prototype_min_size)
-      this->add_sample_to_prototype (sample);
-  }
-}
-
-
-void CHAR_SAMPLES::add_sample_to_prototype(CHAR_SAMPLE *sample) {
-  BOOL8 rebuild = FALSE;
-  inT32 new_xsize = proto->x_size ();
-  inT32 new_ysize = proto->y_size ();
-  inT32 sample_xsize = sample->image ()->get_xsize ();
-  inT32 sample_ysize = sample->image ()->get_ysize ();
-
-  if (sample_xsize > new_xsize) {
-    new_xsize = sample_xsize;
-    rebuild = TRUE;
-  }
-  if (sample_ysize > new_ysize) {
-    new_ysize = sample_ysize;
-    rebuild = TRUE;
-  }
-
-  if (rebuild)
-    proto->enlarge_prototype (new_xsize, new_ysize);
-
-  proto->add_sample (sample);
-}
-
-
-void CHAR_SAMPLES::build_prototype() {
-  CHAR_SAMPLE_IT sample_it = &samples;
-  CHAR_SAMPLE *sample;
-  inT32 proto_xsize = 0;
-  inT32 proto_ysize = 0;
-
-  if (type != IMAGE_CLUSTER
-    || samples.length () < tessedit_mm_prototype_min_size)
-    return;
-
-  for (sample_it.mark_cycle_pt ();
-  !sample_it.cycled_list (); sample_it.forward ()) {
-    sample = sample_it.data ();
-    if (sample->image ()->get_xsize () > proto_xsize)
-      proto_xsize = sample->image ()->get_xsize ();
-    if (sample->image ()->get_ysize () > proto_ysize)
-      proto_ysize = sample->image ()->get_ysize ();
-  }
-
-  proto = new CHAR_PROTO (proto_xsize, proto_ysize, 0, 0, '\0');
-
-  for (sample_it.mark_cycle_pt ();
-    !sample_it.cycled_list (); sample_it.forward ())
-  this->add_sample_to_prototype (sample_it.data ());
-
-}
-
-
-void CHAR_SAMPLES::find_best_sample() {
-  CHAR_SAMPLE_IT sample_it = &samples;
-  double score;
-  double best_score = MAX_INT32;
-
-  if (ch == '\0' || samples.length () < tessedit_mm_prototype_min_size)
-    return;
-
-  for (sample_it.mark_cycle_pt ();
-  !sample_it.cycled_list (); sample_it.forward ()) {
-    score = sample_it.data ()->mean_score ();
-    if (score < best_score) {
-      best_score = score;
-      best_sample = sample_it.data ();
-    }
-  }
-  #ifndef SECURE_NAMES
-  if (tessedit_cluster_debug) {
-    tprintf ("Best sample for this %c cluster:\n", ch);
-    best_sample->print (debug_fp);
-  }
-  #endif
-}
-
-
-float CHAR_SAMPLES::match_score(CHAR_SAMPLE *sample,
-                                tesseract::Tesseract* tess) {
-  if (tessedit_mm_only_match_same_char && sample->character () != ch)
-    return BAD_SCORE;
-
-  if (tessedit_use_best_sample && best_sample != NULL)
-    return best_sample->match_sample (sample, FALSE, tess);
-  else if ((tessedit_mm_use_prototypes
-    || tessedit_mm_adapt_using_prototypes) && proto != NULL)
-    return proto->match_sample (sample);
-  else
-    return this->nn_match_score (sample, tess);
-}
-
-
-float CHAR_SAMPLES::nn_match_score(CHAR_SAMPLE *sample,
-                                   tesseract::Tesseract* tess) {
-  CHAR_SAMPLE_IT sample_it = &samples;
-  float score;
-  float min_score = MAX_INT32;
-
-  for (sample_it.mark_cycle_pt ();
-  !sample_it.cycled_list (); sample_it.forward ()) {
-    score = sample_it.data ()->match_sample (sample, FALSE, tess);
-    if (score < min_score)
-      min_score = score;
-  }
-
-  return min_score;
-}
-
-
-void CHAR_SAMPLES::assign_to_char() {
-  STATS char_frequency(FIRST_CHAR, LAST_CHAR);
-  CHAR_SAMPLE_IT sample_it = &samples;
-  inT32 i;
-  inT32 max_index = 0;
-  inT32 max_freq = 0;
-
-  if (samples.length () == 0 || tessedit_mm_only_match_same_char)
-    return;
-
-  for (sample_it.mark_cycle_pt ();
-    !sample_it.cycled_list (); sample_it.forward ())
-  char_frequency.add ((inT32) sample_it.data ()->character (), 1);
-
-  for (i = FIRST_CHAR; i <= LAST_CHAR; i++)
-  if (char_frequency.pile_count (i) > max_freq) {
-    max_index = i;
-    max_freq = char_frequency.pile_count (i);
-  }
-
-  if (samples.length () >= tessedit_cluster_min_size
-    && max_freq > samples.length () * tessedit_cluster_accept_fraction)
-    ch = (char) max_index;
-}
-
-
-void CHAR_SAMPLES::print(FILE *f) {
-  CHAR_SAMPLE_IT sample_it = &samples;
-
-  fprintf (f, "Collected " INT32FORMAT " samples\n", samples.length ());
-
-  #ifndef SECURE_NAMES
-  if (tessedit_cluster_debug)
-    for (sample_it.mark_cycle_pt ();
-    !sample_it.cycled_list (); sample_it.forward ())
-  sample_it.data ()->print (f);
-
-  if (ch == '\0')
-    fprintf (f, "\nCluster not used for adaption\n");
-  else
-    fprintf (f, "\nCluster used to adapt to '%c's\n", ch);
-  #endif
-}
-
-
-CHAR_PROTO::CHAR_PROTO() {
-  xsize = 0;
-  ysize = 0;
-  ch = '\0';
-  nsamples = 0;
-  proto_data = NULL;
-  proto = NULL;
-}
-
-
-CHAR_PROTO::CHAR_PROTO(inT32 x_size,
-                       inT32 y_size,
-                       inT32 n_samples,
-                       float initial_value,
-                       char c) {
-  inT32 x;
-  inT32 y;
-
-  xsize = x_size;
-  ysize = y_size;
-  ch = c;
-  nsamples = n_samples;
-
-  ALLOC_2D_ARRAY(xsize, ysize, proto_data, proto, float);
-
-  for (y = 0; y < ysize; y++)
-    for (x = 0; x < xsize; x++)
-      proto[x][y] = initial_value;
-}
-
-
-CHAR_PROTO::CHAR_PROTO(CHAR_SAMPLE *sample) {
-  inT32 x;
-  inT32 y;
-  IMAGELINE imline_s;
-
-  if (sample->image () == NULL) {
-    xsize = 0;
-    ysize = 0;
-    ch = '\0';
-    nsamples = 0;
-    proto_data = NULL;
-    proto = NULL;
-  }
-  else {
-    ch = sample->character ();
-    xsize = sample->image ()->get_xsize ();
-    ysize = sample->image ()->get_ysize ();
-    nsamples = 1;
-
-    ALLOC_2D_ARRAY(xsize, ysize, proto_data, proto, float);
-
-    for (y = 0; y < ysize; y++) {
-      sample->image ()->fast_get_line (0, y, xsize, &imline_s);
-      for (x = 0; x < xsize; x++)
-        if (imline_s.pixels[x] == BINIM_WHITE)
-          proto[x][y] = 1.0;
-      else
-        proto[x][y] = -1.0;
-    }
-  }
-}
-
-
-CHAR_PROTO::~CHAR_PROTO () {
-  if (proto_data != NULL)
-    FREE_2D_ARRAY(proto_data, proto);
-}
-
-
-float CHAR_PROTO::match_sample(CHAR_SAMPLE *test_sample) {
-  CHAR_PROTO *test_proto;
-  float score;
-
-  if (test_sample->image () != NULL) {
-    test_proto = new CHAR_PROTO (test_sample);
-    if (xsize > test_proto->x_size ())
-      score = this->match (test_proto);
-    else {
-      demo_word = -demo_word;    // Flag different call
-      score = test_proto->match (this);
-    }
-  }
-  else
-    return BAD_SCORE;
-
-  delete test_proto;
-
-  return score;
-}
-
-
-float CHAR_PROTO::match(CHAR_PROTO *test_proto) {
-  inT32 xsize2 = test_proto->x_size ();
-  inT32 y_size;
-  inT32 y_size2;
-  inT32 x_offset;
-  inT32 y_offset;
-  inT32 x;
-  inT32 y;
-  CHAR_PROTO *match_proto;
-  float score;
-  float sum = 0.0;
-
-  ASSERT_HOST (xsize >= xsize2);
-
-  x_offset = (xsize - xsize2) / 2;
-
-  if (ysize < test_proto->y_size ()) {
-    y_size = test_proto->y_size ();
-    y_size2 = ysize;
-    y_offset = (y_size - y_size2) / 2;
-
-    match_proto = new CHAR_PROTO (xsize,
-      y_size,
-      nsamples * test_proto->n_samples (),
-      0, '\0');
-
-    for (y = 0; y < y_offset; y++) {
-      for (x = 0; x < xsize2; x++) {
-        match_proto->data ()[x + x_offset][y] =
-          test_proto->data ()[x][y] * nsamples;
-        sum += match_proto->data ()[x + x_offset][y];
-      }
-    }
-
-    for (y = y_offset + y_size2; y < y_size; y++) {
-      for (x = 0; x < xsize2; x++) {
-        match_proto->data ()[x + x_offset][y] =
-          test_proto->data ()[x][y] * nsamples;
-        sum += match_proto->data ()[x + x_offset][y];
-      }
-    }
-
-    for (y = y_offset; y < y_offset + y_size2; y++) {
-      for (x = 0; x < x_offset; x++) {
-        match_proto->data ()[x][y] = proto[x][y - y_offset] *
-          test_proto->n_samples ();
-        sum += match_proto->data ()[x][y];
-      }
-
-      for (x = x_offset + xsize2; x < xsize; x++) {
-        match_proto->data ()[x][y] = proto[x][y - y_offset] *
-          test_proto->n_samples ();
-        sum += match_proto->data ()[x][y];
-      }
-
-      for (x = x_offset; x < x_offset + xsize2; x++) {
-        match_proto->data ()[x][y] =
-          proto[x][y - y_offset] * test_proto->data ()[x - x_offset][y];
-        sum += match_proto->data ()[x][y];
-      }
-    }
-  }
-  else {
-    y_size = ysize;
-    y_size2 = test_proto->y_size ();
-    y_offset = (y_size - y_size2) / 2;
-
-    match_proto = new CHAR_PROTO (xsize,
-      y_size,
-      nsamples * test_proto->n_samples (),
-      0, '\0');
-
-    for (y = 0; y < y_offset; y++)
-    for (x = 0; x < xsize; x++) {
-      match_proto->data ()[x][y] =
-        proto[x][y] * test_proto->n_samples ();
-      sum += match_proto->data ()[x][y];
-    }
-
-    for (y = y_offset + y_size2; y < y_size; y++)
-    for (x = 0; x < xsize; x++) {
-      match_proto->data ()[x][y] =
-        proto[x][y] * test_proto->n_samples ();
-      sum += match_proto->data ()[x][y];
-    }
-
-    for (y = y_offset; y < y_offset + y_size2; y++) {
-      for (x = 0; x < x_offset; x++) {
-        match_proto->data ()[x][y] =
-          proto[x][y] * test_proto->n_samples ();
-        sum += match_proto->data ()[x][y];
-      }
-
-      for (x = x_offset + xsize2; x < xsize; x++) {
-        match_proto->data ()[x][y] =
-          proto[x][y] * test_proto->n_samples ();
-        sum += match_proto->data ()[x][y];
-      }
-
-      for (x = x_offset; x < x_offset + xsize2; x++) {
-        match_proto->data ()[x][y] = proto[x][y] *
-          test_proto->data ()[x - x_offset][y - y_offset];
-        sum += match_proto->data ()[x][y];
-      }
-    }
-  }
-
-  score = (1.0 - sum /
-    (xsize * y_size * nsamples * test_proto->n_samples ()));
-
-  if (tessedit_mm_debug) {
-    if (score < 0) {
-      tprintf ("Match score %f\n", score);
-      tprintf ("x: %d, y: %d, ns: %d, nt: %d, dx %d, dy: %d\n",
-        xsize, y_size, nsamples, test_proto->n_samples (),
-        x_offset, y_offset);
-      for (y = 0; y < y_size; y++) {
-        tprintf ("\n%d", y);
-        for (x = 0; x < xsize; x++)
-          tprintf ("\t%d", match_proto->data ()[x][y]);
-
-      }
-      tprintf ("\n");
-      fflush(debug_fp);
-    }
-  }
-
-#ifndef GRAPHICS_DISABLED
-  if (tessedit_display_mm) {
-    tprintf ("Match score %f\n", score);
-    display_images (this->make_image (),
-      test_proto->make_image (), match_proto->make_image ());
-  }
-  else if (demo_word != 0) {
-    if (demo_word > 0)
-      display_image (test_proto->make_image (), "Test sample",
-        300, 400, FALSE);
-    else
-      display_image (this->make_image (), "Test sample", 300, 400, FALSE);
-
-    display_image (match_proto->make_image (), "Best match",
-      700, 400, TRUE);
-  }
-#endif
-
-  delete match_proto;
-
-  return score;
-}
-
-
-void CHAR_PROTO::enlarge_prototype(inT32 new_xsize, inT32 new_ysize) {
-  float *old_proto_data = proto_data;
-  float **old_proto = proto;
-  inT32 old_xsize = xsize;
-  inT32 old_ysize = ysize;
-  inT32 x_offset;
-  inT32 y_offset;
-  inT32 x;
-  inT32 y;
-
-  ASSERT_HOST (new_xsize >= xsize && new_ysize >= ysize);
-
-  xsize = new_xsize;
-  ysize = new_ysize;
-  ALLOC_2D_ARRAY(xsize, ysize, proto_data, proto, float);
-  x_offset = (xsize - old_xsize) / 2;
-  y_offset = (ysize - old_ysize) / 2;
-
-  for (y = 0; y < y_offset; y++)
-    for (x = 0; x < xsize; x++)
-      proto[x][y] = nsamples;
-
-  for (y = y_offset + old_ysize; y < ysize; y++)
-    for (x = 0; x < xsize; x++)
-      proto[x][y] = nsamples;
-
-  for (y = y_offset; y < y_offset + old_ysize; y++) {
-    for (x = 0; x < x_offset; x++)
-      proto[x][y] = nsamples;
-
-    for (x = x_offset + old_xsize; x < xsize; x++)
-      proto[x][y] = nsamples;
-
-    for (x = x_offset; x < x_offset + old_xsize; x++)
-      proto[x][y] = old_proto[x - x_offset][y - y_offset];
-  }
-
-  FREE_2D_ARRAY(old_proto_data, old_proto);
-}
-
-
-void CHAR_PROTO::add_sample(CHAR_SAMPLE *sample) {
-  inT32 x_offset;
-  inT32 y_offset;
-  inT32 x;
-  inT32 y;
-  IMAGELINE imline_s;
-  inT32 sample_xsize = sample->image ()->get_xsize ();
-  inT32 sample_ysize = sample->image ()->get_ysize ();
-
-  x_offset = (xsize - sample_xsize) / 2;
-  y_offset = (ysize - sample_ysize) / 2;
-
-  ASSERT_HOST (x_offset >= 0 && y_offset >= 0);
-
-  for (y = 0; y < y_offset; y++)
-    for (x = 0; x < xsize; x++)
-      proto[x][y]++;             // Treat pixels outside the
-  // range as white
-  for (y = y_offset + sample_ysize; y < ysize; y++)
-    for (x = 0; x < xsize; x++)
-      proto[x][y]++;
-
-  for (y = y_offset; y < y_offset + sample_ysize; y++) {
-    sample->image ()->fast_get_line (0,
-      y - y_offset, sample_xsize, &imline_s);
-    for (x = x_offset; x < x_offset + sample_xsize; x++) {
-      if (imline_s.pixels[x - x_offset] == BINIM_WHITE)
-        proto[x][y]++;
-      else
-        proto[x][y]--;
-    }
-
-    for (x = 0; x < x_offset; x++)
-      proto[x][y]++;
-
-    for (x = x_offset + sample_xsize; x < xsize; x++)
-      proto[x][y]++;
-  }
-
-  nsamples++;
-}
-
-
-IMAGE *CHAR_PROTO::make_image() {
-  IMAGE *image;
-  IMAGELINE imline_p;
-  inT32 x;
-  inT32 y;
-
-  ASSERT_HOST (nsamples != 0);
-
-  image = new (IMAGE);
-  image->create (xsize, ysize, 8);
-
-  for (y = 0; y < ysize; y++) {
-    image->fast_get_line (0, y, xsize, &imline_p);
-
-    for (x = 0; x < xsize; x++) {
-      imline_p.pixels[x] = 128 +
-        (uinT8) ((proto[x][y] * 128.0) / (0.00001 + nsamples));
-    }
-
-    image->fast_put_line (0, y, xsize, &imline_p);
-  }
-  return image;
-}
--- a/ccmain/charsample.h
+++ b/ccmain/charsample.h
@ -1,214 +0,0 @@
-/**********************************************************************
- * File:        charsample.h  (Formerly charsample.h)
- * Description: Class to contain character samples and match scores
- *					to be used for adaption
- * Author:      Chris Newton
- * Created:     Thu Oct  7 13:40:37 BST 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef           CHARSAMPLE_H
-#define           CHARSAMPLE_H
-
-#include          "elst.h"
-#include          "pageres.h"
-#include          "memry.h"
-#include          "notdll.h"
-
-#define BAD_SCORE MAX_INT32
-#define FIRST_CHAR '!'
-#define LAST_CHAR  '~'
-
-namespace tesseract {
-  class Tesseract;  // Fwd decl.
-}
-
-enum ClusterType
-{ UNKNOWN, BLOB_CLUSTER, IMAGE_CLUSTER };
-
-class CHAR_SAMPLE;               //forward decl
-
-ELISTIZEH (CHAR_SAMPLE)
-class CHAR_SAMPLES;              //forward decl
-
-ELISTIZEH (CHAR_SAMPLES)
-class CHAR_PROTO;                //forward decl
-
-class CHAR_SAMPLE:public ELIST_LINK
-{
-  public:
-    CHAR_SAMPLE();  // empty constructor
-
-    CHAR_SAMPLE(  // simple constructor
-                PBLOB *blob,
-                DENORM *denorm,
-                char c
-               );
-
-    CHAR_SAMPLE(  // simple constructor
-                IMAGE *image,
-                char c
-               );
-
-    ~CHAR_SAMPLE () {
-      // We own the image, so it has to be deleted.
-      if (sample_image != NULL)
-        delete sample_image;
-    }
-
-    float match_sample(CHAR_SAMPLE *test_sample, BOOL8 updating,
-                       tesseract::Tesseract* tess);
-
-    inT32 n_matches() {
-      return n_samples_matched;
-    }
-
-    IMAGE *image() {
-      return sample_image;
-    }
-
-    PBLOB *blob() {
-      return sample_blob;
-    }
-
-    DENORM *denorm() {
-      return sample_denorm;
-    }
-
-    double mean_score();
-
-    double variance();
-
-    char character() {
-      return ch;
-    }
-
-    void print(FILE *f);
-
-    void reset_match_statistics();
-
-    NEWDELETE2 (CHAR_SAMPLE) private:
-    IMAGE * sample_image;
-    PBLOB *sample_blob;
-    DENORM *sample_denorm;
-    inT32 n_samples_matched;
-    double total_match_scores;
-    double sumsq_match_scores;
-    char ch;
-};
-
-class CHAR_SAMPLES:public ELIST_LINK
-{
-  public:
-    CHAR_SAMPLES();  //empty constructor
-
-    CHAR_SAMPLES(CHAR_SAMPLE *sample);
-
-    ~CHAR_SAMPLES () {           //destructor
-    }
-
-    inT32 n_samples() {
-      return samples.length ();
-    }
-
-    void add_sample(CHAR_SAMPLE *sample, tesseract::Tesseract*);
-
-    void build_prototype();
-
-    void rebuild_prototype(inT32 new_xsize, inT32 new_ysize);
-
-    void add_sample_to_prototype(CHAR_SAMPLE *sample);
-
-    CHAR_PROTO *prototype() {
-      return proto;
-    }
-
-    void find_best_sample();
-
-    float match_score(CHAR_SAMPLE *sample, tesseract::Tesseract* tess);
-
-    float nn_match_score(CHAR_SAMPLE *sample, tesseract::Tesseract* tess);
-
-    char character() {
-      return ch;
-    }
-
-    void assign_to_char();
-
-    void print(FILE *f);
-
-    NEWDELETE2 (CHAR_SAMPLES) private:
-    ClusterType type;
-    char ch;
-    CHAR_PROTO *proto;
-    CHAR_SAMPLE *best_sample;
-    CHAR_SAMPLE_LIST samples;
-};
-
-class CHAR_PROTO
-{
-  public:
-    CHAR_PROTO();  // empty constructor
-
-    CHAR_PROTO(inT32 x_size,
-               inT32 y_size,
-               inT32 n_samples,
-               float initial_value,
-               char c);
-
-    CHAR_PROTO(  // simple constructor
-               CHAR_SAMPLE *sample);
-
-    ~CHAR_PROTO ();
-
-    float match_sample(CHAR_SAMPLE *test_sample);
-
-    float match(CHAR_PROTO *test_proto);
-
-    inT32 n_samples() {
-      return nsamples;
-    }
-
-    inT32 x_size() {
-      return xsize;
-    }
-
-    inT32 y_size() {
-      return ysize;
-    }
-
-    float **data() {
-      return proto;
-    }
-    char character() {
-      return ch;
-    }
-
-    void enlarge_prototype(inT32 new_xsize, inT32 new_ysize);
-
-    void add_sample(CHAR_SAMPLE *sample);
-
-    IMAGE *make_image();
-
-    void print(FILE *f);
-
-    NEWDELETE2 (CHAR_PROTO) private:
-    inT32 xsize;
-    inT32 ysize;
-    float *proto_data;
-    float **proto;
-    inT32 nsamples;
-    char ch;
-};
-#endif
--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
--- a/ccmain/control.h
+++ b/ccmain/control.h
@ -25,16 +25,11 @@
 #ifndef           CONTROL_H
 #define           CONTROL_H

-#include          "varable.h"
+#include          "params.h"
 #include          "ocrblock.h"
-//#include                                      "epapdest.h"
 #include          "ratngs.h"
 #include          "statistc.h"
-//#include                                      "epapconv.h"
-#include          "ocrshell.h"
 #include          "pageres.h"
-//TODO (wanke) why does the app. path have to be so weird here?
-#include          "charsample.h"
 #include          "notdll.h"

 enum ACCEPTABLE_WERD_TYPE
@ -49,129 +44,12 @@ enum ACCEPTABLE_WERD_TYPE

 typedef BOOL8 (*BLOB_REJECTOR) (PBLOB *, BLOB_CHOICE_IT *, void *);

-extern INT_VAR_H (tessedit_single_match, FALSE, "Top choice only from CP");
-//extern BOOL_VAR_H(tessedit_small_match,FALSE,"Use small matrix matcher");
-extern BOOL_VAR_H (tessedit_print_text, FALSE, "Write text to stdout");
-extern BOOL_VAR_H (tessedit_draw_words, FALSE, "Draw source words");
-extern BOOL_VAR_H (tessedit_draw_outwords, FALSE, "Draw output words");
-extern BOOL_VAR_H (tessedit_training_wiseowl, FALSE,
-"Call WO to learn blobs");
-extern BOOL_VAR_H (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
-extern BOOL_VAR_H (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
-extern BOOL_VAR_H (tessedit_dump_choices, FALSE, "Dump char choices");
-extern BOOL_VAR_H (tessedit_fix_fuzzy_spaces, TRUE,
-"Try to improve fuzzy spaces");
-extern BOOL_VAR_H (tessedit_unrej_any_wd, FALSE,
-"Dont bother with word plausibility");
-extern BOOL_VAR_H (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
-extern BOOL_VAR_H (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
-extern BOOL_VAR_H (tessedit_reject_suspect_fullstops, FALSE,
-"Reject suspect fullstops");
-extern BOOL_VAR_H (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
-extern BOOL_VAR_H (tessedit_cluster_adaption_on, TRUE,
-"Do our own adaption - ems only");
-extern BOOL_VAR_H (tessedit_enable_doc_dict, TRUE,
-"Add words to the document dictionary");
-extern BOOL_VAR_H (word_occ_first, FALSE, "Do word occ before re-est xht");
-extern BOOL_VAR_H (tessedit_xht_fiddles_on_done_wds, TRUE,
-"Apply xht fix up even if done");
-extern BOOL_VAR_H (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
-"Apply xht fix up even in no rejects");
-extern INT_VAR_H (x_ht_check_word_occ, 2, "Check Char Block occupancy");
-extern INT_VAR_H (x_ht_stringency, 1, "How many confirmed a/n to accept?");
-extern BOOL_VAR_H (x_ht_quality_check, TRUE, "Dont allow worse quality");
-extern BOOL_VAR_H (tessedit_debug_block_rejection, FALSE,
-"Block and Row stats");
-extern INT_VAR_H (debug_x_ht_level, 0, "Reestimate debug");
-extern BOOL_VAR_H (rej_use_xht, TRUE, "Individual rejection control");
-extern BOOL_VAR_H (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
-extern STRING_VAR_H (chs_leading_punct, "('`\"", "Leading punctuation");
-extern
-STRING_VAR_H (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
-extern STRING_VAR_H (chs_trailing_punct2, ")'`\"",
-"2nd Trailing punctuation");
-extern double_VAR_H (quality_rej_pc, 0.08,
-"good_quality_doc lte rejection limit");
-extern double_VAR_H (quality_blob_pc, 0.0,
-"good_quality_doc gte good blobs limit");
-extern double_VAR_H (quality_outline_pc, 1.0,
-"good_quality_doc lte outline error limit");
-extern double_VAR_H (quality_char_pc, 0.95,
-"good_quality_doc gte good char limit");
-extern INT_VAR_H (quality_min_initial_alphas_reqd, 2,
-"alphas in a good word");
-extern BOOL_VAR_H (tessedit_tess_adapt_to_rejmap, FALSE,
-"Use reject map to control Tesseract adaption");
-extern INT_VAR_H (tessedit_tess_adaption_mode, 3,
-"Adaptation decision algorithm for tess");
-extern INT_VAR_H (tessedit_em_adaption_mode, 62,
-"Adaptation decision algorithm for ems matrix matcher");
-extern BOOL_VAR_H (tessedit_cluster_adapt_after_pass1, FALSE,
-"Adapt using clusterer after pass 1");
-extern BOOL_VAR_H (tessedit_cluster_adapt_after_pass2, FALSE,
-"Adapt using clusterer after pass 1");
-extern BOOL_VAR_H (tessedit_cluster_adapt_after_pass3, FALSE,
-"Adapt using clusterer after pass 1");
-extern BOOL_VAR_H (tessedit_cluster_adapt_before_pass1, FALSE,
-"Adapt using clusterer before Tess adaping during pass 1");
-extern INT_VAR_H (tessedit_cluster_adaption_mode, 0,
-"Adaptation decision algorithm for matrix matcher");
-extern BOOL_VAR_H (tessedit_adaption_debug, FALSE,
-"Generate and print debug information for adaption");
-extern BOOL_VAR_H (tessedit_minimal_rej_pass1, FALSE,
-"Do minimal rejection on pass 1 output");
-extern BOOL_VAR_H (tessedit_test_adaption, FALSE,
-"Test adaption criteria");
-extern BOOL_VAR_H (tessedit_global_adaption, FALSE,
-"Adapt to all docs over time");
-extern BOOL_VAR_H (tessedit_matcher_log, FALSE, "Log matcher activity");
-extern INT_VAR_H (tessedit_test_adaption_mode, 3,
-"Adaptation decision algorithm for tess");
-extern BOOL_VAR_H (test_pt, FALSE, "Test for point");
-extern double_VAR_H (test_pt_x, 99999.99, "xcoord");
-extern double_VAR_H (test_pt_y, 99999.99, "ycoord");
-extern BOOL_VAR_H(save_best_choices, FALSE,
-                  "Save the results of the recognition step"
-                  " (blob_choices) within the corresponding WERD_CHOICE");
-
-/*
-void classify_word_pass1(                 //recog one word
-                         WERD_RES *word,  //word to do
-                         ROW *row,
-                         BOOL8 cluster_adapt,
-                         CHAR_SAMPLES_LIST *char_clusters,
-                         CHAR_SAMPLE_LIST *chars_waiting);
-*/
-                                 //word to do
-void classify_word_pass2(WERD_RES *word, ROW *row);
-/**
- * recognize one word
- * @param word word to do
- */
-void match_word_pass2(
-                      WERD_RES *word,
-                      ROW *row,
-                      float x_height);
-/**
- * crunch double hyphens
- * @param choice string to fix
- * @param word word to do
- * @param blob_choices char choices
- */
-void fix_hyphens(
-                 WERD_CHOICE *choice,
-                 WERD *word,
-                 BLOB_CHOICE_LIST_CLIST *blob_choices);
-
 /**
 * combine 2 blobs
 * @param blob1 dest blob
 * @param blob2 source blob
 */
-void merge_blobs(
-                 PBLOB *blob1,
-                 PBLOB *blob2
-                );
+void merge_blobs(PBLOB *blob1, PBLOB *blob2);
 /** dump chars in word */
 void choice_dump_tester(
                        PBLOB *,                   ///< blob
@ -181,20 +59,4 @@ void choice_dump_tester(
                        inT32 count,               ///< chars in text
                        BLOB_CHOICE_LIST *ratings  ///< list of results
                       );
-WERD *make_bln_copy(WERD *src_word, ROW *row, BLOCK* block,
-                    float x_height, DENORM *denorm);
-BOOL8 check_debug_pt(WERD_RES *word, int location);
-/** good chars in word */
-void add_in_one_row(
-                    ROW_RES *row,  ///< current row
-                    STATS *fonts,  ///< font stats
-                    inT8 *italic,  ///< output count
-                    inT8 *bold     ///< output count
-                   );
-/** good chars in word */
-void find_modal_font(
-                     STATS *fonts,     ///< font stats
-                     inT8 *font_out,   ///< output font
-                     inT8 *font_count  ///< output count
-                    );
 #endif
--- a/ccmain/cube_control.cpp
+++ b/ccmain/cube_control.cpp
@ -0,0 +1,465 @@
+/******************************************************************
+ * File:        cube_control.cpp
+ * Description: Tesseract class methods for invoking cube convolutional
+ *              neural network word recognizer.
+ * Author:      Raquel Romano
+ * Created:     September 2009
+ *
+ **********************************************************************/
+
+// Include automatically generated configuration file if running autoconf.
+#ifdef HAVE_CONFIG_H
+#include "config_auto.h"
+#endif
+
+#ifdef HAVE_LIBLEPT
+// Include leptonica library only if autoconf (or makefile etc) tell us to.
+#include "allheaders.h"
+#endif
+
+#include "cube_object.h"
+#include "cube_reco_context.h"
+#include "tesseractclass.h"
+#include "tesseract_cube_combiner.h"
+
+namespace tesseract {
+
+/**********************************************************************
+ * convert_prob_to_tess_certainty
+ *
+ * Linearly maps a probability onto tesseract's certainty scale:
+ * 1.0 maps to 0.0 and 0.0 maps to -20.0. The input is not clamped, so
+ * a probability outside [0.0, 1.0] maps outside [-20.0, 0.0].
+ **********************************************************************/
+static float convert_prob_to_tess_certainty(float prob) {
+  // Evaluated in double precision and narrowed to float on return.
+  const double distance_from_certain = prob - 1.0;
+  return distance_from_certain * 20.0;
+}
+
+/**********************************************************************
+ * char_box_to_tbox
+ *
+ * Builds a TBOX from a character bounding box given relative to the
+ * word. The box is translated horizontally by the word box's left edge
+ * minus x_offset (which accounts for any extra padding of the word
+ * box), and its vertical axis is flipped against
+ * word_box.bottom() + word_box.height().
+ *
+ **********************************************************************/
+TBOX char_box_to_tbox(Box* char_box, TBOX word_box, int x_offset) {
+  l_int32 box_x;
+  l_int32 box_y;
+  l_int32 box_w;
+  l_int32 box_h;
+  boxGetGeometry(char_box, &box_x, &box_y, &box_w, &box_h);
+
+  // Horizontal extent: shift by the word's left edge, less the padding.
+  const l_int32 left = box_x + (word_box.left() - x_offset);
+  const l_int32 right = left + box_w;
+
+  // Vertical extent: the character box's y grows in the opposite
+  // direction to the TBOX's, hence the subtraction.
+  const l_int32 top = word_box.bottom() + word_box.height() - box_y;
+  const l_int32 bottom = top - box_h;
+
+  return TBOX(left, bottom, right, top);
+}
+
+/**********************************************************************
+ * extract_cube_state
+ *
+ * Retrieves the CharSamp objects and the character bounding boxes of
+ * the best path stored in the CubeObject's beam search. Returns false
+ * when cube_obj is NULL, when either of its search objects cannot be
+ * retrieved, or when backtracking produces no samples. The caller
+ * should free both structures.
+ *
+**********************************************************************/
+bool Tesseract::extract_cube_state(CubeObject* cube_obj,
+                                   int* num_chars,
+                                   Boxa** char_boxes,
+                                   CharSamp*** char_samples) {
+  const bool verbose = cube_debug_level > 0;
+
+  if (cube_obj == NULL) {
+    if (verbose) {
+      tprintf("Cube WARNING (extract_cube_state): Invalid cube object "
+              "passed to extract_cube_state\n");
+    }
+    return false;
+  }
+
+  // The CubeObject accessors hand back whichever search object and
+  // beam search object (deslanted or regular) were used in the last
+  // call to Recognize().
+  CubeSearchObject* search_obj = cube_obj->SrchObj();
+  if (search_obj == NULL) {
+    if (verbose) {
+      tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
+              "cube's search object in extract_cube_state.\n");
+    }
+    return false;
+  }
+
+  BeamSearch *beam_search = cube_obj->BeamObj();
+  if (beam_search == NULL) {
+    if (verbose) {
+      tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
+              "cube's beam search object in extract_cube_state.\n");
+    }
+    return false;
+  }
+
+  // Walk back along the best beam search path to collect the
+  // character samples and their bounding boxes.
+  const int best_node = beam_search->BestPresortedNodeIndex();
+  *char_samples = beam_search->BackTrack(
+      search_obj, best_node, num_chars, NULL, char_boxes);
+  return *char_samples != NULL;
+}
+
+/**********************************************************************
+ * create_cube_box_word
+ *
+ * Fill the given BoxWord with boxes from character bounding
+ * boxes. The char_boxes have local coordinates w.r.t. the
+ * word bounding box, i.e., the left-most character bbox of each word
+ * has (0,0) left-top coord, but the BoxWord must be defined in page
+ * coordinates.
+ *
+ * Returns false, without inserting anything into box_word, when
+ * box_word is NULL, when char_boxes is NULL although num_chars > 0, or
+ * when any of the first num_chars boxes cannot be retrieved.
+ **********************************************************************/
+bool Tesseract::create_cube_box_word(Boxa *char_boxes,
+                                     int num_chars,
+                                     TBOX word_box,
+                                     BoxWord* box_word) {
+  if (!box_word) {
+    if (cube_debug_level > 0) {
+      tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n");
+    }
+    return false;
+  }
+  if (num_chars > 0 && !char_boxes) {
+    if (cube_debug_level > 0) {
+      tprintf("Cube WARNING (create_cube_box_word): Invalid char_boxes.\n");
+    }
+    return false;
+  }
+
+  // Find the x-coordinate of left-most char_box, which could be
+  // nonzero if the word image was padded before recognition took place.
+  // This pass also validates every box before box_word is modified.
+  int x_offset = -1;
+  for (int i = 0; i < num_chars; ++i) {
+    Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
+    if (!char_box) {
+      if (cube_debug_level > 0) {
+        tprintf("Cube WARNING (create_cube_box_word): Missing char box "
+                "%d of %d.\n", i, num_chars);
+      }
+      return false;
+    }
+    if (x_offset < 0 || char_box->x < x_offset) {
+      x_offset = char_box->x;
+    }
+    boxDestroy(&char_box);
+  }
+
+  for (int i = 0; i < num_chars; ++i) {
+    Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
+    if (!char_box)  // Not expected after the validation pass above.
+      return false;
+    TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset);
+    boxDestroy(&char_box);
+    box_word->InsertBox(i, tbox);
+  }
+  return true;
+}
+
+/**********************************************************************
+ * create_werd_choice
+ *
+ * Allocates a WERD_CHOICE holding the unichar ids that cube_char_set
+ * reports for the labels of char_samples. Samples whose label maps to
+ * INVALID_UNICHAR_ID are skipped. Each kept unichar also gets a
+ * one-element BLOB_CHOICE_LIST, and the resulting list of lists is
+ * attached to the WERD_CHOICE. All entries share the given certainty.
+ * The str argument is not used by this function.
+ *
+ **********************************************************************/
+static WERD_CHOICE *create_werd_choice(CharSamp** char_samples,
+                                       int num_chars,
+                                       const char* str,
+                                       float certainty,
+                                       const UNICHARSET &unicharset,
+                                       CharSet* cube_char_set) {
+  // Collect the unichar ids of all recognizable samples.
+  WERD_CHOICE *werd_choice = new WERD_CHOICE(num_chars);
+  ASSERT_HOST(werd_choice != NULL);
+  for (int sample_idx = 0; sample_idx < num_chars; ++sample_idx) {
+    const UNICHAR_ID uch_id =
+        cube_char_set->UnicharID(char_samples[sample_idx]->StrLabel());
+    if (uch_id == INVALID_UNICHAR_ID)
+      continue;
+    werd_choice->append_unichar_id_space_allocated(uch_id, 1, 0.0, certainty);
+  }
+
+  // Build one single-entry choice list per unichar in the word.
+  BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
+  BLOB_CHOICE_LIST_C_IT blob_choices_it;
+  blob_choices_it.set_to_list(blob_choices);
+  for (int pos = 0; pos < werd_choice->length(); ++pos) {
+    BLOB_CHOICE_LIST *single_choice_list = new BLOB_CHOICE_LIST();
+    BLOB_CHOICE_IT single_choice_it;
+    single_choice_it.set_to_list(single_choice_list);
+    single_choice_it.add_after_then_move(
+        new BLOB_CHOICE(werd_choice->unichar_id(pos),
+                        0.0, certainty, -1, -1, 0));
+    blob_choices_it.add_to_end(single_choice_list);
+  }
+
+  werd_choice->populate_unichars(unicharset);
+  werd_choice->set_certainty(certainty);
+  werd_choice->set_blob_choices(blob_choices);
+  return werd_choice;
+}
+
+/**********************************************************************
+ * init_cube_objects
+ *
+ * Instantiates the Tesseract object's CubeRecoContext and, when
+ * load_combiner is true, its TesseractCubeCombiner. Both members must
+ * be NULL on entry. Returns false if the cube context could not be
+ * created, or if load_combiner is true but the combiner could not be
+ * loaded; in the latter case both members are reset to NULL.
+ **********************************************************************/
+bool Tesseract::init_cube_objects(bool load_combiner,
+                                  TessdataManager *tessdata_manager) {
+  ASSERT_HOST(cube_cntxt_ == NULL);
+  ASSERT_HOST(tess_cube_combiner_ == NULL);
+
+  // The cube context is always required.
+  cube_cntxt_ = CubeRecoContext::Create(this, tessdata_manager, &unicharset);
+  if (cube_cntxt_ == NULL) {
+    if (cube_debug_level > 0) {
+      tprintf("Cube WARNING (Tesseract::init_cube_objects()): Failed to "
+              "instantiate CubeRecoContext\n");
+    }
+    return false;
+  }
+
+  if (!load_combiner)
+    return true;
+
+  // Create the combiner and load its net for the target language.
+  tess_cube_combiner_ = new tesseract::TesseractCubeCombiner(cube_cntxt_);
+  if (tess_cube_combiner_ != NULL && tess_cube_combiner_->LoadCombinerNet())
+    return true;
+
+  // Combiner failed: undo everything set up so far.
+  delete cube_cntxt_;
+  cube_cntxt_ = NULL;
+  delete tess_cube_combiner_;  // Deleting a NULL pointer is a no-op.
+  tess_cube_combiner_ = NULL;
+  if (cube_debug_level > 0)
+    tprintf("Cube ERROR (Failed to instantiate TesseractCubeCombiner\n");
+  return false;
+}
+
+/**********************************************************************
+ * run_cube
+ *
+ * Walks over every word in the page results and hands it to
+ * cube_recognize(), which optionally runs the tesseract-cube combiner.
+ * Words in blocks with a non-identity re_rotation are not passed to
+ * cube; such a word is deleted if it has no best_choice either.
+ * Does nothing if there is no binary image or page_res is NULL.
+ **********************************************************************/
+void Tesseract::run_cube(PAGE_RES *page_res) {
+  ASSERT_HOST(cube_cntxt_ != NULL);
+  if (!pix_binary_) {
+    if (cube_debug_level > 0)
+      tprintf("Tesseract::run_cube(): NULL binary image.\n");
+    return;
+  }
+  if (!page_res)
+    return;
+  PAGE_RES_IT page_res_it(page_res);
+  page_res_it.restart_page();
+
+  // Visit each word result in turn.
+  for (page_res_it.restart_page(); page_res_it.word() != NULL;
+       page_res_it.forward()) {
+    WERD_RES* word = page_res_it.word();
+    TBOX word_box = word->word->bounding_box();
+    const BLOCK* block = word->denorm.block();
+    const bool block_is_rotated =
+        block != NULL && (block->re_rotation().x() != 1.0f ||
+                          block->re_rotation().y() != 0.0f);
+    if (block_is_rotated) {
+      // TODO(rays) We have to rotate the bounding box to get the true coords.
+      // This will be achieved in the future via DENORM.
+      // In the mean time, cube can't process this word.
+      if (cube_debug_level > 0) {
+        tprintf("Cube can't process rotated word at:");
+        word_box.print();
+      }
+      if (word->best_choice == NULL)
+        page_res_it.DeleteCurrentWord();  // Nobody has an answer.
+    } else {
+      // The y coordinate passed to cube is measured from the image
+      // height, hence the subtraction of the word box top.
+      CubeObject *cube_obj = new tesseract::CubeObject(
+          cube_cntxt_, pix_binary_,
+          word_box.left(),
+          pix_binary_->h - word_box.top(),
+          word_box.width(), word_box.height());
+      cube_recognize(cube_obj, &page_res_it);
+      delete cube_obj;
+    }
+  }
+}
+
+/**********************************************************************
+ * cube_recognize
+ *
+ * Call cube on the current word, optionally run the tess-cube combiner, and
+ * modify the tesseract result if cube wins. If cube fails to run, or
+ * if tesseract wins, leave the tesseract result unchanged. If the
+ * combiner is not instantiated, always use cube's result.
+ *
+ * cube_obj     CubeObject constructed for the current word.
+ * page_res_it  Iterator positioned at the word to recognize. The word
+ *              is deleted from the page results when cube returns
+ *              nothing and the word has no best_choice either.
+ *
+ **********************************************************************/
+void Tesseract::cube_recognize(
+                               CubeObject *cube_obj,
+                               PAGE_RES_IT *page_res_it
+                               ) {
+  // Retrieve tesseract's data structure for the current word.
+  WERD_RES *tess_werd_res = page_res_it->word();
+  if (!tess_werd_res->best_choice && tess_cube_combiner_ != NULL) {
+    if (cube_debug_level > 0)
+      tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot run combiner "
+              "without a tess result.\n");
+    return;
+  }
+
+  // Skip cube entirely if combiner is present but tesseract's
+  // certainty is greater than threshold. The threshold must stay a
+  // float: storing it in an int truncates it towards zero, which
+  // shifts the comparison below.
+  const float combiner_run_thresh = convert_prob_to_tess_certainty(
+      cube_cntxt_->Params()->CombinerRunThresh());
+  if (tess_cube_combiner_ != NULL &&
+      (tess_werd_res->best_choice->certainty() >= combiner_run_thresh)) {
+    return;
+  }
+
+  // Run cube
+  WordAltList *cube_alt_list = cube_obj->RecognizeWord();
+  if (!cube_alt_list || cube_alt_list->AltCount() <= 0) {
+    if (cube_debug_level > 0) {
+      tprintf("Cube returned nothing for word at:");
+      tess_werd_res->word->bounding_box().print();
+    }
+    if (tess_werd_res->best_choice == NULL) {
+      // Nobody has recognized it, so pretend it doesn't exist.
+      if (cube_debug_level > 0) {
+        tprintf("Deleted word not recognized by cube and/or tesseract at:");
+        tess_werd_res->word->bounding_box().print();
+      }
+      page_res_it->DeleteCurrentWord();
+    }
+    return;
+  }
+
+  // At this point we *could* run the combiner and bail out if
+  // Tesseract wins, but that would require instantiating a new
+  // CubeObject to avoid losing the original recognition results
+  // (e.g., beam search lattice) stored with the CubeObject. Instead,
+  // we first extract the state we need from the current recognition
+  // and then reuse the CubeObject so that the combiner does not need
+  // to recompute the image's connected components, segmentation, etc.
+
+  // Get cube's best result and its probability, mapped to tesseract's
+  // certainty range
+  char_32 *cube_best_32 = cube_alt_list->Alt(0);
+  double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0));
+  float cube_certainty = convert_prob_to_tess_certainty(cube_prob);
+  string cube_best_str;
+  CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str);
+
+  // Retrieve Cube's character bounding boxes and CharSamples,
+  // corresponding to the most recent call to RecognizeWord().
+  Boxa *char_boxes = NULL;
+  CharSamp **char_samples = NULL;
+  int num_chars = 0;
+  if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples)) {
+    // Bail out on every failure, not only when debugging is enabled:
+    // the outputs are not usable past this point.
+    if (cube_debug_level > 0) {
+      tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract "
+              "cube state.\n");
+    }
+    // Release any boxes that were handed back before the failure.
+    if (char_boxes != NULL)
+      boxaDestroy(&char_boxes);
+    return;
+  }
+
+  // Convert cube's character bounding boxes to a BoxWord.
+  BoxWord cube_box_word;
+  TBOX tess_word_box = tess_werd_res->word->bounding_box();
+  if (tess_werd_res->denorm.block() != NULL)
+    tess_word_box.rotate(tess_werd_res->denorm.block()->re_rotation());
+  bool box_word_success = create_cube_box_word(char_boxes, num_chars,
+                                               tess_word_box,
+                                               &cube_box_word);
+  boxaDestroy(&char_boxes);
+  if (!box_word_success) {
+    if (cube_debug_level > 0) {
+      tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
+              "create cube BoxWord\n");
+    }
+    // Free the sample array exactly as the success path does below.
+    delete []char_samples;
+    return;
+  }
+
+  // Create cube's best choice.
+  WERD_CHOICE* cube_werd_choice = create_werd_choice(
+      char_samples, num_chars, cube_best_str.c_str(), cube_certainty,
+      unicharset, cube_cntxt_->CharacterSet());
+  delete []char_samples;
+
+  if (!cube_werd_choice) {
+    if (cube_debug_level > 0) {
+      tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
+              "create cube WERD_CHOICE\n");
+    }
+    return;
+  }
+
+  // Run combiner if present, now that we're free to reuse the CubeObject.
+  if (tess_cube_combiner_ != NULL) {
+    float combiner_prob = tess_cube_combiner_->CombineResults(tess_werd_res,
+                                                              cube_obj);
+    // If combiner probability is greater than tess/cube combiner
+    // classifier threshold, i.e. tesseract wins, then reset the WERD_RES
+    // certainty to the combiner certainty and return. Note that when
+    // tesseract and cube agree, the combiner probability is 1.0, so
+    // the final WERD_RES certainty will be maximized to 0.0.
+    if (combiner_prob >=
+        cube_cntxt_->Params()->CombinerClassifierThresh()) {
+      float combiner_certainty = convert_prob_to_tess_certainty(combiner_prob);
+      tess_werd_res->best_choice->set_certainty(combiner_certainty);
+      delete cube_werd_choice;
+      return;
+    }
+    if (cube_debug_level > 5) {
+      tprintf("Cube INFO: tesseract result replaced by cube: "
+              "%s -> %s\n",
+              tess_werd_res->best_choice->unichar_string().string(),
+              cube_best_str.c_str());
+    }
+  }
+
+  // Fill tesseract result's fields with cube results
+  fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(),
+                page_res_it);
+}
+
+/**********************************************************************
+ * fill_werd_res
+ *
+ * Overwrites the current word's tesseract result with cube's: the
+ * best choice, the box word (a copy of cube_box_word, clipped to the
+ * original word), the word text, the acceptance flags and the reject
+ * map. The previous best_choice and box_word are deleted, and the
+ * word result stores the cube_werd_choice pointer as its best_choice.
+ *
+ **********************************************************************/
+void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
+                              WERD_CHOICE* cube_werd_choice,
+                              const char* cube_best_str,
+                              PAGE_RES_IT *page_res_it) {
+  WERD_RES *word_res = page_res_it->word();
+
+  // Swap in cube's best choice for tesseract's.
+  delete word_res->best_choice;
+  word_res->best_choice = cube_werd_choice;
+
+  // Swap in a clipped copy of cube's character boxes.
+  delete word_res->box_word;
+  word_res->box_word = new BoxWord(cube_box_word);
+  word_res->box_word->ClipToOriginalWord(page_res_it->block()->block,
+                                         word_res->word);
+
+  // Text and acceptance state.
+  word_res->word->set_text(cube_best_str);
+  word_res->tess_failed = FALSE;
+  word_res->tess_accepted = tess_acceptable_word(word_res->best_choice,
+                                                 word_res->raw_choice);
+  // There is no output word, so we can't call AdaptableWord, but then it
+  // is probably not needed. Fudge the result with accepted.
+  word_res->tess_would_adapt = word_res->tess_accepted;
+
+  // Initialize the reject_map and set it to done, i.e., ignore all of
+  // tesseract's tests for rejection
+  word_res->reject_map.initialise(cube_werd_choice->length());
+  word_res->done = word_res->tess_accepted;
+
+  // Sanity checks: the per-character structures must agree in length.
+  const WERD_CHOICE *final_choice = word_res->best_choice;
+  ASSERT_HOST(final_choice->length() ==
+              final_choice->blob_choices()->length());
+  ASSERT_HOST(final_choice->length() ==
+              word_res->reject_map.length());
+}
+
+}  // namespace tesseract
--- a/ccmain/cube_reco_context.cpp
+++ b/ccmain/cube_reco_context.cpp
@ -0,0 +1,201 @@
+/**********************************************************************
+ * File:        cube_reco_context.cpp
+ * Description: Implementation of the Cube Recognition Context Class
+ * Author:    Ahmad Abdulkader
+ * Created:   2007
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <string>
+#include <limits.h>
+
+#include "cube_reco_context.h"
+
+#include "classifier_factory.h"
+#include "cube_tuning_params.h"
+#include "dict.h"
+#include "feature_bmp.h"
+#include "tessdatamanager.h"
+#include "tesseractclass.h"
+#include "tess_lang_model.h"
+
+namespace tesseract {
+
+// Builds an empty, not-yet-loaded CubeRecoContext bound to tess_obj.
+// The context does not take ownership of tess_obj; it only records the
+// pointer and makes use of various Tesseract components (language
+// model, flags, etc). The caller therefore has to keep tess_obj alive
+// for as long as the CubeRecoContext is in use.
+CubeRecoContext::CubeRecoContext(Tesseract *tess_obj)
+    : tess_obj_(tess_obj),
+      lang_(""),
+      loaded_(false),
+      lang_mod_(NULL),
+      params_(NULL),
+      char_classifier_(NULL),
+      char_set_(NULL),
+      word_size_model_(NULL),
+      char_bigrams_(NULL),
+      word_unigrams_(NULL),
+      noisy_input_(false),
+      size_normalization_(false) {
+}
+
+// Releases every component owned by the context and clears the
+// pointers. The Tesseract object is not owned and is left untouched.
+CubeRecoContext::~CubeRecoContext() {
+  // Deleting a NULL pointer is a no-op, so no guards are needed.
+  delete char_classifier_;
+  char_classifier_ = NULL;
+
+  delete word_size_model_;
+  word_size_model_ = NULL;
+
+  delete char_set_;
+  char_set_ = NULL;
+
+  delete char_bigrams_;
+  char_bigrams_ = NULL;
+
+  delete word_unigrams_;
+  word_unigrams_ = NULL;
+
+  delete lang_mod_;
+  lang_mod_ = NULL;
+
+  delete params_;
+  params_ = NULL;
+}
+
+// Copies the data directory recorded in the Tesseract object
+// (tess_obj_->datadir) into *path. Always returns true.
+bool CubeRecoContext::GetDataFilePath(string *path) const {
+  path->assign(tess_obj_->datadir.string());
+  return true;
+}
+
+// Loads all the components of the recognition context for the language
+// of the attached Tesseract object, using tessdata_manager to read
+// data from the [lang].traineddata file. If the
+// TESSDATA_CUBE_UNICHARSET component is present, Cube is instantiated
+// with the unicharset given in that component and the corresponding
+// dictionary (TESSDATA_CUBE_SYSTEM_DAWG), and Cube's unicharset is
+// mapped to Tesseract's. Otherwise Cube shares Tesseract's unicharset
+// (TESSDATA_UNICHARSET) and dawgs (TESSDATA_*_DAWG).
+//
+// The char set, language model, tuning params and char classifier are
+// mandatory: false is returned as soon as one of them fails to load.
+// The char bigrams, word unigrams and word size model are optional.
+// Components created before a failure stay assigned to their members.
+bool CubeRecoContext::Load(TessdataManager *tessdata_manager,
+                           UNICHARSET *tess_unicharset) {
+  ASSERT_HOST(tess_obj_ != NULL);
+
+  // Where the data files live.
+  string data_file_path;
+  if (!GetDataFilePath(&data_file_path)) {
+    fprintf(stderr, "Unable to get data file path\n");
+    return false;
+  }
+
+  // The language comes from the Tesseract object.
+  lang_ = tess_obj_->lang.string();
+
+  // Char set (mandatory).
+  char_set_ = CharSet::Create(tessdata_manager, tess_unicharset);
+  if (char_set_ == NULL) {
+    fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
+            "CharSet\n");
+    return false;
+  }
+
+  // Language model (mandatory); its params are read from
+  // <data path><lang>.cube.lm.
+  string lm_file_name = data_file_path + lang_ + ".cube.lm";
+  string lm_params;
+  if (!CubeUtils::ReadFileToString(lm_file_name, &lm_params)) {
+    fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read cube "
+            "language model params from %s\n", lm_file_name.c_str());
+    return false;
+  }
+  lang_mod_ = new TessLangModel(lm_params, data_file_path,
+                                tess_obj_->getDict().load_system_dawg,
+                                tessdata_manager, this);
+  if (lang_mod_ == NULL) {
+    fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to create "
+            "TessLangModel\n");
+    return false;
+  }
+
+  // Optional components: their results are not checked here.
+  char_bigrams_ = CharBigrams::Create(data_file_path, lang_);
+  word_unigrams_ = WordUnigrams::Create(data_file_path, lang_);
+  word_size_model_ = WordSizeModel::Create(data_file_path, lang_,
+                                           char_set_, Contextual());
+
+  // Tuning params (mandatory).
+  params_ = CubeTuningParams::Create(data_file_path, lang_);
+  if (params_ == NULL) {
+    fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read "
+            "CubeTuningParams from %s\n", data_file_path.c_str());
+    return false;
+  }
+
+  // Char classifier (mandatory).
+  char_classifier_ = CharClassifierFactory::Create(data_file_path, lang_,
+                                                   lang_mod_, char_set_,
+                                                   params_);
+  if (char_classifier_ == NULL) {
+    fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
+            "CharClassifierFactory object from %s\n", data_file_path.c_str());
+    return false;
+  }
+
+  loaded_ = true;
+  return true;
+}
+
+// Factory: allocates a CubeRecoContext for tess_obj and loads all of
+// its components. Returns the new context, or NULL (after deleting the
+// partially built object) if loading fails.
+CubeRecoContext * CubeRecoContext::Create(Tesseract *tess_obj,
+                                          TessdataManager *tessdata_manager,
+                                          UNICHARSET *tess_unicharset) {
+  CubeRecoContext *cntxt = new CubeRecoContext(tess_obj);
+  if (cntxt == NULL) {
+    fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to create "
+            "CubeRecoContext object\n");
+    return NULL;
+  }
+
+  // Load the necessary components; hand the object back on success.
+  if (cntxt->Load(tessdata_manager, tess_unicharset))
+    return cntxt;
+
+  fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to init "
+          "CubeRecoContext object\n");
+  delete cntxt;
+  return NULL;
+}
+}  // namespace tesseract
--- a/ccmain/cube_reco_context.h
+++ b/ccmain/cube_reco_context.h
@ -0,0 +1,155 @@
+/**********************************************************************
+ * File:        cube_reco_context.h
+ * Description: Declaration of the Cube Recognition Context Class
+ * Author:    Ahmad Abdulkader
+ * Created:   2007
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// The CubeRecoContext class abstracts the Cube OCR Engine. Typically a process
+// (or a thread) would create one CubeRecoContext object per language.
+// The CubeRecoContext object also provides methods to get and set the
+// different attributes of the Cube OCR Engine.
+
+#ifndef CUBE_RECO_CONTEXT_H
+#define CUBE_RECO_CONTEXT_H
+
+#include <string>
+#include "neural_net.h"
+#include "lang_model.h"
+#include "classifier_base.h"
+#include "feature_base.h"
+#include "char_set.h"
+#include "word_size_model.h"
+#include "char_bigrams.h"
+#include "word_unigrams.h"
+
+namespace tesseract {
+
+class Tesseract;
+class TessdataManager;
+
+// State of the Cube engine for one language: the character set, classifier,
+// language model and tuning parameters, the size/bigram/unigram models, and
+// two runtime flags. Create() is the factory that also loads the components.
+class CubeRecoContext {
+ public:
+  // Reading order enum type
+  enum ReadOrder {
+   L2R,
+   R2L
+  };
+
+  // Instantiate using a Tesseract object (see tess_obj_: not owned).
+  // explicit, so a Tesseract* never converts implicitly into a context.
+  explicit CubeRecoContext(Tesseract *tess_obj);
+
+  ~CubeRecoContext();
+
+  // accessor functions; each returns the stored member as-is.
+  // NOTE(review): Load() does not treat a missing size model, bigrams or
+  // unigrams object as an error, so SizeModel(), Bigrams() and
+  // WordUnigramsObj() may return NULL - confirm against their factories.
+  inline const string & Lang() const { return lang_; }
+  inline CharSet *CharacterSet() const { return char_set_; }
+  inline CharClassifier *Classifier() const { return char_classifier_; }
+  inline WordSizeModel *SizeModel() const { return word_size_model_; }
+  inline CharBigrams *Bigrams() const { return char_bigrams_; }
+  inline WordUnigrams *WordUnigramsObj() const { return word_unigrams_; }
+  inline TuningParams *Params() const { return params_; }
+  inline LangModel *LangMod() const { return lang_mod_; }
+
+  // Language traits. All of them are derived purely from the language code
+  // string held in lang_.
+
+  // the reading order of the language: R2L for "ara", L2R otherwise
+  inline ReadOrder ReadingOrder() const {
+    return ((lang_ == "ara") ? R2L : L2R);
+  }
+
+  // does the language support case (false for "ara" and "hin")
+  inline bool HasCase() const {
+    return (lang_ != "ara" && lang_ != "hin");
+  }
+
+  // is the script cursive (true only for "ara")
+  inline bool Cursive() const {
+    return (lang_ == "ara");
+  }
+
+  // does the language have italics (false for "ara", "hin" and "uk")
+  inline bool HasItalics() const {
+    return (lang_ != "ara" && lang_ != "hin" && lang_ != "uk");
+  }
+
+  // is the script contextual (true only for "ara")
+  inline bool Contextual() const {
+    return (lang_ == "ara");
+  }
+
+  // RecoContext runtime flags accessor functions.
+  // OOD/Numeric/WordList/Punc are forwarded to lang_mod_ and CaseSensitive
+  // to char_classifier_. Both pointers are dereferenced unchecked, so these
+  // calls need a context whose Load() succeeded.
+  inline bool SizeNormalization() const { return size_normalization_; }
+  inline bool NoisyInput() const { return noisy_input_; }
+  inline bool OOD() const { return lang_mod_->OOD(); }
+  inline bool Numeric() const { return lang_mod_->Numeric(); }
+  inline bool WordList() const { return lang_mod_->WordList(); }
+  inline bool Punc() const { return lang_mod_->Punc(); }
+  inline bool CaseSensitive() const {
+    return char_classifier_->CaseSensitive();
+  }
+
+  // Setters matching the flag accessors above; the same unchecked
+  // forwarding to lang_mod_ / char_classifier_ applies.
+  inline void SetSizeNormalization(bool size_normalization) {
+    size_normalization_ = size_normalization;
+  }
+  inline void SetNoisyInput(bool noisy_input) {
+    noisy_input_ = noisy_input;
+  }
+  inline void SetOOD(bool ood_enabled) {
+    lang_mod_->SetOOD(ood_enabled);
+  }
+  inline void SetNumeric(bool numeric_enabled) {
+    lang_mod_->SetNumeric(numeric_enabled);
+  }
+  inline void SetWordList(bool word_list_enabled) {
+    lang_mod_->SetWordList(word_list_enabled);
+  }
+  inline void SetPunc(bool punc_enabled) {
+    lang_mod_->SetPunc(punc_enabled);
+  }
+  inline void SetCaseSensitive(bool case_sensitive) {
+    char_classifier_->SetCaseSensitive(case_sensitive);
+  }
+  // The Tesseract object this context was constructed with.
+  inline tesseract::Tesseract *TesseractObject() const {
+    return tess_obj_;
+  }
+
+  // Returns the path of the data files
+  bool GetDataFilePath(string *path) const;
+  // Creates a CubeRecoContext object using a tesseract object. Data
+  // files are loaded via the tessdata_manager, and the tesseract
+  // unicharset is provided in order to map Cube's unicharset to
+  // Tesseract's in the case where the two unicharsets differ.
+  // Returns NULL if the context could not be created or loaded.
+  static CubeRecoContext *Create(Tesseract *tess_obj,
+                                 TessdataManager *tessdata_manager,
+                                 UNICHARSET *tess_unicharset);
+
+ private:
+  bool loaded_;
+  string lang_;
+  CharSet *char_set_;
+  WordSizeModel *word_size_model_;
+  CharClassifier *char_classifier_;
+  CharBigrams *char_bigrams_;
+  WordUnigrams *word_unigrams_;
+  TuningParams *params_;
+  LangModel *lang_mod_;
+  Tesseract *tess_obj_;  // CubeRecoContext does not own this pointer
+  bool size_normalization_;
+  bool noisy_input_;
+
+  // Loads and initializes all the necessary components of a
+  // CubeRecoContext. See .cpp for more details.
+  bool Load(TessdataManager *tessdata_manager,
+            UNICHARSET *tess_unicharset);
+};
+}
+
+#endif  // CUBE_RECO_CONTEXT_H
--- a/ccmain/docqual.cpp
+++ b/ccmain/docqual.cpp
@ -27,397 +27,109 @@
 #include          "tstruct.h"
 #include          "tfacep.h"
 #include          "reject.h"
+#include          "tesscallback.h"
 #include          "tessvars.h"
 #include          "genblob.h"
 #include          "secname.h"
 #include          "globals.h"
 #include          "tesseractclass.h"

-#define EXTERN
+namespace tesseract{

-EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
-EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
-"Non standard number of outlines");
-EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
-"Allow outline errs in unrejection?");
-EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
-"Reduce rejection on good docs");
-EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
-EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
-"%rej allowed before rej whole doc");
-EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
-"%rej allowed before rej whole block");
-EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
-"%rej allowed before rej whole row");
-EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
-"%of row rejects in whole word rejects which prevents whole row rejection");
-EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
-"Only rej partially rejected words in block rejection");
-EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
-"Only rej partially rejected words in row rejection");
-EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
-"Use word segmentation quality metric");
-EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
-"Use word segmentation quality metric");
-EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
-"Only preserve wds longer than this");
-EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
-"Apply row rejection to good docs");
-EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
-"rej good doc wd if more than this fraction rejected");
-EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
-"Reject all bad quality wds");
-EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
-EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
-"Output data to debug file");
-EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no checks");
-EXTERN double_VAR (quality_rowrej_pc, 1.1,
-"good_quality_doc gte good char limit");
+// Holds the word and counters for the callbacks, as we have no pre-bound args.
+struct DocQualCallbacks {
+  explicit DocQualCallbacks(WERD_RES* word0)  // keeps the pointer, no copy
+    : word(word0), match_count(0), accepted_match_count(0) {}

-EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
-"Mark v.bad words for tilde crunch");
-EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
-EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
-"Take out ~^ early?");
+  void CountMatchingBlobs(int index) {  // index is ignored; only counts calls
+    ++match_count;
+  }

-EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
-EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
-EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
-"crunch garbage cert lt this");
-EXTERN double_VAR (crunch_poor_garbage_rate, 60,
-"crunch garbage rating lt this");
+  void CountAcceptedBlobs(int index) {  // counts calls and accepted ones
+    if (word->reject_map[index].accepted())
+      ++accepted_match_count;
+    ++match_count;  // counted whether or not the entry is accepted
+  }

-EXTERN double_VAR (crunch_pot_poor_rate, 40,
-"POTENTIAL crunch rating lt this");
-EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
-"POTENTIAL crunch cert lt this");
-EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
+  void AcceptIfGoodQuality(int index) {  // updates word->reject_map[index]
+    if (word->reject_map[index].accept_if_good_quality())
+      word->reject_map[index].setrej_quality_accept();
+  }

-EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
-EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
-EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
-EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
-EXTERN double_VAR (crunch_del_min_width, 3.0,
-"Del if word width lt xht x this");
-EXTERN double_VAR (crunch_del_high_word, 1.5,
-"Del if word gt xht x this above bl");
-EXTERN double_VAR (crunch_del_low_word, 0.5,
-"Del if word gt xht x this below bl");
-EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");
-
-EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
-EXTERN INT_VAR (crunch_pot_indicators, 1,
-"How many potential indicators needed");
-
-EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
-"Dont touch sensible strings");
-EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
-EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
-"Dont pot crunch sensible strings");
-EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
-EXTERN INT_VAR (crunch_leave_lc_strings, 4,
-"Dont crunch words with long lower case strings");
-EXTERN INT_VAR (crunch_leave_uc_strings, 4,
-"Dont crunch words with long lower case strings");
-EXTERN INT_VAR (crunch_long_repetitions, 3,
-"Crunch words with long repetitions");
-
-EXTERN INT_VAR (crunch_debug, 0, "As it says");
-
-static BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2);
-static void unrej_good_chs(WERD_RES *word, ROW *row);
+  WERD_RES* word;  // word whose reject_map the callbacks use
+  inT16 match_count;  // incremented once per Count* callback call
+  inT16 accepted_match_count;  // calls whose reject_map entry was accepted
+};

 /*************************************************************************
 * word_blob_quality()
- * How many blobs in the outword are identical to those of the inword?
- * ASSUME blobs in both initial word and outword are in ascending order of
+ * How many blobs in the box_word are identical to those of the inword?
+ * ASSUME blobs in both initial word and box_word are in ascending order of
 * left hand blob edge.
 *************************************************************************/
-inT16 word_blob_quality(  //Blob seg changes
-                        WERD_RES *word,
-                        ROW *row) {
-  WERD *bln_word;                //BL norm init word
-  TWERD *tessword;               //tess format
-  WERD *init_word;               //BL norm init word
-  PBLOB_IT outword_it;
-  PBLOB_IT initial_it;
-  inT16 i;
-  inT16 init_blobs_left;
-  inT16 match_count = 0;
-  BOOL8 matched;
-  TBOX out_box;
-  PBLOB *test_blob;
-  DENORM denorm;
-  float bln_xht;
-
-  if (word->word->gblob_list ()->empty ())
+inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {  // row unused
+  if (word->bln_boxes == NULL ||
+      word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
     return 0;
-                                 //xht used for blnorm
-  bln_xht = bln_x_height / word->denorm.scale ();
-  bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
-  /*
-    NOTE: Need to convert to tess format and back again to ensure that the
-    same float -> int rounding of coords is done to source wd as out wd before
-    comparison
-  */
-  tessword = make_tess_word(bln_word, NULL);  // Convert word.
-  init_word = make_ed_word(tessword, bln_word);
-  delete bln_word;
-  delete_word(tessword);
-  if (init_word == NULL) {
-    // Conversion failed.
-    return 0;
-  }

-  initial_it.set_to_list(init_word->blob_list());
-  init_blobs_left = initial_it.length();
-  outword_it.set_to_list(word->outword->blob_list());
-
-  for (outword_it.mark_cycle_pt();
-       !outword_it.cycled_list(); outword_it.forward()) {
-    out_box = outword_it.data()->bounding_box();
-
-    // Skip any initial blobs LEFT of current outword blob.
-    while (!initial_it.at_last() &&
-           (initial_it.data()->bounding_box().left() < out_box.left())) {
-      initial_it.forward();
-      init_blobs_left--;
-    }
-
-    /* See if current outword blob matches any initial blob with the same left
-      coord. (Normally only one but possibly more - in unknown order) */
-
-    i = 0;
-    matched = FALSE;
-    do {
-      test_blob = initial_it.data_relative (i++);
-      matched = crude_match_blobs (test_blob, outword_it.data ());
-      if (matched)
-        match_count++;
-    }
-    while (!matched &&
-           (init_blobs_left - i > 0) &&
-           (i < 129) &&
-           !initial_it.at_last() &&
-           test_blob->bounding_box().left() == out_box.left());
-  }
-  delete init_word;
-  return match_count;
+  DocQualCallbacks cb(word);  // counts one match per callback invocation
+  word->bln_boxes->ProcessMatchedBlobs(
+      *word->rebuild_word,
+      NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs));
+  return cb.match_count;  // stays 0 if the callback was never run
 }

-
-/*************************************************************************
- * crude_match_blobs()
- * Check bounding boxes are the same and the number of outlines are the same.
- *************************************************************************/
-static BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
-  TBOX box1 = blob1->bounding_box();
-  TBOX box2 = blob2->bounding_box();
-
-  if (box1.contains(box2) &&
-      box2.contains(box1) &&
-      (blob1->out_list()->length() == blob1->out_list()->length()))
-    return TRUE;
-  else
-    return FALSE;
-}
-
-
-inT16 word_outline_errs(WERD_RES *word) {
-  PBLOB_IT outword_it;
+// Sums count_outline_errs() over the blobs of word->rebuild_word, pairing
+// blob number i with character i of the best choice's unichar_string().
+// Returns 0 when the word has no rebuild_word, in line with the NULL guards
+// used by word_blob_quality() and word_char_quality().
+inT16 Tesseract::word_outline_errs(WERD_RES *word) {
   inT16 i = 0;
   inT16 err_count = 0;

-  outword_it.set_to_list(word->outword->blob_list());
+  // rebuild_word can be NULL; treat that as an empty blob list rather than
+  // dereferencing it.
+  TBLOB* blob = word->rebuild_word != NULL ? word->rebuild_word->blobs : NULL;

-  for (outword_it.mark_cycle_pt();
-       !outword_it.cycled_list(); outword_it.forward()) {
+  for (; blob != NULL; blob = blob->next) {
     err_count += count_outline_errs(word->best_choice->unichar_string()[i],
-                                    outword_it.data()->out_list()->length());
+                                    blob->NumOutlines());
     i++;
   }
   return err_count;
 }

-
 /*************************************************************************
 * word_char_quality()
 * Combination of blob quality and outline quality - how many good chars are
 * there? - I.e chars which pass the blob AND outline tests.
 *************************************************************************/
-void word_char_quality(WERD_RES *word,
-                       ROW *row,
-                       inT16 *match_count,
-                       inT16 *accepted_match_count) {
-  WERD *bln_word;                // BL norm init word
-  TWERD *tessword;               // tess format
-  WERD *init_word;               // BL norm init word
-  PBLOB_IT outword_it;
-  PBLOB_IT initial_it;
-  inT16 i;
-  inT16 init_blobs_left;
-  BOOL8 matched;
-  TBOX out_box;
-  PBLOB *test_blob;
-  DENORM denorm;
-  float bln_xht;
-  inT16 j = 0;
-
-  *match_count = 0;
-  *accepted_match_count = 0;
-  if (word->word->gblob_list ()->empty ())
+// Runs ProcessMatchedBlobs over rebuild_word with CountAcceptedBlobs and
+// reports the resulting counters through *match_count and
+// *accepted_match_count. Both outputs are always written; they are 0 when
+// the word has no boxes or blobs to compare. row is not used.
+void Tesseract::word_char_quality(WERD_RES *word,
+                                  ROW *row,
+                                  inT16 *match_count,
+                                  inT16 *accepted_match_count) {
+  // Zero the outputs first so the early return below cannot leave the
+  // caller's variables uninitialized.
+  *match_count = 0;
+  *accepted_match_count = 0;
+  if (word->bln_boxes == NULL ||
+      word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
     return;

-                                 // xht used for blnorm
-  bln_xht = bln_x_height / word->denorm.scale();
-  bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
-  /*
-    NOTE: Need to convert to tess format and back again to ensure that the
-    same float -> int rounding of coords is done to source wd as out wd before
-    comparison
-  */
-  tessword = make_tess_word(bln_word, NULL);  // Convert word.
-  init_word = make_ed_word(tessword, bln_word);
-  delete bln_word;
-  delete_word(tessword);
-  if (init_word == NULL)
-    return;
-
-  initial_it.set_to_list(init_word->blob_list());
-  init_blobs_left = initial_it.length();
-  outword_it.set_to_list(word->outword->blob_list());
-
-  for (outword_it.mark_cycle_pt();
-  !outword_it.cycled_list(); outword_it.forward()) {
-    out_box = outword_it.data()->bounding_box();
-
-    /* Skip any initial blobs LEFT of current outword blob */
-    while (!initial_it.at_last() &&
-           (initial_it.data()->bounding_box().left() < out_box.left())) {
-           initial_it.forward();
-      init_blobs_left--;
-    }
-
-    /* See if current outword blob matches any initial blob with the same left
-      coord. (Normally only one but possibly more - in unknown order) */
-
-    i = 0;
-    matched = FALSE;
-    do {
-      test_blob = initial_it.data_relative(i++);
-      matched = crude_match_blobs(test_blob, outword_it.data());
-      if (matched &&
-        (count_outline_errs (word->best_choice->unichar_string()[j],
-        outword_it.data ()->out_list ()->length ())
-      == 0)) {
-        (*match_count)++;
-        if (word->reject_map[j].accepted ())
-          (*accepted_match_count)++;
-      }
-    }
-    while (!matched &&
-           (init_blobs_left - i > 0) &&
-           (i < 129) &&
-           !initial_it.at_last() &&
-           test_blob->bounding_box().left() == out_box.left());
-    j++;
-  }
-  delete init_word;
+  DocQualCallbacks cb(word);
+  word->bln_boxes->ProcessMatchedBlobs(
+      *word->rebuild_word,
+      NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
+  *match_count = cb.match_count;
+  *accepted_match_count = cb.accepted_match_count;
 }

-
 /*************************************************************************
 * unrej_good_chs()
 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
 *************************************************************************/
-static void unrej_good_chs(WERD_RES *word, ROW *row) {
-  WERD *bln_word;                // BL norm init word
-  TWERD *tessword;               // tess format
-  WERD *init_word;               // BL norm init word
-  PBLOB_IT outword_it;
-  PBLOB_IT initial_it;
-  inT16 i;
-  inT16 init_blobs_left;
-  BOOL8 matched;
-  TBOX out_box;
-  PBLOB *test_blob;
-  DENORM denorm;
-  float bln_xht;
-  inT16 j = 0;
-
-  if (word->word->gblob_list ()->empty ())
+void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {  // row is unused
+  if (word->bln_boxes == NULL ||
+      word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
     return;

-                                 // xht used for blnorm
-  bln_xht = bln_x_height / word->denorm.scale ();
-  bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm);
-  /*
-    NOTE: Need to convert to tess format and back again to ensure that the
-    same float -> int rounding of coords is done to source wd as out wd before
-    comparison
-  */
-  tessword = make_tess_word(bln_word, NULL);  // Convert word
-  init_word = make_ed_word(tessword, bln_word);
-  delete bln_word;
-  delete_word(tessword);
-  if (init_word == NULL)
-    return;
-
-  initial_it.set_to_list (init_word->blob_list ());
-  init_blobs_left = initial_it.length ();
-  outword_it.set_to_list (word->outword->blob_list ());
-
-  for (outword_it.mark_cycle_pt ();
-  !outword_it.cycled_list (); outword_it.forward ()) {
-    out_box = outword_it.data ()->bounding_box ();
-
-    /* Skip any initial blobs LEFT of current outword blob */
-    while (!initial_it.at_last () &&
-    (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
-      initial_it.forward ();
-      init_blobs_left--;
-    }
-
-    /* See if current outword blob matches any initial blob with the same left
-      coord. (Normally only one but possibly more - in unknown order) */
-
-    i = 0;
-    matched = FALSE;
-    do {
-      test_blob = initial_it.data_relative (i++);
-      matched = crude_match_blobs (test_blob, outword_it.data ());
-      if (matched &&
-        (word->reject_map[j].accept_if_good_quality ()) &&
-        (docqual_excuse_outline_errs ||
-        (count_outline_errs (word->best_choice->unichar_string()[j],
-        outword_it.data ()->out_list ()->
-        length ()) == 0)))
-        word->reject_map[j].setrej_quality_accept ();
-    }
-    while (!matched &&
-      (init_blobs_left - i > 0) &&
-      (i < 129) &&
-      !initial_it.at_last () &&
-      test_blob->bounding_box ().left () == out_box.left ());
-    j++;
-  }
-  delete init_word;
+  DocQualCallbacks cb(word);  // its counters are not read in this function
+  word->bln_boxes->ProcessMatchedBlobs(
+      *word->rebuild_word,
+      NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality));
 }

-
-void print_boxes(WERD *word) {
-  PBLOB_IT it;
-  TBOX box;
-
-  it.set_to_list (word->blob_list ());
-  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
-    box = it.data ()->bounding_box ();
-    box.print ();
-  }
-}
-
-
-inT16 count_outline_errs(char c, inT16 outline_count) {
+inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
  int expected_outline_count;

  if (STRING (outlines_odd).contains (c))
@ -429,20 +141,11 @@ inT16 count_outline_errs(char c, inT16 outline_count) {
  return abs (outline_count - expected_outline_count);
 }

-
-namespace tesseract {
 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
                                        BOOL8 good_quality_doc) {
  if ((tessedit_good_quality_unrej && good_quality_doc))
    unrej_good_quality_words(page_res_it);
  doc_and_block_rejection(page_res_it, good_quality_doc);
-
-  page_res_it.restart_page ();
-  while (page_res_it.word () != NULL) {
-    insert_rej_cblobs(page_res_it.word());
-    page_res_it.forward();
-  }
-
  if (unlv_tilde_crunching) {
    tilde_crunch(page_res_it);
    tilde_delete(page_res_it);
@ -542,7 +245,7 @@ void Tesseract::doc_and_block_rejection(  //reject big chunks

  BOOL8 rej_word;
  BOOL8 prev_word_rejected;
-  inT16 char_quality;
+  inT16 char_quality = 0;
  inT16 accepted_char_quality;

  if ((page_res_it.page_res->rej_count * 100.0 /
@ -833,10 +536,10 @@ void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
    page_res_it.forward ();
  }
 }
-}  // namespace tesseract


-BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
+BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
+                                      GARBAGE_LEVEL garbage_level) {
  float rating_per_ch;
  int adjusted_len;
  int crunch_mode = 0;
@ -873,7 +576,6 @@ BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
    return FALSE;
 }

-namespace tesseract {
 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
                                       GARBAGE_LEVEL garbage_level,
                                       BOOL8 ok_dict_word) {
@ -1022,36 +724,30 @@ void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  }
 }

+// Merge predicate used by merge_tess_fails: yields the space id when id1 and
+// id2 are both the space character, and INVALID_UNICHAR_ID for any other pair.
+UNICHAR_ID Tesseract::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
+  // Compare the two ids first so the unicharset lookup only happens for
+  // identical pairs.
+  if (id1 != id2)
+    return INVALID_UNICHAR_ID;
+  const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
+  return id1 == space_id ? id1 : INVALID_UNICHAR_ID;
+}
+
 // Change pairs of tess failures to a single one
 void Tesseract::merge_tess_fails(WERD_RES *word_res) {
-  PBLOB_IT blob_it;              //blobs
-  int len = word_res->best_choice->length();
-  bool modified = false;
-
-  ASSERT_HOST (word_res->reject_map.length () == len);
-  ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
-
-  UNICHAR_ID unichar_space = unicharset.unichar_to_id(" ");
-  blob_it = word_res->outword->blob_list ();
-  int i = 0;
-  while (i < word_res->best_choice->length()-1) {
-    if ((word_res->best_choice->unichar_id(i) == unichar_space) &&
-        (word_res->best_choice->unichar_id(i+1) == unichar_space)) {
-      modified = true;
-      word_res->best_choice->remove_unichar_id(i);
-      word_res->reject_map.remove_pos (i);
-      merge_blobs (blob_it.data_relative (1), blob_it.data ());
-      delete blob_it.extract (); //get rid of spare
-    } else {
-      i++;
-    }
-    blob_it.forward ();
-  }
-  len = word_res->best_choice->length();
-  ASSERT_HOST (word_res->reject_map.length () == len);
-  ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
-  if (modified) {
-    word_res->best_choice->populate_unichars(unicharset);
+  if (word_res->ConditionalBlobMerge(  // presumably merges BothSpaces pairs
+      unicharset,
+      NewPermanentTessCallback(this, &Tesseract::BothSpaces), NULL,
+      word_res->best_choice->blob_choices())) {
+    tprintf("Post:bc len=%d, rejmap=%d, boxword=%d, chopword=%d, rebuild=%d\n",
+            word_res->best_choice->length(),  // NOTE(review): unguarded log
+            word_res->reject_map.length(),
+            word_res->box_word->length(),
+            word_res->chopped_word->NumBlobs(),
+            word_res->rebuild_word->NumBlobs());
+    int len = word_res->best_choice->length();  // lengths must stay in sync
+    ASSERT_HOST(word_res->reject_map.length() == len);
+    ASSERT_HOST(word_res->box_word->length() == len);
   }
 }

@ -1252,7 +948,6 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
      return G_OK;
  }
 }
-}  // namespace tesseract


 /*************************************************************************
@ -1271,7 +966,7 @@ GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
 *          >75% of the outline BBs have longest dimension < 0.5xht
 *************************************************************************/

-CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
+CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) {
  int word_len = word->reject_map.length ();
  float rating_per_ch;
  TBOX box;                       //BB of word
@ -1286,13 +981,13 @@ CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
    return CR_DELETE;
  }

-  box = word->outword->bounding_box ();
-  if (box.height () < crunch_del_min_ht * bln_x_height) {
+  box = word->rebuild_word->bounding_box();
+  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
    delete_mode = 4;
    return CR_DELETE;
  }

-  if (noise_outlines (word->outword)) {
+  if (noise_outlines(word->rebuild_word)) {
    delete_mode = 5;
    return CR_DELETE;
  }
@ -1314,23 +1009,23 @@ CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
    return CR_LOOSE_SPACE;
  }

-  if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
+  if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    delete_mode = 9;
    return CR_LOOSE_SPACE;
  }

  if (box.bottom () >
-  bln_baseline_offset + crunch_del_high_word * bln_x_height) {
+  kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    delete_mode = 10;
    return CR_LOOSE_SPACE;
  }

-  if (box.height () > crunch_del_max_ht * bln_x_height) {
+  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
    delete_mode = 11;
    return CR_LOOSE_SPACE;
  }

-  if (box.width () < crunch_del_min_width * bln_x_height) {
+  if (box.width () < crunch_del_min_width * kBlnXHeight) {
    delete_mode = 3;
    return CR_LOOSE_SPACE;
  }
@ -1339,7 +1034,7 @@ CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) {
  return CR_NONE;
 }

-inT16 failure_count(WERD_RES *word) {
+inT16 Tesseract::failure_count(WERD_RES *word) {
  const char *str = word->best_choice->unichar_string().string();
  int tess_rejs = 0;

@ -1351,134 +1046,25 @@ inT16 failure_count(WERD_RES *word) {
 }


-BOOL8 noise_outlines(WERD *word) {
-  PBLOB_IT blob_it;
-  OUTLINE_IT outline_it;
-  TBOX box;                       //BB of outline
+BOOL8 Tesseract::noise_outlines(TWERD *word) {  // TRUE iff all outlines small
+  TBOX box;                       // BB of the current outline
   inT16 outline_count = 0;
   inT16 small_outline_count = 0;
   inT16 max_dimension;
-  float small_limit = bln_x_height * crunch_small_outlines_size;
+  float small_limit = kBlnXHeight * crunch_small_outlines_size;  // size cutoff

-  blob_it.set_to_list (word->blob_list ());
-  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
-    outline_it.set_to_list (blob_it.data ()->out_list ());
-    for (outline_it.mark_cycle_pt ();
-    !outline_it.cycled_list (); outline_it.forward ()) {
+  for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
+    for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
       outline_count++;
-      box = outline_it.data ()->bounding_box ();
-      if (box.height () > box.width ())
-        max_dimension = box.height ();
+      box = ol->bounding_box();
+      if (box.height() > box.width())
+        max_dimension = box.height();  // longer side of the box
       else
-        max_dimension = box.width ();
+        max_dimension = box.width();
       if (max_dimension < small_limit)
         small_outline_count++;
     }
   }
   return (small_outline_count >= outline_count);
 }
-
-
-/*************************************************************************
- * insert_rej_cblobs()
- * Put rejected word blobs back into the outword.
- * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER
- * OF ELEMENTS.
- *************************************************************************/
-namespace tesseract {
-void Tesseract::insert_rej_cblobs(WERD_RES *word) {
-  PBLOB_IT blob_it;              //blob iterator
-  PBLOB_IT rej_blob_it;
-  const STRING *word_str;
-  const STRING *word_lengths;
-  int old_len;
-  int rej_len;
-  char new_str[512 * UNICHAR_LEN];
-  char new_lengths[512];
-  REJMAP new_map;
-  int i = 0;                     //new_str index
-  int j = 0;                     //old_str index
-  int i_offset = 0;              //new_str offset
-  int j_offset = 0;              //old_str offset
-  int new_len;
-
-  gblob_sort_list (word->outword->rej_blob_list (), TRUE);
-  rej_blob_it.set_to_list (word->outword->rej_blob_list ());
-  if (rej_blob_it.empty ())
-    return;
-  rej_len = rej_blob_it.length ();
-  blob_it.set_to_list (word->outword->blob_list ());
-  word_str = &(word->best_choice->unichar_string());
-  word_lengths = &(word->best_choice->unichar_lengths());
-  old_len = word->best_choice->length();
-  ASSERT_HOST (word->reject_map.length () == old_len);
-  ASSERT_HOST (blob_it.length () == old_len);
-  if ((old_len + rej_len) > 511)
-    return;                      //Word is garbage anyway prevent abort
-  new_map.initialise (old_len + rej_len);
-
-  while (!rej_blob_it.empty ()) {
-    if ((j >= old_len) ||
-      (rej_blob_it.data ()->bounding_box ().left () <=
-    blob_it.data ()->bounding_box ().left ())) {
-      /* Insert reject blob */
-      if (j >= old_len)
-        blob_it.add_to_end (rej_blob_it.extract ());
-      else
-        blob_it.add_before_stay_put (rej_blob_it.extract ());
-      if (!rej_blob_it.empty ())
-        rej_blob_it.forward ();
-      new_str[i_offset] = ' ';
-      new_lengths[i] = 1;
-      new_map[i].setrej_rej_cblob ();
-      i_offset += new_lengths[i++];
-    }
-    else {
-      strncpy(new_str + i_offset, &(*word_str)[j_offset],
-              (*word_lengths)[j]);
-      new_lengths[i] = (*word_lengths)[j];
-      new_map[i] = word->reject_map[j];
-      i_offset += new_lengths[i++];
-      j_offset += (*word_lengths)[j++];
-      blob_it.forward ();
-    }
-  }
-  /* Add any extra normal blobs to strings */
-  while (j < word_lengths->length ()) {
-    strncpy(new_str + i_offset, &(*word_str)[j_offset],
-            (*word_lengths)[j]);
-    new_lengths[i] = (*word_lengths)[j];
-    new_map[i] = word->reject_map[j];
-    i_offset += new_lengths[i++];
-    j_offset += (*word_lengths)[j++];
-  }
-  new_str[i_offset] = '\0';
-  new_lengths[i] = 0;
-  /*
-    tprintf(
-          "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
-          old_len, i, new_str, new_map );
-  */
-  ASSERT_HOST (i == blob_it.length ());
-  ASSERT_HOST (i == old_len + rej_len);
-  word->reject_map = new_map;
-
-  // Update word->best_choice if needed.
-  if (strcmp(new_str, word->best_choice->unichar_string().string()) != 0 ||
-      strcmp(new_lengths, word->best_choice->unichar_lengths().string()) != 0) {
-    WERD_CHOICE *new_choice =
-      new WERD_CHOICE(new_str, new_lengths,
-                      word->best_choice->rating(),
-                      word->best_choice->certainty(),
-                      word->best_choice->permuter(),
-                      getDict().getUnicharset());
-   new_choice->populate_unichars(getDict().getUnicharset());
-   delete word->best_choice;
-   word->best_choice = new_choice;
-  }
-  new_len = word->best_choice->length();
-  ASSERT_HOST (word->reject_map.length () == new_len);
-  ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
-
-}
 }  // namespace tesseract
--- a/ccmain/docqual.h
+++ b/ccmain/docqual.h
@ -31,108 +31,6 @@ enum GARBAGE_LEVEL
  G_TERRIBLE
 };

-extern STRING_VAR_H (outlines_odd, "%| ", "Non standard number of outlines");
-extern STRING_VAR_H (outlines_2, "ij!?%\":;",
-"Non standard number of outlines");
-extern BOOL_VAR_H (docqual_excuse_outline_errs, FALSE,
-"Allow outline errs in unrejection?");
-extern BOOL_VAR_H (tessedit_good_quality_unrej, TRUE,
-"Reduce rejection on good docs");
-extern BOOL_VAR_H (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
-extern double_VAR_H (tessedit_reject_doc_percent, 65.00,
-"%rej allowed before rej whole doc");
-extern double_VAR_H (tessedit_reject_block_percent, 45.00,
-"%rej allowed before rej whole block");
-extern double_VAR_H (tessedit_reject_row_percent, 40.00,
-"%rej allowed before rej whole row");
-extern double_VAR_H (tessedit_whole_wd_rej_row_percent, 70.00,
-"%of row rejects in whole word rejects which prevents whole row rejection");
-extern BOOL_VAR_H (tessedit_preserve_blk_rej_perfect_wds, TRUE,
-"Only rej partially rejected words in block rejection");
-extern BOOL_VAR_H (tessedit_preserve_row_rej_perfect_wds, TRUE,
-"Only rej partially rejected words in row rejection");
-extern BOOL_VAR_H (tessedit_dont_blkrej_good_wds, FALSE,
-"Use word segmentation quality metric");
-extern BOOL_VAR_H (tessedit_dont_rowrej_good_wds, FALSE,
-"Use word segmentation quality metric");
-extern INT_VAR_H (tessedit_preserve_min_wd_len, 2,
-"Only preserve wds longer than this");
-extern BOOL_VAR_H (tessedit_row_rej_good_docs, TRUE,
-"Apply row rejection to good docs");
-extern double_VAR_H (tessedit_good_doc_still_rowrej_wd, 1.1,
-"rej good doc wd if more than this fraction rejected");
-extern BOOL_VAR_H (tessedit_reject_bad_qual_wds, TRUE,
-"Reject all bad quality wds");
-extern BOOL_VAR_H (tessedit_debug_doc_rejection, FALSE, "Page stats");
-extern BOOL_VAR_H (tessedit_debug_quality_metrics, FALSE,
-"Output data to debug file");
-extern BOOL_VAR_H (bland_unrej, FALSE, "unrej potential with no chekcs");
-extern double_VAR_H (quality_rowrej_pc, 1.1,
-"good_quality_doc gte good char limit");
-extern BOOL_VAR_H (unlv_tilde_crunching, TRUE,
-"Mark v.bad words for tilde crunch");
-extern BOOL_VAR_H (crunch_early_merge_tess_fails, TRUE,
-"Before word crunch?");
-extern BOOL_VAR_H (crunch_early_convert_bad_unlv_chs, FALSE,
-"Take out ~^ early?");
-extern double_VAR_H (crunch_terrible_rating, 80.0, "crunch rating lt this");
-extern BOOL_VAR_H (crunch_terrible_garbage, TRUE, "As it says");
-extern double_VAR_H (crunch_poor_garbage_cert, -9.0,
-"crunch garbage cert lt this");
-extern double_VAR_H (crunch_poor_garbage_rate, 60,
-"crunch garbage rating lt this");
-extern double_VAR_H (crunch_pot_poor_rate, 40,
-"POTENTIAL crunch rating lt this");
-extern double_VAR_H (crunch_pot_poor_cert, -8.0,
-"POTENTIAL crunch cert lt this");
-extern BOOL_VAR_H (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
-extern double_VAR_H (crunch_del_rating, 60,
-"POTENTIAL crunch rating lt this");
-extern double_VAR_H (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
-extern double_VAR_H (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
-extern double_VAR_H (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
-extern double_VAR_H (crunch_del_min_width, 3.0,
-"Del if word width lt xht x this");
-extern double_VAR_H (crunch_del_high_word, 1.5,
-"Del if word gt xht x this above bl");
-extern double_VAR_H (crunch_del_low_word, 0.5,
-"Del if word gt xht x this below bl");
-extern double_VAR_H (crunch_small_outlines_size, 0.6,
-"Small if lt xht x this");
-extern INT_VAR_H (crunch_rating_max, 10, "For adj length in rating per ch");
-extern INT_VAR_H (crunch_pot_indicators, 1,
-"How many potential indicators needed");
-extern BOOL_VAR_H (crunch_leave_ok_strings, TRUE,
-"Dont touch sensible strings");
-extern BOOL_VAR_H (crunch_accept_ok, TRUE, "Use acceptability in okstring");
-extern BOOL_VAR_H (crunch_leave_accept_strings, FALSE,
-"Dont pot crunch sensible strings");
-extern BOOL_VAR_H (crunch_include_numerals, FALSE, "Fiddle alpha figures");
-extern INT_VAR_H (crunch_leave_lc_strings, 4,
-"Dont crunch words with long lower case strings");
-extern INT_VAR_H (crunch_leave_uc_strings, 4,
-"Dont crunch words with long lower case strings");
-extern INT_VAR_H (crunch_long_repetitions, 3,
-"Crunch words with long repetitions");
-extern INT_VAR_H (crunch_debug, 0, "As it says");
-inT16 word_blob_quality(  //Blob seg changes
-                        WERD_RES *word,
-                        ROW *row);
-//BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2);
-inT16 word_outline_errs(  //Outline count errs
-                        WERD_RES *word);
-void word_char_quality(  //Blob seg changes
-                       WERD_RES *word,
-                       ROW *row,
-                       inT16 *match_count,
-                       inT16 *accepted_match_count);
-//void unrej_good_chs(WERD_RES *word, ROW *row);
-void print_boxes(WERD *word);
-inT16 count_outline_errs(char c, inT16 outline_count);
+inT16 word_blob_quality(WERD_RES *word, ROW *row);
 void reject_whole_page(PAGE_RES_IT &page_res_it);
-BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
-                                 //word to do
-CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
-inT16 failure_count(WERD_RES *word);
-BOOL8 noise_outlines(WERD *word);
 #endif
--- a/ccmain/expandblob.cpp
+++ b/ccmain/expandblob.cpp
@ -1,82 +0,0 @@
-/**************************************************************************
- * Revision 5.1  89/07/27  11:46:53  11:46:53  ray ()
- * (C) Copyright 1989, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
-**************************************************************************/
-#include "mfcpch.h"
-#include "expandblob.h"
-#include "tessclas.h"
-#include "const.h"
-#include "structures.h"
-#include "freelist.h"
-
-/***********************************************************************
-free_blob(blob) frees the blob and everything it is connected to,
-i.e. outlines, nodes, edgepts, bytevecs, ratings etc
-*************************************************************************/
-void free_blob(  /*blob to free */
-               register TBLOB *blob) {
-  if (blob == NULL)
-    return;                      /*duff blob */
-  free_tree (blob->outlines);    /*do the tree of outlines */
-  oldblob(blob);  /*free the actual blob */
-}
-
-
-/***************************************************************************
-free_tree(outline) frees the current outline
-and then its sub-tree
-*****************************************************************************/
-void free_tree(  /*outline to draw */
-               register TESSLINE *outline) {
-  if (outline == NULL)
-    return;                      /*duff outline */
-  if (outline->next != NULL)
-    free_tree (outline->next);
-  if (outline->child != NULL)
-    free_tree (outline->child);  /*and sub-tree */
-  free_outline(outline);  /*free the outline */
-}
-
-
-/*******************************************************************************
-free_outline(outline) frees an outline and anything connected to it
-*********************************************************************************/
-void free_outline(  /*outline to free */
-                  register TESSLINE *outline) {
-  if (outline->compactloop != NULL)
-                                 /*no compact loop */
-      memfree (outline->compactloop);
-
-  if (outline->loop != NULL)
-    free_loop (outline->loop);
-
-  oldoutline(outline);
-}
-
-
-/*********************************************************************************
-free_loop(startpt) frees all the elements of the closed loop
-starting at startpt
-***********************************************************************************/
-void free_loop(  /*outline to free */
-               register EDGEPT *startpt) {
-  register EDGEPT *edgept;       /*current point */
-
-  if (startpt == NULL)
-    return;
-  edgept = startpt;
-  do {
-    edgept = oldedgept (edgept); /*free it and move on */
-  }
-  while (edgept != startpt);
-}
--- a/ccmain/expandblob.h
+++ b/ccmain/expandblob.h
@ -1,13 +0,0 @@
-#ifndef EXPANDBLOB_H
-#define EXPANDBLOB_H
-
-#include "tessclas.h"
-
-void free_blob(register TBLOB *blob); 
-
-void free_tree(register TESSLINE *outline); 
-
-void free_outline(register TESSLINE *outline); 
-
-void free_loop(register EDGEPT *startpt); 
-#endif
--- a/ccmain/fixspace.cpp
+++ b/ccmain/fixspace.cpp
--- a/ccmain/fixspace.h
+++ b/ccmain/fixspace.h
@ -23,37 +23,10 @@
 #define           FIXSPACE_H

 #include          "pageres.h"
-#include          "varable.h"
-#include          "ocrclass.h"
+#include          "params.h"
 #include          "notdll.h"

-extern BOOL_VAR_H (fixsp_check_for_fp_noise_space, TRUE,
-"Try turning noise to space in fixed pitch");
-extern BOOL_VAR_H (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
-extern BOOL_VAR_H (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
-extern INT_VAR_H (fixsp_non_noise_limit, 1,
-"How many non-noise blbs either side?");
-extern double_VAR_H (fixsp_small_outlines_size, 0.28,
-"Small if lt xht x this");
-extern BOOL_VAR_H (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
-extern BOOL_VAR_H (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
-extern BOOL_VAR_H (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
-extern BOOL_VAR_H (tessedit_test_uniform_wd_spacing, FALSE,
-"Limit context word spacing");
-extern BOOL_VAR_H (tessedit_prefer_joined_punct, FALSE,
-"Reward punctation joins");
-extern INT_VAR_H (fixsp_done_mode, 1, "What constitutes done for spacing");
-extern INT_VAR_H (debug_fix_space_level, 0, "Contextual fixspace debug");
-extern STRING_VAR_H (numeric_punctuation, ".,",
-"Punct. chs expected WITHIN numbers");
 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list);
 void transform_to_next_perm(WERD_RES_LIST &words);
-void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved);
-BOOL8 uniformly_spaced(  //sensible word
-                       WERD_RES *word);
-BOOL8 fixspace_thinks_word_done(WERD_RES *word);
-void break_noisiest_blob_word(WERD_RES_LIST &words);
-inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
-float blob_noise_score(PBLOB *blob);
 void fixspace_dbg(WERD_RES *word);
 #endif
--- a/ccmain/fixxht.cpp
+++ b/ccmain/fixxht.cpp
@ -17,816 +17,150 @@
 *
 **********************************************************************/

-#ifdef _MSC_VER
-#pragma warning(disable:4244)  // Conversion warnings
-#endif
-
 #include "mfcpch.h"
 #include          <string.h>
 #include          <ctype.h>
-#include          "varable.h"
-#include          "tessvars.h"
-#include          "control.h"
-#include          "reject.h"
-#include          "fixxht.h"
-#include          "secname.h"
+#include          "params.h"
+#include          "float2int.h"
 #include          "tesseractclass.h"

-#define EXTERN
-
-EXTERN double_VAR (x_ht_fraction_of_caps_ht, 0.7,
-"Fract of cps ht est of xht");
-EXTERN double_VAR (x_ht_variation, 0.35,
-"Err band as fract of caps/xht dist");
-EXTERN double_VAR (x_ht_sub_variation, 0.5,
-"Err band as fract of caps/xht dist");
-EXTERN BOOL_VAR (rej_trial_ambigs, TRUE,
-"reject x-ht ambigs when under trial");
-EXTERN BOOL_VAR (x_ht_conservative_ambigs, FALSE,
-"Dont rely on ambigs + maxht");
-EXTERN BOOL_VAR (x_ht_check_est, TRUE, "Cross check estimates");
-EXTERN BOOL_VAR (x_ht_case_flip, FALSE, "Flip or reject suspect case");
-EXTERN BOOL_VAR (x_ht_include_dodgy_blobs, TRUE,
-"Include blobs with possible noise?");
-EXTERN BOOL_VAR (x_ht_limit_flip_trials, TRUE,
-"Dont do trial flips when ambigs are close to xht?");
-EXTERN BOOL_VAR (rej_use_check_block_occ, TRUE,
-"Analyse rejection behaviour");
-
-EXTERN STRING_VAR (chs_non_ambig_caps_ht,
-"!#$%&()/12346789?ABDEFGHIKLNQRT[]\\bdfhkl",
-"Reliable ascenders");
-EXTERN STRING_VAR (chs_x_ht, "acegmnopqrsuvwxyz", "X height chars");
-EXTERN STRING_VAR (chs_non_ambig_x_ht, "aenqr", "reliable X height chars");
-EXTERN STRING_VAR (chs_ambig_caps_x, "cCmMoO05sSuUvVwWxXzZ",
-"X ht or caps ht chars");
-EXTERN STRING_VAR (chs_bl_ambig_caps_x, "pPyY", " Caps or descender ambigs");
-
-/* The following arent used in this module but are used in applybox.c */
-EXTERN STRING_VAR (chs_caps_ht,
-"!#$%&()/0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]\\bdfhkl{|}",
-"Ascender chars");
-EXTERN STRING_VAR (chs_desc, "gjpqy", "Descender chars");
-EXTERN STRING_VAR (chs_non_ambig_bl,
-"!#$%&01246789?ABCDEFGHIKLMNORSTUVWXYZabcdehiklmnorstuvwxz",
-"Reliable baseline chars");
-EXTERN STRING_VAR (chs_odd_top, "ijt", "Chars with funny ascender region");
-EXTERN STRING_VAR (chs_odd_bot, "()35JQ[]\\/{}|", "Chars with funny base");
-
-/* The following arent used but are defined for completeness */
-EXTERN STRING_VAR (chs_bl,
-"!#$%&()/01246789?ABCDEFGHIJKLMNOPRSTUVWXYZ[]\\abcdefhiklmnorstuvwxz{}",
-"Baseline chars");
-EXTERN STRING_VAR (chs_non_ambig_desc, "gq", "Reliable descender chars");
-
-/**
- * re_estimate_x_ht()
- *
- * Walk the blobs in the word together with the text string and reject map.
- * NOTE: All evaluation is done on the baseline normalised word. This is so that
- * the TBOX class can be used (integer). The reasons for this are:
- *   a) We must use the outword - ie the Tess result
- *   b) The outword is always converted to integer representation as that is how
- *      Tess works
- *   c) We would like to use the TBOX class, cos its there - this is integer
- *      precision.
- *   d) If we de-normed the outword we would get rounding errors and would find
- *      that integers are too imprecise (x-height around 15 pixels instead of a
- *      scale of 128 in bln form.
- *   CONVINCED?
- *
- * A) Try to re-estimatate x-ht and caps ht from confirmed pts in word.
- *
- * @verbatim
-     FOR each non reject blob
-        IF char is baseline posn ambiguous
- 			Remove ambiguity by comparing its posn with respect to baseline.
- 		IF char is a confirmed x-ht char
- 			Add x-ht posn to confirmed_x_ht pts for word
-     IF char is a confirmed caps-ht char
- 			Add blob_ht to caps ht pts for word
- 
-     IF Std Dev of caps hts < 2  (AND # samples > 0)
- 		Use mean as caps ht estimate (Dont use median as we can expect a
- 			fair variation between the heights of the NON_AMBIG_CAPS_HT_CHS)
-     IF Std Dev of caps hts >= 2  (AND # samples > 0)
- 			Suspect small caps font.
- 			Look for 2 clusters,	each with Std Dev < 2.
- 			IF 2 clusters found
- 			Pick the smaller median as the caps ht estimate of the smallcaps.
- 
-     IF failed to estimate a caps ht
-        Use the median caps ht if there is one,
- 		ELSE use the caps ht estimate of the previous word. NO!!!
- 
- 
-     IF there are confirmed x-height chars
- 			Estimate confirmed x-height as the median value
-     ELSE IF there is a confirmed caps ht
- 			Estimate confirmed x-height as a fraction of confirmed caps ht value
- 		ELSE
- 			Use the value for the previous word or the row value if this is the
- 			first word in the block. NO!!!
-   @endverbatim
- *
- * B) Add in case ambiguous blobs based on confirmed x-ht/caps ht, changing case
- *    as necessary. Reestimate caps ht and x-ht as in A, using the extended
- *    clusters.
- *
- * C) If word contains rejects, and x-ht estimate significantly differs from
- *    original estimate, return TRUE so that the word can be rematched
- */
-
-void re_estimate_x_ht(                     //improve for 1 word
-                      WERD_RES *word_res,  //word to do
-                      float *trial_x_ht    //new match value
-                     ) {
-  PBLOB_IT blob_it;
-  inT16 blob_ht_above_baseline;
-
-  const char *word_str;
-  inT16 i;
-  inT16 offset;
-
-  STATS all_blobs_ht (0, 300);   //every blob in word
-  STATS x_ht (0, 300);           //confirmed pts in wd
-  STATS caps_ht (0, 300);        //confirmed pts in wd
-  STATS case_ambig (0, 300);     //lower case ambigs
-
-  inT16 rej_blobs_count = 0;
-  inT16 rej_blobs_max_height = 0;
-  inT32 rej_blobs_max_area = 0;
-  float x_ht_ok_variation;
-  float max_blob_ht;
-  float marginally_above_x_ht;
-
-  TBOX blob_box;                  //blob bounding box
-  float est_x_ht = 0.0;          //word estimate
-  float est_caps_ht = 0.0;       //word estimate
-                                 //based on hard data?
-  BOOL8 est_caps_ht_certain = FALSE;
-  BOOL8 est_x_ht_certain = FALSE;//based on hard data?
-  BOOL8 trial = FALSE;           //Sepeculative values?
-  BOOL8 no_comment = FALSE;      //No change in xht
-  float ambig_lc_x_est;
-  float ambig_uc_caps_est;
-  inT16 x_ht_ambigs = 0;
-  inT16 caps_ht_ambigs = 0;
-
-  /* Calculate default variation of blob x_ht from bln x_ht for bln word */
-  x_ht_ok_variation =
-    (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
-
-  word_str = word_res->best_choice->unichar_string().string();
-  /*
-    Cycle blobs, allocating to one of the stats sets when possible.
-  */
-  blob_it.set_to_list (word_res->outword->blob_list ());
-  for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
-  !blob_it.cycled_list (); blob_it.forward (),
-           offset += word_res->best_choice->unichar_lengths()[i++]) {
-    if (!dodgy_blob (blob_it.data ())) {
-      blob_box = blob_it.data ()->bounding_box ();
-      blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
-      all_blobs_ht.add (blob_ht_above_baseline, 1);
-
-      if (word_res->reject_map[i].rejected ()) {
-        rej_blobs_count++;
-        if (blob_box.height () > rej_blobs_max_height)
-          rej_blobs_max_height = blob_box.height ();
-        if (blob_box.area () > rej_blobs_max_area)
-          rej_blobs_max_area = blob_box.area ();
-      }
-      else {
-        if (STRING (chs_non_ambig_x_ht).contains (word_str[offset]))
-          x_ht.add (blob_ht_above_baseline, 1);
-
-        if (STRING (chs_non_ambig_caps_ht).contains (word_str[offset]))
-          caps_ht.add (blob_ht_above_baseline, 1);
-
-        if (STRING (chs_ambig_caps_x).contains (word_str[offset])) {
-          case_ambig.add (blob_ht_above_baseline, 1);
-          if (STRING (chs_x_ht).contains (word_str[offset]))
-            x_ht_ambigs++;
-          else
-            caps_ht_ambigs++;
-        }
-
-        if (STRING (chs_bl_ambig_caps_x).contains (word_str[offset])) {
-          if (STRING (chs_x_ht).contains (word_str[offset])) {
-            /* confirm x_height provided > 15% total height below baseline */
-            if ((bln_baseline_offset - blob_box.bottom ()) /
-              (float) blob_box.height () > 0.15)
-              x_ht.add (blob_ht_above_baseline, 1);
-          }
-          else {
-            /* confirm caps_height provided < 5% total height below baseline */
-            if ((bln_baseline_offset - blob_box.bottom ()) /
-              (float) blob_box.height () < 0.05)
-              caps_ht.add (blob_ht_above_baseline, 1);
-          }
-        }
-      }
-    }
-  }
-  est_caps_ht = estimate_from_stats (caps_ht);
-  est_x_ht = estimate_from_stats (x_ht);
-  est_ambigs(word_res, case_ambig, &ambig_lc_x_est, &ambig_uc_caps_est);
-  max_blob_ht = all_blobs_ht.ile (0.9999);
-
-  #ifndef SECURE_NAMES
-  if (debug_x_ht_level >= 20) {
-    tprintf ("Mode20:A: %s ", word_str);
-    word_res->reject_map.print (debug_fp);
-    tprintf (" XHT:%f CAP:%f MAX:%f AMBIG X:%f CAP:%f\n",
-      est_x_ht, est_caps_ht, max_blob_ht,
-      ambig_lc_x_est, ambig_uc_caps_est);
-  }
-  #endif
-  if (!x_ht_conservative_ambigs &&
-    (ambig_lc_x_est > 0) &&
-    (ambig_lc_x_est == ambig_uc_caps_est) &&
-  (max_blob_ht > ambig_lc_x_est + x_ht_ok_variation)) {
-                                 //may be zero but believe xht
-    ambig_uc_caps_est = est_caps_ht;
-    #ifndef SECURE_NAMES
-    if (debug_x_ht_level >= 20)
-      tprintf ("Mode20:B: Fiddle ambig_uc_caps_est to %f\n",
-        ambig_lc_x_est);
-    #endif
-  }
-
-  /* Now make some estimates */
-
-  if ((est_x_ht > 0) || (est_caps_ht > 0) ||
-      ((ambig_lc_x_est > 0) && (ambig_lc_x_est != ambig_uc_caps_est))) {
-    /* There is some sensible data to go on so make the most of it. */
-    if (debug_x_ht_level >= 20)
-      tprintf ("Mode20:C: Sensible Data\n", ambig_lc_x_est);
-    if (est_x_ht > 0) {
-      est_x_ht_certain = TRUE;
-      if (est_caps_ht == 0) {
-        if ((ambig_uc_caps_est > ambig_lc_x_est) &&
-            (ambig_uc_caps_est > est_x_ht + x_ht_ok_variation))
-          est_caps_ht = ambig_uc_caps_est;
-        else
-          est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
-      }
-      if (case_ambig.get_total () > 0)
-        improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht);
-      est_caps_ht_certain = caps_ht.get_total () > 0;
-      #ifndef SECURE_NAMES
-      if (debug_x_ht_level >= 20)
-        tprintf ("Mode20:D: Est from xht XHT:%f CAP:%f\n",
-          est_x_ht, est_caps_ht);
-      #endif
-    }
-    else if (est_caps_ht > 0) {
-      est_caps_ht_certain = TRUE;
-      if ((ambig_lc_x_est > 0) &&
-        (ambig_lc_x_est < est_caps_ht - x_ht_ok_variation))
-        est_x_ht = ambig_lc_x_est;
-      else
-        est_x_ht = est_caps_ht * x_ht_fraction_of_caps_ht;
-      if (ambig_lc_x_est + ambig_uc_caps_est > 0)
-        improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht);
-      est_x_ht_certain = x_ht.get_total () > 0;
-      #ifndef SECURE_NAMES
-      if (debug_x_ht_level >= 20)
-        tprintf ("Mode20:E: Est from caps XHT:%f CAP:%f\n",
-          est_x_ht, est_caps_ht);
-      #endif
-    }
-    else {
-      /* Do something based on case ambig chars alone - we have guessed that the
-        ambigs are lower case. */
-      est_x_ht = ambig_lc_x_est;
-      est_x_ht_certain = TRUE;
-      if (ambig_uc_caps_est > ambig_lc_x_est) {
-        est_caps_ht = ambig_uc_caps_est;
-        est_caps_ht_certain = TRUE;
-      }
-      else
-        est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
-
-      #ifndef SECURE_NAMES
-      if (debug_x_ht_level >= 20)
-        tprintf ("Mode20:F: Est from ambigs XHT:%f CAP:%f\n",
-          est_x_ht, est_caps_ht);
-      #endif
-    }
-    /* Check for sane interpretation of evidence:
-      Try shifting caps ht if min certain caps ht is not significantly greater
-      than the estimated x ht or the max certain x ht is not significantly less
-      than the estimated caps ht. */
-    if (x_ht_check_est) {
-      if ((caps_ht.get_total () > 0) &&
-      (est_x_ht + x_ht_ok_variation >= caps_ht.ile (0.0001))) {
-        trial = TRUE;
-        est_caps_ht = est_x_ht;
-        est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
-
-        #ifndef SECURE_NAMES
-        if (debug_x_ht_level >= 20)
-          tprintf ("Mode20:G: Trial XHT:%f CAP:%f\n",
-            est_x_ht, est_caps_ht);
-        #endif
-      }
-      else if ((x_ht.get_total () > 0) &&
-      (est_caps_ht - x_ht_ok_variation <= x_ht.ile (0.9999))) {
-        trial = TRUE;
-        est_x_ht = est_caps_ht;
-        est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
-        #ifndef SECURE_NAMES
-        if (debug_x_ht_level >= 20)
-          tprintf ("Mode20:H: Trial XHT:%f CAP:%f\n",
-            est_x_ht, est_caps_ht);
-        #endif
-      }
-    }
-  }
-
-  else {
-    /* There is no sensible data so we're in the dark. */
-
-    marginally_above_x_ht = bln_x_height +
-      x_ht_ok_variation * x_ht_sub_variation;
-    /*
-      If there are no rejects, or the only rejects have a narrow height, or have
-      a small area compared to a normal char, then estimate the x-height as the
-      original one. (I.e dont fiddle about if the only rejects look like
-      punctuation) - we use max height as mean or median will be too low if
-      there are only two blobs - Eg "F."
-    */
-
-    if (debug_x_ht_level >= 20)
-      tprintf ("Mode20:I: In the dark\n");
-
-    if ((rej_blobs_count == 0) ||
-      (rej_blobs_max_height < 0.3 * max_blob_ht) ||
-    (rej_blobs_max_area < 0.3 * max_blob_ht * max_blob_ht)) {
-      no_comment = TRUE;
-      if (debug_x_ht_level >= 20)
-        tprintf ("Mode20:J: No comment due to no rejects\n");
-    }
-    else if (x_ht_limit_flip_trials &&
-             ((max_blob_ht < marginally_above_x_ht) ||
-             ((ambig_lc_x_est > 0) &&
-             (ambig_lc_x_est == ambig_uc_caps_est) &&
-             (ambig_lc_x_est < marginally_above_x_ht)))) {
-      no_comment = TRUE;
-      if (debug_x_ht_level >= 20)
-        tprintf ("Mode20:K: No comment as close to xht %f < %f\n",
-          ambig_lc_x_est, marginally_above_x_ht);
-    }
-    else if (x_ht_conservative_ambigs && (ambig_uc_caps_est > 0)) {
-      trial = TRUE;
-      est_caps_ht = ambig_lc_x_est;
-      est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
-
-      #ifndef SECURE_NAMES
-      if (debug_x_ht_level >= 20)
-        tprintf ("Mode20:L: Trial XHT:%f CAP:%f\n",
-          est_x_ht, est_caps_ht);
-      #endif
-    }
-    /*
-      If the top of the word is nowhere near where we expect ascenders to be
-      (less than half the x_ht -> caps_ht distance) - suspect an all caps word
-      at the x-ht. Estimate x-ht accordingly - but only as a TRIAL!
-      NOTE we do NOT check location of baseline. Commas can descend as much as
-      real descenders so we would need to do something to make sure that any
-      disqualifying descenders were not at the end.
-    */
-    else {
-      if (max_blob_ht <
-          (bln_x_height + bln_x_height / x_ht_fraction_of_caps_ht) / 2.0) {
-        trial = TRUE;
-        est_x_ht = x_ht_fraction_of_caps_ht * max_blob_ht;
-        est_caps_ht = max_blob_ht;
-
-        #ifndef SECURE_NAMES
-        if (debug_x_ht_level >= 20)
-          tprintf ("Mode20:M: Trial XHT:%f CAP:%f\n",
-            est_x_ht, est_caps_ht);
-        #endif
-      }
-      else {
-        no_comment = TRUE;
-        if (debug_x_ht_level >= 20)
-          tprintf ("Mode20:N: No comment as nothing else matched\n");
-      }
-    }
-  }
-
-  /* Sanity check - reject word if fails */
-
-  if (!no_comment &&
-      ((est_x_ht > 2 * bln_x_height) ||
-       (est_x_ht / word_res->denorm.scale () <= min_sane_x_ht_pixels) ||
-       (est_caps_ht <= est_x_ht) || (est_caps_ht >= 2.5 * est_x_ht))) {
-    no_comment = TRUE;
-    if (!trial && rej_use_xht) {
-      if (debug_x_ht_level >= 2) {
-        tprintf ("Sanity check rejecting %s ", word_str);
-        word_res->reject_map.print (debug_fp);
-        tprintf ("\n");
-      }
-      word_res->reject_map.rej_word_xht_fixup ();
-
-    }
-    if (debug_x_ht_level >= 20)
-      tprintf ("Mode20:O: No comment as nothing else matched\n");
-  }
-
-  if (no_comment || trial) {
-    word_res->x_height = bln_x_height / word_res->denorm.scale ();
-    word_res->guessed_x_ht = TRUE;
-    word_res->caps_height = (bln_x_height / x_ht_fraction_of_caps_ht) /
-      word_res->denorm.scale ();
-    word_res->guessed_caps_ht = TRUE;
-    /*
-    Reject ambigs in the current word if we are uncertain and:
-        there are rejects OR
-        there is only one char which is an ambig OR
-        there is conflict between the case of the ambigs even though there is
-        no height separation Eg "Ms" recognised from "MS"
-    */
-    if (rej_trial_ambigs &&
-      ((word_res->reject_map.reject_count () > 0) ||
-      (word_res->reject_map.length () == 1) ||
-    ((x_ht_ambigs > 0) && (caps_ht_ambigs > 0)))) {
-      #ifndef SECURE_NAMES
-      if (debug_x_ht_level >= 2) {
-        tprintf ("TRIAL Rej Ambigs %s ", word_str);
-        word_res->reject_map.print (debug_fp);
-      }
-      #endif
-      reject_ambigs(word_res);
-      if (debug_x_ht_level >= 2) {
-        tprintf (" ");
-        word_res->reject_map.print (debug_fp);
-        tprintf ("\n");
-      }
-    }
-  }
-  else {
-    word_res->x_height = est_x_ht / word_res->denorm.scale ();
-    word_res->guessed_x_ht = !est_x_ht_certain;
-    word_res->caps_height = est_caps_ht / word_res->denorm.scale ();
-    word_res->guessed_caps_ht = !est_caps_ht_certain;
-  }
-
-  if (!no_comment && (fabs (est_x_ht - bln_x_height) > x_ht_ok_variation))
-    *trial_x_ht = est_x_ht / word_res->denorm.scale ();
-  else
-    *trial_x_ht = 0.0;
-
-  #ifndef SECURE_NAMES
-  if (((*trial_x_ht > 0) && (debug_x_ht_level >= 3)) ||
-      (debug_x_ht_level >= 5)) {
-    tprintf ("%s ", word_str);
-    word_res->reject_map.print (debug_fp);
-    tprintf
-      (" X:%0.2f Cps:%0.2f Mxht:%0.2f RJ MxHt:%d MxAr:%d Rematch:%c\n",
-      est_x_ht, est_caps_ht, max_blob_ht, rej_blobs_max_height,
-      rej_blobs_max_area, *trial_x_ht > 0 ? '*' : ' ');
-  }
-  #endif
-
-}
-
-
 namespace tesseract {
-/**
- * check_block_occ()
- * Checks word for coarse block occupancy, rejecting more chars and flipping
- * case of case ambiguous chars as required.
- */
-void Tesseract::check_block_occ(WERD_RES *word_res) {
-  PBLOB_IT blob_it;
-  STRING new_string;
-  STRING new_string_lengths(word_res->best_choice->unichar_lengths());
-  REJMAP new_map = word_res->reject_map;
-  WERD_CHOICE *new_choice;

-  const char *word_str = word_res->best_choice->unichar_string().string();
-  inT16 i;
-  inT16 offset;
-  inT16 reject_count = 0;
-  char confirmed_char[UNICHAR_LEN + 1];
-  char temp_char[UNICHAR_LEN + 1];
-  float x_ht;
-  float caps_ht;
+// Fixxht overview.
+// Premise: Initial estimate of x-height is adequate most of the time, but
+// occasionally it is incorrect. Most notable causes of failure are:
+// 1. Small caps, where the top of the caps is the same as the body text
+// xheight. For small caps words the xheight needs to be reduced to correctly
+// recognize the caps in the small caps word.
+// 2. All-xheight lines, such as "summer". Here the initial estimate will have
+// guessed that the blob tops are caps and will have placed the xheight too low.
+// 3. Noise/logos beside words, or changes in font size on a line. Such
+// things can blow the statistics and cause an incorrect estimate.
+//
+// Algorithm.
+// Compare the vertical position (top only) of alphanumerics in a word with
+// the range of positions in training data (in the unicharset).
+// See CountMisfitTops. If any characters disagree sufficiently with the
+// initial xheight estimate, then recalculate the xheight, re-run OCR on
+// the word, and if the number of vertical misfits goes down, along with
+// either the word rating or certainty, then keep the new xheight.
+// The new xheight is calculated as follows (see ComputeCompatibleXHeight):
+// For each alphanumeric character that has a vertically misplaced top
+// (a misfit), yet its bottom is within the acceptable range (i.e. it is not
+// likely a sub- or super-script), calculate the range of acceptable xheight
+// positions from its range of tops, and give each value in the range a
+// number of votes equal to the distance of its top from its acceptance range.
+// The x-height position with the median of the votes becomes the new
+// x-height. This assumes that most characters will be correctly recognized
+// even if the x-height is incorrect. This is not a terrible assumption, but
+// it is not great. An improvement would be to use a classifier that does
+// not care about vertical position or scaling at all.

-  new_string_lengths[0] = 0;
+// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
+// then the char top cannot be used to judge misfits or suggest a new top.
+const int kMaxCharTopRange = 48;

-  if (word_res->x_height > 0)
-    x_ht = word_res->x_height * word_res->denorm.scale ();
-  else
-    x_ht = bln_x_height;
-
-  if (word_res->caps_height > 0)
-    caps_ht = word_res->caps_height * word_res->denorm.scale ();
-  else
-    caps_ht = x_ht / x_ht_fraction_of_caps_ht;
-
-  blob_it.set_to_list (word_res->outword->blob_list ());
-
-  for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
-  !blob_it.cycled_list (); blob_it.forward (),
-           offset += word_res->best_choice->unichar_lengths()[i++]) {
-    strncpy(temp_char, word_str + offset,
-            word_res->best_choice->unichar_lengths()[i]); //default copy
-    temp_char[word_res->best_choice->unichar_lengths()[i]] = '\0';
-    if (word_res->reject_map[i].accepted ()) {
-      check_blob_occ (temp_char,
-                      blob_it.data ()->bounding_box ().
-                      top () - bln_baseline_offset, x_ht,
-                      caps_ht, confirmed_char);
-
-      if (strcmp(confirmed_char, "") == 0) {
-        if (rej_use_check_block_occ) {
-          new_map[i].setrej_xht_fixup ();
-          reject_count++;
-        }
+// Returns the number of alphanumeric blobs in this word whose top is a misfit.
+int Tesseract::CountMisfitTops(WERD_RES *word_res) {
+  int bad_blobs = 0;
+  TBLOB* blob = word_res->rebuild_word->blobs;
+  int blob_id = 0;  // Index of the blob's unichar in best_choice.
+  for (; blob != NULL; blob = blob->next, ++blob_id) {
+    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
+      int top = blob->bounding_box().top();
+      if (top >= INT_FEAT_RANGE)
+        top = INT_FEAT_RANGE - 1;  // Clip top to at most INT_FEAT_RANGE - 1.
+      int min_bottom, max_bottom, min_top, max_top;  // Only tops are used here.
+      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
+                                &min_top, &max_top);
+      if (max_top - min_top > kMaxCharTopRange)
+        continue;  // Top range too wide to judge a misfit.
+      bool bad =  top < min_top - x_ht_acceptance_tolerance ||
+                  top > max_top + x_ht_acceptance_tolerance;
+      if (bad)  // Top is outside the tolerance-widened [min_top, max_top].
+        ++bad_blobs;
+      if (debug_x_ht_level >= 1) {
+        tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
+                unicharset.id_to_unichar(class_id),
+                bad ? "Misfit" : "OK", top, min_top, max_top,
+                static_cast<int>(x_ht_acceptance_tolerance));
      }
-      else
-        strcpy(temp_char, confirmed_char);
    }
-    new_string += temp_char;
-    new_string_lengths[i] = strlen(temp_char);
-    new_string_lengths[i + 1] = 0;
-
-  }
-  if ((reject_count > 0) || (new_string != word_str)) {
-    if (debug_x_ht_level >= 2) {
-      tprintf ("Shape Verification: %s ", word_str);
-      word_res->reject_map.print (debug_fp);
-      tprintf (" -> %s ", new_string.string ());
-      new_map.print (debug_fp);
-      tprintf ("\n");
-    }
-    new_choice = new WERD_CHOICE(new_string.string(),
-                                 new_string_lengths.string(),
-                                 word_res->best_choice->rating(),
-                                 word_res->best_choice->certainty(),
-                                 word_res->best_choice->permuter(),
-                                 unicharset);
-    new_choice->populate_unichars(unicharset);
-    delete word_res->best_choice;
-    word_res->best_choice = new_choice;
-    word_res->reject_map = new_map;
  }
+  return bad_blobs;
 }
+
+// Returns a new x-height, in pixel space, from the votes of misfit tops in
+// word_res (algorithm above), or 0.0f if no votes or the change is too small.
+float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
+  STATS top_stats(0, MAX_UINT8);  // Histogram of x-height votes.
+  TBLOB* blob = word_res->rebuild_word->blobs;
+  int blob_id = 0;  // Index of the blob's unichar in best_choice.
+  for (; blob != NULL; blob = blob->next, ++blob_id) {
+    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
+      int top = blob->bounding_box().top();
+      // Clip the top to the limit of normalized feature space.
+      if (top >= INT_FEAT_RANGE)
+        top = INT_FEAT_RANGE - 1;
+      int bottom = blob->bounding_box().bottom();
+      int min_bottom, max_bottom, min_top, max_top;
+      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
+                                &min_top, &max_top);
+      // Chars with a wild top range would mess up the result so ignore them.
+      if (max_top - min_top > kMaxCharTopRange)
+        continue;
+      int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
+                          top - (max_top + x_ht_acceptance_tolerance));
+      int height = top - kBlnBaselineOffset;
+      if (debug_x_ht_level >= 20) {
+        tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
+                unicharset.id_to_unichar(class_id),
+                height, min_bottom, max_bottom, min_top, max_top,
+                bottom, top);
+      }
+      // Use only chars that fit in the expected bottom range, and where
+      // the range of tops is sensibly near the xheight.
+      if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
+          bottom - x_ht_acceptance_tolerance <= max_bottom &&
+          min_top > kBlnBaselineOffset &&
+          max_top - kBlnBaselineOffset >= kBlnXHeight &&
+          misfit_dist > 0) {  // Top lies outside the tolerated range.
+        // Compute the x-height position using proportionality between the
+        // actual height and expected height.
+        int min_xht = DivRounded(height * kBlnXHeight,
+                                 max_top - kBlnBaselineOffset);
+        int max_xht = DivRounded(height * kBlnXHeight,
+                                 min_top - kBlnBaselineOffset);
+        if (debug_x_ht_level >= 20) {
+          tprintf(" xht range min=%d, max=%d\n",
+                  min_xht, max_xht);
+        }
+        // The range of expected heights gets a vote equal to the distance
+        // of the actual top from the expected top.
+        for (int y = min_xht; y <= max_xht; ++y)
+          top_stats.add(y, misfit_dist);
+      } else if (debug_x_ht_level >= 20) {  // This char casts no votes.
+        tprintf(" already OK\n");
+      }
+    }
+  }
+  if (top_stats.get_total() == 0)  // No votes were cast.
+    return 0.0f;
+  // The new xheight is just the median vote, which is then scaled out
+  // of BLN space back to pixel space to get the x-height in pixel space.
+  float new_xht = top_stats.median();
+  if (debug_x_ht_level >= 20) {
+    tprintf("Median xht=%f\n", new_xht);
+    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
+            new_xht, new_xht / word_res->denorm.scale());
+  }
+  // The xheight must change by at least x_ht_min_change to be used.
+  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
+    return new_xht / word_res->denorm.scale();  // BLN -> pixel space.
+  else
+    return 0.0f;  // Change too small to be used.
+}
+
 }  // namespace tesseract
-
-/**
- * check_blob_occ()
- *
- * Checks blob for position relative to position above baseline
- * @return 0 for reject, or (possibly case shifted) confirmed char
- */
-
-void check_blob_occ(char* proposed_char,
-                    inT16 blob_ht_above_baseline,
-                    float x_ht,
-                    float caps_ht,
-                    char* confirmed_char) {
-  BOOL8 blob_definite_x_ht;
-  BOOL8 blob_definite_caps_ht;
-  float acceptable_variation;
-
-  acceptable_variation = (caps_ht - x_ht) * x_ht_variation;
-  /* ??? REJECT if expected descender and nothing significantly below BL */
-
-  /* ??? REJECT if expected ascender and nothing significantly above x-ht */
-
-  /*
-    IF AMBIG_CAPS_X_CHS
-      IF blob is definitely an ascender ( > xht + xht err )AND
-        char is an x-ht char
-      THEN
-        flip case
-      IF blob is defintiely an x-ht ( <= xht + xht err ) AND
-        char is an ascender char
-      THEN
-        flip case
-  */
-  blob_definite_x_ht = blob_ht_above_baseline <= x_ht + acceptable_variation;
-  blob_definite_caps_ht = blob_ht_above_baseline >=
-    caps_ht - acceptable_variation;
-
-  if (STRING (chs_ambig_caps_x).contains (*proposed_char)) {
-    if ((!blob_definite_x_ht && !blob_definite_caps_ht) ||
-        ((strcmp(proposed_char, "0") == 0) && !blob_definite_caps_ht) ||
-        ((strcmp(proposed_char, "o") == 0) && !blob_definite_x_ht)) {
-      strcpy(confirmed_char, "");
-      return;
-    }
-
-    else if (blob_definite_caps_ht &&
-    STRING (chs_x_ht).contains (*proposed_char)) {
-      if (x_ht_case_flip) {
-                                 //flip to upper case
-        proposed_char[0] = (char) toupper (*proposed_char);
-        return;
-      } else {
-        strcpy(confirmed_char, "");
-        return;
-      }
-    }
-
-    else if (blob_definite_x_ht &&
-    !STRING (chs_x_ht).contains (*proposed_char)) {
-      if (x_ht_case_flip) {
-                                 //flip to lower case
-        proposed_char[0] = (char) tolower (*proposed_char);
-      } else {
-        strcpy(confirmed_char, "");
-        return;
-      }
-    }
-  }
-  else
-  if ((STRING (chs_non_ambig_x_ht).contains (*proposed_char)
-    && !blob_definite_x_ht)
-    || (STRING (chs_non_ambig_caps_ht).contains (*proposed_char)
-        && !blob_definite_caps_ht)) {
-    strcpy(confirmed_char, "");
-    return;
-  }
-  strcpy(confirmed_char, proposed_char);
-  return;
-}
-
-
-float estimate_from_stats(STATS &stats) {
-  if (stats.get_total () <= 0)
-    return 0.0;
-  else if (stats.get_total () >= 3)
-    return stats.ile (0.5);      //median
-  else
-    return stats.mean ();
-}
-
-
-void improve_estimate(WERD_RES *word_res,
-                      float &est_x_ht,
-                      float &est_caps_ht,
-                      STATS &x_ht,
-                      STATS &caps_ht) {
-  PBLOB_IT blob_it;
-  inT16 blob_ht_above_baseline;
-
-  const char *word_str;
-  inT16 i;
-  inT16 offset;
-  TBOX blob_box;                  //blob bounding box
-  char confirmed_char[UNICHAR_LEN + 1];
-  char temp_char[UNICHAR_LEN + 1];
-  float new_val;
-
-  /* IMPROVE estimates here - if good estimates, and case ambig chars,
-    rescan blobs to fix case ambig blobs, re-estimate hts  ??? maybe always do
-    it after deciding x-height
-  */
-
-  blob_it.set_to_list (word_res->outword->blob_list ());
-  word_str = word_res->best_choice->unichar_string().string();
-  for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
-       !blob_it.cycled_list (); blob_it.forward (),
-           offset += word_res->best_choice->unichar_lengths()[i++]) {
-    if ((STRING (chs_ambig_caps_x).contains (word_str[offset])) &&
-        (!dodgy_blob (blob_it.data ()))) {
-      blob_box = blob_it.data ()->bounding_box ();
-      blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
-      strncpy(temp_char, word_str + offset,
-              word_res->best_choice->unichar_lengths()[i]);
-      temp_char[word_res->best_choice->unichar_lengths()[i]] = '\0';
-      check_blob_occ (temp_char,
-                      blob_ht_above_baseline,
-                      est_x_ht, est_caps_ht, confirmed_char);
-      if (strcmp(confirmed_char, "") != 0) {
-        if (STRING (chs_x_ht).contains (*confirmed_char))
-          x_ht.add (blob_ht_above_baseline, 1);
-        else
-          caps_ht.add (blob_ht_above_baseline, 1);
-      }
-    }
-  }
-  new_val = estimate_from_stats (x_ht);
-  if (new_val > 0)
-    est_x_ht = new_val;
-  new_val = estimate_from_stats (caps_ht);
-  if (new_val > 0)
-    est_caps_ht = new_val;
-}
-
-
-void reject_ambigs(  //rej any accepted xht ambig chars
-                   WERD_RES *word) {
-  const char *word_str;
-  int i = 0;
-
-  word_str = word->best_choice->unichar_string().string();
-  while (*word_str != '\0') {
-    if (STRING (chs_ambig_caps_x).contains (*word_str))
-      word->reject_map[i].setrej_xht_fixup ();
-    word_str += word->best_choice->unichar_lengths()[i++];
-  }
-}
-
-
-void est_ambigs(                          //xht ambig ht stats
-                WERD_RES *word_res,
-                STATS &stats,
-                float *ambig_lc_x_est,    //xht est
-                float *ambig_uc_caps_est  //caps est
-               ) {
-  float x_ht_ok_variation;
-  STATS short_ambigs (0, 300);
-  STATS tall_ambigs (0, 300);
-  PBLOB_IT blob_it;
-  TBOX blob_box;                  //blob bounding box
-  inT16 blob_ht_above_baseline;
-
-  const char *word_str;
-  inT16 i;
-  inT16 offset;
-  float min;                     //min ambig ch ht
-  float max;                     //max ambig ch ht
-  float short_limit;             // for lower case
-  float tall_limit;              // for upper case
-
-  x_ht_ok_variation =
-    (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
-
-  if (stats.get_total () == 0) {
-    *ambig_lc_x_est = 0;
-    *ambig_uc_caps_est = 0;
-  }
-  else {
-    min = stats.ile (0.0);
-    max = stats.ile (0.99999);
-    if ((max - min) < x_ht_ok_variation) {
-      *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
-      //close enough
-    }
-    else {
-    /* Try reclustering into lower and upper case chars */
-      short_limit = min + (max - min) * x_ht_variation;
-      tall_limit = max - (max - min) * x_ht_variation;
-      word_str = word_res->best_choice->unichar_string().string();
-      blob_it.set_to_list (word_res->outword->blob_list ());
-      for (blob_it.mark_cycle_pt (), i = 0, offset = 0;
-      !blob_it.cycled_list (); blob_it.forward (),
-               offset += word_res->best_choice->unichar_lengths()[i++]) {
-        if (word_res->reject_map[i].accepted () &&
-          STRING (chs_ambig_caps_x).contains (word_str[offset]) &&
-        (!dodgy_blob (blob_it.data ()))) {
-          blob_box = blob_it.data ()->bounding_box ();
-          blob_ht_above_baseline =
-            blob_box.top () - bln_baseline_offset;
-          if (blob_ht_above_baseline <= short_limit)
-            short_ambigs.add (blob_ht_above_baseline, 1);
-          else if (blob_ht_above_baseline >= tall_limit)
-            tall_ambigs.add (blob_ht_above_baseline, 1);
-        }
-      }
-      *ambig_lc_x_est = short_ambigs.mean ();
-      *ambig_uc_caps_est = tall_ambigs.mean ();
-      /* Cop out if we havent got sensible clusters. */
-      if (*ambig_uc_caps_est - *ambig_lc_x_est <= x_ht_ok_variation)
-        *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
-      //close enough
-    }
-  }
-}
-
-
-/**
- * dodgy_blob()
- * Returns true if the blob has more than one outline, one above the other.
- * These are dodgy as the top blob could be noise, causing the bounding box xht
- * to be misleading
- */
-
-BOOL8 dodgy_blob(PBLOB *blob) {
-  OUTLINE_IT outline_it = blob->out_list ();
-  inT16 highest_bottom = -MAX_INT16;
-  inT16 lowest_top = MAX_INT16;
-  TBOX outline_box;
-
-  if (x_ht_include_dodgy_blobs)
-    return FALSE;                //no blob is ever dodgy
-  for (outline_it.mark_cycle_pt ();
-  !outline_it.cycled_list (); outline_it.forward ()) {
-    outline_box = outline_it.data ()->bounding_box ();
-    if (lowest_top > outline_box.top ())
-      lowest_top = outline_box.top ();
-    if (highest_bottom < outline_box.bottom ())
-      highest_bottom = outline_box.bottom ();
-  }
-  return highest_bottom >= lowest_top;
-}
--- a/ccmain/fixxht.h
+++ b/ccmain/fixxht.h
@ -1,92 +0,0 @@
-/**********************************************************************
- * File:        fixxht.h  (Formerly fixxht.h)
- * Description: Improve x_ht and look out for case inconsistencies
- * Author:		Phil Cheatle
- * Created:		Thu Aug  5 14:11:08 BST 1993
- *
- * (C) Copyright 1992, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef           FIXXHT_H
-#define           FIXXHT_H
-
-#include          "varable.h"
-#include          "statistc.h"
-#include          "pageres.h"
-#include          "notdll.h"
-
-extern double_VAR_H (x_ht_fraction_of_caps_ht, 0.7,
-"Fract of cps ht est of xht");
-extern double_VAR_H (x_ht_variation, 0.35,
-"Err band as fract of caps/xht dist");
-extern double_VAR_H (x_ht_sub_variation, 0.5,
-"Err band as fract of caps/xht dist");
-extern BOOL_VAR_H (rej_trial_ambigs, TRUE,
-"reject x-ht ambigs when under trial");
-extern BOOL_VAR_H (x_ht_conservative_ambigs, FALSE,
-"Dont rely on ambigs + maxht");
-extern BOOL_VAR_H (x_ht_check_est, TRUE, "Cross check estimates");
-extern BOOL_VAR_H (x_ht_case_flip, FALSE, "Flip or reject suspect case");
-extern BOOL_VAR_H (x_ht_include_dodgy_blobs, TRUE,
-"Include blobs with possible noise?");
-extern BOOL_VAR_H (x_ht_limit_flip_trials, TRUE,
-"Dont do trial flips when ambigs are close to xht?");
-extern BOOL_VAR_H (rej_use_check_block_occ, TRUE,
-"Analyse rejection behaviour");
-extern STRING_VAR_H (chs_non_ambig_caps_ht,
-"!#$%&()/12346789?ABDEFGHIKLNQRT[]\\bdfhkl",
-"Reliable ascenders");
-extern STRING_VAR_H (chs_x_ht, "acegmnopqrsuvwxyz", "X height chars");
-extern STRING_VAR_H (chs_non_ambig_x_ht, "aenqr", "reliable X height chars");
-extern STRING_VAR_H (chs_ambig_caps_x, "cCmMoO05sSuUvVwWxXzZ",
-"X ht or caps ht chars");
-extern STRING_VAR_H (chs_bl_ambig_caps_x, "pPyY",
-" Caps or descender ambigs");
-extern STRING_VAR_H (chs_caps_ht,
-"!#$%&()/0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]\\bdfhkl{|}",
-"Ascender chars");
-extern STRING_VAR_H (chs_desc, "gjpqy", "Descender chars");
-extern STRING_VAR_H (chs_non_ambig_bl,
-"!#$%&01246789?ABCDEFGHIKLMNORSTUVWXYZabcdehiklmnorstuvwxz",
-"Reliable baseline chars");
-extern STRING_VAR_H (chs_odd_top, "ijt", "Chars with funny ascender region");
-extern STRING_VAR_H (chs_odd_bot, "()35JQ[]\\/{}|", "Chars with funny base");
-extern STRING_VAR_H (chs_bl,
-"!#$%&()/01246789?ABCDEFGHIJKLMNOPRSTUVWXYZ[]\\abcdefhiklmnorstuvwxz{}",
-"Baseline chars");
-extern STRING_VAR_H (chs_non_ambig_desc, "gq", "Reliable descender chars");
-void re_estimate_x_ht(                     //improve for 1 word
-                      WERD_RES *word_res,  //word to do
-                      float *trial_x_ht    //new match value
-                     );
-void check_blob_occ(char *proposed_char,
-                    inT16 blob_ht_above_baseline,
-                    float x_ht,
-                    float caps_ht,
-                    char *confirmed_char);
-float estimate_from_stats(STATS &stats);
-void improve_estimate(WERD_RES *word_res,
-                      float &est_x_ht,
-                      float &est_caps_ht,
-                      STATS &x_ht,
-                      STATS &caps_ht);
-void reject_ambigs(  //rej any accepted xht ambig chars
-                   WERD_RES *word);
-                                 //xht ambig ht stats
-void est_ambigs(WERD_RES *word_res,
-                STATS &stats,
-                float *ambig_lc_x_est,    //xht est
-                float *ambig_uc_caps_est  //caps est
-               );
-BOOL8 dodgy_blob(PBLOB *blob);
-#endif
--- a/ccmain/matmatch.cpp
+++ b/ccmain/matmatch.cpp
@ -1,396 +0,0 @@
-/**********************************************************************
- * File:        matmatch.cpp  (Formerly matrix_match.c)
- * Description: matrix matching routines for Tessedit
- * Author:      Chris Newton
- * Created:     Wed Nov 24 15:57:41 GMT 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#include "mfcpch.h"
-#include          <stdlib.h>
-#include          <math.h>
-#include          <string.h>
-#include          <ctype.h>
-#ifdef __UNIX__
-#include          <assert.h>
-#endif
-#include          "tessvars.h"
-#include          "stderr.h"
-#include          "img.h"
-//#include          "evnts.h"
-//#include          "showim.h"
-#include          "hosthplb.h"
-#include          "scrollview.h"
-//#include          "evnts.h"
-#include          "adaptions.h"
-#include          "matmatch.h"
-#include          "secname.h"
-#include "svshowim.h"
-
-// Include automatically generated configuration file if running autoconf.
-#ifdef HAVE_CONFIG_H
-#include "config_auto.h"
-#endif
-
-#define EXTERN
-
-EXTERN BOOL_VAR (tessedit_display_mm, FALSE, "Display matrix matches");
-EXTERN BOOL_VAR (tessedit_mm_debug, FALSE,
-"Print debug information for matrix matcher");
-EXTERN INT_VAR (tessedit_mm_prototype_min_size, 3,
-"Smallest number of samples in a cluster for a prototype to be used");
-
-// Colours for displaying the match
-#define BB_COLOUR 0
-#define BW_COLOUR 1
-#define WB_COLOUR 3
-#define UB_COLOUR 5
-#define BU_COLOUR 7
-#define UU_COLOUR 9
-#define WU_COLOUR 11
-#define UW_COLOUR 13
-#define WW_COLOUR 15
-
-#define BINIM_BLACK 0
-#define BINIM_WHITE 1
-
-float matrix_match(  // returns match score
-                   IMAGE *image1,
-                   IMAGE *image2) {
-  ASSERT_HOST (image1->get_bpp () == 1 && image2->get_bpp () == 1);
-
-  if (image1->get_xsize () >= image2->get_xsize ())
-    return match1 (image1, image2);
-  else
-    return match1 (image2, image1);
-}
-
-
-float match1(  /* returns match score */
-             IMAGE *image_w,
-             IMAGE *image_n) {
-  inT32 x_offset;
-  inT32 y_offset;
-  inT32 x_size = image_w->get_xsize ();
-  inT32 y_size;
-  inT32 x_size2 = image_n->get_xsize ();
-  inT32 y_size2;
-  IMAGE match_image;
-  IMAGELINE imline_w;
-  IMAGELINE imline_n;
-  IMAGELINE match_imline;
-  inT32 x;
-  inT32 y;
-  float sum = 0.0;
-
-  x_offset = (image_w->get_xsize () - image_n->get_xsize ()) / 2;
-
-  ASSERT_HOST (x_offset >= 0);
-  match_imline.init (x_size);
-
-  sum = 0;
-
-  if (image_w->get_ysize () < image_n->get_ysize ()) {
-    y_size = image_n->get_ysize ();
-    y_size2 = image_w->get_ysize ();
-    y_offset = (y_size - y_size2) / 2;
-
-    if (tessedit_display_mm && !tessedit_mm_use_prototypes)
-      tprintf ("I1 (%d, %d), I2 (%d, %d), MI (%d, %d)\n", x_size,
-        image_w->get_ysize (), x_size2, image_n->get_ysize (),
-        x_size, y_size);
-
-    match_image.create (x_size, y_size, 4);
-
-    for (y = 0; y < y_offset; y++) {
-      image_n->fast_get_line (0, y, x_size2, &imline_n);
-      for (x = 0; x < x_size2; x++) {
-        if (imline_n.pixels[x] == BINIM_BLACK) {
-          sum += -1;
-          match_imline.pixels[x] = UB_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = UW_COLOUR;
-        }
-      }
-      match_image.fast_put_line (x_offset, y, x_size2, &match_imline);
-    }
-
-    for (y = y_offset + y_size2; y < y_size; y++) {
-      image_n->fast_get_line (0, y, x_size2, &imline_n);
-      for (x = 0; x < x_size2; x++) {
-        if (imline_n.pixels[x] == BINIM_BLACK) {
-          sum += -1.0;
-          match_imline.pixels[x] = UB_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = UW_COLOUR;
-        }
-      }
-      match_image.fast_put_line (x_offset, y, x_size2, &match_imline);
-    }
-
-    for (y = y_offset; y < y_offset + y_size2; y++) {
-      image_w->fast_get_line (0, y - y_offset, x_size, &imline_w);
-      image_n->fast_get_line (0, y, x_size2, &imline_n);
-      for (x = 0; x < x_offset; x++) {
-        if (imline_w.pixels[x] == BINIM_BLACK) {
-          sum += -1.0;
-          match_imline.pixels[x] = BU_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = WU_COLOUR;
-        }
-      }
-
-      for (x = x_offset + x_size2; x < x_size; x++) {
-        if (imline_w.pixels[x] == BINIM_BLACK) {
-          sum += -1.0;
-          match_imline.pixels[x] = BU_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = WU_COLOUR;
-        }
-      }
-
-      for (x = x_offset; x < x_offset + x_size2; x++) {
-        if (imline_n.pixels[x - x_offset] == imline_w.pixels[x]) {
-          sum += 1.0;
-          if (imline_w.pixels[x] == BINIM_BLACK)
-            match_imline.pixels[x] = BB_COLOUR;
-          else
-            match_imline.pixels[x] = WW_COLOUR;
-        }
-        else {
-          sum += -1.0;
-          if (imline_w.pixels[x] == BINIM_BLACK)
-            match_imline.pixels[x] = BW_COLOUR;
-          else
-            match_imline.pixels[x] = WB_COLOUR;
-        }
-      }
-
-      match_image.fast_put_line (0, y, x_size, &match_imline);
-    }
-  }
-  else {
-    y_size = image_w->get_ysize ();
-    y_size2 = image_n->get_ysize ();
-    y_offset = (y_size - y_size2) / 2;
-
-    if (tessedit_display_mm && !tessedit_mm_use_prototypes)
-      tprintf ("I1 (%d, %d), I2 (%d, %d), MI (%d, %d)\n", x_size,
-        image_w->get_ysize (), x_size2, image_n->get_ysize (),
-        x_size, y_size);
-
-    match_image.create (x_size, y_size, 4);
-
-    for (y = 0; y < y_offset; y++) {
-      image_w->fast_get_line (0, y, x_size, &imline_w);
-      for (x = 0; x < x_size; x++) {
-        if (imline_w.pixels[x] == BINIM_BLACK) {
-          sum += -1;
-          match_imline.pixels[x] = BU_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = WU_COLOUR;
-        }
-      }
-      match_image.fast_put_line (0, y, x_size, &match_imline);
-    }
-
-    for (y = y_offset + y_size2; y < y_size; y++) {
-      image_w->fast_get_line (0, y, x_size, &imline_w);
-      for (x = 0; x < x_size; x++) {
-        if (imline_w.pixels[x] == BINIM_BLACK) {
-          sum += -1;
-          match_imline.pixels[x] = BU_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = WU_COLOUR;
-        }
-      }
-      match_image.fast_put_line (0, y, x_size, &match_imline);
-    }
-
-    for (y = y_offset; y < y_offset + y_size2; y++) {
-      image_w->fast_get_line (0, y, x_size, &imline_w);
-      image_n->fast_get_line (0, y - y_offset, x_size2, &imline_n);
-      for (x = 0; x < x_offset; x++) {
-        if (imline_w.pixels[x] == BINIM_BLACK) {
-          sum += -1.0;
-          match_imline.pixels[x] = BU_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = WU_COLOUR;
-        }
-      }
-
-      for (x = x_offset + x_size2; x < x_size; x++) {
-        if (imline_w.pixels[x] == BINIM_BLACK) {
-          sum += -1.0;
-          match_imline.pixels[x] = BU_COLOUR;
-        }
-        else {
-          match_imline.pixels[x] = WU_COLOUR;
-        }
-      }
-
-      for (x = x_offset; x < x_offset + x_size2; x++) {
-        if (imline_n.pixels[x - x_offset] == imline_w.pixels[x]) {
-          sum += 1.0;
-          if (imline_w.pixels[x] == BINIM_BLACK)
-            match_imline.pixels[x] = BB_COLOUR;
-          else
-            match_imline.pixels[x] = WW_COLOUR;
-        }
-        else {
-          sum += -1.0;
-          if (imline_w.pixels[x] == BINIM_BLACK)
-            match_imline.pixels[x] = BW_COLOUR;
-          else
-            match_imline.pixels[x] = WB_COLOUR;
-        }
-      }
-
-      match_image.fast_put_line (0, y, x_size, &match_imline);
-    }
-  }
-
-#ifndef GRAPHICS_DISABLED
-  if (tessedit_display_mm && !tessedit_mm_use_prototypes) {
-    tprintf ("Match score %f\n", 1.0 - sum / (x_size * y_size));
-    display_images(image_w, image_n, &match_image);
-  }
-#endif
-
-  if (tessedit_mm_debug)
-    tprintf ("Match score %f\n", 1.0 - sum / (x_size * y_size));
-
-  return (1.0 - sum / (x_size * y_size));
-}
-
-
-/*************************************************************************
- * display_images()
- *
- * Show a pair of images, plus the match image
- *
- *************************************************************************/
-
-#ifndef GRAPHICS_DISABLED
-void display_images(IMAGE *image_w, IMAGE *image_n, IMAGE *match_image) {
-  ScrollView* w_im_window;
-  ScrollView* n_im_window;
-  ScrollView* match_window;
-  inT16 i;
-
-  w_im_window = new ScrollView("Image 1", 20, 100,
-      10 * image_w->get_xsize (), 10 * image_w->get_ysize (),
-      image_w->get_xsize (), image_w->get_ysize ());
-
-  sv_show_sub_image (image_w,
-    0, 0,
-    image_w->get_xsize (), image_w->get_ysize (),
-    w_im_window, 0, 0);
-
-  w_im_window->Pen(255,0,0);
-  for (i = 1; i < image_w->get_xsize (); i++) {
-     w_im_window->Line(i, 0, i, image_w->get_ysize ());
-  }
-  for (i = 1; i < image_w->get_ysize (); i++) {
-    w_im_window->Line(0, i, image_w->get_xsize (), i);
-  }
-
-  n_im_window = new ScrollView ("Image 2", 240, 100,
-      10 * image_n->get_xsize (), 10 * image_n->get_ysize (),
-      image_n->get_xsize (), image_n->get_ysize ());
-
-  sv_show_sub_image (image_n,
-    0, 0,
-    image_n->get_xsize (), image_n->get_ysize (),
-    n_im_window, 0, 0);
-
-  n_im_window->Pen(255,0,0);
-  for (i = 1; i < image_n->get_xsize (); i++) {
-     n_im_window->Line(i, 0, i, image_n->get_ysize ());
-  }
-  for (i = 1; i < image_n->get_ysize (); i++) {
-    n_im_window->Line(0, i, image_n->get_xsize (), i);
-  }
-
-  match_window = new ScrollView ("Match Result", 460, 100,
-       10 * match_image->get_xsize (), 10 * match_image->get_ysize (),
-       match_image->get_xsize (), match_image->get_ysize ());
-
-  match_window->Clear();
-  sv_show_sub_image (match_image,
-    0, 0,
-    match_image->get_xsize (), match_image->get_ysize (),
-    match_window, 0, 0);
-
-  match_window->Pen(255,0,0);
-  for (i = 1; i < match_image->get_xsize (); i++) {
-     match_window->Line(i, 0, i, match_image->get_ysize ());
-  }
-  for (i = 1; i < match_image->get_ysize (); i++) {
-     match_window->Line(0, i, match_image->get_xsize (), i);
-  }
-  SVEvent* sve = match_window->AwaitEvent(SVET_DESTROY);
-  delete sve;
-
-  delete w_im_window;
-  delete n_im_window;
-  delete match_window;
-}
-
-
-/*************************************************************************
- * display_image()
- *
- * Show a single image
- *
- *************************************************************************/
-
-ScrollView* display_image(IMAGE *image,
-                     const char *title,
-                     inT32 x,
-                     inT32 y,
-                     BOOL8 wait) {
-  ScrollView* im_window;
-  inT16 i;
-
-  im_window = new ScrollView (title, x, y,
-      10 * image->get_xsize (), 10 * image->get_ysize (),
-      image->get_xsize (),  image->get_ysize ());
-
-  sv_show_sub_image (image,
-    0, 0,
-    image->get_xsize (), image->get_ysize (), im_window, 0, 0);
-
-  im_window->Pen(255,0,0);
-  for (i = 1; i < image->get_xsize (); i++) {
-    im_window->SetCursor(i, 0);
-    im_window->DrawTo(i, image->get_ysize());
-  }
-  for (i = 1; i < image->get_ysize (); i++) {
-    im_window->SetCursor(0, i);
-    im_window->DrawTo(image->get_xsize(),i);
-  }
-
-  if (wait) { delete im_window->AwaitEvent(SVET_CLICK); }
-
-  return im_window;
-}
-#endif
--- a/ccmain/matmatch.h
+++ b/ccmain/matmatch.h
@ -1,48 +0,0 @@
-/**********************************************************************
- * File:        matmatch.h  (Formerly matrix_match.h)
- * Description: matrix matching routines for Tessedit
- * Author:      Chris Newton
- * Created:     Wed Nov 24 15:57:41 GMT 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef           MATMATCH_H
-#define           MATMATCH_H
-
-#include          "img.h"
-#include          "hosthplb.h"
-#include          "notdll.h"
-
-#define BINIM_BLACK 0
-#define BINIM_WHITE 1
-#define BAD_MATCH 9999.0
-
-extern BOOL_VAR_H (tessedit_display_mm, FALSE, "Display matrix matches");
-extern BOOL_VAR_H (tessedit_mm_debug, FALSE,
-"Print debug information for matrix matcher");
-extern INT_VAR_H (tessedit_mm_prototype_min_size, 3,
-"Smallest number of samples in a cluster for a prototype to be used");
-float matrix_match(  // returns match score
-                   IMAGE *image1,
-                   IMAGE *image2);
-float match1(  /* returns match score */
-             IMAGE *image_w,
-             IMAGE *image_n);
-void display_images(IMAGE *image_w, IMAGE *image_n, IMAGE *match_image);
-ScrollView* display_image(IMAGE *image,
-                     const char *title,
-                     inT32 x,
-                     inT32 y,
-                     BOOL8 wait);
-#endif
--- a/ccmain/osdetect.cpp
+++ b/ccmain/osdetect.cpp
@ -2,6 +2,7 @@
 // File:        osdetect.cpp
 // Description: Orientation and script detection.
 // Author:      Samuel Charron
+//              Ranjith Unnikrishnan
 //
 // (C) Copyright 2008, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -18,24 +19,25 @@

 #include "osdetect.h"

-#include "strngs.h"
 #include "blobbox.h"
 #include "blread.h"
-#include "tordmain.h"
-#include "ratngs.h"
+#include "colfind.h"
+#include "imagefind.h"
+#include "linefind.h"
 #include "oldlist.h"
-#include "adaptmatch.h"
-#include "tstruct.h"
-#include "expandblob.h"
-#include "tesseractclass.h"
 #include "qrsequence.h"
-
-extern IMAGE page_image;
+#include "ratngs.h"
+#include "strngs.h"
+#include "tabvector.h"
+#include "tesseractclass.h"
+#include "textord.h"
+#include "tstruct.h"

 const int kMinCharactersToTry = 50;
 const int kMaxCharactersToTry = 5 * kMinCharactersToTry;

 const float kSizeRatioToReject = 2.0;
+const int kMinAcceptableBlobHeight = 10;

 const float kOrientationAcceptRatio = 1.3;
 const float kScriptAcceptRatio = 1.3;
@ -43,8 +45,6 @@ const float kScriptAcceptRatio = 1.3;
 const float kHanRatioInKorean = 0.7;
 const float kHanRatioInJapanese = 0.3;

-const float kLatinRationInFraktur = 0.7;
-
 const float kNonAmbiguousMargin = 1.0;

 // General scripts
@ -59,45 +59,140 @@ const char* ScriptDetector::korean_script_ = "Korean";
 const char* ScriptDetector::japanese_script_ = "Japanese";
 const char* ScriptDetector::fraktur_script_ = "Fraktur";

-CLISTIZEH(BLOBNBOX);
-CLISTIZE(BLOBNBOX);
+// Minimum believable resolution.
+const int kMinCredibleResolution = 70;
+// Default resolution used if input is not believable.
+const int kDefaultResolution = 300;
+
+// Recomputes best_result from the accumulated orientation scores: records the
+// id of the highest-scoring orientation and, as its confidence, the margin by
+// which it beats the runner-up. Equal scores resolve to the lower id.
+void OSResults::update_best_orientation() {
+  // Seed the two leading candidates from ids 0 and 1.
+  int best_id = (orientations[0] < orientations[1]) ? 1 : 0;
+  float top_score = orientations[best_id];
+  float runner_up = orientations[1 - best_id];
+  // Fold in the remaining two orientations.
+  for (int id = 2; id < 4; ++id) {
+    const float score = orientations[id];
+    if (score > top_score) {
+      runner_up = top_score;
+      top_score = score;
+      best_id = id;
+    } else if (score > runner_up) {
+      runner_up = score;
+    }
+  }
+  best_result.orientation_id = best_id;
+  // Confidence is the gap between the two best orientation scores.
+  best_result.oconfidence = top_score - runner_up;
+}
+
+// Pins the best orientation to the caller-supplied id without consulting the
+// orientations[] scores, and resets the orientation confidence to zero.
+void OSResults::set_best_orientation(int orientation_id) {
+  best_result.oconfidence = 0;
+  best_result.orientation_id = orientation_id;
+}
+
+// Recomputes best_result.script_id and best_result.sconfidence from the
+// script scores accumulated in scripts_na for the given orientation.
+// The confidence is the winner/runner-up score ratio, rescaled so that it
+// exceeds 1 once the ratio exceeds kScriptAcceptRatio.
+void OSResults::update_best_script(int orientation) {
+  // We skip index 0 to ignore the "Common" script.
+  float first = scripts_na[orientation][1];
+  float second = scripts_na[orientation][2];
+  best_result.script_id = 1;
+  if (scripts_na[orientation][1] < scripts_na[orientation][2]) {
+    first = scripts_na[orientation][2];
+    second = scripts_na[orientation][1];
+    best_result.script_id = 2;
+  }
+  for (int i = 3; i < kMaxNumberOfScripts; ++i) {
+    if (scripts_na[orientation][i] > first) {
+      best_result.script_id = i;
+      second = first;
+      first = scripts_na[orientation][i];
+    } else if (scripts_na[orientation][i] > second) {
+      second = scripts_na[orientation][i];
+    }
+  }
+  if (second != 0.0f) {
+    best_result.sconfidence =
+        (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
+  } else if (first > 0.0f) {
+    // The runner-up has no score at all, so the ratio is undefined. Report a
+    // finite confidence that is still above the threshold of 1, instead of
+    // the +inf that dividing by zero used to produce.
+    best_result.sconfidence = 2.0f;
+  } else {
+    // No script has any score yet (previously 0/0, i.e. NaN): no evidence.
+    best_result.sconfidence = 0.0f;
+  }
+}
+
+// Detect and erase horizontal/vertical lines and picture regions from the
+// image, so that non-text blobs are removed from consideration.
+// tess: supplies the binary page image (pix_binary() must be non-NULL) and
+//   the textord object used to extract connected components.
+// blocks / to_blocks: passed straight through to find_components after the
+//   line and image finders have run.
+// NOTE(review): the erasing itself presumably happens inside the LineFinder /
+// ImageFinder calls; nothing in this function edits the image directly.
+void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
+                            TO_BLOCK_LIST *to_blocks) {
+  Pix *pix = tess->pix_binary();
+  ASSERT_HOST(pix != NULL);
+  int vertical_x = 0;
+  int vertical_y = 1;
+  tesseract::TabVector_LIST v_lines;
+  tesseract::TabVector_LIST h_lines;
+  Boxa* boxa = NULL;
+  Pixa* pixa = NULL;
+  // Clamp the reported resolution from below using the file-scope
+  // kMinCredibleResolution rather than a shadowing local copy of it.
+  const int input_resolution = pixGetXRes(pix);
+  const int resolution = (kMinCredibleResolution > input_resolution) ?
+      kMinCredibleResolution : input_resolution;
+
+  tesseract::LineFinder::FindVerticalLines(resolution, pix, &vertical_x,
+                                           &vertical_y, &v_lines);
+  tesseract::LineFinder::FindHorizontalLines(resolution, pix, &h_lines);
+  tesseract::ImageFinder::FindImages(pix, &boxa, &pixa);
+  // The image regions themselves are not needed here; release them at once.
+  pixaDestroy(&pixa);
+  boxaDestroy(&boxa);
+  tess->mutable_textord()->find_components(tess->pix_binary(),
+                                           blocks, to_blocks);
+}

 // Find connected components in the page and process a subset until finished or
 // a stopping criterion is met.
-// Returns true if the page was successfully processed.
-bool orientation_and_script_detection(STRING& filename,
-                                      OSResults* osr,
-                                      tesseract::Tesseract* tess) {
+// filename: image file name; everything from the last '.' onwards is dropped
+//   and the remainder is passed to read_unlv_file.
+// osr: forwarded unchanged to os_detect, which fills in the results.
+// tess: must already hold a binary page image (pix_binary() != NULL).
+// Returns the number of blobs used in making the estimate. 0 implies failure.
+int orientation_and_script_detection(STRING& filename,
+                                     OSResults* osr,
+                                     tesseract::Tesseract* tess) {
  STRING name = filename;        //truncated name
  const char *lastdot;           //of name
-  TO_BLOCK_LIST land_blocks, port_blocks;
-  BLOCK_LIST blocks;
  TBOX page_box;

  lastdot = strrchr (name.string (), '.');
  if (lastdot != NULL)
    name[lastdot-name.string()] = '\0';
-  if (!read_unlv_file(name, page_image.get_xsize(), page_image.get_ysize(),
-                     &blocks))
-    FullPageBlock(page_image.get_xsize(), page_image.get_ysize(), &blocks);
-  find_components(&blocks, &land_blocks, &port_blocks, &page_box);
+
+  ASSERT_HOST(tess->pix_binary() != NULL);
+  int width = pixGetWidth(tess->pix_binary());
+  int height = pixGetHeight(tess->pix_binary());
+  int resolution = pixGetXRes(tess->pix_binary());
+  // Zero resolution messes up the algorithms, so make sure it is credible.
+  // NOTE(review): resolution is not read again in this function after being
+  // corrected here - confirm whether it was meant to be passed on.
+  if (resolution < kMinCredibleResolution)
+    resolution = kDefaultResolution;
+
+  // Use the block layout from file if one can be read, otherwise treat the
+  // whole page as a single block.
+  BLOCK_LIST blocks;
+  if (!read_unlv_file(name, width, height, &blocks))
+    FullPageBlock(width, height, &blocks);
+
+  // Try to remove non-text regions from consideration.
+  // NOTE(review): land_blocks is declared but never used below.
+  TO_BLOCK_LIST land_blocks, port_blocks;
+  remove_nontext_regions(tess, &blocks, &port_blocks);
+
+  if (port_blocks.empty()) {
+    // page segmentation did not succeed, so we need to find_components first.
+    tess->mutable_textord()->find_components(tess->pix_binary(),
+                                             &blocks, &port_blocks);
+  } else {
+    page_box.set_left(0);
+    page_box.set_bottom(0);
+    page_box.set_right(width);
+    page_box.set_top(height);
+    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
+    tess->mutable_textord()->filter_blobs(page_box.topright(),
+                                          &port_blocks, true);
+  }
+
  return os_detect(&port_blocks, osr, tess);
 }

 // Filter and sample the blobs.
-// Returns true if the page was successfully processed, or false if the page had
-// too few characters to be reliable
-bool os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
-               tesseract::Tesseract* tess) {
+// Returns a non-zero number of blobs if the page was successfully processed, or
+// zero if the page had too few characters to be reliable
+int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
+              tesseract::Tesseract* tess) {
  int blobs_total = 0;
-  OSResults osr_;
-  if (osr == NULL)
-    osr = &osr_;
-
-  osr->unicharset = &tess->unicharset;
-  OrientationDetector o(osr);
-  ScriptDetector s(osr, tess);
-
  TO_BLOCK_IT block_it;
  block_it.set_to_list(port_blocks);

@ -106,9 +201,11 @@ bool os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,

  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward ()) {
-    TO_BLOCK* block = block_it.data();
+    TO_BLOCK* to_block = block_it.data();
+    if (to_block->block->poly_block() &&
+        !to_block->block->poly_block()->IsText()) continue;
    BLOBNBOX_IT bbox_it;
-    bbox_it.set_to_list(&block->blobs);
+    bbox_it.set_to_list(&to_block->blobs);
    for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list ();
         bbox_it.forward ()) {
      BLOBNBOX* bbox = bbox_it.data();
@ -122,22 +219,36 @@ bool os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
      float ratio = x_y > y_x ? x_y : y_x;
      // Blob is ambiguous
      if (ratio > kSizeRatioToReject) continue;
-      if (box.height() < 10) continue;
+      if (box.height() < kMinAcceptableBlobHeight) continue;
      filtered_it.add_to_end(bbox);
    }
  }
-  if (filtered_it.length() > 0)
-    filtered_it.move_to_first();
+  return os_detect_blobs(&filtered_list, osr, tess);
+}

+// Detect orientation and script from a list of blobs.
+// Returns a non-zero number of blobs if the list was successfully processed, or
+// zero if the list had too few characters to be reliable
+int os_detect_blobs(BLOBNBOX_CLIST* blob_list, OSResults* osr,
+                    tesseract::Tesseract* tess) {
+  OSResults osr_;
+  if (osr == NULL)
+    osr = &osr_;
+
+  osr->unicharset = &tess->unicharset;
+  OrientationDetector o(osr);
+  ScriptDetector s(osr, tess);
+
+  BLOBNBOX_C_IT filtered_it(blob_list);
  int real_max = MIN(filtered_it.length(), kMaxCharactersToTry);
-   printf("Total blobs found = %d\n", blobs_total);
-   printf("Number of blobs post-filtering = %d\n", filtered_it.length());
-   printf("Number of blobs to try = %d\n", real_max);
+  // printf("Total blobs found = %d\n", blobs_total);
+  // printf("Number of blobs post-filtering = %d\n", filtered_it.length());
+  // printf("Number of blobs to try = %d\n", real_max);

  // If there are too few characters, skip this page entirely.
  if (real_max < kMinCharactersToTry / 2) {
    printf("Too few characters. Skipping this page\n");
-    return false;
+    return 0;
  }

  BLOBNBOX** blobs = new BLOBNBOX*[filtered_it.length()];
@ -147,18 +258,20 @@ bool os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
    blobs[number_of_blobs++] = (BLOBNBOX*)filtered_it.data();
  }
  QRSequenceGenerator sequence(number_of_blobs);
+  int num_blobs_evaluated = 0;
  for (int i = 0; i < real_max; ++i) {
    if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
        && i > kMinCharactersToTry) {
      break;
    }
+    ++num_blobs_evaluated;
  }
  delete [] blobs;

  // Make sure the best_result is up-to-date
  int orientation = o.get_orientation();
-  s.update_best_script(orientation);
-  return true;
+  osr->update_best_script(orientation);
+  return num_blobs_evaluated;
 }

 // Processes a single blob to estimate script and orientation.
@ -173,39 +286,40 @@ bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
  int       x_mid = (box.left() + box.right()) / 2.0f;
  int       y_mid = (box.bottom() + box.top()) / 2.0f;

-  PBLOB     pblob(blob, box.height());
+  PBLOB     pblob(blob);

  BLOB_CHOICE_LIST ratings[4];
  // Test the 4 orientations
  for (int i = 0; i < 4; ++i) {
    // normalize the blob
+    float scaling = static_cast<float>(kBlnXHeight) / box.height();
+    DENORM denorm(x_mid, scaling, 0.0, box.bottom(), 0, NULL, false, NULL);
    pblob.move(FCOORD(-x_mid, -box.bottom()));
-    pblob.scale(static_cast<float>(bln_x_height) / box.height());
-    pblob.move(FCOORD(0.0f, bln_baseline_offset));
+    pblob.scale(scaling);
+    pblob.move(FCOORD(0.0f, kBlnBaselineOffset));

    {
      // List of choices given by the classifier
-      TBLOB *tessblob;               //converted blob
-      TEXTROW tessrow;               //dummy row
-
-      tess_cn_matching.set_value(true); // turn it on
-      tess_bn_matching.set_value(false);
-      //convert blob
-      tessblob = make_tess_blob (&pblob, TRUE);
-      //make dummy row
-      make_tess_row(NULL, &tessrow);
-      //classify
-      tess->AdaptiveClassifier (tessblob, NULL, &tessrow, ratings + i, NULL);
-      free_blob(tessblob);
+      tess->tess_cn_matching.set_value(true); // turn it on
+      tess->tess_bn_matching.set_value(false);
+      // Convert blob
+      TBLOB* tessblob = make_tess_blob(&pblob);
+      // Classify
+      tess->set_denorm(&denorm);
+      tess->AdaptiveClassifier(tessblob, ratings + i, NULL);
+      delete tessblob;
    }
    // undo normalize
-    pblob.move(FCOORD(0.0f, -bln_baseline_offset));
-    pblob.scale(1.0f / (static_cast<float>(bln_x_height) / box.height()));
+    pblob.move(FCOORD(0.0f, -kBlnBaselineOffset));
+    pblob.scale(1.0f / scaling);
    pblob.move(FCOORD(x_mid, box.bottom()));

    // center the blob
    pblob.move(FCOORD(-x_mid, -y_mid));

+    // TODO(rays) Although we should now get the correct image coords with
+    // the DENORM, there is nothing to tell the classifier to rotate the
+    // image or to actually rotate the image for it.
    // Rotate it
    pblob.rotate();

@ -233,14 +347,24 @@ OrientationDetector::OrientationDetector(OSResults* osr) {
 // Score the given blob and return true if it is now sure of the orientation
 // after adding this block.
 bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
+  float blob_o_score[4] = {0.0, 0.0, 0.0, 0.0};
+  float total_blob_o_score = 0.0;
+
  for (int i = 0; i < 4; ++i) {
    BLOB_CHOICE_IT choice_it;
    choice_it.set_to_list(scores + i);
-
    if (!choice_it.empty()) {
-      osr_->orientations[i] += (100 + choice_it.data()->certainty());
+      // The certainty score ranges between [-20,0]. This is converted here to
+      // [0,1], with 1 indicating best match.
+      blob_o_score[i] = 1 + 0.05 * choice_it.data()->certainty();
+      total_blob_o_score += blob_o_score[i];
    }
  }
+  // Normalize the orientation scores for the blob and use them to
+  // update the aggregated orientation score.
+  for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
+    osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
+  }

  float first = -1;
  float second = -1;
@ -259,35 +383,9 @@ bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
  return first / second > kOrientationAcceptRatio;
 }

-void OrientationDetector::update_best_orientation() {
-  float first = osr_->orientations[0];
-  float second = osr_->orientations[1];
-
-  if (first < second) {
-    second = first;
-    first = osr_->orientations[1];
-  }
-
-  osr_->best_result.orientation = 0;
-  osr_->best_result.oconfidence = 0;
-
-  for (int i = 0; i < 4; ++i) {
-    if (osr_->orientations[i] > first) {
-      second = first;
-      first = osr_->orientations[i];
-      osr_->best_result.orientation = i;
-    } else if (osr_->orientations[i] > second) {
-      second = osr_->orientations[i];
-    }
-  }
-
-  osr_->best_result.oconfidence =
-      (first / second - 1.0) / (kOrientationAcceptRatio - 1.0);
-}
-
 int OrientationDetector::get_orientation() {
-  update_best_orientation();
-  return osr_->best_result.orientation;
+  // Refresh best_result from the accumulated orientation scores, then return
+  // the winning orientation id stored there.
+  osr_->update_best_orientation();
+  return osr_->best_result.orientation_id;
 }


@ -347,7 +445,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
        prev_class_id = choice->unichar_id();
        prev_config = choice->config();
      } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
-        script_count++;
+        ++script_count;
        next_best_score = -choice->certainty();
        next_best_script_id = choice->script_id();
        next_best_unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
@ -365,7 +463,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
    // Character is non ambiguous
    if (script_count == 1) {
      // Update the score of the winning script
-      osr_->scripts_na[i][prev_id] += 1;
+      osr_->scripts_na[i][prev_id] += 1.0;

      // Workaround for Fraktur
      if (prev_id == latin_id_) {
@ -379,19 +477,19 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
          //       fi.is_serif(), fi.is_fraktur(),
          //       prev_unichar);
          if (fi.is_fraktur()) {
-            osr_->scripts_na[i][prev_id] -= 1;
-            osr_->scripts_na[i][fraktur_id_] += 1;
+            osr_->scripts_na[i][prev_id] -= 1.0;
+            osr_->scripts_na[i][fraktur_id_] += 1.0;
          }
        }
      }

      // Update Japanese / Korean pseudo-scripts
      if (prev_id == katakana_id_)
-        osr_->scripts_na[i][japanese_id_] += 1;
+        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hiragana_id_)
-        osr_->scripts_na[i][japanese_id_] += 1;
+        osr_->scripts_na[i][japanese_id_] += 1.0;
      if (prev_id == hangul_id_)
-        osr_->scripts_na[i][korean_id_] += 1;
+        osr_->scripts_na[i][korean_id_] += 1.0;
      if (prev_id == han_id_)
        osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
      if (prev_id == han_id_)
@ -401,27 +499,24 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
 }

 bool ScriptDetector::must_stop(int orientation) {
-  update_best_script(orientation);
+  // Recompute the best script for this orientation, then report whether its
+  // confidence has risen above 1.
+  osr_->update_best_script(orientation);
  return osr_->best_result.sconfidence > 1;
 }

-
-void ScriptDetector::update_best_script(int orientation) {
-  float first = -1;
-  float second = -1;
-
-  // i = 1 -> ignore Common scripts
-  for (int i = 1; i < kMaxNumberOfScripts; ++i) {
-    if (osr_->scripts_na[orientation][i] > first) {
-      osr_->best_result.script =
-          tess_->unicharset.get_script_from_script_id(i);
-      second = first;
-      first = osr_->scripts_na[orientation][i];
-    } else if (osr_->scripts_na[orientation][i] > second) {
-      second = osr_->scripts_na[orientation][i];
-    }
+// Helper method to convert an orientation index to its value in degrees.
+// The value represents the amount of clockwise rotation in degrees that must be
+// applied for the text to be upright (readable).
+// Ids 0, 1, 2, 3 map to 0, 270, 180, 90 respectively; any other id yields -1.
+const int OrientationIdToValue(const int& id) {
+  switch (id) {
+    case 0:
+      return 0;
+    case 1:
+      return 270;
+    case 2:
+      return 180;
+    case 3:
+      return 90;
+    default:
+      // Not a valid orientation id.
+      return -1;
  }
-
-  osr_->best_result.sconfidence =
-      (first / second - 1.0) / (kOrientationAcceptRatio - 1.0);
 }
--- a/ccmain/osdetect.h
+++ b/ccmain/osdetect.h
@ -2,6 +2,7 @@
 // File:        osdetect.h
 // Description: Orientation and script detection.
 // Author:      Samuel Charron
+//              Ranjith Unnikrishnan
 //
 // (C) Copyright 2008, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -25,6 +26,7 @@
 class TO_BLOCK_LIST;
 class BLOBNBOX;
 class BLOB_CHOICE_LIST;
+class BLOBNBOX_CLIST;

 namespace tesseract {
 class Tesseract;
@ -34,8 +36,10 @@ class Tesseract;
 const int kMaxNumberOfScripts = 116 + 1 + 2 + 1;

 struct OSBestResult {
-  int orientation;
-  const char* script;
+  // Starts out as orientation id 0, script id 0 and zero confidences.
+  OSBestResult() : orientation_id(0), script_id(0), sconfidence(0.0),
+                   oconfidence(0.0) {}
+  // Id of the winning orientation.
+  int orientation_id;
+  // Id of the winning script.
+  int script_id;
+  // Confidence in script_id.
  float sconfidence;
+  // Confidence in orientation_id.
  float oconfidence;
 };
@ -48,7 +52,16 @@ struct OSResults {
      orientations[i] = 0;
    }
  }
+  void update_best_orientation();
+  void set_best_orientation(int orientation_id);
+  void update_best_script(int orientation_id);
+
+  // Array holding scores for each orientation id [0,3].
+  // Orientation ids [0..3] map to [0, 270, 180, 90] degree orientations of the
+  // page respectively, where the values refer to the amount of clockwise
+  // rotation to be applied to the page for the text to be upright and readable.
  float orientations[4];
+  // Script confidence scores for each of 4 possible orientations.
  float scripts_na[4][kMaxNumberOfScripts];

  UNICHARSET* unicharset;
@ -59,7 +72,6 @@ class OrientationDetector {
 public:
  OrientationDetector(OSResults*);
  bool detect_blob(BLOB_CHOICE_LIST* scores);
-  void update_best_orientation();
  int get_orientation();
 private:
  OSResults* osr_;
@ -69,7 +81,6 @@ class ScriptDetector {
 public:
  ScriptDetector(OSResults*, tesseract::Tesseract* tess);
  void detect_blob(BLOB_CHOICE_LIST* scores);
-  void update_best_script(int);
  void get_script() ;
  bool must_stop(int orientation);
 private:
@ -88,15 +99,25 @@ class ScriptDetector {
  tesseract::Tesseract* tess_;
 };

-bool orientation_and_script_detection(STRING& filename,
-                                      OSResults*,
-                                      tesseract::Tesseract*);
+int orientation_and_script_detection(STRING& filename,
+                                     OSResults*,
+                                     tesseract::Tesseract*);

-bool os_detect(TO_BLOCK_LIST* port_blocks,
-               OSResults* osr,
-               tesseract::Tesseract* tess);
+int os_detect(TO_BLOCK_LIST* port_blocks,
+              OSResults* osr,
+              tesseract::Tesseract* tess);
+
+int os_detect_blobs(BLOBNBOX_CLIST* blob_list,
+                    OSResults* osr,
+                    tesseract::Tesseract* tess);

 bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
                    ScriptDetector* s, OSResults*,
                    tesseract::Tesseract* tess);
+
+// Helper method to convert an orientation index to its value in degrees.
+// The value represents the amount of clockwise rotation in degrees that must be
+// applied for the text to be upright (readable).
+const int OrientationIdToValue(const int& id);
+
 #endif  // TESSERACT_CCMAIN_OSDETECT_H__
--- a/ccmain/output.cpp
+++ b/ccmain/output.cpp
@ -22,28 +22,25 @@
 #endif

 #include "mfcpch.h"
-#include          "ocrshell.h"
-#include          <string.h>
-#include          <ctype.h>
+#include <string.h>
+#include <ctype.h>
 #ifdef __UNIX__
 #include          <assert.h>
 #include          <unistd.h>
-#include                    <errno.h>
+#include          <errno.h>
 #endif
-#include          "mainblk.h"
-#include          "tfacep.h"
-#include          "tessvars.h"
-#include          "control.h"
-#include          "secname.h"
-#include          "reject.h"
-#include          "docqual.h"
-#include          "output.h"
+#include "helpers.h"
+#include "tfacep.h"
+#include "tessvars.h"
+#include "control.h"
+#include "secname.h"
+#include "reject.h"
+#include "docqual.h"
+#include "output.h"
 #include "bestfirst.h"
 #include "globals.h"
 #include "tesseractclass.h"

-#define EXTERN
-
 #define EPAPER_EXT      ".ep"
 #define PAGE_YSIZE      3508
 #define CTRL_INSET      '\024'   //dc4=text inset
@ -54,44 +51,6 @@
 #define CTRL_NEWLINE      '\012' //newline
 #define CTRL_HARDLINE   '\015'   //cr

-EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
-"Write block separators in output");
-EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
-"Write raw stuff to name.raw");
-EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
-EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
-"Return ratings in IPEOCRAPI data");
-EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
-"Write .txt to .etx map file");
-EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
-"Write repetition char code");
-EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
-EXTERN STRING_EVAR (unrecognised_char, "|",
-"Output char for unidentified blobs");
-EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
-EXTERN INT_VAR (suspect_space_level, 100,
-"Min suspect level for rejecting spaces");
-EXTERN INT_VAR (suspect_short_words, 2,
-"Dont Suspect dict wds longer than this");
-EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
-"UNLV keep 1Il chars rejected");
-EXTERN double_VAR (suspect_rating_per_ch, 999.9,
-"Dont touch bad rating limit");
-EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");
-
-EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
-"Only reject tess failures");
-EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
-EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
-"Make output have exactly one word per WERD");
-EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
-"Dont reject ANYTHING AT ALL");
-EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
-"Force all rep chars the same");
-
-FILE *txt_mapfile = NULL;        //reject map
-FILE *unlv_file = NULL;          //reject map
-
 /**********************************************************************
 * pixels_to_pts
 *
@ -112,17 +71,13 @@ inT32 pixels_to_pts(               //convert coords
 namespace tesseract {
 void Tesseract::output_pass(  //Tess output pass //send to api
                            PAGE_RES_IT &page_res_it,
-                            BOOL8 write_to_shm,
-                            TBOX *target_word_box) {
+                            const TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word;
  inT16 block_id;
  BOOL8 force_eol;               //During output
  BLOCK *nextblock;              //block of next word
  WERD *nextword;                //next word

-  if (tessedit_write_txt_map)
-    txt_mapfile = open_outfile (".map");
-
  page_res_it.restart_page ();
  block_of_last_word = NULL;
  while (page_res_it.word () != NULL) {
@ -144,9 +99,6 @@ void Tesseract::output_pass(  //Tess output pass //send to api
    block_of_last_word != page_res_it.block ()) {
      block_of_last_word = page_res_it.block ();
      block_id = block_of_last_word->block->index();
-      if (!wordrec_no_block)
-        fprintf (textfile, "|^~tr%d\n", block_id);
-      fprintf (txt_mapfile, "|^~tr%d\n", block_id);
    }

    force_eol = (tessedit_write_block_separators &&
@ -162,23 +114,11 @@ void Tesseract::output_pass(  //Tess output pass //send to api
    else
      nextblock = NULL;
                                 //regardless of tilde crunching
-    write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol,
-      write_to_shm);
-    page_res_it.forward ();
-  }
-  if (write_to_shm)
-    ocr_send_text(FALSE);
-  if (tessedit_write_block_separators) {
-    if (!wordrec_no_block)
-      fprintf (textfile, "|^~tr\n");
-    fprintf (txt_mapfile, "|^~tr\n");
-  }
-  if (tessedit_write_txt_map) {
-    fprintf (txt_mapfile, "\n"); //because txt gets one
-    #ifdef __UNIX__
-    fsync (fileno (txt_mapfile));
-    #endif
-    fclose(txt_mapfile);
+    write_results(page_res_it,
+                  determine_newline_type(page_res_it.word()->word,
+                                         page_res_it.block()->block,
+                                         nextword, nextblock), force_eol);
+    page_res_it.forward();
  }
 }

@ -195,18 +135,10 @@ void Tesseract::output_pass(  //Tess output pass //send to api
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
 *                   reject strings in the epchoice text.
 *************************************************************************/
-
-void Tesseract::write_results(                        //output a word
-                                                      //full info
-                              PAGE_RES_IT &page_res_it,
-                              char newline_type,      //type of newline
-                                                      //override tilde crunch?
-                              BOOL8 force_eol,
-                              BOOL8 write_to_shm      //send to api
-                  ) {
-                                 //word to do
-  WERD_RES *word = page_res_it.word ();
-//   WERD_CHOICE *ep_choice;        //ep format
+void Tesseract::write_results(PAGE_RES_IT &page_res_it,
+                              char newline_type,  // type of newline
+                              BOOL8 force_eol) {  // override tilde crunch?
+  WERD_RES *word = page_res_it.word();
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
@ -217,49 +149,34 @@ void Tesseract::write_results(                        //output a word
  char txt_chs[32];              //Only for unlv_tilde_crunch
  char map_chs[32];              //Only for unlv_tilde_crunch
  int txt_index = 0;
-  static BOOL8 tilde_crunch_written = FALSE;
-  static BOOL8 last_char_was_newline = TRUE;
-  static BOOL8 last_char_was_tilde = FALSE;
-  static BOOL8 empty_block = TRUE;
  BOOL8 need_reject = FALSE;
  PBLOB_IT blob_it;              //blobs
  UNICHAR_ID space = unicharset.unichar_to_id(" ");
-
-  /*	if (word->best_choice->string().length() == 0)
-    {
-      tprintf("No output: to output\n");
-    }
-    else if (word->best_choice->string()[0]==' ')
-    {
-      tprintf("spaceword to output\n");
-    }
-    else if (word->best_choice->string()[0]=='\0')
-    {
-      tprintf("null to output\n");
-    }*/
-  if (word->unlv_crunch_mode != CR_NONE
-  && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
+  if ((word->unlv_crunch_mode != CR_NONE ||
+       word->best_choice->length() == 0) &&
+      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
-      (!tilde_crunch_written ||
-      ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
-      (word->word->space () > 0) &&
-      !word->word->flag (W_FUZZY_NON) &&
-    !word->word->flag (W_FUZZY_SP)))) {
+        (!stats_.tilde_crunch_written ||
+         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
+          (word->word->space () > 0) &&
+          !word->word->flag (W_FUZZY_NON) &&
+          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
-        (word->word->space () > 0) &&
-        !word->word->flag (W_FUZZY_NON) &&
-      !word->word->flag (W_FUZZY_SP)) {
-        /* Write a space to separate from preceeding good text */
+          (word->word->space () > 0) &&
+          !word->word->flag (W_FUZZY_NON) &&
+          !word->word->flag (W_FUZZY_SP)) {
+        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
-        last_char_was_tilde = FALSE;
+        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
-    if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {
+    if ((need_reject && !stats_.last_char_was_tilde) ||
+        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
-      last_char_was_tilde = TRUE;
+      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        map_chs[txt_index++] = '1';
@ -271,8 +188,7 @@ void Tesseract::write_results(                        //output a word
           The ep_choice string is a faked reject to allow newdiff to sync the
           .etx with the .txt and .map files.
         */
-        ep_chars[ep_chars_index++] = CTRL_INSET;
-        //escape code
+        ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
@ -284,12 +200,12 @@ void Tesseract::write_results(                        //output a word
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
      }
-      tilde_crunch_written = TRUE;
-      last_char_was_newline = FALSE;
-      empty_block = FALSE;
+      stats_.tilde_crunch_written = true;
+      stats_.last_char_was_newline = false;
+      stats_.write_results_empty_block = false;
    }

-    if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {
+    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
@ -297,70 +213,63 @@ void Tesseract::write_results(                        //output a word
      ep_chars[ep_chars_index++] = newline_type;

                                 //Cos of the real newline
-      tilde_crunch_written = FALSE;
-      last_char_was_newline = TRUE;
-      last_char_was_tilde = FALSE;
+      stats_.tilde_crunch_written = false;
+      stats_.last_char_was_newline = true;
+      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
-                                 //xiaofan
-    if (tessedit_write_output && !wordrec_no_block)
-      fprintf (textfile, "%s", txt_chs);
-
-    if (tessedit_write_txt_map)
-      fprintf (txt_mapfile, "%s", map_chs);
-
-                                 //terminate string
-    ep_chars[ep_chars_index] = '\0';
+    ep_chars[ep_chars_index] = '\0';  // terminate string
    word->ep_choice = new WERD_CHOICE(ep_chars, unicharset);

    if (force_eol)
-      empty_block = TRUE;
+      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

-  tilde_crunch_written = FALSE;
+  stats_.tilde_crunch_written = false;
  if (newline_type)
-    last_char_was_newline = TRUE;
+    stats_.last_char_was_newline = true;
  else
-    last_char_was_newline = FALSE;
-  empty_block = force_eol;       //About to write a real word
+    stats_.last_char_was_newline = false;
+  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
-      last_char_was_tilde &&
+      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    word->best_choice->remove_unichar_id(0);
+    if (word->best_choice->blob_choices() != NULL) {
+      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
+      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
+    }
    word->best_choice->populate_unichars(getDict().getUnicharset());
    word->reject_map.remove_pos (0);
-    blob_it = word->outword->blob_list ();
-    delete blob_it.extract ();   //get rid of reject blob
+    delete word->box_word;
+    word->box_word = new BoxWord;
  }
  if (newline_type ||
    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
-    last_char_was_tilde = FALSE;
+    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
-        last_char_was_tilde = TRUE;
+        stats_.last_char_was_tilde = true;
      else
-        last_char_was_tilde = FALSE;
+        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
-      last_char_was_tilde = FALSE;
+      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

-  if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
-    ensure_rep_chars_are_consistent(word);
-
  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
@ -368,21 +277,13 @@ void Tesseract::write_results(                        //output a word
             word->best_choice->debug_string(unicharset).string(),
             dict_word(*(word->best_choice)));
  }
-
-#if 0
-  if (tessedit_write_unlv) {
-    write_unlv_text(word);
-  }
-#endif
-
  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += unicharset.id_to_unichar(get_rep_char (word));
    wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
    wordstr = &repetition_code;
-  }
-  else {
+  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
@ -399,209 +300,9 @@ void Tesseract::write_results(                        //output a word
      }
    }
  }
-
-  if (write_to_shm)
-    write_shm_text (word, page_res_it.block ()->block,
-      page_res_it.row (), *wordstr, wordstr_lengths);
-
-#if 0
-  if (tessedit_write_output)
-    write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
-
-  if (tessedit_write_raw_output)
-    write_cooked_text (word->word, word->raw_choice->string (),
-      TRUE, FALSE, rawfile);
-
-  if (tessedit_write_txt_map)
-    write_map(txt_mapfile, word);
-
-  ep_choice = make_epaper_choice (word, newline_type);
-  word->ep_choice = ep_choice;
-#endif
-
-  character_count += word->best_choice->length();
-  word_count++;
 }
 }  // namespace tesseract

-/**********************************************************************
- * make_epaper_choice
- *
- * Construct the epaper text string for a word, using the reject map to
- * determine whether each blob should be rejected.
- **********************************************************************/
-
-#if 0
-WERD_CHOICE *make_epaper_choice(                   //convert one word
-                                WERD_RES *word,    //word to do
-                                char newline_type  //type of newline
-                               ) {
-  inT16 index = 0;               //to string
-  inT16 blobindex;               //to word
-  inT16 prevright = 0;           //right of previous blob
-  inT16 nextleft;                //left of next blob
-  PBLOB *blob;
-  TBOX inset_box;                 //bounding box
-  PBLOB_IT blob_it;              //blob iterator
-  char word_string[MAX_PATH];    //converted string
-  BOOL8 force_total_reject;
-  char unrecognised = STRING (unrecognised_char)[0];
-
-  blob_it.set_to_list (word->outword->blob_list ());
-
-  ASSERT_HOST (word->reject_map.length () ==
-    word->best_choice->string ().length ());
-  /*
-  tprintf( "\"%s\" -> length: %d;  blobcount: %d (%d)\n",
-      word->best_choice->string().string(),
-        word->best_choice->string().length(),
-      blob_it.length(),
-        blob_count( word->outword ) );
-  */
-
-  if (word->best_choice->string ().length () == 0)
-    force_total_reject = TRUE;
-  else {
-    force_total_reject = FALSE;
-    ASSERT_HOST (blob_it.length () ==
-      word->best_choice->string ().length ());
-  }
-  if (!blob_it.empty ()) {
-    for (index = 0; index < word->word->space (); index++)
-      word_string[index] = ' ';  //leading blanks
-  }
-  /* Why does this generate leading blanks regardless of whether the
-  word_choice string is empty, when write_cooked_text ony generates leading
-  blanks when the string is NOT empty???. */
-
-  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
-    strcpy (word_string + index, "|^~R");
-    index += 4;
-    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
-    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
-  }
-  else {
-    if (!blob_it.empty ())
-      prevright = blob_it.data ()->bounding_box ().left ();
-    //actually first left
-    for (blobindex = 0, blob_it.mark_cycle_pt ();
-    !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
-      blob = blob_it.data ();
-      if (word->reject_map[blobindex].accepted ()) {
-        if (word->best_choice->string ()[blobindex] == ' ')
-                                 //but not rejected!!
-          word_string[index++] = unrecognised;
-        else
-          word_string[index++] =
-            word->best_choice->string ()[blobindex];
-      }
-      else {                     // start reject
-        inset_box = blob->bounding_box ();
-        /* Extend reject box to include rejected neighbours */
-        while (!blob_it.at_last () &&
-          (force_total_reject ||
-        (word->reject_map[blobindex + 1].rejected ()))) {
-          blobindex++;
-          blob = blob_it.forward ();
-                                 //get total box
-          inset_box += blob->bounding_box ();
-        }
-        if (blob_it.at_last ())
-          nextleft = inset_box.right ();
-        else
-          nextleft = blob_it.data_relative (1)->bounding_box ().left ();
-
-        //       tprintf("Making reject from (%d,%d)->(%d,%d)\n",
-        //          inset_box.left(),inset_box.bottom(),
-        //          inset_box.right(),inset_box.top());
-
-        index += make_reject (&inset_box, prevright, nextleft,
-          &word->denorm, &word_string[index]);
-      }
-      prevright = blob->bounding_box ().right ();
-    }
-  }
-  if (newline_type)
-                                 //end line
-    word_string[index++] = newline_type;
-  word_string[index] = '\0';     //terminate string
-  if (strlen (word_string) != index) {
-    tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
-      word_string, index, strlen (word_string));
-  }
-                                 //don't pass any zeros
-  ASSERT_HOST (strlen (word_string) == index);
-  return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
-}
-#endif
-
-/**********************************************************************
- * make_reject
- *
- * Add the escape code to the string for the reject.
- **********************************************************************/
-
-inT16
-make_reject (                    //make reject code
-TBOX * inset_box,                 //bounding box
-inT16 prevright,                 //previous char
-inT16 nextleft,                  //next char
-DENORM * denorm,                 //de-normalizer
-char word_string[]               //output string
-) {
-  inT16 index;                   //to string
-  inT16 xpos;                    //start of inset
-  inT16 ypos;
-  inT16 width;                   //size of inset
-  inT16 height;
-  inT16 left_offset;             //shift form prev char
-  inT16 right_offset;            //shift to next char
-  inT16 baseline_offset;         //shift from baseline
-  inT16 inset_index = 0;         //number of inset
-  inT16 min_chars;               //min width estimate
-  inT16 max_chars;               //max width estimate
-  float x_centre;                //centre of box
-
-  index = 0;
-  x_centre = (inset_box->left () + inset_box->right ()) / 2.0;
-  left_offset =
-    (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright));
-  right_offset =
-    (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ()));
-  xpos = (inT16) floor (denorm->x (inset_box->left ()));
-  width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos;
-  ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre));
-  height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos;
-  baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre);
-                                 //escape code
-  word_string[index++] = CTRL_INSET;
-  min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ());
-  max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ());
-  /*
-  Ensure min_chars and max_chars are in the range 0..254. This ensures that
-  we can add 1 to them to avoid putting \0 in a string, and still not exceed
-  the max value in a byte.
-  */
-  if (min_chars < 0)
-    min_chars = 0;
-  if (min_chars > 254)
-    min_chars = 254;
-  if (max_chars < min_chars)
-    max_chars = min_chars;
-  if (max_chars > 254)
-    max_chars = 254;
-                                 //min chars
-  word_string[index++] = min_chars + 1;
-                                 //max chars
-  word_string[index++] = max_chars + 1;
-  word_string[index++] = 2;      //type?
-                                 //store index
-  word_string[index++] = inset_index / 255 + 1;
-  word_string[index++] = inset_index % 255 + 1;
-  return index;                  //size of string
-}
-
-
 /**********************************************************************
 * determine_newline_type
 *
@ -641,305 +342,6 @@ char determine_newline_type(                   //test line ends
  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
 }

-/**********************************************************************
- * write_shm_text
- *
- * Write the cooked text to the shared memory for the api.
- **********************************************************************/
-
-void write_shm_text(                    //write output
-                    WERD_RES *word,     //word to do
-                    BLOCK *block,       //block it is from
-                    ROW_RES *row,       //row it is from
-                    const STRING &text, //text to write
-                    const STRING &text_lengths
-                   ) {
-  inT32 index;                   //char counter
-  inT32 index2;                  //char counter
-  inT32 length;                  //chars in word
-  inT32 ptsize;                  //font size
-  inT8 blanks;                   //blanks in word
-  uinT8 enhancement;             //bold etc
-  uinT8 font;                    //font index
-  char unrecognised = STRING (unrecognised_char)[0];
-  PBLOB *blob;
-  TBOX blob_box;                  //bounding box
-  PBLOB_IT blob_it;              //blob iterator
-  WERD copy_outword;             // copy to denorm
-  uinT32 rating;                 //of char
-  BOOL8 lineend;                 //end of line
-  int offset;
-  int offset2;
-
-                                 //point size
-  ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
-  if (word->word->flag (W_BOL) && ocr_char_space () < 128
-    && ocr_send_text (TRUE) != OKAY)
-    return;                      //release failed
-  copy_outword = *(word->outword);
-  copy_outword.baseline_denormalise (&word->denorm);
-  blob_it.set_to_list (copy_outword.blob_list ());
-  length = text_lengths.length ();
-
-  if (length > 0) {
-    blanks = word->word->space ();
-    if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
-      blanks = 1;
-    for (index = 0, offset = 0; index < length;
-         offset += text_lengths[index++], blob_it.forward ()) {
-      blob = blob_it.data ();
-      blob_box = blob->bounding_box ();
-
-      enhancement = 0;
-      if (word->italic > 0 || (word->italic == 0 && row->italic > 0))
-        enhancement |= EUC_ITALIC;
-      if (word->bold > 0 || (word->bold == 0 && row->bold > 0))
-        enhancement |= EUC_BOLD;
-      if (tessedit_write_ratings)
-        rating = (uinT32) (-word->best_choice->certainty () / 0.035);
-      else if (tessedit_zero_rejection)
-        rating = text[offset] == ' ' ? 100 : 0;
-      else
-        rating = word->reject_map[index].accepted ()? 0 : 100;
-      if (rating > 255)
-        rating = 255;
-      if (word->font1_count > 2)
-        font = word->font1;
-      else if (row->font1_count > 8)
-        font = row->font1;
-      else
-                                 //font index
-        font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
-
-      lineend = word->word->flag (W_EOL) && index == length - 1;
-      if (word->word->flag (W_EOL) && tessedit_zero_rejection
-      && index < length - 1 && text[index + text_lengths[index]] == ' ') {
-        for (index2 = index + 1, offset2 = offset + text_lengths[index];
-             index2 < length && text[offset2] == ' ';
-             offset2 += text_lengths[index2++]);
-        if (index2 == length)
-          lineend = TRUE;
-      }
-
-      if (!tessedit_zero_rejection || text[offset] != ' '
-      || tessedit_word_for_word) {
-                                 //confidence
-        if (text[offset] == ' ') {
-        ocr_append_char (unrecognised,
-                         blob_box.left (), blob_box.right (),
-                         page_image.get_ysize () - 1 - blob_box.top (),
-                         page_image.get_ysize () - 1 - blob_box.bottom (),
-                         font, (uinT8) rating,
-                         ptsize,                //point size
-                         blanks, enhancement,   //enhancement
-                         OCR_CDIR_LEFT_RIGHT,
-                         OCR_LDIR_DOWN_RIGHT,
-                         lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
-        } else {
-          for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
-            ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),
-                             blob_box.left (), blob_box.right (),
-                             page_image.get_ysize () - 1 - blob_box.top (),
-                             page_image.get_ysize () - 1 - blob_box.bottom (),
-                             font, (uinT8) rating,
-                             ptsize,                //point size
-                             blanks, enhancement,   //enhancement
-                             OCR_CDIR_LEFT_RIGHT,
-                             OCR_LDIR_DOWN_RIGHT,
-                             lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
-        }
-        blanks = 0;
-      }
-
-    }
-  }
-  else if (tessedit_word_for_word) {
-    blanks = word->word->space ();
-    if (blanks == 0 && !word->word->flag (W_BOL))
-      blanks = 1;
-    blob_box = word->word->bounding_box ();
-
-    enhancement = 0;
-    if (word->italic > 0)
-      enhancement |= EUC_ITALIC;
-    if (word->bold > 0)
-      enhancement |= EUC_BOLD;
-    rating = 100;
-    if (word->font1_count > 2)
-      font = word->font1;
-    else if (row->font1_count > 8)
-      font = row->font1;
-    else
-                                 //font index
-      font = word->word->flag (W_DONT_CHOP) ? 0 : 1;
-
-    lineend = word->word->flag (W_EOL);
-
-                                 //font index
-    ocr_append_char (unrecognised,
-                     blob_box.left (), blob_box.right (),
-                     page_image.get_ysize () - 1 - blob_box.top (),
-                     page_image.get_ysize () - 1 - blob_box.bottom (),
-                     font,
-                     rating,                    //confidence
-                     ptsize,                    //point size
-                     blanks, enhancement,       //enhancement
-                     OCR_CDIR_LEFT_RIGHT,
-                     OCR_LDIR_DOWN_RIGHT,
-                     lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
-  }
-}
-
-
-/**********************************************************************
- * write_map
- *
- * Write a map file of 0's and 1'a which associates characters from the .txt
- * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
- * is kept.  Note that there may be reject regions in the .etx file WITHOUT
- * .txt chars being rejected.  The map file should be the same length, and
- * the same number of lines as the .txt file
- *
- * The paramaterised input is because I thought I might be able to generate
- * multiple map files in a single run.  However, it didn't work because
- * newdiff needs etx files!
- **********************************************************************/
-
-#if 0
-void write_map(                //output a map file
-               FILE *mapfile,  //mapfile to write to
-               WERD_RES *word) {
-  inT16 index;
-  int status;
-  STRING mapstr = "";
-
-  if (word->best_choice->string ().length () > 0) {
-    for (index = 0; index < word->word->space (); index++) {
-      if (word->reject_spaces &&
-        (suspect_level >= suspect_space_level) &&
-        !tessedit_minimal_rejection && !tessedit_zero_rejection)
-        /* Write rejected spaces to .map file ONLY. Newdiff converts these back to
-        accepted spaces AFTER generating basic space stats but BEFORE using .etx */
-        status = fprintf (mapfile, "0");
-      else
-        status = fprintf (mapfile, "1");
-      if (status < 0)
-        WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
-    }
-
-    if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
-      for (index = 0; index < 5; index++)
-        mapstr += '1';
-    }
-    else {
-      ASSERT_HOST (word->reject_map.length () ==
-        word->best_choice->string ().length ());
-
-      for (index = 0; index < word->reject_map.length (); index++) {
-        if (word->reject_map[index].accepted ())
-          mapstr += '1';
-        else
-          mapstr += '0';
-      }
-    }
-    status = fprintf (mapfile, "%s", mapstr.string ());
-    if (status < 0)
-      WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
-  }
-  if (word->word->flag (W_EOL)) {
-    status = fprintf (mapfile, "\n");
-    if (status < 0)
-      WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
-  }
-  status = fflush (mapfile);
-  if (status != 0)
-    WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
-}
-#endif
-
-
-/*************************************************************************
- * open_file()
- *************************************************************************/
-
-namespace tesseract {
-FILE *Tesseract::open_outfile(  //open .map & .unlv file
-                   const char *extension) {
-  STRING file_name;
-  FILE *outfile;
-
-  file_name = imagebasename + extension;
-  if (!(outfile = fopen (file_name.string (), "w"))) {
-    CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
-      file_name.string (), errno);
-  }
-  return outfile;
-}
-}  // namespace tesseract
-
-
-#if 0
-void write_unlv_text(WERD_RES *word) {
-  const char *wordstr;
-
-  char buff[512];                //string to output
-  int i = 0;
-  int j = 0;
-  char unrecognised = STRING (unrecognised_char)[0];
-  int status;
-  char space_str[3];
-
-  wordstr = word->best_choice->string ().string ();
-
-  /* DONT need to do anything special for repeated char words - at this stage
-  the repetition char has been identified and any other chars have been
-  rejected.
-  */
-
-  for (; wordstr[i] != '\0'; i++) {
-    if ((wordstr[i] == ' ') ||
-      (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
-      buff[j++] = unrecognised;
-    else {
-      if (word->reject_map[i].rejected ())
-        buff[j++] = '^';         //Add suspect marker
-      buff[j++] = wordstr[i];
-    }
-  }
-  buff[j] = '\0';
-
-  if (strlen (wordstr) > 0) {
-    if (word->reject_spaces &&
-      (suspect_level >= suspect_space_level) &&
-      !tessedit_minimal_rejection && !tessedit_zero_rejection)
-      strcpy (space_str, "^ ");  //Suspect space
-    else
-      strcpy (space_str, " ");   //Certain space
-
-    for (i = 0; i < word->word->space (); i++) {
-      status = fprintf (unlv_file, "%s", space_str);
-      if (status < 0)
-        WRITEFAILED.error ("write_unlv_text", EXIT,
-          "Space Errno: %d", errno);
-    }
-
-    status = fprintf (unlv_file, "%s", buff);
-    if (status < 0)
-      WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
-  }
-  if (word->word->flag (W_EOL)) {
-    status = fprintf (unlv_file, "\n");
-    if (status < 0)
-      WRITEFAILED.error ("write_unlv_text", EXIT,
-        "Newline Errno: %d", errno);
-  }
-  status = fflush (unlv_file);
-  if (status != 0)
-    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
-}
-#endif
-
-
 /*************************************************************************
 * get_rep_char()
 * Return the first accepted character from the repetition string. This is the
@ -957,36 +359,6 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
    return unicharset.unichar_to_id(unrecognised_char.string());
  }
 }
-}  // namespace tesseract
-
-void ensure_rep_chars_are_consistent(WERD_RES *word) {
-#if 0
-  char rep_char = get_rep_char (word);
-  char *ptr;
-
-  ptr = (char *) word->best_choice->string ().string ();
-  for (; *ptr != '\0'; ptr++) {
-    if (*ptr != rep_char)
-      *ptr = rep_char;
-  }
-#endif
-
-#if 0
-  UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
-  int i;
-  char *ptr;
-  STRING consistent_string;
-  STRING consistent_string_lengths;
-
-  ptr = (char *) word->best_choice->string ().string ();
-  for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
-    consistent_string += unicharset.id_to_unichar(rep_char);
-    consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
-  }
-  word->best_choice->string() = consistent_string;
-  word->best_choice->lengths() = consistent_string_lengths;
-#endif
-}

 /*************************************************************************
 * SUSPECT LEVELS
@ -998,8 +370,6 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
 * tessedit_minimal_rejection.
 *************************************************************************/
-
-namespace tesseract {
 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
  int len = word_res->reject_map.length();
  const WERD_CHOICE &word = *(word_res->best_choice);
--- a/ccmain/output.h
+++ b/ccmain/output.h
@ -20,91 +20,15 @@
 #ifndef           OUTPUT_H
 #define           OUTPUT_H

-#include          "varable.h"
+#include          "params.h"
 //#include                                      "epapconv.h"
 #include          "pageres.h"
 #include          "notdll.h"

-extern BOOL_EVAR_H (tessedit_write_block_separators, TRUE,
-"Write block separators in output");
-extern BOOL_VAR_H (tessedit_write_raw_output, FALSE,
-"Write raw stuff to name.raw");
-extern BOOL_EVAR_H (tessedit_write_output, TRUE, "Write text to name.txt");
-extern BOOL_EVAR_H (tessedit_write_txt_map, TRUE,
-"Write .txt to .etx map file");
-extern BOOL_EVAR_H (tessedit_write_rep_codes, TRUE,
-"Write repetition char code");
-extern BOOL_EVAR_H (tessedit_write_unlv, FALSE, "Write .unlv output file");
-extern STRING_EVAR_H (unrecognised_char, "|",
-"Output char for unidentified blobs");
-extern INT_EVAR_H (suspect_level, 99, "Suspect marker level");
-extern INT_VAR_H (suspect_space_level, 100,
-"Min suspect level for rejecting spaces");
-extern INT_VAR_H (suspect_short_words, 2,
-"Dont Suspect dict wds longer than this");
-extern BOOL_VAR_H (suspect_constrain_1Il, FALSE,
-"UNLV keep 1Il chars rejected");
-extern double_VAR_H (suspect_rating_per_ch, 999.9,
-"Dont touch bad rating limit");
-extern double_VAR_H (suspect_accept_rating, -999.9,
-"Accept good rating limit");
-extern BOOL_EVAR_H (tessedit_minimal_rejection, FALSE,
-"Only reject tess failures");
-extern BOOL_VAR_H (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
-extern BOOL_VAR_H (tessedit_word_for_word, FALSE,
-"Make output have exactly one word per WERD");
-extern BOOL_VAR_H (tessedit_consistent_reps, TRUE,
-"Force all rep chars the same");
-
-/** output a word */
-void write_results(
-                   PAGE_RES_IT &page_res_it,  ///< full info
-                   char newline_type,         ///< type of newline
-                   BOOL8 force_eol,           ///< override tilde crunch?
-                   BOOL8 write_to_shm         ///< send to api
-                  );
-
-/** convert one word */
-WERD_CHOICE *make_epaper_choice(
-                                WERD_RES *word,    ///< word to do
-                                char newline_type  ///< type of newline
-                               );
-/** make reject code */
-inT16 make_reject (
-TBOX * inset_box,                ///< bounding box
-inT16 prevright,                 ///< previous char
-inT16 nextleft,                  ///< next char
-DENORM * denorm,                 ///< de-normalizer
-char word_string[]               ///< output string
-);
-
 /** test line ends */
 char determine_newline_type(WERD *word,        ///< word to do
                            BLOCK *block,      ///< current block
                            WERD *next_word,   ///< next word
                            BLOCK *next_block  ///< block of next word
                           );
-/** write output */
-void write_cooked_text(WERD *word,          ///< word to do
-                       const STRING &text,  ///< text to write
-                       BOOL8 acceptable,    ///< good stuff
-                       BOOL8 pass2,         ///< done on pass2
-                       FILE *fp             ///< file to write
-                      );
-/** write output */
-void write_shm_text(WERD_RES *word,     ///< word to do
-                    BLOCK *block,       ///< block it is from
-                    ROW_RES *row,       ///< row it is from
-                    const STRING &text, ///< text to write
-                    const STRING &text_lengths
-                   );
-/** output a map file */
-void write_map(
-               FILE *mapfile,  ///< mapfile to write to
-               WERD_RES *word  ///< word
-              );
-/*FILE *open_outfile(  //open .map & .unlv file
-                   const char *extension);*/
-void write_unlv_text(WERD_RES *word);
-void ensure_rep_chars_are_consistent(WERD_RES *word);
 #endif
--- a/textord/pagesegmain.cpp
+++ b/textord/pagesegmain.cpp
@ -46,7 +46,8 @@
 #include "blread.h"
 #include "wordseg.h"
 #include "makerow.h"
-#include "baseapi.h"
+#include "osdetect.h"
+#include "textord.h"
 #include "tordmain.h"
 #include "tessvars.h"

@ -56,6 +57,48 @@ namespace tesseract {
 const int kMinCredibleResolution = 70;
 /// Default resolution used if input in not believable.
 const int kDefaultResolution = 300;
+// Max erosions to perform in removing an enclosing circle.
+const int kMaxCircleErosions = 8;
+
+// Helper to remove an enclosing circle from the image pixs (presumably 1 bpp,
+// as the binary seedfill requires). If there isn't a circle, the result will
+// most likely be badly mangled. Returns a new Pix that the caller must
+// pixDestroy, or NULL if the connected-component count never drops below its
+// running minimum during the erosion search (the trivial success test).
+static Pix* RemoveEnclosingCircle(Pix* pixs) {
+  Pix* pixsi = pixInvert(NULL, pixs);  // inverted copy of the input
+  Pix* pixc = pixCreateTemplate(pixs);  // same-size scratch mask
+  pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);  // seed: 1-pixel border
+  pixSeedfillBinary(pixc, pixc, pixsi, 4);  // 4-conn fill from border via pixsi
+  pixInvert(pixc, pixc);  // mask = what the border fill did not reach
+  pixDestroy(&pixsi);
+  Pix* pixt = pixAnd(NULL, pixs, pixc);  // input restricted to the mask
+  l_int32 max_count;
+  pixCountConnComp(pixt, 8, &max_count);  // 8-connected component count
+  // A new maximum resets min_count, so the minimum is only sought after a rise.
+  l_int32 min_count = MAX_INT32;
+  Pix* pixout = NULL;
+  for (int i = 1; i < kMaxCircleErosions; i++) {  // NOTE: at most kMax-1 passes
+    pixDestroy(&pixt);
+    pixErodeBrick(pixc, pixc, 3, 3);  // shrink the mask in place, 3x3 brick
+    pixt = pixAnd(NULL, pixs, pixc);
+    l_int32 count;
+    pixCountConnComp(pixt, 8, &count);
+    if (i == 1 || count > max_count) {  // first pass or new peak: restart
+      max_count = count;
+      min_count = count;
+    } else if (i > 1 && count < min_count) {  // i > 1 always holds here
+      min_count = count;
+      pixDestroy(&pixout);
+      pixout = pixCopy(NULL, pixt);  // Save the best so far as a separate copy.
+    } else if (count >= min_count) {  // always true when this test is reached
+      break;  // Count stopped falling: we have passed by the best.
+    }
+  }
+  pixDestroy(&pixt);
+  pixDestroy(&pixc);
+  return pixout;
+}

 /**
 * Segment the page according to the current value of tessedit_pageseg_mode.
@ -63,18 +106,12 @@ const int kDefaultResolution = 300;
 * and copied to image, otherwise it just uses image as the input.
 * On return the blocks list owns all the constructed page layout.
 */
-int Tesseract::SegmentPage(const STRING* input_file,
-                           IMAGE* image, BLOCK_LIST* blocks) {
-  int width = image->get_xsize();
-  int height = image->get_ysize();
-  int resolution = image->get_res();
-#ifdef HAVE_LIBLEPT
-  if (pix_binary_ != NULL) {
-    width = pixGetWidth(pix_binary_);
-    height = pixGetHeight(pix_binary_);
-    resolution = pixGetXRes(pix_binary_);
-  }
-#endif
+int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
+                           Tesseract* osd_tess, OSResults* osr) {
+  ASSERT_HOST(pix_binary_ != NULL);
+  int width = pixGetWidth(pix_binary_);
+  int height = pixGetHeight(pix_binary_);
+  int resolution = pixGetXRes(pix_binary_);
  // Zero resolution messes up the algorithms, so make sure it is credible.
  if (resolution < kMinCredibleResolution)
    resolution = kDefaultResolution;
@ -82,7 +119,7 @@ int Tesseract::SegmentPage(const STRING* input_file,
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
-  if (pageseg_mode != tesseract::PSM_AUTO &&
+  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
@ -90,88 +127,85 @@ int Tesseract::SegmentPage(const STRING* input_file,
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
-  bool single_column = pageseg_mode > PSM_AUTO;
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
+    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_COLUMN.
    pageseg_mode = PSM_SINGLE_COLUMN;
  }
+  bool single_column = !PSM_COL_FIND_ENABLED(pageseg_mode);
+  bool osd_enabled = PSM_OSD_ENABLED(pageseg_mode);
+  bool osd_only = pageseg_mode == PSM_OSD_ONLY;

-  TO_BLOCK_LIST land_blocks, port_blocks;
-  TBOX page_box;
-  if (pageseg_mode <= PSM_SINGLE_COLUMN) {
-    if (AutoPageSeg(width, height, resolution, single_column,
-                    image, blocks, &port_blocks) < 0) {
-      return -1;
-    }
+  int auto_page_seg_ret_val = 0;
+  TO_BLOCK_LIST to_blocks;
+  if (osd_enabled || PSM_BLOCK_FIND_ENABLED(pageseg_mode)) {
+    auto_page_seg_ret_val =
+        AutoPageSeg(resolution, single_column, osd_enabled, osd_only,
+                    blocks, &to_blocks, osd_tess, osr);
+    if (osd_only)
+      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
-    //  port_blocks.clear();  // Uncomment to go back to the old mode.
+    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
-#if HAVE_LIBLEPT
-    image->FromPix(pix_binary_);
-#endif
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
+    if (pageseg_mode == PSM_CIRCLE_WORD) {
+      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
+      if (pixcleaned != NULL) {
+        pixDestroy(&pix_binary_);
+        pix_binary_ = pixcleaned;
+      }
+    }
  }
+
+  if (auto_page_seg_ret_val < 0) {
+    return -1;
+  }
+
  if (blocks->empty()) {
    tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }

-  if (port_blocks.empty()) {
-    // AutoPageSeg was not used, so we need to find_components first.
-    find_components(blocks, &land_blocks, &port_blocks, &page_box);
-  } else {
-    // AutoPageSeg does not need to find_components as it did that already.
-    page_box.set_left(0);
-    page_box.set_bottom(0);
-    page_box.set_right(width);
-    page_box.set_top(height);
-    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
-    filter_blobs(page_box.topright(), &port_blocks, true);
-  }
+  textord_.TextordPage(pageseg_mode, width, height, pix_binary_,
+                       blocks, &to_blocks);
+  SetupWordScripts(blocks);
+  return auto_page_seg_ret_val;
+}

-  TO_BLOCK_IT to_block_it(&port_blocks);
-  ASSERT_HOST(!port_blocks.empty());
-  TO_BLOCK* to_block = to_block_it.data();
-  if (pageseg_mode <= PSM_SINGLE_BLOCK ||
-      to_block->line_size < 2) {
-    // For now, AUTO, SINGLE_COLUMN and SINGLE_BLOCK all map to the old
-    // textord. The difference is the number of blocks and how the are made.
-    textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks,
-                 this);
-  } else {
-    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
-    float gradient = make_single_row(page_box.topright(),
-                                     to_block, &port_blocks, this);
-    if (pageseg_mode == PSM_SINGLE_LINE) {
-      // SINGLE_LINE uses the old word maker on the single line.
-      make_words(page_box.topright(), gradient, blocks,
-                 &land_blocks, &port_blocks, this);
-    } else {
-      // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
-      // single word, and in SINGLE_CHAR mode, all the outlines
-      // go in a single blob.
-      make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
-                       to_block->get_rows(), to_block->block->row_list());
+// TODO(rays) This is a hack to set all the words with a default script.
+// In the future this will be set by a preliminary pass over the document.
+void Tesseract::SetupWordScripts(BLOCK_LIST* blocks) {
+  int script = unicharset.default_sid();  // Same script id for every word.
+  bool has_x_height = unicharset.script_has_xheight();
+  bool is_latin = script == unicharset.latin_sid();  // Default is Latin?
+  BLOCK_IT b_it(blocks);
+  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {  // Blocks.
+    ROW_IT r_it(b_it.data()->row_list());
+    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {  // Rows.
+      WERD_IT w_it(r_it.data()->word_list());
+      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+        WERD* word = w_it.data();
+        word->set_script_id(script);
+        word->set_flag(W_SCRIPT_HAS_XHEIGHT, has_x_height);
+        word->set_flag(W_SCRIPT_IS_LATIN, is_latin);
+      }
     }
   }
-  return 0;
 }

+
 /**
 * Auto page segmentation. Divide the page image into blocks of uniform
 * text linespacing and images.
 *
- * Width, height and resolution are derived from the input image.
- *
- * If the pix is non-NULL, then it is assumed to be the input, and it is
- * copied to the image, otherwise the image is used directly.
+ * Resolution (in ppi) is derived from the input image.
 *
 * The output goes in the blocks list with corresponding TO_BLOCKs in the
 * to_blocks list.
@ -179,10 +213,17 @@ int Tesseract::SegmentPage(const STRING* input_file,
 * If single_column is true, then no attempt is made to divide the image
 * into columns, but multiple blocks are still made if the text is of
 * non-uniform linespacing.
+ *
+ * If osd is true, then orientation and script detection is performed as well.
+ * If only_osd is true, then only orientation and script detection is
+ * performed. If osr is desired, the osd_tess must be another Tesseract
+ * that was initialized especially for osd, and the results will be output
+ * into osr.
 */
-int Tesseract::AutoPageSeg(int width, int height, int resolution,
-                           bool single_column, IMAGE* image,
-                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
+int Tesseract::AutoPageSeg(int resolution, bool single_column,
+                           bool osd, bool only_osd,
+                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
+                           Tesseract* osd_tess, OSResults* osr) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
@ -196,7 +237,8 @@ int Tesseract::AutoPageSeg(int width, int height, int resolution,
 #ifdef HAVE_LIBLEPT
  if (pix_binary_ != NULL) {
    if (textord_debug_images) {
-      Pix* grey_pix = pixCreate(width, height, 8);
+      Pix* grey_pix = pixCreate(pixGetWidth(pix_binary_),
+                                pixGetHeight(pix_binary_), 8);
      // Printable images are light grey on white, but for screen display
      // they are black on dark grey so the other colors show up well.
      if (textord_debug_printable) {
@ -210,8 +252,9 @@ int Tesseract::AutoPageSeg(int width, int height, int resolution,
      pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG);
      pixDestroy(&grey_pix);
    }
-    if (tessedit_dump_pageseg_images)
+    if (tessedit_dump_pageseg_images) {
      pixWrite("tessinput.png", pix_binary_, IFF_PNG);
+    }
    // Leptonica is used to find the lines and image regions in the input.
    LineFinder::FindVerticalLines(resolution, pix_binary_,
                                  &vertical_x, &vertical_y, &v_lines);
@ -221,16 +264,13 @@ int Tesseract::AutoPageSeg(int width, int height, int resolution,
    ImageFinder::FindImages(pix_binary_, &boxa, &pixa);
    if (tessedit_dump_pageseg_images)
      pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
-    // Copy the Pix to the IMAGE.
-    image->FromPix(pix_binary_);
    if (single_column)
      v_lines.clear();
  }
 #endif
-  TO_BLOCK_LIST land_blocks, port_blocks;
-  TBOX page_box;
+  TO_BLOCK_LIST port_blocks;
  // The rest of the algorithm uses the usual connected components.
-  find_components(blocks, &land_blocks, &port_blocks, &page_box);
+  textord_.find_components(pix_binary_, blocks, &port_blocks);

  TO_BLOCK_IT to_block_it(&port_blocks);
  ASSERT_HOST(!to_block_it.empty());
@ -244,20 +284,50 @@ int Tesseract::AutoPageSeg(int width, int height, int resolution,
      // that there aren't any interesting line separators or images, since
      // it means that we have a pre-defined unlv zone file.
      ColumnFinder finder(static_cast<int>(to_block->line_size),
-                          blkbox.botleft(), blkbox.topright(),
+                          blkbox.botleft(), blkbox.topright(), resolution,
                          &v_lines, &h_lines, vertical_x, vertical_y);
-      if (finder.FindBlocks(height, resolution, single_column,
+      BLOBNBOX_CLIST osd_blobs;
+      int osd_orientation = 0;
+      bool vertical_text = finder.IsVerticallyAlignedText(to_block, &osd_blobs);
+      if (osd && osd_tess != NULL && osr != NULL) {
+        os_detect_blobs(&osd_blobs, osr, osd_tess);
+        if (only_osd) continue;
+        osd_orientation = osr->best_result.orientation_id;
+        double osd_score = osr->orientations[osd_orientation];
+        double osd_margin = min_orientation_margin * 2;
+        // tprintf("Orientation scores:");
+        for (int i = 0; i < 4; ++i) {
+          if (i != osd_orientation &&
+              osd_score - osr->orientations[i] < osd_margin) {
+            osd_margin = osd_score - osr->orientations[i];
+          }
+          // tprintf(" %d:%f", i, osr->orientations[i]);
+        }
+        // tprintf("\n");
+        if (osd_margin < min_orientation_margin) {
+          // Margin insufficient - dream up a suitable default.
+          if (vertical_text && (osd_orientation & 1))
+            osd_orientation = 3;
+          else
+            osd_orientation = 0;
+          tprintf("Score margin insufficient:%.2f, using %d as a default\n",
+                  osd_margin, osd_orientation);
+        }
+      }
+      osd_blobs.shallow_clear();
+      finder.CorrectOrientation(to_block, vertical_text, osd_orientation);
+      if (finder.FindBlocks(single_column, pixGetHeight(pix_binary_),
                            to_block, boxa, pixa, &found_blocks, to_blocks) < 0)
        return -1;
-      finder.ComputeDeskewVectors(&deskew_, &reskew_);
+      finder.GetDeskewVectors(&deskew_, &reskew_);
      boxa = NULL;
      pixa = NULL;
    }
  }
-#ifdef HAVE_LIBLEPT
  boxaDestroy(&boxa);
  pixaDestroy(&pixa);
-#endif
+  if (only_osd) return 0;
+
  blocks->clear();
  BLOCK_IT block_it(blocks);
  // Move the found blocks to the input/output blocks.
--- a/ccmain/pagewalk.cpp
+++ b/ccmain/pagewalk.cpp
@ -17,602 +17,31 @@
 *
 **********************************************************************/

-#ifdef _MSC_VER
-#pragma warning(disable:4244)  // Conversion warnings
-#endif
-
 #include "mfcpch.h"
-#include "pagewalk.h"
+#include "pageres.h"
 #include "tesseractclass.h"

-#define EXTERN
-
-EXTERN BOOL_VAR (current_word_quit, FALSE, "Stop processing this word");
-DLLSYM BOOL_VAR (selection_quit, FALSE, "Stop processing this selection");
-
-/**
- *  block_list_bounding_box()
- *
- *  Scan block list to find the bounding box of all blocks.
- *  @param block_list the block list to find the bounding box of
- */
-
-TBOX block_list_bounding_box(BLOCK_LIST *block_list) 
-{
-  BLOCK_IT block_it(block_list);
-  TBOX enclosing_box;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
-    block_it.forward ())
-  enclosing_box += block_it.data ()->bounding_box ();
-  return enclosing_box;
-}
-
-
-/**
- *  block_list_compress()
- *
- *  Pack a block list to occupy a smaller space by compressing each block and
- *  moving the compressed blocks one above the other.
- *  The compressed block list has the same top left point as the uncompressed
- *  first.  Blocks are reordered so that the source names are in alphabetic
- *  order. (This gathers together, but does not combine, blocks from the same
- *  file.)
- *
- *  The enclosing box of the compressed block list is returned.
- */
-
-const TBOX block_list_compress(BLOCK_LIST *block_list) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ICOORD initial_top_left;
-  ICOORD block_spacing (0, BLOCK_SPACING);
-  TBOX enclosing_box;             //for full display
-
-  initial_top_left = block_it.data()->bounding_box().topleft();
-                                 //group srcfile blks
-  block_it.sort (block_name_order);
-
-  /* Compress the target block list into an area starting from the top left of
-    the first block on the list */
-
-  enclosing_box = TBOX (initial_top_left, initial_top_left);
-  enclosing_box.move_bottom_edge (BLOCK_SPACING);
-
-  for (block_it.mark_cycle_pt ();
-  !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    block->compress (enclosing_box.botleft () - block_spacing -
-      block->bounding_box ().topleft ());
-    enclosing_box += block->bounding_box ();
-  }
-  return enclosing_box;
-}
-
-
-/**
- * block_list_move()
- *
- * Move all the blocks in the list by a vector
- *
- * @param block_list the block list to move
- * @param vec the vector to move it by
- */
-
-void block_list_move(BLOCK_LIST *block_list,
-                     ICOORD vec) 
-{
-  BLOCK_IT block_it(block_list);
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
-       block_it.forward ())
-    block_it.data ()->move (vec);
-}
-
-
-/**
- *  block_name_order()
- *
- *  Block comparator used to sort a block list so that blocks from the same
- *  filename are located together, and blocks from the same file are ordered
- *  by vertical position.
- */
-
-int block_name_order(const void *block1p,
-                     const void *block2p) 
-{
-  int result;
-  BLOCK *block1 = *(BLOCK **) block1p;
-  BLOCK *block2 = *(BLOCK **) block2p;
-
-  result = strcmp (block1->name (), block2->name ());
-  if (result == 0)
-    result = block2->bounding_box ().top () - block1->bounding_box ().top ();
-  return result;
-}
-
-
-/**
- * process_all_blobs()
- *
- * Walk the current block list applying the specified blob processor function
- * to all blobs
- * @param block_list the blocks to check
- * @param blob_processor function to call
- * @param c_blob_processor function to call
- */
-
-void
-process_all_blobs (BLOCK_LIST * block_list,
-                   BOOL8 blob_processor (BLOCK *, ROW *, WERD *, PBLOB *), 
-                   BOOL8 c_blob_processor (BLOCK *, ROW *, WERD *, C_BLOB *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-  PBLOB_IT blob_it;
-  PBLOB *blob;
-  C_BLOB_IT c_blob_it;
-  C_BLOB *c_blob;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    row_it.set_to_list (block->row_list ());
-    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-      row = row_it.data ();
-      word_it.set_to_list (row->word_list ());
-      for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
-        word = word_it.data ();
-        if (word->flag (W_POLYGON)) {
-          if (blob_processor != NULL) {
-            blob_it.set_to_list (word->blob_list ());
-            for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
-              blob = blob_it.data ();
-              if (!blob_processor (block, row, word, blob) || selection_quit)
-                return;
-            }
-          }
-        }
-        else {
-          if (c_blob_processor != NULL) {
-            c_blob_it.set_to_list (word->cblob_list ());
-            for (c_blob_it.mark_cycle_pt (); !c_blob_it.cycled_list (); c_blob_it.forward ()) {
-              c_blob = c_blob_it.data ();
-              if (!c_blob_processor (block, row, word, c_blob) || selection_quit)
-                return;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-
-/**
- * process_selected_blobs()
- *
- * Walk the current block list applying the specified blob processor function
- * to each selected blob
- * @param block_list the blocks to check
- * @param selection_box within this box(?)
- * @param blob_processor function to call
- * @param c_blob_processor function to call
- */
-
-void
-process_selected_blobs (BLOCK_LIST * block_list,
-                        TBOX & selection_box, 
-                        BOOL8 blob_processor (BLOCK *, ROW *, WERD *, PBLOB *), 
-                        BOOL8 c_blob_processor (BLOCK *, ROW *, WERD *, C_BLOB *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-  PBLOB_IT blob_it;
-  PBLOB *blob;
-  C_BLOB_IT c_blob_it;
-  C_BLOB *c_blob;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      row_it.set_to_list (block->row_list ());
-      for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-        row = row_it.data ();
-        if (row->bounding_box ().overlap (selection_box)) {
-          word_it.set_to_list (row->word_list ());
-          for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
-            word = word_it.data ();
-            if (word->bounding_box ().overlap (selection_box)) {
-              if (word->flag (W_POLYGON)) {
-                if (blob_processor != NULL) {
-                  blob_it.set_to_list (word->blob_list ());
-                  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
-                    blob = blob_it.data ();
-                    if (blob->bounding_box().overlap (selection_box)) {
-                      if (!blob_processor(block, row, word, blob) || selection_quit)
-                        return;
-                    }
-                  }
-                }
-              }
-              else {
-                if (c_blob_processor != NULL) {
-                  c_blob_it.set_to_list (word->cblob_list ());
-                  for (c_blob_it.mark_cycle_pt (); !c_blob_it.cycled_list (); c_blob_it.forward ()) {
-                    c_blob = c_blob_it.data ();
-                    if (c_blob->bounding_box ().overlap (selection_box)) {
-                      if (!c_blob_processor(block, row, word, c_blob) || selection_quit)
-                        return;
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-
-/**
- * process_all_words()
- *
- * Walk the current block list applying the specified word processor function
- * to all words
- */
-void
-process_all_words (BLOCK_LIST * block_list,
-                   BOOL8 word_processor (BLOCK *, ROW *, WERD *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    row_it.set_to_list (block->row_list ());
-    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-      row = row_it.data ();
-      word_it.set_to_list (row->word_list ());
-      for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
-        word = word_it.data ();
-        if (!word_processor (block, row, word) || selection_quit)
-          return;
-      }
-    }
-  }
-}
-
-
 /**
 * process_selected_words()
 *
 * Walk the current block list applying the specified word processor function
- * to each word selected.
+ * to each word that overlaps the selection_box.
 */
-
-void
-process_selected_words (BLOCK_LIST * block_list,
-                        TBOX & selection_box,
-                        BOOL8 word_processor (BLOCK *, ROW *, WERD *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      row_it.set_to_list (block->row_list ());
-      for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-        row = row_it.data ();
-        if (row->bounding_box ().overlap (selection_box)) {
-          word_it.set_to_list (row->word_list ());
-          for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
-            word = word_it.data ();
-            if (word->bounding_box ().overlap (selection_box)) {
-              if (!word_processor (block, row, word) || selection_quit)
-                return;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
 namespace tesseract {
-void
-Tesseract::process_selected_words (BLOCK_LIST * block_list,
-                                   TBOX & selection_box,
-                                   BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK *, ROW *, WERD *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      row_it.set_to_list (block->row_list ());
-      for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-        row = row_it.data ();
-        if (row->bounding_box ().overlap (selection_box)) {
-          word_it.set_to_list (row->word_list ());
-          for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
-            word = word_it.data ();
-            if (word->bounding_box ().overlap (selection_box)) {
-              if (!((this->*word_processor) (block, row, word)) || selection_quit)
-                return;
-            }
-          }
-        }
-      }
+void Tesseract::process_selected_words(
+    PAGE_RES* page_res, // page results whose words are walked
+    TBOX & selection_box,  // only words overlapping this box are processed
+    BOOL8(tesseract::Tesseract::*word_processor)(  // member function to call
+        BLOCK* block, ROW* row, WERD_RES* word_res)) {
+  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
+       page_res_it.forward()) {  // Walk until the iterator has no word.
+    WERD* word = page_res_it.word()->word;
+    if (word->bounding_box().overlap(selection_box)) {
+      if (!((this->*word_processor)(page_res_it.block()->block,
+                                    page_res_it.row()->row,
+                                    page_res_it.word())))
+        return;  // Processor returned false: stop the whole walk.
     }
   }
 }
 }  // namespace tesseract
-
-
-/**
- * process_all_words_it()   PASS ITERATORS
- *
- * Walk the current block list applying the specified word processor function
- * to all words
- */
-
-void
-process_all_words_it (BLOCK_LIST * block_list,
-                      BOOL8 word_processor (BLOCK *, ROW *, WERD *, BLOCK_IT &, ROW_IT &, WERD_IT &)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    row_it.set_to_list (block->row_list ());
-    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-      row = row_it.data ();
-      word_it.set_to_list (row->word_list ());
-      for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
-        word = word_it.data ();
-        if (!word_processor (block, row, word, block_it, row_it, word_it) || selection_quit)
-          return;
-      }
-    }
-  }
-}
-
-
-/**
- * process_selected_words_it()   PASS ITERATORS
- *
- * Walk the current block list applying the specified word processor function
- * to each word selected.
- */
-
-void
-process_selected_words_it (BLOCK_LIST * block_list,
-                           TBOX & selection_box, 
-                           BOOL8 word_processor (BLOCK *, ROW *, WERD *, BLOCK_IT &, ROW_IT &, WERD_IT &)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      row_it.set_to_list (block->row_list ());
-      for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-        row = row_it.data ();
-        if (row->bounding_box ().overlap (selection_box)) {
-          word_it.set_to_list (row->word_list ());
-          for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
-            word = word_it.data ();
-            if (word->bounding_box ().overlap (selection_box)) {
-              if (!word_processor (block, row, word, block_it, row_it, word_it) || selection_quit)
-                return;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-
-/**
- * process_all_blocks()
- *
- * Walk the current block list applying the specified block processor function
- * to each block.
- */
-
-void
-process_all_blocks (BLOCK_LIST * block_list,
-                    BOOL8 block_processor (BLOCK *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (!block_processor (block) || selection_quit)
-      return;
-  }
-}
-
-
-/**
- * process_selected_blocks()
- *
- * Walk the current block list applying the specified block processor function
- * to each block selected.
- */
-
-void
-process_selected_blocks (BLOCK_LIST * block_list,
-                         TBOX & selection_box, 
-                         BOOL8 block_processor (BLOCK *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      if (!block_processor (block) || selection_quit)
-        return;
-    }
-  }
-}
-
-
-/**
- * process_all_rows()
- *
- * Walk the current block list applying the specified row processor function
- * to all rows
- */
-
-void
-process_all_rows (BLOCK_LIST * block_list,
-                  BOOL8 row_processor (BLOCK *, ROW *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    row_it.set_to_list (block->row_list ());
-    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-      row = row_it.data ();
-      if (!row_processor (block, row) || selection_quit)
-        return;
-    }
-  }
-}
-
-
-/**
- * process_selected_rows()
- *
- * Walk the current block list applying the specified row processor function
- * to each row selected.
- */
-
-void
-process_selected_rows (BLOCK_LIST * block_list,
-                       TBOX & selection_box, 
-                       BOOL8 row_processor (BLOCK *, ROW *)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      row_it.set_to_list (block->row_list ());
-      for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-        row = row_it.data ();
-        if (row->bounding_box ().overlap (selection_box)) {
-          if (!row_processor (block, row) || selection_quit)
-            return;
-        }
-      }
-    }
-  }
-}
-
-
-/**
- * process_all_rows_it()   PASS ITERATORS
- *
- * Walk the current block list applying the specified row processor function
- * to all rows
- */
-
-void
-process_all_rows_it (BLOCK_LIST * block_list,
-                     BOOL8 row_processor (BLOCK *, ROW *, BLOCK_IT &, ROW_IT &)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    row_it.set_to_list (block->row_list ());
-    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-      row = row_it.data ();
-      if (!row_processor (block, row, block_it, row_it) || selection_quit)
-        return;
-    }
-  }
-}
-
-
-/**
- * process_selected_rows_it()   PASS ITERATORS
- *
- * Walk the current block list applying the specified row processor function
- * to each row selected.
- */
-
-void
-process_selected_rows_it (BLOCK_LIST * block_list,
-                          TBOX & selection_box, 
-                          BOOL8 row_processor (BLOCK *, ROW *, BLOCK_IT &, ROW_IT &)) 
-{
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-
-  for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      row_it.set_to_list (block->row_list ());
-      for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
-        row = row_it.data ();
-        if (row->bounding_box ().overlap (selection_box)) {
-          if (!row_processor (block, row, block_it, row_it) || selection_quit)
-            return;
-        }
-      }
-    }
-  }
-}
--- a/ccmain/pagewalk.h
+++ b/ccmain/pagewalk.h
@ -1,157 +0,0 @@
-/**********************************************************************
- * File:        pagewalk.h  (Formerly walkers.h)
- * Description: Structure processors
- * Author:      Phil Cheatle
- * Created:     Thu Oct 10 16:25:24 BST 1991
- *
- * (C) Copyright 1991, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef           PAGEWALK_H
-#define           PAGEWALK_H
-
-#include          "ocrblock.h"
-#include          "ocrrow.h"
-#include          "werd.h"
-#include          "polyblob.h"
-#include          "stepblob.h"
-#include          "rect.h"
-#include          "varable.h"
-#include          "notdll.h"
-#include          "tesseractclass.h"
-
-#define BLOCK_SPACING   20
-
-extern BOOL_VAR_H (current_word_quit, FALSE, "Stop processing this word");
-extern DLLSYM BOOL_VAR_H (selection_quit, FALSE,
-"Stop processing this selection");
-TBOX block_list_bounding_box(                        //find bounding box
-                            BLOCK_LIST *block_list  //of this block list
-                           );
-const TBOX block_list_compress(  //shuffle up blocks
-                              BLOCK_LIST *block_list);
-void block_list_move(                         //move
-                     BLOCK_LIST *block_list,  //this list
-                     ICOORD vec               //by this vector
-                    );
-int block_name_order(                      //sort blocks
-                     const void *block1p,  //ptr to ptr to block1
-                     const void *block2p   //ptr to ptr to block2
-                    );
-void process_all_blobs (         //process blobs
-BLOCK_LIST * block_list,         //blocks to check
-BOOL8 blob_processor (           //function to call
-                                 //function to call
-BLOCK *, ROW *, WERD *, PBLOB *), BOOL8 c_blob_processor (
-BLOCK
-*,
-ROW
-*,
-WERD
-*,
-C_BLOB
-*));
-void process_selected_blobs (    //process blobs
-BLOCK_LIST * block_list,         //blocks to check
-                                 //function to call
-TBOX & selection_box, BOOL8 blob_processor (
-                                 //function to call
-BLOCK *, ROW *, WERD *, PBLOB *), BOOL8 c_blob_processor (
-BLOCK
-*,
-ROW
-*,
-WERD
-*,
-C_BLOB
-*));
-void process_all_words (         //process words
-BLOCK_LIST * block_list,         //blocks to check
-BOOL8 word_processor (           //function to call
-BLOCK *, ROW *, WERD *));
-void process_selected_words (    //process words
-BLOCK_LIST * block_list,         //blocks to check
-                                 //function to call
-TBOX & selection_box, BOOL8 word_processor (
-BLOCK
-*,
-ROW
-*,
-WERD
-*));
-
-void process_all_words_it (      //process words
-BLOCK_LIST * block_list,         //blocks to check
-BOOL8 word_processor (           //function to call
-BLOCK *,
-ROW *,
-WERD *,
-BLOCK_IT &,
-ROW_IT &, WERD_IT &));
-void process_selected_words_it ( //process words
-BLOCK_LIST * block_list,         //blocks to check
-                                 //function to call
-TBOX & selection_box, BOOL8 word_processor (
-BLOCK
-*,
-ROW
-*,
-WERD
-*,
-BLOCK_IT
-&,
-ROW_IT
-&,
-WERD_IT
-&));
-void process_all_blocks (        //process blocks
-BLOCK_LIST * block_list,         //blocks to check
-BOOL8 block_processor (          //function to call
-BLOCK *));
-void process_selected_blocks (   //process blocks
-BLOCK_LIST * block_list,         //blocks to check
-                                 //function to call
-TBOX & selection_box, BOOL8 block_processor (
-BLOCK
-*));
-void process_all_rows (          //process words
-BLOCK_LIST * block_list,         //blocks to check
-BOOL8 row_processor (            //function to call
-BLOCK *, ROW *));
-void process_selected_rows (     //process rows
-BLOCK_LIST * block_list,         //blocks to check
-                                 //function to call
-TBOX & selection_box, BOOL8 row_processor (
-BLOCK
-*,
-ROW
-*));
-void process_all_rows_it (       //process words
-BLOCK_LIST * block_list,         //blocks to check
-BOOL8 row_processor (            //function to call
-BLOCK *,
-ROW *,
-BLOCK_IT &, ROW_IT &));
-void process_selected_rows_it (  //process rows
-BLOCK_LIST * block_list,         //blocks to check
-                                 //function to call
-TBOX & selection_box, BOOL8 row_processor (
-BLOCK
-*,
-ROW
-*,
-BLOCK_IT
-&,
-ROW_IT
-&));
-#endif
--- a/ccmain/paircmp.cpp
+++ b/ccmain/paircmp.cpp
@ -1,113 +0,0 @@
-/**********************************************************************
- * File:        paircmp.cpp  (Formerly paircmp.c)
- * Description: Code to compare two blobs using the adaptive matcher
- * Author:		Ray Smith
- * Created:		Wed Apr 21 09:31:02 BST 1993
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifdef _MSC_VER
-#pragma warning(disable:4244)  // Conversion warnings
-#endif
-
-#include "mfcpch.h"
-#include          "blobcmp.h"
-#include          "tfacep.h"
-#include          "paircmp.h"
-#include          "tesseractclass.h"
-
-#define EXTERN
-
-/**********************************************************************
- * compare_blob_pairs
- *
- * A blob processor to compare pairs of selected blobs.
- **********************************************************************/
-
-namespace tesseract {
-BOOL8 Tesseract::compare_blob_pairs(             //blob processor
-                                    BLOCK *,
-                                    ROW *row,    //row it came from
-                                    WERD *,
-                                    PBLOB *blob  //blob to compare
-                                   ) {
-  static ROW *prev_row = NULL;   //other in pair
-  static PBLOB *prev_blob = NULL;
-  float rating;                  //from matcher
-
-  if (prev_row == NULL || prev_blob == NULL) {
-    prev_row = row;
-    prev_blob = blob;
-  }
-  else {
-    rating = compare_blobs (prev_blob, prev_row, blob, row);
-    tprintf ("Rating=%g\n", rating);
-    prev_row = NULL;
-    prev_blob = NULL;
-  }
-  return TRUE;
-}
-
-
-/**********************************************************************
- * compare_blobs
- *
- * Compare 2 blobs and return the rating.
- **********************************************************************/
-
-float Tesseract::compare_blobs(               //match 2 blobs
-                               PBLOB *blob1,  //first blob
-                               ROW *row1,     //row it came from
-                               PBLOB *blob2,  //other blob
-                               ROW *row2) {
-  PBLOB *bn_blob1;               //baseline norm
-  PBLOB *bn_blob2;
-  DENORM denorm1, denorm2;
-  float rating;                  //match result
-
-  bn_blob1 = blob1->baseline_normalise (row1, &denorm1);
-  bn_blob2 = blob2->baseline_normalise (row2, &denorm2);
-  rating = compare_bln_blobs (bn_blob1, &denorm1, bn_blob2, &denorm2);
-  delete bn_blob1;
-  delete bn_blob2;
-  return rating;
-}
-
-
-/**********************************************************************
- * compare_bln_blobs
- *
- * Compare 2 baseline normalised blobs and return the rating.
- **********************************************************************/
-float Tesseract::compare_bln_blobs(               //match 2 blobs
-                                   PBLOB *blob1,  //first blob
-                                   DENORM *denorm1,
-                                   PBLOB *blob2,  //other blob
-                                   DENORM *denorm2) {
-  TBLOB *tblob1;                 //tessblobs
-  TBLOB *tblob2;
-  TEXTROW tessrow1, tessrow2;    //tess rows
-  float rating;                  //match result
-
-  tblob1 = make_tess_blob (blob1, TRUE);
-  make_tess_row(denorm1, &tessrow1); 
-  tblob2 = make_tess_blob (blob2, TRUE);
-  make_tess_row(denorm2, &tessrow2); 
-  rating = compare_tess_blobs (tblob1, &tessrow1, tblob2, &tessrow2);
-  free_blob(tblob1); 
-  free_blob(tblob2); 
-
-  return rating;
-}
-}  // namespace tesseract
--- a/ccmain/varabled.cpp
+++ b/ccmain/varabled.cpp
@ -1,6 +1,6 @@
 ///////////////////////////////////////////////////////////////////////
-// File:        varabled.cpp
-// Description: Variables Editor
+// File:        paramsd.cpp
+// Description: Tesseract parameter Editor
 // Author:      Joern Wanke
 // Created:     Wed Jul 18 10:05:01 PDT 2007
 //
@ -17,7 +17,7 @@
 //
 ///////////////////////////////////////////////////////////////////////
 //
-// The variables editor is used to edit all the variables used within
+// The parameters editor is used to edit all the parameters used within
 // tesseract from the ui.
 #ifdef WIN32
 #else
@ -33,69 +33,68 @@
 #endif

 #ifndef GRAPHICS_DISABLED
-#include "varabled.h"
+#include "paramsd.h"


+#include "params.h"
 #include "scrollview.h"
 #include "svmnode.h"

-#include "varable.h"
-#include "mainblk.h"

-#define VARDIR        "configs/" /*variables files */
+#define VARDIR        "configs/" /*parameters files */
 #define MAX_ITEMS_IN_SUBMENU 30

-const ERRCODE NO_VARIABLES_TO_EDIT = "No Variables defined to edit";
-
+// The following variables should remain static globals, since they
+// are used by debug editor, which uses a single Tesseract instance.
+//
 // Contains the mappings from unique VC ids to their actual pointers.
-static std::map<int, VariableContent*> vcMap;
-
-static int nrVariables = 0;
+static std::map<int, ParamContent*> vcMap;
+static int nrParams = 0;
 static int writeCommands[2];

-ELISTIZE(VariableContent)
+ELISTIZE(ParamContent)

-// Constructors for the various VarTypes.
-VariableContent::VariableContent(STRING_VARIABLE* it) {
-  my_id_ = nrVariables;
-  nrVariables++;
-  var_type_ = VT_STRING;
+// Wraps a string parameter under the next free id; it starts out unchanged.
+ParamContent::ParamContent(tesseract::StringParam* it) {
+  my_id_ = nrParams++;      // take the next unique id
+  changed_ = false;         // HasChanged() must not read an unset flag
+  param_type_ = VT_STRING;
  sIt = it;
  vcMap[my_id_] = this;
 }
-// Constructors for the various VarTypes.
-VariableContent::VariableContent(INT_VARIABLE* it) {
-  my_id_ = nrVariables;
-  nrVariables++;
-  var_type_ = VT_INTEGER;
+// Wraps an integer parameter under the next free id; it starts out unchanged.
+ParamContent::ParamContent(tesseract::IntParam* it) {
+  my_id_ = nrParams++;      // take the next unique id
+  changed_ = false;         // HasChanged() must not read an unset flag
+  param_type_ = VT_INTEGER;
  iIt = it;
  vcMap[my_id_] = this;
 }
-// Constructors for the various VarTypes.
-VariableContent::VariableContent(BOOL_VARIABLE* it) {
-  my_id_ = nrVariables;
-  nrVariables++;
-  var_type_ = VT_BOOLEAN;
+// Wraps a boolean parameter under the next free id; it starts out unchanged.
+ParamContent::ParamContent(tesseract::BoolParam* it) {
+  my_id_ = nrParams++;      // take the next unique id
+  changed_ = false;         // HasChanged() must not read an unset flag
+  param_type_ = VT_BOOLEAN;
  bIt = it;
  vcMap[my_id_] = this;
 }
-// Constructors for the various VarTypes.
-VariableContent::VariableContent(double_VARIABLE* it) {
-  my_id_ = nrVariables;
-  nrVariables++;
-  var_type_ = VT_DOUBLE;
+// Wraps a double parameter under the next free id; it starts out unchanged.
+ParamContent::ParamContent(tesseract::DoubleParam* it) {
+  my_id_ = nrParams++;      // take the next unique id
+  changed_ = false;         // HasChanged() must not read an unset flag
+  param_type_ = VT_DOUBLE;
  dIt = it;
  vcMap[my_id_] = this;
 }

 // Gets a VC object identified by its ID.
-VariableContent* VariableContent::GetVariableContentById(int id) {
+ParamContent* ParamContent::GetParamContentById(int id) {
  return vcMap[id];
 }

 // Copy the first N words from the source string to the target string.
 // Words are delimited by "_".
-void VariablesEditor::GetFirstWords(
+void ParamsEditor::GetFirstWords(
                     const char *s,  // source string
                     int n,          // number of words
                     char *t         // target string
@ -114,34 +113,34 @@ void VariablesEditor::GetFirstWords(
 }

 // Getter for the name.
-const char* VariableContent::GetName() const {
-  if (var_type_ == VT_INTEGER) { return iIt->name_str(); }
-  else if (var_type_ == VT_BOOLEAN) { return bIt->name_str(); }
-  else if (var_type_ == VT_DOUBLE) { return dIt->name_str(); }
-  else if (var_type_ == VT_STRING) { return sIt->name_str(); }
+const char* ParamContent::GetName() const {
+  if (param_type_ == VT_INTEGER) { return iIt->name_str(); }
+  else if (param_type_ == VT_BOOLEAN) { return bIt->name_str(); }
+  else if (param_type_ == VT_DOUBLE) { return dIt->name_str(); }
+  else if (param_type_ == VT_STRING) { return sIt->name_str(); }
  else
-    return "ERROR: VariableContent::GetName()";
+    return "ERROR: ParamContent::GetName()";
 }

 // Getter for the description.
-const char* VariableContent::GetDescription() const {
-  if (var_type_ == VT_INTEGER) { return iIt->info_str(); }
-  else if (var_type_ == VT_BOOLEAN) { return bIt->info_str(); }
-  else if (var_type_ == VT_DOUBLE) { return dIt->info_str(); }
-  else if (var_type_ == VT_STRING) { return sIt->info_str(); }
+const char* ParamContent::GetDescription() const {
+  if (param_type_ == VT_INTEGER) { return iIt->info_str(); }
+  else if (param_type_ == VT_BOOLEAN) { return bIt->info_str(); }
+  else if (param_type_ == VT_DOUBLE) { return dIt->info_str(); }
+  else if (param_type_ == VT_STRING) { return sIt->info_str(); }
  else return NULL;
 }

 // Getter for the value.
-const char* VariableContent::GetValue() const {
+const char* ParamContent::GetValue() const {
 char* msg = new char[1024];
-  if (var_type_ == VT_INTEGER) {
+  if (param_type_ == VT_INTEGER) {
    sprintf(msg, "%d", ((inT32) *(iIt)));
-  } else if (var_type_ == VT_BOOLEAN) {
+  } else if (param_type_ == VT_BOOLEAN) {
    sprintf(msg, "%d", ((BOOL8) * (bIt)));
-  } else if (var_type_ == VT_DOUBLE) {
+  } else if (param_type_ == VT_DOUBLE) {
    sprintf(msg, "%g", ((double) * (dIt)));
-  } else if (var_type_ == VT_STRING) {
+  } else if (param_type_ == VT_STRING) {
    if (((STRING) * (sIt)).string() != NULL) {
      sprintf(msg, "%s", ((STRING) * (sIt)).string());
    } else {
@ -152,26 +151,26 @@ char* msg = new char[1024];
 }

 // Setter for the value.
-void VariableContent::SetValue(const char* val) {
+void ParamContent::SetValue(const char* val) {
 // TODO (wanke) Test if the values actually are properly converted.
 // (Quickly visible impacts?)
  changed_ = TRUE;
-  if (var_type_ == VT_INTEGER) {
+  if (param_type_ == VT_INTEGER) {
    iIt->set_value(atoi(val));
-  } else if (var_type_ == VT_BOOLEAN) {
+  } else if (param_type_ == VT_BOOLEAN) {
    bIt->set_value(atoi(val));
-  } else if (var_type_ == VT_DOUBLE) {
+  } else if (param_type_ == VT_DOUBLE) {
    dIt->set_value(strtod(val, NULL));
-  } else if (var_type_ == VT_STRING) {
+  } else if (param_type_ == VT_STRING) {
    sIt->set_value(val);
  }
 }

 // Gets the up to the first 3 prefixes from s (split by _).
 // For example, tesseract_foo_bar will be split into tesseract,foo and bar.
-void VariablesEditor::GetPrefixes(const char* s, STRING* level_one,
-                                                 STRING* level_two,
-                                                 STRING* level_three) {
+void ParamsEditor::GetPrefixes(const char* s, STRING* level_one,
+                               STRING* level_two,
+                               STRING* level_three) {
  char* p = new char[1024];
  GetFirstWords(s, 1, p);
  *level_one = p;
@ -183,50 +182,47 @@ void VariablesEditor::GetPrefixes(const char* s, STRING* level_one,
 }

 // Compare two VC objects by their name.
-int VariableContent::Compare(const void* v1, const void* v2) {
-  const VariableContent* one =
-    *reinterpret_cast<const VariableContent* const *>(v1);
-  const VariableContent* two =
-    *reinterpret_cast<const VariableContent* const *>(v2);
+int ParamContent::Compare(const void* v1, const void* v2) {
+  const ParamContent* one =
+    *reinterpret_cast<const ParamContent* const *>(v1);
+  const ParamContent* two =
+    *reinterpret_cast<const ParamContent* const *>(v2);
  return strcmp(one->GetName(), two->GetName());
 }

-// Find all editable variables used within tesseract and create a
+// Find all editable parameters used within tesseract and create a
 // SVMenuNode tree from it.
 // TODO (wanke): This is actually sort of hackish.
-SVMenuNode* VariablesEditor::BuildListOfAllLeaves() {  // find all variables.
+SVMenuNode* ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
  SVMenuNode* mr = new SVMenuNode();
-  VariableContent_LIST vclist;
-  VariableContent_IT vc_it(&vclist);
+  ParamContent_LIST vclist;
+  ParamContent_IT vc_it(&vclist);
  // Amount counts the number of entries for a specific char*.
  // TODO(rays) get rid of the use of std::map.
  std::map<const char*, int> amount;

-  INT_VARIABLE_C_IT int_it(INT_VARIABLE::get_head());
-  BOOL_VARIABLE_C_IT bool_it(BOOL_VARIABLE::get_head());
-  STRING_VARIABLE_C_IT str_it(STRING_VARIABLE::get_head());
-  double_VARIABLE_C_IT dbl_it(double_VARIABLE::get_head());
-
-  // Add all variables to a list.
-  for (int_it.mark_cycle_pt(); !int_it.cycled_list(); int_it.forward()) {
-    vc_it.add_after_then_move(new VariableContent(int_it.data()));
-  }
-
-  for (bool_it.mark_cycle_pt(); !bool_it.cycled_list(); bool_it.forward()) {
-    vc_it.add_after_then_move(new VariableContent(bool_it.data()));
-  }
-
-  for (str_it.mark_cycle_pt(); !str_it.cycled_list(); str_it.forward()) {
-    vc_it.add_after_then_move(new VariableContent(str_it.data()));
-  }
-
-  for (dbl_it.mark_cycle_pt(); !dbl_it.cycled_list(); dbl_it.forward()) {
-    vc_it.add_after_then_move(new VariableContent(dbl_it.data()));
+  // Add all parameters to a list.
+  int v, i;
+  int num_iterations = (tess->params() == NULL) ? 1 : 2;
+  for (v = 0; v < num_iterations; ++v) {
+    tesseract::ParamsVectors *vec = (v == 0) ? GlobalParams() : tess->params();
+    for (i = 0; i < vec->int_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->int_params[i]));
+    }
+    for (i = 0; i < vec->bool_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->bool_params[i]));
+    }
+    for (i = 0; i < vec->string_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->string_params[i]));
+    }
+    for (i = 0; i < vec->double_params.size(); ++i) {
+      vc_it.add_after_then_move(new ParamContent(vec->double_params[i]));
+    }
  }

  // Count the # of entries starting with a specific prefix.
  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
-    VariableContent* vc = vc_it.data();
+    ParamContent* vc = vc_it.data();
    STRING tag;
    STRING tag2;
    STRING tag3;
@ -237,14 +233,14 @@ SVMenuNode* VariablesEditor::BuildListOfAllLeaves() {  // find all variables.
    amount[tag3.string()]++;
  }

-  vclist.sort(VariableContent::Compare);  // Sort the list alphabetically.
+  vclist.sort(ParamContent::Compare);  // Sort the list alphabetically.

  SVMenuNode* other = mr->AddChild("OTHER");

  // go through the list again and this time create the menu structure.
  vc_it.move_to_first();
  for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
-    VariableContent* vc = vc_it.data();
+    ParamContent* vc = vc_it.data();
    STRING tag;
    STRING tag2;
    STRING tag3;
@ -270,15 +266,15 @@ SVMenuNode* VariablesEditor::BuildListOfAllLeaves() {  // find all variables.
 }

 // Event listener. Waits for SVET_POPUP events and processes them.
-void VariablesEditor::Notify(const SVEvent* sve) {
+void ParamsEditor::Notify(const SVEvent* sve) {
  if (sve->type == SVET_POPUP) {  // only catch SVET_POPUP!
    char* param = sve->parameter;
    if (sve->command_id == writeCommands[0]) {
-      WriteVars(param, false);
+      WriteParams(param, false);
    } else if (sve->command_id == writeCommands[1]) {
-      WriteVars(param, true);
+      WriteParams(param, true);
    } else {
-      VariableContent* vc = VariableContent::GetVariableContentById(
+      ParamContent* vc = ParamContent::GetParamContentById(
          sve->command_id);
      vc->SetValue(param);
      sv_window_->AddMessage("Setting %s to %s",
@ -287,13 +283,13 @@ void VariablesEditor::Notify(const SVEvent* sve) {
  }
 }

-// Integrate the variables editor as popupmenu into the existing scrollview
+// Integrate the parameters editor as popupmenu into the existing scrollview
 // window (usually the pg editor). If sv == null, create a new empty
-// empty window and attach the variables editor to that window (ugly).
-VariablesEditor::VariablesEditor(const tesseract::Tesseract* tess,
+// window and attach the parameters editor to that window (ugly).
+ParamsEditor::ParamsEditor(tesseract::Tesseract* tess,
                                 ScrollView* sv) {
  if (sv == NULL) {
-    const char* name = "VarEditorMAIN";
+    const char* name = "ParamEditorMAIN";
    sv = new ScrollView(name, 1, 1, 200, 200, 300, 200);
  }

@ -302,31 +298,30 @@ VariablesEditor::VariablesEditor(const tesseract::Tesseract* tess,
  //Only one event handler per window.
  //sv->AddEventHandler((SVEventHandler*) this);

-  SVMenuNode* svMenuRoot = BuildListOfAllLeaves();
+  SVMenuNode* svMenuRoot = BuildListOfAllLeaves(tess);

-  STRING varfile;
-  varfile = tess->datadir;
-  varfile += VARDIR;             // variables dir
-  varfile += "edited";           // actual name
+  STRING paramfile;
+  paramfile = tess->datadir;
+  paramfile += VARDIR;             // parameters dir
+  paramfile += "edited";           // actual name

  SVMenuNode* std_menu = svMenuRoot->AddChild ("Build Config File");

-  writeCommands[0] = nrVariables+1;
-  std_menu->AddChild("All Variables", writeCommands[0],
-                     varfile.string(), "Config file name?");
+  writeCommands[0] = nrParams+1;
+  std_menu->AddChild("All Parameters", writeCommands[0],
+                     paramfile.string(), "Config file name?");

-  writeCommands[1] = nrVariables+2;
-  std_menu->AddChild ("changed_ Variables Only", writeCommands[1],
-                      varfile.string(), "Config file name?");
+  writeCommands[1] = nrParams+2;
+  std_menu->AddChild ("changed_ Parameters Only", writeCommands[1],
+                      paramfile.string(), "Config file name?");

  svMenuRoot->BuildMenu(sv, false);
 }


-// Write all (changed_) variables to a config file.
-void VariablesEditor::WriteVars(char *filename,    // in this file
-                                bool changes_only  // changed_ vars only?
-                               ) {
+// Write all (changed_) parameters to a config file.
+void ParamsEditor::WriteParams(char *filename,
+                               bool changes_only) {
  FILE *fp;                      // input file
  char msg_str[255];
                                 // if file exists
@ -344,10 +339,10 @@ void VariablesEditor::WriteVars(char *filename,    // in this file
    return;
  }

-  for (std::map<int, VariableContent*>::iterator iter = vcMap.begin();
+  for (std::map<int, ParamContent*>::iterator iter = vcMap.begin();
                                          iter != vcMap.end();
                                          ++iter) {
-    VariableContent* cur = iter->second;
+    ParamContent* cur = iter->second;
    if (!changes_only || cur->HasChanged()) {
      fprintf (fp, "%-25s   %-12s   # %s\n",
               cur->GetName(), cur->GetValue(), cur->GetDescription());
--- a/ccmain/paramsd.h
+++ b/ccmain/paramsd.h
@ -0,0 +1,124 @@
+///////////////////////////////////////////////////////////////////////
+// File:        paramsd.h
+// Description: Tesseract parameter editor
+// Author:      Joern Wanke
+// Created:     Wed Jul 18 10:05:01 PDT 2007
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+// Tesseract parameter editor is used to edit all the parameters used
+// within tesseract from the ui.
+#ifndef GRAPHICS_DISABLED
+#ifndef VARABLED_H
+#define VARABLED_H
+
+#include "elst.h"
+#include "scrollview.h"
+#include "params.h"
+#include "tesseractclass.h"
+
+class SVMenuNode;
+
+// A list of all possible parameter types used.
+enum ParamType {
+  VT_INTEGER,  // object wraps a tesseract::IntParam
+  VT_BOOLEAN,  // object wraps a tesseract::BoolParam
+  VT_STRING,   // object wraps a tesseract::StringParam
+  VT_DOUBLE    // object wraps a tesseract::DoubleParam
+};
+
+// A rather hackish helper structure which can take any kind of parameter input
+// (defined by ParamType) and do a couple of common operations on them, like
+// comparisons or getting its value. It is used in the context of the
+// ParamsEditor as a bridge from the internal tesseract parameters to the
+// ones displayed by the ScrollView server.
+class ParamContent : public ELIST_LINK {
+ public:
+  // Sort comparator: v1/v2 point to ParamContent*; orders by strcmp of names.
+  static int Compare(const void* v1, const void* v2);
+
+  // Gets the ParamContent object registered under the given unique ID.
+  static ParamContent* GetParamContentById(int id);
+
+  // Constructors for the various ParamTypes (one per wrapped parameter class).
+  ParamContent() {  // sets no member and takes no ID
+  }
+  ParamContent(tesseract::StringParam* it);
+  ParamContent(tesseract::IntParam* it);
+  ParamContent(tesseract::BoolParam* it);
+  ParamContent(tesseract::DoubleParam* it);
+
+  // Getters and Setters. Values cross this interface as text: SetValue()
+  // converts val according to the parameter type and marks the object changed.
+  void SetValue(const char* val);
+  const char* GetValue() const;  // new char[1024] per call; NOTE(review): who frees it?
+  const char* GetName() const;
+  const char* GetDescription() const;
+
+  int GetId() { return my_id_; }
+  bool HasChanged() { return changed_; }
+
+ private:
+  // The unique ID of this object; also its key in the ID lookup map.
+  int my_id_;
+  // True once SetValue() has run, i.e. the value needs writing back out.
+  bool changed_;  // NOTE(review): confirm every constructor starts this false
+  // The actual ParamType of this object; selects which pointer below is used.
+  ParamType param_type_;
+  // The wrapped parameter; each typed constructor sets only the matching one.
+  tesseract::StringParam* sIt;
+  tesseract::IntParam* iIt;
+  tesseract::BoolParam* bIt;
+  tesseract::DoubleParam* dIt;
+};
+
+ELISTIZEH(ParamContent)
+
+// The parameters editor enables the user to edit all the parameters used within
+// tesseract. It can be invoked on its own, but is supposed to be invoked by
+// the program editor.
+class ParamsEditor : public SVEventHandler {
+ public:
+  // Integrate the parameters editor as popupmenu into the existing scrollview
+  // window (usually the pg editor). If sv == NULL, create a new empty window
+  // and attach the parameter editor to that window (ugly).
+  ParamsEditor(tesseract::Tesseract*, ScrollView* sv = NULL);
+
+  // Event listener. Only SVET_POPUP events are handled: the two write commands
+  // call WriteParams(); any other command id is a parameter id to SetValue().
+  void Notify(const SVEvent* sve);
+
+ private:
+  // Gets the up to the first 3 prefixes from s (split by _).
+  // For example, tesseract_foo_bar will be split into tesseract,foo and bar.
+  void GetPrefixes(const char* s, STRING* level_one,
+                   STRING* level_two, STRING* level_three);
+
+  // Gets the first n words (split by _) and puts them in t.
+  // For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
+  void GetFirstWords(const char *s,  // source string
+                     int n,          // number of words
+                     char *t);       // target string
+
+  // Find all editable parameters (GlobalParams() plus tess->params() when it
+  // is non-NULL) and create a SVMenuNode tree from them.
+  SVMenuNode *BuildListOfAllLeaves(tesseract::Tesseract *tess);
+
+  // Write all parameters, or only the changed_ ones, to a config file.
+  void WriteParams(char* filename, bool changes_only);
+  // The ScrollView window that Notify() posts its status messages to.
+  ScrollView* sv_window_;
+};
+
+#endif
+#endif
--- a/ccmain/pgedit.cpp
+++ b/ccmain/pgedit.cpp
--- a/ccmain/pgedit.h
+++ b/ccmain/pgedit.h
@ -24,8 +24,7 @@
 #include          "ocrrow.h"
 #include          "werd.h"
 #include          "rect.h"
-#include          "pagewalk.h"
-#include          "varable.h"
+#include          "params.h"
 #include          "notdll.h"
 #include          "tesseractclass.h"

@ -45,7 +44,6 @@ class PGEventHandler : public SVEventHandler {
 };

 extern BLOCK_LIST *current_block_list;
-extern BOOL8 *current_image_changed;
 extern STRING_VAR_H (editor_image_win_name, "EditorImage",
 "Editor image window name");
 extern INT_VAR_H (editor_image_xpos, 590, "Editor image X Pos");
@ -71,14 +69,8 @@ extern INT_VAR_H (editor_word_height, 240, "Word window height");
 extern INT_VAR_H (editor_word_width, 655, "Word window width");
 extern double_VAR_H (editor_smd_scale_factor, 1.0, "Scaling for smd image");

-void add_word(                             //to block list
-              WERD *word,                  //word to be added
-              ROW *src_row,                //source row
-              BLOCK *src_block,            //source block
-              BLOCK_LIST *dest_block_list  //add to this
-             );
 ScrollView* bln_word_window_handle();  //return handle
-void build_image_window(TBOX page_bounding_box);
+void build_image_window(int width, int height);
 void display_bln_lines(ScrollView window,
                       ScrollView::Color colour,
                       float scale_factor,
@ -86,86 +78,11 @@ void display_bln_lines(ScrollView window,
                       float minx,
                       float maxx);
                                 //function to call
-void do_re_display (BOOL8 word_painter (
-BLOCK *, ROW *, WERD *));
-const TBOX do_tidy_cmd();  //tidy
-void do_view_cmd();
-void do_write_file(            //serialise
-                   char *name  //file name
-                  );
 void pgeditor_msg(  //message display
                  const char *msg);
 void pgeditor_show_point(  //display coords
                         SVEvent *event);
-void pgeditor_write_file(                    //serialise
-                         char *name,         //file name
-                         BLOCK_LIST *blocks  //block list to write
-                        );
                                 //put bln word in       box
-float re_scale_and_move_bln_word(WERD *norm_word,  //BL normalised word
-                                 const TBOX &box    //destination box
-                                );
-void re_segment_word(                         //break/join words
-                     BLOCK_LIST *block_list,  //blocks to check
-                     TBOX &selection_box);
-void block_space_stat(                         //show space stats
-                      BLOCK_LIST *block_list,  //blocks to check
-                      TBOX &selection_box);
-void row_space_stat(                         //show space stats
-                    BLOCK_LIST *block_list,  //blocks to check
-                    TBOX &selection_box);
-void show_point(                         //display posn of bloba word
-                BLOCK_LIST *block_list,  //blocks to check
-                float x,
-                float y);
-                                 //display a word
-BOOL8 word_blank_and_set_display(BLOCK *block,  //block holding word
-                                 ROW *row,      //row holding word
-                                 WERD *word     //word to be processed
-                                );
-BOOL8 word_bln_display(            //bln & display
-                       BLOCK *,    //block holding word
-                       ROW *row,   //row holding word
-                       WERD *word  //word to be processed
-                      );
-BOOL8 word_change_text(               //change correct text
-                       BLOCK *block,  //block holding word
-                       ROW *row,      //row holding word
-                       WERD *word     //word to be processed
-                      );
-BOOL8 word_copy(               //copy a word
-                BLOCK *block,  //block holding word
-                ROW *row,      //row holding word
-                WERD *word     //word to be processed
-               );
-BOOL8 word_delete(                     //delete a word
-                  BLOCK *block,        //block holding word
-                  ROW *row,            //row holding word
-                  WERD *word,          //word to be processed
-                  BLOCK_IT &block_it,  //block list iterator
-                  ROW_IT &row_it,      //row list iterator
-                  WERD_IT &word_it     //word list iterator
-                 );
-BOOL8 word_display(            // display a word
-                   BLOCK *,    //block holding word
-                   ROW *row,   //row holding word
-                   WERD *word  //word to be processed
-                  );
-BOOL8 word_dumper(               //dump word
-                  BLOCK *block,  //block holding word
-                  ROW *row,      //row holding word
-                  WERD *word     //word to be processed
-                 );
-BOOL8 word_set_display(               //display a word
-                       BLOCK *block,  //block holding word
-                       ROW *row,      //row holding word
-                       WERD *word     //word to be processed
-                      );
-BOOL8 word_toggle_seg(            //toggle seg flag
-                      BLOCK *,    //block holding word
-                      ROW *,      //row holding word
-                      WERD *word  //word to be processed
-                     );
-void do_check_mem(  //do it
-                  inT32 level);
+void show_point(PAGE_RES* page_res, float x, float y);
+
 #endif
--- a/ccmain/recogtraining.cpp
+++ b/ccmain/recogtraining.cpp
@ -0,0 +1,182 @@
+///////////////////////////////////////////////////////////////////////
+// File:        recogtraining.cpp
+// Description: Functions for ambiguity and parameter training.
+// Author:      Daria Antonova
+// Created:     Mon Aug 13 11:26:43 PDT 2009
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "boxread.h"
+#include "control.h"
+#include "cutil.h"
+#include "host.h"
+#include "permute.h"
+#include "ratngs.h"
+#include "reject.h"
+#include "stopper.h"
+
+namespace tesseract {
+
+// Tolerance passed to NearlyEqual<int>() when comparing corresponding edges
+// (bottom, left, right, top) of a Tesseract word box and a box-file box.
+const inT16 kMaxBoxEdgeDiff = 2;
+
+// Sets flags necessary for recognition in the training mode.
+// Opens and returns the pointer to the output file.
+// The output file name is fname with its extension (if any) replaced by
+// ".txt". The file is opened in "a+" mode, so existing content is kept and
+// new output is appended.
+FILE *Tesseract::init_recog_training(const STRING &fname) {
+  if (tessedit_ambigs_training) {
+    tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
+    tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
+    save_best_choices.set_value(1);              // save individual char choices
+    getDict().save_raw_choices.set_value(1);     // save raw choices
+    getDict().permute_only_top.set_value(true);  // use only top choice permuter
+    tessedit_ok_mode.set_value(0);               // turn off context checking
+    // Explore all segmentations.
+    getDict().stopper_no_acceptable_choices.set_value(1);
+  }
+
+  STRING output_fname = fname;
+  const char *name_start = output_fname.string();
+  const char *lastdot = strrchr(name_start, '.');
+  // Only strip from the last dot when it belongs to the final path component.
+  // Otherwise an extension-less name such as "./scans/page" or "out.d/page"
+  // would be cut inside its directory part and the output would be written
+  // to the wrong place. Both '/' and '\\' are treated as path separators.
+  if (lastdot != NULL &&
+      strchr(lastdot, '/') == NULL && strchr(lastdot, '\\') == NULL) {
+    output_fname[lastdot - name_start] = '\0';
+  }
+  output_fname += ".txt";
+  return open_file(output_fname.string(), "a+");
+}
+
+// Stores the bounding box of the word currently under page_res_it in *tbox
+// and then moves the iterator forward by one word.
+// Returns false, without touching *tbox or the iterator, when the iterator
+// has no current word.
+bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
+  if (page_res_it->word() == NULL) return false;
+  *tbox = page_res_it->word()->word->bounding_box();
+  page_res_it->forward();
+  return true;
+}
+
+// Fetches the next box from box_file via read_next_box(), writing its text
+// into label and its coordinates into *bbox.
+// Returns false, leaving *bbox unchanged, when read_next_box() reports that
+// no box was read.
+bool read_b(int applybox_page, int *line_number, FILE *box_file,
+            char *label, TBOX *bbox) {
+  int x_min, y_min, x_max, y_max;
+  const bool got_box = read_next_box(applybox_page, line_number, box_file,
+                                     label, &x_min, &y_min, &x_max, &y_max);
+  if (!got_box) return false;
+  bbox->set_to_given_coords(x_min, y_min, x_max, y_max);
+  return true;
+}
+
+// This function takes a tif/box pair of files and runs recognition on the
+// image, while making sure that the word bounds that tesseract identified
+// roughly match those specified by the input box file. For each word (ngram
+// in a single bounding box from the input box file) it outputs the ocred
+// result, the correct label, rating and certainty.
+//
+// fname:       image file name; the box file name is derived from it by
+//              replacing the extension (if any) with ".box".
+// page_res:    page results whose words are walked in order.
+// monitor:     not referenced in this function.
+// output_file: stream handed to ambigs_classify_and_output() for each
+//              matching word.
+void Tesseract::recog_training_segmented(const STRING &fname,
+                                         PAGE_RES *page_res,
+                                         volatile ETEXT_DESC *monitor,
+                                         FILE *output_file) {
+  STRING box_fname = fname;
+  const char *name_start = box_fname.string();
+  const char *lastdot = strrchr(name_start, '.');
+  // Only strip from the last dot when it belongs to the final path component.
+  // Otherwise an extension-less name such as "./scans/page" or "out.d/page"
+  // would be cut inside its directory part and the wrong box file would be
+  // looked up. Both '/' and '\\' are treated as path separators.
+  if (lastdot != NULL &&
+      strchr(lastdot, '/') == NULL && strchr(lastdot, '\\') == NULL) {
+    box_fname[lastdot - name_start] = '\0';
+  }
+  box_fname += ".box";
+  // read_next_box() will close box_file
+  // NOTE(review): if the loop below stops because the page runs out of words
+  // before the box file is exhausted, box_file looks like it stays open --
+  // confirm against read_next_box().
+  FILE *box_file = open_file(box_fname.string(), "r");
+
+  PAGE_RES_IT page_res_it;
+  page_res_it.page_res = page_res;
+  page_res_it.restart_page();
+  char label[kBoxReadBufSize];
+
+  // Process all the words on this page.
+  TBOX tbox;  // tesseract-identified box
+  TBOX bbox;  // box from the box file
+  bool keep_going;
+  int line_number = 0;
+  do {
+    // Fetch one box from each source. read_b() is called even when read_t()
+    // has already failed, because &= does not short-circuit.
+    keep_going = read_t(&page_res_it, &tbox);
+    keep_going &= read_b(applybox_page, &line_number, box_file, label, &bbox);
+    // Align bottom left points of the TBOXes.
+    // While the bottoms differ, advance the Tesseract word when the box-file
+    // box is lower, otherwise advance the box file.
+    while (keep_going &&
+           !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
+      keep_going = (bbox.bottom() < tbox.bottom()) ?
+          read_t(&page_res_it, &tbox) :
+            read_b(applybox_page, &line_number, box_file, label, &bbox);
+    }
+    // While the left edges differ, advance the Tesseract word when the
+    // box-file box starts further right, otherwise advance the box file.
+    while (keep_going &&
+           !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
+      keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
+        read_b(applybox_page, &line_number, box_file, label, &bbox);
+    }
+    // OCR the word if top right points of the TBOXes are similar.
+    // read_t() has already moved the iterator forward, so the word that
+    // produced tbox is reached through the prev_*() accessors.
+    if (keep_going &&
+        NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
+        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
+        ambigs_classify_and_output(page_res_it.prev_word(),
+                                   page_res_it.prev_row(),
+                                   page_res_it.prev_block(),
+                                   label, output_file);
+    }
+  } while (keep_going);
+}
+
+// Classifies the given word with classify_word_pass1() and writes the
+// outcome to output_file.
+// If the label cannot be split into unichars (a step of 0 is returned for
+// some position), a message is printed and nothing is written.
+// When the label is a single unichar and the best choice holds exactly one
+// blob-choice list, every entry of that list with a valid unichar id is
+// written as "<unichar>\t<label>\t<rating>\t<certainty>".
+// Finally the raw choices are printed via getDict().PrintAmbigAlternatives().
+void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
+                                           ROW_RES *row_res,
+                                           BLOCK_RES *block_res,
+                                           const char *label,
+                                           FILE *output_file) {
+  // Classify word.
+  classify_word_pass1(werd_res, row_res->row, block_res->block);
+  WERD_CHOICE *top_choice = werd_res->best_choice;
+  ASSERT_HOST(top_choice != NULL);
+  ASSERT_HOST(top_choice->blob_choices() != NULL);
+
+  // Walk the label one unichar at a time, counting the unichars seen.
+  int num_unichars = 0;
+  int pos = 0;
+  int step = 1;  // must start non-zero so the first iteration runs
+  while (label[pos] != '\0' && step > 0) {
+    step = getDict().getUnicharset().step(label + pos);
+    pos += step;
+    ++num_unichars;
+  }
+  if (step == 0) {
+    tprintf("Not outputting illegal unichar %s\n", label);
+    return;
+  }
+
+  // Output all classifier choices for the unigrams (1->1 classifications).
+  const bool one_to_one =
+      num_unichars == 1 && top_choice->blob_choices()->length() == 1;
+  if (one_to_one) {
+    BLOB_CHOICE_LIST_C_IT list_it;
+    list_it.set_to_list(top_choice->blob_choices());
+    BLOB_CHOICE_IT choice_it;
+    choice_it.set_to_list(list_it.data());
+    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
+         choice_it.forward()) {
+      BLOB_CHOICE *candidate = choice_it.data();
+      if (candidate->unichar_id() == INVALID_UNICHAR_ID) continue;
+      fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
+              unicharset.id_to_unichar(candidate->unichar_id()),
+              label, candidate->rating(), candidate->certainty());
+    }
+  }
+  // Output raw choices for many->many and 1->many classifications.
+  getDict().PrintAmbigAlternatives(output_file, label, num_unichars);
+}
+
+}  // namespace tesseract
--- a/ccmain/reject.cpp
+++ b/ccmain/reject.cpp
--- a/ccmain/reject.h
+++ b/ccmain/reject.h
@ -20,121 +20,15 @@
 #ifndef           REJECT_H
 #define           REJECT_H

-#include          "varable.h"
+#include          "params.h"
 #include          "pageres.h"
 #include          "notdll.h"

-extern INT_VAR_H (tessedit_reject_mode, 5, "Rejection algorithm");
-extern INT_VAR_H (tessedit_ok_mode, 5, "Acceptance decision algorithm");
-extern BOOL_VAR_H (tessedit_use_nn, TRUE, "");
-extern BOOL_VAR_H (tessedit_rejection_debug, FALSE, "Adaption debug");
-extern BOOL_VAR_H (tessedit_rejection_stats, FALSE, "Show NN stats");
-extern BOOL_VAR_H (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
-extern double_VAR_H (tessedit_lower_flip_hyphen, 1.5,
-"Aspect ratio dot/hyphen test");
-extern double_VAR_H (tessedit_upper_flip_hyphen, 1.8,
-"Aspect ratio dot/hyphen test");
-extern BOOL_VAR_H (rej_trust_doc_dawg, FALSE,
-"Use DOC dawg in 11l conf. detector");
-extern BOOL_VAR_H (rej_1Il_use_dict_word, FALSE, "Use dictword test");
-extern BOOL_VAR_H (rej_1Il_trust_permuter_type, TRUE, "Dont double check");
-extern BOOL_VAR_H (one_ell_conflict_default, TRUE,
-"one_ell_conflict default");
-extern BOOL_VAR_H (show_char_clipping, FALSE, "Show clip image window?");
-extern BOOL_VAR_H (nn_debug, FALSE, "NN DEBUGGING?");
-extern BOOL_VAR_H (nn_reject_debug, FALSE, "NN DEBUG each char?");
-extern BOOL_VAR_H (nn_lax, FALSE, "Use 2nd rate matches");
-extern BOOL_VAR_H (nn_double_check_dict, FALSE, "Double check");
-extern BOOL_VAR_H (nn_conf_double_check_dict, TRUE,
-"Double check for confusions");
-extern BOOL_VAR_H (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
-extern BOOL_VAR_H (nn_conf_Ss, TRUE, "NN use Ss conflicts");
-extern BOOL_VAR_H (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
-extern BOOL_VAR_H (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
-extern BOOL_VAR_H (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
-extern BOOL_VAR_H (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
-extern BOOL_VAR_H (nn_conf_strict_on_dodgy_chs, TRUE,
-"Require stronger NN match");
-extern double_VAR_H (nn_dodgy_char_threshold, 0.99, "min accept score");
-extern INT_VAR_H (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
-extern INT_VAR_H (nn_conf_initial_i_level, 3,
-"NN accept initial Ii match level ");
-extern BOOL_VAR_H (no_unrej_dubious_chars, TRUE,
-"Dubious chars next to reject?");
-extern BOOL_VAR_H (no_unrej_no_alphanum_wds, TRUE,
-"Stop unrej of non A/N wds?");
-extern BOOL_VAR_H (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
-extern BOOL_VAR_H (rej_use_tess_accepted, TRUE,
-"Individual rejection control");
-extern BOOL_VAR_H (rej_use_tess_blanks, TRUE, "Individual rejection control");
-extern BOOL_VAR_H (rej_use_good_perm, TRUE, "Individual rejection control");
-extern BOOL_VAR_H (rej_use_sensible_wd, FALSE, "Extend permuter check");
-extern BOOL_VAR_H (rej_alphas_in_number_perm, FALSE, "Extend permuter check");
-extern double_VAR_H (rej_whole_of_mostly_reject_word_fract, 0.85,
-"if >this fract");
-extern INT_VAR_H (rej_mostly_reject_mode, 1,
-"0-never, 1-afterNN, 2-after new xht");
-extern double_VAR_H (tessed_fullstop_aspect_ratio, 1.2,
-"if >this fract then reject");
-extern INT_VAR_H (net_image_width, 40, "NN input image width");
-extern INT_VAR_H (net_image_height, 36, "NN input image height");
-extern INT_VAR_H (net_image_x_height, 22, "NN input image x_height");
-extern INT_VAR_H (tessedit_image_border, 2, "Rej blbs near image edge limit");
-extern INT_VAR_H (net_bl_nodes, 20, "Number of baseline nodes");
-extern double_VAR_H (nn_reject_threshold, 0.5, "NN min accept score");
-extern double_VAR_H (nn_reject_head_and_shoulders, 0.6,
-"top scores sep factor");
-extern STRING_VAR_H (ok_single_ch_non_alphanum_wds, "-?\075",
-"Allow NN to unrej");
-extern STRING_VAR_H (ok_repeated_ch_non_alphanum_wds, "-?*\075",
-"Allow NN to unrej");
-extern STRING_VAR_H (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
-extern STRING_VAR_H (conflict_set_S_s, "Ss$", "Ss conflict set");
-extern STRING_VAR_H (conflict_set_hyphen, "-_~", "hyphen conflict set");
-extern STRING_VAR_H (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
-"Unreliable chars");
-extern STRING_VAR_H (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
-"Unreliable chars");
-extern INT_VAR_H (min_sane_x_ht_pixels, 8,
-"Reject any x-ht lt or eq than this");
 void reject_blanks(WERD_RES *word);
-void reject_I_1_L(WERD_RES *word);
-                                 //detailed results
 void reject_poor_matches(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices);
-float compute_reject_threshold(  //compute threshold //detailed results
-                               BLOB_CHOICE_LIST_CLIST *blob_choices);
-int sort_floats(                   //qsort function
-                const void *arg1,  //ptrs to floats
-                const void *arg2);
-void reject_edge_blobs(WERD_RES *word);
-BOOL8 word_contains_non_1_digit(const char *word,
-                                const char *word_lengths);
-                                 //of character
-inT16 nn_match_char(IMAGE &scaled_image,
-                    float baseline_pos,       //rel to scaled_image
-                    BOOL8 dict_word,          //part of dict wd?
-                    BOOL8 checked_dict_word,  //part of dict wd?
-                    BOOL8 sensible_word,      //part acceptable str?
-                    BOOL8 centre,             //not at word ends?
-                    BOOL8 good_quality_word,  //initial segmentation
-                    char tess_ch              //confirm this?
-                   );
-inT16 evaluate_net_match(char top,
-                         float top_score,
-                         char next,
-                         float next_score,
-                         char tess_ch,
-                         BOOL8 dict_word,
-                         BOOL8 checked_dict_word,
-                         BOOL8 sensible_word,
-                         BOOL8 centre,
-                         BOOL8 good_quality_word);
-void dont_allow_dubious_chars(WERD_RES *word);
-
+float compute_reject_threshold(BLOB_CHOICE_LIST_CLIST *blob_choices);
+BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths);
 void dont_allow_1Il(WERD_RES *word);
-
-void reject_mostly_rejects(  //rej all if most rejectd
-                           WERD_RES *word);
 void flip_hyphens(WERD_RES *word);
 void flip_0O(WERD_RES *word);
 BOOL8 non_0_digit(const char* str, int length);
--- a/ccmain/tessbox.cpp
+++ b/ccmain/tessbox.cpp
@ -34,43 +34,31 @@
 * @name tess_segment_pass1
 *
 * Segment a word using the pass1 conditions of the tess segmenter.
- * @param word bln word to do
- * @param denorm de-normaliser
- * @param matcher matcher function
- * @param raw_choice raw result
+ * @param word word to do
 * @param blob_choices list of blob lists
- * @param outword bln word output
 */

 namespace tesseract {
-WERD_CHOICE *Tesseract::tess_segment_pass1(WERD *word,
-                                           DENORM *denorm,
-                                           POLY_MATCHER matcher,
-                                           WERD_CHOICE *&raw_choice,
-                                           BLOB_CHOICE_LIST_CLIST *blob_choices,
-                                           WERD *&outword) {
-  WERD_CHOICE *result;           //return value
+void Tesseract::tess_segment_pass1(WERD_RES *word,
+                                   BLOB_CHOICE_LIST_CLIST *blob_choices) {
  int saved_enable_assoc = 0;
  int saved_chop_enable = 0;

-  if (word->flag (W_DONT_CHOP)) {
+  if (word->word->flag(W_DONT_CHOP)) {
    saved_enable_assoc = wordrec_enable_assoc;
    saved_chop_enable = chop_enable;
    wordrec_enable_assoc.set_value(0);
    chop_enable.set_value(0);
-    if (word->flag (W_REP_CHAR))
-      permute_only_top = 1;
+    if (word->word->flag(W_REP_CHAR))
+      getDict().permute_only_top.set_value(true);
  }
  set_pass1();
-  //      tprintf("pass1 chop on=%d, seg=%d, onlytop=%d",chop_enable,enable_assoc,permute_only_top);
-  result = recog_word (word, denorm, matcher, NULL, NULL, FALSE,
-    raw_choice, blob_choices, outword);
-  if (word->flag (W_DONT_CHOP)) {
+  recog_word(word, blob_choices);
+  if (word->word->flag(W_DONT_CHOP)) {
    wordrec_enable_assoc.set_value(saved_enable_assoc);
    chop_enable.set_value(saved_chop_enable);
-    permute_only_top = 0;
+    getDict().permute_only_top.set_value(false);
  }
-  return result;
 }


@ -78,101 +66,32 @@ WERD_CHOICE *Tesseract::tess_segment_pass1(WERD *word,
 * @name tess_segment_pass2
 *
 * Segment a word using the pass2 conditions of the tess segmenter.
- * @param word bln word to do
- * @param denorm de-normaliser
- * @param matcher matcher function
- * @param raw_choice raw result
+ * @param word word to do
 * @param blob_choices list of blob lists
- * @param outword bln word output
 */

-WERD_CHOICE *Tesseract::tess_segment_pass2(WERD *word,
-                                           DENORM *denorm,
-                                           POLY_MATCHER matcher,
-                                           WERD_CHOICE *&raw_choice,
-                                           BLOB_CHOICE_LIST_CLIST *blob_choices,
-                                           WERD *&outword) {
-  WERD_CHOICE *result;           //return value
+void Tesseract::tess_segment_pass2(WERD_RES *word,
+                                   BLOB_CHOICE_LIST_CLIST *blob_choices) {
  int saved_enable_assoc = 0;
  int saved_chop_enable = 0;

-  if (word->flag (W_DONT_CHOP)) {
+  if (word->word->flag(W_DONT_CHOP)) {
    saved_enable_assoc = wordrec_enable_assoc;
    saved_chop_enable = chop_enable;
    wordrec_enable_assoc.set_value(0);
    chop_enable.set_value(0);
-    if (word->flag (W_REP_CHAR))
-      permute_only_top = 1;
+    if (word->word->flag(W_REP_CHAR))
+      getDict().permute_only_top.set_value(true);
  }
  set_pass2();
-  result = recog_word (word, denorm, matcher, NULL, NULL, FALSE,
-    raw_choice, blob_choices, outword);
-  if (word->flag (W_DONT_CHOP)) {
+  recog_word(word, blob_choices);
+  if (word->word->flag(W_DONT_CHOP)) {
    wordrec_enable_assoc.set_value(saved_enable_assoc);
    chop_enable.set_value(saved_chop_enable);
-    permute_only_top = 0;
+    getDict().permute_only_top.set_value(false);
  }
-  return result;
 }

-
-/**
- * @name correct_segment_pass2
- *
- * Segment a word correctly using the pass2 conditions of the tess segmenter.
- * Then call the tester with all the correctly segmented blobs.
- * If the correct segmentation cannot be found, the tester is called
- * with the segmentation found by tess and all the correct flags set to
- * false and all strings are NULL.
- * @param word bln word to do
- * @param denorm de-normaliser
- * @param matcher matcher function
- * @param tester tester function
- * @param raw_choice raw result
- * @param blob_choices list of blob lists
- * @param outword bln word output
- */
-
-WERD_CHOICE *Tesseract::correct_segment_pass2(WERD *word,
-                                              DENORM *denorm,
-                                              POLY_MATCHER matcher,
-                                              POLY_TESTER tester,
-                                              WERD_CHOICE *&raw_choice,
-                                              BLOB_CHOICE_LIST_CLIST *blob_choices,
-                                              WERD *&outword) {
-  set_pass2();
-  return recog_word (word, denorm, matcher, NULL, tester, TRUE,
-    raw_choice, blob_choices, outword);
-}
-
-
-/**
- * @name test_segment_pass2
- *
- * Segment a word correctly using the pass2 conditions of the tess segmenter.
- * Then call the tester on all words used by tess in its search.
- * Do this only on words where the correct segmentation could be found.
- * @param word bln word to do
- * @param denorm de-normaliser
- * @param matcher matcher function
- * @param tester tester function
- * @param raw_choice raw result
- * @param blob_choices list of blob lists
- * @param outword bln word output
- */
-WERD_CHOICE *Tesseract::test_segment_pass2(WERD *word,
-                                           DENORM *denorm,
-                                           POLY_MATCHER matcher,
-                                           POLY_TESTER tester,
-                                           WERD_CHOICE *&raw_choice,
-                                           BLOB_CHOICE_LIST_CLIST *blob_choices,
-                                           WERD *&outword) {
-  set_pass2();
-  return recog_word (word, denorm, matcher, tester, NULL, TRUE,
-    raw_choice, blob_choices, outword);
-}
-
-
 /**
 * @name tess_acceptable_word
 *
@ -180,202 +99,10 @@ WERD_CHOICE *Tesseract::test_segment_pass2(WERD *word,
 * @param word_choice after context
 * @param raw_choice before context
 */
-BOOL8 Tesseract::tess_acceptable_word(WERD_CHOICE *word_choice,
-                                      WERD_CHOICE *raw_choice) {
-  return getDict().AcceptableResult(*word_choice, *raw_choice);
-}
-
-
-/**
- * @name tess_adaptable_word
- *
- * @return true if the word is regarded as "good enough".
- * @param word word to test
- * @param best_choice after context
- * @param raw_choice before context
- */
-BOOL8 Tesseract::tess_adaptable_word(WERD *word,
-                                     WERD_CHOICE *best_choice,
-                                     WERD_CHOICE *raw_choice) {
-  TWERD *tessword = make_tess_word(word, NULL);
-  int result = (tessword && best_choice && raw_choice &&
-                AdaptableWord(tessword, *best_choice, *raw_choice));
-  delete_word(tessword);
-  return result != 0;
-}
-
-
-/**
- * @name tess_cn_matcher
- *
- * Match a blob using the Tess Char Normalized (non-adaptive) matcher
- * only.
- * @param pblob previous blob
- * @param blob blob to match
- * @param nblob next blob
- * @param word word it came from
- * @param denorm de-normaliser
- * @param[out] ratings list of results
- * @param[out] cpresults may be null
- */
-
-void Tesseract::tess_cn_matcher(PBLOB *pblob,
-                                PBLOB *blob,
-                                PBLOB *nblob,
-                                WERD *word,
-                                DENORM *denorm,
-                                BLOB_CHOICE_LIST *ratings,
-                                CLASS_PRUNER_RESULTS cpresults) {
-  TBLOB *tessblob;               //converted blob
-  TEXTROW tessrow;               //dummy row
-
-  tess_cn_matching.set_value(true);       //turn it on
-  tess_bn_matching.set_value(false);
-                                 //convert blob
-  tessblob = make_rotated_tess_blob(denorm, blob, true);
-                                 //make dummy row
-  make_tess_row(denorm, &tessrow);
-                                 //classify
-  AdaptiveClassifier(tessblob, NULL, &tessrow, ratings, cpresults);
-  free_blob(tessblob);
-}
-
-
-/**
- * @name tess_bn_matcher
- *
- * Match a blob using the Tess Baseline Normalized (adaptive) matcher
- * only.
- * @param pblob previous blob
- * @param blob blob to match
- * @param nblob next blob
- * @param word word it came from
- * @param denorm de-normaliser
- * @param[out] ratings list of results
- */
-
-void Tesseract::tess_bn_matcher(PBLOB *pblob,
-                                PBLOB *blob,
-                                PBLOB *nblob,
-                                WERD *word,
-                                DENORM *denorm,
-                                BLOB_CHOICE_LIST *ratings) {
-  TBLOB *tessblob;               //converted blob
-  TEXTROW tessrow;               //dummy row
-
-  tess_bn_matching.set_value(true);       //turn it on
-  tess_cn_matching.set_value(false);
-                                 //convert blob
-  tessblob = make_rotated_tess_blob(denorm, blob, true);
-                                 //make dummy row
-  make_tess_row(denorm, &tessrow);
-                                 //classify
-  AdaptiveClassifier(tessblob, NULL, &tessrow, ratings, NULL);
-  free_blob(tessblob);
-}
-
-
-/**
- * @name tess_default_matcher
- *
- * Match a blob using the default functionality of the Tess matcher.
- * @param pblob previous blob
- * @param blob blob to match
- * @param nblob next blob
- * @param word word it came from
- * @param denorm de-normaliser
- * @param[out] ratings list of results
- * @param script (unused)
- */
-
-void Tesseract::tess_default_matcher(PBLOB *pblob,
-                                     PBLOB *blob,
-                                     PBLOB *nblob,
-                                     WERD *word,
-                                     DENORM *denorm,
-                                     BLOB_CHOICE_LIST *ratings,
-                                     const char* script) {
-  assert(ratings != NULL);
-  TBLOB *tessblob;               //converted blob
-  TEXTROW tessrow;               //dummy row
-
-  tess_bn_matching.set_value(false);      //turn it off
-  tess_cn_matching.set_value(false);
-                                 //convert blob
-  tessblob = make_rotated_tess_blob(denorm, blob, true);
-                                 //make dummy row
-  make_tess_row(denorm, &tessrow);
-                                 //classify
-  AdaptiveClassifier (tessblob, NULL, &tessrow, ratings, NULL);
-  free_blob(tessblob);
-}
-}  // namespace tesseract
-
-
-/**
- * @name tess_training_tester
- *
- * Matcher tester function which actually trains tess.
- * @param filename filename to output
- * @param blob blob to match
- * @param denorm de-normaliser
- * @param correct ly segmented
- * @param text correct text
- * @param count chars in text
- * @param[out] ratings list of results
- */
-
-void tess_training_tester(const STRING& filename,
-                          PBLOB *blob,
-                          DENORM *denorm,
-                          BOOL8 correct,
-                          char *text,
-                          inT32 count,
-                          BLOB_CHOICE_LIST *ratings) {
-  TBLOB *tessblob;               //converted blob
-  TEXTROW tessrow;               //dummy row
-
-  if (correct) {
-    classify_norm_method.set_value(character); // force char norm spc 30/11/93
-    tess_bn_matching.set_value(false);    //turn it off
-    tess_cn_matching.set_value(false);
-                                 //convert blob
-    tessblob = make_tess_blob (blob, TRUE);
-                                 //make dummy row
-    make_tess_row(denorm, &tessrow);
-                                 //learn it
-    LearnBlob(filename, tessblob, &tessrow, text);
-    free_blob(tessblob);
-  }
-}
-
-
-namespace tesseract {
-/**
- * @name tess_adapter
- *
- * Adapt to the word using the Tesseract mechanism.
- * @param word bln word
- * @param denorm de-normalise
- * @param choice string for word
- * @param raw_choice before context
- * @param rejmap reject map
- */
-void Tesseract::tess_adapter(WERD *word,
-                             DENORM *denorm,
-                             const WERD_CHOICE& choice,
-                             const WERD_CHOICE& raw_choice,
-                             const char *rejmap) {
-  TWERD *tessword;               //converted word
-  static TEXTROW tessrow;        //dummy row
-
-                                 //make dummy row
-  make_tess_row(denorm, &tessrow);
-                                 //make a word
-  tessword = make_tess_word (word, &tessrow);
-  AdaptToWord(tessword, &tessrow, choice, raw_choice, rejmap);
-  //adapt to it
-  delete_word(tessword);  //free it
+BOOL8 Tesseract::tess_acceptable_word(
+    WERD_CHOICE *word_choice,  // after context
+    WERD_CHOICE *raw_choice) {  // before context
+  // Only word_choice is passed on; raw_choice is not used in this body.
+  return getDict().AcceptableResult(*word_choice);
+}


--- a/ccmain/tessbox.h
+++ b/ccmain/tessbox.h
@ -24,13 +24,6 @@
 #include          "notdll.h"
 #include "tesseractclass.h"

-void tess_training_tester(
-                          const STRING& filename,
-                          PBLOB *blob,
-                          DENORM *denorm,
-                          BOOL8 correct,
-                          char *text,
-                          inT32 count,
-                          BLOB_CHOICE_LIST *ratings
-                         );
+// TODO(ocr-team): Delete this along with other empty header files.
+
 #endif
--- a/ccmain/tessedit.cpp
+++ b/ccmain/tessedit.cpp
@ -34,47 +34,28 @@
 #include          "reject.h"
 #include          "pageres.h"
 //#include                                                      "gpapdest.h"
-#include          "mainblk.h"
 #include          "nwmain.h"
 #include          "pgedit.h"
-#include          "ocrshell.h"
 #include          "tprintf.h"
 //#include                                      "ipeerr.h"
 //#include                                                      "restart.h"
 #include          "tessedit.h"
 //#include                                                      "fontfind.h"
 #include "permute.h"
-#include "permdawg.h"
 #include "stopper.h"
-#include "adaptmatch.h"
 #include "intmatcher.h"
 #include "chop.h"
 #include "efio.h"
 #include "danerror.h"
 #include "globals.h"
 #include "tesseractclass.h"
-#include "varable.h"
-
-/*
-** Include automatically generated configuration file if running autoconf
-*/
-#ifdef HAVE_CONFIG_H
-#include "config_auto.h"
-#endif
-// Includes libtiff if HAVE_LIBTIFF is defined
-#ifdef HAVE_LIBTIFF
-#include "tiffio.h"
-
-#endif
+#include "params.h"

 #include          "notdll.h"     //phils nn stuff

 #define VARDIR        "configs/" /*variables files */
                                 //config under api
 #define API_CONFIG      "configs/api_config"
-#define EXTERN
-
-EXTERN BOOL_EVAR (tessedit_write_vars, FALSE, "Write all vars to file");

 ETEXT_DESC *global_monitor = NULL;  // progress monitor

@ -83,7 +64,7 @@ namespace tesseract {
 // Read a "config" file containing a set of variable, value pairs.
 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
 // and also accepts a relative or absolute path name.
-void Tesseract::read_config_file(const char *filename, bool global_only) {
+void Tesseract::read_config_file(const char *filename, bool init_only) {
  STRING path = datadir;
  path += "configs/";
  path += filename;
@ -100,33 +81,25 @@ void Tesseract::read_config_file(const char *filename, bool global_only) {
      path = filename;
    }
  }
-  read_variables_file(path.string(), global_only);
+  ParamUtils::ReadParamsFile(path.string(), init_only, this->params());
 }

 // Returns false if a unicharset file for the specified language was not found
 // or was invalid.
 // This function initializes TessdataManager. After TessdataManager is
 // no longer needed, TessdataManager::End() should be called.
+//
+// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
+// it is OEM_DEFAULT, in which case the value of the variable will be obtained
+// from the language-specific config file (stored in [lang].traineddata), from
+// the config files specified on the command line or left as the default
+// OEM_TESSERACT_ONLY if none of the configs specify this variable.
 bool Tesseract::init_tesseract_lang_data(
    const char *arg0, const char *textbase, const char *language,
-    char **configs, int configs_size, bool configs_global_only) {
-  FILE *var_file;
-  static char c_path[MAX_PATH];  //path for c code
-
+    OcrEngineMode oem, char **configs, int configs_size,
+    bool configs_init_only) {
  // Set the basename, compute the data directory.
  main_setup(arg0, textbase);
-  debug_window_on.set_value (FALSE);
-
-  if (tessedit_write_vars) {
-    var_file = fopen ("edited.cfg", "w");
-    if (var_file != NULL) {
-      print_variables(var_file);
-      fclose(var_file);
-    }
-  }
-  strcpy (c_path, datadir.string());
-  c_path[strlen (c_path) - strlen (m_data_sub_dir.string ())] = '\0';
-  demodir = c_path;

  // Set the language data path prefix
  lang = language != NULL ? language : "eng";
@ -134,25 +107,51 @@ bool Tesseract::init_tesseract_lang_data(
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

-  // Load tesseract variables from config files.
-  for (int i = 0; i < configs_size; ++i) {
-    read_config_file(configs[i], configs_global_only);
-  }
-
  // Initialize TessdataManager.
  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
-  tessdata_manager.Init(tessdata_path.string());
+  tessdata_manager.Init(tessdata_path.string(),
+                        tessdata_manager_debug_level);

  // If a language specific config file (lang.config) exists, load it in.
  if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
-    read_variables_from_fp(tessdata_manager.GetDataFilePtr(),
-                           tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
-                           false);
-    if (global_tessdata_manager_debug_level) {
+    ParamUtils::ReadParamsFromFp(
+        tessdata_manager.GetDataFilePtr(),
+        tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
+        false, this->params());
+    if (tessdata_manager_debug_level) {
      tprintf("Loaded language config file\n");
    }
  }

+  // Load tesseract variables from config files. This is done after loading
+  // language-specific variables from [lang].traineddata file, so that custom
+  // config files can override values in [lang].traineddata file.
+  for (int i = 0; i < configs_size; ++i) {
+    read_config_file(configs[i], configs_init_only);
+  }
+
+  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
+    FILE *params_file = fopen(tessedit_write_params_to_file.string(), "w");
+    if (params_file != NULL) {
+      ParamUtils::PrintParams(params_file, this->params());
+      fclose(params_file);
+      if (tessdata_manager_debug_level > 0) {
+        tprintf("Wrote parameters to %s\n",
+                tessedit_write_params_to_file.string());
+      }
+    } else {
+      tprintf("Failed to open %s for writing params.\n",
+              tessedit_write_params_to_file.string());
+    }
+  }
+
+  // Determine which ocr engine(s) should be loaded and used for recognition.
+  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
+  if (tessdata_manager_debug_level) {
+    tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
+            static_cast<int>(tessedit_ocr_engine_mode));
+  }
+
  // Load the unicharset
  if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
      !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
@ -162,51 +161,63 @@ bool Tesseract::init_tesseract_lang_data(
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
-  if (global_tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
+  right_to_left_ = unicharset.any_right_to_left();
+  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");

-  if (!global_tessedit_ambigs_training &&
+  if (!tessedit_ambigs_training &&
      tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
    unichar_ambigs.LoadUnicharAmbigs(
        tessdata_manager.GetDataFilePtr(),
        tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
-        &unicharset);
-    if (global_tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
+        ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
+    if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
  }
+
+  // Load Cube objects if necessary.
+  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
+    ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
+    if (tessdata_manager_debug_level)
+      tprintf("Loaded Cube w/out combiner\n");
+  } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
+    ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
+    if (tessdata_manager_debug_level)
+      tprintf("Loaded Cube with combiner\n");
+  }
+
  return true;
 }

 int Tesseract::init_tesseract(
    const char *arg0, const char *textbase, const char *language,
-    char **configs, int configs_size, bool configs_global_only) {
-  if (!init_tesseract_lang_data(arg0, textbase, language, configs,
-                                configs_size, configs_global_only)) {
+    OcrEngineMode oem, char **configs, int configs_size,
+    bool configs_init_only) {
+  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
+                                configs_size, configs_init_only)) {
    return -1;
  }
-  start_recog(textbase);
+  // If only Cube will be used, skip loading Tesseract classifier's
+  // pre-trained templates.
+  bool init_tesseract_classifier =
+    (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
+     tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED);
+  // If only Cube will be used and if it has its own Unicharset,
+  // skip initializing permuter and loading Tesseract Dawgs.
+  bool init_dict =
+    !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
+      tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
+  program_editup(textbase, init_tesseract_classifier, init_dict);
  tessdata_manager.End();
  return 0;                      //Normal exit
 }

-// Init everything except the language model
-int Tesseract::init_tesseract_classifier(
-    const char *arg0, const char *textbase, const char *language,
-    char **configs, int configs_size, bool configs_global_only) {
-  if (!init_tesseract_lang_data (arg0, textbase, language, configs,
-                                 configs_size, configs_global_only)) {
-    return -1;
-  }
-  // Dont initialize the permuter.
-  program_editup(textbase, false);
-  tessdata_manager.End();
-  return 0;
-}
-
 // init the LM component
 int Tesseract::init_tesseract_lm(const char *arg0,
                   const char *textbase,
                   const char *language) {
-  init_tesseract_lang_data(arg0, textbase, language, NULL, 0, false);
-  getDict().init_permute();
+  if (!init_tesseract_lang_data(arg0, textbase, language,
+                                OEM_TESSERACT_ONLY, NULL, 0, false))
+    return -1;
+  getDict().Load();
  tessdata_manager.End();
  return 0;
 }
@ -226,42 +237,3 @@ enum CMD_EVENTS
 };

 }  // namespace tesseract
-
-#ifdef _TIFFIO_
-void read_tiff_image(TIFF* tif, IMAGE* image) {
-  tdata_t buf;
-  uint32 image_width, image_height;
-  uint16 photometric;
-  inT16 bpp;
-  inT16 samples_per_pixel = 0;
-  TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &image_width);
-  TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &image_height);
-  if (!TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp))
-    bpp = 1;  // Binary is default if no value provided.
-  TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &samples_per_pixel);
-  TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric);
-  if (samples_per_pixel > 1)
-    bpp *= samples_per_pixel;
-  // Tesseract's internal representation is 0-is-black,
-  // so if the photometric is 1 (min is black) then high-valued pixels
-  // are 1 (white), otherwise they are 0 (black).
-  uinT8 high_value = photometric == 1;
-  image->create(image_width, image_height, bpp);
-  IMAGELINE line;
-  line.init(image_width);
-
-  buf = _TIFFmalloc(TIFFScanlineSize(tif));
-  int bytes_per_line = (image_width*bpp + 7)/8;
-  uinT8* dest_buf = image->get_buffer();
-  // This will go badly wrong with one of the more exotic tiff formats,
-  // but the majority will work OK.
-  for (int y = 0; y < image_height; ++y) {
-    TIFFReadScanline(tif, buf, y);
-    memcpy(dest_buf, buf, bytes_per_line);
-    dest_buf += bytes_per_line;
-  }
-  if (high_value == 0)
-    invert_image(image);
-  _TIFFfree(buf);
-}
-#endif
--- a/ccmain/tessedit.h
+++ b/ccmain/tessedit.h
@ -20,9 +20,8 @@
 #ifndef           TESSEDIT_H
 #define           TESSEDIT_H

-#include          "tessclas.h"
-#include          "ocrclass.h"
-#include                    "pgedit.h"
+#include          "blobs.h"
+#include          "pgedit.h"
 #include          "notdll.h"

                                 //progress monitor
--- a/ccmain/tesseract_cube_combiner.cpp
+++ b/ccmain/tesseract_cube_combiner.cpp
@ -0,0 +1,308 @@
+/**********************************************************************
+ * File:        tesseract_cube_combiner.cpp
+ * Description: Implementation of the Tesseract & Cube results combiner Class
+ * Author:    Ahmad Abdulkader
+ * Created:   2008
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// The TesseractCubeCombiner class provides the functionality of combining
+// the recognition results of Tesseract and Cube at the word level
+
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <wctype.h>
+
+#include "tesseract_cube_combiner.h"
+
+#include "cube_object.h"
+#include "cube_reco_context.h"
+#include "cube_utils.h"
+#include "neural_net.h"
+#include "tesseractclass.h"
+#include "word_altlist.h"
+
+namespace tesseract {
+
+// Builds a combiner that uses the given Cube recognition context. No
+// combiner network is held until LoadCombinerNet() succeeds.
+TesseractCubeCombiner::TesseractCubeCombiner(CubeRecoContext *cube_cntxt)
+    : combiner_net_(NULL),
+      cube_cntxt_(cube_cntxt) {
+}
+
+// Frees the combiner network, if one was loaded. The Cube context pointer
+// is not deleted here.
+TesseractCubeCombiner::~TesseractCubeCombiner() {
+  // delete on a null pointer is a no-op, so no explicit guard is required.
+  delete combiner_net_;
+  combiner_net_ = NULL;
+}
+
+// Loads the combiner neural network for the context's language.
+// The file name is built as <data path><lang>.tesseract_cube.nn, with the
+// data path and language both taken from cube_cntxt_ (which must be set).
+// Returns true when a net with exactly 2 outputs has been stored in
+// combiner_net_. Returns false when the file cannot be opened (in which
+// case combiner_net_ is left untouched), or when the file cannot be parsed
+// or has a different output count (in which case combiner_net_ is NULL).
+bool TesseractCubeCombiner::LoadCombinerNet() {
+  ASSERT_HOST(cube_cntxt_);
+  // Compute the path of the combiner net
+  string data_path;
+  cube_cntxt_->GetDataFilePath(&data_path);
+  string net_file_name =  data_path + cube_cntxt_->Lang() +
+                          ".tesseract_cube.nn";
+
+  // Return false if file does not exist. The handle is only an existence
+  // probe; the net itself is read by NeuralNet::FromFile() below.
+  FILE *fp = fopen(net_file_name.c_str(), "r");
+  if (fp == NULL)
+    return false;
+  fclose(fp);
+
+  // Release any net left over from an earlier call so that reloading does
+  // not leak it.
+  delete combiner_net_;
+  combiner_net_ = NULL;
+
+  // Load and validate net
+  combiner_net_ = NeuralNet::FromFile(net_file_name);
+  if (combiner_net_ == NULL) {
+    tprintf("Could not read combiner net file %s\n", net_file_name.c_str());
+    return false;
+  }
+  // The combiner reads its result from a two-element output array, so any
+  // other output count is rejected.
+  if (combiner_net_->out_cnt() != 2) {
+    tprintf("Invalid combiner net file %s! Output count != 2\n",
+            net_file_name.c_str());
+    delete combiner_net_;
+    combiner_net_ = NULL;
+    return false;
+  }
+  return true;
+}
+
+// Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally
+// strips punc and/or normalizes case and then converts back.
+//   str:         UTF-8 input.
+//   remove_punc: drop every character for which iswpunct() is non-zero.
+//   norm_case:   lower-case every character for which iswalpha() is non-zero.
+// Returns the normalized string re-encoded as UTF-8.
+// NOTE(review): iswpunct/iswalpha/towlower depend on the current C locale,
+// so results for non-ASCII text may vary between environments.
+string TesseractCubeCombiner::NormalizeString(const string &str,
+                                              bool remove_punc,
+                                              bool norm_case) {
+  // convert to UTF32
+  string_32 str32;
+  CubeUtils::UTF8ToUTF32(str.c_str(), &str32);
+  // strip punc and normalize
+  string_32 new_str32;
+  // Unsigned index: matches the type returned by length() and avoids a
+  // signed/unsigned comparison.
+  for (size_t idx = 0; idx < str32.length(); ++idx) {
+    // if no punc removal is required or not a punctuation character
+    if (!remove_punc || iswpunct(str32[idx]) == 0) {
+      char_32 norm_char = str32[idx];
+      // normalize case if required
+      if (norm_case && iswalpha(norm_char)) {
+        norm_char = towlower(norm_char);
+      }
+      new_str32.push_back(norm_char);
+    }
+  }
+  // convert back to UTF8
+  string new_str;
+  CubeUtils::UTF32ToUTF8(new_str32.c_str(), &new_str);
+  return new_str;
+}
+
+// Compares two UTF-8 strings, optionally after stripping punctuation and/or
+// folding case through NormalizeString(). The return value is that of
+// string::compare(), i.e. 0 when the (possibly normalized) strings match.
+int TesseractCubeCombiner::CompareStrings(const string &str1,
+                                          const string &str2,
+                                          bool ignore_punc,
+                                          bool ignore_case) {
+  const bool needs_normalization = ignore_punc || ignore_case;
+  if (needs_normalization) {
+    const string lhs = NormalizeString(str1, ignore_punc, ignore_case);
+    const string rhs = NormalizeString(str2, ignore_punc, ignore_case);
+    return lhs.compare(rhs);
+  }
+  // Nothing to ignore: compare the raw bytes.
+  return str1.compare(str2);
+}
+
+// Check if a string is a valid Tess dict word or not.
+// The UTF-8 string is looked up through the Dict of the Tesseract object
+// reachable from cube_cntxt_; the word counts as valid when valid_word()
+// returns a value greater than zero.
+bool TesseractCubeCombiner::ValidWord(const string &str) {
+  return (cube_cntxt_->TesseractObject()->getDict().valid_word(str.c_str())
+          > 0);
+}
+
+// Public method for computing the combiner features. The agreement
+// output parameter will be true if both answers are identical,
+// and false otherwise.
+//
+// On success the feature vector holds, in this order:
+//   Tesseract confidence, Cube cost of the Tesseract string, rank of the
+//   Tesseract string in Cube's alternate list (AltCount() if absent),
+//   byte length of the Tesseract string, Tesseract-string-in-dictionary flag,
+//   [bigram cost of the Tesseract string], Cube cost of Cube's best string,
+//   Cube cost of Cube's next best string (WORST_COST if there is only one
+//   alternate), byte length of Cube's best string,
+//   Cube-string-in-dictionary flag, [bigram cost of Cube's best string],
+//   and three equality flags (ignoring case; ignoring punctuation; ignoring
+//   both).
+// The bracketed bigram entries are only present when cube_cntxt_->Bigrams()
+// is non-NULL, so the vector length depends on the loaded context.
+//
+// Returns false, with an empty feature vector, when the alternate list is
+// NULL or empty, or when Cube's best or second-best string is NULL or empty.
+//
+// Everything that reads cube_alt_list happens before the call to
+// cube_obj->WordCost(), because that call may replace the alternate list
+// held by cube_obj. Keep that ordering when editing this function.
+bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
+                                                    int tess_confidence,
+                                                    CubeObject *cube_obj,
+                                                    WordAltList *cube_alt_list,
+                                                    vector<double> *features,
+                                                    bool *agreement) {
+  features->clear();
+  *agreement = false;
+  if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0)
+    return false;
+
+  // Get Cube's best string; return false if empty
+  char_32 *cube_best_str32 = cube_alt_list->Alt(0);
+  if (cube_best_str32 == NULL || CubeUtils::StrLen(cube_best_str32) < 1)
+    return false;
+  string cube_best_str;
+  int cube_best_cost = cube_alt_list->AltCost(0);
+  int cube_best_bigram_cost = 0;
+  bool cube_best_bigram_cost_valid = true;
+  if (cube_cntxt_->Bigrams())
+    cube_best_bigram_cost = cube_cntxt_->Bigrams()->
+        Cost(cube_best_str32, cube_cntxt_->CharacterSet(),
+             &cube_cntxt_->TesseractObject()->unicharset);
+  else
+    cube_best_bigram_cost_valid = false;
+  CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str);
+
+  // Get Tesseract's UTF32 string
+  string_32 tess_str32;
+  CubeUtils::UTF8ToUTF32(tess_str.c_str(), &tess_str32);
+
+  // Compute agreement flag
+  *agreement = (tess_str.compare(cube_best_str) == 0);
+
+  // Get Cube's second best string; if empty, return false
+  char_32 *cube_next_best_str32 = NULL;
+  string cube_next_best_str;
+  int cube_next_best_cost = WORST_COST;
+  if (cube_alt_list->AltCount() > 1) {
+    cube_next_best_str32 = cube_alt_list->Alt(1);
+    if (cube_next_best_str32 == NULL ||
+        CubeUtils::StrLen(cube_next_best_str32) == 0) {
+      return false;
+    }
+    cube_next_best_cost = cube_alt_list->AltCost(1);
+    CubeUtils::UTF32ToUTF8(cube_next_best_str32, &cube_next_best_str);
+  }
+  // Rank of Tesseract's top result in Cube's alternate list. If the string
+  // is not in the list the loop runs to completion and the rank equals
+  // AltCount().
+  // NOTE(review): Alt(0) and Alt(1) are NULL-checked above but the entries
+  // visited here are not - confirm UTF32ToUTF8() tolerates a NULL input.
+  int tess_rank = 0;
+  for (tess_rank = 0; tess_rank < cube_alt_list->AltCount(); tess_rank++) {
+    string alt_str;
+    CubeUtils::UTF32ToUTF8(cube_alt_list->Alt(tess_rank), &alt_str);
+    if (alt_str == tess_str)
+      break;
+  }
+
+  // Cube's cost for tesseract's result. Note that this modifies the
+  // state of cube_obj, including its alternate list by calling RecognizeWord()
+  int tess_cost = cube_obj->WordCost(tess_str.c_str());
+  // Cube's bigram cost of Tesseract's string
+  int tess_bigram_cost = 0;
+  bool tess_bigram_cost_valid = true;
+  if (cube_cntxt_->Bigrams())
+    tess_bigram_cost = cube_cntxt_->Bigrams()->
+        Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet(),
+             &cube_cntxt_->TesseractObject()->unicharset);
+  else
+    tess_bigram_cost_valid = false;
+
+  // Tesseract confidence
+  features->push_back(tess_confidence);
+  // Cube cost of Tesseract string
+  features->push_back(tess_cost);
+  // Cube Rank of Tesseract string
+  features->push_back(tess_rank);
+  // length of Tesseract OCR string
+  features->push_back(tess_str.length());
+  // Tesseract OCR string in dictionary
+  features->push_back(ValidWord(tess_str));
+  if (tess_bigram_cost_valid) {
+    // bigram cost of Tesseract string
+    features->push_back(tess_bigram_cost);
+  }
+  // Cube cost of Cube best string
+  features->push_back(cube_best_cost);
+  // Cube cost of Cube next best string
+  features->push_back(cube_next_best_cost);
+  // length of Cube string
+  features->push_back(cube_best_str.length());
+  // Cube string in dictionary
+  features->push_back(ValidWord(cube_best_str));
+  if (cube_best_bigram_cost_valid) {
+    // bigram cost of Cube string
+    features->push_back(cube_best_bigram_cost);
+  }
+  // case-insensitive string comparison, including punctuation
+  int compare_nocase_punc = CompareStrings(cube_best_str.c_str(),
+                                           tess_str.c_str(), false, true);
+  features->push_back(compare_nocase_punc == 0);
+  // case-sensitive string comparison, ignoring punctuation
+  int compare_case_nopunc = CompareStrings(cube_best_str.c_str(),
+                                           tess_str.c_str(), true, false);
+  features->push_back(compare_case_nopunc == 0);
+  // case-insensitive string comparison, ignoring punctuation
+  int compare_nocase_nopunc = CompareStrings(cube_best_str.c_str(),
+                                             tess_str.c_str(), true, true);
+  features->push_back(compare_nocase_nopunc == 0);
+  return true;
+}
+
+// Combines the Tesseract result with the current state of cube_obj.
+// cube_obj serves two purposes: it supplies Cube's alternate list
+// (running RecognizeWord() if no list is available yet), and it is later
+// used to compute Cube's word cost for the Tesseract result. That
+// CubeObject::WordCost() call modifies the object's alternate list, so its
+// previous state is lost.
+// Returns 1.0 (Tesseract wins) whenever the combiner cannot be applied.
+float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res,
+                                            CubeObject *cube_obj) {
+  // Without a loaded net or a cube object there is nothing to combine.
+  const bool combiner_ready = (combiner_net_ != NULL && cube_obj != NULL);
+  if (!combiner_ready) {
+    tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
+            "Cube objects not initialized; defaulting to Tesseract\n");
+    return 1.0;
+  }
+
+  // Use the alternates cube_obj already holds; recognize only on demand.
+  WordAltList *alternates = cube_obj->AlternateList();
+  if (alternates == NULL) {
+    alternates = cube_obj->RecognizeWord();
+  }
+  const bool have_alternates =
+      (alternates != NULL && alternates->AltCount() > 0);
+  if (!have_alternates) {
+    tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
+            "Cube returned no results; defaulting to Tesseract\n");
+    return 1.0;
+  }
+  return CombineResults(tess_res, cube_obj, alternates);
+}
+
+// Combines the Tesseract result with an explicitly supplied alternate list.
+// cube_alt_list is expected to come from the CubeObject that recognized the
+// word. cube_obj may be that same instance or a separate one reserved for
+// the combiner; either way its alternate list is modified by an internal
+// call to RecognizeWord().
+// Returns the net's second output, or 1.0 (Tesseract wins) when the inputs
+// are unusable, feature computation fails, both engines agree, or the
+// forward pass fails.
+float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res,
+                                            CubeObject *cube_obj,
+                                            WordAltList *cube_alt_list) {
+  // A loaded net, a cube object and a non-empty alternate list are required.
+  const bool inputs_usable =
+      combiner_net_ != NULL && cube_obj != NULL &&
+      cube_alt_list != NULL && cube_alt_list->AltCount() > 0;
+  if (!inputs_usable) {
+    tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
+            "Cube result cannot be retrieved; defaulting to Tesseract\n");
+    return 1.0;
+  }
+
+  // Tesseract's best string and its confidence: certainty is scaled by 5,
+  // offset by 100 and clamped to the range [1, 100].
+  string tess_str = tess_res->best_choice->unichar_string().string();
+  const int scaled_certainty = static_cast<int>(
+      100 + (5 * tess_res->best_choice->certainty()));
+  int tess_confidence = MIN(100, MAX(1, scaled_certainty));
+
+  // Build the feature vector; a failure here means Tesseract wins.
+  vector<double> features;
+  bool agreement;
+  if (!ComputeCombinerFeatures(tess_str, tess_confidence, cube_obj,
+                               cube_alt_list, &features, &agreement)) {
+    return 1.0;
+  }
+  // Identical answers need no arbitration.
+  if (agreement) {
+    return 1.0;
+  }
+
+  // Run the net; its second output is returned as the probability of the
+  // tesseract class.
+  double net_out[2];
+  if (combiner_net_->FeedForward(&features[0], net_out)) {
+    return net_out[1];
+  }
+  return 1.0;
+}
+}
--- a/ccmain/tesseract_cube_combiner.h
+++ b/ccmain/tesseract_cube_combiner.h
@ -0,0 +1,103 @@
+/**********************************************************************
+ * File:        tesseract_cube_combiner.h
+ * Description: Declaration of the Tesseract & Cube results combiner Class
+ * Author:    Ahmad Abdulkader
+ * Created:   2008
+ *
+ * (C) Copyright 2008, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+// The TesseractCubeCombiner class provides the functionality of combining
+// the recognition results of Tesseract and Cube at the word level
+
+#ifndef TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
+#define TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
+
+#include <string>
+#include <vector>
+#include "pageres.h"
+
+#ifdef __MSW32__
+#include <windows.h>
+using namespace std;
+#endif
+
+#ifdef USE_STD_NAMESPACE
+using std::string;
+using std::vector;
+#endif
+
+namespace tesseract {
+
+class CubeObject;
+class NeuralNet;
+class CubeRecoContext;
+class WordAltList;
+
+// Combines the word-level recognition results of Tesseract and Cube.
+// A combiner neural network (loaded with LoadCombinerNet()) classifies a
+// feature vector built from both results; the CombineResults() methods
+// return the probability that the Tesseract result is the correct one.
+class TesseractCubeCombiner {
+ public:
+  // Stores the given context pointer; no combiner net is loaded yet.
+  explicit TesseractCubeCombiner(CubeRecoContext *cube_cntxt);
+  // Deletes the combiner net, if any.
+  virtual ~TesseractCubeCombiner();
+
+  // There are 2 public methods for combining the results of tesseract
+  // and cube. Both return the probability that the Tesseract result is
+  // correct. The difference between the two interfaces is in how the
+  // passed-in CubeObject is used.
+
+  // The CubeObject parameter is used for 2 purposes: 1) to retrieve
+  // cube's alt list, and 2) to compute cube's word cost for the
+  // tesseract result. Both uses may modify the state of the
+  // CubeObject (including the BeamSearch state) with a call to
+  // RecognizeWord().
+  // Returns 1.0 when no combiner net is loaded, cube_obj is NULL, or Cube
+  // produced no alternates.
+  float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj);
+
+  // The alt_list parameter is expected to have been extracted from the
+  // CubeObject that recognized the word to be combined. The cube_obj
+  // parameter passed in may be either that same instance or a separate
+  // instance to be used only by the combiner. In both cases its alternate
+  // list will be modified.
+  // Returns 1.0 when the inputs are unusable, feature computation fails,
+  // both engines agree, or the net cannot be evaluated.
+  float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj,
+                       WordAltList *alt_list);
+
+  // Public method for computing the combiner features. The agreement
+  // output parameter will be true if both answers are identical,
+  // false otherwise. Modifies the cube_alt_list, so no assumptions
+  // should be made about its state upon return.
+  // Returns false, leaving features empty, when the alternate list or
+  // Cube's best/second-best string is missing or empty.
+  bool ComputeCombinerFeatures(const string &tess_res,
+                               int tess_confidence,
+                               CubeObject *cube_obj,
+                               WordAltList *cube_alt_list,
+                               vector<double> *features,
+                               bool *agreement);
+
+  // Is the word valid according to Tesseract's language model
+  bool ValidWord(const string &str);
+
+  // Loads the combiner neural network from file, using cube_cntxt_
+  // to find path. Returns false if the file is missing, unreadable, or
+  // does not have exactly 2 outputs.
+  bool LoadCombinerNet();
+ private:
+  // Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally
+  // strips punc and/or normalizes case and then converts back
+  string NormalizeString(const string &str, bool remove_punc, bool norm_case);
+
+  // Compares 2 strings after optionally normalizing their case and/or
+  // stripping punctuation. Returns 0 when the resulting strings are equal.
+  int CompareStrings(const string &str1, const string &str2, bool ignore_punc,
+                     bool norm_case);
+
+  NeuralNet *combiner_net_;  // pointer to the combiner NeuralNet object
+  CubeRecoContext *cube_cntxt_;  // used for language ID and data paths
+};
+}
+
+#endif  // TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@ -19,6 +19,8 @@
 ///////////////////////////////////////////////////////////////////////

 #include "tesseractclass.h"
+#include "cube_reco_context.h"
+#include "tesseract_cube_combiner.h"
 #include "globals.h"

 // Include automatically generated configuration file if running autoconf.
@ -35,44 +37,373 @@ namespace tesseract {

 Tesseract::Tesseract()
  : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
-                "Take segmentation and labeling from box file"),
+                "Take segmentation and labeling from box file",
+                this->params()),
+    BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
+                "Conversion of word/line box file to char box file",
+                this->params()),
    BOOL_MEMBER(tessedit_train_from_boxes, false,
-                "Generate training data from boxed chars"),
+                "Generate training data from boxed chars", this->params()),
+    BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
+                "Generate more boxes from boxed chars", this->params()),
    BOOL_MEMBER(tessedit_dump_pageseg_images, false,
-               "Dump itermediate images made during page segmentation"),
+               "Dump intermediate images made during page segmentation",
+               this->params()),
    // The default for pageseg_mode is the old behaviour, so as not to
    // upset anything that relies on that.
-    INT_MEMBER(tessedit_pageseg_mode, 2,
-               "Page seg mode: 0=auto, 1=col, 2=block, 3=line, 4=word, 6=char"
-               " (Values from PageSegMode enum in baseapi.h)"),
-    INT_MEMBER(tessedit_accuracyvspeed, 0,
-               "Accuracy V Speed tradeoff: 0 fastest, 100 most accurate"
-               " (Values from AccuracyVSpeed enum in baseapi.h)"),
-    BOOL_MEMBER(tessedit_train_from_boxes_word_level, false,
-                "Generate training data from boxed chars at word level."),
+    INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
+               "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
+               " 5=line, 6=word, 7=char"
+               " (Values from PageSegMode enum in publictypes.h)",
+               this->params()),
+    INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
+                    "Which OCR engine(s) to run (Tesseract, Cube, both)."
+                    " Defaults to loading and running only Tesseract"
+                    " (no Cube,no combiner)."
+                    " (Values from OcrEngineMode enum in tesseractclass.h)",
+               this->params()),
    STRING_MEMBER(tessedit_char_blacklist, "",
-                  "Blacklist of chars not to recognize"),
+                  "Blacklist of chars not to recognize", this->params()),
    STRING_MEMBER(tessedit_char_whitelist, "",
-                  "Whitelist of chars to recognize"),
-    BOOL_MEMBER(global_tessedit_ambigs_training, false,
-                "Perform training for ambiguities"),
+                  "Whitelist of chars to recognize", this->params()),
+    BOOL_INIT_MEMBER(tessedit_ambigs_training, false,
+                "Perform training for ambiguities", this->params()),
+    STRING_MEMBER(tessedit_write_params_to_file, "",
+                  "Write all parameters to the given file.", this->params()),
+    BOOL_MEMBER(tessedit_adapt_to_char_fragments, true,
+                "Adapt to words that contain"
+                " a character composed from fragments", this->params()),
+    BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug"
+                " information for adaption", this->params()),
+    BOOL_MEMBER(applybox_rebalance, TRUE, "Drop dead", this->params()),
+    INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
+    INT_MEMBER(applybox_page, 0,
+               "Page number to apply boxes from", this->params()),
+    STRING_MEMBER(applybox_test_exclusions, "",
+                  "Chars ignored for testing", this->params()),
+    double_MEMBER(applybox_error_band, 0.15,
+                  "Err band as fract of xht", this->params()),
+    STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows"
+                  " this pattern in the image filename. The name of the image"
+                  " files are expected to be in the form"
+                  " [lang].[fontname].exp[num].tif", this->params()),
+    BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
+               "Learn both character fragments (as is done in the"
+               " special low exposure mode) as well as unfragmented"
+               " characters.", this->params()),
+    BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box"
+                " is assumed to contain ngrams. Only learn the ngrams"
+                " whose outlines overlap horizontally.", this->params()),
+    BOOL_MEMBER(tessedit_print_text, false,
+                "Write text to stdout", this->params()),
+    BOOL_MEMBER(tessedit_draw_words, false,
+                "Draw source words", this->params()),
+    BOOL_MEMBER(tessedit_draw_outwords, false,
+                "Draw output words", this->params()),
+    BOOL_MEMBER(tessedit_training_tess, false,
+                "Call Tess to learn blobs", this->params()),
+    BOOL_MEMBER(tessedit_dump_choices, false,
+                "Dump char choices", this->params()),
+    BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
+                "Try to improve fuzzy spaces", this->params()),
+    BOOL_MEMBER(tessedit_unrej_any_wd, false,
+                "Dont bother with word plausibility", this->params()),
+    BOOL_MEMBER(tessedit_fix_hyphens, true,
+                "Crunch double hyphens?", this->params()),
+    BOOL_MEMBER(tessedit_redo_xheight, true,
+                "Check/Correct x-height", this->params()),
+    BOOL_MEMBER(tessedit_enable_doc_dict, true,
+                "Add words to the document dictionary", this->params()),
+    BOOL_MEMBER(tessedit_debug_fonts, false,
+                "Output font info per char", this->params()),
+    BOOL_MEMBER(tessedit_debug_block_rejection, false,
+                "Block and Row stats", this->params()),
+    INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
+    BOOL_MEMBER(debug_acceptable_wds, false,
+                "Dump word pass/fail chk", this->params()),
+    STRING_MEMBER(chs_leading_punct, "('`\"",
+                  "Leading punctuation", this->params()),
+    STRING_MEMBER(chs_trailing_punct1, ").,;:?!",
+                  "1st Trailing punctuation", this->params()),
+    STRING_MEMBER(chs_trailing_punct2, ")'`\"",
+                  "2nd Trailing punctuation", this->params()),
+    double_MEMBER(quality_rej_pc, 0.08,
+                  "good_quality_doc lte rejection limit", this->params()),
+    double_MEMBER(quality_blob_pc, 0.0,
+                  "good_quality_doc gte good blobs limit", this->params()),
+    double_MEMBER(quality_outline_pc, 1.0,
+                  "good_quality_doc lte outline error limit", this->params()),
+    double_MEMBER(quality_char_pc, 0.95,
+                  "good_quality_doc gte good char limit", this->params()),
+    INT_MEMBER(quality_min_initial_alphas_reqd, 2,
+               "alphas in a good word", this->params()),
+    BOOL_MEMBER(tessedit_tess_adapt_to_rejmap, false,
+                "Use reject map to control Tesseract adaption", this->params()),
+    INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
+               "Adaptation decision algorithm for tess", this->params()),
+    BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
+                "Do minimal rejection on pass 1 output", this->params()),
+    BOOL_MEMBER(tessedit_test_adaption, false,
+                "Test adaption criteria", this->params()),
+    BOOL_MEMBER(tessedit_matcher_log, false,
+                "Log matcher activity", this->params()),
+    INT_MEMBER(tessedit_test_adaption_mode, 3,
+               "Adaptation decision algorithm for tess", this->params()),
+    BOOL_MEMBER(save_best_choices, false,
+                "Save the results of the recognition step (blob_choices)"
+                " within the corresponding WERD_CHOICE", this->params()),
+    BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
+    double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
+    double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
+    INT_MEMBER(cube_debug_level, 1, "Print cube debug info.", this->params()),
+    STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
+                  this->params()),
+    STRING_MEMBER(outlines_2, "ij!?%\":;",
+                  "Non standard number of outlines", this->params()),
+    BOOL_MEMBER(docqual_excuse_outline_errs, false,
+                "Allow outline errs in unrejection?", this->params()),
+    BOOL_MEMBER(tessedit_good_quality_unrej, true,
+                "Reduce rejection on good docs", this->params()),
+    BOOL_MEMBER(tessedit_use_reject_spaces, true,
+                "Reject spaces?", this->params()),
+    double_MEMBER(tessedit_reject_doc_percent, 65.00,
+                  "%rej allowed before rej whole doc", this->params()),
+    double_MEMBER(tessedit_reject_block_percent, 45.00,
+                  "%rej allowed before rej whole block", this->params()),
+    double_MEMBER(tessedit_reject_row_percent, 40.00,
+                "%rej allowed before rej whole row", this->params()),
+    double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
+                  "Number of row rejects in whole word rejects"
+                  " which prevents whole row rejection", this->params()),
+    BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
+                "Only rej partially rejected words in block rejection",
+                this->params()),
+    BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
+                "Only rej partially rejected words in row rejection",
+                this->params()),
+    BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
+                "Use word segmentation quality metric", this->params()),
+    BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
+                "Use word segmentation quality metric", this->params()),
+    INT_MEMBER(tessedit_preserve_min_wd_len, 2,
+               "Only preserve wds longer than this", this->params()),
+    BOOL_MEMBER(tessedit_row_rej_good_docs, true,
+                "Apply row rejection to good docs", this->params()),
+    double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
+                  "rej good doc wd if more than this fraction rejected",
+                  this->params()),
+    BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
+                "Reject all bad quality wds", this->params()),
+    BOOL_MEMBER(tessedit_debug_doc_rejection, false,
+                "Page stats", this->params()),
+    BOOL_MEMBER(tessedit_debug_quality_metrics, false,
+                "Output data to debug file", this->params()),
+    BOOL_MEMBER(bland_unrej, false,
+                "unrej potential with no checks", this->params()),
+    double_MEMBER(quality_rowrej_pc, 1.1,
+                  "good_quality_doc gte good char limit", this->params()),
+    BOOL_MEMBER(unlv_tilde_crunching, true,
+                "Mark v.bad words for tilde crunch", this->params()),
+    BOOL_MEMBER(crunch_early_merge_tess_fails, true,
+                "Before word crunch?", this->params()),
+    BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
+                "Take out ~^ early?", this->params()),
+    double_MEMBER(crunch_terrible_rating, 80.0,
+                  "crunch rating lt this", this->params()),
+    BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
+    double_MEMBER(crunch_poor_garbage_cert, -9.0,
+                  "crunch garbage cert lt this", this->params()),
+    double_MEMBER(crunch_poor_garbage_rate, 60,
+                  "crunch garbage rating lt this", this->params()),
+    double_MEMBER(crunch_pot_poor_rate, 40,
+                  "POTENTIAL crunch rating lt this", this->params()),
+    double_MEMBER(crunch_pot_poor_cert, -8.0,
+                  "POTENTIAL crunch cert lt this", this->params()),
+    BOOL_MEMBER(crunch_pot_garbage, true,
+                "POTENTIAL crunch garbage", this->params()),
+    double_MEMBER(crunch_del_rating, 60,
+                  "POTENTIAL crunch rating lt this", this->params()),
+    double_MEMBER(crunch_del_cert, -10.0,
+                  "POTENTIAL crunch cert lt this", this->params()),
+    double_MEMBER(crunch_del_min_ht, 0.7,
+                  "Del if word ht lt xht x this", this->params()),
+    double_MEMBER(crunch_del_max_ht, 3.0,
+                  "Del if word ht gt xht x this", this->params()),
+    double_MEMBER(crunch_del_min_width, 3.0,
+                  "Del if word width lt xht x this", this->params()),
+    double_MEMBER(crunch_del_high_word, 1.5,
+                  "Del if word gt xht x this above bl", this->params()),
+    double_MEMBER(crunch_del_low_word, 0.5,
+                  "Del if word gt xht x this below bl", this->params()),
+    double_MEMBER(crunch_small_outlines_size, 0.6,
+                  "Small if lt xht x this", this->params()),
+    INT_MEMBER(crunch_rating_max, 10,
+               "For adj length in rating per ch", this->params()),
+    INT_MEMBER(crunch_pot_indicators, 1,
+               "How many potential indicators needed", this->params()),
+    BOOL_MEMBER(crunch_leave_ok_strings, true,
+                "Dont touch sensible strings", this->params()),
+    BOOL_MEMBER(crunch_accept_ok, true,
+                "Use acceptability in okstring", this->params()),
+    BOOL_MEMBER(crunch_leave_accept_strings, false,
+                "Dont pot crunch sensible strings", this->params()),
+    BOOL_MEMBER(crunch_include_numerals, false,
+                "Fiddle alpha figures", this->params()),
+    INT_MEMBER(crunch_leave_lc_strings, 4,
+               "Dont crunch words with long lower case strings",
+               this->params()),
+    INT_MEMBER(crunch_leave_uc_strings, 4,
+               "Dont crunch words with long upper case strings",
+               this->params()),
+    INT_MEMBER(crunch_long_repetitions, 3,
+               "Crunch words with long repetitions", this->params()),
+    INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
+    INT_MEMBER(fixsp_non_noise_limit, 1,
+               "How many non-noise blbs either side?", this->params()),
+    double_MEMBER(fixsp_small_outlines_size, 0.28,
+                  "Small if lt xht x this", this->params()),
+    BOOL_MEMBER(tessedit_prefer_joined_punct, false,
+                "Reward punctuation joins", this->params()),
+    INT_MEMBER(fixsp_done_mode, 1,
+               "What constitutes done for spacing", this->params()),
+    INT_MEMBER(debug_fix_space_level, 0,
+               "Contextual fixspace debug", this->params()),
+    STRING_MEMBER(numeric_punctuation, ".,",
+                  "Punct. chs expected WITHIN numbers", this->params()),
+    INT_MEMBER(x_ht_acceptance_tolerance, 8,
+               "Max allowed deviation of blob top outside of font data",
+               this->params()),
+    INT_MEMBER(x_ht_min_change, 8,
+               "Min change in xht before actually trying it", this->params()),
+    BOOL_MEMBER(tessedit_write_block_separators, false,
+                "Write block separators in output", this->params()),
+    BOOL_MEMBER(tessedit_write_raw_output, false,
+                "Write raw stuff to name.raw", this->params()),
+    BOOL_MEMBER(tessedit_write_output, false,
+                "Write text to name.txt", this->params()),
+    BOOL_MEMBER(tessedit_write_ratings, false,
+                "Return ratings in IPEOCRAPI data", this->params()),
+    BOOL_MEMBER(tessedit_write_rep_codes, false,
+                "Write repetition char code", this->params()),
+    BOOL_MEMBER(tessedit_write_unlv, false,
+                "Write .unlv output file", this->params()),
+    BOOL_MEMBER(tessedit_create_hocr, false,
+                "Write .html hOCR output file", this->params()),
+    STRING_MEMBER(unrecognised_char, "|",
+                  "Output char for unidentified blobs", this->params()),
+    INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
+    INT_MEMBER(suspect_space_level, 100,
+               "Min suspect level for rejecting spaces", this->params()),
+    INT_MEMBER(suspect_short_words, 2,
+               "Dont Suspect dict wds longer than this", this->params()),
+    BOOL_MEMBER(suspect_constrain_1Il, false,
+                "UNLV keep 1Il chars rejected", this->params()),
+    double_MEMBER(suspect_rating_per_ch, 999.9,
+                  "Dont touch bad rating limit", this->params()),
+    double_MEMBER(suspect_accept_rating, -999.9,
+                  "Accept good rating limit", this->params()),
+    BOOL_MEMBER(tessedit_minimal_rejection, false,
+                "Only reject tess failures", this->params()),
+    BOOL_MEMBER(tessedit_zero_rejection, false,
+                "Dont reject ANYTHING", this->params()),
+    BOOL_MEMBER(tessedit_word_for_word, false,
+                "Make output have exactly one word per WERD", this->params()),
+    BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
+                "Dont reject ANYTHING AT ALL", this->params()),
+    BOOL_MEMBER(tessedit_consistent_reps, true,
+                "Force all rep chars the same", this->params()),
+    INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()),
+    INT_MEMBER(tessedit_ok_mode, 5,
+               "Acceptance decision algorithm", this->params()),
+    BOOL_MEMBER(tessedit_rejection_debug, false,
+                "Adaption debug", this->params()),
+    BOOL_MEMBER(tessedit_flip_0O, true,
+                "Contextual 0O O0 flips", this->params()),
+    double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
+                  "Aspect ratio dot/hyphen test", this->params()),
+    double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
+                  "Aspect ratio dot/hyphen test", this->params()),
+    BOOL_MEMBER(rej_trust_doc_dawg, false,
+                "Use DOC dawg in 11l conf. detector", this->params()),
+    BOOL_MEMBER(rej_1Il_use_dict_word, false,
+                "Use dictword test", this->params()),
+    BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
+                "Dont double check", this->params()),
+    BOOL_MEMBER(rej_use_tess_accepted, true,
+                "Individual rejection control", this->params()),
+    BOOL_MEMBER(rej_use_tess_blanks, true,
+                "Individual rejection control", this->params()),
+    BOOL_MEMBER(rej_use_good_perm, true,
+                "Individual rejection control", this->params()),
+    BOOL_MEMBER(rej_use_sensible_wd, false,
+                "Extend permuter check", this->params()),
+    BOOL_MEMBER(rej_alphas_in_number_perm, false,
+                "Extend permuter check", this->params()),
+    double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
+                  "if >this fract", this->params()),
+    INT_MEMBER(tessedit_image_border, 2,
+               "Rej blbs near image edge limit", this->params()),
+    STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
+                  "Allow NN to unrej", this->params()),
+    STRING_MEMBER(conflict_set_I_l_1, "Il1[]",
+                  "Il1 conflict set", this->params()),
+    INT_MEMBER(min_sane_x_ht_pixels, 8,
+               "Reject any x-ht lt or eq than this", this->params()),
+    BOOL_MEMBER(tessedit_create_boxfile, false,
+                "Output text with boxes", this->params()),
+    BOOL_MEMBER(tessedit_read_image, true,
+                "Ensure the image is read", this->params()),
+    INT_MEMBER(tessedit_serial_unlv, 0, "0->Whole page, 1->serial"
+               " no adapt, 2->serial with adapt", this->params()),
+    INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages"
+               ", else specific page to process", this->params()),
+    BOOL_MEMBER(tessedit_write_images, false,
+                "Capture the image from the IPE", this->params()),
+    BOOL_MEMBER(interactive_mode, false, "Run interactively?", this->params()),
+    STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
+    INT_MEMBER(testedit_match_debug, 0,
+               "Integer match debug ctrl", this->params()),
+    BOOL_MEMBER(tessedit_override_permuter, true,
+                "According to dict_word", this->params()),
+    INT_INIT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for"
+                    " TessdataManager functions.", this->params()),
+    double_MEMBER(min_orientation_margin, 12.0,
+                  "Min acceptable orientation margin", this->params()),
+    backup_config_file_(NULL),
    pix_binary_(NULL),
+    pix_grey_(NULL),
+    orig_image_changed_(false),
+    textord_(this),
+    right_to_left_(false),
    deskew_(1.0f, 0.0f),
    reskew_(1.0f, 0.0f),
-    hindi_image_(false) {
+    cube_cntxt_(NULL),
+    tess_cube_combiner_(NULL) {
 }

 Tesseract::~Tesseract() {
  Clear();
+  // Delete the cube objects, if any were created, and reset their pointers.
+  if (cube_cntxt_ != NULL) {
+    delete cube_cntxt_;
+    cube_cntxt_ = NULL;
+  }
+  if (tess_cube_combiner_ != NULL) {
+    delete tess_cube_combiner_;
+    tess_cube_combiner_ = NULL;
+  }
 }

 void Tesseract::Clear() {
 #ifdef HAVE_LIBLEPT
  if (pix_binary_ != NULL)
    pixDestroy(&pix_binary_);
+  if (pix_grey_ != NULL)  // grey image gets the same treatment as pix_binary_
+    pixDestroy(&pix_grey_);
 #endif
- deskew_ = FCOORD(1.0f, 0.0f);
- reskew_ = FCOORD(1.0f, 0.0f);
+  deskew_ = FCOORD(1.0f, 0.0f);  // same value the constructor initializes
+  reskew_ = FCOORD(1.0f, 0.0f);  // same value the constructor initializes
+  orig_image_changed_ = false;  // same value the constructor initializes
 }

 void Tesseract::SetBlackAndWhitelist() {
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
--- a/ccmain/tessio.h
+++ b/ccmain/tessio.h
@ -1,210 +0,0 @@
-/**********************************************************************
- * File:        tessio.h  (Formerly tessread.h)
- * Description: Read/write Tesseract format row files.
- * Author:		Ray Smith
- * Created:		Wed Oct 09 15:02:46 BST 1991
- *
- * (C) Copyright 1991, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-#ifndef           TESSIO_H
-#define           TESSIO_H
-
-#include          <stdio.h>
-#include          "tessclas.h"
-#include          "notdll.h"
-
-/** 
- * open read & close
- * @param name file name
- * @param topright corner
- */
-TEXTROW *get_tess_row_file(
-                           const char *name,
-                           TPOINT *topright
-                          );
-/** 
- * open read & close
- * @param name file name
- * @param topright corner
- */
-TBLOB *get_tess_blob_file(
-                          const char *name,
-                          TPOINT *topright
-                         );
-/** 
- * read row file
- * @param gphfd file to read
- * @param count number expected
- * @param imagesize size of image
- */
-TEXTROW *readrows(
-                  int gphfd,
-                  int count,
-                  TPOINT *imagesize
-                 );
-/** 
- * read some words
- * @param gphfd file to read
- * @param count number expected
- * @param row row it comes from
- * @param imagesize size of image
- */
-TWERD *readwords(
-                 int gphfd,
-                 int count,
-                 TEXTROW *row,
-                 TPOINT *imagesize
-                );
-/** 
- * read some blobs
- * @param gphfd file to read
- * @param count number expected
- * @param imagesize size of image
- */
-TBLOB *readblobs(
-                 int gphfd,
-                 int count,
-                 TPOINT *imagesize
-                );
-/** 
- * get a string
- * @param gphfd file to read
- * @param ratingspace size to read
- */
-char *readratings(
-                  int gphfd,
-                  int ratingspace
-                 );
-/** 
- * read some outlines
- * @param gphfd file to read
- * @param outlines array of ptrs
- * @param outlinecount no to read
- */
-void readoutlines(
-                  int gphfd,
-                  TESSLINE **outlines,
-                  int outlinecount
-                 );
-/** 
- * read with testing
- * @param fd file to read
- * @param start buffer to write
- * @param size amount to write
- * @param checkeof give error on eof?
- */
-int readgph(
-            int fd,
-            void *start,
-            int size,
-            int checkeof
-           );
-/** 
- * write a row
- * @param name file name
- * @param row row to write
- */
-void write_row(
-               FILE *name,
-               TEXTROW *row
-              );
-/** 
- * write special row
- * @param name file name
- * @param row row to write
- * @param wordcount number of words to go
- */
-void write_error_row(
-                     FILE *name,
-                     TEXTROW *row,
-                     int wordcount
-                    );
-/** 
- * write special blob
- * @param name file name
- * @param blob blob to write
- * @param charlist true chars
- * @param charcount number of true chars
- */
-void write_error_blob(
-                      FILE *name,
-                      TBLOB *blob,
-                      char *charlist,
-                      int charcount
-                     );
-/** 
- * write special word
- * @param name file name
- * @param word word to write
- * @param charlist true chars
- * @param charcount number of true chars
- */
-void write_error_word(
-                      FILE *name,
-                      TWERD *word,
-                      char *charlist,
-                      int charcount
-                     );
-/** 
- * write a blob
- * @param name file to write
- * @param blob blob to write
- */
-void writeblob(
-               FILE *name,
-               TBLOB *blob
-              );
-/** 
- * serialize
- * @param name file to write to
- * @param blob current blob
- * @param outline current outline
- * @param outlineno current serial no
- */
-void serial_outlines(
-                     FILE *name,
-                     TBLOB *blob,
-                     register TESSLINE *outline,
-                     int *outlineno
-                    );
-/** 
- * count loopsize
- * @param vector vectors to count
- */
-int countloop(
-              register BYTEVEC *vector
-             );
-/** 
- * get serial no
- * @param outline start of search
- * @param target outline to find
- * @param serial serial no so far
- */
-int outlineserial(
-                  register TESSLINE *outline,
-                  register TESSLINE *target,
-                  int serial
-                 );
-/** 
- * Interface to fwrite 
- * @param name file to write
- * @param start buffer to write
- * @param size amount to write
- */
-void writegph(
-              FILE *name,
-              void *start,
-              int size
-             );
-#endif
--- a/ccmain/tessvars.cpp
+++ b/ccmain/tessvars.cpp
@ -17,22 +17,9 @@
 *
 **********************************************************************/

+#include <stdio.h>
+
 #include "mfcpch.h"
-#include          "tessvars.h"
+#include  "tessvars.h"

-#define EXTERN
-
-EXTERN INT_VAR (tessedit_adapt_kludge, 0,
-"Use acceptable result or dangambigs");
-EXTERN BOOL_VAR (interactive_mode, FALSE, "Run interactively?");
-EXTERN BOOL_VAR (edit_variables, FALSE, "Variables Editor Window?");
-// xiaofan EXTERN STRING_VAR(file_type,".bl","Filename extension");
-EXTERN STRING_VAR (file_type, ".tif", "Filename extension");
-INT_VAR (testedit_match_debug, 0, "Integer match debug ctrl");
-EXTERN INT_VAR (tessedit_dangambigs_chop, FALSE,
-"Use UnicharAmbigs to direct chop");
-EXTERN INT_VAR (tessedit_dangambigs_assoc, FALSE,
-"Use UnicharAmbigs to direct assoc");
-
-EXTERN IMAGE page_image;         //image of page
-EXTERN FILE *debug_fp = stderr;           //write debug stuff here
+FILE *debug_fp = stderr;  // write debug stuff here
--- a/ccmain/tessvars.h
+++ b/ccmain/tessvars.h
@ -20,29 +20,10 @@
 #ifndef           TESSVARS_H
 #define           TESSVARS_H

-#include          "varable.h"
+#include <stdio.h>
+
 #include          "img.h"
-#include          "tordmain.h"
 #include          "notdll.h"

-extern INT_VAR_H (tessedit_adapt_kludge, 0,
-"Use acceptable result or dangambigs");
-extern BOOL_VAR_H (interactive_mode, FALSE, "Run interactively?");
-extern BOOL_VAR_H (edit_variables, FALSE, "Variables Editor Window?");
-//xiaofan extern STRING_VAR_H(file_type,".bl","Filename extension");
-extern STRING_VAR_H (file_type, ".tif", "Filename extension");
-extern INT_VAR_H (tessedit_truncate_wordchoice_log, 10,
-"Max words to keep in list");
-extern INT_VAR_H (testedit_match_debug, 0, "Integer match debug ctrl");
-extern INT_VAR_H (tessedit_truncate_chopper, 1,
-"Shorten chopper seam search");
-extern INT_VAR_H (tessedit_fix_sideways_chops, 1,
-"Fix sideways chop problem");
-extern INT_VAR_H (tessedit_dangambigs_chop, FALSE,
-"Use UnicharAmbigs to direct chop");
-extern INT_VAR_H (tessedit_dangambigs_assoc, FALSE,
-"Use UnicharAmbigs to direct assoc");
-
-extern IMAGE page_image;         //image of page
-extern FILE *debug_fp;           //write debug stuff here
+extern FILE *debug_fp;    // write debug stuff here
 #endif
--- a/ccmain/tfacep.h
+++ b/ccmain/tfacep.h
@ -17,45 +17,23 @@
 *
 **********************************************************************/

-#ifndef           TFACEP_H
-#define           TFACEP_H
+#ifndef TFACEP_H
+#define TFACEP_H

-#include          "hosthplb.h"
-#include          "tessclas.h"
-#include          "tessarray.h"
-#include          "tstruct.h"
-#include          "notdll.h"
-#include "choices.h"
+#include "hosthplb.h"
+#include "blobs.h"
+#include "tessarray.h"
+#include "tstruct.h"
+#include "notdll.h"
 #include "oldlist.h"
-#include "tface.h"
 #include "permute.h"
-#include "adaptmatch.h"
 #include "blobclass.h"
 #include "stopper.h"
 #include "associate.h"
 #include "chop.h"
-#include "expandblob.h"
-#include "tordvars.h"
-#include "metrics.h"
-#include "tface.h"
-#include "badwords.h"
 #include "structures.h"

 typedef void (*TESS_TESTER) (TBLOB *, BOOL8, char *, inT32, LIST);
-typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *, void *, TEXTROW *);
+typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *);

-extern TEXTROW normalized_row;
-extern int display_ratings;
-
-#if 0
-#define strsave(s)    \
-	((s) ?  \
-	((char*) strcpy ((char*)alloc_string (strlen(s)+1), s))  :  \
-	(NULL))
-#endif
-
-#define BOLD_ON				"&dB(s3B"
-#define BOLD_OFF			"&d@(s0B"
-#define UNDERLINE_ON		"&dD"
-#define UNDERLINE_OFF		"&d@"
 #endif
--- a/ccmain/tfacepp.cpp
+++ b/ccmain/tfacepp.cpp
@ -39,11 +39,6 @@
 #include          "reject.h"
 #include          "tesseractclass.h"

-#define EXTERN
-
-EXTERN BOOL_VAR (tessedit_override_permuter, TRUE, "According to dict_word");
-
-
 #define MAX_UNDIVIDED_LENGTH 24


@ -55,70 +50,52 @@ EXTERN BOOL_VAR (tessedit_override_permuter, TRUE, "According to dict_word");
 * Convert the output back to editor form.
 **********************************************************************/
 namespace tesseract {
-WERD_CHOICE *Tesseract::recog_word(                     //recog one owrd
-                                   WERD *word,          //word to do
-                                   DENORM *denorm,      //de-normaliser
-                                                        //matcher function
-                                   POLY_MATCHER matcher,
-                                   POLY_TESTER tester,  //tester function
-                                   POLY_TESTER trainer, //trainer function
-                                   BOOL8 testing,       //true if answer driven
-                                                        //raw result
-                                   WERD_CHOICE *&raw_choice,
-                                                        //list of blob lists
-                                   BLOB_CHOICE_LIST_CLIST *blob_choices,
-                                   WERD *&outword       //bln word output
-                                  ) {
-  WERD_CHOICE *word_choice;
-  uinT8 perm_type;
-  uinT8 real_dict_perm_type;
-
-  if (word->blob_list ()->empty ()) {
-    word_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
-                                  TOP_CHOICE_PERM, unicharset);
-    raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
-                                 TOP_CHOICE_PERM, unicharset);
-    outword = word->poly_copy (denorm->row ()->x_height ());
+void Tesseract::recog_word(WERD_RES *word,
+                           BLOB_CHOICE_LIST_CLIST *blob_choices) {
+  ASSERT_HOST(word->chopped_word->blobs != NULL);
+  recog_word_recursive(word, blob_choices);
+  word->SetupBoxWord();
+  if ((word->best_choice->length() != word->box_word->length()) ||
+      (word->best_choice->length() != blob_choices->length())) {
+    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
+            "Strlen=%d; #Blobs=%d; #Choices=%d\n",
+            word->best_choice->debug_string(unicharset).string(),
+            word->best_choice->length(), word->box_word->length(),
+            blob_choices->length());
  }
-  else
-    word_choice = recog_word_recursive (word, denorm, matcher, tester,
-      trainer, testing, raw_choice,
-      blob_choices, outword);
-  if ((word_choice->length() != outword->blob_list()->length()) ||
-      (word_choice->length() != blob_choices->length())) {
-    tprintf
-      ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
-      word_choice->debug_string(unicharset).string(),
-      word_choice->length(), outword->blob_list()->length(),
-      blob_choices->length());
-  }
-  ASSERT_HOST(word_choice->length() == outword->blob_list()->length());
-  ASSERT_HOST(word_choice->length() == blob_choices->length());
-
-  /* Copy any reject blobs into the outword */
-  outword->rej_blob_list()->deep_copy(word->rej_blob_list(), &PBLOB::deep_copy);
-
+  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
+  ASSERT_HOST(word->best_choice->length() == blob_choices->length());
  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
-    perm_type = word_choice->permuter();
+    uinT8 perm_type = word->best_choice->permuter();
    if ((perm_type != SYSTEM_DAWG_PERM) &&
        (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
-      real_dict_perm_type = dict_word(*word_choice);
+      uinT8 real_dict_perm_type = dict_word(*word->best_choice);
      if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
           (real_dict_perm_type == FREQ_DAWG_PERM) ||
           (real_dict_perm_type == USER_DAWG_PERM)) &&
-          (alpha_count(word_choice->unichar_string().string(),
-                      word_choice->unichar_lengths().string()) > 0)) {
-        word_choice->set_permuter (real_dict_perm_type);  // use dict perm
+          (alpha_count(word->best_choice->unichar_string().string(),
+                       word->best_choice->unichar_lengths().string()) > 0)) {
+        word->best_choice->set_permuter(real_dict_perm_type);  // use dict perm
      }
    }
-    if (tessedit_rejection_debug && perm_type != word_choice->permuter ()) {
-      tprintf ("Permuter Type Flipped from %d to %d\n",
-        perm_type, word_choice->permuter ());
+    if (tessedit_rejection_debug &&
+        perm_type != word->best_choice->permuter()) {
+      tprintf("Permuter Type Flipped from %d to %d\n",
+              perm_type, word->best_choice->permuter());
    }
  }
-  assert ((word_choice == NULL) == (raw_choice == NULL));
-  return word_choice;
+  // Tess-failure check, factored out from control.cpp.
+  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
+  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
+      strspn(word->best_choice->unichar_string().string(), " ") ==
+        word->best_choice->length()) {
+    word->tess_failed = true;
+    word->reject_map.initialise(word->box_word->length());
+    word->reject_map.rej_word_tess_failure();
+  } else {
+    word->tess_failed = false;
+  }
 }


@ -128,105 +105,65 @@ WERD_CHOICE *Tesseract::recog_word(                     //recog one owrd
 * Convert the word to tess form and pass it to the tess segmenter.
 * Convert the output back to editor form.
 **********************************************************************/
-WERD_CHOICE *
-Tesseract::recog_word_recursive(
-    WERD *word,                            // word to do
-    DENORM *denorm,                        // de-normaliser
-    POLY_MATCHER matcher,                  // matcher function
-    POLY_TESTER tester,                    // tester function
-    POLY_TESTER trainer,                   // trainer function
-    BOOL8 testing,                         // true if answer driven
-    WERD_CHOICE *&raw_choice,              // raw result
-    BLOB_CHOICE_LIST_CLIST *blob_choices,  // list of blob lists
-    WERD *&outword                         // bln word output
-    ) {
-  inT32 initial_blob_choice_len;
-  inT32 word_length;                      // no of blobs
-  STRING word_string;                     // converted from tess
-  STRING word_string_lengths;
-  BLOB_CHOICE_LIST_VECTOR *tess_ratings;  // tess results
-  TWERD *tessword;                        // tess format
-  BLOB_CHOICE_LIST_C_IT blob_choices_it;  // iterator
+void Tesseract::recog_word_recursive(WERD_RES *word,
+                                     BLOB_CHOICE_LIST_CLIST *blob_choices) {
+  int word_length = word->chopped_word->NumBlobs();  // no of blobs
+  if (word_length > MAX_UNDIVIDED_LENGTH) {
+    return split_and_recog_word(word, blob_choices);
+  }
+  int initial_blob_choice_len = blob_choices->length();
+  BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);

-  tess_matcher = matcher;           // install matcher
-  tess_tester = testing ? tester : NULL;
-  tess_trainer = testing ? trainer : NULL;
-  tess_denorm = denorm;
-  tess_word = word;
-  //      blob_matchers[1]=call_matcher;
-  if (word->blob_list ()->length () > MAX_UNDIVIDED_LENGTH) {
-    return split_and_recog_word (word, denorm, matcher, tester, trainer,
-      testing, raw_choice, blob_choices,
-      outword);
-  } else {
+  // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
+  BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
+  for (int i = 0; i < tess_ratings->length(); ++i) {
+    blob_choices_it.add_to_end(tess_ratings->get(i));
+  }
+  delete tess_ratings;
+
+  word_length = word->rebuild_word->NumBlobs();  // No of blobs in output.
+  // Pad raw_choice with spaces if needed.
+  if (word->raw_choice->length() < word_length) {
    UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
-    WERD_CHOICE *best_choice = new WERD_CHOICE();
-    raw_choice = new WERD_CHOICE();
-    initial_blob_choice_len = blob_choices->length();
-    tessword = make_tess_word (word, NULL);
-    tess_ratings = cc_recog(tessword, best_choice, raw_choice,
-                            testing && tester != NULL,
-                            testing && trainer != NULL,
-                            word->flag(W_EOL));
+    while (word->raw_choice->length() < word_length) {
+      word->raw_choice->append_unichar_id(space_id, 1, 0.0,
+                                          word->raw_choice->certainty());
+    }
+    word->raw_choice->populate_unichars(unicharset);
+  }

-    outword = make_ed_word (tessword, word);  // convert word
-    if (outword == NULL) {
-      outword = word->poly_copy (denorm->row ()->x_height ());
+  // Do sanity checks and minor fixes on best_choice.
+  if (word->best_choice->length() > word_length) {
+    word->best_choice->make_bad();  // should never happen
+    tprintf("recog_word: Discarded long string \"%s\""
+            " (%d characters vs %d blobs)\n",
+            word->best_choice->unichar_string().string(),
+            word->best_choice->length(), word_length);
+    tprintf("Word is at:");
+    word->word->bounding_box().print();
+  }
+  if (blob_choices->length() - initial_blob_choice_len != word_length) {
+    word->best_choice->make_bad();  // force rejection
+    tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
+            blob_choices->length(), word_length);
+    blob_choices_it.set_to_list(blob_choices);  // list of lists
+    while (blob_choices->length() - initial_blob_choice_len < word_length) {
+      blob_choices_it.add_to_end(new BLOB_CHOICE_LIST());  // add a fake one
+      tprintf("recog_word: Added dummy choice list\n");
    }
-    delete_word(tessword);  // get rid of it
-    word_length = outword->blob_list()->length();  // no of blobs
-
-    // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
-    blob_choices_it.set_to_list(blob_choices);
-    for (int i = 0; i < tess_ratings->length(); ++i) {
-      blob_choices_it.add_to_end(tess_ratings->get(i));
+    while (blob_choices->length() - initial_blob_choice_len > word_length) {
+      blob_choices_it.move_to_last(); // should never happen
+      delete blob_choices_it.extract();
+      tprintf("recog_word: Deleted choice list\n");
    }
-    delete tess_ratings;
-
-    // Pad raw_choice with spaces if needed.
-    if (raw_choice->length() < word_length) {
-      while (raw_choice->length() < word_length) {
-        raw_choice->append_unichar_id(space_id, 1, 0.0,
-                                      raw_choice->certainty());
-      }
-      raw_choice->populate_unichars(unicharset);
+  }
+  if (word->best_choice->length() < word_length) {
+    UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
+    while (word->best_choice->length() < word_length) {
+      word->best_choice->append_unichar_id(space_id, 1, 0.0,
+                                           word->best_choice->certainty());
    }
-
-    // Do sanity checks and minor fixes on best_choice.
-    if (best_choice->length() > word_length) {
-      tprintf("recog_word: Discarded long string \"%s\""
-              " (%d characters vs %d blobs)\n",
-              best_choice->unichar_string().string (),
-              best_choice->length(), word_length);
-      best_choice->make_bad();  // should never happen
-      tprintf("Word is at (%g,%g)\n",
-              denorm->origin(),
-              denorm->y(word->bounding_box().bottom(), 0.0));
-    }
-    if (blob_choices->length() - initial_blob_choice_len != word_length) {
-      best_choice->make_bad();  // force rejection
-      tprintf ("recog_word: Choices list len:%d; blob lists len:%d\n",
-        blob_choices->length(), word_length);
-      blob_choices_it.set_to_list(blob_choices);  // list of lists
-      while (blob_choices->length() - initial_blob_choice_len < word_length) {
-        blob_choices_it.add_to_end(new BLOB_CHOICE_LIST());  // add a fake one
-        tprintf("recog_word: Added dummy choice list\n");
-      }
-      while (blob_choices->length() - initial_blob_choice_len > word_length) {
-        blob_choices_it.move_to_last(); // should never happen
-        delete blob_choices_it.extract();
-        tprintf("recog_word: Deleted choice list\n");
-      }
-    }
-    if (best_choice->length() < word_length) {
-      while (best_choice->length() < word_length) {
-        best_choice->append_unichar_id(space_id, 1, 0.0,
-                                       best_choice->certainty());
-      }
-      best_choice->populate_unichars(unicharset);
-    }
-
-    return best_choice;
+    word->best_choice->populate_unichars(unicharset);
  }
 }

@ -234,143 +171,76 @@ Tesseract::recog_word_recursive(
 /**********************************************************************
 * split_and_recog_word
 *
- * Convert the word to tess form and pass it to the tess segmenter.
- * Convert the output back to editor form.
+ * Split the word into 2 smaller pieces at the largest gap.
+ * Recognize the pieces and stick the results back together.
 **********************************************************************/

-WERD_CHOICE *
-Tesseract::split_and_recog_word(                        //recog one owrd
-                                WERD *word,             //word to do
-                                DENORM *denorm,         //de-normaliser
-                                POLY_MATCHER matcher,   //matcher function
-                                POLY_TESTER tester,     //tester function
-                                POLY_TESTER trainer,    //trainer function
-                                BOOL8 testing,          //true if answer driven
-                                                        //raw result
-                                WERD_CHOICE *&raw_choice,
-                                                        //list of blob lists
-                                BLOB_CHOICE_LIST_CLIST *blob_choices,
-                                WERD *&outword          //bln word output
-                               ) {
-  //   inT32                                                      outword1_len;
-  //   inT32                                                      outword2_len;
-  WERD *first_word;              //poly copy of word
-  WERD *second_word;             //fabricated word
-  WERD *outword2;                //2nd output word
-  PBLOB *blob;
-  WERD_CHOICE *result;           //return value
-  WERD_CHOICE *result2;          //output of 2nd word
-  WERD_CHOICE *raw_choice2;      //raw version of 2nd
-  float gap;                     //blob gap
-  float bestgap;                 //biggest gap
-  PBLOB_LIST new_blobs;          //list of gathered blobs
-  PBLOB_IT blob_it;
-                                 //iterator
-  PBLOB_IT new_blob_it = &new_blobs;
-
-  first_word = word->poly_copy (denorm->row ()->x_height ());
-  blob_it.set_to_list (first_word->blob_list ());
-  bestgap = (float) -MAX_INT32;
-  while (!blob_it.at_last ()) {
-    blob = blob_it.data ();
-                                 //gap to next
-    gap = (float) blob_it.data_relative(1)->bounding_box().left() -
-        blob->bounding_box().right();
-    blob_it.forward ();
-    if (gap > bestgap) {
-      bestgap = gap;             //find biggest
-      new_blob_it = blob_it;     //save position
+void Tesseract::split_and_recog_word(WERD_RES *word,
+                                     BLOB_CHOICE_LIST_CLIST *blob_choices) {
+  // Find the biggest blob gap in the chopped_word.
+  int bestgap = -MAX_INT32;
+  TPOINT best_split_pt;
+  TBLOB* best_end = NULL;
+  TBLOB* prev_blob = NULL;
+  for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
+       blob = blob->next) {
+    if (prev_blob != NULL) {
+      TBOX prev_box = prev_blob->bounding_box();
+      TBOX blob_box = blob->bounding_box();
+      int gap = blob_box.left() - prev_box.right();
+      if (gap > bestgap) {
+        bestgap = gap;
+        best_end = prev_blob;
+        best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;
+        best_split_pt.y = (prev_box.top() + prev_box.bottom() +
+                           blob_box.top() + blob_box.bottom()) / 4;
+      }
    }
+    prev_blob = blob;
  }
-                                 //take 2nd half
-  new_blobs.assign_to_sublist (&new_blob_it, &blob_it);
-                                 //make it a word
-  second_word = new WERD (&new_blobs, 1, NULL);
-  ASSERT_HOST (word->blob_list ()->length () ==
-    first_word->blob_list ()->length () +
-    second_word->blob_list ()->length ());
+  ASSERT_HOST(best_end != NULL);

-  result = recog_word_recursive (first_word, denorm, matcher,
-    tester, trainer, testing, raw_choice,
-    blob_choices, outword);
-  delete first_word;             //done that one
-  result2 = recog_word_recursive (second_word, denorm, matcher,
-    tester, trainer, testing, raw_choice2,
-    blob_choices, outword2);
-  delete second_word;            //done that too
-  *result += *result2;           //combine ratings
-  delete result2;
-  *raw_choice += *raw_choice2;
-  delete raw_choice2;            //finished with it
-  //   outword1_len= outword->blob_list()->length();
-  //   outword2_len= outword2->blob_list()->length();
-  outword->join_on (outword2);   //join words
-  delete outword2;
-  //   if ( outword->blob_list()->length() != outword1_len + outword2_len )
-  //      tprintf( "Split&Recog: part1len=%d; part2len=%d; combinedlen=%d\n",
-  //                                outword1_len, outword2_len, outword->blob_list()->length() );
-  //   ASSERT_HOST( outword->blob_list()->length() == outword1_len + outword2_len );
-  return result;
+  // Make a copy of the word to put the 2nd half in.
+  WERD_RES* word2 = new WERD_RES(*word);
+  // Blow away the copied chopped_word, as we want to work with the blobs
+  // from the input chopped_word so the seam_arrays can be merged.
+  delete word2->chopped_word;
+  word2->chopped_word = new TWERD;
+  word2->chopped_word->blobs = best_end->next;
+  best_end->next = NULL;
+  // Make a new seam_array for each of the two words.
+  free_seam_list(word->seam_array);
+  word->seam_array = start_seam_list(word->chopped_word->blobs);
+  word2->seam_array = start_seam_list(word2->chopped_word->blobs);
+  // Recognize the first part of the word.
+  recog_word_recursive(word, blob_choices);
+  // Recognize the second part of the word.
+  recog_word_recursive(word2, blob_choices);
+  // Tack the word2 outputs onto the end of the word outputs.
+  // New blobs might have appeared on the end of word, so re-find its last blob.
+  for (best_end = word->chopped_word->blobs; best_end->next != NULL;
+       best_end = best_end->next);
+  best_end->next = word2->chopped_word->blobs;
+  TBLOB* blob;
+  for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);
+  blob->next = word2->rebuild_word->blobs;
+  word2->chopped_word->blobs = NULL;
+  word2->rebuild_word->blobs = NULL;
+  // Copy the seams onto the end of the word1 seam_array.
+  // Since the seam list is one element short, an empty seam marking the
+  // end of the last blob in the first word is needed first.
+  word->seam_array = add_seam(word->seam_array,
+                              new_seam(0.0, best_split_pt, NULL, NULL, NULL));
+  for (int i = 0; i < array_count(word2->seam_array); ++i) {
+    SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
+    array_value(word2->seam_array, i) = NULL;
+    word->seam_array = add_seam(word->seam_array, seam);
+  }
+  word->best_state += word2->best_state;
+  // Append the word choices.
+  *word->best_choice += *word2->best_choice;
+  *word->raw_choice += *word2->raw_choice;
+  delete word2;
 }

 }  // namespace tesseract
-
-/**********************************************************************
- * call_tester
- *
- * Called from Tess with a blob in tess form.
- * Convert the blob to editor form.
- * Call the tester setup by the segmenter in tess_tester.
- **********************************************************************/
-#if 0  // dead code
-void call_tester(                     //call a tester
-                 const STRING& filename,
-                 TBLOB *tessblob,     //blob to test
-                 BOOL8 correct_blob,  //true if good
-                 char *text,          //source text
-                 inT32 count,         //chars in text
-                 LIST result          //output of matcher
-                ) {
-  PBLOB *blob;                   //converted blob
-  BLOB_CHOICE_LIST ratings;      //matcher result
-
-  blob = make_ed_blob (tessblob);//convert blob
-  if (blob == NULL)
-    return;
-                                 //make it right type
-  convert_choice_list(result, ratings);
-  if (tess_tester != NULL)
-    (*tess_tester) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
-  delete blob;                   //don't need that now
-}
-#endif
-
-/**********************************************************************
- * call_train_tester
- *
- * Called from Tess with a blob in tess form.
- * Convert the blob to editor form.
- * Call the trainer setup by the segmenter in tess_trainer.
- **********************************************************************/
-#if 0  // dead code
-void call_train_tester(                     //call a tester
-                       const STRING& filename,
-                       TBLOB *tessblob,     //blob to test
-                       BOOL8 correct_blob,  //true if good
-                       char *text,          //source text
-                       inT32 count,         //chars in text
-                       LIST result          //output of matcher
-                      ) {
-  PBLOB *blob;                   //converted blob
-  BLOB_CHOICE_LIST ratings;      //matcher result
-
-  blob = make_ed_blob (tessblob);//convert blob
-  if (blob == NULL)
-    return;
-                                 //make it right type
-  convert_choice_list(result, ratings);
-  if (tess_trainer != NULL)
-    (*tess_trainer) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
-  delete blob;                   //don't need that now
-}
-#endif
--- a/ccmain/tfacepp.h
+++ b/ccmain/tfacepp.h
@ -20,15 +20,12 @@
 #ifndef           TFACEPP_H
 #define           TFACEPP_H

-#include          "varable.h"
 #include          "tstruct.h"
 #include          "ratngs.h"
-#include          "tessclas.h"
+#include          "blobs.h"
 #include          "notdll.h"
 #include          "tesseractclass.h"

-extern BOOL_VAR_H (tessedit_override_permuter, TRUE,
-"According to dict_word");
 void call_tester(                     //call a tester
                 TBLOB *tessblob,     //blob to test
                 BOOL8 correct_blob,  //true if good
--- a/ccmain/thresholder.cpp
+++ b/ccmain/thresholder.cpp
@ -230,6 +230,11 @@ void ImageThresholder::ThresholdToPix(Pix** pix) {
  }
 }

+// Common initialization shared between SetImage methods.
+void ImageThresholder::Init() {
+  SetRectangle(0, 0, image_width_, image_height_);
+}
+
 // Get a clone/copy of the source image rectangle.
 // The returned Pix must be pixDestroyed.
 // This function will be used in the future by the page layout analysis, and
@ -253,12 +258,24 @@ Pix* ImageThresholder::GetPixRect() {
  RawRectToPix(&raw_pix);
  return raw_pix;
 }
-#endif

-// Common initialization shared between SetImage methods.
-void ImageThresholder::Init() {
-  SetRectangle(0, 0, image_width_, image_height_);
+// Get a clone/copy of the source image rectangle, reduced to 8-bit greyscale.
+// The returned Pix must be pixDestroyed.
+// This function will be used in the future by the page layout analysis, and
+// the layout analysis that uses it will only be available with Leptonica,
+// so there is no raw equivalent.
+Pix* ImageThresholder::GetPixRectGrey() {
+  Pix* pix = GetPixRect();  // May have to be reduced to grey.
+  int depth = pixGetDepth(pix);
+  if (depth != 8) {
+    Pix* result = depth < 8 ? pixConvertTo8(pix, false)
+                            : pixConvertRGBToLuminance(pix);
+    pixDestroy(&pix);
+    return result;
+  }
+  return pix;
 }
+#endif

 // Otsu threshold the rectangle, taking everything except the image buffer
 // pointer from the class, to the output IMAGE.
--- a/ccmain/thresholder.h
+++ b/ccmain/thresholder.h
@ -66,7 +66,7 @@ class ImageThresholder {
  virtual void GetImageSizes(int* left, int* top, int* width, int* height,
                             int* imagewidth, int* imageheight);

-  /// Return true if HAVE_LIBLEPT and this thresholder implements the Pix
+  /// Return true if this thresholder implements the Pix
  /// interface.
  virtual bool HasThresholdToPix() const;

@ -75,11 +75,15 @@ class ImageThresholder {
    return image_bytespp_ >= 3;
  }

+  /// Returns true if the source image is binary.
+  bool IsBinary() const {
+    return image_bytespp_ == 0;
+  }
+
  /// Threshold the source image as efficiently as possible to the output
  /// tesseract IMAGE class.
  virtual void ThresholdToIMAGE(IMAGE* image);

-#ifdef HAVE_LIBLEPT
  /// Pix vs raw, which to use?
  /// Implementations should provide the ability to source and target Pix
  /// where possible. A future version of Tesseract may choose to use Pix
@ -101,7 +105,13 @@ class ImageThresholder {
  /// the layout analysis that uses it will only be available with Leptonica,
  /// so there is no raw equivalent.
  Pix* GetPixRect();
-#endif
+
+  /// Get a clone/copy of the source image rectangle, reduced to 8-bit greyscale.
+  /// The returned Pix must be pixDestroyed.
+  /// This function will be used in the future by the page layout analysis, and
+  /// the layout analysis that uses it will only be available with Leptonica,
+  /// so there is no raw equivalent.
+  Pix* GetPixRectGrey();

 protected:
  // ----------------------------------------------------------------------
@ -133,7 +143,6 @@ class ImageThresholder {
  /// output IMAGE.
  void CopyBinaryRectRawToIMAGE(IMAGE* image) const;

-#ifdef HAVE_LIBLEPT
  /// Otsu threshold the rectangle, taking everything except the image buffer
  /// pointer from the class, to the output Pix.
  void OtsuThresholdRectToPix(const unsigned char* imagedata,
@ -152,14 +161,11 @@ class ImageThresholder {

  /// Cut out the requested rectangle of the binary image to the output IMAGE.
  void CopyBinaryRectPixToIMAGE(IMAGE* image) const;
-#endif

 protected:
-#ifdef HAVE_LIBLEPT
  /// Clone or other copy of the source Pix.
  /// The pix will always be PixDestroy()ed on destruction of the class.
  Pix*                 pix_;
-#endif
  /// Exactly one of pix_ and image_data_ is not NULL.
  const unsigned char* image_data_;     //< Raw source image.

@ -178,4 +184,3 @@ class ImageThresholder {

 #endif  // TESSERACT_CCMAIN_THRESHOLDER_H__

-
--- a/ccmain/tstruct.cpp
+++ b/ccmain/tstruct.cpp
@ -18,370 +18,12 @@
 **********************************************************************/

 #include "mfcpch.h"
-
-#ifdef _MSC_VER
-#pragma warning(disable:4244)  // Conversion warnings
-#endif
-
-#include          "tfacep.h"
-#include          "tstruct.h"
-#include          "makerow.h"
-#include          "ocrblock.h"
-//#include "structures.h"
-
-static ERRCODE BADFRAGMENTS = "Couldn't find matching fragment ends";
-
-ELISTIZE (FRAGMENT)
-//extern /*"C"*/ oldoutline(TESSLINE*);
-/**********************************************************************
- * FRAGMENT::FRAGMENT
- *
- * Constructor for fragments.
- **********************************************************************/
-FRAGMENT::FRAGMENT (             //constructor
-EDGEPT * head_pt,                //start point
-EDGEPT * tail_pt                 //end point
-):head (head_pt->pos.x, head_pt->pos.y), tail (tail_pt->pos.x,
-tail_pt->pos.y) {
-  headpt = head_pt;              // save ptrs
-  tailpt = tail_pt;
-}
-
-// Helper function to make a fake PBLOB formed from the bounding box
-// of the given old-format outline.
-static PBLOB* MakeRectBlob(TESSLINE* ol) {
-  POLYPT_LIST poly_list;
-  POLYPT_IT poly_it = &poly_list;
-  FCOORD pos, vec;
-  POLYPT *polypt;
-
-  // Create points at each of the 4 corners of the rectangle in turn.
-  pos = FCOORD(ol->topleft.x, ol->topleft.y);
-  vec = FCOORD(0.0f, ol->botright.y - ol->topleft.y);
-  polypt = new POLYPT(pos, vec);
-  poly_it.add_after_then_move(polypt);
-  pos = FCOORD(ol->topleft.x, ol->botright.y);
-  vec = FCOORD(ol->botright.x - ol->topleft.x, 0.0f);
-  polypt = new POLYPT(pos, vec);
-  poly_it.add_after_then_move(polypt);
-  pos = FCOORD(ol->botright.x, ol->botright.y);
-  vec = FCOORD(0.0f, ol->topleft.y - ol->botright.y);
-  polypt = new POLYPT(pos, vec);
-  poly_it.add_after_then_move(polypt);
-  pos = FCOORD(ol->botright.x, ol->topleft.y);
-  vec = FCOORD(ol->topleft.x - ol->botright.x, 0.0f);
-  polypt = new POLYPT(pos, vec);
-  poly_it.add_after_then_move(polypt);
-
-  OUTLINE_LIST out_list;
-  OUTLINE_IT out_it = &out_list;
-  out_it.add_after_then_move(new OUTLINE(&poly_it));
-  return new PBLOB(&out_list);
-}
-
-/**********************************************************************
- * make_ed_word
- *
- * Make an editor format word from the tess style word.
- **********************************************************************/
-
-WERD *make_ed_word(                  //construct word
-                   TWERD *tessword,  //word to convert
-                   WERD *clone       //clone this one
-                  ) {
-  WERD *word;                    //converted word
-  TBLOB *tblob;                  //current blob
-  PBLOB *blob;                   //new blob
-  PBLOB_LIST blobs;              //list of blobs
-  PBLOB_IT blob_it = &blobs;     //iterator
-
-  for (tblob = tessword->blobs; tblob != NULL; tblob = tblob->next) {
-    blob = make_ed_blob (tblob);
-    if (blob == NULL && tblob->outlines != NULL) {
-      // Make a fake blob using the bounding box rectangle of the 1st outline.
-      blob = MakeRectBlob(tblob->outlines);
-    }
-    if (blob != NULL) {
-      blob_it.add_after_then_move (blob);
-    }
-  }
-  if (!blobs.empty ())
-    word = new WERD (&blobs, clone);
-  else
-    word = NULL;
-  return word;
-}
-
-
-/**********************************************************************
- * make_ed_blob
- *
- * Make an editor format blob from the tess style blob.
- **********************************************************************/
-
-PBLOB *make_ed_blob(                 //construct blob
-                    TBLOB *tessblob  //blob to convert
-                   ) {
-  TESSLINE *tessol;              //tess outline
-  FRAGMENT_LIST fragments;       //list of fragments
-  OUTLINE *outline;              //current outline
-  OUTLINE_LIST out_list;         //list of outlines
-  OUTLINE_IT out_it = &out_list; //iterator
-
-  for (tessol = tessblob->outlines; tessol != NULL; tessol = tessol->next) {
-                                 //stick in list
-    register_outline(tessol, &fragments);
-  }
-  while (!fragments.empty ()) {
-    outline = make_ed_outline (&fragments);
-    if (outline != NULL) {
-      out_it.add_after_then_move (outline);
-    }
-  }
-  if (out_it.empty())
-    return NULL;                 //couldn't do it
-  return new PBLOB (&out_list);  //turn to blob
-}
-
-
-/**********************************************************************
- * make_ed_outline
- *
- * Make an editor format outline from the list of fragments.
- **********************************************************************/
-
-OUTLINE *make_ed_outline(                     //constructoutline
-                         FRAGMENT_LIST *list  //list of fragments
-                        ) {
-  FRAGMENT *fragment;            //current fragment
-  EDGEPT *edgept;                //current point
-  ICOORD headpos;                //coords of head
-  ICOORD tailpos;                //coords of tail
-  FCOORD pos;                    //coords of edgept
-  FCOORD vec;                    //empty
-  POLYPT *polypt;                //current point
-  POLYPT_LIST poly_list;         //list of point
-  POLYPT_IT poly_it = &poly_list;//iterator
-  FRAGMENT_IT fragment_it = list;//fragment
-
-  headpos = fragment_it.data ()->head;
-  do {
-    fragment = fragment_it.data ();
-    edgept = fragment->headpt;   //start of segment
-    do {
-      pos = FCOORD (edgept->pos.x, edgept->pos.y);
-      vec = FCOORD (edgept->vec.x, edgept->vec.y);
-      polypt = new POLYPT (pos, vec);
-                                 //add to list
-      poly_it.add_after_then_move (polypt);
-      edgept = edgept->next;
-    }
-    while (edgept != fragment->tailpt);
-    tailpos = ICOORD (edgept->pos.x, edgept->pos.y);
-                                 //get rid of it
-    delete fragment_it.extract ();
-    if (tailpos != headpos) {
-      if (fragment_it.empty ()) {
-        return NULL;
-      }
-      fragment_it.forward ();
-                                 //find next segment
-      for (fragment_it.mark_cycle_pt (); !fragment_it.cycled_list () &&
-               fragment_it.data ()->head != tailpos;
-        fragment_it.forward ());
-      if (fragment_it.data ()->head != tailpos) {
-        // It is legitimate for the heads to not all match to tails,
-        // since not all combinations of seams always make sense.
-        for (fragment_it.mark_cycle_pt ();
-        !fragment_it.cycled_list (); fragment_it.forward ()) {
-          fragment = fragment_it.extract ();
-          delete fragment;
-        }
-        return NULL;             //can't do it
-      }
-    }
-  }
-  while (tailpos != headpos);
-  return new OUTLINE (&poly_it); //turn to outline
-}
-
-
-/**********************************************************************
- * register_outline
- *
- * Add the fragments in the given outline to the list
- **********************************************************************/
-
-void register_outline(                     //add fragments
-                      TESSLINE *outline,   //tess format
-                      FRAGMENT_LIST *list  //list to add to
-                     ) {
-  EDGEPT *startpt;               //start of outline
-  EDGEPT *headpt;                //start of fragment
-  EDGEPT *tailpt;                //end of fragment
-  FRAGMENT *fragment;            //new fragment
-  FRAGMENT_IT it = list;         //iterator
-
-  startpt = outline->loop;
-  do {
-    startpt = startpt->next;
-    if (startpt == NULL)
-      return;                    //illegal!
-  }
-  while (startpt->flags[0] == 0 && startpt != outline->loop);
-  headpt = startpt;
-  do
-  startpt = startpt->next;
-  while (startpt->flags[0] != 0 && startpt != headpt);
-  if (startpt->flags[0] != 0)
-    return;                      //all hidden!
-
-  headpt = startpt;
-  do {
-    tailpt = headpt;
-    do
-    tailpt = tailpt->next;
-    while (tailpt->flags[0] == 0 && tailpt != startpt);
-    fragment = new FRAGMENT (headpt, tailpt);
-    it.add_after_then_move (fragment);
-    while (tailpt->flags[0] != 0)
-      tailpt = tailpt->next;
-    headpt = tailpt;
-  }
-  while (tailpt != startpt);
-}
-
-
-/**********************************************************************
- * make_tess_row
- *
- * Make a fake row structure to pass to the tesseract matchers.
- **********************************************************************/
-
-void make_tess_row(                  //make fake row
-                   DENORM *denorm,   //row info
-                   TEXTROW *tessrow  //output row
-                  ) {
-  tessrow->baseline.segments = 1;
-  tessrow->baseline.xstarts[0] = -32767;
-  tessrow->baseline.xstarts[1] = 32767;
-  tessrow->baseline.quads[0].a = 0;
-  tessrow->baseline.quads[0].b = 0;
-  tessrow->baseline.quads[0].c = bln_baseline_offset;
-  tessrow->xheight.segments = 1;
-  tessrow->xheight.xstarts[0] = -32767;
-  tessrow->xheight.xstarts[1] = 32767;
-  tessrow->xheight.quads[0].a = 0;
-  tessrow->xheight.quads[0].b = 0;
-  tessrow->xheight.quads[0].c = bln_x_height + bln_baseline_offset;
-  tessrow->lineheight = bln_x_height;
-  if (denorm != NULL) {
-    tessrow->ascrise = denorm->row ()->ascenders () * denorm->scale ();
-    tessrow->descdrop = denorm->row ()->descenders () * denorm->scale ();
-  } else {
-    tessrow->ascrise = bln_baseline_offset;
-    tessrow->descdrop = -bln_baseline_offset;
-  }
-}
-
-
-/**********************************************************************
- * make_tess_word
- *
- * Convert the word to Tess format.
- **********************************************************************/
-
-TWERD *make_tess_word(              //convert word
-                      WERD *word,   //word to do
-                      TEXTROW *row  //fake row
-                     ) {
-  TWERD *tessword;               //tess format
-
-  tessword = newword ();         //use old allocator
-  tessword->row = row;           //give them something
-                                 //copy string
-  tessword->correct = strsave (word->text ());
-  tessword->guess = NULL;
-  tessword->blobs = make_tess_blobs (word->blob_list ());
-  tessword->blanks = 1;
-  tessword->blobcount = word->blob_list ()->length ();
-  tessword->next = NULL;
-  return tessword;
-}
-
-
-/**********************************************************************
- * make_tess_blobs
- *
- * Make Tess style blobs from a list of BLOBs.
- **********************************************************************/
-
-TBLOB *make_tess_blobs(                      //make tess blobs
-                       PBLOB_LIST *bloblist  //list to convert
-                      ) {
-  PBLOB_IT it = bloblist;        //iterator
-  PBLOB *blob;                   //current blob
-  TBLOB *head;                   //output list
-  TBLOB *tail;                   //end of list
-  TBLOB *tessblob;
-
-  head = NULL;
-  tail = NULL;
-  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
-    blob = it.data ();
-    tessblob = make_tess_blob (blob, TRUE);
-    if (head)
-      tail->next = tessblob;
-    else
-      head = tessblob;
-    tail = tessblob;
-  }
-  return head;
-}
-
-/**********************************************************************
- * make_rotated_tess_blob
- *
- * Make a single Tess style blob, applying the given rotation and
- * renormalizing.
- **********************************************************************/
-TBLOB *make_rotated_tess_blob(const DENORM* denorm, PBLOB *blob,
-                              BOOL8 flatten) {
-  if (denorm != NULL && denorm->block() != NULL &&
-      denorm->block()->classify_rotation().y() != 0.0) {
-    TBOX box = blob->bounding_box();
-    int src_width = box.width();
-    int src_height = box.height();
-    src_width = static_cast<int>(src_width / denorm->scale() + 0.5);
-    src_height = static_cast<int>(src_height / denorm->scale() + 0.5);
-    int x_middle = (box.left() + box.right()) / 2;
-    int y_middle = (box.top() + box.bottom()) / 2;
-    PBLOB* rotated_blob = PBLOB::deep_copy(blob);
-    rotated_blob->move(FCOORD(-x_middle, -y_middle));
-    rotated_blob->rotate(denorm->block()->classify_rotation());
-    ICOORD median_size = denorm->block()->median_size();
-    int tolerance = median_size.x() / 8;
-    // TODO(dsl/rays) find a better normalization solution. In the mean time
-    // make it work for CJK by normalizing for Cap height in the same way
-    // as is applied in compute_block_xheight when the row is presumed to
-    // be ALLCAPS, i.e. the x-height is the fixed fraction
-    // blob height * textord_merge_x / (textord_merge_x + textord_merge_asc)
-    if (NearlyEqual(src_width, static_cast<int>(median_size.x()), tolerance) &&
-        NearlyEqual(src_height, static_cast<int>(median_size.y()), tolerance)) {
-      float target_height = bln_x_height * (textord_merge_x + textord_merge_asc)
-                          / textord_merge_x;
-      rotated_blob->scale(target_height / box.width());
-      rotated_blob->move(FCOORD(0.0f,
-                                bln_baseline_offset -
-                                  rotated_blob->bounding_box().bottom()));
-    }
-    TBLOB* result = make_tess_blob(rotated_blob, flatten);
-    delete rotated_blob;
-    return result;
-  } else {
-    return make_tess_blob(blob, flatten);
-  }
-}
+#include "ccstruct.h"
+#include "helpers.h"
+#include "tfacep.h"
+#include "tstruct.h"
+#include "makerow.h"
+#include "ocrblock.h"

 /**********************************************************************
 * make_tess_blob
@ -389,24 +31,9 @@ TBLOB *make_rotated_tess_blob(const DENORM* denorm, PBLOB *blob,
 * Make a single Tess style blob
 **********************************************************************/

-TBLOB *make_tess_blob(               //make tess blob
-                      PBLOB *blob,   //blob to convert
-                      BOOL8 flatten  //flatten outline structure
-                     ) {
-  inT32 index;
-  TBLOB *tessblob;
-
-  tessblob = newblob ();
-  tessblob->outlines = (struct olinestruct *)
-    make_tess_outlines (blob->out_list (), flatten);
-  for (index = 0; index < TBLOBFLAGS; index++)
-    tessblob->flags[index] = 0;  //!!
-  tessblob->correct = 0;
-  tessblob->guess = 0;
-  for (index = 0; index < MAX_WO_CLASSES; index++) {
-    tessblob->classes[index] = 0;
-    tessblob->values[index] = 0;
-  }
+TBLOB *make_tess_blob(PBLOB *blob) {
+  TBLOB* tessblob = new TBLOB;
+  tessblob->outlines = make_tess_outlines(blob->out_list(), false);
  tessblob->next = NULL;
  return tessblob;
 }
@ -418,10 +45,8 @@ TBLOB *make_tess_blob(               //make tess blob
 * Make Tess style outlines from a list of OUTLINEs.
 **********************************************************************/

-TESSLINE *make_tess_outlines(                            //make tess outlines
-                             OUTLINE_LIST *outlinelist,  //list to convert
-                             BOOL8 flatten               //flatten outline structure
-                            ) {
+TESSLINE *make_tess_outlines(OUTLINE_LIST *outlinelist,  // List to convert.
+                             bool is_holes) {  // These are hole outlines.
  OUTLINE_IT it = outlinelist;   //iterator
  OUTLINE *outline;              //current outline
  TESSLINE *head;                //output list
@ -430,31 +55,21 @@ TESSLINE *make_tess_outlines(                            //make tess outlines

  head = NULL;
  tail = NULL;
-  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
-    outline = it.data ();
-    tessoutline = newoutline ();
-    tessoutline->compactloop = NULL;
-    tessoutline->loop = make_tess_edgepts (outline->polypts (),
-      tessoutline->topleft,
-      tessoutline->botright);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    outline = it.data();
+    tessoutline = new TESSLINE;
+    tessoutline->loop = make_tess_edgepts(outline->polypts(),
+                                          tessoutline->topleft,
+                                          tessoutline->botright);
    if (tessoutline->loop == NULL) {
-      oldoutline(tessoutline);
+      delete tessoutline;
      continue;
    }
    tessoutline->start = tessoutline->loop->pos;
-    tessoutline->node = NULL;
    tessoutline->next = NULL;
-    tessoutline->child = NULL;
-    if (!outline->child ()->empty ()) {
-      if (flatten)
-        tessoutline->next = (struct olinestruct *)
-          make_tess_outlines (outline->child (), flatten);
-      else {
-        tessoutline->next = NULL;
-        tessoutline->child = (struct olinestruct *)
-          make_tess_outlines (outline->child (), flatten);
-      }
-    }
+    tessoutline->is_hole = is_holes;
+    if (!outline->child()->empty())
+      tessoutline->next = make_tess_outlines(outline->child(), true);
    else
      tessoutline->next = NULL;
    if (head)
@ -492,22 +107,17 @@ EDGEPT *make_tess_edgepts(                          //make tess edgepts
  tl.y = -MAX_INT16;
  br.x = -MAX_INT16;
  br.y = MAX_INT16;
-  for (it.mark_cycle_pt (); !it.cycled_list ();) {
-    edgept = it.data ();
-    tessedgept = newedgept ();
-    tessedgept->pos.x = (inT16) edgept->pos.x ();
-    tessedgept->pos.y = (inT16) edgept->pos.y ();
-    if (tessedgept->pos.x < tl.x)
-      tl.x = tessedgept->pos.x;
-    if (tessedgept->pos.x > br.x)
-      br.x = tessedgept->pos.x;
-    if (tessedgept->pos.y > tl.y)
-      tl.y = tessedgept->pos.y;
-    if (tessedgept->pos.y < br.y)
-      br.y = tessedgept->pos.y;
-    if (head != NULL && tessedgept->pos.x == tail->pos.x
-    && tessedgept->pos.y == tail->pos.y) {
-      oldedgept(tessedgept);
+  for (it.mark_cycle_pt(); !it.cycled_list ();) {
+    edgept = it.data();
+    tessedgept = new EDGEPT;
+    tessedgept->pos.x = (inT16) edgept->pos.x();
+    tessedgept->pos.y = (inT16) edgept->pos.y();
+    UpdateRange(tessedgept->pos.x, &tl.x, &br.x);
+    UpdateRange(tessedgept->pos.y, &br.y, &tl.y);
+    if (head != NULL &&
+        tessedgept->pos.x == tail->pos.x &&
+        tessedgept->pos.y == tail->pos.y) {
+      delete tessedgept;
    }
    else {
      for (index = 0; index < EDGEPTFLAGS; index++)
@ -530,7 +140,7 @@ EDGEPT *make_tess_edgepts(                          //make tess edgepts
  tail->vec.x = head->pos.x - tail->pos.x;
  tail->vec.y = head->pos.y - tail->pos.y;
  if (head == tail) {
-    oldedgept(head);
+    delete head;
    return NULL;                 //empty
  }
  return head;
--- a/ccmain/tstruct.h
+++ b/ccmain/tstruct.h
@ -21,62 +21,13 @@
 #define           TSTRUCT_H

 #include          "werd.h"
-#include          "tessclas.h"
+#include          "blobs.h"
 #include          "ratngs.h"
 #include          "notdll.h"

-class FRAGMENT:public ELIST_LINK
-{
-  public:
-    FRAGMENT() {  //constructor
-    }
-    FRAGMENT(EDGEPT *head_pt,   //start
-             EDGEPT *tail_pt);  //end
-
-    ICOORD head;                 //coords of start
-    ICOORD tail;                 //coords of end
-    EDGEPT *headpt;              //start point
-    EDGEPT *tailpt;              //end point
-
-    NEWDELETE2 (FRAGMENT)
-};
-
-ELISTIZEH (FRAGMENT)
-WERD *make_ed_word(                  //construct word
-                   TWERD *tessword,  //word to convert
-                   WERD *clone       //clone this one
-                  );
-PBLOB *make_ed_blob(                 //construct blob
-                    TBLOB *tessblob  //blob to convert
-                   );
-OUTLINE *make_ed_outline(                     //constructoutline
-                         FRAGMENT_LIST *list  //list of fragments
-                        );
-void register_outline(                     //add fragments
-                      TESSLINE *outline,   //tess format
-                      FRAGMENT_LIST *list  //list to add to
-                     );
-void make_tess_row(                  //make fake row
-                   DENORM *denorm,   //row info
-                   TEXTROW *tessrow  //output row
-                  );
-TWERD *make_tess_word(              //convert owrd
-                      WERD *word,   //word to do
-                      TEXTROW *row  //fake row
-                     );
-TBLOB *make_tess_blobs(                      //make tess blobs
-                       PBLOB_LIST *bloblist  //list to convert
-                      );
-TBLOB *make_rotated_tess_blob(const DENORM* denorm, PBLOB *blob,
-                              BOOL8 flatten);
-TBLOB *make_tess_blob(               //make tess blob
-                      PBLOB *blob,   //blob to convert
-                      BOOL8 flatten  //flatten outline structure
-                     );
-TESSLINE *make_tess_outlines(                            //make tess outlines
-                             OUTLINE_LIST *outlinelist,  //list to convert
-                             BOOL8 flatten               //flatten outline structure
-                            );
+TBLOB *make_tess_blob(PBLOB *blob);
+TESSLINE *make_tess_outlines(OUTLINE_LIST *outlinelist,  // List to convert
+                             bool is_holes);  // These are hole outlines.
 EDGEPT *make_tess_edgepts(                          //make tess edgepts
                          POLYPT_LIST *edgeptlist,  //list to convert
                          TPOINT &tl,               //bounding box
--- a/ccmain/varabled.h
+++ b/ccmain/varabled.h
@ -1,139 +0,0 @@
-///////////////////////////////////////////////////////////////////////
-// File:        varabled.cpp
-// Description: Variables Editor
-// Author:      Joern Wanke
-// Created:     Wed Jul 18 10:05:01 PDT 2007
-//
-// (C) Copyright 2007, Google Inc.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-///////////////////////////////////////////////////////////////////////
-
-/**
- * @file varabled.h
- * The variables editor is used to edit all the variables used within
- * tesseract from the ui.
- */
-#ifndef GRAPHICS_DISABLED
-#ifndef VARABLED_H
-#define VARABLED_H
-
-#include "elst.h"
-#include "scrollview.h"
-#include "varable.h"
-#include "tesseractclass.h"
-
-class SVMenuNode;
-
-/** A list of all possible variable types used. */
-enum VarType {
-  VT_INTEGER,
-  VT_BOOLEAN,
-  VT_STRING,
-  VT_DOUBLE
-};
-
-/**
- * A rather hackish helper structure which can take any kind of variable input
- * (defined by VarType) and do a couple of common operations on them, like
- * comparisond or getting its value. It is used in the context of the
- * VariablesEditor as a bridge from the internal tesseract variables to the
- * ones displayed by the ScrollView server.
- */
-class VariableContent : public ELIST_LINK {
- public:
-  /** Compare two VC objects by their name. */
-  static int Compare(const void* v1, const void* v2);
-
-  /** Gets a VC object identified by its ID. */
-  static VariableContent* GetVariableContentById(int id);
-
-  /** Constructors for the various VarTypes. */
-  VariableContent() {
-  }
-  VariableContent(STRING_VARIABLE* it);
-  VariableContent(INT_VARIABLE* it);
-  VariableContent(BOOL_VARIABLE* it);
-  VariableContent(double_VARIABLE* it);
-
-
-  /** Getters and Setters. */
-  void SetValue(const char* val);
-  const char* GetValue() const;
-  const char* GetName() const;
-  const char* GetDescription() const;
-
-  int GetId() { return my_id_; }
-  bool HasChanged() { return changed_; }
-
- private:
-  /** The unique ID of this VC object. */
-  int my_id_;
-  /** Whether the variable was changed_ and thus needs to be rewritten. */
-  bool changed_;
-  /** The actual vartype of this VC object. */
-  VarType var_type_;
-
-  STRING_VARIABLE* sIt;
-  INT_VARIABLE* iIt;
-  BOOL_VARIABLE* bIt;
-  double_VARIABLE* dIt;
-};
-
-ELISTIZEH(VariableContent)
-
-/**
- * The variables editor enables the user to edit all the variables used within
- * tesseract. It can be invoked on its own, but is supposed to be invoked by
- * the program editor.
- */
-class VariablesEditor : public SVEventHandler {
- public:
-  /**
-   * Integrate the variables editor as popupmenu into the existing scrollview
-   * window (usually the pg editor). If sv == null, create a new empty
-   * empty window and attach the variables editor to that window (ugly).
-   */
-  VariablesEditor(const tesseract::Tesseract*, ScrollView* sv = NULL);
-
-  /** Event listener. Waits for SVET_POPUP events and processes them. */
-  void Notify(const SVEvent* sve);
-
- private:
-  /**
-   * Gets the up to the first 3 prefixes from s (split by _).
-   * For example, tesseract_foo_bar will be split into tesseract, foo, and bar.
-   */
-  void GetPrefixes(const char* s, STRING* level_one,
-                   STRING* level_two, STRING* level_three);
-
-  /**
-   * Gets the first n words (split by _) and puts them in t.
-   * For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
-   */
-  void GetFirstWords(const char *s,  // source string
-                     int n,          // number of words
-                     char *t);       // target string
-
-  /**
-   * Find all editable variables used within tesseract and create a
-   * SVMenuNode tree from it.
-   */
-  SVMenuNode *BuildListOfAllLeaves();
-
-  /** Write all (changed_) variables to a config file. */
-  void WriteVars(char* filename, bool changes_only);
-
-  ScrollView* sv_window_;
-};
-
-#endif
-#endif
--- a/ccmain/werdit.cpp
+++ b/ccmain/werdit.cpp
@ -1,3 +1,4 @@
+
 /**********************************************************************
 * File:        werdit.cpp  (Formerly wordit.c)
 * Description: An iterator for passing over all the words in a document.
@ -18,99 +19,7 @@
 **********************************************************************/

 #include "mfcpch.h"
-#include          "werdit.h"
-
-#define EXTERN
-
-//EXTERN BOOL_VAR(wordit_linearc,FALSE,"Pass poly of linearc to Tess");
-
-/**********************************************************************
- * WERDIT::start_page
- *
- * Get ready to iterate over the page by setting the iterators.
- **********************************************************************/
-
-void WERDIT::start_page(                        //set iterators
-                        BLOCK_LIST *block_list  //blocks to check
-                       ) {
-  block_it.set_to_list (block_list);
-  block_it.mark_cycle_pt ();
-  do {
-    while (block_it.data ()->row_list ()->empty ()
-    && !block_it.cycled_list ()) {
-      block_it.forward ();
-    }
-    if (!block_it.data ()->row_list ()->empty ()) {
-      row_it.set_to_list (block_it.data ()->row_list ());
-      row_it.mark_cycle_pt ();
-      while (row_it.data ()->word_list ()->empty ()
-      && !row_it.cycled_list ()) {
-        row_it.forward ();
-      }
-      if (!row_it.data ()->word_list ()->empty ()) {
-        word_it.set_to_list (row_it.data ()->word_list ());
-        word_it.mark_cycle_pt ();
-      }
-    }
-  }
-  while (!block_it.cycled_list () && row_it.data ()->word_list ()->empty ());
-}
-
-
-/**********************************************************************
- * WERDIT::forward
- *
- * Give the next word on the page, or NULL if none left.
- * This code assumes all rows to be non-empty, but blocks are allowed
- * to be empty as eventually we will have non-text blocks.
- * The output is always a copy and needs to be deleted by somebody.
- **********************************************************************/
-
-WERD *WERDIT::forward() {  //use iterators
-  WERD *word;                    //actual word
-  //      WERD                                                    *larc_word;                                                     //linearc copy
-  WERD *result;                  //output word
-  ROW *row;                      //row of word
-
-  if (word_it.cycled_list ()) {
-    return NULL;                 //finished page
-  }
-  else {
-    word = word_it.data ();
-    row = row_it.data ();
-    word_it.forward ();
-    if (word_it.cycled_list ()) {
-      row_it.forward ();         //finished row
-      if (row_it.cycled_list ()) {
-        do {
-          block_it.forward ();   //finished block
-          if (!block_it.cycled_list ()) {
-            row_it.set_to_list (block_it.data ()->row_list ());
-            row_it.mark_cycle_pt ();
-          }
-        }
-                                 //find non-empty block
-        while (!block_it.cycled_list ()
-          && row_it.cycled_list ());
-      }
-      if (!row_it.cycled_list ()) {
-        word_it.set_to_list (row_it.data ()->word_list ());
-        word_it.mark_cycle_pt ();
-      }
-    }
-
-    //              if (wordit_linearc && !word->flag(W_POLYGON))
-    //              {
-    //                      larc_word=word->larc_copy(row->x_height());
-    //                      result=larc_word->poly_copy(row->x_height());
-    //                      delete larc_word;
-    //              }
-    //              else
-    result = word->poly_copy (row->x_height ());
-    return result;
-  }
-}
-
+#include "werdit.h"

 /**********************************************************************
 * make_pseudo_word
@ -119,74 +28,33 @@ WERD *WERDIT::forward() {  //use iterators
 * The word is always a copy and needs to be deleted.
 **********************************************************************/

-WERD *make_pseudo_word(                         //make fake word
-                       BLOCK_LIST *block_list,  //blocks to check //block of selection
+WERD *make_pseudo_word(PAGE_RES* page_res,  // Blocks to check.
                       TBOX &selection_box,
                       BLOCK *&pseudo_block,
-                       ROW *&pseudo_row         //row of selection
-                      ) {
-  BLOCK_IT block_it(block_list);
-  BLOCK *block;
-  ROW_IT row_it;
-  ROW *row;
-  WERD_IT word_it;
-  WERD *word;
-  PBLOB_IT blob_it;
-  PBLOB *blob;
-  PBLOB_LIST new_blobs;          //list of gathered blobs
-                                 //iterator
-  PBLOB_IT new_blob_it = &new_blobs;
-  WERD *pseudo_word;             //fabricated word
-  WERD *poly_word;               //poly copy of word
-  //      WERD                                                    *larc_word;                                                     //linearc copy
+                       ROW *&pseudo_row) {      // Row of selection.
+  PAGE_RES_IT pr_it(page_res);
+  C_BLOB_LIST new_blobs;               // list of gathered blobs
+  C_BLOB_IT new_blob_it = &new_blobs;  // iterator
+  WERD *pseudo_word;                   // fabricated word

-  for (block_it.mark_cycle_pt ();
-  !block_it.cycled_list (); block_it.forward ()) {
-    block = block_it.data ();
-    if (block->bounding_box ().overlap (selection_box)) {
-      pseudo_block = block;
-      row_it.set_to_list (block->row_list ());
-      for (row_it.mark_cycle_pt ();
-      !row_it.cycled_list (); row_it.forward ()) {
-        row = row_it.data ();
-        if (row->bounding_box ().overlap (selection_box)) {
-          word_it.set_to_list (row->word_list ());
-          for (word_it.mark_cycle_pt ();
-          !word_it.cycled_list (); word_it.forward ()) {
-            word = word_it.data ();
-            if (word->bounding_box ().overlap (selection_box)) {
-              //                                                      if (wordit_linearc && !word->flag(W_POLYGON))
-              //                                                      {
-              //                                                              larc_word=word->larc_copy(row->x_height());
-              //                                                              poly_word=larc_word->poly_copy(row->x_height());
-              //                                                              delete larc_word;
-              //                                                      }
-              //                                                      else
-              poly_word = word->poly_copy (row->x_height ());
-              blob_it.set_to_list (poly_word->blob_list ());
-              for (blob_it.mark_cycle_pt ();
-              !blob_it.cycled_list (); blob_it.forward ()) {
-                blob = blob_it.data ();
-                if (blob->bounding_box ().
-                overlap (selection_box)) {
-                  new_blob_it.add_after_then_move (blob_it.
-                    extract
-                    ());
-                                 //steal off list
-                  pseudo_row = row;
-                }
-              }
-              delete poly_word;  //get rid of it
-            }
-          }
+  for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
+       word_res = pr_it.forward()) {
+    WERD* word = word_res->word;
+    if (word->bounding_box().overlap(selection_box)) {
+      C_BLOB_IT blob_it(word->cblob_list());
+      for (blob_it.mark_cycle_pt();
+           !blob_it.cycled_list(); blob_it.forward()) {
+        C_BLOB* blob = blob_it.data();
+        if (blob->bounding_box().overlap(selection_box)) {
+          new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
+          pseudo_row = pr_it.row()->row;
+          pseudo_block = pr_it.block()->block;
        }
      }
    }
  }
-  if (!new_blobs.empty ()) {
-                                 //make new word
-    pseudo_word = new WERD (&new_blobs, 1, NULL);
-  }
+  if (!new_blobs.empty())
+    pseudo_word = new WERD(&new_blobs, 1, NULL);
  else
    pseudo_word = NULL;
  return pseudo_word;
--- a/ccmain/werdit.h
+++ b/ccmain/werdit.h
@ -20,48 +20,12 @@
 #ifndef           WERDIT_H
 #define           WERDIT_H

-#include          "varable.h"
-#include          "ocrblock.h"
+#include          "pageres.h"
 #include          "notdll.h"

-class WERDIT
-{
-  public:
-    WERDIT() {
-    }                            //empty contructor
-    WERDIT(                          //empty contructor
-           BLOCK_LIST *blocklist) {  //blocks on page
-      start_page(blocklist);  //ready to scan
-    }
-
-    void start_page(                         //get ready
-                    BLOCK_LIST *blocklist);  //blocks on page
-
-    WERD *forward();  //get next word
-    WERD *next_word() {  //get next word
-      return word_it.data ();    //already at next
-    }
-    ROW *row() {  //get current row
-      return word_it.cycled_list ()? NULL : row_it.data ();
-    }
-    ROW *next_row() {  //get next row
-      return row_it.data_relative (1);
-    }
-    BLOCK *block() {  //get current block
-      return block_it.data ();
-    }
-
-  private:
-    BLOCK_IT block_it;           //iterators
-    ROW_IT row_it;
-    WERD_IT word_it;
-};
-
-//extern BOOL_VAR_H(wordit_linearc,FALSE,"Pass poly of linearc to Tess");
-WERD *make_pseudo_word(                         //make fake word
-                       BLOCK_LIST *block_list,  //blocks to check //block of selection
+WERD *make_pseudo_word(PAGE_RES* page_res,  // blocks to check
                       TBOX &selection_box,
                       BLOCK *&pseudo_block,
-                       ROW *&pseudo_row         //row of selection
-                      );
+                       ROW *&pseudo_row);
+
 #endif
--- a/ccstruct/Makefile.am
+++ b/ccstruct/Makefile.am
@ -3,28 +3,26 @@ AM_CPPFLAGS = \
    -I$(top_srcdir)/ccutil -I$(top_srcdir)/cutil \
    -I$(top_srcdir)/image -I$(top_srcdir)/viewer

-EXTRA_DIST = ccstruct.vcproj
-
 include_HEADERS = \
-    blckerr.h blobbox.h blobs.h blread.h ccstruct.h coutln.h crakedge.h \
-    detlinefit.h genblob.h hpddef.h hpdsizes.h ipoints.h \
-    labls.h linlsq.h lmedsq.h mod128.h normalis.h \
+    blckerr.h blobbox.h blobs.h blread.h boxword.h ccstruct.h coutln.h crakedge.h \
+    detlinefit.h dppoint.h genblob.h hpddef.h hpdsizes.h ipoints.h \
+    linlsq.h matrix.h mod128.h normalis.h \
    ocrblock.h ocrrow.h otsuthr.h \
    pageres.h pdblock.h points.h polyaprx.h polyblk.h \
-    polyblob.h polyvert.h poutline.h \
+    polyblob.h polyvert.h poutline.h publictypes.h \
    quadlsq.h quadratc.h quspline.h ratngs.h rect.h rejctmap.h \
-    statistc.h stepblob.h vecfuncs.h werd.h
+    seam.h split.h statistc.h stepblob.h vecfuncs.h werd.h

 lib_LTLIBRARIES = libtesseract_ccstruct.la
 libtesseract_ccstruct_la_SOURCES = \
-    blobbox.cpp blobs.cpp blread.cpp callcpp.cpp ccstruct.cpp coutln.cpp \
-    detlinefit.cpp genblob.cpp \
-    labls.cpp linlsq.cpp lmedsq.cpp mod128.cpp normalis.cpp \
+    blobbox.cpp blobs.cpp blread.cpp boxword.cpp callcpp.cpp ccstruct.cpp coutln.cpp \
+    detlinefit.cpp dppoint.cpp genblob.cpp \
+    linlsq.cpp matrix.cpp mod128.cpp normalis.cpp \
    ocrblock.cpp ocrrow.cpp otsuthr.cpp \
    pageres.cpp pdblock.cpp points.cpp polyaprx.cpp polyblk.cpp \
-    polyblob.cpp polyvert.cpp poutline.cpp \
+    polyblob.cpp polyvert.cpp poutline.cpp publictypes.cpp \
    quadlsq.cpp quadratc.cpp quspline.cpp ratngs.cpp rect.cpp rejctmap.cpp \
-    statistc.cpp stepblob.cpp \
+    seam.cpp split.cpp statistc.cpp stepblob.cpp \
    vecfuncs.cpp werd.cpp

 libtesseract_ccstruct_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
--- a/ccstruct/Makefile.in
+++ b/ccstruct/Makefile.in
@ -72,12 +72,13 @@ am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)"
 LTLIBRARIES = $(lib_LTLIBRARIES)
 libtesseract_ccstruct_la_LIBADD =
 am_libtesseract_ccstruct_la_OBJECTS = blobbox.lo blobs.lo blread.lo \
-	callcpp.lo ccstruct.lo coutln.lo detlinefit.lo genblob.lo \
-	labls.lo linlsq.lo lmedsq.lo mod128.lo normalis.lo ocrblock.lo \
-	ocrrow.lo otsuthr.lo pageres.lo pdblock.lo points.lo \
-	polyaprx.lo polyblk.lo polyblob.lo polyvert.lo poutline.lo \
-	quadlsq.lo quadratc.lo quspline.lo ratngs.lo rect.lo \
-	rejctmap.lo statistc.lo stepblob.lo vecfuncs.lo werd.lo
+	boxword.lo callcpp.lo ccstruct.lo coutln.lo detlinefit.lo \
+	dppoint.lo genblob.lo linlsq.lo matrix.lo mod128.lo \
+	normalis.lo ocrblock.lo ocrrow.lo otsuthr.lo pageres.lo \
+	pdblock.lo points.lo polyaprx.lo polyblk.lo polyblob.lo \
+	polyvert.lo poutline.lo publictypes.lo quadlsq.lo quadratc.lo \
+	quspline.lo ratngs.lo rect.lo rejctmap.lo seam.lo split.lo \
+	statistc.lo stepblob.lo vecfuncs.lo werd.lo
 libtesseract_ccstruct_la_OBJECTS =  \
 	$(am_libtesseract_ccstruct_la_OBJECTS)
 libtesseract_ccstruct_la_LINK = $(LIBTOOL) --tag=CXX \
@ -252,7 +253,6 @@ libdir = @libdir@
 libexecdir = @libexecdir@
 localedir = @localedir@
 localstatedir = @localstatedir@
-lt_ECHO = @lt_ECHO@
 mandir = @mandir@
 mkdir_p = @mkdir_p@
 oldincludedir = @oldincludedir@
@ -273,27 +273,26 @@ AM_CPPFLAGS = \
    -I$(top_srcdir)/ccutil -I$(top_srcdir)/cutil \
    -I$(top_srcdir)/image -I$(top_srcdir)/viewer

-EXTRA_DIST = ccstruct.vcproj
 include_HEADERS = \
-    blckerr.h blobbox.h blobs.h blread.h ccstruct.h coutln.h crakedge.h \
-    detlinefit.h genblob.h hpddef.h hpdsizes.h ipoints.h \
-    labls.h linlsq.h lmedsq.h mod128.h normalis.h \
+    blckerr.h blobbox.h blobs.h blread.h boxword.h ccstruct.h coutln.h crakedge.h \
+    detlinefit.h dppoint.h genblob.h hpddef.h hpdsizes.h ipoints.h \
+    linlsq.h matrix.h mod128.h normalis.h \
    ocrblock.h ocrrow.h otsuthr.h \
    pageres.h pdblock.h points.h polyaprx.h polyblk.h \
-    polyblob.h polyvert.h poutline.h \
+    polyblob.h polyvert.h poutline.h publictypes.h \
    quadlsq.h quadratc.h quspline.h ratngs.h rect.h rejctmap.h \
-    statistc.h stepblob.h vecfuncs.h werd.h
+    seam.h split.h statistc.h stepblob.h vecfuncs.h werd.h

 lib_LTLIBRARIES = libtesseract_ccstruct.la
 libtesseract_ccstruct_la_SOURCES = \
-    blobbox.cpp blobs.cpp blread.cpp callcpp.cpp ccstruct.cpp coutln.cpp \
-    detlinefit.cpp genblob.cpp \
-    labls.cpp linlsq.cpp lmedsq.cpp mod128.cpp normalis.cpp \
+    blobbox.cpp blobs.cpp blread.cpp boxword.cpp callcpp.cpp ccstruct.cpp coutln.cpp \
+    detlinefit.cpp dppoint.cpp genblob.cpp \
+    linlsq.cpp matrix.cpp mod128.cpp normalis.cpp \
    ocrblock.cpp ocrrow.cpp otsuthr.cpp \
    pageres.cpp pdblock.cpp points.cpp polyaprx.cpp polyblk.cpp \
-    polyblob.cpp polyvert.cpp poutline.cpp \
+    polyblob.cpp polyvert.cpp poutline.cpp publictypes.cpp \
    quadlsq.cpp quadratc.cpp quspline.cpp ratngs.cpp rect.cpp rejctmap.cpp \
-    statistc.cpp stepblob.cpp \
+    seam.cpp split.cpp statistc.cpp stepblob.cpp \
    vecfuncs.cpp werd.cpp

 libtesseract_ccstruct_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
@ -374,14 +373,15 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blobbox.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blobs.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blread.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/boxword.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/callcpp.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccstruct.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/coutln.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/detlinefit.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dppoint.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/genblob.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/labls.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/linlsq.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lmedsq.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matrix.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mod128.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/normalis.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ocrblock.Plo@am__quote@
@ -395,12 +395,15 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/polyblob.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/polyvert.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/poutline.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/publictypes.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/quadlsq.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/quadratc.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/quspline.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ratngs.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rect.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rejctmap.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/seam.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/split.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/statistc.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stepblob.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vecfuncs.Plo@am__quote@
--- a/ccstruct/blobbox.cpp
+++ b/ccstruct/blobbox.cpp
@ -18,17 +18,45 @@
 **********************************************************************/

 #include "mfcpch.h"
-#include          "blobbox.h"
+#include "blobbox.h"
+#include "helpers.h"

 #define PROJECTION_MARGIN 10     //arbitrary
 #define EXTERN

-EXTERN double_VAR (textord_error_weight, 3,
-"Weighting for error in believability");
-EXTERN BOOL_VAR (pitsync_projection_fix, TRUE,
-"Fix bug in projection profile");
-
 ELISTIZE (BLOBNBOX) ELIST2IZE (TO_ROW) ELISTIZE (TO_BLOCK)
+
+// Up to 30 degrees is allowed for rotations of diacritic blobs.
+const double kCosSmallAngle = 0.866;
+// Min aspect ratio for a joined word to indicate an obvious flow direction.
+const double kDefiniteAspectRatio = 2.0;
+// Multiple of short length in perimeter to make a joined word.
+const double kComplexShapePerimeterRatio = 1.5;
+
+void BLOBNBOX::rotate(FCOORD rotation) {  // Rotates the C_BLOB outlines, then the box bookkeeping.
+  cblob_ptr->rotate(rotation);  // Dereferenced unconditionally, so cblob_ptr must be non-NULL here.
+  rotate_box(rotation);
+  compute_bounding_box();  // Refreshes box and the base_char top/bottom from the rotated blob.
+}
+
+// Rotates the bounding box by the given rotation vector.
+// For a non-diacritic blob the box is rotated and then handed to
+// set_diacritic_box. For a diacritic, only small skew-correcting rotations
+// are accepted (rotation.x() must be at least kCosSmallAngle), and the base
+// character top/bottom are rotated as points taken at the horizontal
+// midpoint of the box before the box itself is rotated.
+void BLOBNBOX::rotate_box(FCOORD rotation) {
+  if (!IsDiacritic()) {
+    box.rotate(rotation);
+    set_diacritic_box(box);
+    return;
+  }
+  ASSERT_HOST(rotation.x() >= kCosSmallAngle)
+  ICOORD base_top((box.left() + box.right()) / 2, base_char_top_);
+  ICOORD base_bottom(base_top.x(), base_char_bottom_);
+  base_top.rotate(rotation);
+  base_char_top_ = base_top.y();
+  base_bottom.rotate(rotation);
+  base_char_bottom_ = base_bottom.y();
+  box.rotate(rotation);
+}
 /**********************************************************************
 * BLOBNBOX::merge
 *
@ -38,10 +66,22 @@ void BLOBNBOX::merge(                    //merge blobs
                     BLOBNBOX *nextblob  //blob to join with
                    ) {
  box += nextblob->box;          //merge boxes
+  set_diacritic_box(box);
  nextblob->joined = TRUE;
 }


+// Merges other into this: when both blobs own a C_BLOB, the outline list of
+// other's C_BLOB is added (via add_list_after) to this blob's outline list.
+// The bounding box of this is recomputed in every case.
+// Other is not deleted here; disposing of it is the caller's job.
+void BLOBNBOX::really_merge(BLOBNBOX* other) {
+  const bool both_have_cblobs =
+      cblob_ptr != NULL && other->cblob_ptr != NULL;
+  if (both_have_cblobs) {
+    C_OUTLINE_IT outline_it(cblob_ptr->out_list());
+    outline_it.add_list_after(other->cblob_ptr->out_list());
+  }
+  compute_bounding_box();
+}
+
+
 /**********************************************************************
 * BLOBNBOX::chop
 *
@ -88,10 +128,7 @@ void BLOBNBOX::chop(                        //chop blobs
            rightx,
            /*rotation, */ test_ymin, test_ymax);
        blob_it.forward ();
-        if (test_ymin < ymin)
-          ymin = test_ymin;
-        if (test_ymax > ymax)
-          ymax = test_ymax;
+        UpdateRange(test_ymin, test_ymax, &ymin, &ymax);
      }
      while (blob != end_it->data ());
      if (ymin < ymax) {
@ -107,6 +144,8 @@ void BLOBNBOX::chop(                        //chop blobs
                                 //box is all it has
          newblob->box = TBOX (bl, tr);
                                 //stay on current
+          newblob->base_char_top_ = tr.y();
+          newblob->base_char_bottom_ = bl.y();
          end_it->add_after_stay_put (newblob);
        }
      }
@ -114,6 +153,201 @@ void BLOBNBOX::chop(                        //chop blobs
  }
 }

+// Fills gaps, indexed by BlobNeighbourDir, with the gap between this blob's
+// box and the box of the neighbour in each direction: the x gap for the
+// left/right neighbours and the y gap for the others.
+// A direction with no neighbour gets MAX_INT16.
+void BLOBNBOX::NeighbourGaps(int gaps[BND_COUNT]) const {
+  for (int dir = 0; dir < BND_COUNT; ++dir) {
+    BLOBNBOX* neighbour = neighbours_[dir];
+    if (neighbour == NULL) {
+      gaps[dir] = MAX_INT16;
+      continue;
+    }
+    TBOX n_box = neighbour->bounding_box();
+    const bool is_horizontal = (dir == BND_LEFT || dir == BND_RIGHT);
+    if (is_horizontal)
+      gaps[dir] = box.x_gap(n_box);
+    else
+      gaps[dir] = box.y_gap(n_box);
+  }
+}
+// Computes the smaller and larger of the left/right gaps (h_min, h_max) and
+// of the above/below gaps (v_min, v_max), using the values from NeighbourGaps.
+// In each orientation, when the larger gap exceeds the biggest dimension of
+// this blob's box while the smaller gap is below it, the larger gap is
+// replaced by the smaller one. This catches the case of a single neighbour,
+// where the missing side would otherwise be reported as a ridiculously
+// large number.
+void BLOBNBOX::MinMaxGapsClipped(int* h_min, int* h_max,
+                                 int* v_min, int* v_max) const {
+  int gaps[BND_COUNT];
+  NeighbourGaps(gaps);
+  const int max_dimension = MAX(box.width(), box.height());
+
+  // Horizontal pair.
+  const int left_gap = gaps[BND_LEFT];
+  const int right_gap = gaps[BND_RIGHT];
+  *h_min = MIN(left_gap, right_gap);
+  *h_max = MAX(left_gap, right_gap);
+  if (*h_min < max_dimension && *h_max > max_dimension)
+    *h_max = *h_min;
+
+  // Vertical pair.
+  const int above_gap = gaps[BND_ABOVE];
+  const int below_gap = gaps[BND_BELOW];
+  *v_min = MIN(above_gap, below_gap);
+  *v_max = MAX(above_gap, below_gap);
+  if (*v_min < max_dimension && *v_max > max_dimension)
+    *v_max = *v_min;
+}
+
+// Returns the number of neighbour directions (0 to BND_COUNT) flagged as a
+// good stroke neighbour. The result is therefore positive if there is at
+// least one side neighbour that has a similar stroke width and is not on the
+// other side of a rule line.
+int BLOBNBOX::GoodTextBlob() const {
+  int good_count = 0;
+  for (int dir = 0; dir < BND_COUNT; ++dir) {
+    if (good_stroke_neighbour(static_cast<BlobNeighbourDir>(dir)))
+      ++good_count;
+  }
+  return good_count;
+}
+
+// Decides whether this blob, taken on its own, clearly flows horizontally or
+// vertically. That is the case when the box is elongated (one side more than
+// kDefiniteAspectRatio times the other) and yet the outline is complex, eg a
+// joined word in Latin, Arabic, or Hindi, rather than a -, I, l, 1 etc.
+// On success sets vert_possible/horz_possible accordingly and returns true;
+// otherwise changes nothing and returns false.
+bool BLOBNBOX::DefiniteIndividualFlow() {
+  const int box_perimeter = 2 * (box.height() + box.width());
+  // A multiple of the box perimeter serves as the complexity threshold.
+  const double complex_threshold =
+      kComplexShapePerimeterRatio * box_perimeter;
+
+  if (box.width() > box.height() * kDefiniteAspectRatio) {
+    // Wide blob: tell a joined word apart from a dash.
+    // A dash has a perimeter of roughly 2 * (box width + stroke width), a bit
+    // more if the outline is noisy, so what is left after subtracting that
+    // should be close to zero. A complex shape leaves a much larger excess.
+    int excess = cblob()->perimeter();
+    if (vert_stroke_width() > 0)
+      excess -= 2 * vert_stroke_width();
+    else
+      excess -= 4 * cblob()->area() / excess;  // Stroke estimate from area.
+    excess -= 2 * box.width();
+    if (excess > complex_threshold) {
+      set_vert_possible(false);
+      set_horz_possible(true);
+      return true;
+    }
+  }
+
+  if (box.height() > box.width() * kDefiniteAspectRatio) {
+    // Tall blob: the same test, for a putative vertical word vs an I/1/l.
+    int excess = cblob()->perimeter();
+    if (horz_stroke_width() > 0)
+      excess -= 2 * horz_stroke_width();
+    else
+      excess -= 4 * cblob()->area() / excess;  // Stroke estimate from area.
+    excess -= 2 * box.height();
+    if (excess > complex_threshold) {
+      set_vert_possible(true);
+      set_horz_possible(false);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if there is no tabstop violation in merging this and other.
+// A violation occurs when the blob whose box sticks out further on one side
+// lies beyond the other blob's rule on that side: further left than its
+// left_rule_, or further right than its right_rule_.
+bool BLOBNBOX::ConfirmNoTabViolation(const BLOBNBOX& other) const {
+  const bool this_left_violates =
+      box.left() < other.box.left() && box.left() < other.left_rule_;
+  const bool other_left_violates =
+      other.box.left() < box.left() && other.box.left() < left_rule_;
+  const bool this_right_violates =
+      box.right() > other.box.right() && box.right() > other.right_rule_;
+  const bool other_right_violates =
+      other.box.right() > box.right() && other.box.right() > right_rule_;
+  return !(this_left_violates || other_left_violates ||
+           this_right_violates || other_right_violates);
+}
+
+// Returns true if other has a similar stroke width to this.
+// Each comparison uses a tolerance of
+//   (this blob's width) * fractional_tolerance + constant_tolerance.
+// A horizontal or vertical width counts as "zero" (no information) when it
+// is 0 in either blob. For a match, at least one of the horizontal and
+// vertical widths must match, and the other one must either match or be
+// zero. Only when both are zero is area_stroke_width() compared instead.
+bool BLOBNBOX::MatchingStrokeWidth(const BLOBNBOX& other,
+                                   double fractional_tolerance,
+                                   double constant_tolerance) const {
+  const bool h_zero =
+      horz_stroke_width_ == 0.0f || other.horz_stroke_width_ == 0.0f;
+  const bool v_zero =
+      vert_stroke_width_ == 0.0f || other.vert_stroke_width_ == 0.0f;
+
+  if (h_zero && v_zero) {
+    // No usable directional widths: fall back on the backup width.
+    double p_width = area_stroke_width();
+    double n_p_width = other.area_stroke_width();
+    double p_tolerance = p_width * fractional_tolerance
+                       + constant_tolerance;
+    return NearlyEqual(p_width, n_p_width, p_tolerance);
+  }
+
+  bool h_ok = false;
+  if (!h_zero) {
+    float h_tolerance = horz_stroke_width_ * fractional_tolerance
+                       + constant_tolerance;
+    h_ok = NearlyEqual(horz_stroke_width_, other.horz_stroke_width_,
+                       h_tolerance);
+  }
+  bool v_ok = false;
+  if (!v_zero) {
+    float v_tolerance = vert_stroke_width_ * fractional_tolerance
+                       + constant_tolerance;
+    v_ok = NearlyEqual(vert_stroke_width_, other.vert_stroke_width_,
+                       v_tolerance);
+  }
+  return (v_ok || h_ok) && (h_ok || h_zero) && (v_ok || v_zero);
+}
+
+// Returns a bounding box of the outline contained within the given
+// horizontal range [left, right]. The vertical limits are obtained from
+// find_cblob_limits when a C_BLOB is present, otherwise from
+// find_blob_limits, with no rotation applied. If the limits come back
+// inverted (top < bottom), the top and bottom of the full box are used.
+TBOX BLOBNBOX::BoundsWithinLimits(int left, int right) {
+  const float left_limit = static_cast<float>(left);
+  const float right_limit = static_cast<float>(right);
+  FCOORD no_rotation(1.0f, 0.0f);
+  float top, bottom;
+  if (cblob_ptr != NULL) {
+    find_cblob_limits(cblob_ptr, left_limit, right_limit, no_rotation,
+                      bottom, top);
+  } else {
+    find_blob_limits(blob_ptr, left_limit, right_limit, no_rotation,
+                     bottom, top);
+  }
+
+  if (top < bottom) {
+    top = box.top();
+    bottom = box.bottom();
+  }
+  // Build the result as the union of the two corner-point boxes.
+  FCOORD bot_left(left, bottom);
+  FCOORD top_right(right, top);
+  TBOX limited_box(bot_left);
+  TBOX corner_box(top_right);
+  limited_box += corner_box;
+  return limited_box;
+}
+
+#ifndef GRAPHICS_DISABLED
+// Maps a (region type, text flow type) pair to the colour used to draw it.
+// Line and image regions have one fixed colour each, unknown and text
+// regions are refined by flow_type, and any other region type is GREY.
+ScrollView::Color BLOBNBOX::TextlineColor(BlobRegionType region_type,
+                                          BlobTextFlowType flow_type) {
+  switch (region_type) {
+    case BRT_HLINE:
+      return ScrollView::BROWN;
+    case BRT_VLINE:
+      return ScrollView::DARK_GREEN;
+    case BRT_RECTIMAGE:
+      return ScrollView::RED;
+    case BRT_POLYIMAGE:
+      return ScrollView::ORANGE;
+    case BRT_UNKNOWN:
+      if (flow_type == BTFT_NONTEXT)
+        return ScrollView::CYAN;
+      return ScrollView::WHITE;
+    case BRT_VERT_TEXT:
+      switch (flow_type) {
+        case BTFT_STRONG_CHAIN:
+        case BTFT_TEXT_ON_IMAGE:
+          return ScrollView::GREEN;
+        case BTFT_CHAIN:
+          return ScrollView::LIME_GREEN;
+        default:
+          return ScrollView::YELLOW;
+      }
+    case BRT_TEXT:
+      switch (flow_type) {
+        case BTFT_STRONG_CHAIN:
+          return ScrollView::BLUE;
+        case BTFT_TEXT_ON_IMAGE:
+          return ScrollView::LIGHT_BLUE;
+        case BTFT_CHAIN:
+          return ScrollView::MEDIUM_BLUE;
+        case BTFT_LEADER:
+          return ScrollView::WHEAT;
+        default:
+          return ScrollView::MAGENTA;
+      }
+    default:
+      return ScrollView::GREY;
+  }
+}
+
+// Returns TextlineColor for this blob's region_type_ and flow_. Keep in sync with BlobRegionType.
+ScrollView::Color BLOBNBOX::BoxColor() const {
+  return TextlineColor(region_type_, flow_);
+}
+#endif

 /**********************************************************************
 * find_blob_limits
@ -152,26 +386,15 @@ void find_blob_limits(                  //get y limits
      if ((pos.x () < leftx && pos.x () + vec.x () > leftx)
      || (pos.x () > leftx && pos.x () + vec.x () < leftx)) {
        testy = pos.y () + vec.y () * (leftx - pos.x ()) / vec.x ();
-        //intercept of boundary
-        if (testy < ymin)
-          ymin = testy;
-        if (testy > ymax)
-          ymax = testy;
+        UpdateRange(testy, &ymin, &ymax);
      }
      if (pos.x () >= leftx && pos.x () <= rightx) {
-        if (pos.y () > ymax)
-          ymax = pos.y ();
-        if (pos.y () < ymin)
-          ymin = pos.y ();
+        UpdateRange(pos.y(), &ymin, &ymax);
      }
      if ((pos.x () > rightx && pos.x () + vec.x () < rightx)
      || (pos.x () < rightx && pos.x () + vec.x () > rightx)) {
        testy = pos.y () + vec.y () * (rightx - pos.x ()) / vec.x ();
-        //intercept of boundary
-        if (testy < ymin)
-          ymin = testy;
-        if (testy > ymax)
-          ymax = testy;
+        UpdateRange(testy, &ymin, &ymax);
      }
    }
  }
@ -208,10 +431,7 @@ void find_cblob_limits(                  //get y limits
    for (stepindex = 0; stepindex < outline->pathlength (); stepindex++) {
                                 //inside
      if (pos.x () >= leftx && pos.x () <= rightx) {
-        if (pos.y () > ymax)
-          ymax = pos.y ();
-        if (pos.y () < ymin)
-          ymin = pos.y ();
+        UpdateRange(pos.y(), &ymin, &ymax);
      }
      vec = outline->step (stepindex);
      vec.rotate (rotation);
@ -249,10 +469,7 @@ void find_cblob_vlimits(               //get y limits
    for (stepindex = 0; stepindex < outline->pathlength (); stepindex++) {
                                 //inside
      if (pos.x () >= leftx && pos.x () <= rightx) {
-        if (pos.y () > ymax)
-          ymax = pos.y ();
-        if (pos.y () < ymin)
-          ymin = pos.y ();
+        UpdateRange(pos.y(), &ymin, &ymax);
      }
      vec = outline->step (stepindex);
      pos += vec;                //move to next
@ -289,10 +506,7 @@ void find_cblob_hlimits(                //get x limits
    for (stepindex = 0; stepindex < outline->pathlength (); stepindex++) {
                                 //inside
      if (pos.y () >= bottomy && pos.y () <= topy) {
-        if (pos.x () > xmax)
-          xmax = pos.x ();
-        if (pos.x () < xmin)
-          xmin = pos.x ();
+        UpdateRange(pos.x(), &xmin, &xmax);
      }
      vec = outline->step (stepindex);
      pos += vec;                //move to next
@ -351,7 +565,7 @@ PBLOB *rotate_cblob(                 //rotate it
  OUTLINE_IT out_it;
  POLYPT_IT poly_it;             //outline pts

-  copy = new PBLOB (blob, xheight);
+  copy = new PBLOB (blob);
  out_it.set_to_list (copy->out_list ());
  for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) {
                                 //get points
@ -458,7 +672,12 @@ BLOBNBOX * blob,                 //first blob
 float top,                       //corrected top
 float bottom,                    //of row
 float row_size                   //ideal
-): y_min(bottom), y_max(top), initial_y_min(bottom), num_repeated_sets_(-1) {
+) {
+  clear();
+  y_min = bottom;
+  y_max = top;
+  initial_y_min = bottom;
+
  float diff;                    //in size
  BLOBNBOX_IT it = &blobs;       //list of blobs

@ -572,6 +791,46 @@ void TO_ROW::compute_vertical_projection() {  //project whole row
 }


+/**********************************************************************
+ * TO_ROW::clear
+ *
+ * Resets the scalar members assigned below to a known default state:
+ * numeric fields to zero, pitch_decision to PITCH_DUNNO and
+ * num_repeated_sets_ to -1. The TO_ROW constructor calls this before
+ * setting its own y_min, y_max and initial_y_min values.
+ **********************************************************************/
+void TO_ROW::clear() {
+  all_caps = 0;
+  used_dm_model = 0;
+  projection_left = 0;
+  projection_right = 0;
+  pitch_decision = PITCH_DUNNO;
+  fixed_pitch = 0.0;
+  fp_space = 0.0;
+  fp_nonsp = 0.0;
+  pr_space = 0.0;
+  pr_nonsp = 0.0;
+  spacing = 0.0;
+  xheight = 0.0;
+  xheight_evidence = 0;
+  ascrise = 0.0;
+  descdrop = 0.0;
+  min_space = 0;
+  max_nonspace = 0;
+  space_threshold = 0;
+  kern_size = 0.0;
+  space_size = 0.0;
+  y_min = 0.0;
+  y_max = 0.0;
+  initial_y_min = 0.0;
+  m = 0.0;
+  c = 0.0;
+  error = 0.0;
+  para_c = 0.0;
+  para_error = 0.0;
+  y_origin = 0.0;
+  credibility = 0.0;
+  num_repeated_sets_ = -1;  // Note: -1, not zero.
+}
+
+
 /**********************************************************************
 * vertical_blob_projection
 *
@ -722,16 +981,9 @@ void vertical_coutline_projection(                     //project outlines
  for (stepindex = 0; stepindex < length; stepindex++) {
    step = outline->step (stepindex);
    if (step.x () > 0) {
-      if (pitsync_projection_fix)
-        stats->add (pos.x (), -pos.y ());
-      else
-        stats->add (pos.x (), pos.y ());
-    }
-    else if (step.x () < 0) {
-      if (pitsync_projection_fix)
-        stats->add (pos.x () - 1, pos.y ());
-      else
-        stats->add (pos.x () - 1, -pos.y ());
+     stats->add (pos.x (), -pos.y ());
+    } else if (step.x () < 0) {
+      stats->add (pos.x () - 1, pos.y ());
    }
    pos += step;
  }
@ -751,6 +1003,7 @@ void vertical_coutline_projection(                     //project outlines
 TO_BLOCK::TO_BLOCK(                  //make a block
                   BLOCK *src_block  //real block
                  ) {
+  clear();
  block = src_block;
 }

@ -767,6 +1020,32 @@ static void clear_blobnboxes(BLOBNBOX_LIST* boxes) {
  }
 }

+/**********************************************************************
+ * TO_BLOCK::clear
+ *
+ * Resets the scalar members assigned below to a known default state:
+ * the block and key_row pointers to NULL, pitch_decision to PITCH_DUNNO
+ * and the numeric fields to zero. The TO_BLOCK constructor calls this
+ * before storing its source block.
+ **********************************************************************/
+void TO_BLOCK::clear() {
+  block = NULL;
+  pitch_decision = PITCH_DUNNO;
+  line_spacing = 0.0;
+  line_size = 0.0;
+  max_blob_size = 0.0;
+  baseline_offset = 0.0;
+  xheight = 0.0;
+  fixed_pitch = 0.0;
+  kern_size = 0.0;
+  space_size = 0.0;
+  min_space = 0;
+  max_nonspace = 0;
+  fp_space = 0.0;
+  fp_nonsp = 0.0;
+  pr_space = 0.0;
+  pr_nonsp = 0.0;
+  key_row = NULL;
+}
+
+
 TO_BLOCK::~TO_BLOCK() {
  // Any residual BLOBNBOXes at this stage own their blobs, so delete them.
  clear_blobnboxes(&blobs);
@ -802,6 +1081,4 @@ void plot_blob_list(ScrollView* win,                   // window to draw in
    it.data()->plot(win, body_colour, child_colour);
  }
 }
-
 #endif //GRAPHICS_DISABLED
-
--- a/ccstruct/blobbox.h
+++ b/ccstruct/blobbox.h
@ -20,16 +20,12 @@
 #ifndef           BLOBBOX_H
 #define           BLOBBOX_H

-#include          "varable.h"
 #include          "clst.h"
 #include          "elst2.h"
 #include          "werd.h"
 #include          "ocrblock.h"
 #include          "statistc.h"

-extern double_VAR_H (textord_error_weight, 3,
-"Weighting for error in believability");
-
 enum PITCH_TYPE
 {
  PITCH_DUNNO,                   //insufficient data
@ -53,10 +49,12 @@ enum TabType {

 // The possible region types of a BLOBNBOX.
 // Note: keep all the text types > BRT_UNKNOWN and all the image types less.
-// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor below.
+// Keep in sync with kBlobTypes in colpartition.cpp and BoxColor, and the
+// *Type static functions below.
 enum BlobRegionType {
  BRT_NOISE,      // Neither text nor image.
  BRT_HLINE,      // Horizontal separator line.
+  BRT_VLINE,      // Vertical separator line.
  BRT_RECTIMAGE,  // Rectangular image.
  BRT_POLYIMAGE,  // Non-rectangular image.
  BRT_UNKNOWN,    // Not determined yet.
@ -66,6 +64,46 @@ enum BlobRegionType {
  BRT_COUNT       // Number of possibilities.
 };

+// Enum for indexing the elements of per-direction arrays that refer to neighbours.
+enum BlobNeighbourDir {
+  BND_LEFT,   // Neighbour to the left.
+  BND_BELOW,  // Neighbour below.
+  BND_RIGHT,  // Neighbour to the right.
+  BND_ABOVE,  // Neighbour above.
+  BND_COUNT   // Number of directions; used as the size of per-direction arrays.
+};
+
+// BlobTextFlowType indicates the quality of neighbouring information
+// related to a chain of connected components, either horizontally or
+// vertically. Also used by ColPartition for the collection of blobs
+// within, which should all have the same value in most cases.
+// The order of the values matters: DominatesInMerge compares them numerically.
+enum BlobTextFlowType {
+  BTFT_NONE,           // No text flow set yet.
+  BTFT_NONTEXT,        // Flow too poor to be likely text.
+  BTFT_NEIGHBOURS,     // Neighbours support flow in this direction.
+  BTFT_CHAIN,          // There is a weak chain of text in this direction.
+  BTFT_STRONG_CHAIN,   // There is a strong chain of text in this direction.
+  BTFT_TEXT_ON_IMAGE,  // There is a strong chain of text on an image.
+  BTFT_LEADER,         // Leader dots/dashes etc.
+  BTFT_COUNT           // Number of flow types.
+};
+
+// Returns true if type1 dominates type2 in a merge.
+// NONTEXT dominates everything else and LEADER dominates nothing; between
+// any other pair the ordering of the enum decides.
+// For distinct types the function is anti-symmetric:
+// DominatesInMerge(t1, t2) == !DominatesInMerge(t2, t1).
+// For equal types the result carries no meaning (it is false for a pair of
+// LEADERs and true otherwise), so callers should not rely on it.
+inline bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2) {
+  // If either side is NONTEXT, type1 wins exactly when it is the NONTEXT one.
+  if (type1 == BTFT_NONTEXT || type2 == BTFT_NONTEXT)
+    return type1 == BTFT_NONTEXT;
+  // If either side is LEADER, type1 wins exactly when it is not the LEADER.
+  if (type1 == BTFT_LEADER || type2 == BTFT_LEADER)
+    return type1 != BTFT_LEADER;
+  // Otherwise the enum ordering determines the result.
+  return type1 >= type2;
+}
+
 namespace tesseract {
 class ColPartition;
 }
@ -76,46 +114,84 @@ class BLOBNBOX:public ELIST_LINK
 {
  public:
    BLOBNBOX() {
-      blob_ptr = NULL;
-      cblob_ptr = NULL;
-      area = 0;
-      Init();
+      ConstructionInit();
    }
    explicit BLOBNBOX(PBLOB *srcblob) {
+      box = srcblob->bounding_box();
+      ConstructionInit();
      blob_ptr = srcblob;
-      cblob_ptr = NULL;
-      box = srcblob->bounding_box ();
-      area = (int) srcblob->area ();
-      Init();
+      area = static_cast<int>(srcblob->area());
    }
    explicit BLOBNBOX(C_BLOB *srcblob) {
-      blob_ptr = NULL;
+      box = srcblob->bounding_box();
+      ConstructionInit();
      cblob_ptr = srcblob;
-      box = srcblob->bounding_box ();
-      area = (int) srcblob->area ();
-      Init();
+      area = static_cast<int>(srcblob->area());
+    }
+    static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
+      C_BLOB* blob = new C_BLOB(outline);
+      return new BLOBNBOX(blob);
    }

-    void rotate_box(FCOORD vec) {
-      box.rotate(vec);
-    }
+    void rotate_box(FCOORD rotation);
+    void rotate(FCOORD rotation);
    void translate_box(ICOORD v) {
-      box.move(v);
+      if (IsDiacritic()) {
+        box.move(v);
+        base_char_top_ += v.y();
+        base_char_bottom_ += v.y();
+      } else {
+        box.move(v);
+        set_diacritic_box(box);
+      }
    }
    void merge(BLOBNBOX *nextblob);
+    void really_merge(BLOBNBOX* other);
    void chop(                        // fake chop blob
              BLOBNBOX_IT *start_it,  // location of this
              BLOBNBOX_IT *blob_it,   // iterator
              FCOORD rotation,        // for landscape
              float xheight);         // line height

+    void NeighbourGaps(int gaps[BND_COUNT]) const;
+    void MinMaxGapsClipped(int* h_min, int* h_max,
+                           int* v_min, int* v_max) const;
+    int GoodTextBlob() const;
+
+    // Returns true, and sets vert_possible/horz_possible if the blob has some
+    // feature that makes it individually appear to flow one way.
+    // eg if it has a high aspect ratio, yet has a complex shape, such as a
+    // joined word in Latin, Arabic, or Hindi, rather than being a -, I, l, 1.
+    bool DefiniteIndividualFlow();
+
+    // Returns true if there is no tabstop violation in merging this and other.
+    bool ConfirmNoTabViolation(const BLOBNBOX& other) const;
+
+    // Returns true if other has a similar stroke width to this.
+    bool MatchingStrokeWidth(const BLOBNBOX& other,
+                             double fractional_tolerance,
+                             double constant_tolerance) const;
+
+    // Returns a bounding box of the outline contained within the
+    // given horizontal range.
+    TBOX BoundsWithinLimits(int left, int right);
+
    // Simple accessors.
    const TBOX& bounding_box() const {
      return box;
    }
+    // Set the bounding box. Use with caution.
+    // Normally use compute_bounding_box instead.
+    void set_bounding_box(const TBOX& new_box) {
+      box = new_box;
+      base_char_top_ = box.top();
+      base_char_bottom_ = box.bottom();
+    }
    void compute_bounding_box() {
      box = cblob_ptr != NULL ? cblob_ptr->bounding_box()
                              : blob_ptr->bounding_box();
+      base_char_top_ = box.top();
+      base_char_bottom_ = box.bottom();
    }
    const TBOX& reduced_box() const {
      return red_box;
@ -163,6 +239,24 @@ class BLOBNBOX:public ELIST_LINK
    void set_region_type(BlobRegionType new_type) {
      region_type_ = new_type;
    }
+    // Quality of text flow assigned to this blob.
+    BlobTextFlowType flow() const {
+      return flow_;
+    }
+    void set_flow(BlobTextFlowType value) {
+      flow_ = value;
+    }
+    // True if the blob could be part of vertical text flow.
+    bool vert_possible() const {
+      return vert_possible_;
+    }
+    void set_vert_possible(bool value) {
+      vert_possible_ = value;
+    }
+    // True if the blob could be part of horizontal text flow.
+    bool horz_possible() const {
+      return horz_possible_;
+    }
+    void set_horz_possible(bool value) {
+      horz_possible_ = value;
+    }
    int left_rule() const {
      return left_rule_;
    }
@ -199,40 +293,80 @@ class BLOBNBOX:public ELIST_LINK
    void set_vert_stroke_width(float width) {
      vert_stroke_width_ = width;
    }
+    // Stroke width estimated from the area/perimeter ratio of the blob.
+    // Stays 0 until ReInit has been able to compute it.
+    float area_stroke_width() const {
+      return area_stroke_width_;
+    }
    tesseract::ColPartition* owner() const {
      return owner_;
    }
    void set_owner(tesseract::ColPartition* new_owner) {
      owner_ = new_owner;
    }
-    void set_noise_flag(bool flag) {
-      noise_flag_ = flag;
+    // True if there is a leader to the left of this blob.
+    bool leader_on_left() const {
+      return leader_on_left_;
    }
-    bool noise_flag() const {
-      return noise_flag_;
+    void set_leader_on_left(bool flag) {
+      leader_on_left_ = flag;
    }
+    // True if there is a leader to the right of this blob.
+    bool leader_on_right() const {
+      return leader_on_right_;
+    }
+    void set_leader_on_right(bool flag) {
+      leader_on_right_ = flag;
+    }
+    // Returns the neighbour stored for direction n, or NULL if there is none.
+    // n is used directly as an array index and is not range checked.
+    BLOBNBOX* neighbour(BlobNeighbourDir n) const {
+      return neighbours_[n];
+    }
+    // Returns the "good stroke" flag stored with the neighbour in direction n.
+    bool good_stroke_neighbour(BlobNeighbourDir n) const {
+      return good_stroke_neighbours_[n];
+    }
+    // Stores the neighbour for direction n together with its good flag.
+    // The neighbour pointer is stored as given; nothing is copied.
+    void set_neighbour(BlobNeighbourDir n, BLOBNBOX* neighbour, bool good) {
+      neighbours_[n] = neighbour;
+      good_stroke_neighbours_[n] = good;
+    }
+    // Returns true if the stored base character limits differ from the top
+    // or bottom of the blob's own box, ie set_diacritic_box has recorded a
+    // different base since the box was last set.
+    bool IsDiacritic() const {
+      return base_char_top_ != box.top() || base_char_bottom_ != box.bottom();
+    }
+    // y-coord of the top of the diacritic base if there is one, otherwise
+    // of the top of this blob.
+    int base_char_top() const {
+      return base_char_top_;
+    }
+    // y-coord of the bottom of the diacritic base if there is one, otherwise
+    // of the bottom of this blob.
+    int base_char_bottom() const {
+      return base_char_bottom_;
+    }
+    // Records the top and bottom of diacritic_box as the base character
+    // limits. The blob's own box is not changed.
+    void set_diacritic_box(const TBOX& diacritic_box) {
+      base_char_top_ = diacritic_box.top();
+      base_char_bottom_ = diacritic_box.bottom();
+    }
+    // Returns true if only vertical flow is possible for this blob.
+    bool UniquelyVertical() const {
+      return vert_possible_ && !horz_possible_;
+    }
+    // Returns true if only horizontal flow is possible for this blob.
+    bool UniquelyHorizontal() const {
+      return horz_possible_ && !vert_possible_;
+    }
+
+    // Returns true for the two text region types (horizontal and vertical).
+    static bool IsTextType(BlobRegionType type) {
+      switch (type) {
+        case BRT_TEXT:
+        case BRT_VERT_TEXT:
+          return true;
+        default:
+          return false;
+      }
+    }
+    // Returns true for the two image region types (rectangular and polygonal).
+    static bool IsImageType(BlobRegionType type) {
+      switch (type) {
+        case BRT_RECTIMAGE:
+        case BRT_POLYIMAGE:
+          return true;
+        default:
+          return false;
+      }
+    }
+    // Returns true for the two line region types (horizontal and vertical).
+    static bool IsLineType(BlobRegionType type) {
+      switch (type) {
+        case BRT_HLINE:
+        case BRT_VLINE:
+          return true;
+        default:
+          return false;
+      }
+    }
+    // Returns true if the region type cannot be merged, which is the case
+    // for all line and image types.
+    static bool UnMergeableType(BlobRegionType type) {
+      if (IsLineType(type))
+        return true;
+      return IsImageType(type);
+    }
+
+    static ScrollView::Color TextlineColor(BlobRegionType region_type,
+                                           BlobTextFlowType flow_type);

 #ifndef GRAPHICS_DISABLED
    // Keep in sync with BlobRegionType.
-    ScrollView::Color BoxColor() const {
-      switch (region_type_) {
-      case BRT_HLINE:
-        return ScrollView::YELLOW;
-      case BRT_RECTIMAGE:
-        return ScrollView::RED;
-      case BRT_POLYIMAGE:
-        return ScrollView::ORANGE;
-      case BRT_UNKNOWN:
-        return ScrollView::CYAN;
-      case BRT_VERT_TEXT:
-        return ScrollView::GREEN;
-      case BRT_TEXT:
-        return ScrollView::BLUE;
-      case BRT_NOISE:
-      default:
-        return ScrollView::GREY;
-      }
-    }
+    ScrollView::Color BoxColor() const;

    void plot(ScrollView* window,                // window to draw in
              ScrollView::Color blob_colour,     // for outer bits
@ -244,27 +378,53 @@ class BLOBNBOX:public ELIST_LINK
    }
 #endif

-    NEWDELETE2(BLOBNBOX)
+  NEWDELETE2(BLOBNBOX)

- private:
-  // Initializes the bulk of the members to default values.
-  void Init() {
+  // Initializes the bulk of the members to default values for use at
+  // construction time.
+  void ConstructionInit() {
+    blob_ptr = NULL;
+    cblob_ptr = NULL;
+    area = 0;
+    area_stroke_width_ = 0.0f;
+    horz_stroke_width_ = 0.0f;
+    vert_stroke_width_ = 0.0f;
+    // ReInit reads area, area_stroke_width_ and the cblob pointer, so it has
+    // to run after they have been cleared above.
+    ReInit();
+  }
+  // Initializes members set by StrokeWidth and beyond, without discarding
+  // stored area and strokewidth values, which are expensive to calculate.
+  void ReInit() {
    joined = false;
    reduced = false;
    repeated_set_ = 0;
    left_tab_type_ = TT_NONE;
    right_tab_type_ = TT_NONE;
    region_type_ = BRT_UNKNOWN;
+    flow_ = BTFT_NONE;
    left_rule_ = 0;
    right_rule_ = 0;
    left_crossing_rule_ = 0;
    right_crossing_rule_ = 0;
-    horz_stroke_width_ = 0.0f;
-    vert_stroke_width_ = 0.0f;
+    // Compute the area-based stroke width only if it has not been set yet
+    // and there is an edgestep blob with a positive area to compute it from.
+    // NOTE(review): assumes cblob()->perimeter() is never 0 here - confirm.
+    if (area_stroke_width_ == 0.0f && area > 0 && cblob() != NULL)
+      area_stroke_width_ = 2.0f * area / cblob()->perimeter();
    owner_ = NULL;
-    noise_flag_ = false;
+    // Until a diacritic box is set, the base limits are those of the blob.
+    base_char_top_ = box.top();
+    base_char_bottom_ = box.bottom();
+    horz_possible_ = false;
+    vert_possible_ = false;
+    leader_on_left_ = false;
+    leader_on_right_ = false;
+    ClearNeighbours();
  }

+  // Forgets the neighbour in every direction and clears the matching
+  // good-stroke flags.
+  void ClearNeighbours() {
+    for (int dir = 0; dir < BND_COUNT; ++dir) {
+      good_stroke_neighbours_[dir] = false;
+      neighbours_[dir] = NULL;
+    }
+  }
+
+ private:
  PBLOB *blob_ptr;              // poly blob
  C_BLOB *cblob_ptr;            // edgestep blob
  TBOX box;                     // bounding box
@ -276,22 +436,32 @@ class BLOBNBOX:public ELIST_LINK
  TabType left_tab_type_;       // Indicates tab-stop assessment
  TabType right_tab_type_;      // Indicates tab-stop assessment
  BlobRegionType region_type_;  // Type of region this blob belongs to
+  BlobTextFlowType flow_;       // Quality of text flow.
  inT16 left_rule_;             // x-coord of nearest but not crossing rule line
  inT16 right_rule_;            // x-coord of nearest but not crossing rule line
  inT16 left_crossing_rule_;    // x-coord of nearest or crossing rule line
  inT16 right_crossing_rule_;   // x-coord of nearest or crossing rule line
+  inT16 base_char_top_;         // y-coord of top/bottom of diacritic base,
+  inT16 base_char_bottom_;      // if it exists else top/bottom of this blob.
  float horz_stroke_width_;     // Median horizontal stroke width
  float vert_stroke_width_;     // Median vertical stroke width
+  float area_stroke_width_;     // Stroke width from area/perimeter ratio.
  tesseract::ColPartition* owner_;  // Who will delete me when I am not needed
-  // Was the blob flagged as noise in the initial filtering step
-  bool noise_flag_;
+  BLOBNBOX* neighbours_[BND_COUNT];
+  bool good_stroke_neighbours_[BND_COUNT];
+  bool horz_possible_;           // Could be part of horizontal flow.
+  bool vert_possible_;           // Could be part of vertical flow.
+  bool leader_on_left_;          // There is a leader to the left.
+  bool leader_on_right_;         // There is a leader to the right.
 };

-class TO_ROW:public ELIST2_LINK
+class TO_ROW: public ELIST2_LINK
 {
  public:
+    static const int kErrorWeight = 3;
+
    TO_ROW() {
-      num_repeated_sets_ = -1;
+      clear();
    }                            //empty
    TO_ROW(                 //constructor
           BLOBNBOX *blob,  //from first blob
@ -359,7 +529,7 @@ class TO_ROW:public ELIST2_LINK
      para_c = new_c;
      para_error = new_error;
      credibility =
-        (float) (blobs.length () - textord_error_weight * new_error);
+        (float) (blobs.length () - kErrorWeight * new_error);
      y_origin = (float) (new_c / sqrt (1 + gradient * gradient));
      //real intercept
    }
@ -413,6 +583,8 @@ class TO_ROW:public ELIST2_LINK
    STATS projection;            // vertical projection

  private:
+    void clear();  // clear all values to reasonable defaults
+
    BLOBNBOX_LIST blobs;         //blobs in row
    float y_min;                 //coords
    float y_max;
@ -432,16 +604,45 @@ ELIST2IZEH (TO_ROW)
 class TO_BLOCK:public ELIST_LINK
 {
  public:
-    TO_BLOCK() {
+    TO_BLOCK() : pitch_decision(PITCH_DUNNO) {
+      clear();
    }                            //empty
    TO_BLOCK(                    //constructor
             BLOCK *src_block);  //real block
    ~TO_BLOCK();

+    void clear();  // clear all scalar members.
+
    TO_ROW_LIST *get_rows() {  //access function
      return &row_list;
    }

+    // Applies rotation to every BLOBNBOX in each of the blob lists of this
+    // block and to the underlying block itself, then recomputes the block's
+    // median blob size from the (rotated) blobs list.
+    void rotate(const FCOORD& rotation) {
+      BLOBNBOX_LIST* const all_lists[] = {&blobs, &underlines, &noise_blobs,
+                                          &small_blobs, &large_blobs};
+      const int num_lists = sizeof(all_lists) / sizeof(all_lists[0]);
+      for (int i = 0; i < num_lists; ++i) {
+        BLOBNBOX_IT box_it(all_lists[i]);
+        for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward())
+          box_it.data()->rotate(rotation);
+      }
+      // A block without a poly_block is a fatal error at this point.
+      ASSERT_HOST(block->poly_block() != NULL);
+      block->rotate(rotation);
+      // Collect the blob sizes, with the rotated block box as the upper limit.
+      STATS width_stats(0, block->bounding_box().width());
+      STATS height_stats(0, block->bounding_box().height());
+      BLOBNBOX_IT blob_it(&blobs);
+      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+        const TBOX& blob_box = blob_it.data()->bounding_box();
+        width_stats.add(blob_box.width(), 1);
+        height_stats.add(blob_box.height(), 1);
+      }
+      // Round the medians to the nearest integer.
+      const int median_width = static_cast<int>(width_stats.median() + 0.5);
+      const int median_height = static_cast<int>(height_stats.median() + 0.5);
+      block->set_median_size(median_width, median_height);
+    }
+
    void print_rows() {  //debug info
      TO_ROW_IT row_it = &row_list;
      TO_ROW *row;
@ -468,6 +669,11 @@ class TO_BLOCK:public ELIST_LINK
    BLOCK *block;                //real block
    PITCH_TYPE pitch_decision;   //how strong is decision
    float line_spacing;          //estimate
+    // line_size is a lower-bound estimate of the font size in pixels of
+    // the text in the block (with ascenders and descenders), being a small
+    // (1.25) multiple of the median height of filtered blobs.
+    // In most cases the font size will be bigger, but it will be closer
+    // if the text is allcaps, or in a no-x-height script.
    float line_size;             //estimate
    float max_blob_size;         //line assignment limit
    float baseline_offset;       //phase shift
--- a/ccstruct/blobs.cpp
+++ b/ccstruct/blobs.cpp
@ -30,11 +30,434 @@
 #include "blobs.h"
 #include "cutil.h"
 #include "emalloc.h"
+#include "helpers.h"
+#include "ndminx.h"
+#include "normalis.h"
+#include "ocrrow.h"
+#include "points.h"
+#include "polyaprx.h"
 #include "structures.h"
+#include "werd.h"
+
+// A Vector representing the "vertical" direction when measuring the
+// divisibility of blobs into multiple blobs just by separating outlines.
+// See divisible_blob below for the use.
+const TPOINT kDivisibleVerticalUpright = {0, 1};
+// A vector representing the "vertical" direction for italic text for use
+// when separating outlines. Using it actually deteriorates final accuracy,
+// so it is only used for ApplyBoxes chopping to get a better segmentation.
+const TPOINT kDivisibleVerticalItalic = {1, 5};

 /*----------------------------------------------------------------------
              F u n c t i o n s
 ----------------------------------------------------------------------*/
+// Wraps the given circular list of EDGEPTs in a newly allocated TESSLINE,
+// which takes the list over as its loop.
+TESSLINE* TESSLINE::BuildFromOutlineList(EDGEPT* outline) {
+  TESSLINE* tessline = new TESSLINE;
+  tessline->loop = outline;
+  // Derive the vectors, start point and bounding box from the positions.
+  tessline->SetupFromPos();
+  return tessline;
+}
+
+// Copies the data and the outline, but leaves next untouched.
+// Any loop already owned by this is deleted first, and every EDGEPT of
+// src.loop is duplicated into a new circular list. Copying an object onto
+// itself is a no-op.
+void TESSLINE::CopyFrom(const TESSLINE& src) {
+  // Clear() below would otherwise destroy src.loop before it is copied.
+  if (this == &src)
+    return;
+  Clear();
+  topleft = src.topleft;
+  botright = src.botright;
+  start = src.start;
+  is_hole = src.is_hole;
+  if (src.loop != NULL) {
+    EDGEPT* prevpt = NULL;
+    EDGEPT* newpt = NULL;
+    EDGEPT* srcpt = src.loop;
+    do {
+      // The EDGEPT copy constructor copies the data but not the links.
+      newpt = new EDGEPT(*srcpt);
+      if (prevpt == NULL) {
+        loop = newpt;
+      } else {
+        newpt->prev = prevpt;
+        prevpt->next = newpt;
+      }
+      prevpt = newpt;
+      srcpt = srcpt->next;
+    } while (srcpt != src.loop);
+    // Close the ring: link the last copy back to the first.
+    loop->prev = newpt;
+    newpt->next = loop;
+  }
+}
+
+// Deletes every EDGEPT in the circular loop and leaves the outline without
+// a loop. Does nothing if there is no loop.
+void TESSLINE::Clear() {
+  if (loop != NULL) {
+    EDGEPT* edge = loop;
+    do {
+      EDGEPT* doomed = edge;
+      edge = edge->next;
+      delete doomed;
+    } while (edge != loop);
+    loop = NULL;
+  }
+}
+
+// Rotates every point of the outline in place, using rot.x() and rot.y() as
+// the two terms of the rotation, and rounds to the nearest integer. The
+// vectors, start and bounding box are rebuilt afterwards.
+// loop must not be NULL.
+void TESSLINE::Rotate(const FCOORD rot) {
+  EDGEPT* edge = loop;
+  do {
+    // Both new coordinates have to be computed from the old position.
+    int new_x = static_cast<int>(floor(edge->pos.x * rot.x() -
+                                       edge->pos.y * rot.y() + 0.5));
+    int new_y = static_cast<int>(floor(edge->pos.y * rot.x() +
+                                       edge->pos.x * rot.y() + 0.5));
+    edge->pos.x = new_x;
+    edge->pos.y = new_y;
+    edge = edge->next;
+  } while (edge != loop);
+  SetupFromPos();
+}
+
+// Translates every point of the outline in place by vec, then rebuilds the
+// vectors, start and bounding box. loop must not be NULL.
+void TESSLINE::Move(const ICOORD vec) {
+  EDGEPT* edge = loop;
+  for (;;) {
+    edge->pos.x += vec.x();
+    edge->pos.y += vec.y();
+    edge = edge->next;
+    if (edge == loop)
+      break;  // Back at the first point: the whole ring has been moved.
+  }
+  SetupFromPos();
+}
+
+// Multiplies the coordinates of every point of the outline by factor in
+// place, rounding to the nearest integer, then rebuilds the vectors, start
+// and bounding box. loop must not be NULL.
+void TESSLINE::Scale(float factor) {
+  EDGEPT* edge = loop;
+  do {
+    const int scaled_x = static_cast<int>(floor(edge->pos.x * factor + 0.5));
+    const int scaled_y = static_cast<int>(floor(edge->pos.y * factor + 0.5));
+    edge->pos.x = scaled_x;
+    edge->pos.y = scaled_y;
+    edge = edge->next;
+  } while (edge != loop);
+  SetupFromPos();
+}
+
+// Recomputes, from the pos members alone, the vec of every point (the step
+// to the next point), the start of the outline and its bounding box.
+// loop must not be NULL.
+void TESSLINE::SetupFromPos() {
+  EDGEPT* edge = loop;
+  do {
+    EDGEPT* successor = edge->next;
+    edge->vec.x = successor->pos.x - edge->pos.x;
+    edge->vec.y = successor->pos.y - edge->pos.y;
+    edge = successor;
+  } while (edge != loop);
+  start = loop->pos;
+  ComputeBoundingBox();
+}
+
+// Recomputes topleft and botright from the positions of the points in the
+// loop, and resets start to the first point. A point is left out only when
+// both it and its predecessor are hidden. loop must not be NULL.
+void TESSLINE::ComputeBoundingBox() {
+  int left = MAX_INT32;
+  int bottom = MAX_INT32;
+  int right = -MAX_INT32;
+  int top = -MAX_INT32;
+
+  start = loop->pos;
+  EDGEPT* edge = loop;
+  do {
+    const bool ignore = edge->IsHidden() && edge->prev->IsHidden();
+    if (!ignore) {
+      const int x = edge->pos.x;
+      const int y = edge->pos.y;
+      if (x < left)
+        left = x;
+      if (x > right)
+        right = x;
+      if (y < bottom)
+        bottom = y;
+      if (y > top)
+        top = y;
+    }
+    edge = edge->next;
+  } while (edge != loop);
+  // Store the limits as the two corners.
+  topleft.x = left;
+  topleft.y = top;
+  botright.x = right;
+  botright.y = bottom;
+}
+
+// Finds the smallest and largest cross product of the outline points with
+// vec and returns them in min_xp and max_xp. Points are left out only when
+// both they and their predecessor are hidden. Geometrically the results are
+// the left and right edges of the outline perpendicular to vec, in units
+// that are scaled by the modulus of vec. loop must not be NULL.
+void TESSLINE::MinMaxCrossProduct(const TPOINT vec,
+                                  int* min_xp, int* max_xp) const {
+  *min_xp = MAX_INT32;
+  *max_xp = MIN_INT32;
+  EDGEPT* edge = loop;
+  do {
+    const bool ignore = edge->IsHidden() && edge->prev->IsHidden();
+    if (!ignore) {
+      int xp = CROSS(edge->pos, vec);
+      UpdateRange(xp, min_xp, max_xp);
+    }
+    edge = edge->next;
+  } while (edge != loop);
+}
+
+// Returns the stored corners of the outline as a TBOX.
+TBOX TESSLINE::bounding_box() const {
+  TBOX outline_box(topleft.x, botright.y, botright.x, topleft.y);
+  return outline_box;
+}
+
+// Draws the outline in the window, using child_color for holes and color
+// otherwise. The step leaving a hidden point is skipped by moving the
+// cursor instead of drawing. loop must not be NULL.
+void TESSLINE::plot(ScrollView* window, ScrollView::Color color,
+                    ScrollView::Color child_color) {
+  window->Pen(is_hole ? child_color : color);
+  window->SetCursor(start.x, start.y);
+  EDGEPT* edge = loop;
+  do {
+    const bool hidden = edge->IsHidden();
+    edge = edge->next;
+    if (!hidden)
+      window->DrawTo(edge->pos.x, edge->pos.y);
+    else
+      window->SetCursor(edge->pos.x, edge->pos.y);
+  } while (edge != loop);
+}
+
+// Converts each outline in the list, followed recursively by its children,
+// to a TESSLINE by polygonal approximation and appends it at *tail.
+// children is stored as the is_hole flag of every TESSLINE made from this
+// list; the recursion always passes true for child lists.
+// Returns the new tail of the resulting list of TESSLINEs.
+static TESSLINE** ApproximateOutlineList(C_OUTLINE_LIST* outlines,
+                                         bool children,
+                                         TESSLINE** tail) {
+  C_OUTLINE_IT it(outlines);
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    C_OUTLINE* src_outline = it.data();
+    TESSLINE* approx = ApproximateOutline(src_outline);
+    approx->is_hole = children;
+    // Append to the list and advance the tail to the new end.
+    *tail = approx;
+    tail = &approx->next;
+    if (src_outline->child()->empty())
+      continue;
+    tail = ApproximateOutlineList(src_outline->child(), true, tail);
+  }
+  return tail;
+}
+
+// Factory to build a TBLOB from a C_BLOB with polygonal
+// approximation along the way.
+// Returns a newly allocated TBLOB whose outlines are the approximations of
+// the outlines of src, each followed by its children marked as holes.
+TBLOB* TBLOB::PolygonalCopy(C_BLOB* src) {
+  TBLOB* tblob = new TBLOB;
+  ApproximateOutlineList(src->out_list(), false, &tblob->outlines);
+  return tblob;
+}
+
+// Copies the data and the outline, but leaves next untouched.
+// Any outlines already owned by this are deleted first, and each outline of
+// src is duplicated in order. Copying an object onto itself is a no-op.
+void TBLOB::CopyFrom(const TBLOB& src) {
+  // Clear() below would otherwise delete src.outlines before the copy.
+  if (this == &src)
+    return;
+  Clear();
+  TESSLINE* prev_outline = NULL;
+  for (TESSLINE* srcline = src.outlines; srcline != NULL;
+       srcline = srcline->next) {
+    TESSLINE* new_outline = new TESSLINE(*srcline);
+    // The first copy becomes the head; later ones are chained to the tail.
+    if (outlines == NULL)
+      outlines = new_outline;
+    else
+      prev_outline->next = new_outline;
+    prev_outline = new_outline;
+  }
+}
+
+// Deletes every outline in the list and leaves the blob without outlines.
+void TBLOB::Clear() {
+  while (outlines != NULL) {
+    TESSLINE* doomed = outlines;
+    outlines = outlines->next;
+    delete doomed;
+  }
+}
+
+// Rotates each outline of the blob in place by the given rotation.
+void TBLOB::Rotate(const FCOORD rotation) {
+  TESSLINE* outline = outlines;
+  while (outline != NULL) {
+    outline->Rotate(rotation);
+    outline = outline->next;
+  }
+}
+
+// Translates each outline of the blob in place by vec.
+void TBLOB::Move(const ICOORD vec) {
+  TESSLINE* outline = outlines;
+  while (outline != NULL) {
+    outline->Move(vec);
+    outline = outline->next;
+  }
+}
+
+// Scales each outline of the blob in place by factor.
+void TBLOB::Scale(float factor) {
+  TESSLINE* outline = outlines;
+  while (outline != NULL) {
+    outline->Scale(factor);
+    outline = outline->next;
+  }
+}
+
+// Asks each outline of the blob to recompute its own bounding box.
+void TBLOB::ComputeBoundingBoxes() {
+  TESSLINE* outline = outlines;
+  while (outline != NULL) {
+    outline->ComputeBoundingBox();
+    outline = outline->next;
+  }
+}
+
+// Counts the outlines in the list by walking it from head to tail.
+int TBLOB::NumOutlines() const {
+  int count = 0;
+  TESSLINE* outline = outlines;
+  while (outline != NULL) {
+    ++count;
+    outline = outline->next;
+  }
+  return count;
+}
+
+// Returns the bounding box of the blob, as computed by blob_bounding_box
+// from the stored boxes of its outlines.
+TBOX TBLOB::bounding_box() const {
+  TPOINT top_left_pt;
+  TPOINT bot_right_pt;
+  blob_bounding_box(this, &top_left_pt, &bot_right_pt);
+  return TBOX(top_left_pt.x, bot_right_pt.y, bot_right_pt.x, top_left_pt.y);
+}
+
+// Draws every outline of the blob in the window, passing on color and
+// child_color to each of them.
+void TBLOB::plot(ScrollView* window, ScrollView::Color color,
+                 ScrollView::Color child_color) {
+  TESSLINE* outline = outlines;
+  while (outline != NULL) {
+    outline->plot(window, color, child_color);
+    outline = outline->next;
+  }
+}
+
+// Factory that builds a new TWERD from a (C_BLOB) WERD, converting each
+// C_BLOB, in order, to a polygonally approximated TBLOB. The latin_script
+// flag is taken from the W_SCRIPT_IS_LATIN flag of src.
+TWERD* TWERD::PolygonalCopy(WERD* src) {
+  TWERD* tessword = new TWERD;
+  tessword->latin_script = src->flag(W_SCRIPT_IS_LATIN);
+  // Points at the link in which the next converted blob has to be stored.
+  TBLOB** tail = &tessword->blobs;
+  C_BLOB_IT blob_it(src->cblob_list());
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    TBLOB* tblob = TBLOB::PolygonalCopy(blob_it.data());
+    *tail = tblob;
+    tail = &tblob->next;
+  }
+  return tessword;
+}
+
+// Normalize in-place and record the normalization in the DENORM.
+// Each blob is translated, scaled and finally moved up by
+// kBlnBaselineOffset. In numeric_mode every blob gets its own clipped scale
+// factor and is shifted by its own bottom, and both are recorded in a
+// DENORM_SEG. Otherwise the word-wide scale is used together with the
+// y-shift that the DENORM gives for the x centre of the blob.
+// If denorm is not NULL it receives a copy of the resulting DENORM.
+void TWERD::Normalize(ROW* row, float x_height, bool numeric_mode,
+                      DENORM* denorm) {
+  TBOX word_box = bounding_box();
+  DENORM antidote((word_box.left() + word_box.right()) / 2.0,
+                  kBlnXHeight / x_height, row);
+  if (row == NULL) {
+    // Without a row, replace the DENORM by one built from the same origin
+    // and scale and the bottom of the word box.
+    antidote = DENORM(antidote.origin(), antidote.scale(), 0.0,
+                      word_box.bottom(), 0, NULL, false, NULL);
+  }
+  int num_segments = 0;
+  DENORM_SEG *segs = new DENORM_SEG[NumBlobs()];
+  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
+    TBOX blob_box = blob->bounding_box();
+    ICOORD translation(-static_cast<int>(floor(antidote.origin() + 0.5)),
+                       -blob_box.bottom());
+    float factor = antidote.scale();
+    if (numeric_mode) {
+      factor = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()),
+                           factor, factor * 1.5f);
+      // blob_box is still valid here as the blob is not moved until below,
+      // so there is no need to compute the bounding box a second time.
+      segs[num_segments].xstart = blob_box.left();
+      segs[num_segments].ycoord = blob_box.bottom();
+      segs[num_segments++].scale_factor = factor;
+    } else {
+      float blob_x_center = (blob_box.left() + blob_box.right()) / 2.0;
+      float y_shift = antidote.yshift_at_orig_x(blob_x_center);
+      translation.set_y(-static_cast<int>(floor(y_shift + 0.5)));
+    }
+    blob->Move(translation);
+    blob->Scale(factor);
+    blob->Move(ICOORD(0, kBlnBaselineOffset));
+  }
+  if (num_segments > 0) {
+    antidote.set_segments(segs, num_segments);
+  }
+  delete [] segs;
+  if (denorm != NULL)
+    *denorm = antidote;
+}
+
+// Copies the data and the blobs, but leaves next untouched.
+// Any blobs already owned by this are deleted first, and each blob of src
+// is duplicated in order. Copying an object onto itself is a no-op.
+void TWERD::CopyFrom(const TWERD& src) {
+  // Clear() below would otherwise delete src.blobs before the copy.
+  if (this == &src)
+    return;
+  Clear();
+  latin_script = src.latin_script;
+  TBLOB* prev_blob = NULL;
+  for (TBLOB* srcblob = src.blobs; srcblob != NULL; srcblob = srcblob->next) {
+    TBLOB* new_blob = new TBLOB(*srcblob);
+    // The first copy becomes the head; later ones are chained to the tail.
+    if (blobs == NULL)
+      blobs = new_blob;
+    else
+      prev_blob->next = new_blob;
+    prev_blob = new_blob;
+  }
+}
+
+// Deletes every blob in the list and leaves the word without blobs.
+void TWERD::Clear() {
+  while (blobs != NULL) {
+    TBLOB* doomed = blobs;
+    blobs = blobs->next;
+    delete doomed;
+  }
+}
+
+// Asks each blob of the word to recompute the bounding boxes of its
+// outlines.
+void TWERD::ComputeBoundingBoxes() {
+  TBLOB* blob = blobs;
+  while (blob != NULL) {
+    blob->ComputeBoundingBoxes();
+    blob = blob->next;
+  }
+}
+
+// Returns the union of the bounding boxes of all the blobs in the word,
+// which is a default constructed TBOX if there are no blobs.
+TBOX TWERD::bounding_box() const {
+  TBOX word_box;
+  TBLOB* blob = blobs;
+  while (blob != NULL) {
+    TBOX blob_box = blob->bounding_box();
+    word_box += blob_box;
+    blob = blob->next;
+  }
+  return word_box;
+}
+
+// Merges the blobs from start to end, not including end, and deletes
+// the blobs between start and end.
+// The outlines of blobs start+1 to end-1 are appended, in order, to the
+// outline list of blob start, and the emptied blobs are unlinked from the
+// word before they are deleted. Does nothing if blob start does not exist
+// or is the last blob.
+void TWERD::MergeBlobs(int start, int end) {
+  TBLOB* blob = blobs;
+  for (int i = 0; i < start && blob != NULL; ++i)
+    blob = blob->next;
+  if (blob == NULL || blob->next == NULL)
+    return;
+  TBLOB* next_blob = blob->next;
+  TESSLINE* outline = blob->outlines;
+  for (int i = start + 1; i < end && next_blob != NULL; ++i) {
+    // Take the outlines from the next blob.
+    if (outline == NULL) {
+      blob->outlines = next_blob->outlines;
+      outline = blob->outlines;
+    } else {
+      while (outline->next != NULL)
+        outline = outline->next;
+      outline->next = next_blob->outlines;
+    }
+    // The outlines now belong to blob in both of the cases above, so detach
+    // them from next_blob to make sure that deleting it cannot free them.
+    next_blob->outlines = NULL;
+    // Delete the next blob and move on.
+    TBLOB* dead_blob = next_blob;
+    next_blob = next_blob->next;
+    blob->next = next_blob;
+    delete dead_blob;
+  }
+}
+
+// Draws every blob of the word in the window. The colours come from
+// WERD::NextColor, starting from the one after BLACK and advancing once per
+// blob, and holes are always drawn in BROWN.
+void TWERD::plot(ScrollView* window) {
+  ScrollView::Color blob_color = WERD::NextColor(ScrollView::BLACK);
+  TBLOB* blob = blobs;
+  while (blob != NULL) {
+    blob->plot(window, blob_color, ScrollView::BROWN);
+    blob_color = WERD::NextColor(blob_color);
+    blob = blob->next;
+  }
+}
+
 /**********************************************************************
 * blob_origin
 *
@ -61,32 +484,23 @@ void blob_origin(TBLOB *blob,       /*blob to compute on */
 * max coordinate value of the bounding boxes of all the top-level
 * outlines in the box.
 **********************************************************************/
-void blob_bounding_box(TBLOB *blob,               /*blob to compute on */
-                       register TPOINT *topleft,  /*bounding box */
-                       register TPOINT *botright) {
-  register TESSLINE *outline;    /*current outline */
+void blob_bounding_box(const TBLOB *blob,         // blob to compute on.
+                       TPOINT *topleft,           // bounding box.
+                       TPOINT *botright) {
+  register TESSLINE *outline;    // Current outline.

  if (blob == NULL || blob->outlines == NULL) {
    topleft->x = topleft->y = 0;
-    *botright = *topleft;        /*default value */
-  }
-  else {
+    *botright = *topleft;        // Default value.
+  } else {
+    // Start from the stored box of the first outline.
    outline = blob->outlines;
    *topleft = outline->topleft;
    *botright = outline->botright;
    for (outline = outline->next; outline != NULL; outline = outline->next) {
-      if (outline->topleft.x < topleft->x)
-                                 /*find extremes */
-        topleft->x = outline->topleft.x;
-      if (outline->botright.x > botright->x)
-                                 /*find extremes */
-        botright->x = outline->botright.x;
-      if (outline->topleft.y > topleft->y)
-                                 /*find extremes */
-        topleft->y = outline->topleft.y;
-      if (outline->botright.y < botright->y)
-                                 /*find extremes */
-        botright->y = outline->botright.y;
+      // Extend the box to include the stored box of this outline, first in
+      // x (left to right) and then in y (bottom to top).
+      UpdateRange(outline->topleft.x, outline->botright.x,
+                  &topleft->x, &botright->x);
+      UpdateRange(outline->botright.y, outline->topleft.y,
+                  &botright->y, &topleft->y);
    }
  }
 }
@ -100,11 +514,10 @@ void blob_bounding_box(TBLOB *blob,               /*blob to compute on */
 void blobs_bounding_box(TBLOB *blobs, TPOINT *topleft, TPOINT *botright) { 
  TPOINT tl;
  TPOINT br;
-  TBLOB *blob;
  /* Start with first blob */
  blob_bounding_box(blobs, topleft, botright); 

-  iterate_blobs(blob, blobs) { 
+  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { 
    blob_bounding_box(blob, &tl, &br); 

    if (tl.x < topleft->x)
@ -148,7 +561,6 @@ WIDTH_RECORD *blobs_widths(TBLOB *blobs) {  /*blob to compute on */
  WIDTH_RECORD *width_record;
  TPOINT topleft;                /*bounding box */
  TPOINT botright;
-  TBLOB *blob;                   /*blob to compute on */
  int i = 0;
  int blob_end;
  int num_blobs = count_blobs (blobs);
@ -162,7 +574,7 @@ WIDTH_RECORD *blobs_widths(TBLOB *blobs) {  /*blob to compute on */
  /* First width */
  blob_end = botright.x;

-  iterate_blobs (blob, blobs->next) {
+  for (TBLOB* blob = blobs->next; blob != NULL; blob = blob->next) {
    blob_bounding_box(blob, &topleft, &botright); 
    width_record->widths[i++] = topleft.x - blob_end;
    width_record->widths[i++] = botright.x - topleft.x;
@ -178,70 +590,102 @@ WIDTH_RECORD *blobs_widths(TBLOB *blobs) {  /*blob to compute on */
 * Return a count of the number of blobs attached to this one.
 **********************************************************************/
 int count_blobs(TBLOB *blobs) { 
-  TBLOB *b;
  int x = 0;

-  iterate_blobs (b, blobs) x++;
-  return (x);
+  for (TBLOB* b = blobs; b != NULL; b = b->next)
+    x++;
+  return x;
 }

-
 /**********************************************************************
- * delete_word
+ * divisible_blob
 *
- * Reclaim the memory taken by this word structure and all of its
- * lower level structures.
+ * Returns true if the blob contains multiple outlines that can be
+ * separated using divide_blobs. Sets the location to be used in the
+ * call to divide_blobs.
 **********************************************************************/
-void delete_word(TWERD *word) { 
-  TBLOB *blob;
-  TBLOB *nextblob;
-  TESSLINE *outline;
-  TESSLINE *nextoutline;
-  TESSLINE *child;
-  TESSLINE *nextchild;
-
-  for (blob = word->blobs; blob; blob = nextblob) {
-    nextblob = blob->next;
-
-    for (outline = blob->outlines; outline; outline = nextoutline) {
-      nextoutline = outline->next;
-
-      delete_edgepts (outline->loop);
-
-      for (child = outline->child; child; child = nextchild) {
-        nextchild = child->next;
-
-        delete_edgepts (child->loop);
-
-        oldoutline(child); 
+bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location) {
+  if (blob->outlines == NULL || blob->outlines->next == NULL)
+    return false;  // Need at least 2 outlines for it to be possible.
+  // Largest separation found so far between any pair of non-hole outlines.
+  int max_gap = 0;
+  TPOINT vertical = italic_blob ? kDivisibleVerticalItalic
+                                : kDivisibleVerticalUpright;
+  // Compare every pair of non-hole outlines. All positions are measured as
+  // cross products with the vertical vector.
+  for (TESSLINE* outline1 = blob->outlines; outline1 != NULL;
+       outline1 = outline1->next) {
+    if (outline1->is_hole)
+      continue;  // Holes do not count as separable.
+    TPOINT mid_pt1 = {(outline1->topleft.x + outline1->botright.x) / 2,
+                      (outline1->topleft.y + outline1->botright.y) / 2};
+    int mid_prod1 = CROSS(mid_pt1, vertical);
+    int min_prod1, max_prod1;
+    outline1->MinMaxCrossProduct(vertical, &min_prod1, &max_prod1);
+    for (TESSLINE* outline2 = outline1->next; outline2 != NULL;
+         outline2 = outline2->next) {
+      if (outline2->is_hole)
+        continue;  // Holes do not count as separable.
+      TPOINT mid_pt2 = {  (outline2->topleft.x + outline2->botright.x) / 2,
+                        (outline2->topleft.y + outline2->botright.y) / 2};
+      int mid_prod2 = CROSS(mid_pt2, vertical);
+      int min_prod2, max_prod2;
+      outline2->MinMaxCrossProduct(vertical, &min_prod2, &max_prod2);
+      // The pair is scored by the distance between the box centres, reduced
+      // by half of the overlap of the extents of the two outlines.
+      int mid_gap = abs(mid_prod2 - mid_prod1);
+      int overlap = MIN(max_prod1, max_prod2) - MAX(min_prod1, min_prod2);
+      if (mid_gap - overlap / 2 > max_gap) {
+        max_gap = mid_gap - overlap / 2;
+        // Best pair so far: report the midpoint of the two box centres.
+        *location = mid_pt1;
+        *location += mid_pt2;
+        *location /= 2;
      }
-      oldoutline(outline); 
    }
-    oldblob(blob); 
  }
-  if (word->correct != NULL)
-    strfree (word->correct);     /* Reclaim memory */
-  oldword(word); 
+  // Use the y component of the vertical vector as an approximation to its
+  // length.
+  return max_gap > vertical.y;
 }

-
 /**********************************************************************
- * delete_edgepts
+ * divide_blobs
 *
- * Delete a list of EDGEPT structures.
+ * Create two blobs by grouping the outlines in the appropriate blob.
+ * The outlines that are beyond the location point are moved to the
+ * other blob.  The ones whose x location is less than that point are
+ * retained in the original blob.
 **********************************************************************/
-void delete_edgepts(register EDGEPT *edgepts) { 
-  register EDGEPT *this_edge;
-  register EDGEPT *next_edge;
+void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob,
+                  const TPOINT& location) {
+  TPOINT vertical = italic_blob ? kDivisibleVerticalItalic
+                                : kDivisibleVerticalUpright;
+  // Tails of the outline lists being built for blob and for other_blob.
+  TESSLINE *outline1 = NULL;
+  TESSLINE *outline2 = NULL;

-  if (edgepts == NULL)
-    return;
+  // Detach the whole list from blob; the outlines are attached again, one
+  // at a time, to whichever blob they belong to.
+  TESSLINE *outline = blob->outlines;
+  blob->outlines = NULL;
+  int location_prod = CROSS(location, vertical);

-  this_edge = edgepts;
-  do {
-    next_edge = this_edge->next;
-    oldedgept(this_edge); 
-    this_edge = next_edge;
+  while (outline != NULL) {
+    // Classify the outline by the cross product of its box centre with the
+    // vertical vector, compared to that of the dividing location.
+    TPOINT mid_pt = {(outline->topleft.x + outline->botright.x) / 2,
+                     (outline->topleft.y + outline->botright.y) / 2};
+    int mid_prod = CROSS(mid_pt, vertical);
+    if (mid_prod < location_prod) {
+      // Outline is in left blob.
+      if (outline1)
+        outline1->next = outline;
+      else
+        blob->outlines = outline;
+      outline1 = outline;
+    } else {
+      // Outline is in right blob.
+      // NOTE(review): other_blob->outlines is only written when the first
+      // outline arrives here, so other_blob presumably has to start out
+      // without outlines - confirm against the callers.
+      if (outline2)
+        outline2->next = outline;
+      else
+        other_blob->outlines = outline;
+      outline2 = outline;
+    }
+    outline = outline->next;
  }
-  while (this_edge != edgepts);
+
+  // Terminate both of the new lists.
+  if (outline1)
+    outline1->next = NULL;
+  if (outline2)
+    outline2->next = NULL;
 }
--- a/ccstruct/blobs.h
+++ b/ccstruct/blobs.h
@ -29,18 +29,225 @@
 /*----------------------------------------------------------------------
              I n c l u d e s
 ----------------------------------------------------------------------*/
-#include                   "vecfuncs.h"
-#include  "tessclas.h"
+#include "rect.h"
+#include "vecfuncs.h"
+
+class C_BLOB;
+class DENORM;
+class ROW;
+class WERD;

 /*----------------------------------------------------------------------
              T y p e s
 ----------------------------------------------------------------------*/
+#define EDGEPTFLAGS     4        /*concavity,length etc. */
+
 typedef struct
 {                                /* Widths of pieces */
  int num_chars;
  int widths[1];
 } WIDTH_RECORD;

+// Integer point. Kept as a plain aggregate (no constructors) so that it can
+// still be brace-initialized.
+struct TPOINT {
+  inT16 x;                       // absolute x coord.
+  inT16 y;                       // absolute y coord.
+
+  // Adds other to this point, one coordinate at a time.
+  void operator+=(const TPOINT& other) {
+    x = static_cast<inT16>(x + other.x);
+    y = static_cast<inT16>(y + other.y);
+  }
+  // Divides both coordinates by divisor using integer division.
+  void operator/=(int divisor) {
+    x = static_cast<inT16>(x / divisor);
+    y = static_cast<inT16>(y / divisor);
+  }
+};
+typedef TPOINT VECTOR;           // structure for coordinates.
+
+// A point on an outline, doubly linked to its neighbours.
+struct EDGEPT {
+  // Creates an unlinked point with every flag cleared. pos and vec are
+  // not given a value here.
+  EDGEPT() : next(NULL), prev(NULL) {
+    memset(flags, 0, sizeof(flags));
+  }
+  // Creates an unlinked point carrying the data of src.
+  EDGEPT(const EDGEPT& src) : next(NULL), prev(NULL) {
+    CopyFrom(src);
+  }
+  // Takes the data of src; the links of this point stay as they are.
+  EDGEPT& operator=(const EDGEPT& src) {
+    CopyFrom(src);
+    return *this;
+  }
+  // Copies pos, vec and flags from src, but leaves next and prev untouched.
+  void CopyFrom(const EDGEPT& src) {
+    memcpy(flags, src.flags, sizeof(flags));
+    vec = src.vec;
+    pos = src.pos;
+  }
+  // flags[0] is the marker that hides a cut edge from (or reveals it to)
+  // the feature extractors.
+  void Hide() {
+    flags[0] = 1;
+  }
+  void Reveal() {
+    flags[0] = 0;
+  }
+  bool IsHidden() const {
+    return flags[0] != 0;
+  }
+
+  TPOINT pos;                    // position
+  VECTOR vec;                    // vector to next point
+  // TODO(rays) Remove flags and replace with
+  // is_hidden, runlength, dir, and fixed. The only use
+  // of the flags other than is_hidden is in polyaprx.cpp.
+  char flags[EDGEPTFLAGS];       // concavity, length etc
+  EDGEPT* next;                  // anticlockwise element
+  EDGEPT* prev;                  // clockwise element
+};
+
+// One outline of a TBLOB: a loop of EDGEPTs together with the corners of
+// its bounding box and a link to the next outline in the same blob.
+struct TESSLINE {
+  TESSLINE() : is_hole(false), loop(NULL), next(NULL) {}
+  // is_hole is initialized in the member list so that it never holds an
+  // indeterminate value, independently of what CopyFrom does with it.
+  TESSLINE(const TESSLINE& src)
+      : is_hole(src.is_hole), loop(NULL), next(NULL) {
+    CopyFrom(src);
+  }
+  ~TESSLINE() {
+    Clear();
+  }
+  TESSLINE& operator=(const TESSLINE& src) {
+    CopyFrom(src);
+    return *this;
+  }
+  // Consume the circular list of EDGEPTs to make a TESSLINE.
+  static TESSLINE* BuildFromOutlineList(EDGEPT* outline);
+  // Copies the data and the outline, but leaves next untouched.
+  void CopyFrom(const TESSLINE& src);
+  // Deletes owned data.
+  void Clear();
+  // Rotates by the given rotation in place.
+  void Rotate(const FCOORD rotation);
+  // Moves by the given vec in place.
+  void Move(const ICOORD vec);
+  // Scales by the given factor in place.
+  void Scale(float factor);
+  // Sets up the start and vec members of the loop from the pos members.
+  void SetupFromPos();
+  // Recomputes the bounding box from the points in the loop.
+  void ComputeBoundingBox();
+  // Computes the min and max cross product of the outline points with the
+  // given vec and returns the results in min_xp and max_xp. Geometrically
+  // this is the left and right edge of the outline perpendicular to the
+  // given direction, but to get the distance units correct, you would
+  // have to divide by the modulus of vec.
+  void MinMaxCrossProduct(const TPOINT vec, int* min_xp, int* max_xp) const;
+
+  TBOX bounding_box() const;
+  // Returns true if the point is contained within the outline box
+  // (edges inclusive). Only reads topleft and botright, so it is const
+  // and can be called through a const TESSLINE.
+  bool Contains(const TPOINT& pt) const {
+    return topleft.x <= pt.x && pt.x <= botright.x &&
+           botright.y <= pt.y && pt.y <= topleft.y;
+  }
+
+  void plot(ScrollView* window, ScrollView::Color color,
+            ScrollView::Color child_color);
+
+  // Returns the area of the box spanned by topleft and botright.
+  int BBArea() const {
+    return (botright.x - topleft.x) * (topleft.y - botright.y);
+  }
+
+  TPOINT topleft;                // Top left of loop.
+  TPOINT botright;               // Bottom right of loop.
+  TPOINT start;                  // Start of loop.
+  bool is_hole;                  // True if this is a hole/child outline.
+  EDGEPT *loop;                  // Edgeloop.
+  TESSLINE *next;                // Next outline in blob.
+};                               // Outline structure.
+
+// A blob: a singly linked list of TESSLINE outlines, itself chained to the
+// next blob through next.
+struct TBLOB {
+  TBLOB() : outlines(NULL), next(NULL) {}
+  TBLOB(const TBLOB& src) : outlines(NULL), next(NULL) {
+    CopyFrom(src);
+  }
+  ~TBLOB() {
+    Clear();
+  }
+  TBLOB& operator=(const TBLOB& src) {
+    CopyFrom(src);
+    return *this;
+  }
+  // Factory that makes a TBLOB out of a C_BLOB, applying polygonal
+  // approximation on the way.
+  static TBLOB* PolygonalCopy(C_BLOB* src);
+  // Copies the data and the outlines; next is left as it is.
+  void CopyFrom(const TBLOB& src);
+  // Deletes owned data.
+  void Clear();
+  // In-place rotation by the given rotation.
+  void Rotate(const FCOORD rotation);
+  // In-place translation by the given vec.
+  void Move(const ICOORD vec);
+  // In-place scaling by the given factor.
+  void Scale(float factor);
+  // Recomputes the bounding boxes of the outlines.
+  void ComputeBoundingBoxes();
+
+  // Returns the number of outlines.
+  int NumOutlines() const;
+
+  TBOX bounding_box() const;
+
+  void plot(ScrollView* window, ScrollView::Color color,
+            ScrollView::Color child_color);
+
+  // Returns the sum of the bounding box areas of all the outlines.
+  int BBArea() const {
+    int area_sum = 0;
+    const TESSLINE* current = outlines;
+    while (current != NULL) {
+      area_sum += current->BBArea();
+      current = current->next;
+    }
+    return area_sum;
+  }
+
+  TESSLINE *outlines;            // List of outlines in blob.
+  TBLOB *next;                   // Next blob in block.
+};                               // Blob structure.
+
+int count_blobs(TBLOB *blobs);
+
+// A word: a singly linked list of TBLOBs plus a script flag, itself chained
+// to the next word through next.
+struct TWERD {
+  TWERD() : blobs(NULL), latin_script(false), next(NULL) {}
+  // latin_script is initialized in the member list so that it never holds
+  // an indeterminate value, independently of what CopyFrom does with it.
+  TWERD(const TWERD& src)
+      : blobs(NULL), latin_script(src.latin_script), next(NULL) {
+    CopyFrom(src);
+  }
+  ~TWERD() {
+    Clear();
+  }
+  TWERD& operator=(const TWERD& src) {
+    CopyFrom(src);
+    return *this;
+  }
+  // Factory to build a TWERD from a (C_BLOB) WERD, with polygonal
+  // approximation along the way.
+  static TWERD* PolygonalCopy(WERD* src);
+  // Normalize in-place and record the normalization in the DENORM.
+  void Normalize(ROW* row, float x_height, bool numeric_mode, DENORM* denorm);
+  // Copies the data and the blobs, but leaves next untouched.
+  void CopyFrom(const TWERD& src);
+  // Deletes owned data.
+  void Clear();
+  // Recomputes the bounding boxes of the blobs.
+  void ComputeBoundingBoxes();
+
+  // Returns the number of blobs in the word.
+  int NumBlobs() const {
+    return count_blobs(blobs);
+  }
+  TBOX bounding_box() const;
+
+  // Merges the blobs from start to end, not including end, and deletes
+  // the blobs between start and end.
+  void MergeBlobs(int start, int end);
+
+  void plot(ScrollView* window);
+
+  TBLOB* blobs;                  // blobs in word.
+  bool latin_script;             // This word is in a latin-based script.
+  TWERD* next;                   // next word.
+};
+
 /*----------------------------------------------------------------------
              M a c r o s
 ----------------------------------------------------------------------*/
@ -55,13 +262,17 @@ if (w) memfree (w)
 /*----------------------------------------------------------------------
              F u n c t i o n s
 ----------------------------------------------------------------------*/
+// TODO(rays) TBLOB is now defined in this header and declares its own
+// bounding_box() member; fold this free function into that member.
+TBOX TBLOB_bounding_box(const TBLOB* blob);
+
 void blob_origin(TBLOB *blob,      /*blob to compute on */
                 TPOINT *origin);  /*return value */

                                 /*blob to compute on */
-void blob_bounding_box(TBLOB *blob,
-                       register TPOINT *topleft,  /*bounding box */
-                       register TPOINT *botright);
+void blob_bounding_box(const TBLOB *blob,
+                       TPOINT *topleft,  // Bounding box.
+                       TPOINT *botright);

 void blobs_bounding_box(TBLOB *blobs, TPOINT *topleft, TPOINT *botright); 

@ -71,49 +282,9 @@ void blobs_origin(TBLOB *blobs,     /*blob to compute on */
                                 /*blob to compute on */
 WIDTH_RECORD *blobs_widths(TBLOB *blobs); 

-int count_blobs(TBLOB *blobs); 
+bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location);

-void delete_word(TWERD *word); 
+void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob,
+                  const TPOINT& location);

-void delete_edgepts(register EDGEPT *edgepts); 
-
-/*
-#if defined(__STDC__) || defined(__cplusplus)
-# define	_ARGS(s) s
-#else
-# define	_ARGS(s) ()
-#endif*/
-
-/* blobs.c
-void blob_origin
-  _ARGS((BLOB *blob,
-  TPOINT *origin));
-
-void blob_bounding_box
-  _ARGS((BLOB *blob,
-  TPOINT *topleft,
-  TPOINT *botright));
-
-void blobs_bounding_box
-  _ARGS((BLOB *blobs,
-  TPOINT *topleft,
-  TPOINT *botright));
-
-void blobs_origin
-  _ARGS((BLOB *blobs,
-  TPOINT *origin));
-
-WIDTH_RECORD *blobs_widths
-  _ARGS((BLOB *blobs));
-
-int count_blobs
-  _ARGS((BLOB *blobs));
-
-void delete_word
-  _ARGS((TWERD *word));
-
-void delete_edgepts
-  _ARGS((EDGEPT *edgepts));
-#undef _ARGS
-*/
 #endif
--- a/ccstruct/blread.h
+++ b/ccstruct/blread.h
@ -20,7 +20,7 @@
 #ifndef           BLREAD_H
 #define           BLREAD_H

-#include          "varable.h"
+#include          "params.h"
 #include          "ocrblock.h"

 bool read_unlv_file(                    //print list of sides
--- a/ccstruct/boxword.cpp
+++ b/ccstruct/boxword.cpp
@ -0,0 +1,214 @@
+///////////////////////////////////////////////////////////////////////
+// File:        boxword.cpp
+// Description: Class to represent the bounding boxes of the output.
+// Author:      Ray Smith
+// Created:     Tue May 25 14:18:14 PDT 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "blobs.h"
+#include "boxword.h"
+#include "normalis.h"
+#include "ocrblock.h"
+#include "pageres.h"
+
+namespace tesseract {
+
+// Clip output boxes to input blob boxes for bounds that are within this
+// tolerance. Otherwise, the blob may be chopped and we have to just use
+// the word bounding box.
+const int kBoxClipTolerance = 2;
+
+// Creates an empty word: no boxes.
+BoxWord::BoxWord() : length_(0) {}
+
+// Copy constructor: all member setup is delegated to CopyFrom.
+BoxWord::BoxWord(const BoxWord& src) {
+  CopyFrom(src);
+}
+
+BoxWord::~BoxWord() {}
+
+// Assignment: replaces the contents of this word with a copy of src.
+BoxWord& BoxWord::operator=(const BoxWord& src) {
+  CopyFrom(src);
+  return *this;
+}
+
+// Makes this word a copy of src: the word bounding box, the box count and
+// every per-blob box.
+void BoxWord::CopyFrom(const BoxWord& src) {
+  // Copying onto itself (e.g. via "word = word") must be a no-op. Without
+  // this guard boxes_ would be cleared below and then read back through
+  // src, which is the same vector.
+  if (this == &src)
+    return;
+  bbox_ = src.bbox_;
+  length_ = src.length_;
+  boxes_.clear();
+  boxes_.reserve(length_);
+  for (int i = 0; i < length_; ++i)
+    boxes_.push_back(src.boxes_[i]);
+}
+
+// Factory returning a newly allocated BoxWord with one box per blob of
+// tessword. Each box bounds the outline points of its blob, leaving out
+// any point that is hidden and whose predecessor is hidden too.
+// If denorm is not NULL, every point is denormalized, rotated by the
+// re_rotation of the denorm's block (when there is one) and rounded, so
+// that the boxes are back in original image coordinates.
+BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
+                                     TWERD* tessword) {
+  const BLOCK* block = NULL;
+  if (denorm != NULL)
+    block = denorm->block();
+  BoxWord* result = new BoxWord();
+  // Count the blobs so the vector can be sized up front.
+  result->length_ = 0;
+  TBLOB* counted = tessword->blobs;
+  while (counted != NULL) {
+    ++result->length_;
+    counted = counted->next;
+  }
+  result->boxes_.reserve(result->length_);
+
+  for (TBLOB* blob = tessword->blobs; blob != NULL; blob = blob->next) {
+    TBOX blob_box;
+    for (TESSLINE* line = blob->outlines; line != NULL; line = line->next) {
+      // Walk once round the loop of edge points.
+      EDGEPT* const first = line->loop;
+      EDGEPT* pt = first;
+      do {
+        const bool visible = !pt->IsHidden() || !pt->prev->IsHidden();
+        if (visible) {
+          ICOORD pos(pt->pos.x, pt->pos.y);
+          if (denorm != NULL) {
+            FCOORD denormed(denorm->x(pt->pos.x),
+                            denorm->y(pt->pos.y, pt->pos.x));
+            if (block != NULL)
+              denormed.rotate(block->re_rotation());
+            pos.set_x(static_cast<inT16>(floor(denormed.x() + 0.5)));
+            pos.set_y(static_cast<inT16>(floor(denormed.y() + 0.5)));
+          }
+          TBOX pt_box(pos, pos);
+          blob_box += pt_box;
+        }
+        pt = pt->next;
+      } while (pt != first);
+    }
+    result->boxes_.push_back(blob_box);
+  }
+  result->ComputeBoundingBox();
+  return result;
+}
+
+// Factory returning a newly allocated BoxWord that holds the bounding box
+// of every blob in the given list, in list order.
+BoxWord* BoxWord::CopyFromPBLOBs(PBLOB_LIST* blobs) {
+  BoxWord* boxword = new BoxWord();
+  // Count the blobs.
+  boxword->length_ = blobs->length();
+  // Allocate memory.
+  boxword->boxes_.reserve(boxword->length_);
+  // Copy the boxes.
+  PBLOB_IT pb_it(blobs);
+  for (pb_it.mark_cycle_pt(); !pb_it.cycled_list(); pb_it.forward())
+    boxword->boxes_.push_back(pb_it.data()->bounding_box());
+  boxword->ComputeBoundingBox();
+  return boxword;
+}
+
+// Tidies the boxes that came from the polygonal approximation. Each box is
+// grown by one pixel on every side; then every edge that is within
+// kBoxClipTolerance of the matching edge of the combined original_word
+// blob boxes having a major_overlap with it is snapped to that edge.
+// Finally the box is intersected with the original_word bounding box.
+// If block is not NULL, it provides the inverse rotation that is applied
+// to the original blob boxes before they are compared.
+void BoxWord::ClipToOriginalWord(const BLOCK* block, WERD* original_word) {
+  for (int index = 0; index < length_; ++index) {
+    // Expand by a single pixel, as the poly approximation error is 1 pixel.
+    const TBOX tight = boxes_[index];
+    TBOX box(tight.left() - 1, tight.bottom() - 1,
+             tight.right() + 1, tight.top() + 1);
+    // Gather the union of the original boxes that match the expanded box.
+    TBOX original_box;
+    C_BLOB_IT b_it(original_word->cblob_list());
+    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+      TBOX blob_box = b_it.data()->bounding_box();
+      if (block != NULL)
+        blob_box.rotate(block->re_rotation());
+      if (!blob_box.major_overlap(box))
+        continue;
+      original_box += blob_box;
+    }
+    if (!original_box.null_box()) {
+      // The four sides are independent, so each is snapped on its own.
+      const bool snap_left =
+          NearlyEqual<int>(original_box.left(), box.left(), kBoxClipTolerance);
+      const bool snap_right =
+          NearlyEqual<int>(original_box.right(), box.right(),
+                           kBoxClipTolerance);
+      const bool snap_top =
+          NearlyEqual<int>(original_box.top(), box.top(), kBoxClipTolerance);
+      const bool snap_bottom =
+          NearlyEqual<int>(original_box.bottom(), box.bottom(),
+                           kBoxClipTolerance);
+      if (snap_left)
+        box.set_left(original_box.left());
+      if (snap_right)
+        box.set_right(original_box.right());
+      if (snap_top)
+        box.set_top(original_box.top());
+      if (snap_bottom)
+        box.set_bottom(original_box.bottom());
+    }
+    boxes_[index] = box.intersection(original_word->bounding_box());
+  }
+  ComputeBoundingBox();
+}
+
+// Replaces the boxes in [start, end) by a single box that is their union.
+// Both indices are first clipped to [0, length_]; nothing happens unless
+// at least two boxes are covered.
+void BoxWord::MergeBoxes(int start, int end) {
+  start = ClipToRange(start, 0, length_);
+  end = ClipToRange(end, 0, length_);
+  // Number of entries that disappear from the vector.
+  const int shrinkage = end - 1 - start;
+  if (shrinkage <= 0)
+    return;
+  TBOX& merged = boxes_[start];
+  for (int src = start + 1; src < end; ++src)
+    merged += boxes_[src];
+  length_ -= shrinkage;
+  // Close the gap left by the merged boxes.
+  for (int dest = start + 1; dest < length_; ++dest)
+    boxes_[dest] = boxes_[dest + shrinkage];
+  boxes_.truncate(length_);
+}
+
+// Puts box in front of the box at the given index, or at the end when
+// index is not below the current length. The word bounding box is then
+// recomputed.
+void BoxWord::InsertBox(int index, const TBOX& box) {
+  if (index >= length_) {
+    boxes_.push_back(box);
+  } else {
+    boxes_.insert(box, index);
+  }
+  length_ = boxes_.size();
+  ComputeBoundingBox();
+}
+
+// Removes the box at the given index, which must be a valid index, and
+// closes the gap. The word bounding box is then recomputed.
+void BoxWord::DeleteBox(int index) {
+  ASSERT_HOST(0 <= index && index < length_);
+  boxes_.remove(index);
+  length_ -= 1;
+  ComputeBoundingBox();
+}
+
+// Sets bbox_ to the union of the first length_ boxes.
+void BoxWord::ComputeBoundingBox() {
+  TBOX total;
+  for (int index = 0; index < length_; ++index)
+    total += boxes_[index];
+  bbox_ = total;
+}
+
+// This word and other are supposed to describe the same blobs. Walks both
+// in step and runs the (permanent) callback with the blob index wherever
+// the bounding box of the blob in other equals the box stored here.
+// The callback is deleted before returning.
+void BoxWord::ProcessMatchedBlobs(const TWERD& other,
+                                  TessCallback1<int>* cb) const {
+  int index = 0;
+  TBLOB* blob = other.blobs;
+  while (index < length_ && blob != NULL) {
+    TBOX blob_box = blob->bounding_box();
+    if (blob_box == boxes_[index])
+      cb->Run(index);
+    ++index;
+    blob = blob->next;
+  }
+  delete cb;
+}
+
+}  // namespace tesseract.
+
+
--- a/ccstruct/boxword.h
+++ b/ccstruct/boxword.h
@ -0,0 +1,98 @@
+///////////////////////////////////////////////////////////////////////
+// File:        boxword.h
+// Description: Class to represent the bounding boxes of the output.
+// Author:      Ray Smith
+// Created:     Tue May 25 14:18:14 PDT 2010
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CSTRUCT_BOXWORD_H__
+#define TESSERACT_CSTRUCT_BOXWORD_H__
+
+#include "genericvector.h"
+#include "rect.h"
+
+class BLOCK;
+class DENORM;
+class PBLOB_LIST;
+struct TWERD;
+class WERD;
+class WERD_RES;
+
+namespace tesseract {
+
+// Class to hold an array of bounding boxes for an output word and
+// the bounding box of the whole word.
+class BoxWord {
+ public:
+  // Creates an empty word with no boxes.
+  BoxWord();
+  explicit BoxWord(const BoxWord& src);
+  ~BoxWord();
+
+  BoxWord& operator=(const BoxWord& src);
+
+  // Makes this word a copy of src.
+  void CopyFrom(const BoxWord& src);
+
+  // Factory to build a BoxWord from a TWERD and the DENORM to switch
+  // back to original image coordinates.
+  // If the denorm is not NULL, then the output is denormalized and rotated
+  // back to the original image coordinates.
+  static BoxWord* CopyFromNormalized(const DENORM* denorm,
+                                     TWERD* tessword);
+  // Factory to build a BoxWord from the bounding boxes of the given blobs.
+  static BoxWord* CopyFromPBLOBs(PBLOB_LIST* blobs);
+
+  // Clean up the bounding boxes from the polygonal approximation by
+  // expanding slightly, then clipping to the blobs from the original_word
+  // that overlap. If not null, the block provides the inverse rotation.
+  void ClipToOriginalWord(const BLOCK* block, WERD* original_word);
+
+  // Merges the boxes from start to end, not including end, and deletes
+  // the boxes between start and end.
+  void MergeBoxes(int start, int end);
+
+  // Inserts a new box before the given index.
+  // Recomputes the bounding box.
+  void InsertBox(int index, const TBOX& box);
+
+  // Deletes the box with the given index, and shuffles up the rest.
+  // Recomputes the bounding box.
+  void DeleteBox(int index);
+
+  // This and other putatively are the same, so call the (permanent) callback
+  // for each blob index where the bounding boxes match.
+  // The callback is deleted on completion.
+  void ProcessMatchedBlobs(const TWERD& other, TessCallback1<int>* cb) const;
+
+  // Returns the bounding box of the whole word.
+  const TBOX& bounding_box() const {
+    return bbox_;
+  }
+  // Returns the number of boxes. Returned by plain value: a top-level
+  // const on a by-value return type has no effect and only draws
+  // compiler warnings.
+  int length() const {
+    return length_;
+  }
+  // Returns the box stored at the given index.
+  const TBOX& BlobBox(int index) const {
+    return boxes_[index];
+  }
+
+ private:
+  // Recomputes bbox_ from boxes_.
+  void ComputeBoundingBox();
+
+  TBOX bbox_;                  // Bounding box of the whole word.
+  int length_;                 // Number of boxes in use.
+  GenericVector<TBOX> boxes_;  // One box per blob.
+};
+
+}  // namespace tesseract.
+
+
+#endif  // TESSERACT_CSTRUCT_BOXWORD_H__
--- a/ccstruct/callcpp.cpp
+++ b/ccstruct/callcpp.cpp
@ -18,20 +18,18 @@
 **********************************************************************/

 #include "mfcpch.h"
-#include "errcode.h"
+#include          "errcode.h"
 #ifdef __UNIX__
-#include <assert.h>
+#include          <assert.h>
 #include <stdarg.h>
 #endif
-#include <time.h>
-#include "memry.h"
-#include "scrollview.h"
-//#include          "evnts.h"
-#include "varable.h"
-#include "callcpp.h"
-#include "tprintf.h"
-//#include                                      "strace.h"
-#include "host.h"
+#include          <time.h>
+#include          "memry.h"
+#include          "scrollview.h"
+#include          "params.h"
+#include          "callcpp.h"
+#include          "tprintf.h"
+#include          "host.h"
 #include "unichar.h"

 // Include automatically generated configuration file if running autoconf.
@ -39,30 +37,6 @@
 #include "config_auto.h"
 #endif

-//extern "C" {
-
-INT_VAR (tess_cp_mapping0, 0, "Mappings for class pruner distance");
-INT_VAR (tess_cp_mapping1, 1, "Mappings for class pruner distance");
-INT_VAR (tess_cp_mapping2, 2, "Mappings for class pruner distance");
-INT_VAR (tess_cp_mapping3, 3, "Mappings for class pruner distance");
-INT_VAR (record_matcher_output, 0, "Record detailed matcher info");
-INT_VAR (il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
-double_VAR (permuter_pending_threshold, 0.0,
-"Worst conf for using pending dictionary");
-//Global matcher info from the class pruner.
-inT32 cp_maps[4];
-//Global info to control writes of matcher info
-char blob_answer[UNICHAR_LEN + 1]; //correct char
-char *word_answer;                 //correct word
-inT32 bits_in_states;              //no of bits in states
-
-void setup_cp_maps() {
-  cp_maps[0] = tess_cp_mapping0;
-  cp_maps[1] = tess_cp_mapping1;
-  cp_maps[2] = tess_cp_mapping2;
-  cp_maps[3] = tess_cp_mapping3;
-}
-
 void
 cprintf (                        //Trace printf
 const char *format, ...          //special message
@ -172,6 +146,3 @@ void reverse16(void *ptr) {
  *cptr = *(cptr + 1);
  *(cptr + 1) = tmp;
 }
-
-
-//};
--- a/ccstruct/ccstruct.cpp
+++ b/ccstruct/ccstruct.cpp
@ -19,6 +19,15 @@
 #include "ccstruct.h"

 namespace tesseract  {
+
+// APPROXIMATIONS of the fractions of the character cell taken by
+// the descenders, ascenders, and x-height. The three fractions sum to 1.
+const double CCStruct::kDescenderFraction = 0.25;
+const double CCStruct::kXHeightFraction = 0.5;
+const double CCStruct::kAscenderFraction = 0.25;
+const double CCStruct::kXHeightCapRatio = CCStruct::kXHeightFraction /
+    (CCStruct::kXHeightFraction + CCStruct::kAscenderFraction);  // 0.5 / 0.75
+
 CCStruct::CCStruct()
  : image_(this) {
 }
--- a/ccstruct/ccstruct.h
+++ b/ccstruct/ccstruct.h
@ -22,17 +22,21 @@
 #include "cutil.h"
 #include "image.h"

-class PBLOB;
-class DENORM;
-class WERD;
-class BLOB_CHOICE_LIST;
-
 namespace tesseract {
 class CCStruct : public CUtil {
 public:
  CCStruct();
  ~CCStruct();

+  // Globally accessible constants.
+  // APPROXIMATIONS of the fractions of the character cell taken by
+  // the descenders, ascenders, and x-height.
+  static const double kDescenderFraction;  // = 0.25;
+  static const double kXHeightFraction;    // = 0.5;
+  static const double kAscenderFraction;   // = 0.25;
+  // Derived value giving the x-height as a fraction of cap-height.
+  static const double kXHeightCapRatio;    // = XHeight/(XHeight + Ascender).
+
 protected:
  Image image_;
 };
@ -40,13 +44,5 @@ class CCStruct : public CUtil {
 class Tesseract;
 }  // namespace tesseract

-typedef void (tesseract::Tesseract::*POLY_MATCHER)
-  (PBLOB *, PBLOB *, PBLOB *, WERD *,
-   DENORM *, BLOB_CHOICE_LIST *, const char*);
-/*
-  typedef void (tesseract::Tesseract::*POLY_TESTER)
-  (const STRING&, PBLOB *, DENORM *, BOOL8, char *,
-  inT32, BLOB_CHOICE_LIST *);
-*/

 #endif  // TESSERACT_CCSTRUCT_CCSTRUCT_H__
--- a/ccstruct/coutln.cpp
+++ b/ccstruct/coutln.cpp
@ -18,11 +18,12 @@
 **********************************************************************/

 #include "mfcpch.h"
-#include          <string.h>
+#include <string.h>
 #ifdef __UNIX__
-#include          <assert.h>
+#include <assert.h>
 #endif
-#include          "coutln.h"
+#include "coutln.h"
+#include "allheaders.h"

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
@ -620,6 +621,23 @@ void C_OUTLINE::RemoveSmallRecursive(int min_size, C_OUTLINE_IT* it) {
  }
 }

+// Renders the outline to the given pix, with left and top being
+// the coords of the upper-left corner of the pix.
+// Every step with a vertical component inverts one pix row from column 0
+// up to the current x position; horizontal steps draw nothing.
+void C_OUTLINE::render(int left, int top, Pix* pix) {
+  ICOORD pos = start;
+  for (int stepindex = 0; stepindex < stepcount; ++stepindex) {
+    ICOORD next_step = step(stepindex);
+    const int dy = next_step.y();
+    if (dy != 0) {
+      // Downward steps use the row at pos.y(), upward steps the row above.
+      const int row = dy < 0 ? top - pos.y() : top - pos.y() - 1;
+      pixRasterop(pix, 0, row, pos.x() - left, 1,
+                  PIX_NOT(PIX_DST), NULL, 0, 0);
+    }
+    pos += next_step;
+  }
+}
+
 /**********************************************************************
 * C_OUTLINE::plot
 *
@ -628,15 +646,14 @@ void C_OUTLINE::RemoveSmallRecursive(int min_size, C_OUTLINE_IT* it) {

 #ifndef GRAPHICS_DISABLED
 void C_OUTLINE::plot(                //draw it
-                     ScrollView* window,  //window to draw in
-                     ScrollView::Color colour   //colour to draw in
+                     ScrollView* window,       // window to draw in
+                     ScrollView::Color colour  // colour to draw in
                    ) const {
-  inT16 stepindex;               //index to cstep
-  ICOORD pos;                    //current position
-  DIR128 stepdir;                //direction of step
-  DIR128 oldstepdir;             //previous stepdir
+  inT16 stepindex;               // index to cstep
+  ICOORD pos;                    // current position
+  DIR128 stepdir;                // direction of step

-  pos = start;                   //current position
+  pos = start;                   // current position
  window->Pen(colour);
  if (stepcount == 0) {
    window->Rectangle(box.left(), box.top(), box.right(), box.bottom());
@ -645,19 +662,17 @@ void C_OUTLINE::plot(                //draw it
  window->SetCursor(pos.x(), pos.y());

  stepindex = 0;
-  stepdir = step_dir (0);        //get direction
  while (stepindex < stepcount) {
-    do {
-      pos += step (stepindex);   //step to next
-      stepindex++;               //count steps
-      oldstepdir = stepdir;
-                                 //new direction
-      stepdir = step_dir (stepindex);
+    pos += step(stepindex);    // step to next
+    stepdir = step_dir(stepindex);
+    stepindex++;               // count steps
+    // merge straight lines
+    while (stepindex < stepcount &&
+           stepdir.get_dir() == step_dir(stepindex).get_dir()) {
+      pos += step(stepindex);
+      stepindex++;
    }
-    while (stepindex < stepcount
-      && oldstepdir.get_dir () == stepdir.get_dir ());
-    //merge straight lines
-     window->DrawTo(pos.x(), pos.y());
+    window->DrawTo(pos.x(), pos.y());
  }
 }
 #endif
--- a/ccstruct/coutln.h
+++ b/ccstruct/coutln.h
@ -38,6 +38,7 @@ enum C_OUTLINE_FLAGS
 };

 class DLLSYM C_OUTLINE;          //forward declaration
+struct Pix;

 ELISTIZEH_S (C_OUTLINE)
 class DLLSYM C_OUTLINE:public ELIST_LINK
@ -149,6 +150,10 @@ class DLLSYM C_OUTLINE:public ELIST_LINK
    // then this is extracted from *it, so an iteration can continue.
    void RemoveSmallRecursive(int min_size, C_OUTLINE_IT* it);

+    // Renders the outline to the given pix, with left and top being
+    // the coords of the upper-left corner of the pix.
+    void render(int left, int top, Pix* pix);
+
    void plot(                       //draw one
              ScrollView* window,         //window to draw in
              ScrollView::Color colour) const;  //colour to draw it
--- a/ccstruct/detlinefit.cpp
+++ b/ccstruct/detlinefit.cpp
@ -103,6 +103,104 @@ double DetLineFit::Fit(ICOORD* pt1, ICOORD* pt2) {
  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
 }

+// Backwards compatible fit that reports the line as y = m * x + c.
+// Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
+// function in preference to the LMS class.
+// Returns the error reported by Fit(ICOORD*, ICOORD*). When the two fitted
+// points share the same x, both *m and *c are set to 0.
+double DetLineFit::Fit(float* m, float* c) {
+  ICOORD start, end;
+  double error = Fit(&start, &end);
+  const int dx = end.x() - start.x();
+  if (dx == 0) {
+    // Vertical line: no finite gradient to report.
+    *m = 0.0f;
+    *c = 0.0f;
+    return error;
+  }
+  *m = static_cast<float>(end.y() - start.y()) / dx;
+  *c = start.y() - *m * start.x();
+  return error;
+}
+
+// Helper function to compute a fictitious end point that is (to rounding)
+// on a line of the given gradient m through start and fits in an inT16.
+ICOORD ComputeEndFromGradient(const ICOORD& start, double m) {
+  if (m > 1.0 || m < -1.0) {
+    // dy dominates. Force it to have the opposite sign of start.y() and
+    // compute dx based on dy being as large as possible
+    int dx = static_cast<int>(floor(MAX_INT16 / m));  // dx for |dy| near max.
+    if (dx < 0) ++dx;  // Truncate towards 0.
+    if (start.y() > 0) dx = - dx;  // Force dy to be opposite to start.y().
+    // Constrain dx so the result fits in an inT16.
+    while (start.x() + dx > MAX_INT16 || start.x() + dx < -MAX_INT16)
+      dx /= 2;  // Reaches 0 eventually, so the loop terminates.
+    if (-1 <= dx && dx <= 1) {
+      return ICOORD(start.x(), start.y() + 1);  // Too steep for anything else.
+    }
+    int y = start.y() + static_cast<int>(floor(dx * m + 0.5));  // Rounded.
+    ASSERT_HOST(-MAX_INT16 <= y && y <= MAX_INT16);
+    return ICOORD(start.x() + dx, y);
+  } else {
+    // dx dominates. Force it to have the opposite sign of start.x() and
+    // compute dy based on dx being as large as possible.
+    int dy = static_cast<int>(floor(MAX_INT16 * m));  // 0 when m is 0.
+    if (dy < 0) ++dy;  // Truncate towards 0.
+    if (start.x() > 0) dy = - dy;  // Force dx to be opposite to start.x().
+    // Constrain dy so the result fits in an inT16.
+    while (start.y() + dy > MAX_INT16 || start.y() + dy < -MAX_INT16)
+      dy /= 2;  // Reaches 0 eventually, so the loop terminates.
+    if (-1 <= dy && dy <= 1) {
+      return ICOORD(start.x() + 1, start.y());  // Too flat for anything else.
+    }
+    int x = start.x() + static_cast<int>(floor(dy / m + 0.5));  // m != 0 here.
+    ASSERT_HOST(-MAX_INT16 <= x && x <= MAX_INT16);
+    return ICOORD(x, start.y() + dy);
+  }
+}
+
+// Backwards compatible constrained fit: gradient m is given, *c is computed.
+double DetLineFit::ConstrainedFit(double m, float* c) {
+  ICOORDELT_IT it(&pt_list_);
+  // Do something sensible with no points.
+  if (pt_list_.empty()) {
+    *c = 0.0f;
+    return 0.0;
+  }
+  // Count the points and find the first and last kNumEndPoints.
+  // Put the ends in a single array to make their use easier later.
+  ICOORD* pts[kNumEndPoints * 2];  // [0, kNum): first; [kNum, 2*kNum): last.
+  int pt_count = 0;
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    if (pt_count < kNumEndPoints) {
+      pts[pt_count] = it.data();
+      pts[kNumEndPoints + pt_count] = pts[pt_count];  // Also a "last" so far.
+    } else {
+      for (int i = 1; i < kNumEndPoints; ++i)
+        pts[kNumEndPoints + i - 1] = pts[kNumEndPoints + i];  // Shift down.
+      pts[kNumEndPoints * 2 - 1] = it.data();  // Newest point goes last.
+    }
+    ++pt_count;
+  }
+  while (pt_count < kNumEndPoints) {  // Fewer points than slots: pad.
+    pts[pt_count] = NULL;
+    pts[kNumEndPoints + pt_count++] = NULL;  // Note: also raises pt_count.
+  }
+  int* distances = new int[pt_count];  // Scratch space for ComputeErrors.
+  double best_uq = -1.0;  // Negative means no candidate evaluated yet.
+  // Iterate each pair of points and find the best fitting line.
+  for (int i = 0; i < kNumEndPoints * 2; ++i) {
+    ICOORD* start = pts[i];
+    if (start == NULL) continue;  // Padding slot.
+    ICOORD end = ComputeEndFromGradient(*start, m);
+    // Compute the upper quartile error from the line.
+    double dist = ComputeErrors(*start, end, distances);
+    if (dist < best_uq || best_uq < 0.0) {
+      best_uq = dist;
+      *c = start->y() - start->x() * m;  // Intercept of the line via start.
+    }
+  }
+  delete [] distances;
+  // Finally compute the square root to return the true distance.
+  return best_uq > 0.0 ? sqrt(best_uq) : best_uq;
+}
+
 // Comparator function used by the nth_item funtion.
 static int CompareInts(const void *p1, const void *p2) {
  const int* i1 = reinterpret_cast<const int*>(p1);
--- a/ccstruct/detlinefit.h
+++ b/ccstruct/detlinefit.h
@ -67,6 +67,14 @@ class DetLineFit {
  // points, and the upper quartile error.
  double Fit(ICOORD* pt1, ICOORD* pt2);

+  // Backwards compatible fit returning a gradient and constant.
+  // Deprecated. Prefer Fit(ICOORD*, ICOORD*) where possible, but use this
+  // function in preference to the LMS class.
+  double Fit(float* m, float* c);
+
+  // Backwards compatible constrained fit with a supplied gradient.
+  double ConstrainedFit(double m, float* c);
+
 private:
  double ComputeErrors(const ICOORD start, const ICOORD end, int* distances);

--- a/ccstruct/dppoint.cpp
+++ b/ccstruct/dppoint.cpp
@ -0,0 +1,98 @@
+/**********************************************************************
+ * File:        dppoint.cpp
+ * Description: Simple generic dynamic programming class.
+ * Author:      Ray Smith
+ * Created:     Wed Mar 25 19:08:01 PDT 2009
+ *
+ * (C) Copyright 2009, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "dppoint.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Runs the dynamic program over the size entries of points using cost_func.
+// For every point, cost_func is offered each predecessor that lies between
+// min_step and max_step (inclusive) entries back; a step that would reach
+// back beyond the start of the array is offered as a NULL predecessor.
+// Each point's local cost is added to its total once its steps are tried.
+// Returns the cheapest of the final min_step points, which is the tail of
+// the best path, or NULL if the arguments describe a degenerate problem.
+DPPoint* DPPoint::Solve(int min_step, int max_step, bool debug,
+                        CostFunc cost_func, int size, DPPoint* points) {
+  if (size <= 0 || max_step < min_step || min_step >= size) {
+    // Degenerate, but not necessarily an error.
+    return NULL;
+  }
+  ASSERT_HOST(min_step > 0);  // Infinite loop possible if this is not true.
+  if (debug) {
+    tprintf("min = %d, max=%d\n", min_step, max_step);
+  }
+  // Accumulate the best total cost of reaching each point in turn.
+  for (int index = 0; index < size; ++index) {
+    DPPoint* point = points + index;
+    for (int step = min_step; step <= max_step; ++step) {
+      DPPoint* from = NULL;
+      if (step <= index)
+        from = point - step;
+      inT64 step_cost = (point->*cost_func)(from);
+      // Find only the first minimum if going over twice the min.
+      if (step > min_step * 2 && point->best_prev_ != NULL &&
+          step_cost > point->total_cost_)
+        break;
+    }
+    // The cost functions exclude the point's own cost, so add it in now.
+    point->total_cost_ += point->local_cost_;
+    if (debug) {
+      tprintf("At point %d, local cost=%d, total_cost=%d, steps=%d\n",
+              index, point->local_cost_, point->total_cost_,
+              point->total_steps_);
+    }
+  }
+  // A path may end on any of the last min_step points: pick the cheapest,
+  // preferring the later point when costs are equal.
+  const int first_end = size - min_step;
+  int best_end = size - 1;
+  int lowest_cost = points[best_end].total_cost_;
+  int end = best_end;
+  while (--end >= first_end) {
+    const int end_cost = points[end].total_cost_;
+    if (end_cost < lowest_cost) {
+      lowest_cost = end_cost;
+      best_end = end;
+    }
+  }
+  return &points[best_end];
+}
+
+// A CostFunc that takes the variance of step into account in the cost.
+// prev is the candidate predecessor of this point in the same array, or
+// NULL (or this) when there is none. The candidate's values are passed to
+// UpdateIfBetter, which keeps them only if they lower the total cost.
+// Returns the total cost of reaching this point via prev, excluding this
+// point's local cost.
+inT64 DPPoint::CostWithVariance(const DPPoint* prev) {
+  if (prev == NULL || prev == this) {
+    // No predecessor: offer a path of zero cost and a single step.
+    UpdateIfBetter(0, 1, NULL, 0, 0, 0);
+    return 0;
+  }
+
+  // The step size is the distance between the two points in the array.
+  int delta = this - prev;
+  inT32 n = prev->n_ + 1;
+  inT32 sig_x = prev->sig_x_ + delta;
+  // Widen to 64 bits before squaring: the products overflow a 32 bit int
+  // as soon as the step, or the sum of the steps, exceeds 46340.
+  inT64 sig_xsq = prev->sig_xsq_ + static_cast<inT64>(delta) * delta;
+  // Variance of the step sizes along the path, in integer arithmetic.
+  inT64 cost = (sig_xsq - static_cast<inT64>(sig_x) * sig_x / n) / n;
+  cost += prev->total_cost_;
+  UpdateIfBetter(cost, prev->total_steps_ + 1, prev, n, sig_x, sig_xsq);
+  return cost;
+}
+
+// Overwrites the record of the best path to this point with the supplied
+// values, provided that cost is strictly lower than the current total_cost_.
+// Otherwise nothing is changed.
+void DPPoint::UpdateIfBetter(inT64 cost, inT32 steps, const DPPoint* prev,
+                             inT32 n, inT32 sig_x, inT64 sig_xsq) {
+  if (cost >= total_cost_)
+    return;  // The path already stored is at least as cheap.
+  total_cost_ = cost;
+  total_steps_ = steps;
+  best_prev_ = prev;
+  n_ = n;
+  sig_x_ = sig_x;
+  sig_xsq_ = sig_xsq;
+}
+
+}  // namespace tesseract.
+
--- a/ccstruct/dppoint.h
+++ b/ccstruct/dppoint.h
@ -0,0 +1,102 @@
+/**********************************************************************
+ * File:        dppoint.h
+ * Description: Simple generic dynamic programming class.
+ * Author:      Ray Smith
+ * Created:     Wed Mar 25 18:57:01 PDT 2009
+ *
+ * (C) Copyright 2009, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCSTRUCT_DPPOINT_H__
+#define TESSERACT_CCSTRUCT_DPPOINT_H__
+
+#include "host.h"
+
+namespace tesseract {
+
+// A small dynamic programming helper for 1st-order problems: the cost of
+// reaching a point depends only on the step taken to reach it and on the
+// best cost of the point stepped from. As a special case the variance of
+// the step sizes can form part of the cost. Only the top choice is kept.
+// Useful for problems such as finding the optimal cut points in a
+// fixed-pitch (vertical or horizontal) situation.
+// Skeletal Example:
+// DPPoint* array = new DPPoint[width];
+// for (int i = 0; i < width; i++) {
+//   array[i].AddLocalCost(cost_at_i);
+// }
+// const DPPoint* best_end = DPPoint::Solve(..., array);
+// while (best_end != NULL) {
+//   int cut_index = best_end - array;
+//   best_end = best_end->best_prev();
+// }
+// delete [] array;
+class DPPoint {
+ public:
+  // Type of a cost function. Called on a point with a candidate predecessor
+  // prev, it evaluates the total cost of arriving at this point via prev
+  // (excluding this point's local_cost) and, if that beats this point's
+  // total_cost, replaces the appropriate values in this point.
+  typedef inT64 (DPPoint::*CostFunc)(const DPPoint* prev);
+
+  DPPoint()
+    : local_cost_(0),
+      total_cost_(MAX_INT32),
+      total_steps_(1),
+      best_prev_(NULL),
+      n_(0),
+      sig_x_(0),
+      sig_xsq_(0) {}
+
+  // Solves the dynamic programming problem for the array of size points
+  // using cost_func. Steps backwards are limited to lie between min_step
+  // and max_step inclusive.
+  // Returns the tail of the best path.
+  static DPPoint* Solve(int min_step, int max_step, bool debug,
+                        CostFunc cost_func, int size, DPPoint* points);
+
+  // A CostFunc whose cost takes the variance of the steps into account.
+  inT64 CostWithVariance(const DPPoint* prev);
+
+  // Accessors.
+  int total_cost() const { return total_cost_; }
+  int Pathlength() const { return total_steps_; }
+  const DPPoint* best_prev() const { return best_prev_; }
+  // Adds new_cost to the cost of this point on its own.
+  void AddLocalCost(int new_cost) { local_cost_ += new_cost; }
+
+ private:
+  // Code common to different cost functions.
+
+  // Replaces the stored path values with the arguments, but only if cost
+  // is lower than the current total cost.
+  void UpdateIfBetter(inT64 cost, inT32 steps, const DPPoint* prev,
+                      inT32 n, inT32 sig_x, inT64 sig_xsq);
+
+  // Cost of this point on its own.
+  inT32 local_cost_;
+  // Sum of all costs in best path to here.
+  // During cost calculations local_cost is excluded.
+  inT32 total_cost_;
+  // Number of steps in best path to here.
+  inT32 total_steps_;
+  // Pointer to prev point in best path from here.
+  const DPPoint* best_prev_;
+  // Information for computing the variance part of the cost.
+  // Number of steps in best path to here for variance.
+  inT32 n_;
+  // Sum of step sizes for computing variance.
+  inT32 sig_x_;
+  // Sum of squares of steps for computing variance.
+  inT64 sig_xsq_;
+};
+
+}  // namespace tesseract.
+
+#endif  // TESSERACT_CCSTRUCT_DPPOINT_H__
+
--- a/ccstruct/labls.cpp
+++ b/ccstruct/labls.cpp
@ -1,193 +0,0 @@
-/**********************************************************************
- * File:        labls.c  (Formerly labels.c)
- * Description: Attribute definition tables
- * Author:					Sheelagh Lloyd?
- * Created:
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-
-// Include automatically generated configuration file if running autoconf.
-#ifdef HAVE_CONFIG_H
-#include "config_auto.h"
-#endif
-
-#include "mfcpch.h"
-#include          "hpdsizes.h"
-#include          "labls.h"
-
-/******************************************************************************
- * TEXT REGIONS
- *****************************************************************************/
-DLLSYM inT32 tn[NUM_TEXT_ATTR] = {
-  3,                             //T_HORIZONTAL
-  4,                             //T_TEXT
-  2,                             //T_SERIF
-  2,                             //T_PROPORTIONAL
-  2,                             //T_NORMAL
-  2,                             //T_UPRIGHT
-  2,                             //T_SOLID
-  3,                             //T_BLACK
-  2,                             //T_NOTUNDER
-  2,                             //T_NOTDROP
-};
-
-DLLSYM char tlabel[NUM_TEXT_ATTR][4][MAXLENGTH] = { {
-                                 //T_HORIZONTAL
-    "Horizontal",
-    "Vertical",
-    "Skew",
-    ""
-  },
-  {                              //T_TEXT
-    "Text",
-    "Table",
-    "Form",
-    "Mixed"
-  },
-  {                              //T_SERIF
-    "Serif",
-    "Sans-serif",
-    "",
-    ""
-  },
-  {                              //T_PROPORTIONAL
-    "Proportional",
-    "Fixed pitch",
-    "",
-    ""
-  },
-  {                              //T_NORMAL
-    "Normal",
-    "Bold",
-    "",
-    ""
-  },
-  {                              //T_UPRIGHT
-    "Upright",
-    "Italic",
-    "",
-    ""
-  },
-  {                              //T_SOLID
-    "Solid",
-    "Outline",
-    "",
-    ""
-  },
-  {                              //T_BLACK
-    "Black",
-    "White",
-    "Coloured",
-    ""
-  },
-  {                              //T_NOTUNDER
-    "Not underlined",
-    "Underlined",
-    "",
-    ""
-  },
-  {                              //T_NOTDROP
-    "Not drop caps",
-    "Drop Caps",
-    "",
-    ""
-  }
-};
-
-DLLSYM inT32 bn[NUM_BLOCK_ATTR] = {
-  4,                             //G_MONOCHROME
-  2,                             //I_MONOCHROME
-  2,                             //I_SMOOTH
-  3,                             //R_SINGLE
-  3,                             //R_BLACK
-  3,                             //S_BLACK
-  2                              //W_TEXT
-};
-
-DLLSYM inT32 tvar[NUM_TEXT_ATTR];
-DLLSYM inT32 bvar[NUM_BLOCK_ATTR];
-DLLSYM char blabel[NUM_BLOCK_ATTR][4][MAXLENGTH] = { {
-                                 //G_MONOCHROME
-
-  /****************************************************************************
-   *  GRAPHICS
-   ***************************************************************************/
-    "Monochrome ",
-    "Two colour ",
-    "Spot colour",
-    "Multicolour"
-  },
-
-  /****************************************************************************
-   *  IMAGE
-   ***************************************************************************/
-  {                              //I_MONOCHROME
-    "Monochrome ",
-    "Colour     ",
-    "",
-    ""
-  },
-  {                              //I_SMOOTH
-    "Smooth     ",
-    "Grainy     ",
-    "",
-    ""
-  },
-
-  /****************************************************************************
-   *  RULES
-   ***************************************************************************/
-  {                              //R_SINGLE
-    "Single  ",
-    "Double  ",
-    "Multiple",
-    ""
-  },
-  {                              //R_BLACK
-    "Black   ",
-    "White   ",
-    "Coloured",
-    ""
-  },
-
-  /****************************************************************************
-   *  SCRIBBLE
-   ***************************************************************************/
-  {                              //S_BLACK
-    "Black   ",
-    "White   ",
-    "Coloured",
-    ""
-  },
-  /****************************************************************************
-   *  WEIRD
-   ***************************************************************************/
-  {                              //W_TEXT
-    "No text      ",
-    "Contains text",
-    "",
-    ""
-  }
-};
-
-DLLSYM char backlabel[NUM_BACKGROUNDS][MAXLENGTH] = {
-  "White",                       //B_WHITE
-  "Black",                       //B_BLACK
-  "Coloured",                    //B_COLOURED
-  "Textured",                    //B_TEXTURED
-  "Patterned",                   //B_PATTERNED
-  "Gradient fill",               //B_GRADIENTFILL
-  "Image",                       //B_IMAGE
-  "Text"                         //B_TEXT
-};
--- a/ccstruct/labls.h
+++ b/ccstruct/labls.h
@ -1,38 +0,0 @@
-/**********************************************************************
- * File:        labls.h  (Formerly labels.h)
- * Description: Attribute definition tables
- * Author:					Sheelagh Lloyd?
- * Created:
- *
- * (C) Copyright 1993, Hewlett-Packard Ltd.
- ** Licensed under the Apache License, Version 2.0 (the "License");
- ** you may not use this file except in compliance with the License.
- ** You may obtain a copy of the License at
- ** http://www.apache.org/licenses/LICENSE-2.0
- ** Unless required by applicable law or agreed to in writing, software
- ** distributed under the License is distributed on an "AS IS" BASIS,
- ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ** See the License for the specific language governing permissions and
- ** limitations under the License.
- *
- **********************************************************************/
-#ifndef           LABLS_H
-#define           LABLS_H
-
-#include          "host.h"
-#include          "hpdsizes.h"
-
-#include          "hpddef.h"     //must be last (handpd.dll)
-
-extern DLLSYM inT32 tn[NUM_TEXT_ATTR];
-
-extern DLLSYM char tlabel[NUM_TEXT_ATTR][4][MAXLENGTH];
-
-extern DLLSYM inT32 bn[NUM_BLOCK_ATTR];
-
-extern DLLSYM inT32 tvar[NUM_TEXT_ATTR];
-extern DLLSYM inT32 bvar[NUM_BLOCK_ATTR];
-extern DLLSYM char blabel[NUM_BLOCK_ATTR][4][MAXLENGTH];
-
-extern DLLSYM char backlabel[NUM_BACKGROUNDS][MAXLENGTH];
-#endif
--- a/ccstruct/linlsq.cpp
+++ b/ccstruct/linlsq.cpp
@ -31,9 +31,8 @@ const ERRCODE EMPTY_LLSQ = "Can't delete from an empty LLSQ";

 #define EXTERN

-EXTERN double_VAR (pdlsq_posdir_ratio, 4e-6, "Mult of dir to cf pos");
-EXTERN double_VAR (pdlsq_threshold_angleavg, 0.1666666,
-"Frac of pi for simple fit");
+// Both constants are doubles, so they are written as double literals: a
+// float suffix would round them to float precision before widening.
+// Multiplier applied to the dir sums when they are combined with the pos
+// sums (Mult of dir to cf pos).
+const double kPdlsqPosdirRatio = 4e-6;
+// Fraction of pi that the angle error may reach before the fit falls back
+// to the simple average angle (Frac of pi for simple fit).
+const double kPdlsqThresholdAngleAvg = 0.1666666;

 /**********************************************************************
 * LLSQ::clear
@ -192,11 +191,11 @@ float PDLSQ::fit(                 //get fit

  if (pos.n > 0) {
    a = pos.sigxy - pos.sigx * pos.sigy / pos.n
-      + pdlsq_posdir_ratio * dir.sigxy;
+      + kPdlsqPosdirRatio * dir.sigxy;
    b =
      pos.sigxx - pos.sigyy + (pos.sigy * pos.sigy -
      pos.sigx * pos.sigx) / pos.n +
-      pdlsq_posdir_ratio * (dir.sigxx - dir.sigyy);
+      kPdlsqPosdirRatio * (dir.sigxx - dir.sigyy);
    if (dir.sigy != 0 || dir.sigx != 0)
      avg_angle = atan2 (dir.sigy, dir.sigx);
    else
@ -214,8 +213,8 @@ float PDLSQ::fit(                 //get fit
      error += M_PI;
      angle -= M_PI;
    }
-    if (error > M_PI * pdlsq_threshold_angleavg
-      || error < -M_PI * pdlsq_threshold_angleavg)
+    if (error > M_PI * kPdlsqThresholdAngleAvg ||
+        error < -M_PI * kPdlsqThresholdAngleAvg)
      angle = avg_angle;         //go simple
                                 //convert direction
    ang = (inT16) (angle * MODULUS / (2 * M_PI));
@ -227,7 +226,7 @@ float PDLSQ::fit(                 //get fit
    //                      a,b,angle,r);
    error = dir.sigxx * sinx * sinx + dir.sigyy * cosx * cosx
      - 2 * dir.sigxy * sinx * cosx;
-    error *= pdlsq_posdir_ratio;
+    error *= kPdlsqPosdirRatio;
    error += sinx * sinx * pos.sigxx + cosx * cosx * pos.sigyy
      - 2 * sinx * cosx * pos.sigxy
      - 2 * r * (sinx * pos.sigx - cosx * pos.sigy) + r * r * pos.n;
--- a/Show More
+++ b/Show More