From 627368df42e0318ba0e6a716f830a92cb49998d0 Mon Sep 17 00:00:00 2001
From: theraysmith <theraysmith@d0cd1f9f-072b-0410-8dd7-cf729c803f20>
Date: Wed, 18 Jul 2007 01:11:18 +0000
Subject: [PATCH] API/output changes to produce unlv-style latin-1 output and
 test scripts

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@86 d0cd1f9f-072b-0410-8dd7-cf729c803f20
---
 ccmain/applybox.cpp               | 290 ++++++++++++++++-------------
 ccmain/baseapi.cpp                | 293 ++++++++++++++++++++++++++++--
 ccmain/baseapi.h                  |  29 ++-
 ccmain/output.cpp                 | 224 +++++++++++++++--------
 ccmain/tesseractmain.cpp          |  92 ++++++++--
 ccstruct/blread.cpp               |   4 +-
 cutil/tordvars.cpp                |   2 +-
 tessdata/configs/makebox          |   1 +
 tessdata/configs/unlv             |   3 +
 tessdata/tessconfigs/batch        |  78 +-------
 tessdata/tessconfigs/batch.nochop |   2 +
 tessdata/tessconfigs/matdemo      |  74 --------
 tessdata/tessconfigs/msdemo       |  13 ++
 tessdata/tessconfigs/nobatch      |   2 +
 tessdata/tessconfigs/segdemo      |  64 -------
 testing/Makefile                  | 185 +++++++++++++++++++
 testing/Makefile.am               |   2 +
 testing/Makefile.in               | 185 +++++++++++++++++++
 testing/README                    |  43 +++++
 testing/counttestset.sh           |  61 +++++++
 testing/reorgdata.sh              |  44 +++++
 testing/reports/1995.bus.3B.sum   |   1 +
 testing/reports/1995.doe3.3B.sum  |   1 +
 testing/reports/1995.mag.3B.sum   |   1 +
 testing/reports/1995.news.3B.sum  |   1 +
 testing/runalltests.sh            | 110 +++++++++++
 testing/runtestset.sh             |  61 +++++++
 27 files changed, 1424 insertions(+), 442 deletions(-)
 create mode 100644 tessdata/configs/makebox
 create mode 100644 tessdata/configs/unlv
 create mode 100644 tessdata/tessconfigs/batch.nochop
 create mode 100644 tessdata/tessconfigs/msdemo
 create mode 100644 tessdata/tessconfigs/nobatch
 create mode 100644 testing/Makefile
 create mode 100644 testing/Makefile.am
 create mode 100644 testing/Makefile.in
 create mode 100644 testing/README
 create mode 100755 testing/counttestset.sh
 create mode 100755 testing/reorgdata.sh
 create mode 100644 testing/reports/1995.bus.3B.sum
 create mode 100644 testing/reports/1995.doe3.3B.sum
 create mode 100644 testing/reports/1995.mag.3B.sum
 create mode 100644 testing/reports/1995.news.3B.sum
 create mode 100755 testing/runalltests.sh
 create mode 100755 testing/runtestset.sh

diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp
index 41b482259..888acf4fb 100644
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@@ -24,20 +24,22 @@ what measures we are interested in.
 /* #define SECURE_NAMES done in secnames.h when necessary*/
 
 #include "mfcpch.h"
-#include          "applybox.h"
-#include          <ctype.h>
-#include          <string.h>
+#include "applybox.h"
+#include <ctype.h>
+#include <string.h>
 #ifdef __UNIX__
-#include          <assert.h>
-#include                    <errno.h>
+#include <assert.h>
+#include <errno.h>
 #endif
-#include          "mainblk.h"
-#include                   "genblob.h"
-#include                   "fixxht.h"
-#include          "control.h"
-#include          "tessbox.h"
-#include          "globals.h"
-#include          "secname.h"
+#include "mainblk.h"
+#include "genblob.h"
+#include "fixxht.h"
+#include "control.h"
+#include "tessbox.h"
+#include "globals.h"
+#include "secname.h"
+#include "unichar.h"
+#include "matchdefs.h"
 
 #define SECURE_NAMES
 #ifndef SECURE_NAMES
@@ -47,10 +49,13 @@ what measures we are interested in.
 #define EXTERN
 EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
 EXTERN INT_VAR (applybox_debug, 0, "Debug level");
-EXTERN STRING_VAR (applybox_test_exclusions, "|",
+EXTERN STRING_VAR (applybox_test_exclusions, "",
 "Chars ignored for testing");
 EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");
 
+// The unicharset used during box training
+static UNICHARSET unicharset_boxes;
+
 /*************************************************************************
  * The code re-assigns outlines to form words each with ONE labelled blob.
  * Noise is left in UNLABELLED words. The chars on the page are checked crudely
@@ -89,7 +94,7 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
   INT16 boxfile_lineno = 0;
   INT16 boxfile_charno = 0;
   BOX box;                       //boxfile box
-  char ch[2];                    //correct ch from boxfile
+  UNICHAR_ID uch_id;             //correct ch from boxfile
   ROW *row;
   ROW *prev_row = NULL;
   INT16 prev_box_right = MAX_INT16;
@@ -100,15 +105,20 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
   INT16 labels_ok;
   INT16 rows_ok;
   INT16 bad_blobs;
-  INT16 tgt_char_counts[128];    //No. of box samples
+  INT16 tgt_char_counts[MAX_NUM_CLASSES];    //No. of box samples
   //      INT16                                   labelled_char_counts[128];      //No. of unique labelled samples
   INT16 i;
   INT16 rebalance_count = 0;
-  char min_char;
+  UNICHAR_ID min_uch_id = 0;     // valid default: tidy_up may never assign it
   INT16 min_samples;
   INT16 final_labelled_blob_count;
 
-  for (i = 0; i < 128; i++)
+  // Clean the unichar set
+  unicharset_boxes.clear();
+  // Space character needed to represent NIL classification
+  unicharset_boxes.unichar_insert(" ");
+
+  for (i = 0; i < MAX_NUM_CLASSES; i++)
     tgt_char_counts[i] = 0;
 
   FILE* box_file;
@@ -120,11 +130,10 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
       filename.string(), errno);
   }
 
-  ch[1] = '\0';
   clear_any_old_text(block_list);
-  while (read_next_box (box_file, &box, &ch[0])) {
+  while (read_next_box (box_file, &box, &uch_id)) {
     box_count++;
-    tgt_char_counts[ch[0]]++;
+    tgt_char_counts[uch_id]++;
     row = find_row_of_box (block_list, box, block_id, row_id);
     if (box.left () < prev_box_right) {
       boxfile_lineno++;
@@ -135,14 +144,16 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
 
     if (row == NULL) {
       box_failures++;
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+      report_failed_box (boxfile_lineno, boxfile_charno, box,
+                         unicharset_boxes.id_to_unichar(uch_id),
         "FAILURE! box overlaps no blobs or blobs in multiple rows");
     }
     else {
       if ((box.left () >= prev_box_right) && (row != prev_row))
-        report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
           "WARNING! false row break");
-      box_failures += resegment_box (row, box, ch, block_id, row_id,
+      box_failures += resegment_box (row, box, uch_id, block_id, row_id,
         boxfile_lineno, boxfile_charno);
       prev_row = row;
     }
@@ -154,7 +165,7 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
           bad_blobs,
           tgt_char_counts,
           rebalance_count,
-          min_char,
+          &min_uch_id,
           min_samples,
           final_labelled_blob_count);
   tprintf ("APPLY_BOXES:\n");
@@ -163,7 +174,8 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
     labels_ok, rows_ok);
   tprintf ("   Box failures detected:		%6d\n", box_failures);
   tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
-  tprintf ("   \"%c\" has fewest samples:%6d\n", min_char, min_samples);
+  tprintf ("   \"%s\" has fewest samples:%6d\n",
+           unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
   tprintf ("				Total unlabelled words:   %6d\n",
     bad_blobs);
   tprintf ("				Final labelled words:     %6d\n",
@@ -194,7 +206,7 @@ void clear_any_old_text(                        //remove correct text
 
 BOOL8 read_next_box(FILE* box_file,  //
                     BOX *box,
-                    char *ch) {
+                    UNICHAR_ID *uch_id) {
   char buff[256];                //boxfile read buffer
   char *buffptr = buff;
   STRING box_filename;
@@ -204,23 +216,38 @@ BOOL8 read_next_box(FILE* box_file,  //
   INT32 x_max;
   INT32 y_max;
   INT32 count = 0;
+  char uch[256];
 
   while (!feof (box_file)) {
     fgets (buff, sizeof (buff) - 1, box_file);
     line++;
 
+    buffptr = buff;
+    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
+    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
+      buffptr += 3;  // Skip a leading UTF-8 byte-order mark (EF BB BF).
     /* Check for blank lines in box file */
-    for (buffptr = buff; isspace (*buffptr); buffptr++)
-      ;
+    while (isspace (static_cast<unsigned char>(*buffptr)))
+      buffptr++;  // Cast: plain char may be negative for UTF-8 bytes.
     if (*buffptr != '\0') {
       count =
-        sscanf (buff,
-        "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
-        INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
+        sscanf (buffptr,
+        "%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
+        INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
       if (count != 5) {
         tprintf ("Box file format error on line %i ignored\n", line);
       }
       else {
+        if (!unicharset_boxes.contains_unichar(uch))
+        {
+          unicharset_boxes.unichar_insert(uch);
+          if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
+            tprintf("Error: Size of unicharset of boxes is \
+greater than MAX_NUM_CLASSES\n");
+            exit(1);
+          }
+        }
+        *uch_id = unicharset_boxes.unichar_to_id(uch);
         *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
         return TRUE;             //read a box ok
       }
@@ -314,7 +341,7 @@ ROW *find_row_of_box(                         //
 INT16 resegment_box(  //
                     ROW *row,
                     BOX box,
-                    char *ch,
+                    UNICHAR_ID uch_id,
                     INT16 block_id,
                     INT16 row_id,
                     INT16 boxfile_lineno,
@@ -358,7 +385,7 @@ INT16 resegment_box(  //
                   if (applybox_debug > 4)
                     report_failed_box (boxfile_lineno,
                       boxfile_charno,
-                      box, ch,
+                      box, unicharset_boxes.id_to_unichar(uch_id),
                       "FAILURE! box overlaps blob in labelled word");
                 }
                 if (applybox_debug > 4)
@@ -375,7 +402,7 @@ INT16 resegment_box(  //
                 if (new_word == NULL) {
                                  /* Make a new word with a single blob */
                   new_word = word->shallow_copy ();
-                  new_word->set_text (ch);
+                  new_word->set_text (unicharset_boxes.id_to_unichar(uch_id));
                   if (polyg)
                     new_blob = new PBLOB;
                   else
@@ -414,63 +441,75 @@ INT16 resegment_box(  //
     word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
     baseline = row->base_line (word_x_centre);
 
-    if (STRING (chs_caps_ht).contains (ch[0]) &&
-      (new_word_box.top () <
-    baseline + (1 + applybox_error_band) * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! caps-ht char didn't ascend");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_odd_top).contains (ch[0]) &&
-      (new_word_box.top () <
-    baseline + (1 - applybox_error_band) * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! Odd top char below xht");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_x_ht).contains (ch[0]) &&
-      ((new_word_box.top () >
-      baseline + (1 + applybox_error_band) * row->x_height ()) ||
-      (new_word_box.top () <
-    baseline + (1 - applybox_error_band) * row->x_height ()))) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! x-ht char didn't have top near xht");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
-      ((new_word_box.bottom () <
-      baseline - applybox_error_band * row->x_height ()) ||
-      (new_word_box.bottom () >
-    baseline + applybox_error_band * row->x_height ()))) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! non ambig BL char didnt have bottom near baseline");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_odd_bot).contains (ch[0]) &&
-      (new_word_box.bottom () >
-    baseline + applybox_error_band * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! Odd bottom char above baseline");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_desc).contains (ch[0]) &&
-      (new_word_box.bottom () >
-    baseline - applybox_error_band * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+#if 0
+    if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
+      if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.top () <
+           baseline + (1 + applybox_error_band) * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! caps-ht char didn't ascend");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.top () <
+           baseline + (1 - applybox_error_band) * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! Odd top char below xht");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          ((new_word_box.top () >
+            baseline + (1 + applybox_error_band) * row->x_height ()) ||
+           (new_word_box.top () <
+            baseline + (1 - applybox_error_band) * row->x_height ()))) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! x-ht char didn't have top near xht");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_non_ambig_bl).contains
+          (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          ((new_word_box.bottom () <
+            baseline - applybox_error_band * row->x_height ()) ||
+           (new_word_box.bottom () >
+            baseline + applybox_error_band * row->x_height ()))) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! non ambig BL char didnt have bottom near baseline");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.bottom () >
+           baseline + applybox_error_band * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! Odd bottom char above baseline");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.bottom () >
+           baseline - applybox_error_band * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
         "FAILURE! Descender doesn't descend");
-      new_word->set_text ("");
-      return 1;
+        new_word->set_text ("");
+        return 1;
+      }
     }
+#endif
     return 0;
   }
   else {
-    report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-      "FAILURE! Couldn't find any blobs");
+    report_failed_box (boxfile_lineno, boxfile_charno, box,
+                       unicharset_boxes.id_to_unichar(uch_id),
+                       "FAILURE! Couldn't find any blobs");
     return 1;
   }
 }
@@ -492,7 +531,7 @@ void tidy_up(                         //
              INT16 &unlabelled_words,
              INT16 *tgt_char_counts,
              INT16 &rebalance_count,
-             char &min_char,
+             UNICHAR_ID *min_uch_id,
              INT16 &min_samples,
              INT16 &final_labelled_blob_count) {
   BLOCK_IT block_it(block_list);
@@ -507,16 +546,16 @@ void tidy_up(                         //
   BOOL8 row_ok;
   BOOL8 rebalance_needed = FALSE;
                                  //No. of unique labelled samples
-  INT16 labelled_char_counts[128];
+  INT16 labelled_char_counts[MAX_NUM_CLASSES];
   INT16 i;
-  char ch;
-  char prev_ch = '\0';
+  UNICHAR_ID uch_id;
+  UNICHAR_ID prev_uch_id = -1;
   BOOL8 at_dupe_of_prev_word;
   ROW *prev_row = NULL;
   INT16 left;
   INT16 prev_left = -1;
 
-  for (i = 0; i < 128; i++)
+  for (i = 0; i < MAX_NUM_CLASSES; i++)
     labelled_char_counts[i] = 0;
 
   ok_char_count = 0;
@@ -556,7 +595,7 @@ void tidy_up(                         //
               block_idx, row_idx, all_row_idx);
 
           ok_char_count++;
-          labelled_char_counts[*word->text ()]++;
+          labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++;
           row_ok = TRUE;
         }
       }
@@ -571,24 +610,24 @@ void tidy_up(                         //
   }
 
   min_samples = 9999;
-  for (i = 0; i < 128; i++) {
+  for (i = 0; i < unicharset_boxes.size(); i++) {
     if (tgt_char_counts[i] > labelled_char_counts[i]) {
       if (labelled_char_counts[i] <= 1) {
         tprintf
-          ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
-          labelled_char_counts[i], (char) i, tgt_char_counts[i]);
+          ("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d\n",
+          labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);
       }
       else {
         rebalance_needed = TRUE;
         if (applybox_debug > 0)
           tprintf
-            ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
-            (char) i, tgt_char_counts[i], labelled_char_counts[i]);
+            ("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",
+            unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);
       }
     }
     if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
       min_samples = labelled_char_counts[i];
-      min_char = (char) i;
+      *min_uch_id = i;
     }
   }
 
@@ -605,33 +644,36 @@ void tidy_up(                         //
         !word_it.cycled_list (); word_it.forward ()) {
           word = word_it.data ();
           left = word->bounding_box ().left ();
-          ch = *word->text ();
+          if (*word->text () != '\0')
+            uch_id = unicharset_boxes.unichar_to_id(word->text ());
+          else
+            uch_id = -1;
           at_dupe_of_prev_word = ((row == prev_row) &&
-            (left = prev_left) &&
-            (ch == prev_ch));
-          if ((ch != '\0') &&
-            (labelled_char_counts[ch] > 1) &&
-            (tgt_char_counts[ch] > labelled_char_counts[ch]) &&
+            (left == prev_left) &&  // compare; '=' clobbered left
+            (uch_id == prev_uch_id));
+          if ((uch_id != -1) &&
+            (labelled_char_counts[uch_id] > 1) &&
+            (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
           (!at_dupe_of_prev_word)) {
             /* Duplicate the word to rebalance the labelled samples */
             if (applybox_debug > 9) {
-              tprintf ("Duping \"%c\" from ", ch);
+              tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
               word->bounding_box ().print ();
             }
             duplicate_word = new WERD;
             *duplicate_word = *word;
             word_it.add_after_then_move (duplicate_word);
             rebalance_count++;
-            labelled_char_counts[ch]++;
+            labelled_char_counts[uch_id]++;
           }
           prev_row = row;
           prev_left = left;
-          prev_ch = ch;
+          prev_uch_id = uch_id;
         }
       }
     }
     rebalance_needed = FALSE;
-    for (i = 0; i < 128; i++) {
+    for (i = 0; i < unicharset_boxes.size(); i++) {
       if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
       (labelled_char_counts[i] > 1)) {
         rebalance_needed = TRUE;
@@ -653,7 +695,7 @@ void tidy_up(                         //
       for (word_it.mark_cycle_pt ();
       !word_it.cycled_list (); word_it.forward ()) {
         word = word_it.data ();
-        if ((strlen (word->text ()) == 1) &&
+        if ((strlen (word->text ()) > 0) &&
           (word->gblob_list ()->length () == 1))
           final_labelled_blob_count++;
       }
@@ -665,7 +707,7 @@ void tidy_up(                         //
 void report_failed_box(INT16 boxfile_lineno,
                        INT16 boxfile_charno,
                        BOX box,
-                       char *box_ch,
+                       const char *box_ch,
                        const char *err_msg) {
   if (applybox_debug > 4)
     tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
@@ -687,10 +729,9 @@ void apply_box_training(BLOCK_LIST *block_list) {
   PBLOB_IT blob_it;
   DENORM denorm;
   INT16 count = 0;
-  char ch[2];
-
-  ch[1] = '\0';
+  char unichar[UNICHAR_LEN + 1];
 
+  unichar[UNICHAR_LEN] = '\0';
   tprintf ("Generating training data\n");
   for (block_it.mark_cycle_pt ();
   !block_it.cycled_list (); block_it.forward ()) {
@@ -701,23 +742,22 @@ void apply_box_training(BLOCK_LIST *block_list) {
       for (word_it.mark_cycle_pt ();
       !word_it.cycled_list (); word_it.forward ()) {
         word = word_it.data ();
-        if ((strlen (word->text ()) == 1) &&
+        if ((strlen (word->text ()) > 0) &&
         (word->gblob_list ()->length () == 1)) {
-          /* Here is a word with a single char label and a single blob so train on it */
+          /* Here is a word with a single unichar label and a single blob so train on it */
           bln_word =
             make_bln_copy (word, row, row->x_height (), &denorm);
           blob_it.set_to_list (bln_word->blob_list ());
-          ch[0] = *word->text ();
+          strncpy(unichar, word->text (), UNICHAR_LEN);
           tess_training_tester (blob_it.data (),
                                  //single blob
             &denorm, TRUE,       //correct
-            ch,                  //correct ASCII char
-            1,                   //ASCII length
+            unichar,             //correct character
+            strlen(unichar),     //character length
             NULL);
           copy_outword = *(bln_word);
           copy_outword.baseline_denormalise (&denorm);
           blob_it.set_to_list (copy_outword.blob_list ());
-          ch[0] = *word->text ();
           delete bln_word;
           count++;
         }
@@ -793,7 +833,7 @@ void apply_box_testing(BLOCK_LIST *block_list) {
             choice list, outword blob lists and best_choice string are the same
             length. A TESS screw up is indicated by a blank filled or 0 length string.
           */
-          if ((best_choice->string ().length () == 0) ||
+          if ((best_choice->lengths ().length () == 0) ||
             (strspn (best_choice->string ().string (), " ") ==
           best_choice->string ().length ())) {
             rej_count++;
@@ -804,22 +844,22 @@ void apply_box_testing(BLOCK_LIST *block_list) {
             #endif
           }
           else {
-            if ((best_choice->string ().length () !=
+            if ((best_choice->lengths ().length () !=
               outword->blob_list ()->length ()) ||
-              (best_choice->string ().length () !=
+              (best_choice->lengths ().length () !=
             blob_choices.length ())) {
               tprintf
                 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
                 best_choice->string ().string (),
-                best_choice->string ().length (),
+                best_choice->lengths ().length (),
                 outword->blob_list ()->length (),
                 blob_choices.length ());
             }
-            ASSERT_HOST (best_choice->string ().length () ==
+            ASSERT_HOST (best_choice->lengths ().length () ==
               outword->blob_list ()->length ());
-            ASSERT_HOST (best_choice->string ().length () ==
+            ASSERT_HOST (best_choice->lengths ().length () ==
               blob_choices.length ());
-            fix_quotes ((char *) best_choice->string ().string (),
+            fix_quotes (best_choice,
                                  //turn to double
               outword, &blob_choices);
             if (strcmp (best_choice->string ().string (), ch) != 0) {
diff --git a/ccmain/baseapi.cpp b/ccmain/baseapi.cpp
index 86934533a..80cb53f58 100644
--- a/ccmain/baseapi.cpp
+++ b/ccmain/baseapi.cpp
@@ -27,6 +27,7 @@
 #include "applybox.h"
 #include "pgedit.h"
 #include "varabled.h"
+#include "output.h"
 #include "adaptmatch.h"
 
 BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
@@ -37,6 +38,8 @@ BOOL_VAR(tessedit_train_from_boxes, FALSE,
 // Minimum sensible image size to be worth running tesseract.
 const int kMinRectSize = 10;
 
+static STRING input_file = "noname.tif";
+
 // Start tesseract.
 // The datapath must be the name of the data directory or some other file
 // in which the data directory resides (for instance argv[0].)
@@ -70,6 +73,12 @@ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
   return result;
 }
 
+// Set the input file name that FindLines passes to pgeditor_read_file
+// (default "noname.tif"). Needed only for training and UNLV zone files.
+void TessBaseAPI::SetInputName(const char* name) {
+  input_file = name;
+}
+
 // Recognize a rectangle from an image and return the result as a string.
 // May be called many times for a single Init.
 // Currently has no error checking.
@@ -96,6 +105,52 @@ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
   return RecognizeToString();
 }
 
+// Variant of TesseractRect that returns box-file text for the rectangle.
+// Returns NULL without doing any work when the rectangle is too small.
+char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
+                                      int bytes_per_pixel,
+                                      int bytes_per_line,
+                                      int left, int top,
+                                      int width, int height,
+                                      int imageheight) {
+  const bool too_small = width < kMinRectSize || height < kMinRectSize;
+  if (too_small) {
+    return NULL;
+  }
+  // Copy/threshold the rectangle into the tesseract global page_image.
+  CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
+                       left, top, width, height);
+  // Find the text lines, then run the main recognition on them.
+  BLOCK_LIST blocks;
+  FindLines(&blocks);
+  PAGE_RES* results = Recognize(&blocks, NULL);
+  // Rectangle's bottom edge, measured up from the bottom of the full image.
+  const int bottom = imageheight - (top + height);
+  return TesseractToBoxText(results, left, bottom);
+}
+
+// Variant of TesseractRect whose result is produced by TesseractToUNLV.
+// Returns NULL without doing any work when the rectangle is too small.
+char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
+                                     int bytes_per_pixel,
+                                     int bytes_per_line,
+                                     int left, int top,
+                                     int width, int height) {
+  const bool too_small = width < kMinRectSize || height < kMinRectSize;
+  if (too_small) {
+    return NULL;
+  }
+  // Copy/threshold the rectangle into the tesseract global page_image.
+  CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
+                       left, top, width, height);
+  // Find the text lines, then run the main recognition on them.
+  BLOCK_LIST blocks;
+  FindLines(&blocks);
+  PAGE_RES* results = Recognize(&blocks, NULL);
+  // Hand the recognition results to the UNLV text converter.
+  return TesseractToUNLV(results);
+}
+
 // Call between pages or documents etc to free up memory and forget
 // adaptive data.
 void TessBaseAPI::ClearAdaptiveClassifier() {
@@ -326,7 +381,7 @@ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
   image.capture(const_cast<unsigned char*>(imagedata),
                 bytes_per_line*8, top + height, 1);
   page_image.create(width, height, 1);
-  copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false);
+  copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
 }
 
 // Low-level function to recognize the current global image to a string.
@@ -343,7 +398,6 @@ char* TessBaseAPI::RecognizeToString() {
 
 // Find lines from the image making the BLOCK_LIST.
 void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
-  STRING input_file = "noname.tif";
   // The following call creates a full-page block and then runs connected
   // component analysis and text line creation.
   pgeditor_read_file(input_file, block_list);
@@ -369,21 +423,32 @@ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
   return page_res;
 }
 
+// Return an upper bound on the length of the output text: 2, plus for every
+// word with a best choice its string length + 1 + its rejected-entry count.
+int TessBaseAPI::TextLength(PAGE_RES* page_res) {
+  int max_chars = 2;
+  PAGE_RES_IT it(page_res);
+  it.restart_page();
+  while (it.word() != NULL) {
+    WERD_RES* res = it.word();
+    if (res->best_choice != NULL) {
+      max_chars += res->best_choice->string().length() + 1;
+      // One extra slot for each entry of the reject map flagged rejected.
+      for (int pos = 0; pos < res->reject_map.length(); ++pos) {
+        if (res->reject_map[pos].rejected()) max_chars += 1;
+      }
+    }
+    it.forward();
+  }
+  return max_chars;
+}
+
 // Make a text string from the internal data structures.
 // The input page_res is deleted.
 char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
   if (page_res != NULL) {
-    int total_length = 2;
+    int total_length = TextLength(page_res);
     PAGE_RES_IT   page_res_it(page_res);
-    // Iterate over the data structures to extract the recognition result.
-    for (page_res_it.restart_page(); page_res_it.word () != NULL;
-         page_res_it.forward()) {
-      WERD_RES *word = page_res_it.word();
-      WERD_CHOICE* choice = word->best_choice;
-      if (choice != NULL) {
-        total_length += choice->string().length() + 1;
-      }
-    }
     char* result = new char[total_length];
     char* ptr = result;
     for (page_res_it.restart_page(); page_res_it.word () != NULL;
@@ -406,3 +471,207 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
   }
   return NULL;
 }
+
+// Appends one box-file line per blob of word ("text left bottom right top\n")
+// to word_str, offsetting coordinates by left/bottom. Returns the number of
+// bytes written, not counting any terminator. word_str is not bounds-checked.
+static int ConvertWordToBoxText(WERD_RES *word,
+                                ROW_RES* row,
+                                int left,
+                                int bottom,
+                                char* word_str) {
+  if (word->best_choice == NULL)
+    return 0;  // TextLength budgets no space for such words either.
+  // Copy the output word and denormalize it back to image coords.
+  WERD copy_outword;
+  copy_outword = *(word->outword);
+  copy_outword.baseline_denormalise(&word->denorm);
+  PBLOB_IT blob_it;
+  blob_it.set_to_list(copy_outword.blob_list());
+  const STRING& text = word->best_choice->string();
+  const STRING& lengths = word->best_choice->lengths();
+  int length = copy_outword.blob_list()->length();
+  if (length > lengths.length())  // Never index past the end of lengths.
+    length = lengths.length();
+  int output_size = 0;
+  for (int index = 0, offset = 0; index < length;
+       offset += lengths[index++], blob_it.forward()) {
+    BOX blob_box = blob_it.data()->bounding_box();
+    if (word->tess_failed ||
+        blob_box.left() < 0 ||
+        blob_box.right() > page_image.get_xsize() ||
+        blob_box.bottom() < 0 ||
+        blob_box.top() > page_image.get_ysize()) {
+      // Bounding boxes can be illegal when tess fails on a word.
+      blob_box = word->word->bounding_box();  // Use original word as backup.
+      tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
+              blob_box.left(), blob_box.bottom(),
+              blob_box.right(), blob_box.top());
+    }
+    // A single classification unit can be composed of several UTF-8
+    // characters. Append each of them to the result.
+    for (int sub = 0; sub < lengths[index]; ++sub) {
+      // Space marks recognition failure; emit '~' to keep the box file legal.
+      const char ch = text[offset + sub];
+      word_str[output_size++] = (ch == ' ') ? '~' : ch;
+    }
+    output_size += sprintf(word_str + output_size, " %d %d %d %d\n",
+                           blob_box.left() + left, blob_box.bottom() + bottom,
+                           blob_box.right() + left, blob_box.top() + bottom);
+  }
+  return output_size;
+}
+
+// Multiplier for textlength assumes 4 numbers @ 5 digits and a space
+// plus the newline and the original character = 4*(5+1)+2
+const int kMaxCharsPerChar = 26;
+
+// Build a box-file string, as needed for training, from page_res.
+// left and bottom are passed on to offset the box coordinates.
+// The input page_res is deleted; the result is allocated with new[].
+char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
+                                      int left, int bottom) {
+  if (page_res == NULL)
+    return NULL;
+  // Allow kMaxCharsPerChar output bytes for every byte of plain text.
+  const int buffer_size = TextLength(page_res) * kMaxCharsPerChar;
+  PAGE_RES_IT it(page_res);
+  char* box_text = new char[buffer_size];
+  char* out = box_text;
+  it.restart_page();
+  while (it.word() != NULL) {
+    out += ConvertWordToBoxText(it.word(), it.row(), left, bottom, out);
+    it.forward();
+  }
+  *out = '\0';
+  delete page_res;
+  return box_text;
+}
+
+// Reject character written to the UNLV output.
+const char kUnrecognized = '~';
+// Conversion table for non-latin characters: a code point found at
+// kUniChs[j] is replaced by kLatinChs[j]. The 0 entry ends the search.
+// TODO(rays) incorporate this translation into unicharset.
+const int kUniChs[] = {
+  0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
+};
+// Latin chars corresponding, index for index, to the unicode chars above.
+const int kLatinChs[] = {
+  0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
+};
+
+// Make a UNLV-format text string (Latin-1 with specific reject and suspect
+// codes) from the internal data structures. The input page_res is deleted.
+// Returns a string allocated with new[], or NULL if page_res is NULL.
+char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
+  bool tilde_crunch_written = false;
+  bool last_char_was_newline = true;
+  bool last_char_was_tilde = false;
+
+  if (page_res != NULL) {
+    int total_length = TextLength(page_res);
+    PAGE_RES_IT   page_res_it(page_res);
+    char* result = new char[total_length];
+    char* ptr = result;
+    for (page_res_it.restart_page(); page_res_it.word () != NULL;
+         page_res_it.forward()) {
+      WERD_RES *word = page_res_it.word();
+      // Process the current word.
+      if (word->unlv_crunch_mode != CR_NONE) {
+        if (word->unlv_crunch_mode != CR_DELETE &&
+            (!tilde_crunch_written ||
+             (word->unlv_crunch_mode == CR_KEEP_SPACE &&
+              word->word->space () > 0 &&
+              !word->word->flag (W_FUZZY_NON) &&
+              !word->word->flag (W_FUZZY_SP)))) {
+          if (!word->word->flag (W_BOL) &&
+              word->word->space () > 0 &&
+              !word->word->flag (W_FUZZY_NON) &&
+              !word->word->flag (W_FUZZY_SP)) {
+            /* Write a space to separate from preceding good text */
+            *ptr++ = ' ';
+            last_char_was_tilde = false;
+          }
+          if (!last_char_was_tilde) {
+            // Write a reject char.
+            last_char_was_tilde = true;
+            *ptr++ = kUnrecognized;
+            tilde_crunch_written = true;
+            last_char_was_newline = false;
+          }
+        }
+      } else {
+        // NORMAL PROCESSING of non tilde crunched words.
+        tilde_crunch_written = false;
+
+        if (last_char_was_tilde &&
+            word->word->space () == 0 &&
+            (word->best_choice->string ()[0] == ' ')) {
+          /* Prevent adjacent tilde across words - adjacent tildes within words
+             are already removed. Source and dest overlap: memmove not strcpy */
+          char* p = (char *) word->best_choice->string().string ();
+          memmove (p, p + 1, strlen (p));  // shuffle up, including the NUL
+          p = (char *) word->best_choice->lengths().string ();
+          memmove (p, p + 1, strlen (p));  // shuffle up, including the NUL
+          word->reject_map.remove_pos (0);
+          PBLOB_IT blob_it = word->outword->blob_list ();
+          delete blob_it.extract ();   //get rid of reject blob
+        }
+
+        if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
+          ensure_rep_chars_are_consistent(word);
+
+        set_unlv_suspects(word);
+        const char* wordstr = word->best_choice->string().string();
+        if (wordstr[0] != 0) {
+          if (!last_char_was_newline)
+            *ptr++ = ' ';
+          last_char_was_newline = false;
+          int offset = 0;
+          const STRING& lengths = word->best_choice->lengths();
+          int length = lengths.length();
+          for (int i = 0; i < length; offset += lengths[i++]) {
+            if (wordstr[offset] == ' ' ||
+                wordstr[offset] == '~' ||
+                wordstr[offset] == '|') {
+              *ptr++ = kUnrecognized;
+              last_char_was_tilde = true;
+            } else {
+              if (word->reject_map[i].rejected())
+                *ptr++ = '^';  // Marks the following char as rejected.
+              UNICHAR ch(wordstr + offset, lengths[i]);
+              int uni_ch = ch.first_uni();
+              for (int j = 0; kUniChs[j] != 0; ++j) {
+                if (kUniChs[j] == uni_ch) {
+                  uni_ch = kLatinChs[j];
+                  break;
+                }
+              }
+              if (uni_ch <= 0xff) {
+                *ptr++ = static_cast<char>(uni_ch);
+                last_char_was_tilde = false;
+              } else {
+                *ptr++ = kUnrecognized;
+                last_char_was_tilde = true;
+              }
+            }
+          }
+        }
+      }
+      if (word->word->flag(W_EOL) && !last_char_was_newline) {
+        /* Add a new line output */
+        *ptr++ = '\n';
+        tilde_crunch_written = false;
+        last_char_was_newline = true;
+        last_char_was_tilde = false;
+      }
+    }
+    *ptr++ = '\n';
+    *ptr = '\0';
+    delete page_res;
+    return result;
+  }
+  return NULL;
+}
+
diff --git a/ccmain/baseapi.h b/ccmain/baseapi.h
index cdb8b251b..d33f9dff0 100644
--- a/ccmain/baseapi.h
+++ b/ccmain/baseapi.h
@@ -20,8 +20,6 @@
 #ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
 #define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
 
-#include <string>
-
 class PAGE_RES;
 class BLOCK_LIST;
 
@@ -56,6 +54,10 @@ class TessBaseAPI {
                               const char* language, const char* configfile,
                               bool numeric_mode, int argc, char* argv[]);
 
+  // Set the name of the input file. Needed only for training and
+  // reading a UNLV zone file.
+  static void SetInputName(const char* name);
+
   // Recognize a rectangle from an image and return the result as a string.
   // May be called many times for a single Init.
   // Currently has no error checking.
@@ -71,6 +73,19 @@ class TessBaseAPI {
                              int bytes_per_pixel,
                              int bytes_per_line,
                              int left, int top, int width, int height);
+  // As TesseractRect but produces a box file as output.
+  // Image height is needed as well as rect height, since output y-coords
+  // will be relative to the bottom of the image.
+  static char* TesseractRectBoxes(const unsigned char* imagedata,
+                                  int bytes_per_pixel,
+                                  int bytes_per_line,
+                                  int left, int top, int width, int height,
+                                  int imageheight);
+  // As TesseractRect but produces UNLV-style output.
+  static char* TesseractRectUNLV(const unsigned char* imagedata,
+                                 int bytes_per_pixel,
+                                 int bytes_per_line,
+                                 int left, int top, int width, int height);
 
   // Call between pages or documents etc to free up memory and forget
   // adaptive data.
@@ -153,8 +168,18 @@ class TessBaseAPI {
   static PAGE_RES* Recognize(BLOCK_LIST* block_list,
                              struct ETEXT_STRUCT* monitor);
 
+  // Return the maximum length that the output text string might occupy.
+  static int TextLength(PAGE_RES* page_res);
   // Convert (and free) the internal data structures into a text string.
   static char* TesseractToText(PAGE_RES* page_res);
+  // Make a text string from the internal data structures.
+  // The input page_res is deleted.
+  // The text string takes the form of a box file as needed for training.
+  static char* TesseractToBoxText(PAGE_RES* page_res, int left, int bottom);
+  // Make a text string from the internal data structures.
+  // The input page_res is deleted. The text string is converted
+  // to UNLV-format: Latin-1 with specific reject and suspect codes.
+  static char* TesseractToUNLV(PAGE_RES* page_res);
 };
 
 #endif  // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
diff --git a/ccmain/output.cpp b/ccmain/output.cpp
index ed2f8323f..3a703a0de 100644
--- a/ccmain/output.cpp
+++ b/ccmain/output.cpp
@@ -35,6 +35,7 @@
 #include          "docqual.h"
 #include          "output.h"
 #include "bestfirst.h"
+#include "globals.h"
 
 #define EXTERN
 
@@ -55,12 +56,12 @@ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
 "Write block separators in output");
 EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
 "Write raw stuff to name.raw");
-EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
+EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
 EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
 "Return ratings in IPEOCRAPI data");
-EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
+EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
 "Write .txt to .etx map file");
-EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
+EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
 "Write repetition char code");
 EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
 EXTERN STRING_EVAR (unrecognised_char, "|",
@@ -106,7 +107,6 @@ INT32 pixels_to_pts(               //convert coords
   return (INT32) (pts + 0.5);    //round it
 }
 
-
 void output_pass(  //Tess output pass //send to api
                  PAGE_RES_IT &page_res_it,
                  BOOL8 write_to_shm,
@@ -119,8 +119,7 @@ void output_pass(  //Tess output pass //send to api
 
   if (tessedit_write_txt_map)
     txt_mapfile = open_outfile (".map");
-  if (tessedit_write_unlv)
-    unlv_file = open_outfile (".unlv");
+
   page_res_it.restart_page ();
   block_of_last_word = NULL;
   while (page_res_it.word () != NULL) {
@@ -189,7 +188,6 @@ void output_pass(  //Tess output pass //send to api
   }
 }
 
-
 /*************************************************************************
  * write_results()
  *
@@ -211,9 +209,10 @@ void write_results(                           //output a word
                   ) {
                                  //word to do
   WERD_RES *word = page_res_it.word ();
-  WERD_CHOICE *ep_choice;        //ep format
+//   WERD_CHOICE *ep_choice;        //ep format
   STRING repetition_code;
   const STRING *wordstr;
+  STRING wordstr_lengths;
   const char *text;
   int i;
   char unrecognised = STRING (unrecognised_char)[0];
@@ -312,15 +311,12 @@ void write_results(                           //output a word
     if (tessedit_write_output && !NO_BLOCK)
       fprintf (textfile, "%s", txt_chs);
 
-    if (tessedit_write_unlv)
-      fprintf (unlv_file, "%s", txt_chs);
-
     if (tessedit_write_txt_map)
       fprintf (txt_mapfile, "%s", map_chs);
 
                                  //terminate string
     ep_chars[ep_chars_index] = '\0';
-    word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
+    word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
 
     if (force_eol)
       empty_block = TRUE;
@@ -345,6 +341,8 @@ void write_results(                           //output a word
        words have been removed */
     ptr = (char *) word->best_choice->string ().string ();
     strcpy (ptr, ptr + 1);       //shuffle up
+    ptr = (char *) word->best_choice->lengths ().string ();
+    strcpy (ptr, ptr + 1);       //shuffle up
     word->reject_map.remove_pos (0);
     blob_it = word->outword->blob_list ();
     delete blob_it.extract ();   //get rid of reject blob
@@ -354,8 +352,10 @@ void write_results(                           //output a word
     last_char_was_tilde = FALSE;
   else {
     if (word->reject_map.length () > 0) {
-      if (word->best_choice->string ()[word->reject_map.length () - 1] ==
-        ' ')
+      for (i = 0, ptr = (char *) word->best_choice->string().string();
+           i < word->reject_map.length () - 1; ++i)
+        ptr += word->best_choice->lengths()[i];
+      if (*ptr == ' ')
         last_char_was_tilde = TRUE;
       else
         last_char_was_tilde = FALSE;
@@ -365,7 +365,7 @@ void write_results(                           //output a word
     /* else it is unchanged as there are no output chars */
   }
 
-  ptr = (char *) word->best_choice->string ().string ();
+  ptr = (char *) word->best_choice->lengths ().string ();
   ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
 
   if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
@@ -379,21 +379,26 @@ void write_results(                           //output a word
       dict_word (word->best_choice->string ().string ()));
   }
 
+#if 0
   if (tessedit_write_unlv) {
     write_unlv_text(word);
   }
+#endif
 
   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
     repetition_code = "|^~R";
-    repetition_code += get_rep_char (word);
+    wordstr_lengths = "\001\001\001\001";
+    repetition_code += unicharset.id_to_unichar(get_rep_char (word));
+    wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
     wordstr = &repetition_code;
   }
   else {
     wordstr = &(word->best_choice->string ());
+    wordstr_lengths = word->best_choice->lengths ();
     if (tessedit_zero_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       text = wordstr->string ();
-      for (i = 0; text[i] != '\0'; i++) {
+      for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
         if (word->reject_map[i].rejected ())
           word->reject_map[i].setrej_minimal_rej_accept ();
       }
@@ -401,8 +406,8 @@ void write_results(                           //output a word
     if (tessedit_minimal_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       text = wordstr->string ();
-      for (i = 0; text[i] != '\0'; i++) {
-        if ((text[i] != ' ') && word->reject_map[i].rejected ())
+      for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
+        if ((*text != ' ') && word->reject_map[i].rejected ())
           word->reject_map[i].setrej_minimal_rej_accept ();
       }
     }
@@ -410,8 +415,9 @@ void write_results(                           //output a word
 
   if (write_to_shm)
     write_shm_text (word, page_res_it.block ()->block,
-      page_res_it.row (), *wordstr);
+      page_res_it.row (), *wordstr, wordstr_lengths);
 
+#if 0
   if (tessedit_write_output)
     write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
 
@@ -424,12 +430,12 @@ void write_results(                           //output a word
 
   ep_choice = make_epaper_choice (word, newline_type);
   word->ep_choice = ep_choice;
+#endif
 
-  character_count += word->best_choice->string ().length ();
+  character_count += word->best_choice->lengths ().length ();
   word_count++;
 }
 
-
 /**********************************************************************
  * make_epaper_choice
  *
@@ -437,6 +443,7 @@ void write_results(                           //output a word
  * determine whether each blob should be rejected.
  **********************************************************************/
 
+#if 0
 WERD_CHOICE *make_epaper_choice(                   //convert one word
                                 WERD_RES *word,    //word to do
                                 char newline_type  //type of newline
@@ -482,7 +489,8 @@ WERD_CHOICE *make_epaper_choice(                   //convert one word
   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
     strcpy (word_string + index, "|^~R");
     index += 4;
-    word_string[index++] = get_rep_char (word);
+    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
+    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
   }
   else {
     if (!blob_it.empty ())
@@ -537,7 +545,7 @@ WERD_CHOICE *make_epaper_choice(                   //convert one word
   ASSERT_HOST (strlen (word_string) == index);
   return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
 }
-
+#endif
 
 /**********************************************************************
  * make_reject
@@ -653,6 +661,7 @@ char determine_newline_type(                   //test line ends
  * to the given file.
  **********************************************************************/
 
+#if 0
 void write_cooked_text(                     //write output
                        WERD *word,          //word to do
                        const STRING &text,  //text to write
@@ -749,6 +758,7 @@ void write_cooked_text(                     //write output
   if (status != 0)
     WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif
 
 
 /**********************************************************************
@@ -761,7 +771,8 @@ void write_shm_text(                    //write output
                     WERD_RES *word,     //word to do
                     BLOCK *block,       //block it is from
                     ROW_RES *row,       //row it is from
-                    const STRING &text  //text to write
+                    const STRING &text, //text to write
+                    const STRING &text_lengths
                    ) {
   INT32 index;                   //char counter
   INT32 index2;                  //char counter
@@ -777,6 +788,8 @@ void write_shm_text(                    //write output
   WERD copy_outword;             // copy to denorm
   UINT32 rating;                 //of char
   BOOL8 lineend;                 //end of line
+  int offset;
+  int offset2;
 
                                  //point size
   ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
@@ -786,13 +799,14 @@ void write_shm_text(                    //write output
   copy_outword = *(word->outword);
   copy_outword.baseline_denormalise (&word->denorm);
   blob_it.set_to_list (copy_outword.blob_list ());
-  length = text.length ();
+  length = text_lengths.length ();
 
   if (length > 0) {
     blanks = word->word->space ();
     if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
       blanks = 1;
-    for (index = 0; index < length; index++, blob_it.forward ()) {
+    for (index = 0, offset = 0; index < length;
+         offset += text_lengths[index++], blob_it.forward ()) {
       blob = blob_it.data ();
       blob_box = blob->bounding_box ();
 
@@ -804,7 +818,7 @@ void write_shm_text(                    //write output
       if (tessedit_write_ratings)
         rating = (UINT32) (-word->best_choice->certainty () / 0.035);
       else if (tessedit_zero_rejection)
-        rating = text[index] == ' ' ? 100 : 0;
+        rating = text[offset] == ' ' ? 100 : 0;
       else
         rating = word->reject_map[index].accepted ()? 0 : 100;
       if (rating > 255)
@@ -819,22 +833,41 @@ void write_shm_text(                    //write output
 
       lineend = word->word->flag (W_EOL) && index == length - 1;
       if (word->word->flag (W_EOL) && tessedit_zero_rejection
-      && index < length - 1 && text[index + 1] == ' ') {
-        for (index2 = index + 1; index2 < length && text[index2] == ' ';
-          index2++);
+      && index < length - 1 && text[offset + text_lengths[index]] == ' ') {
+        for (index2 = index + 1, offset2 = offset + text_lengths[index];
+             index2 < length && text[offset2] == ' ';
+             offset2 += text_lengths[index2++]);  //scan the spaces that follow
         if (index2 == length)
           lineend = TRUE;
       }
 
-      if (!tessedit_zero_rejection || text[index] != ' '
+      if (!tessedit_zero_rejection || text[offset] != ' '
       || tessedit_word_for_word) {
                                  //confidence
-        ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
-          ptsize,                //point size
-          blanks, enhancement,   //enhancement
-          OCR_CDIR_LEFT_RIGHT,
-          OCR_LDIR_DOWN_RIGHT,
-          lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        if (text[offset] == ' ') {
+        ocr_append_char (unrecognised,
+                         blob_box.left (), blob_box.right (),
+                         page_image.get_ysize () - 1 - blob_box.top (),
+                         page_image.get_ysize () - 1 - blob_box.bottom (),
+                         font, (UINT8) rating,
+                         ptsize,                //point size
+                         blanks, enhancement,   //enhancement
+                         OCR_CDIR_LEFT_RIGHT,
+                         OCR_LDIR_DOWN_RIGHT,
+                         lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        } else {
+          for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
+            ocr_append_char (text[offset + suboffset],
+                             blob_box.left (), blob_box.right (),
+                             page_image.get_ysize () - 1 - blob_box.top (),
+                             page_image.get_ysize () - 1 - blob_box.bottom (),
+                             font, (UINT8) rating,
+                             ptsize,                //point size
+                             blanks, enhancement,   //enhancement
+                             OCR_CDIR_LEFT_RIGHT,
+                             OCR_LDIR_DOWN_RIGHT,
+                             lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        }
         blanks = 0;
       }
 
@@ -863,13 +896,17 @@ void write_shm_text(                    //write output
     lineend = word->word->flag (W_EOL);
 
                                  //font index
-    ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
-      rating,                    //confidence
-      ptsize,                    //point size
-      blanks, enhancement,       //enhancement
-      OCR_CDIR_LEFT_RIGHT,
-      OCR_LDIR_DOWN_RIGHT,
-      lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+    ocr_append_char (unrecognised,
+                     blob_box.left (), blob_box.right (),
+                     page_image.get_ysize () - 1 - blob_box.top (),
+                     page_image.get_ysize () - 1 - blob_box.bottom (),
+                     font,
+                     rating,                    //confidence
+                     ptsize,                    //point size
+                     blanks, enhancement,       //enhancement
+                     OCR_CDIR_LEFT_RIGHT,
+                     OCR_LDIR_DOWN_RIGHT,
+                     lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
   }
 }
 
@@ -888,6 +925,7 @@ void write_shm_text(                    //write output
  * newdiff needs etx files!
  **********************************************************************/
 
+#if 0
 void write_map(                //output a map file
                FILE *mapfile,  //mapfile to write to
                WERD_RES *word) {
@@ -937,6 +975,7 @@ void write_map(                //output a map file
   if (status != 0)
     WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
 }
+#endif
 
 
 /*************************************************************************
@@ -957,6 +996,7 @@ FILE *open_outfile(  //open .map & .unlv file
 }
 
 
+#if 0
 void write_unlv_text(WERD_RES *word) {
   const char *wordstr;
 
@@ -1015,6 +1055,7 @@ void write_unlv_text(WERD_RES *word) {
   if (status != 0)
     WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif
 
 
 /*************************************************************************
@@ -1022,21 +1063,24 @@ void write_unlv_text(WERD_RES *word) {
  * Return the first accepted character from the repetition string. This is the
  * character which is repeated - as determined earlier by fix_rep_char()
  *************************************************************************/
-char get_rep_char(  // what char is repeated?
-                  WERD_RES *word) {
+UNICHAR_ID get_rep_char(WERD_RES *word) {  // what char is repeated?
   int i;
+  int offset;  // byte offset into best_choice->string() of unit i
 
-  for (i = 0;
+  for (i = 0, offset = 0;
     ((i < word->reject_map.length ()) &&
-    (word->reject_map[i].rejected ())); i++);
+    (word->reject_map[i].rejected ()));
+       offset += word->best_choice->lengths()[i++]);  // step over rejects
   if (i < word->reject_map.length ())
-    return word->best_choice->string ()[i];
+    return unicharset.unichar_to_id(word->best_choice->string().string()
+                                    + offset,
+                                    word->best_choice->lengths()[i]);
   else
-    return STRING (unrecognised_char)[0];
+    return unicharset.unichar_to_id(unrecognised_char.string());
 }
 
-
 void ensure_rep_chars_are_consistent(WERD_RES *word) {
+#if 0
   char rep_char = get_rep_char (word);
   char *ptr;
 
@@ -1045,8 +1089,24 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
     if (*ptr != rep_char)
       *ptr = rep_char;
   }
-}
+#endif
 
+#if 0
+  UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
+  int i;
+  char *ptr;
+  STRING consistent_string;
+  STRING consistent_string_lengths;
+
+  ptr = (char *) word->best_choice->string ().string ();
+  for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
+    consistent_string += unicharset.id_to_unichar(rep_char);
+    consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
+  }
+  word->best_choice->string() = consistent_string;
+  word->best_choice->lengths() = consistent_string_lengths;
+#endif
+}
 
 /*************************************************************************
  * SUSPECT LEVELS
@@ -1062,7 +1122,9 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
 void set_unlv_suspects(WERD_RES *word) {
   int len = word->reject_map.length ();
   int i;
+  int offset;
   const char *ptr;
+  const char *lengths = word->best_choice->lengths ().string ();
   float rating_per_ch;
 
   ptr = word->best_choice->string ().string ();
@@ -1080,10 +1142,12 @@ void set_unlv_suspects(WERD_RES *word) {
 
   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
 
-  if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
+  if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
+                               suspect_short_words)) {
     /* Unreject alphas in dictionary words */
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && isalpha (ptr[i]))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () &&
+          unicharset.get_isalpha (ptr + offset, lengths[i]))
         word->reject_map[i].setrej_minimal_rej_accept ();
     }
   }
@@ -1095,8 +1159,8 @@ void set_unlv_suspects(WERD_RES *word) {
 
   if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && (ptr[i] != ' '))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
         word->reject_map[i].setrej_minimal_rej_accept ();
     }
   }
@@ -1130,9 +1194,11 @@ void set_unlv_suspects(WERD_RES *word) {
     }
   }
 
-  if ((acceptable_word_string (word->best_choice->string ().string ())
+  if ((acceptable_word_string (word->best_choice->string ().string (),
+                               word->best_choice->lengths ().string ())
     != AC_UNACCEPTABLE) ||
-  acceptable_number_string (word->best_choice->string ().string ())) {
+  acceptable_number_string (word->best_choice->string ().string (),
+                            word->best_choice->lengths ().string ())) {
     if (word->reject_map.length () > suspect_short_words) {
       for (i = 0; i < len; i++) {
         if (word->reject_map[i].rejected () &&
@@ -1149,11 +1215,12 @@ void set_unlv_suspects(WERD_RES *word) {
 
 
 INT16 count_alphas(  //how many alphas
-                   const char *s) {
+                   const char *s,
+                   const char *lengths) {
   int count = 0;
 
-  for (; *s != '\0'; s++) {
-    if (isalpha (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths))
       count++;
   }
   return count;
@@ -1161,36 +1228,43 @@ INT16 count_alphas(  //how many alphas
 
 
 INT16 count_alphanums(  //how many alphanums
-                      const char *s) {
+                      const char *s,          //concatenated units
+                      const char *lengths) {  //byte length of each unit in s
   int count = 0;
 
-  for (; *s != '\0'; s++) {
-    if (isalnum (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths) ||
+        unicharset.get_isdigit(s, *lengths))
       count++;
   }
   return count;
 }
 
 
-BOOL8 acceptable_number_string(const char *s) {
+BOOL8 acceptable_number_string(const char *s,
+                               const char *lengths) {
   BOOL8 prev_digit = FALSE;
 
-  if (*s == '(')
+  if (*lengths == 1 && *s == '(')
     s++;
 
-  if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
+  if (*lengths == 1 &&
+      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
     s++;
 
-  for (; *s != '\0'; s++) {
-    if (isdigit (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isdigit (s, *lengths))
       prev_digit = TRUE;
-    else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
-      prev_digit = FALSE;
     else if (prev_digit &&
-      (*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
+             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
+      prev_digit = FALSE;
+    else if (prev_digit && *lengths == 1 &&
+             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
       return TRUE;
     else if (prev_digit &&
-      (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
+             *lengths == 1 && (*s == '%') &&
+             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
+             (*(s + *lengths + *(lengths + 1)) == '\0'))
       return TRUE;
     else
       return FALSE;
diff --git a/ccmain/tesseractmain.cpp b/ccmain/tesseractmain.cpp
index 865f2df7e..8c051ff67 100644
--- a/ccmain/tesseractmain.cpp
+++ b/ccmain/tesseractmain.cpp
@@ -31,7 +31,9 @@
 #include "stderr.h"
 #include "notdll.h"
 #include "mainblk.h"
+#include "output.h"
 #include "globals.h"
+#include "blread.h"
 #include "tfacep.h"
 #include "callnet.h"
 
@@ -40,7 +42,10 @@
 #define API_CONFIG      "configs/api_config"
 #define EXTERN
 
+EXTERN BOOL_VAR (tessedit_create_boxfile, FALSE, "Output text with boxes");
 EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read");
+EXTERN INT_VAR (tessedit_serial_unlv, 0,
+                "0->Whole page, 1->serial no adapt, 2->serial with adapt");
 EXTERN BOOL_VAR (tessedit_write_images, FALSE,
 "Capture the image from the IPE");
 EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file");
@@ -63,15 +68,30 @@ int main(int argc, char **argv) {
 
   if (argc < 3) {
     USAGE.error (argv[0], EXIT,
-      "%s imagename outputbase [configfile [[+|-]varfile]...]\n", argv[0]);
+      "%s imagename outputbase [-l lang] [configfile [[+|-]varfile]...]\n",
+      argv[0]);
+  }
+  // Find the required language.
+  const char* lang = "eng";
+  int arg = 3;
+  if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
+    lang = argv[4];
+    arg = 5;
+  }
+  // Find the basename of the input file.
+  STRING infile(argv[1]);
+  const char* lastdot = strrchr(argv[1], '.');
+  if (lastdot != NULL) {
+    infile[lastdot - argv[1]] = '\0';
   }
 
-  if (argc == 3)
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  NULL, false, 0, argv + 2);
+  if (argc == arg)
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  NULL, false, 0, argv + arg);
   else
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  argv[3], false, argc - 4, argv + 4);
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  argv[arg], false,
+                                  argc - arg - 1, argv + arg + 1);
 
   tprintf ("Tesseract Open Source OCR Engine\n");
 
@@ -92,20 +112,70 @@ int main(int argc, char **argv) {
       argv[1]);
   }
 #endif
+  STRING text_out;
   int bytes_per_line = check_legal_image_size(image.get_xsize(),
                                               image.get_ysize(),
                                               image.get_bpp());
-  char* text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
-                                          bytes_per_line, 0, 0,
-                                          image.get_xsize(), image.get_ysize());
+  if (tessedit_serial_unlv == 0) {
+    TessBaseAPI::SetInputName(argv[1]);
+    char* text;
+    if (tessedit_create_boxfile)
+      text = TessBaseAPI::TesseractRectBoxes(image.get_buffer(),
+                                             image.get_bpp()/8,
+                                             bytes_per_line, 0, 0,
+                                             image.get_xsize(),
+                                             image.get_ysize(),
+                                             image.get_ysize());
+    else if (tessedit_write_unlv)
+      text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                            image.get_bpp()/8,
+                                            bytes_per_line, 0, 0,
+                                            image.get_xsize(),
+                                            image.get_ysize());
+    else
+      text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
+                                        bytes_per_line, 0, 0,
+                                        image.get_xsize(), image.get_ysize());
+    text_out = text;
+    delete [] text;
+  } else {
+    BLOCK_LIST blocks;
+    STRING filename = argv[1];
+    int len = filename.length();
+    if (len > 4 && filename[len - 4] == '.') {
+      filename[len - 4] = '\0';
+    }
+    if (!read_unlv_file(filename, image.get_xsize(), image.get_ysize(),
+                        &blocks)) {
+      fprintf(stderr, "Error: Must have a unlv zone file %s to read!\n",
+              filename.string());
+      return 1;
+    }
+    BLOCK_IT b_it = &blocks;
+    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+      BLOCK* block = b_it.data();
+      BOX box = block->bounding_box();
+      char* text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                                  image.get_bpp()/8,
+                                                  bytes_per_line,
+                                                  box.left(),
+                                                  image.get_ysize() - box.top(),
+                                                  box.width(),
+                                                  box.height());
+      text_out += text;
+      delete [] text;
+      if (tessedit_serial_unlv == 1)
+        TessBaseAPI::ClearAdaptiveClassifier();
+    }
+  }
+
   outfile = argv[2];
   outfile += ".txt";
   FILE* fp = fopen(outfile.string(), "w");
   if (fp != NULL) {
-    fwrite(text, 1, strlen(text), fp);
+    fwrite(text_out.string(), 1, text_out.length(), fp);
     fclose(fp);
   }
-  delete [] text;
   TessBaseAPI::End();
 
   return 0;                      //Normal exit
diff --git a/ccstruct/blread.cpp b/ccstruct/blread.cpp
index 93a3412a1..915490cf4 100644
--- a/ccstruct/blread.cpp
+++ b/ccstruct/blread.cpp
@@ -527,7 +527,9 @@ BOOL8 read_unlv_file(                    //print list of sides
   else {
     while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
                                  //make rect block
-      block = new BLOCK (name.string (), TRUE, 0, 0, (INT16) x, (INT16) (ysize - 1 - y - height), (INT16) (x + width), (INT16) (ysize - 1 - y));
+      block = new BLOCK (name.string (), TRUE, 0, 0,
+                         (INT16) x, (INT16) (ysize - y - height),
+                         (INT16) (x + width), (INT16) (ysize - y));
                                  //on end of list
       block_it.add_to_end (block);
     }
diff --git a/cutil/tordvars.cpp b/cutil/tordvars.cpp
index 7ef1bbc95..204f765f8 100644
--- a/cutil/tordvars.cpp
+++ b/cutil/tordvars.cpp
@@ -63,7 +63,7 @@ make_toggle_var (debug_8, 0, make_debug_8, 6, 8, toggle_debug_8, "Debug #8");
 make_toggle_var (display_ratings, 0, make_display_ratings,
 6, 9, toggle_ratings, "Ratings display");
 
-make_toggle_var (display_text, 1, make_display_text,
+make_toggle_var (display_text, 0, make_display_text,
 6, 10, toggle_text, "Display Text");
 
 make_toggle_var (show_bold, 1, make_show_bold,
diff --git a/tessdata/configs/makebox b/tessdata/configs/makebox
new file mode 100644
index 000000000..3d90ac26f
--- /dev/null
+++ b/tessdata/configs/makebox
@@ -0,0 +1 @@
+tessedit_create_boxfile 1
diff --git a/tessdata/configs/unlv b/tessdata/configs/unlv
new file mode 100644
index 000000000..537ad77a1
--- /dev/null
+++ b/tessdata/configs/unlv
@@ -0,0 +1,3 @@
+tessedit_write_unlv 1
+tessedit_write_output 0
+tessedit_write_txt_map 0
diff --git a/tessdata/tessconfigs/batch b/tessdata/tessconfigs/batch
index f0c729c18..619b64675 100644
--- a/tessdata/tessconfigs/batch
+++ b/tessdata/tessconfigs/batch
@@ -1,78 +1,2 @@
-#################################################
-# Adaptive Matcher Using PreAdapted Templates
-#################################################
-
-acts_fx                 0x800
-acts_ocr                0x20
-
-RatingScale             30.0
-CertaintyScale          20.0
-
-#EnableMatcher				0
-#CurrentFx					2
-MinSlope                 0.414213562
-MaxSlope                 2.414213562
-#ExtremityMode            1
-NormMethod               1
-EnableAdaptiveMatcher	1
-
-NormAdjMidpoint			32.0
-NormAdjCurl					2.0
-
-MinNormScaleX				0.0
-MaxNormScaleX				0.325
-MinNormScaleY				0.0
-MaxNormScaleY				0.325
-
-BuiltInTemplatesFile		tessdata/inttemp
-BuiltInCutoffsFile		tessdata/pffmtable
-
-EnableLearning				0
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-ReliableConfigThreshold	2
-MinNumPermClasses			3
-
-#EnableStopper				1
-GoodAdaptiveMatch			0.125
-GreatAdaptiveMatch		0.0
-
-EnableIntFX					1
-EnableNewAdaptRules		1
-################################################################################
-#
-# File:         marks/configs/knobs
-# Description:  Control variables for 'marks' code
-# Author:       Mark Seaman, OCR Technology
-# Created:      Wed Feb 27 11:27:27 1991
-# Modified:     Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language:     Text
-# Package:      N/A
-# Status:       Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges            1
-
-save_doc_words          1
-doc_dict_enable         1
-ClassPrunerThreshold			229
-ClassPrunerMultiplier		15
-IntThetaFudge					128
-CPCutoffStrength				0.15
-EvidenceTableBits				9
-IntEvidenceTruncBits			14
-SEExponentialMultiplier		0
-SimilarityCenter				0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-EnableLearning				1
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-
-#save_errors             0
+# No content needed as all defaults are correct.
 
diff --git a/tessdata/tessconfigs/batch.nochop b/tessdata/tessconfigs/batch.nochop
new file mode 100644
index 000000000..93ae70046
--- /dev/null
+++ b/tessdata/tessconfigs/batch.nochop
@@ -0,0 +1,2 @@
+chop_enable 0
+enable_assoc 0
diff --git a/tessdata/tessconfigs/matdemo b/tessdata/tessconfigs/matdemo
index c1440a9e4..f3ad41d34 100755
--- a/tessdata/tessconfigs/matdemo
+++ b/tessdata/tessconfigs/matdemo
@@ -2,80 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################
 
-acts_fx                 0x800
-acts_ocr                0x20
-
-RatingScale             30.0
-CertaintyScale          20.0
-
-#EnableMatcher				0
-#CurrentFx					2
-EnableAdaptiveMatcher	1
-
-NormAdjMidpoint			32.0
-NormAdjCurl					2.0
-
-MinNormScaleX				0.0
-MaxNormScaleX				0.325
-MinNormScaleY				0.0
-MaxNormScaleY				0.325
-
-BuiltInTemplatesFile		tessdata/inttemp
-BuiltInCutoffsFile		tessdata/pffmtable
-
-EnableLearning				0
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-ReliableConfigThreshold	2
-MinNumPermClasses			3
-
-#EnableStopper				1
-GoodAdaptiveMatch			0.125
-GreatAdaptiveMatch		0.0
-
-EnableIntFX					1
-EnableNewAdaptRules		1
 EnableAdaptiveDebugger   1
 MatchDebugFlags         6
 MatcherDebugLevel       1
-################################################################################
-#
-# File:         marks/configs/knobs
-# Description:  Control variables for 'marks' code
-# Author:       Mark Seaman, OCR Technology
-# Created:      Wed Feb 27 11:27:27 1991
-# Modified:     Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language:     Text
-# Package:      N/A
-# Status:       Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges            1
-
-save_doc_words          1
-doc_dict_enable         1
-ClassPrunerThreshold			229
-ClassPrunerMultiplier		15
-IntThetaFudge					128
-CPCutoffStrength				0.15
-EvidenceTableBits				9
-IntEvidenceTruncBits			14
-SEExponentialMultiplier		0
-SimilarityCenter				0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-display_splits          0
-display_all_words       0
-display_all_blobs       0
-display_segmentations   0
-EnableLearning				1
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-
-#save_errors             0
-
diff --git a/tessdata/tessconfigs/msdemo b/tessdata/tessconfigs/msdemo
new file mode 100644
index 000000000..9f312feac
--- /dev/null
+++ b/tessdata/tessconfigs/msdemo
@@ -0,0 +1,13 @@
+#################################################
+# Adaptive Matcher Using PreAdapted Templates
+#################################################
+
+EnableAdaptiveDebugger   1
+MatchDebugFlags         6
+MatcherDebugLevel       1
+
+display_splits          0
+display_all_words       1
+display_all_blobs       1
+display_segmentations   2
+display_ratings			1
diff --git a/tessdata/tessconfigs/nobatch b/tessdata/tessconfigs/nobatch
new file mode 100644
index 000000000..b042c2701
--- /dev/null
+++ b/tessdata/tessconfigs/nobatch
@@ -0,0 +1,2 @@
+display_text 0
+
diff --git a/tessdata/tessconfigs/segdemo b/tessdata/tessconfigs/segdemo
index 244386ebd..d1487bb3e 100755
--- a/tessdata/tessconfigs/segdemo
+++ b/tessdata/tessconfigs/segdemo
@@ -2,70 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################
 
-acts_fx                 0x800
-acts_ocr                0x20
-
-RatingScale             30.0
-CertaintyScale          20.0
-
-#EnableMatcher				0
-#CurrentFx					2
-EnableAdaptiveMatcher	1
-
-NormAdjMidpoint			32.0
-NormAdjCurl					2.0
-
-MinNormScaleX				0.0
-MaxNormScaleX				0.325
-MinNormScaleY				0.0
-MaxNormScaleY				0.325
-
-BuiltInTemplatesFile		tessdata/inttemp
-BuiltInCutoffsFile		tessdata/pffmtable
-
-EnableLearning				0
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-ReliableConfigThreshold	2
-MinNumPermClasses			3
-
-#EnableStopper				1
-GoodAdaptiveMatch			0.125
-GreatAdaptiveMatch		0.0
-
-EnableIntFX					1
-EnableNewAdaptRules		1
-################################################################################
-#
-# File:         marks/configs/knobs
-# Description:  Control variables for 'marks' code
-# Author:       Mark Seaman, OCR Technology
-# Created:      Wed Feb 27 11:27:27 1991
-# Modified:     Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language:     Text
-# Package:      N/A
-# Status:       Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges            1
-
-save_doc_words          1
-doc_dict_enable         1
-ClassPrunerThreshold			229
-ClassPrunerMultiplier		15
-IntThetaFudge					128
-CPCutoffStrength				0.15
-EvidenceTableBits				9
-IntEvidenceTruncBits			14
-SEExponentialMultiplier		0
-SimilarityCenter				0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
 display_splits          0
 display_all_words       1
 display_all_blobs       1
diff --git a/testing/Makefile b/testing/Makefile
new file mode 100644
index 000000000..b1e1132fc
--- /dev/null
+++ b/testing/Makefile
@@ -0,0 +1,185 @@
+# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+
+SHELL = /bin/sh
+
+srcdir = .
+top_srcdir = ..
+
+prefix = /usr/local
+exec_prefix = ${prefix}
+
+bindir = ${exec_prefix}/bin
+sbindir = ${exec_prefix}/sbin
+libexecdir = ${exec_prefix}/libexec
+datadir = ${prefix}/share
+sysconfdir = ${prefix}/etc
+sharedstatedir = ${prefix}/com
+localstatedir = ${prefix}/var
+libdir = ${exec_prefix}/lib
+infodir = ${prefix}/info
+mandir = ${prefix}/man
+includedir = ${prefix}/include/tesseract
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/
+pkglibdir = $(libdir)/
+pkgincludedir = $(includedir)/
+
+top_builddir = ..
+
+ACLOCAL = aclocal-1.4
+AUTOCONF = autoconf
+AUTOMAKE = automake-1.4
+AUTOHEADER = autoheader
+
+INSTALL = /usr/bin/install -c
+INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS)
+INSTALL_DATA = ${INSTALL} -m 644
+INSTALL_SCRIPT = ${INSTALL}
+transform = s,x,x,
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+host_alias = 
+host_triplet = x86_64-unknown-linux-gnu
+CC = gcc
+CXX = g++
+HAVE_LIB = @HAVE_LIB@
+LIB = @LIB@
+LTLIB = @LTLIB@
+MAINT = #
+MAKEINFO = /home/rays/src/opensrc/tesseract-ocr/config/missing makeinfo
+PACKAGE = 
+PACKAGE_DATE = 07/2007
+PACKAGE_NAME = tesseract
+PACKAGE_VERSION = 2.00
+PACKAGE_YEAR = 2007
+RANLIB = ranlib
+VERSION = 
+
+EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
+mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
+CONFIG_HEADER = ../config_auto.h
+CONFIG_CLEAN_FILES = 
+DIST_COMMON =  README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
+
+TAR = tar
+GZIP_ENV = --best
+all: all-redirect
+.SUFFIXES:
+$(srcdir)/Makefile.in: # Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4) 
+	cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
+
+Makefile: $(srcdir)/Makefile.in  $(top_builddir)/config.status $(BUILT_SOURCES)
+	cd $(top_builddir) \
+	  && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+tags: TAGS
+TAGS:
+
+
+distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
+
+subdir = testing
+
+distdir: $(DISTFILES)
+	here=`cd $(top_builddir) && pwd`; \
+	top_distdir=`cd $(top_distdir) && pwd`; \
+	distdir=`cd $(distdir) && pwd`; \
+	cd $(top_srcdir) \
+	  && $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
+	$(mkinstalldirs) $(distdir)/reports
+	@for file in $(DISTFILES); do \
+	  d=$(srcdir); \
+	  if test -d $$d/$$file; then \
+	    cp -pr $$d/$$file $(distdir)/$$file; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || ln $$d/$$file $(distdir)/$$file 2> /dev/null \
+	    || cp -p $$d/$$file $(distdir)/$$file || :; \
+	  fi; \
+	done
+info-am:
+info: info-am
+dvi-am:
+dvi: dvi-am
+check-am: all-am
+check: check-am
+installcheck-am:
+installcheck: installcheck-am
+install-exec-am:
+install-exec: install-exec-am
+
+install-data-am:
+install-data: install-data-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-am
+uninstall-am:
+uninstall: uninstall-am
+all-am: Makefile
+all-redirect: all-am
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
+installdirs:
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-rm -f Makefile $(CONFIG_CLEAN_FILES)
+	-rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+mostlyclean-am:  mostlyclean-generic
+
+mostlyclean: mostlyclean-am
+
+clean-am:  clean-generic mostlyclean-am
+
+clean: clean-am
+
+distclean-am:  distclean-generic clean-am
+
+distclean: distclean-am
+
+maintainer-clean-am:  maintainer-clean-generic distclean-am
+	@echo "This command is intended for maintainers to use;"
+	@echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-am
+
+.PHONY: tags distdir info-am info dvi-am dvi check check-am \
+installcheck-am installcheck install-exec-am install-exec \
+install-data-am install-data install-am install uninstall-am uninstall \
+all-redirect all-am all installdirs mostlyclean-generic \
+distclean-generic clean-generic maintainer-clean-generic clean \
+mostlyclean distclean maintainer-clean
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/testing/Makefile.am b/testing/Makefile.am
new file mode 100644
index 000000000..d7254b532
--- /dev/null
+++ b/testing/Makefile.am
@@ -0,0 +1,2 @@
+
+EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
diff --git a/testing/Makefile.in b/testing/Makefile.in
new file mode 100644
index 000000000..061d682aa
--- /dev/null
+++ b/testing/Makefile.in
@@ -0,0 +1,185 @@
+# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS)
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+host_alias = @host_alias@
+host_triplet = @host@
+CC = @CC@
+CXX = @CXX@
+HAVE_LIB = @HAVE_LIB@
+LIB = @LIB@
+LTLIB = @LTLIB@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+PACKAGE = @PACKAGE@
+PACKAGE_DATE = @PACKAGE_DATE@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PACKAGE_YEAR = @PACKAGE_YEAR@
+RANLIB = @RANLIB@
+VERSION = @VERSION@
+
+EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
+mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
+CONFIG_HEADER = ../config_auto.h
+CONFIG_CLEAN_FILES = 
+DIST_COMMON =  README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
+
+TAR = tar
+GZIP_ENV = --best
+all: all-redirect
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4) 
+	cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
+
+Makefile: $(srcdir)/Makefile.in  $(top_builddir)/config.status $(BUILT_SOURCES)
+	cd $(top_builddir) \
+	  && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+tags: TAGS
+TAGS:
+
+
+distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
+
+subdir = testing
+
+distdir: $(DISTFILES)
+	here=`cd $(top_builddir) && pwd`; \
+	top_distdir=`cd $(top_distdir) && pwd`; \
+	distdir=`cd $(distdir) && pwd`; \
+	cd $(top_srcdir) \
+	  && $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
+	$(mkinstalldirs) $(distdir)/reports
+	@for file in $(DISTFILES); do \
+	  d=$(srcdir); \
+	  if test -d $$d/$$file; then \
+	    cp -pr $$d/$$file $(distdir)/$$file; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || ln $$d/$$file $(distdir)/$$file 2> /dev/null \
+	    || cp -p $$d/$$file $(distdir)/$$file || :; \
+	  fi; \
+	done
+info-am:
+info: info-am
+dvi-am:
+dvi: dvi-am
+check-am: all-am
+check: check-am
+installcheck-am:
+installcheck: installcheck-am
+install-exec-am:
+install-exec: install-exec-am
+
+install-data-am:
+install-data: install-data-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-am
+uninstall-am:
+uninstall: uninstall-am
+all-am: Makefile
+all-redirect: all-am
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
+installdirs:
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-rm -f Makefile $(CONFIG_CLEAN_FILES)
+	-rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+mostlyclean-am:  mostlyclean-generic
+
+mostlyclean: mostlyclean-am
+
+clean-am:  clean-generic mostlyclean-am
+
+clean: clean-am
+
+distclean-am:  distclean-generic clean-am
+
+distclean: distclean-am
+
+maintainer-clean-am:  maintainer-clean-generic distclean-am
+	@echo "This command is intended for maintainers to use;"
+	@echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-am
+
+.PHONY: tags distdir info-am info dvi-am dvi check check-am \
+installcheck-am installcheck install-exec-am install-exec \
+install-data-am install-data install-am install uninstall-am uninstall \
+all-redirect all-am all installdirs mostlyclean-generic \
+distclean-generic clean-generic maintainer-clean-generic clean \
+mostlyclean distclean maintainer-clean
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/testing/README b/testing/README
new file mode 100644
index 000000000..48024a908
--- /dev/null
+++ b/testing/README
@@ -0,0 +1,43 @@
+How to run UNLV tests.
+
+The scripts in this directory make it possible to duplicate the tests
+published in the Fourth Annual Test of OCR Accuracy
+(see http://www.isri.unlv.edu/downloads/AT-1995.pdf),
+but first you have to get the tools and data from UNLV:
+
+Step 1: To download the images, go to
+http://www.isri.unlv.edu/ISRI/OCRtk
+and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz.
+
+Step 2: Extract the files. It doesn't really matter where
+in your filesystem you put them, but they must go under a common
+root so you have directories 3, B, M and N in, for example,
+/users/me/ISRI-OCRtk.
+
+Step 3: Reorganize the files.
+The lack of tif extensions on the images is inconvenient, so there
+is a script to reorganize the data to match the rest of the test
+scripts.
+cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run
+/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B
+This makes directories doe3.3B, bus.3B, mag.3B and news.3B.
+You can now get rid of 3, B, M, and N unless you want to get some of the
+other scanning resolutions out of them.
+
+Step 4: Download the ISRI toolkit from:
+http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
+
+Step 5: If they work for you, use the binaries directly from the bin
+directory and put them in tesseract-ocr/testing/unlv;
+otherwise, build the tools for yourself and put them there.
+
+Step 6: cd back to your main tesseract-ocr dir and build tesseract.
+
+Step 7: Run testing/runalltests.sh with the root data dir and test name:
+testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
+and go to the gym, have lunch, etc.
+
+Step 8: There should be a file
+testing/reports/tess2.0.summary that contains the final summarized accuracy
+report and comparison with the 1995 results.
+
diff --git a/testing/counttestset.sh b/testing/counttestset.sh
new file mode 100755
index 000000000..408a93c17
--- /dev/null
+++ b/testing/counttestset.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# File:        counttestset.sh
+# Description: Script to count the errors on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 11:58:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 1 ]
+then
+  echo "Usage:$0 pagesfile"
+  exit 1
+fi
+if [ ! -d ccmain ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r testing/unlv/accuracy ]
+then
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+fi
+pages=$1
+
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=testing/results/$setname
+mkdir -p testing/reports
+echo "Counting on set $setname in directory $imdir to $resdir"
+accfiles=""
+wafiles=""
+while read page dir
+do
+  if [ "$dir" ]
+  then
+     srcdir="$imdir/$dir"
+  else
+     srcdir="$imdir"
+  fi
+#  echo "$srcdir/$page.tif"
+  # Count character errors.
+  testing/unlv/accuracy $srcdir/$page.txt $resdir/$page.txt $resdir/$page.acc
+  accfiles="$accfiles $resdir/$page.acc"
+  # Count word errors.
+  testing/unlv/wordacc $srcdir/$page.txt $resdir/$page.txt $resdir/$page.wa
+  wafiles="$wafiles $resdir/$page.wa"
+done <$pages
+testing/unlv/accsum $accfiles >testing/reports/$setname.characc
+testing/unlv/wordaccsum $wafiles >testing/reports/$setname.wordacc
+
+
diff --git a/testing/reorgdata.sh b/testing/reorgdata.sh
new file mode 100755
index 000000000..141de4a6f
--- /dev/null
+++ b/testing/reorgdata.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+if [ $# -ne 1 ]
+then
+    echo "Usage:$0 scantype"
+    echo "UNLV data comes in several scan types:"
+    echo "3B=300 dpi binary"
+    echo "3A=adaptive thresholded 300 dpi"
+    echo "3G=300 dpi grey"
+    echo "4B=400dpi binary"
+    echo "2B=200dpi binary"
+    echo "For now we only use 3B"
+    exit 1
+fi
+ext=$1
+
+#There are several test sets without meaningful names, so rename
+#them with something a bit more meaningful.
+#Each s is oldname/newname
+for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
+do
+    old=${s%/*}
+    #if this set was downloaded then process it.
+    if [ -r "$old/PAGES" ]
+    then
+	new=${s#*/}.$ext
+	mkdir -p $new
+    	echo "Set $old -> $new"
+	#The pages file had - instead of _ so fix it and add the extension.
+	for page in `cat $old/PAGES`
+	do
+    	    echo "${page%-*}_${page#*-}.$ext"
+	done >$new/pages
+	for f in `cat $new/pages`
+	do
+    	    #Put a tif extension on the tif files.
+	    cp $old/${old}_B/$f $new/$f.tif
+	    #Put a uzn extension on the zone files.
+	    cp $old/${old}_B/${f}Z $new/$f.uzn
+	    #Cat all the truth files together and put into a single txt file.
+	    cat $old/${old}_GT/${f%.$ext}.Z* >$new/$f.txt
+	done
+    fi
+done
diff --git a/testing/reports/1995.bus.3B.sum b/testing/reports/1995.bus.3B.sum
new file mode 100644
index 000000000..00eb97a86
--- /dev/null
+++ b/testing/reports/1995.bus.3B.sum
@@ -0,0 +1 @@
+1995	bus.3B	5959	98.14%	0.00%	1631	96.83%	0.00%	1293	95.73%	0.00%
diff --git a/testing/reports/1995.doe3.3B.sum b/testing/reports/1995.doe3.3B.sum
new file mode 100644
index 000000000..7eb753aee
--- /dev/null
+++ b/testing/reports/1995.doe3.3B.sum
@@ -0,0 +1 @@
+1995	doe3.3B	36349	97.52%	0.00%	7826	96.34%	0.00%	7042	94.87%	0.00%
diff --git a/testing/reports/1995.mag.3B.sum b/testing/reports/1995.mag.3B.sum
new file mode 100644
index 000000000..e718c5433
--- /dev/null
+++ b/testing/reports/1995.mag.3B.sum
@@ -0,0 +1 @@
+1995	mag.3B	15043	97.74%	0.00%	4566	96.01%	0.00%	3379	94.99%	0.00%
diff --git a/testing/reports/1995.news.3B.sum b/testing/reports/1995.news.3B.sum
new file mode 100644
index 000000000..bd0b7c68d
--- /dev/null
+++ b/testing/reports/1995.news.3B.sum
@@ -0,0 +1 @@
+1995	news.3B	6432	98.69%	0.00%	1946	97.68%	0.00%	1502	96.94%	0.00%
diff --git a/testing/runalltests.sh b/testing/runalltests.sh
new file mode 100755
index 000000000..6a3fdc1fd
--- /dev/null
+++ b/testing/runalltests.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# File:        runalltests.sh
+# Description: Script to run a set of UNLV test sets.
+# Author:      Ray Smith
+# Created:     Thu Jun 14 08:21:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Both arguments (the data directory and the version id) must be given.
+[ $# -eq 2 ] || {
+   echo "Usage:$0 unlv-data-dir version-id"
+   exit 1
+}
+# The paths used below are relative, so the current directory must contain ccmain.
+[ -d ccmain ] || {
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+}
+# A readable tesseract binary is needed: either ccmain/tesseract or tesseract.exe.
+[ -r ccmain/tesseract ] || [ -r tesseract.exe ] || {
+  echo "Please build tesseract before running $0"
+  exit 1
+}
+# The UNLV accuracy tool must be readable in testing/unlv (with or without .exe).
+[ -r testing/unlv/accuracy ] || [ -r testing/unlv/accuracy.exe ] || {
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+}
+
+#deltapc new old prints the %change from old to new with 2 decimals (0.00 if old is 0 or empty).
+deltapc() {
+awk -v cur="$1" -v ref="$2" 'BEGIN {
+printf("%.2f", (ref + 0 == 0) ? 0 : 100.0 * (cur - ref) / ref);
+}'
+}
+
+# imdir is the UNLV data directory; vid is the version id used to name this run's reports.
+imdir="$1"
+vid="$2"
+# The helper scripts are run from the directory part of $0 (or ./ if $0 has none).
+bindir=${0%/*}
+if [ "$bindir" = "$0" ]
+then
+    bindir="./"
+fi
+rdir=testing/reports
+testsets="bus.3B doe3.3B mag.3B news.3B"
+
+# Running totals over all test sets: this run first, then the old (1995) figures.
+totalerrs=0 totalwerrs=0 totalnswerrs=0
+totalolderrs=0 totaloldwerrs=0 totaloldnswerrs=0
+for set in $testsets
+do
+    # Test sets without a readable pages file in the data directory are skipped.
+    if [ -r "$imdir/$set/pages" ]
+    then
+        # Run tesseract on all the pages.
+        "$bindir/runtestset.sh" "$imdir/$set/pages"
+        # Count the errors on all the pages.
+        "$bindir/counttestset.sh" "$imdir/$set/pages"
+        # Get the old character, word and nonstop word errors (tab-separated fields).
+        olderrs=$(cut -f3 "$rdir/1995.$set.sum")
+        oldwerrs=$(cut -f6 "$rdir/1995.$set.sum")
+        oldnswerrs=$(cut -f9 "$rdir/1995.$set.sum")
+        # Get the new character, word and nonstop word errors and accuracy.
+        cherrs=$(head -n 4 "$rdir/$set.characc" |tail -n 1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        chacc=$(head -n 5 "$rdir/$set.characc" |tail -n 1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        wderrs=$(head -n 4 "$rdir/$set.wordacc" |tail -n 1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        wdacc=$(head -n 5 "$rdir/$set.wordacc" |tail -n 1 |cut -c1-9 |
+            tr -d '[:blank:]')
+        nswderrs=$(grep Total "$rdir/$set.wordacc" |head -n 2 |tail -n 1 |
+            cut -c10-17 |tr -d '[:blank:]')
+        nswdacc=$(grep Total "$rdir/$set.wordacc" |head -n 2 |tail -n 1 |
+            cut -c19-26 |tr -d '[:blank:]')
+        # Compute the percent change.
+        chdelta=$(deltapc "$cherrs" "$olderrs")
+        wdelta=$(deltapc "$wderrs" "$oldwerrs")
+        nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
+        sumfile=$rdir/$vid.$set.sum
+        printf '%s\t%s\t%s\t%s\t%s%%\t%s\t%s\t%s%%\t%s\t%s\t%s%%\n' "$vid" "$set" "$cherrs" "$chacc" \
+            "$chdelta" "$wderrs" "$wdacc" "$wdelta" "$nswderrs" "$nswdacc" "$nswdelta" >"$sumfile"
+        # Sum totals over all the testsets.
+        let totalerrs=totalerrs+cherrs
+        let totalwerrs=totalwerrs+wderrs
+        let totalnswerrs=totalnswerrs+nswderrs
+        let totalolderrs=totalolderrs+olderrs
+        let totaloldwerrs=totaloldwerrs+oldwerrs
+        let totaloldnswerrs=totaloldnswerrs+oldnswerrs
+    fi
+done
+# Compute grand total percent change.
+chdelta=$(deltapc "$totalerrs" "$totalolderrs")
+wdelta=$(deltapc "$totalwerrs" "$totaloldwerrs")
+nswdelta=$(deltapc "$totalnswerrs" "$totaloldnswerrs")
+tfile=$rdir/$vid.total.sum
+printf '%s\tTotal\t%s\t-\t%s%%\t%s\t-\t%s%%\t%s\t-\t%s%%\n' "$vid" "$totalerrs" "$chdelta" \
+    "$totalwerrs" "$wdelta" "$totalnswerrs" "$nswdelta" >"$tfile"
+cat "$rdir"/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid.summary"
diff --git a/testing/runtestset.sh b/testing/runtestset.sh
new file mode 100755
index 000000000..b44d51c9e
--- /dev/null
+++ b/testing/runtestset.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# File:        runtestset.sh
+# Description: Script to run tesseract on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 10:13:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Exactly one argument, the pages file, is required.
+[ $# -eq 1 ] || {
+  echo "Usage:$0 pagesfile"
+  exit 1
+}
+# The paths used below are relative, so the current directory must contain ccmain.
+[ -d ccmain ] || {
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+}
+# Use ccmain/tesseract if it is readable, otherwise tesseract.exe in the current directory.
+if [ -r ccmain/tesseract ]
+then
+  tess="ccmain/tesseract"
+  # Only this branch sets TESSDATA_PREFIX; it is pointed at the current directory.
+  export TESSDATA_PREFIX=$PWD/
+elif [ -r tesseract.exe ]
+then
+  tess="./tesseract.exe"
+else
+  echo "Please build tesseract before running $0"
+  exit 1
+fi
+
+pages=$1
+# Strip a trailing /pages to get the image directory; its last component names the set.
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=testing/results/$setname
+echo "Testing on set $setname in directory $imdir to $resdir"
+mkdir -p "$resdir"
+while read -r page dir || [ -n "$page" ]
+do
+  # Each line is "page [subdir]": a pages file may be a list of files with
+  # subdirs or maybe just a plain list of files, so accommodate both.
+  if [ "$dir" ]
+  then
+     srcdir="$imdir/$dir"
+  else
+     srcdir="$imdir"
+  fi
+  # Paths are quoted so that a data directory containing spaces still works.
+  "$tess" "$srcdir/$page.tif" "$resdir/$page" nobatch unlv
+done <"$pages"