API/output changes to produce unlv-style latin-1 output and test scripts
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@86 d0cd1f9f-072b-0410-8dd7-cf729c803f20
commit 627368df42
parent eeaca1beba
@@ -24,20 +24,22 @@ what measures we are interested in.
 /* #define SECURE_NAMES done in secnames.h when necessary*/

 #include "mfcpch.h"
 #include "applybox.h"
 #include <ctype.h>
 #include <string.h>
 #ifdef __UNIX__
 #include <assert.h>
 #include <errno.h>
 #endif
 #include "mainblk.h"
 #include "genblob.h"
 #include "fixxht.h"
 #include "control.h"
 #include "tessbox.h"
 #include "globals.h"
 #include "secname.h"
+#include "unichar.h"
+#include "matchdefs.h"

 #define SECURE_NAMES
 #ifndef SECURE_NAMES
@@ -47,10 +49,13 @@ what measures we are interested in.
 #define EXTERN
 EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
 EXTERN INT_VAR (applybox_debug, 0, "Debug level");
-EXTERN STRING_VAR (applybox_test_exclusions, "|",
+EXTERN STRING_VAR (applybox_test_exclusions, "",
 "Chars ignored for testing");
 EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

+// The unicharset used during box training
+static UNICHARSET unicharset_boxes;
+
 /*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
@@ -89,7 +94,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 INT16 boxfile_lineno = 0;
 INT16 boxfile_charno = 0;
 BOX box; //boxfile box
-char ch[2]; //correct ch from boxfile
+UNICHAR_ID uch_id; //correct ch from boxfile
 ROW *row;
 ROW *prev_row = NULL;
 INT16 prev_box_right = MAX_INT16;
@@ -100,15 +105,20 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 INT16 labels_ok;
 INT16 rows_ok;
 INT16 bad_blobs;
-INT16 tgt_char_counts[128]; //No. of box samples
+INT16 tgt_char_counts[MAX_NUM_CLASSES]; //No. of box samples
 // INT16 labelled_char_counts[128]; //No. of unique labelled samples
 INT16 i;
 INT16 rebalance_count = 0;
-char min_char;
+UNICHAR_ID min_uch_id;
 INT16 min_samples;
 INT16 final_labelled_blob_count;

-for (i = 0; i < 128; i++)
+// Clean the unichar set
+unicharset_boxes.clear();
+// Space character needed to represent NIL classification
+unicharset_boxes.unichar_insert(" ");
+
+for (i = 0; i < MAX_NUM_CLASSES; i++)
 tgt_char_counts[i] = 0;

 FILE* box_file;
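The block above replaces the fixed 128-entry ASCII tables with counts indexed by UNICHAR_ID, interning every box label into unicharset_boxes. For reference, a minimal sketch of that intern/lookup round-trip (illustrative only, not part of the commit; the unicharset.h header name is an assumption):

#include "unichar.h"
#include "unicharset.h"   // assumed header for UNICHARSET

// Intern a UTF-8 label and return the stable id used to index the
// tgt_char_counts / labelled_char_counts arrays above.
static UNICHAR_ID intern_label(UNICHARSET& set, const char* utf8_label) {
  if (!set.contains_unichar(utf8_label))
    set.unichar_insert(utf8_label);        // first sighting: grow the set
  return set.unichar_to_id(utf8_label);    // id stays stable for this run
}
// set.id_to_unichar(id) recovers the UTF-8 string again for error reports.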
@@ -120,11 +130,10 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 filename.string(), errno);
 }

-ch[1] = '\0';
 clear_any_old_text(block_list);
-while (read_next_box (box_file, &box, &ch[0])) {
+while (read_next_box (box_file, &box, &uch_id)) {
 box_count++;
-tgt_char_counts[ch[0]]++;
+tgt_char_counts[uch_id]++;
 row = find_row_of_box (block_list, box, block_id, row_id);
 if (box.left () < prev_box_right) {
 boxfile_lineno++;
@@ -135,14 +144,16 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks

 if (row == NULL) {
 box_failures++;
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
 "FAILURE! box overlaps no blobs or blobs in multiple rows");
 }
 else {
 if ((box.left () >= prev_box_right) && (row != prev_row))
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
 "WARNING! false row break");
-box_failures += resegment_box (row, box, ch, block_id, row_id,
+box_failures += resegment_box (row, box, uch_id, block_id, row_id,
 boxfile_lineno, boxfile_charno);
 prev_row = row;
 }
@@ -154,7 +165,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 bad_blobs,
 tgt_char_counts,
 rebalance_count,
-min_char,
+&min_uch_id,
 min_samples,
 final_labelled_blob_count);
 tprintf ("APPLY_BOXES:\n");
@@ -163,7 +174,8 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 labels_ok, rows_ok);
 tprintf (" Box failures detected: %6d\n", box_failures);
 tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
-tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples);
+tprintf (" \"%s\" has fewest samples:%6d\n",
+unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
 tprintf (" Total unlabelled words: %6d\n",
 bad_blobs);
 tprintf (" Final labelled words: %6d\n",
@@ -194,7 +206,7 @@ void clear_any_old_text( //remove correct text

 BOOL8 read_next_box(FILE* box_file, //
 BOX *box,
-char *ch) {
+UNICHAR_ID *uch_id) {
 char buff[256]; //boxfile read buffer
 char *buffptr = buff;
 STRING box_filename;
@@ -204,23 +216,38 @@ BOOL8 read_next_box(FILE* box_file, //
 INT32 x_max;
 INT32 y_max;
 INT32 count = 0;
+char uch[256];

 while (!feof (box_file)) {
 fgets (buff, sizeof (buff) - 1, box_file);
 line++;

+buffptr = buff;
+const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
+if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
+buffptr += 3; // Skip unicode file designation.
 /* Check for blank lines in box file */
-for (buffptr = buff; isspace (*buffptr); buffptr++)
-;
+while (isspace (*buffptr))
+buffptr++;
 if (*buffptr != '\0') {
 count =
-sscanf (buff,
-"%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
-INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
+sscanf (buffptr,
+"%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
+INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
 if (count != 5) {
 tprintf ("Box file format error on line %i ignored\n", line);
 }
 else {
+if (!unicharset_boxes.contains_unichar(uch))
+{
+unicharset_boxes.unichar_insert(uch);
+if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
+tprintf("Error: Size of unicharset of boxes is \
+greater than MAX_NUM_CLASSES\n");
+exit(1);
+}
+}
+*uch_id = unicharset_boxes.unichar_to_id(uch);
 *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
 return TRUE; //read a box ok
 }
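Each record handled by read_next_box above is one UTF-8 glyph followed by four integer coordinates (left, bottom, right, top). A self-contained sketch of that parse with an invented sample line (illustrative only, not part of the commit):

#include <cstdio>

int main() {
  const char* record = "m 85 66 106 91";   // invented sample box-file line
  char uch[256];
  int x_min, y_min, x_max, y_max;
  // Mirrors the patched sscanf: "%s" reads the (possibly multi-byte) glyph.
  int count = std::sscanf(record, "%s %d %d %d %d",
                          uch, &x_min, &y_min, &x_max, &y_max);
  if (count == 5)
    std::printf("glyph \"%s\" box (%d,%d)-(%d,%d)\n",
                uch, x_min, y_min, x_max, y_max);
  return 0;
}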
@@ -314,7 +341,7 @@ ROW *find_row_of_box( //
 INT16 resegment_box( //
 ROW *row,
 BOX box,
-char *ch,
+UNICHAR_ID uch_id,
 INT16 block_id,
 INT16 row_id,
 INT16 boxfile_lineno,
@@ -358,7 +385,7 @@ INT16 resegment_box( //
 if (applybox_debug > 4)
 report_failed_box (boxfile_lineno,
 boxfile_charno,
-box, ch,
+box, unicharset_boxes.id_to_unichar(uch_id),
 "FAILURE! box overlaps blob in labelled word");
 }
 if (applybox_debug > 4)
@@ -375,7 +402,7 @@ INT16 resegment_box( //
 if (new_word == NULL) {
 /* Make a new word with a single blob */
 new_word = word->shallow_copy ();
-new_word->set_text (ch);
+new_word->set_text (unicharset_boxes.id_to_unichar(uch_id));
 if (polyg)
 new_blob = new PBLOB;
 else
@@ -414,63 +441,75 @@ INT16 resegment_box( //
 word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
 baseline = row->base_line (word_x_centre);

-if (STRING (chs_caps_ht).contains (ch[0]) &&
-(new_word_box.top () <
-baseline + (1 + applybox_error_band) * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! caps-ht char didn't ascend");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_odd_top).contains (ch[0]) &&
-(new_word_box.top () <
-baseline + (1 - applybox_error_band) * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! Odd top char below xht");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_x_ht).contains (ch[0]) &&
-((new_word_box.top () >
-baseline + (1 + applybox_error_band) * row->x_height ()) ||
-(new_word_box.top () <
-baseline + (1 - applybox_error_band) * row->x_height ()))) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! x-ht char didn't have top near xht");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
-((new_word_box.bottom () <
-baseline - applybox_error_band * row->x_height ()) ||
-(new_word_box.bottom () >
-baseline + applybox_error_band * row->x_height ()))) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! non ambig BL char didnt have bottom near baseline");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_odd_bot).contains (ch[0]) &&
-(new_word_box.bottom () >
-baseline + applybox_error_band * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! Odd bottom char above baseline");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_desc).contains (ch[0]) &&
-(new_word_box.bottom () >
-baseline - applybox_error_band * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+#if 0
+if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
+if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.top () <
+baseline + (1 + applybox_error_band) * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! caps-ht char didn't ascend");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.top () <
+baseline + (1 - applybox_error_band) * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! Odd top char below xht");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+((new_word_box.top () >
+baseline + (1 + applybox_error_band) * row->x_height ()) ||
+(new_word_box.top () <
+baseline + (1 - applybox_error_band) * row->x_height ()))) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! x-ht char didn't have top near xht");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_non_ambig_bl).contains
+(unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+((new_word_box.bottom () <
+baseline - applybox_error_band * row->x_height ()) ||
+(new_word_box.bottom () >
+baseline + applybox_error_band * row->x_height ()))) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! non ambig BL char didnt have bottom near baseline");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.bottom () >
+baseline + applybox_error_band * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! Odd bottom char above baseline");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.bottom () >
+baseline - applybox_error_band * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
 "FAILURE! Descender doesn't descend");
 new_word->set_text ("");
 return 1;
+}
 }
+#endif
 return 0;
 }
 else {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! Couldn't find any blobs");
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! Couldn't find any blobs");
 return 1;
 }
 }
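The checks disabled under #if 0 above compare each labelled box against bands derived from applybox_error_band (0.15 of the x-height by default). A worked example of those thresholds with invented numbers (illustrative only, not part of the commit):

#include <cstdio>

int main() {
  // Invented numbers: baseline at y = 100, x-height 40, band fraction 0.15.
  const float baseline = 100.0f, xht = 40.0f, band = 0.15f;
  // caps-height char passes only if its top reaches baseline + (1+band)*xht;
  // an x-height char's top must fall inside [baseline+(1-band)*xht, baseline+(1+band)*xht];
  // an unambiguous baseline char's bottom must fall inside [baseline-band*xht, baseline+band*xht].
  std::printf("caps top >= %.0f; xht top in [%.0f, %.0f]; baseline bottom in [%.0f, %.0f]\n",
              baseline + (1 + band) * xht,
              baseline + (1 - band) * xht, baseline + (1 + band) * xht,
              baseline - band * xht, baseline + band * xht);
  return 0;   // prints 146, [134, 146], [94, 106] for these numbers
}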
@ -492,7 +531,7 @@ void tidy_up( //
|
|||||||
INT16 &unlabelled_words,
|
INT16 &unlabelled_words,
|
||||||
INT16 *tgt_char_counts,
|
INT16 *tgt_char_counts,
|
||||||
INT16 &rebalance_count,
|
INT16 &rebalance_count,
|
||||||
char &min_char,
|
UNICHAR_ID *min_uch_id,
|
||||||
INT16 &min_samples,
|
INT16 &min_samples,
|
||||||
INT16 &final_labelled_blob_count) {
|
INT16 &final_labelled_blob_count) {
|
||||||
BLOCK_IT block_it(block_list);
|
BLOCK_IT block_it(block_list);
|
||||||
@ -507,16 +546,16 @@ void tidy_up( //
|
|||||||
BOOL8 row_ok;
|
BOOL8 row_ok;
|
||||||
BOOL8 rebalance_needed = FALSE;
|
BOOL8 rebalance_needed = FALSE;
|
||||||
//No. of unique labelled samples
|
//No. of unique labelled samples
|
||||||
INT16 labelled_char_counts[128];
|
INT16 labelled_char_counts[MAX_NUM_CLASSES];
|
||||||
INT16 i;
|
INT16 i;
|
||||||
char ch;
|
UNICHAR_ID uch_id;
|
||||||
char prev_ch = '\0';
|
UNICHAR_ID prev_uch_id = -1;
|
||||||
BOOL8 at_dupe_of_prev_word;
|
BOOL8 at_dupe_of_prev_word;
|
||||||
ROW *prev_row = NULL;
|
ROW *prev_row = NULL;
|
||||||
INT16 left;
|
INT16 left;
|
||||||
INT16 prev_left = -1;
|
INT16 prev_left = -1;
|
||||||
|
|
||||||
for (i = 0; i < 128; i++)
|
for (i = 0; i < MAX_NUM_CLASSES; i++)
|
||||||
labelled_char_counts[i] = 0;
|
labelled_char_counts[i] = 0;
|
||||||
|
|
||||||
ok_char_count = 0;
|
ok_char_count = 0;
|
||||||
@ -556,7 +595,7 @@ void tidy_up( //
|
|||||||
block_idx, row_idx, all_row_idx);
|
block_idx, row_idx, all_row_idx);
|
||||||
|
|
||||||
ok_char_count++;
|
ok_char_count++;
|
||||||
labelled_char_counts[*word->text ()]++;
|
labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++;
|
||||||
row_ok = TRUE;
|
row_ok = TRUE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -571,24 +610,24 @@ void tidy_up( //
|
|||||||
}
|
}
|
||||||
|
|
||||||
min_samples = 9999;
|
min_samples = 9999;
|
||||||
for (i = 0; i < 128; i++) {
|
for (i = 0; i < unicharset_boxes.size(); i++) {
|
||||||
if (tgt_char_counts[i] > labelled_char_counts[i]) {
|
if (tgt_char_counts[i] > labelled_char_counts[i]) {
|
||||||
if (labelled_char_counts[i] <= 1) {
|
if (labelled_char_counts[i] <= 1) {
|
||||||
tprintf
|
tprintf
|
||||||
("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
|
("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d\n",
|
||||||
labelled_char_counts[i], (char) i, tgt_char_counts[i]);
|
labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
rebalance_needed = TRUE;
|
rebalance_needed = TRUE;
|
||||||
if (applybox_debug > 0)
|
if (applybox_debug > 0)
|
||||||
tprintf
|
tprintf
|
||||||
("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
|
("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",
|
||||||
(char) i, tgt_char_counts[i], labelled_char_counts[i]);
|
unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
|
if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
|
||||||
min_samples = labelled_char_counts[i];
|
min_samples = labelled_char_counts[i];
|
||||||
min_char = (char) i;
|
*min_uch_id = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -605,33 +644,36 @@ void tidy_up( //
|
|||||||
!word_it.cycled_list (); word_it.forward ()) {
|
!word_it.cycled_list (); word_it.forward ()) {
|
||||||
word = word_it.data ();
|
word = word_it.data ();
|
||||||
left = word->bounding_box ().left ();
|
left = word->bounding_box ().left ();
|
||||||
ch = *word->text ();
|
if (*word->text () != '\0')
|
||||||
|
uch_id = unicharset_boxes.unichar_to_id(word->text ());
|
||||||
|
else
|
||||||
|
uch_id = -1;
|
||||||
at_dupe_of_prev_word = ((row == prev_row) &&
|
at_dupe_of_prev_word = ((row == prev_row) &&
|
||||||
(left = prev_left) &&
|
(left = prev_left) &&
|
||||||
(ch == prev_ch));
|
(uch_id == prev_uch_id));
|
||||||
if ((ch != '\0') &&
|
if ((uch_id != -1) &&
|
||||||
(labelled_char_counts[ch] > 1) &&
|
(labelled_char_counts[uch_id] > 1) &&
|
||||||
(tgt_char_counts[ch] > labelled_char_counts[ch]) &&
|
(tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
|
||||||
(!at_dupe_of_prev_word)) {
|
(!at_dupe_of_prev_word)) {
|
||||||
/* Duplicate the word to rebalance the labelled samples */
|
/* Duplicate the word to rebalance the labelled samples */
|
||||||
if (applybox_debug > 9) {
|
if (applybox_debug > 9) {
|
||||||
tprintf ("Duping \"%c\" from ", ch);
|
tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
|
||||||
word->bounding_box ().print ();
|
word->bounding_box ().print ();
|
||||||
}
|
}
|
||||||
duplicate_word = new WERD;
|
duplicate_word = new WERD;
|
||||||
*duplicate_word = *word;
|
*duplicate_word = *word;
|
||||||
word_it.add_after_then_move (duplicate_word);
|
word_it.add_after_then_move (duplicate_word);
|
||||||
rebalance_count++;
|
rebalance_count++;
|
||||||
labelled_char_counts[ch]++;
|
labelled_char_counts[uch_id]++;
|
||||||
}
|
}
|
||||||
prev_row = row;
|
prev_row = row;
|
||||||
prev_left = left;
|
prev_left = left;
|
||||||
prev_ch = ch;
|
prev_uch_id = uch_id;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
rebalance_needed = FALSE;
|
rebalance_needed = FALSE;
|
||||||
for (i = 0; i < 128; i++) {
|
for (i = 0; i < unicharset_boxes.size(); i++) {
|
||||||
if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
|
if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
|
||||||
(labelled_char_counts[i] > 1)) {
|
(labelled_char_counts[i] > 1)) {
|
||||||
rebalance_needed = TRUE;
|
rebalance_needed = TRUE;
|
||||||
@ -653,7 +695,7 @@ void tidy_up( //
|
|||||||
for (word_it.mark_cycle_pt ();
|
for (word_it.mark_cycle_pt ();
|
||||||
!word_it.cycled_list (); word_it.forward ()) {
|
!word_it.cycled_list (); word_it.forward ()) {
|
||||||
word = word_it.data ();
|
word = word_it.data ();
|
||||||
if ((strlen (word->text ()) == 1) &&
|
if ((strlen (word->text ()) > 0) &&
|
||||||
(word->gblob_list ()->length () == 1))
|
(word->gblob_list ()->length () == 1))
|
||||||
final_labelled_blob_count++;
|
final_labelled_blob_count++;
|
||||||
}
|
}
|
||||||
@ -665,7 +707,7 @@ void tidy_up( //
|
|||||||
void report_failed_box(INT16 boxfile_lineno,
|
void report_failed_box(INT16 boxfile_lineno,
|
||||||
INT16 boxfile_charno,
|
INT16 boxfile_charno,
|
||||||
BOX box,
|
BOX box,
|
||||||
char *box_ch,
|
const char *box_ch,
|
||||||
const char *err_msg) {
|
const char *err_msg) {
|
||||||
if (applybox_debug > 4)
|
if (applybox_debug > 4)
|
||||||
tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
|
tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
|
||||||
@ -687,10 +729,9 @@ void apply_box_training(BLOCK_LIST *block_list) {
|
|||||||
PBLOB_IT blob_it;
|
PBLOB_IT blob_it;
|
||||||
DENORM denorm;
|
DENORM denorm;
|
||||||
INT16 count = 0;
|
INT16 count = 0;
|
||||||
char ch[2];
|
char unichar[UNICHAR_LEN + 1];
|
||||||
|
|
||||||
ch[1] = '\0';
|
|
||||||
|
|
||||||
|
unichar[UNICHAR_LEN] = '\0';
|
||||||
tprintf ("Generating training data\n");
|
tprintf ("Generating training data\n");
|
||||||
for (block_it.mark_cycle_pt ();
|
for (block_it.mark_cycle_pt ();
|
||||||
!block_it.cycled_list (); block_it.forward ()) {
|
!block_it.cycled_list (); block_it.forward ()) {
|
||||||
@ -701,23 +742,22 @@ void apply_box_training(BLOCK_LIST *block_list) {
|
|||||||
for (word_it.mark_cycle_pt ();
|
for (word_it.mark_cycle_pt ();
|
||||||
!word_it.cycled_list (); word_it.forward ()) {
|
!word_it.cycled_list (); word_it.forward ()) {
|
||||||
word = word_it.data ();
|
word = word_it.data ();
|
||||||
if ((strlen (word->text ()) == 1) &&
|
if ((strlen (word->text ()) > 0) &&
|
||||||
(word->gblob_list ()->length () == 1)) {
|
(word->gblob_list ()->length () == 1)) {
|
||||||
/* Here is a word with a single char label and a single blob so train on it */
|
/* Here is a word with a single unichar label and a single blob so train on it */
|
||||||
bln_word =
|
bln_word =
|
||||||
make_bln_copy (word, row, row->x_height (), &denorm);
|
make_bln_copy (word, row, row->x_height (), &denorm);
|
||||||
blob_it.set_to_list (bln_word->blob_list ());
|
blob_it.set_to_list (bln_word->blob_list ());
|
||||||
ch[0] = *word->text ();
|
strncpy(unichar, word->text (), UNICHAR_LEN);
|
||||||
tess_training_tester (blob_it.data (),
|
tess_training_tester (blob_it.data (),
|
||||||
//single blob
|
//single blob
|
||||||
&denorm, TRUE, //correct
|
&denorm, TRUE, //correct
|
||||||
ch, //correct ASCII char
|
unichar, //correct character
|
||||||
1, //ASCII length
|
strlen(unichar), //character length
|
||||||
NULL);
|
NULL);
|
||||||
copy_outword = *(bln_word);
|
copy_outword = *(bln_word);
|
||||||
copy_outword.baseline_denormalise (&denorm);
|
copy_outword.baseline_denormalise (&denorm);
|
||||||
blob_it.set_to_list (copy_outword.blob_list ());
|
blob_it.set_to_list (copy_outword.blob_list ());
|
||||||
ch[0] = *word->text ();
|
|
||||||
delete bln_word;
|
delete bln_word;
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
@ -793,7 +833,7 @@ void apply_box_testing(BLOCK_LIST *block_list) {
|
|||||||
choice list, outword blob lists and best_choice string are the same
|
choice list, outword blob lists and best_choice string are the same
|
||||||
length. A TESS screw up is indicated by a blank filled or 0 length string.
|
length. A TESS screw up is indicated by a blank filled or 0 length string.
|
||||||
*/
|
*/
|
||||||
if ((best_choice->string ().length () == 0) ||
|
if ((best_choice->lengths ().length () == 0) ||
|
||||||
(strspn (best_choice->string ().string (), " ") ==
|
(strspn (best_choice->string ().string (), " ") ==
|
||||||
best_choice->string ().length ())) {
|
best_choice->string ().length ())) {
|
||||||
rej_count++;
|
rej_count++;
|
||||||
@ -804,22 +844,22 @@ void apply_box_testing(BLOCK_LIST *block_list) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if ((best_choice->string ().length () !=
|
if ((best_choice->lengths ().length () !=
|
||||||
outword->blob_list ()->length ()) ||
|
outword->blob_list ()->length ()) ||
|
||||||
(best_choice->string ().length () !=
|
(best_choice->lengths ().length () !=
|
||||||
blob_choices.length ())) {
|
blob_choices.length ())) {
|
||||||
tprintf
|
tprintf
|
||||||
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
||||||
best_choice->string ().string (),
|
best_choice->string ().string (),
|
||||||
best_choice->string ().length (),
|
best_choice->lengths ().length (),
|
||||||
outword->blob_list ()->length (),
|
outword->blob_list ()->length (),
|
||||||
blob_choices.length ());
|
blob_choices.length ());
|
||||||
}
|
}
|
||||||
ASSERT_HOST (best_choice->string ().length () ==
|
ASSERT_HOST (best_choice->lengths ().length () ==
|
||||||
outword->blob_list ()->length ());
|
outword->blob_list ()->length ());
|
||||||
ASSERT_HOST (best_choice->string ().length () ==
|
ASSERT_HOST (best_choice->lengths ().length () ==
|
||||||
blob_choices.length ());
|
blob_choices.length ());
|
||||||
fix_quotes ((char *) best_choice->string ().string (),
|
fix_quotes (best_choice,
|
||||||
//turn to double
|
//turn to double
|
||||||
outword, &blob_choices);
|
outword, &blob_choices);
|
||||||
if (strcmp (best_choice->string ().string (), ch) != 0) {
|
if (strcmp (best_choice->string ().string (), ch) != 0) {
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
#include "applybox.h"
|
#include "applybox.h"
|
||||||
#include "pgedit.h"
|
#include "pgedit.h"
|
||||||
#include "varabled.h"
|
#include "varabled.h"
|
||||||
|
#include "output.h"
|
||||||
#include "adaptmatch.h"
|
#include "adaptmatch.h"
|
||||||
|
|
||||||
BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
|
BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
|
||||||
@ -37,6 +38,8 @@ BOOL_VAR(tessedit_train_from_boxes, FALSE,
|
|||||||
// Minimum sensible image size to be worth running tesseract.
|
// Minimum sensible image size to be worth running tesseract.
|
||||||
const int kMinRectSize = 10;
|
const int kMinRectSize = 10;
|
||||||
|
|
||||||
|
static STRING input_file = "noname.tif";
|
||||||
|
|
||||||
// Start tesseract.
|
// Start tesseract.
|
||||||
// The datapath must be the name of the data directory or some other file
|
// The datapath must be the name of the data directory or some other file
|
||||||
// in which the data directory resides (for instance argv[0].)
|
// in which the data directory resides (for instance argv[0].)
|
||||||
@ -70,6 +73,12 @@ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set the name of the input file. Needed only for training and
|
||||||
|
// loading a UNLV zone file.
|
||||||
|
void TessBaseAPI::SetInputName(const char* name) {
|
||||||
|
input_file = name;
|
||||||
|
}
|
||||||
|
|
||||||
// Recognize a rectangle from an image and return the result as a string.
|
// Recognize a rectangle from an image and return the result as a string.
|
||||||
// May be called many times for a single Init.
|
// May be called many times for a single Init.
|
||||||
// Currently has no error checking.
|
// Currently has no error checking.
|
||||||
@ -96,6 +105,52 @@ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
|
|||||||
return RecognizeToString();
|
return RecognizeToString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As TesseractRect but produces a box file as output.
|
||||||
|
char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
|
||||||
|
int bytes_per_pixel,
|
||||||
|
int bytes_per_line,
|
||||||
|
int left, int top,
|
||||||
|
int width, int height,
|
||||||
|
int imageheight) {
|
||||||
|
if (width < kMinRectSize || height < kMinRectSize)
|
||||||
|
return NULL; // Nothing worth doing.
|
||||||
|
|
||||||
|
// Copy/Threshold the image to the tesseract global page_image.
|
||||||
|
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
|
||||||
|
left, top, width, height);
|
||||||
|
|
||||||
|
BLOCK_LIST block_list;
|
||||||
|
|
||||||
|
FindLines(&block_list);
|
||||||
|
|
||||||
|
// Now run the main recognition.
|
||||||
|
PAGE_RES* page_res = Recognize(&block_list, NULL);
|
||||||
|
|
||||||
|
return TesseractToBoxText(page_res, left, imageheight - (top + height));
|
||||||
|
}
|
||||||
|
|
||||||
|
char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
|
||||||
|
int bytes_per_pixel,
|
||||||
|
int bytes_per_line,
|
||||||
|
int left, int top,
|
||||||
|
int width, int height) {
|
||||||
|
if (width < kMinRectSize || height < kMinRectSize)
|
||||||
|
return NULL; // Nothing worth doing.
|
||||||
|
|
||||||
|
// Copy/Threshold the image to the tesseract global page_image.
|
||||||
|
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
|
||||||
|
left, top, width, height);
|
||||||
|
|
||||||
|
BLOCK_LIST block_list;
|
||||||
|
|
||||||
|
FindLines(&block_list);
|
||||||
|
|
||||||
|
// Now run the main recognition.
|
||||||
|
PAGE_RES* page_res = Recognize(&block_list, NULL);
|
||||||
|
|
||||||
|
return TesseractToUNLV(page_res);
|
||||||
|
}
|
||||||
|
|
||||||
// Call between pages or documents etc to free up memory and forget
|
// Call between pages or documents etc to free up memory and forget
|
||||||
// adaptive data.
|
// adaptive data.
|
||||||
void TessBaseAPI::ClearAdaptiveClassifier() {
|
void TessBaseAPI::ClearAdaptiveClassifier() {
|
||||||
@ -326,7 +381,7 @@ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
|
|||||||
image.capture(const_cast<unsigned char*>(imagedata),
|
image.capture(const_cast<unsigned char*>(imagedata),
|
||||||
bytes_per_line*8, top + height, 1);
|
bytes_per_line*8, top + height, 1);
|
||||||
page_image.create(width, height, 1);
|
page_image.create(width, height, 1);
|
||||||
copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false);
|
copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Low-level function to recognize the current global image to a string.
|
// Low-level function to recognize the current global image to a string.
|
||||||
@ -343,7 +398,6 @@ char* TessBaseAPI::RecognizeToString() {
|
|||||||
|
|
||||||
// Find lines from the image making the BLOCK_LIST.
|
// Find lines from the image making the BLOCK_LIST.
|
||||||
void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
|
void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
|
||||||
STRING input_file = "noname.tif";
|
|
||||||
// The following call creates a full-page block and then runs connected
|
// The following call creates a full-page block and then runs connected
|
||||||
// component analysis and text line creation.
|
// component analysis and text line creation.
|
||||||
pgeditor_read_file(input_file, block_list);
|
pgeditor_read_file(input_file, block_list);
|
||||||
@ -369,21 +423,32 @@ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
|
|||||||
return page_res;
|
return page_res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the maximum length that the output text string might occupy.
|
||||||
|
int TessBaseAPI::TextLength(PAGE_RES* page_res) {
|
||||||
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
|
int total_length = 2;
|
||||||
|
// Iterate over the data structures to extract the recognition result.
|
||||||
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
|
page_res_it.forward()) {
|
||||||
|
WERD_RES *word = page_res_it.word();
|
||||||
|
WERD_CHOICE* choice = word->best_choice;
|
||||||
|
if (choice != NULL) {
|
||||||
|
total_length += choice->string().length() + 1;
|
||||||
|
for (int i = 0; i < word->reject_map.length(); ++i) {
|
||||||
|
if (word->reject_map[i].rejected())
|
||||||
|
++total_length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total_length;
|
||||||
|
}
|
||||||
|
|
||||||
// Make a text string from the internal data structures.
|
// Make a text string from the internal data structures.
|
||||||
// The input page_res is deleted.
|
// The input page_res is deleted.
|
||||||
char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
|
char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
|
||||||
if (page_res != NULL) {
|
if (page_res != NULL) {
|
||||||
int total_length = 2;
|
int total_length = TextLength(page_res);
|
||||||
PAGE_RES_IT page_res_it(page_res);
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
// Iterate over the data structures to extract the recognition result.
|
|
||||||
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
||||||
page_res_it.forward()) {
|
|
||||||
WERD_RES *word = page_res_it.word();
|
|
||||||
WERD_CHOICE* choice = word->best_choice;
|
|
||||||
if (choice != NULL) {
|
|
||||||
total_length += choice->string().length() + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
char* result = new char[total_length];
|
char* result = new char[total_length];
|
||||||
char* ptr = result;
|
char* ptr = result;
|
||||||
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
@ -406,3 +471,207 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
|
|||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ConvertWordToBoxText(WERD_RES *word,
|
||||||
|
ROW_RES* row,
|
||||||
|
int left,
|
||||||
|
int bottom,
|
||||||
|
char* word_str) {
|
||||||
|
// Copy the output word and denormalize it back to image coords.
|
||||||
|
WERD copy_outword;
|
||||||
|
copy_outword = *(word->outword);
|
||||||
|
copy_outword.baseline_denormalise(&word->denorm);
|
||||||
|
PBLOB_IT blob_it;
|
||||||
|
blob_it.set_to_list(copy_outword.blob_list());
|
||||||
|
int length = copy_outword.blob_list()->length();
|
||||||
|
int output_size = 0;
|
||||||
|
|
||||||
|
if (length > 0) {
|
||||||
|
for (int index = 0, offset = 0; index < length;
|
||||||
|
offset += word->best_choice->lengths()[index++], blob_it.forward()) {
|
||||||
|
PBLOB* blob = blob_it.data();
|
||||||
|
BOX blob_box = blob->bounding_box();
|
||||||
|
if (word->tess_failed ||
|
||||||
|
blob_box.left() < 0 ||
|
||||||
|
blob_box.right() > page_image.get_xsize() ||
|
||||||
|
blob_box.bottom() < 0 ||
|
||||||
|
blob_box.top() > page_image.get_ysize()) {
|
||||||
|
// Bounding boxes can be illegal when tess fails on a word.
|
||||||
|
blob_box = word->word->bounding_box(); // Use original word as backup.
|
||||||
|
tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
|
||||||
|
blob_box.left(), blob_box.bottom(),
|
||||||
|
blob_box.right(), blob_box.top());
|
||||||
|
}
|
||||||
|
|
||||||
|
// A single classification unit can be composed of several UTF-8
|
||||||
|
// characters. Append each of them to the result.
|
||||||
|
for (int sub = 0; sub < word->best_choice->lengths()[index]; ++sub) {
|
||||||
|
char ch = word->best_choice->string()[offset + sub];
|
||||||
|
// Tesseract uses space for recognition failure. Fix to a reject
|
||||||
|
// character, '~' so we don't create illegal box files.
|
||||||
|
if (ch == ' ')
|
||||||
|
ch = '~';
|
||||||
|
word_str[output_size++] = ch;
|
||||||
|
}
|
||||||
|
sprintf(word_str + output_size, " %d %d %d %d\n",
|
||||||
|
blob_box.left() + left, blob_box.bottom() + bottom,
|
||||||
|
blob_box.right() + left, blob_box.top() + bottom);
|
||||||
|
output_size += strlen(word_str + output_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return output_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Multiplier for textlength assumes 4 numbers @ 5 digits and a space
|
||||||
|
// plus the newline and the orginial character = 4*(5+1)+2
|
||||||
|
const int kMaxCharsPerChar = 26;
|
||||||
|
|
||||||
|
// Make a text string from the internal data structures.
|
||||||
|
// The input page_res is deleted.
|
||||||
|
// The text string takes the form of a box file as needed for training.
|
||||||
|
char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
|
||||||
|
int left, int bottom) {
|
||||||
|
if (page_res != NULL) {
|
||||||
|
int total_length = TextLength(page_res) * kMaxCharsPerChar;
|
||||||
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
|
char* result = new char[total_length];
|
||||||
|
char* ptr = result;
|
||||||
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
|
page_res_it.forward()) {
|
||||||
|
WERD_RES *word = page_res_it.word();
|
||||||
|
ptr += ConvertWordToBoxText(word,page_res_it.row(),left, bottom, ptr);
|
||||||
|
}
|
||||||
|
*ptr = '\0';
|
||||||
|
delete page_res;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make a text string from the internal data structures.
|
||||||
|
// The input page_res is deleted. The text string is converted
|
||||||
|
// to UNLV-format: Latin-1 with specific reject and suspect codes.
|
||||||
|
const char kUnrecognized = '~';
|
||||||
|
// Conversion table for non-latin characters.
|
||||||
|
// Maps characters out of the latin set into the latin set.
|
||||||
|
// TODO(rays) incorporate this translation into unicharset.
|
||||||
|
const int kUniChs[] = {
|
||||||
|
0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
|
||||||
|
};
|
||||||
|
// Latin chars corresponding to the unicode chars above.
|
||||||
|
const int kLatinChs[] = {
|
||||||
|
0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
|
||||||
|
};
|
||||||
|
|
||||||
|
char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
|
||||||
|
bool tilde_crunch_written = false;
|
||||||
|
bool last_char_was_newline = true;
|
||||||
|
bool last_char_was_tilde = false;
|
||||||
|
|
||||||
|
if (page_res != NULL) {
|
||||||
|
int total_length = TextLength(page_res);
|
||||||
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
|
char* result = new char[total_length];
|
||||||
|
char* ptr = result;
|
||||||
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
|
page_res_it.forward()) {
|
||||||
|
WERD_RES *word = page_res_it.word();
|
||||||
|
// Process the current word.
|
||||||
|
if (word->unlv_crunch_mode != CR_NONE) {
|
||||||
|
if (word->unlv_crunch_mode != CR_DELETE &&
|
||||||
|
(!tilde_crunch_written ||
|
||||||
|
(word->unlv_crunch_mode == CR_KEEP_SPACE &&
|
||||||
|
word->word->space () > 0 &&
|
||||||
|
!word->word->flag (W_FUZZY_NON) &&
|
||||||
|
!word->word->flag (W_FUZZY_SP)))) {
|
||||||
|
if (!word->word->flag (W_BOL) &&
|
||||||
|
word->word->space () > 0 &&
|
||||||
|
!word->word->flag (W_FUZZY_NON) &&
|
||||||
|
!word->word->flag (W_FUZZY_SP)) {
|
||||||
|
/* Write a space to separate from preceeding good text */
|
||||||
|
*ptr++ = ' ';
|
||||||
|
last_char_was_tilde = false;
|
||||||
|
}
|
||||||
|
if (!last_char_was_tilde) {
|
||||||
|
// Write a reject char.
|
||||||
|
last_char_was_tilde = true;
|
||||||
|
*ptr++ = kUnrecognized;
|
||||||
|
tilde_crunch_written = true;
|
||||||
|
last_char_was_newline = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// NORMAL PROCESSING of non tilde crunched words.
|
||||||
|
tilde_crunch_written = false;
|
||||||
|
|
||||||
|
if (last_char_was_tilde &&
|
||||||
|
word->word->space () == 0 &&
|
||||||
|
(word->best_choice->string ()[0] == ' ')) {
|
||||||
|
/* Prevent adjacent tilde across words - we know that adjacent tildes within
|
||||||
|
words have been removed */
|
||||||
|
char* p = (char *) word->best_choice->string().string ();
|
||||||
|
strcpy (p, p + 1); //shuffle up
|
||||||
|
p = (char *) word->best_choice->lengths().string ();
|
||||||
|
strcpy (p, p + 1); //shuffle up
|
||||||
|
word->reject_map.remove_pos (0);
|
||||||
|
PBLOB_IT blob_it = word->outword->blob_list ();
|
||||||
|
delete blob_it.extract (); //get rid of reject blob
|
||||||
|
}
|
||||||
|
|
||||||
|
if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
|
||||||
|
ensure_rep_chars_are_consistent(word);
|
||||||
|
|
||||||
|
set_unlv_suspects(word);
|
||||||
|
const char* wordstr = word->best_choice->string().string();
|
||||||
|
if (wordstr[0] != 0) {
|
||||||
|
if (!last_char_was_newline)
|
||||||
|
*ptr++ = ' ';
|
||||||
|
else
|
||||||
|
last_char_was_newline = false;
|
||||||
|
int offset = 0;
|
||||||
|
const STRING& lengths = word->best_choice->lengths();
|
||||||
|
int length = lengths.length();
|
||||||
|
for (int i = 0; i < length; offset += lengths[i++]) {
|
||||||
|
if (wordstr[offset] == ' ' ||
|
||||||
|
wordstr[offset] == '~' ||
|
||||||
|
wordstr[offset] == '|') {
|
||||||
|
*ptr++ = kUnrecognized;
|
||||||
|
last_char_was_tilde = true;
|
||||||
|
} else {
|
||||||
|
if (word->reject_map[i].rejected())
|
||||||
|
*ptr++ = '^';
|
||||||
|
UNICHAR ch(wordstr + offset, lengths[i]);
|
||||||
|
int uni_ch = ch.first_uni();
|
||||||
|
for (int j = 0; kUniChs[j] != 0; ++j) {
|
||||||
|
if (kUniChs[j] == uni_ch) {
|
||||||
|
uni_ch = kLatinChs[j];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (uni_ch <= 0xff) {
|
||||||
|
*ptr++ = static_cast<char>(uni_ch);
|
||||||
|
last_char_was_tilde = false;
|
||||||
|
} else {
|
||||||
|
*ptr++ = kUnrecognized;
|
||||||
|
last_char_was_tilde = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (word->word->flag(W_EOL) && !last_char_was_newline) {
|
||||||
|
/* Add a new line output */
|
||||||
|
*ptr++ = '\n';
|
||||||
|
tilde_crunch_written = false;
|
||||||
|
last_char_was_newline = true;
|
||||||
|
last_char_was_tilde = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*ptr++ = '\n';
|
||||||
|
*ptr = '\0';
|
||||||
|
delete page_res;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
@@ -20,8 +20,6 @@
 #ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
 #define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__

-#include <string>
-
 class PAGE_RES;
 class BLOCK_LIST;

@@ -56,6 +54,10 @@ class TessBaseAPI {
 const char* language, const char* configfile,
 bool numeric_mode, int argc, char* argv[]);

+// Set the name of the input file. Needed only for training and
+// reading a UNLV zone file.
+static void SetInputName(const char* name);
+
 // Recognize a rectangle from an image and return the result as a string.
 // May be called many times for a single Init.
 // Currently has no error checking.
@@ -71,6 +73,19 @@ class TessBaseAPI {
 int bytes_per_pixel,
 int bytes_per_line,
 int left, int top, int width, int height);
+// As TesseractRect but produces a box file as output.
+// Image height is needed as well as rect height, since output y-coords
+// will be relative to the bottom of the image.
+static char* TesseractRectBoxes(const unsigned char* imagedata,
+int bytes_per_pixel,
+int bytes_per_line,
+int left, int top, int width, int height,
+int imageheight);
+// As TesseractRect but produces UNLV-style output.
+static char* TesseractRectUNLV(const unsigned char* imagedata,
+int bytes_per_pixel,
+int bytes_per_line,
+int left, int top, int width, int height);

 // Call between pages or documents etc to free up memory and forget
 // adaptive data.
@@ -153,8 +168,18 @@ class TessBaseAPI {
 static PAGE_RES* Recognize(BLOCK_LIST* block_list,
 struct ETEXT_STRUCT* monitor);

+// Return the maximum length that the output text string might occupy.
+static int TextLength(PAGE_RES* page_res);
 // Convert (and free) the internal data structures into a text string.
 static char* TesseractToText(PAGE_RES* page_res);
+// Make a text string from the internal data structures.
+// The input page_res is deleted.
+// The text string takes the form of a box file as needed for training.
+static char* TesseractToBoxText(PAGE_RES* page_res, int left, int bottom);
+// Make a text string from the internal data structures.
+// The input page_res is deleted. The text string is converted
+// to UNLV-format: Latin-1 with specific reject and suspect codes.
+static char* TesseractToUNLV(PAGE_RES* page_res);
 };

 #endif  // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
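Putting the new declarations together, a single call now takes an image rectangle to UNLV-style Latin-1 text. A hedged usage sketch (datapath, output base, language, and image arguments are invented placeholders; only calls declared in this header are used):

#include "baseapi.h"

// Illustrative only: recognize one rectangle of a loaded page as UNLV text.
char* RectToUnlv(const unsigned char* imagedata, int bytes_per_pixel,
                 int bytes_per_line, int width, int height) {
  // Placeholder datapath/outputbase/language/config; no extra argv.
  TessBaseAPI::InitWithLanguage("./", "out", "eng", NULL, false, 0, NULL);
  TessBaseAPI::SetInputName("noname.tif");   // only needed for training/zone files
  char* text = TessBaseAPI::TesseractRectUNLV(imagedata, bytes_per_pixel,
                                              bytes_per_line, 0, 0,
                                              width, height);
  TessBaseAPI::ClearAdaptiveClassifier();    // free per-page adaptive data
  return text;                               // caller owns the returned buffer
}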
|
@ -35,6 +35,7 @@
|
|||||||
#include "docqual.h"
|
#include "docqual.h"
|
||||||
#include "output.h"
|
#include "output.h"
|
||||||
#include "bestfirst.h"
|
#include "bestfirst.h"
|
||||||
|
#include "globals.h"
|
||||||
|
|
||||||
#define EXTERN
|
#define EXTERN
|
||||||
|
|
||||||
@ -55,12 +56,12 @@ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
|
|||||||
"Write block separators in output");
|
"Write block separators in output");
|
||||||
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
|
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
|
||||||
"Write raw stuff to name.raw");
|
"Write raw stuff to name.raw");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
|
EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
|
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
|
||||||
"Return ratings in IPEOCRAPI data");
|
"Return ratings in IPEOCRAPI data");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
|
EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
|
||||||
"Write .txt to .etx map file");
|
"Write .txt to .etx map file");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
|
EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
|
||||||
"Write repetition char code");
|
"Write repetition char code");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
|
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
|
||||||
EXTERN STRING_EVAR (unrecognised_char, "|",
|
EXTERN STRING_EVAR (unrecognised_char, "|",
|
||||||
@ -106,7 +107,6 @@ INT32 pixels_to_pts( //convert coords
|
|||||||
return (INT32) (pts + 0.5); //round it
|
return (INT32) (pts + 0.5); //round it
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void output_pass( //Tess output pass //send to api
|
void output_pass( //Tess output pass //send to api
|
||||||
PAGE_RES_IT &page_res_it,
|
PAGE_RES_IT &page_res_it,
|
||||||
BOOL8 write_to_shm,
|
BOOL8 write_to_shm,
|
||||||
@ -119,8 +119,7 @@ void output_pass( //Tess output pass //send to api
|
|||||||
|
|
||||||
if (tessedit_write_txt_map)
|
if (tessedit_write_txt_map)
|
||||||
txt_mapfile = open_outfile (".map");
|
txt_mapfile = open_outfile (".map");
|
||||||
if (tessedit_write_unlv)
|
|
||||||
unlv_file = open_outfile (".unlv");
|
|
||||||
page_res_it.restart_page ();
|
page_res_it.restart_page ();
|
||||||
block_of_last_word = NULL;
|
block_of_last_word = NULL;
|
||||||
while (page_res_it.word () != NULL) {
|
while (page_res_it.word () != NULL) {
|
||||||
@ -189,7 +188,6 @@ void output_pass( //Tess output pass //send to api
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* write_results()
|
* write_results()
|
||||||
*
|
*
|
||||||
@ -211,9 +209,10 @@ void write_results( //output a word
|
|||||||
) {
|
) {
|
||||||
//word to do
|
//word to do
|
||||||
WERD_RES *word = page_res_it.word ();
|
WERD_RES *word = page_res_it.word ();
|
||||||
WERD_CHOICE *ep_choice; //ep format
|
// WERD_CHOICE *ep_choice; //ep format
|
||||||
STRING repetition_code;
|
STRING repetition_code;
|
||||||
const STRING *wordstr;
|
const STRING *wordstr;
|
||||||
|
STRING wordstr_lengths;
|
||||||
const char *text;
|
const char *text;
|
||||||
int i;
|
int i;
|
||||||
char unrecognised = STRING (unrecognised_char)[0];
|
char unrecognised = STRING (unrecognised_char)[0];
|
||||||
@ -312,15 +311,12 @@ void write_results( //output a word
|
|||||||
if (tessedit_write_output && !NO_BLOCK)
|
if (tessedit_write_output && !NO_BLOCK)
|
||||||
fprintf (textfile, "%s", txt_chs);
|
fprintf (textfile, "%s", txt_chs);
|
||||||
|
|
||||||
if (tessedit_write_unlv)
|
|
||||||
fprintf (unlv_file, "%s", txt_chs);
|
|
||||||
|
|
||||||
if (tessedit_write_txt_map)
|
if (tessedit_write_txt_map)
|
||||||
fprintf (txt_mapfile, "%s", map_chs);
|
fprintf (txt_mapfile, "%s", map_chs);
|
||||||
|
|
||||||
//terminate string
|
//terminate string
|
||||||
ep_chars[ep_chars_index] = '\0';
|
ep_chars[ep_chars_index] = '\0';
|
||||||
word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
|
word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
|
||||||
|
|
||||||
if (force_eol)
|
if (force_eol)
|
||||||
empty_block = TRUE;
|
empty_block = TRUE;
|
||||||
@ -345,6 +341,8 @@ void write_results( //output a word
|
|||||||
words have been removed */
|
words have been removed */
|
||||||
ptr = (char *) word->best_choice->string ().string ();
|
ptr = (char *) word->best_choice->string ().string ();
|
||||||
strcpy (ptr, ptr + 1); //shuffle up
|
strcpy (ptr, ptr + 1); //shuffle up
|
||||||
|
ptr = (char *) word->best_choice->lengths ().string ();
|
||||||
|
strcpy (ptr, ptr + 1); //shuffle up
|
||||||
word->reject_map.remove_pos (0);
|
word->reject_map.remove_pos (0);
|
||||||
blob_it = word->outword->blob_list ();
|
blob_it = word->outword->blob_list ();
|
||||||
delete blob_it.extract (); //get rid of reject blob
|
delete blob_it.extract (); //get rid of reject blob
|
||||||
@ -354,8 +352,10 @@ void write_results( //output a word
|
|||||||
last_char_was_tilde = FALSE;
|
last_char_was_tilde = FALSE;
|
||||||
else {
|
else {
|
||||||
if (word->reject_map.length () > 0) {
|
if (word->reject_map.length () > 0) {
|
||||||
if (word->best_choice->string ()[word->reject_map.length () - 1] ==
|
for (i = 0, ptr = (char *) word->best_choice->string().string();
|
||||||
' ')
|
i < word->reject_map.length () - 1; ++i)
|
||||||
|
ptr += word->best_choice->lengths()[i];
|
||||||
|
if (*ptr == ' ')
|
||||||
last_char_was_tilde = TRUE;
|
last_char_was_tilde = TRUE;
|
||||||
else
|
else
|
||||||
last_char_was_tilde = FALSE;
|
last_char_was_tilde = FALSE;
|
||||||
@ -365,7 +365,7 @@ void write_results( //output a word
|
|||||||
/* else it is unchanged as there are no output chars */
|
/* else it is unchanged as there are no output chars */
|
||||||
}
|
}
|
||||||
|
|
||||||
ptr = (char *) word->best_choice->string ().string ();
|
ptr = (char *) word->best_choice->lengths ().string ();
|
||||||
ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
|
ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
|
||||||
|
|
||||||
if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
|
if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
|
||||||
@ -379,21 +379,26 @@ void write_results( //output a word
|
|||||||
dict_word (word->best_choice->string ().string ()));
|
dict_word (word->best_choice->string ().string ()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
if (tessedit_write_unlv) {
|
if (tessedit_write_unlv) {
|
||||||
write_unlv_text(word);
|
write_unlv_text(word);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
|
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
|
||||||
repetition_code = "|^~R";
|
repetition_code = "|^~R";
|
||||||
repetition_code += get_rep_char (word);
|
wordstr_lengths = "\001\001\001\001";
|
||||||
|
repetition_code += unicharset.id_to_unichar(get_rep_char (word));
|
||||||
|
wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
|
||||||
wordstr = &repetition_code;
|
wordstr = &repetition_code;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
wordstr = &(word->best_choice->string ());
|
wordstr = &(word->best_choice->string ());
|
||||||
|
wordstr_lengths = word->best_choice->lengths ();
|
||||||
if (tessedit_zero_rejection) {
|
if (tessedit_zero_rejection) {
|
||||||
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
||||||
text = wordstr->string ();
|
text = wordstr->string ();
|
||||||
for (i = 0; text[i] != '\0'; i++) {
|
for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
|
||||||
if (word->reject_map[i].rejected ())
|
if (word->reject_map[i].rejected ())
|
||||||
word->reject_map[i].setrej_minimal_rej_accept ();
|
word->reject_map[i].setrej_minimal_rej_accept ();
|
||||||
}
|
}
|
||||||
@@ -401,8 +406,8 @@ void write_results( //output a word
     if (tessedit_minimal_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       text = wordstr->string ();
-      for (i = 0; text[i] != '\0'; i++) {
-        if ((text[i] != ' ') && word->reject_map[i].rejected ())
+      for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
+        if ((*text != ' ') && word->reject_map[i].rejected ())
           word->reject_map[i].setrej_minimal_rej_accept ();
       }
     }
@@ -410,8 +415,9 @@ void write_results( //output a word

   if (write_to_shm)
     write_shm_text (word, page_res_it.block ()->block,
-                    page_res_it.row (), *wordstr);
+                    page_res_it.row (), *wordstr, wordstr_lengths);

+#if 0
   if (tessedit_write_output)
     write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);

@@ -424,12 +430,12 @@ void write_results( //output a word

   ep_choice = make_epaper_choice (word, newline_type);
   word->ep_choice = ep_choice;
+#endif

-  character_count += word->best_choice->string ().length ();
+  character_count += word->best_choice->lengths ().length ();
   word_count++;
 }


 /**********************************************************************
  * make_epaper_choice
  *
@@ -437,6 +443,7 @@ void write_results( //output a word
  * determine whether each blob should be rejected.
  **********************************************************************/

+#if 0
 WERD_CHOICE *make_epaper_choice( //convert one word
                                  WERD_RES *word, //word to do
                                  char newline_type //type of newline
@@ -482,7 +489,8 @@ WERD_CHOICE *make_epaper_choice( //convert one word
   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
     strcpy (word_string + index, "|^~R");
     index += 4;
-    word_string[index++] = get_rep_char (word);
+    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
+    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
   }
   else {
     if (!blob_it.empty ())
@@ -537,7 +545,7 @@ WERD_CHOICE *make_epaper_choice( //convert one word
   ASSERT_HOST (strlen (word_string) == index);
   return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
 }
+#endif

 /**********************************************************************
  * make_reject
@@ -653,6 +661,7 @@ char determine_newline_type( //test line ends
  * to the given file.
  **********************************************************************/

+#if 0
 void write_cooked_text( //write output
                         WERD *word, //word to do
                         const STRING &text, //text to write
@@ -749,6 +758,7 @@ void write_cooked_text( //write output
   if (status != 0)
     WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif


 /**********************************************************************
@@ -761,7 +771,8 @@ void write_shm_text( //write output
                      WERD_RES *word, //word to do
                      BLOCK *block, //block it is from
                      ROW_RES *row, //row it is from
-                     const STRING &text //text to write
+                     const STRING &text, //text to write
+                     const STRING &text_lengths
                     ) {
   INT32 index; //char counter
   INT32 index2; //char counter
@@ -777,6 +788,8 @@ void write_shm_text( //write output
   WERD copy_outword; // copy to denorm
   UINT32 rating; //of char
   BOOL8 lineend; //end of line
+  int offset;
+  int offset2;

   //point size
   ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
@@ -786,13 +799,14 @@ void write_shm_text( //write output
   copy_outword = *(word->outword);
   copy_outword.baseline_denormalise (&word->denorm);
   blob_it.set_to_list (copy_outword.blob_list ());
-  length = text.length ();
+  length = text_lengths.length ();

   if (length > 0) {
     blanks = word->word->space ();
     if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
       blanks = 1;
-    for (index = 0; index < length; index++, blob_it.forward ()) {
+    for (index = 0, offset = 0; index < length;
+         offset += text_lengths[index++], blob_it.forward ()) {
       blob = blob_it.data ();
       blob_box = blob->bounding_box ();

@@ -804,7 +818,7 @@ void write_shm_text( //write output
       if (tessedit_write_ratings)
         rating = (UINT32) (-word->best_choice->certainty () / 0.035);
       else if (tessedit_zero_rejection)
-        rating = text[index] == ' ' ? 100 : 0;
+        rating = text[offset] == ' ' ? 100 : 0;
       else
         rating = word->reject_map[index].accepted ()? 0 : 100;
       if (rating > 255)
@@ -819,22 +833,41 @@ void write_shm_text( //write output

       lineend = word->word->flag (W_EOL) && index == length - 1;
       if (word->word->flag (W_EOL) && tessedit_zero_rejection
-          && index < length - 1 && text[index + 1] == ' ') {
-        for (index2 = index + 1; index2 < length && text[index2] == ' ';
-             index2++);
+          && index < length - 1 && text[index + text_lengths[index]] == ' ') {
+        for (index2 = index + 1, offset2 = offset + text_lengths[index];
+             index2 < length && text[offset2] == ' ';
+             offset2 += text_lengths[index2++]);
         if (index2 == length)
           lineend = TRUE;
       }

-      if (!tessedit_zero_rejection || text[index] != ' '
+      if (!tessedit_zero_rejection || text[offset] != ' '
           || tessedit_word_for_word) {
         //confidence
-        ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
-                         ptsize, //point size
-                         blanks, enhancement, //enhancement
-                         OCR_CDIR_LEFT_RIGHT,
-                         OCR_LDIR_DOWN_RIGHT,
-                         lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        if (text[offset] == ' ') {
+          ocr_append_char (unrecognised,
+                           blob_box.left (), blob_box.right (),
+                           page_image.get_ysize () - 1 - blob_box.top (),
+                           page_image.get_ysize () - 1 - blob_box.bottom (),
+                           font, (UINT8) rating,
+                           ptsize, //point size
+                           blanks, enhancement, //enhancement
+                           OCR_CDIR_LEFT_RIGHT,
+                           OCR_LDIR_DOWN_RIGHT,
+                           lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        } else {
+          for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
+            ocr_append_char (text[offset + suboffset],
+                             blob_box.left (), blob_box.right (),
+                             page_image.get_ysize () - 1 - blob_box.top (),
+                             page_image.get_ysize () - 1 - blob_box.bottom (),
+                             font, (UINT8) rating,
+                             ptsize, //point size
+                             blanks, enhancement, //enhancement
+                             OCR_CDIR_LEFT_RIGHT,
+                             OCR_LDIR_DOWN_RIGHT,
+                             lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        }
         blanks = 0;
       }

@@ -863,13 +896,17 @@ void write_shm_text( //write output
       lineend = word->word->flag (W_EOL);

       //font index
-      ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
-                       rating, //confidence
-                       ptsize, //point size
-                       blanks, enhancement, //enhancement
-                       OCR_CDIR_LEFT_RIGHT,
-                       OCR_LDIR_DOWN_RIGHT,
-                       lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+      ocr_append_char (unrecognised,
+                       blob_box.left (), blob_box.right (),
+                       page_image.get_ysize () - 1 - blob_box.top (),
+                       page_image.get_ysize () - 1 - blob_box.bottom (),
+                       font,
+                       rating, //confidence
+                       ptsize, //point size
+                       blanks, enhancement, //enhancement
+                       OCR_CDIR_LEFT_RIGHT,
+                       OCR_LDIR_DOWN_RIGHT,
+                       lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
     }
   }

@@ -888,6 +925,7 @@ void write_shm_text( //write output
  * newdiff needs etx files!
  **********************************************************************/

+#if 0
 void write_map( //output a map file
                 FILE *mapfile, //mapfile to write to
                 WERD_RES *word) {
@@ -937,6 +975,7 @@ void write_map( //output a map file
   if (status != 0)
     WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
 }
+#endif


 /*************************************************************************
@@ -957,6 +996,7 @@ FILE *open_outfile( //open .map & .unlv file
 }


+#if 0
 void write_unlv_text(WERD_RES *word) {
   const char *wordstr;

@@ -1015,6 +1055,7 @@ void write_unlv_text(WERD_RES *word) {
   if (status != 0)
     WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif


 /*************************************************************************
@@ -1022,21 +1063,24 @@ void write_unlv_text(WERD_RES *word) {
  * Return the first accepted character from the repetition string. This is the
  * character which is repeated - as determined earlier by fix_rep_char()
  *************************************************************************/
-char get_rep_char( // what char is repeated?
-                   WERD_RES *word) {
+UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated?
   int i;
+  int offset;

-  for (i = 0;
+  for (i = 0, offset = 0;
       ((i < word->reject_map.length ()) &&
-       (word->reject_map[i].rejected ())); i++);
+       (word->reject_map[i].rejected ()));
+       offset += word->best_choice->lengths()[i++]);
   if (i < word->reject_map.length ())
-    return word->best_choice->string ()[i];
+    return unicharset.unichar_to_id(word->best_choice->string().string()
+                                    + offset,
+                                    word->best_choice->lengths()[i]);
   else
-    return STRING (unrecognised_char)[0];
+    return unicharset.unichar_to_id(unrecognised_char.string());
 }


 void ensure_rep_chars_are_consistent(WERD_RES *word) {
+#if 0
   char rep_char = get_rep_char (word);
   char *ptr;

@@ -1045,8 +1089,24 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
     if (*ptr != rep_char)
       *ptr = rep_char;
   }
-}
+#endif
+
+#if 0
+  UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
+  int i;
+  char *ptr;
+  STRING consistent_string;
+  STRING consistent_string_lengths;
+
+  ptr = (char *) word->best_choice->string ().string ();
+  for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
+    consistent_string += unicharset.id_to_unichar(rep_char);
+    consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
+  }
+  word->best_choice->string() = consistent_string;
+  word->best_choice->lengths() = consistent_string_lengths;
+#endif
+}

 /*************************************************************************
  * SUSPECT LEVELS
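
The rewritten get_rep_char above walks the reject map with a separate byte offset and returns a UNICHAR_ID instead of a single char. A standalone sketch of the same index/offset walk, with plain arrays standing in for WERD_RES and UNICHARSET (the names here are illustrative, not from the commit):

    // Sketch only: find the first accepted character of a word whose
    // characters may span several bytes; mirrors the i/offset loop above.
    #include <cstddef>

    const char* first_accepted(const char* text,      // UTF-8 word text
                               const char* lengths,   // byte length per character
                               const bool* rejected,  // one flag per character
                               int num_chars,
                               int* byte_len) {       // out: byte length of the hit
      int offset = 0;
      for (int i = 0; i < num_chars; offset += lengths[i++]) {
        if (!rejected[i]) {
          *byte_len = lengths[i];
          return text + offset;   // points at the repeated character's bytes
        }
      }
      return NULL;                // every character was rejected
    }
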
@@ -1062,7 +1122,9 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
 void set_unlv_suspects(WERD_RES *word) {
   int len = word->reject_map.length ();
   int i;
+  int offset;
   const char *ptr;
+  const char *lengths = word->best_choice->lengths ().string ();
   float rating_per_ch;

   ptr = word->best_choice->string ().string ();
@@ -1080,10 +1142,12 @@ void set_unlv_suspects(WERD_RES *word) {

   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

-  if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
+  if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
+      suspect_short_words)) {
     /* Unreject alphas in dictionary words */
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && isalpha (ptr[i]))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () &&
+          unicharset.get_isalpha (ptr + offset, lengths[i]))
         word->reject_map[i].setrej_minimal_rej_accept ();
     }
   }
@@ -1095,8 +1159,8 @@ void set_unlv_suspects(WERD_RES *word) {

   if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && (ptr[i] != ' '))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
         word->reject_map[i].setrej_minimal_rej_accept ();
     }
   }
@@ -1130,9 +1194,11 @@ void set_unlv_suspects(WERD_RES *word) {
     }
   }

-  if ((acceptable_word_string (word->best_choice->string ().string ())
+  if ((acceptable_word_string (word->best_choice->string ().string (),
+                               word->best_choice->lengths ().string ())
       != AC_UNACCEPTABLE) ||
-      acceptable_number_string (word->best_choice->string ().string ())) {
+      acceptable_number_string (word->best_choice->string ().string (),
+                                word->best_choice->lengths ().string ())) {
     if (word->reject_map.length () > suspect_short_words) {
       for (i = 0; i < len; i++) {
         if (word->reject_map[i].rejected () &&
@@ -1149,11 +1215,12 @@ void set_unlv_suspects(WERD_RES *word) {


 INT16 count_alphas( //how many alphas
-                    const char *s) {
+                    const char *s,
+                    const char *lengths) {
   int count = 0;

-  for (; *s != '\0'; s++) {
-    if (isalpha (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths))
       count++;
   }
   return count;
@@ -1161,36 +1228,43 @@ INT16 count_alphas( //how many alphas


 INT16 count_alphanums( //how many alphanums
-                       const char *s) {
+                       const char *s,
+                       const char *lengths) {
   int count = 0;

-  for (; *s != '\0'; s++) {
-    if (isalnum (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths) ||
+        unicharset.get_isdigit(s, *lengths))
       count++;
   }
   return count;
 }


-BOOL8 acceptable_number_string(const char *s) {
+BOOL8 acceptable_number_string(const char *s,
+                               const char *lengths) {
   BOOL8 prev_digit = FALSE;

-  if (*s == '(')
+  if (*lengths == 1 && *s == '(')
     s++;

-  if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
+  if (*lengths == 1 &&
+      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
     s++;

-  for (; *s != '\0'; s++) {
-    if (isdigit (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isdigit (s, *lengths))
       prev_digit = TRUE;
-    else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
-      prev_digit = FALSE;
     else if (prev_digit &&
-             (*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
+             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
+      prev_digit = FALSE;
+    else if (prev_digit && *lengths == 1 &&
+             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
       return TRUE;
     else if (prev_digit &&
-             (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
+             *lengths == 1 && (*s == '%') &&
+             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
+             (*(s + *lengths + *(lengths + 1)) == '\0'))
       return TRUE;
     else
       return FALSE;
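
count_alphas, count_alphanums and acceptable_number_string now take the lengths string as well, so classification happens once per character rather than once per byte. The shared iteration pattern, shown here with a caller-supplied predicate instead of the global unicharset (a sketch, not part of this commit):

    // Sketch only: walk a UTF-8 string using a parallel lengths array, calling
    // a predicate once per character; this is the loop shape used above.
    typedef bool (*UnicharPredicate)(const char* utf8, int byte_len);

    int count_matching(const char* s, const char* lengths, UnicharPredicate pred) {
      int count = 0;
      for (; *s != '\0'; s += *(lengths++)) {
        // The increment has not run yet, so *lengths is still the byte length
        // of the character that s currently points at.
        if (pred(s, *lengths))
          ++count;
      }
      return count;
    }
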
@@ -31,7 +31,9 @@
 #include "stderr.h"
 #include "notdll.h"
 #include "mainblk.h"
+#include "output.h"
 #include "globals.h"
+#include "blread.h"
 #include "tfacep.h"
 #include "callnet.h"

@@ -40,7 +42,10 @@
 #define API_CONFIG "configs/api_config"
 #define EXTERN

+EXTERN BOOL_VAR (tessedit_create_boxfile, FALSE, "Output text with boxes");
 EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read");
+EXTERN INT_VAR (tessedit_serial_unlv, 0,
+                "0->Whole page, 1->serial no adapt, 2->serial with adapt");
 EXTERN BOOL_VAR (tessedit_write_images, FALSE,
                  "Capture the image from the IPE");
 EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file");
@@ -63,15 +68,30 @@ int main(int argc, char **argv) {

   if (argc < 3) {
     USAGE.error (argv[0], EXIT,
-      "%s imagename outputbase [configfile [[+|-]varfile]...]\n", argv[0]);
+      "%s imagename outputbase [-l lang] [configfile [[+|-]varfile]...]\n",
+      argv[0]);
+  }
+  // Find the required language.
+  const char* lang = "eng";
+  int arg = 3;
+  if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
+    lang = argv[4];
+    arg = 5;
+  }
+  // Find the basename of the input file.
+  STRING infile(argv[1]);
+  const char* lastdot = strrchr(argv[1], '.');
+  if (lastdot != NULL) {
+    infile[lastdot - argv[1]] = '\0';
   }

-  if (argc == 3)
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  NULL, false, 0, argv + 2);
+  if (argc == arg)
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  NULL, false, 0, argv + arg);
   else
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  argv[3], false, argc - 4, argv + 4);
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  argv[arg], false,
+                                  argc - arg - 1, argv + arg + 1);

   tprintf ("Tesseract Open Source OCR Engine\n");

@@ -92,20 +112,70 @@ int main(int argc, char **argv) {
              argv[1]);
   }
 #endif
+  STRING text_out;
   int bytes_per_line = check_legal_image_size(image.get_xsize(),
                                               image.get_ysize(),
                                               image.get_bpp());
-  char* text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
-                                          bytes_per_line, 0, 0,
-                                          image.get_xsize(), image.get_ysize());
+  if (tessedit_serial_unlv == 0) {
+    TessBaseAPI::SetInputName(argv[1]);
+    char* text;
+    if (tessedit_create_boxfile)
+      text = TessBaseAPI::TesseractRectBoxes(image.get_buffer(),
+                                             image.get_bpp()/8,
+                                             bytes_per_line, 0, 0,
+                                             image.get_xsize(),
+                                             image.get_ysize(),
+                                             image.get_ysize());
+    else if (tessedit_write_unlv)
+      text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                            image.get_bpp()/8,
+                                            bytes_per_line, 0, 0,
+                                            image.get_xsize(),
+                                            image.get_ysize());
+    else
+      text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
+                                        bytes_per_line, 0, 0,
+                                        image.get_xsize(), image.get_ysize());
+    text_out = text;
+    delete [] text;
+  } else {
+    BLOCK_LIST blocks;
+    STRING filename = argv[1];
+    int len = filename.length();
+    if (len > 4 && filename[len - 4] == '.') {
+      filename[len - 4] = '\0';
+    }
+    if (!read_unlv_file(filename, image.get_xsize(), image.get_ysize(),
+                        &blocks)) {
+      fprintf(stderr, "Error: Must have a unlv zone file %s to read!\n",
+              filename.string());
+      return 1;
+    }
+    BLOCK_IT b_it = &blocks;
+    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+      BLOCK* block = b_it.data();
+      BOX box = block->bounding_box();
+      char* text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                                  image.get_bpp()/8,
+                                                  bytes_per_line,
+                                                  box.left(),
+                                                  image.get_ysize() - box.top(),
+                                                  box.width(),
+                                                  box.height());
+      text_out += text;
+      delete [] text;
+      if (tessedit_serial_unlv == 1)
+        TessBaseAPI::ClearAdaptiveClassifier();
+    }
+  }

   outfile = argv[2];
   outfile += ".txt";
   FILE* fp = fopen(outfile.string(), "w");
   if (fp != NULL) {
-    fwrite(text, 1, strlen(text), fp);
+    fwrite(text_out.string(), 1, text_out.length(), fp);
     fclose(fp);
   }
-  delete [] text;
   TessBaseAPI::End();

   return 0; //Normal exit
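
With tessedit_serial_unlv non-zero, main() above recognises the page zone by zone from the UNLV layout file, concatenating the per-zone text and optionally clearing the adaptive classifier between zones. A reduced sketch of that control flow, with hypothetical stand-ins for the API calls (Zone, recognize_rect and clear_adaptation are illustrative names, not Tesseract API):

    // Sketch only: serial, zone-by-zone recognition as in the UNLV branch above.
    #include <string>
    #include <vector>

    struct Zone { int left, top, width, height; };   // one zone rectangle

    std::string recognize_serially(const std::vector<Zone>& zones,
                                   std::string (*recognize_rect)(const Zone&),
                                   void (*clear_adaptation)(),
                                   bool clear_between_zones) {
      std::string text_out;
      for (size_t i = 0; i < zones.size(); ++i) {
        text_out += recognize_rect(zones[i]);  // one zone at a time
        if (clear_between_zones)
          clear_adaptation();                  // mirrors tessedit_serial_unlv == 1
      }
      return text_out;
    }
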
@@ -527,7 +527,9 @@ BOOL8 read_unlv_file( //print list of sides
   else {
     while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
       //make rect block
-      block = new BLOCK (name.string (), TRUE, 0, 0, (INT16) x, (INT16) (ysize - 1 - y - height), (INT16) (x + width), (INT16) (ysize - 1 - y));
+      block = new BLOCK (name.string (), TRUE, 0, 0,
+                         (INT16) x, (INT16) (ysize - y - height),
+                         (INT16) (x + width), (INT16) (ysize - y));
       //on end of list
       block_it.add_to_end (block);
     }
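
The read_unlv_file change above converts the zone file's top-left-origin (x, y, width, height) rectangle into Tesseract's bottom-left-origin block corners, dropping the old off-by-one "- 1". The arithmetic in isolation (a sketch, not the commit's code):

    // Sketch only: UNLV zones measure y downward from the image top; blocks
    // use y upward from the image bottom, so flip with the image height.
    struct Corners { int left, bottom, right, top; };

    Corners uzn_to_block(int x, int y, int width, int height, int ysize) {
      Corners c;
      c.left   = x;
      c.right  = x + width;
      c.bottom = ysize - y - height;  // lower edge, measured from the bottom
      c.top    = ysize - y;           // upper edge, measured from the bottom
      return c;
    }
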
@@ -63,7 +63,7 @@ make_toggle_var (debug_8, 0, make_debug_8, 6, 8, toggle_debug_8, "Debug #8");
 make_toggle_var (display_ratings, 0, make_display_ratings,
                  6, 9, toggle_ratings, "Ratings display");

-make_toggle_var (display_text, 1, make_display_text,
+make_toggle_var (display_text, 0, make_display_text,
                  6, 10, toggle_text, "Display Text");

 make_toggle_var (show_bold, 1, make_show_bold,
tessdata/configs/makebox  (new file, 1 line)
@@ -0,0 +1 @@
+tessedit_create_boxfile 1

tessdata/configs/unlv  (new file, 3 lines)
@@ -0,0 +1,3 @@
+tessedit_write_unlv 1
+tessedit_write_output 0
+tessedit_write_txt_map 0
@@ -1,78 +1,2 @@
+# No content needed as all defaults are correct.
-#################################################
-# Adaptive Matcher Using PreAdapted Templates
-#################################################
-
-acts_fx 0x800
-acts_ocr 0x20
-
-RatingScale 30.0
-CertaintyScale 20.0
-
-#EnableMatcher 0
-#CurrentFx 2
-MinSlope 0.414213562
-MaxSlope 2.414213562
-#ExtremityMode 1
-NormMethod 1
-EnableAdaptiveMatcher 1
-
-NormAdjMidpoint 32.0
-NormAdjCurl 2.0
-
-MinNormScaleX 0.0
-MaxNormScaleX 0.325
-MinNormScaleY 0.0
-MaxNormScaleY 0.325
-
-BuiltInTemplatesFile tessdata/inttemp
-BuiltInCutoffsFile tessdata/pffmtable
-
-EnableLearning 0
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-ReliableConfigThreshold 2
-MinNumPermClasses 3
-
-#EnableStopper 1
-GoodAdaptiveMatch 0.125
-GreatAdaptiveMatch 0.0
-
-EnableIntFX 1
-EnableNewAdaptRules 1
-################################################################################
-#
-# File: marks/configs/knobs
-# Description: Control variables for 'marks' code
-# Author: Mark Seaman, OCR Technology
-# Created: Wed Feb 27 11:27:27 1991
-# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language: Text
-# Package: N/A
-# Status: Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges 1
-
-save_doc_words 1
-doc_dict_enable 1
-ClassPrunerThreshold 229
-ClassPrunerMultiplier 15
-IntThetaFudge 128
-CPCutoffStrength 0.15
-EvidenceTableBits 9
-IntEvidenceTruncBits 14
-SEExponentialMultiplier 0
-SimilarityCenter 0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-EnableLearning 1
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-
-#save_errors 0
-

tessdata/tessconfigs/batch.nochop  (new file, 2 lines)
@@ -0,0 +1,2 @@
+chop_enable 0
+enable_assoc 0
@@ -2,80 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################
-
-acts_fx 0x800
-acts_ocr 0x20
-
-RatingScale 30.0
-CertaintyScale 20.0
-
-#EnableMatcher 0
-#CurrentFx 2
-EnableAdaptiveMatcher 1
-
-NormAdjMidpoint 32.0
-NormAdjCurl 2.0
-
-MinNormScaleX 0.0
-MaxNormScaleX 0.325
-MinNormScaleY 0.0
-MaxNormScaleY 0.325
-
-BuiltInTemplatesFile tessdata/inttemp
-BuiltInCutoffsFile tessdata/pffmtable
-
-EnableLearning 0
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-ReliableConfigThreshold 2
-MinNumPermClasses 3
-
-#EnableStopper 1
-GoodAdaptiveMatch 0.125
-GreatAdaptiveMatch 0.0
-
-EnableIntFX 1
-EnableNewAdaptRules 1
 EnableAdaptiveDebugger 1
 MatchDebugFlags 6
 MatcherDebugLevel 1
-################################################################################
-#
-# File: marks/configs/knobs
-# Description: Control variables for 'marks' code
-# Author: Mark Seaman, OCR Technology
-# Created: Wed Feb 27 11:27:27 1991
-# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language: Text
-# Package: N/A
-# Status: Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges 1
-
-save_doc_words 1
-doc_dict_enable 1
-ClassPrunerThreshold 229
-ClassPrunerMultiplier 15
-IntThetaFudge 128
-CPCutoffStrength 0.15
-EvidenceTableBits 9
-IntEvidenceTruncBits 14
-SEExponentialMultiplier 0
-SimilarityCenter 0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-display_splits 0
-display_all_words 0
-display_all_blobs 0
-display_segmentations 0
-EnableLearning 1
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-
-#save_errors 0
-

tessdata/tessconfigs/msdemo  (new file, 13 lines)
@@ -0,0 +1,13 @@
+#################################################
+# Adaptive Matcher Using PreAdapted Templates
+#################################################
+
+EnableAdaptiveDebugger 1
+MatchDebugFlags 6
+MatcherDebugLevel 1
+
+display_splits 0
+display_all_words 1
+display_all_blobs 1
+display_segmentations 2
+display_ratings 1

tessdata/tessconfigs/nobatch  (new file, 2 lines)
@@ -0,0 +1,2 @@
+display_text 0
+
@@ -2,70 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################
-
-acts_fx 0x800
-acts_ocr 0x20
-
-RatingScale 30.0
-CertaintyScale 20.0
-
-#EnableMatcher 0
-#CurrentFx 2
-EnableAdaptiveMatcher 1
-
-NormAdjMidpoint 32.0
-NormAdjCurl 2.0
-
-MinNormScaleX 0.0
-MaxNormScaleX 0.325
-MinNormScaleY 0.0
-MaxNormScaleY 0.325
-
-BuiltInTemplatesFile tessdata/inttemp
-BuiltInCutoffsFile tessdata/pffmtable
-
-EnableLearning 0
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-ReliableConfigThreshold 2
-MinNumPermClasses 3
-
-#EnableStopper 1
-GoodAdaptiveMatch 0.125
-GreatAdaptiveMatch 0.0
-
-EnableIntFX 1
-EnableNewAdaptRules 1
-################################################################################
-#
-# File: marks/configs/knobs
-# Description: Control variables for 'marks' code
-# Author: Mark Seaman, OCR Technology
-# Created: Wed Feb 27 11:27:27 1991
-# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language: Text
-# Package: N/A
-# Status: Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges 1
-
-save_doc_words 1
-doc_dict_enable 1
-ClassPrunerThreshold 229
-ClassPrunerMultiplier 15
-IntThetaFudge 128
-CPCutoffStrength 0.15
-EvidenceTableBits 9
-IntEvidenceTruncBits 14
-SEExponentialMultiplier 0
-SimilarityCenter 0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
 display_splits 0
 display_all_words 1
 display_all_blobs 1
185
testing/Makefile
Normal file
185
testing/Makefile
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
|
||||||
|
|
||||||
|
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
|
||||||
|
# This Makefile.in is free software; the Free Software Foundation
|
||||||
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
|
# with or without modifications, as long as this notice is preserved.
|
||||||
|
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||||
|
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
# PARTICULAR PURPOSE.
|
||||||
|
|
||||||
|
|
||||||
|
SHELL = /bin/sh
|
||||||
|
|
||||||
|
srcdir = .
|
||||||
|
top_srcdir = ..
|
||||||
|
|
||||||
|
prefix = /usr/local
|
||||||
|
exec_prefix = ${prefix}
|
||||||
|
|
||||||
|
bindir = ${exec_prefix}/bin
|
||||||
|
sbindir = ${exec_prefix}/sbin
|
||||||
|
libexecdir = ${exec_prefix}/libexec
|
||||||
|
datadir = ${prefix}/share
|
||||||
|
sysconfdir = ${prefix}/etc
|
||||||
|
sharedstatedir = ${prefix}/com
|
||||||
|
localstatedir = ${prefix}/var
|
||||||
|
libdir = ${exec_prefix}/lib
|
||||||
|
infodir = ${prefix}/info
|
||||||
|
mandir = ${prefix}/man
|
||||||
|
includedir = ${prefix}/include/tesseract
|
||||||
|
oldincludedir = /usr/include
|
||||||
|
|
||||||
|
DESTDIR =
|
||||||
|
|
||||||
|
pkgdatadir = $(datadir)/
|
||||||
|
pkglibdir = $(libdir)/
|
||||||
|
pkgincludedir = $(includedir)/
|
||||||
|
|
||||||
|
top_builddir = ..
|
||||||
|
|
||||||
|
ACLOCAL = aclocal-1.4
|
||||||
|
AUTOCONF = autoconf
|
||||||
|
AUTOMAKE = automake-1.4
|
||||||
|
AUTOHEADER = autoheader
|
||||||
|
|
||||||
|
INSTALL = /usr/bin/install -c
|
||||||
|
INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS)
|
||||||
|
INSTALL_DATA = ${INSTALL} -m 644
|
||||||
|
INSTALL_SCRIPT = ${INSTALL}
|
||||||
|
transform = s,x,x,
|
||||||
|
|
||||||
|
NORMAL_INSTALL = :
|
||||||
|
PRE_INSTALL = :
|
||||||
|
POST_INSTALL = :
|
||||||
|
NORMAL_UNINSTALL = :
|
||||||
|
PRE_UNINSTALL = :
|
||||||
|
POST_UNINSTALL = :
|
||||||
|
host_alias =
|
||||||
|
host_triplet = x86_64-unknown-linux-gnu
|
||||||
|
CC = gcc
|
||||||
|
CXX = g++
|
||||||
|
HAVE_LIB = @HAVE_LIB@
|
||||||
|
LIB = @LIB@
|
||||||
|
LTLIB = @LTLIB@
|
||||||
|
MAINT = #
|
||||||
|
MAKEINFO = /home/rays/src/opensrc/tesseract-ocr/config/missing makeinfo
|
||||||
|
PACKAGE =
|
||||||
|
PACKAGE_DATE = 07/2007
|
||||||
|
PACKAGE_NAME = tesseract
|
||||||
|
PACKAGE_VERSION = 2.00
|
||||||
|
PACKAGE_YEAR = 2007
|
||||||
|
RANLIB = ranlib
|
||||||
|
VERSION =
|
||||||
|
|
||||||
|
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
|
||||||
|
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
|
||||||
|
CONFIG_HEADER = ../config_auto.h
|
||||||
|
CONFIG_CLEAN_FILES =
|
||||||
|
DIST_COMMON = README Makefile.am Makefile.in
|
||||||
|
|
||||||
|
|
||||||
|
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
|
||||||
|
|
||||||
|
TAR = tar
|
||||||
|
GZIP_ENV = --best
|
||||||
|
all: all-redirect
|
||||||
|
.SUFFIXES:
|
||||||
|
$(srcdir)/Makefile.in: # Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
|
||||||
|
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
|
||||||
|
|
||||||
|
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
|
||||||
|
cd $(top_builddir) \
|
||||||
|
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||||
|
|
||||||
|
tags: TAGS
|
||||||
|
TAGS:
|
||||||
|
|
||||||
|
|
||||||
|
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
|
||||||
|
|
||||||
|
subdir = testing
|
||||||
|
|
||||||
|
distdir: $(DISTFILES)
|
||||||
|
here=`cd $(top_builddir) && pwd`; \
|
||||||
|
top_distdir=`cd $(top_distdir) && pwd`; \
|
||||||
|
distdir=`cd $(distdir) && pwd`; \
|
||||||
|
cd $(top_srcdir) \
|
||||||
|
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
|
||||||
|
$(mkinstalldirs) $(distdir)/reports
|
||||||
|
@for file in $(DISTFILES); do \
|
||||||
|
d=$(srcdir); \
|
||||||
|
if test -d $$d/$$file; then \
|
||||||
|
cp -pr $$d/$$file $(distdir)/$$file; \
|
||||||
|
else \
|
||||||
|
test -f $(distdir)/$$file \
|
||||||
|
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|
||||||
|
|| cp -p $$d/$$file $(distdir)/$$file || :; \
|
||||||
|
fi; \
|
||||||
|
done
|
||||||
|
info-am:
|
||||||
|
info: info-am
|
||||||
|
dvi-am:
|
||||||
|
dvi: dvi-am
|
||||||
|
check-am: all-am
|
||||||
|
check: check-am
|
||||||
|
installcheck-am:
|
||||||
|
installcheck: installcheck-am
|
||||||
|
install-exec-am:
|
||||||
|
install-exec: install-exec-am
|
||||||
|
|
||||||
|
install-data-am:
|
||||||
|
install-data: install-data-am
|
||||||
|
|
||||||
|
install-am: all-am
|
||||||
|
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||||
|
install: install-am
|
||||||
|
uninstall-am:
|
||||||
|
uninstall: uninstall-am
|
||||||
|
all-am: Makefile
|
||||||
|
all-redirect: all-am
|
||||||
|
install-strip:
|
||||||
|
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
|
||||||
|
installdirs:
|
||||||
|
|
||||||
|
|
||||||
|
mostlyclean-generic:
|
||||||
|
|
||||||
|
clean-generic:
|
||||||
|
|
||||||
|
distclean-generic:
|
||||||
|
-rm -f Makefile $(CONFIG_CLEAN_FILES)
|
||||||
|
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
|
||||||
|
|
||||||
|
maintainer-clean-generic:
|
||||||
|
mostlyclean-am: mostlyclean-generic
|
||||||
|
|
||||||
|
mostlyclean: mostlyclean-am
|
||||||
|
|
||||||
|
clean-am: clean-generic mostlyclean-am
|
||||||
|
|
||||||
|
clean: clean-am
|
||||||
|
|
||||||
|
distclean-am: distclean-generic clean-am
|
||||||
|
|
||||||
|
distclean: distclean-am
|
||||||
|
|
||||||
|
maintainer-clean-am: maintainer-clean-generic distclean-am
|
||||||
|
@echo "This command is intended for maintainers to use;"
|
||||||
|
@echo "it deletes files that may require special tools to rebuild."
|
||||||
|
|
||||||
|
maintainer-clean: maintainer-clean-am
|
||||||
|
|
||||||
|
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
|
||||||
|
installcheck-am installcheck install-exec-am install-exec \
|
||||||
|
install-data-am install-data install-am install uninstall-am uninstall \
|
||||||
|
all-redirect all-am all installdirs mostlyclean-generic \
|
||||||
|
distclean-generic clean-generic maintainer-clean-generic clean \
|
||||||
|
mostlyclean distclean maintainer-clean
|
||||||
|
|
||||||
|
|
||||||
|
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||||
|
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||||
|
.NOEXPORT:
|
2
testing/Makefile.am
Normal file
2
testing/Makefile.am
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
|
||||||
|
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
|
185
testing/Makefile.in
Normal file
185
testing/Makefile.in
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
|
||||||
|
|
||||||
|
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
|
||||||
|
# This Makefile.in is free software; the Free Software Foundation
|
||||||
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
|
# with or without modifications, as long as this notice is preserved.
|
||||||
|
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||||
|
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
# PARTICULAR PURPOSE.
|
||||||
|
|
||||||
|
|
||||||
|
SHELL = @SHELL@
|
||||||
|
|
||||||
|
srcdir = @srcdir@
|
||||||
|
top_srcdir = @top_srcdir@
|
||||||
|
VPATH = @srcdir@
|
||||||
|
prefix = @prefix@
|
||||||
|
exec_prefix = @exec_prefix@
|
||||||
|
|
||||||
|
bindir = @bindir@
|
||||||
|
sbindir = @sbindir@
|
||||||
|
libexecdir = @libexecdir@
|
||||||
|
datadir = @datadir@
|
||||||
|
sysconfdir = @sysconfdir@
|
||||||
|
sharedstatedir = @sharedstatedir@
|
||||||
|
localstatedir = @localstatedir@
|
||||||
|
libdir = @libdir@
|
||||||
|
infodir = @infodir@
|
||||||
|
mandir = @mandir@
|
||||||
|
includedir = @includedir@
|
||||||
|
oldincludedir = /usr/include
|
||||||
|
|
||||||
|
DESTDIR =
|
||||||
|
|
||||||
|
pkgdatadir = $(datadir)/@PACKAGE@
|
||||||
|
pkglibdir = $(libdir)/@PACKAGE@
|
||||||
|
pkgincludedir = $(includedir)/@PACKAGE@
|
||||||
|
|
||||||
|
top_builddir = ..
|
||||||
|
|
||||||
|
ACLOCAL = @ACLOCAL@
|
||||||
|
AUTOCONF = @AUTOCONF@
|
||||||
|
AUTOMAKE = @AUTOMAKE@
|
||||||
|
AUTOHEADER = @AUTOHEADER@
|
||||||
|
|
||||||
|
INSTALL = @INSTALL@
|
||||||
|
INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS)
|
||||||
|
INSTALL_DATA = @INSTALL_DATA@
|
||||||
|
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||||
|
transform = @program_transform_name@
|
||||||
|
|
||||||
|
NORMAL_INSTALL = :
|
||||||
|
PRE_INSTALL = :
|
||||||
|
POST_INSTALL = :
|
||||||
|
NORMAL_UNINSTALL = :
|
||||||
|
PRE_UNINSTALL = :
|
||||||
|
POST_UNINSTALL = :
|
||||||
|
host_alias = @host_alias@
|
||||||
|
host_triplet = @host@
|
||||||
|
CC = @CC@
|
||||||
|
CXX = @CXX@
|
||||||
|
HAVE_LIB = @HAVE_LIB@
|
||||||
|
LIB = @LIB@
|
||||||
|
LTLIB = @LTLIB@
|
||||||
|
MAINT = @MAINT@
|
||||||
|
MAKEINFO = @MAKEINFO@
|
||||||
|
PACKAGE = @PACKAGE@
|
||||||
|
PACKAGE_DATE = @PACKAGE_DATE@
|
||||||
|
PACKAGE_NAME = @PACKAGE_NAME@
|
||||||
|
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||||
|
PACKAGE_YEAR = @PACKAGE_YEAR@
|
||||||
|
RANLIB = @RANLIB@
|
||||||
|
VERSION = @VERSION@
|
||||||
|
|
||||||
|
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
|
||||||
|
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
|
||||||
|
CONFIG_HEADER = ../config_auto.h
|
||||||
|
CONFIG_CLEAN_FILES =
|
||||||
|
DIST_COMMON = README Makefile.am Makefile.in
|
||||||
|
|
||||||
|
|
||||||
|
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
|
||||||
|
|
||||||
|
TAR = tar
|
||||||
|
GZIP_ENV = --best
|
||||||
|
all: all-redirect
|
||||||
|
.SUFFIXES:
|
||||||
|
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
|
||||||
|
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
|
||||||
|
|
||||||
|
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
|
||||||
|
cd $(top_builddir) \
|
||||||
|
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||||
|
|
||||||
|
tags: TAGS
|
||||||
|
TAGS:
|
||||||
|
|
||||||
|
|
||||||
|
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
|
||||||
|
|
||||||
|
subdir = testing
|
||||||
|
|
||||||
|
distdir: $(DISTFILES)
|
||||||
|
here=`cd $(top_builddir) && pwd`; \
|
||||||
|
top_distdir=`cd $(top_distdir) && pwd`; \
|
||||||
|
distdir=`cd $(distdir) && pwd`; \
|
||||||
|
cd $(top_srcdir) \
|
||||||
|
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
|
||||||
|
$(mkinstalldirs) $(distdir)/reports
|
||||||
|
@for file in $(DISTFILES); do \
|
||||||
|
d=$(srcdir); \
|
||||||
|
if test -d $$d/$$file; then \
|
||||||
|
cp -pr $$d/$$file $(distdir)/$$file; \
|
||||||
|
else \
|
||||||
|
test -f $(distdir)/$$file \
|
||||||
|
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|
||||||
|
|| cp -p $$d/$$file $(distdir)/$$file || :; \
|
||||||
|
fi; \
|
||||||
|
done
|
||||||
|
info-am:
|
||||||
|
info: info-am
|
||||||
|
dvi-am:
|
||||||
|
dvi: dvi-am
|
||||||
|
check-am: all-am
|
||||||
|
check: check-am
|
||||||
|
installcheck-am:
|
||||||
|
installcheck: installcheck-am
|
||||||
|
install-exec-am:
|
||||||
|
install-exec: install-exec-am
|
||||||
|
|
||||||
|
install-data-am:
|
||||||
|
install-data: install-data-am
|
||||||
|
|
||||||
|
install-am: all-am
|
||||||
|
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||||
|
install: install-am
|
||||||
|
uninstall-am:
|
||||||
|
uninstall: uninstall-am
|
||||||
|
all-am: Makefile
|
||||||
|
all-redirect: all-am
|
||||||
|
install-strip:
|
||||||
|
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
|
||||||
|
installdirs:
|
||||||
|
|
||||||
|
|
||||||
|
mostlyclean-generic:
|
||||||
|
|
||||||
|
clean-generic:
|
||||||
|
|
||||||
|
distclean-generic:
|
||||||
|
-rm -f Makefile $(CONFIG_CLEAN_FILES)
|
||||||
|
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
|
||||||
|
|
||||||
|
maintainer-clean-generic:
|
||||||
|
mostlyclean-am: mostlyclean-generic
|
||||||
|
|
||||||
|
mostlyclean: mostlyclean-am
|
||||||
|
|
||||||
|
clean-am: clean-generic mostlyclean-am
|
||||||
|
|
||||||
|
clean: clean-am
|
||||||
|
|
||||||
|
distclean-am: distclean-generic clean-am
|
||||||
|
|
||||||
|
distclean: distclean-am
|
||||||
|
|
||||||
|
maintainer-clean-am: maintainer-clean-generic distclean-am
|
||||||
|
@echo "This command is intended for maintainers to use;"
|
||||||
|
@echo "it deletes files that may require special tools to rebuild."
|
||||||
|
|
||||||
|
maintainer-clean: maintainer-clean-am
|
||||||
|
|
||||||
|
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
|
||||||
|
installcheck-am installcheck install-exec-am install-exec \
|
||||||
|
install-data-am install-data install-am install uninstall-am uninstall \
|
||||||
|
all-redirect all-am all installdirs mostlyclean-generic \
|
||||||
|
distclean-generic clean-generic maintainer-clean-generic clean \
|
||||||
|
mostlyclean distclean maintainer-clean
|
||||||
|
|
||||||
|
|
||||||
|
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||||
|
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||||
|
.NOEXPORT:
|
testing/README  (new file, 43 lines)
@@ -0,0 +1,43 @@
+How to run UNLV tests.
+
+The scripts in this directory make it possible to duplicate the tests
+published in the Fourth Annual Test of OCR Accuracy.
+See http://www.isri.unlv.edu/downloads/AT-1995.pdf
+but first you have to get the tools and data from UNLV:
+
+Step 1: to download the images goto
+http://www.isri.unlv.edu/ISRI/OCRtk
+and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz.
+
+Step 2: extract the files. It doesn't really matter where
+in your filesystem you put them, but they must go under a common
+root so you have directories 3, B, M and N in, for example,
+/users/me/ISRI-OCRtk.
+
+Step 3: Reorg the files
+The lack of tif extensions on the images is inconvenient, so there
+is a script to reorganize the data to match the rest of the test
+scripts.
+cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run
+/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B
+This makes directories doe3.3B, bus.3B, mag.3B and news.3B.
+You can now get rid of 3, B, M, and N unless you want to get some of the
+other scanning resolutions out of them.
+
+Step 4: Download the ISRI toolkit from:
+http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
+
+Step 5: If they work for you, use the binaries directly from the bin
+directory and put them in tesseract-ocr/testing/unlv
+otherwise build the tools for yourself and put them there.
+
+Step 6: cd back to your main tesseract-ocr dir and Build tesseract.
+
+Step 7: run testing/runalltests.sh with the root data dir and testname:
+testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
+and go to the gym, have lunch etc.
+
+Step 8: There should be a file
+testing/reports/tess2.0.summary that contains the final summarized accuracy
+report and comparison with the 1995 results.
+
testing/counttestset.sh  (new executable file, 61 lines)
@@ -0,0 +1,61 @@
+#!/bin/bash
+# File:        counttestset.sh
+# Description: Script to count the errors on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 11:58:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 1 ]
+then
+  echo "Usage:$0 pagesfile"
+  exit 1
+fi
+if [ ! -d ccmain ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r testing/unlv/accuracy ]
+then
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+fi
+pages=$1
+
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=testing/results/$setname
+mkdir -p testing/reports
+echo "Counting on set $setname in directory $imdir to $resdir"
+accfiles=""
+wafiles=""
+while read page dir
+do
+  if [ "$dir" ]
+  then
+    srcdir="$imdir/$dir"
+  else
+    srcdir="$imdir"
+  fi
+#  echo "$srcdir/$page.tif"
+  # Count character errors.
+  testing/unlv/accuracy $srcdir/$page.txt $resdir/$page.txt $resdir/$page.acc
+  accfiles="$accfiles $resdir/$page.acc"
+  # Count word errors.
+  testing/unlv/wordacc $srcdir/$page.txt $resdir/$page.txt $resdir/$page.wa
+  wafiles="$wafiles $resdir/$page.wa"
+done <$pages
+testing/unlv/accsum $accfiles >testing/reports/$setname.characc
+testing/unlv/wordaccsum $wafiles >testing/reports/$setname.wordacc
+
+
44
testing/reorgdata.sh
Executable file
44
testing/reorgdata.sh
Executable file
@ -0,0 +1,44 @@
#!/bin/bash

if [ $# -ne 1 ]
then
  echo "Usage:$0 scantype"
  echo "UNLV data comes in several scan types:"
  echo "3B=300 dpi binary"
  echo "3A=adaptive thresholded 300 dpi"
  echo "3G=300 dpi grey"
  echo "4B=400dpi binary"
  echo "2B=200dpi binary"
  echo "For now we only use 3B"
  exit 1
fi
ext=$1

# There are several test sets without meaningful names, so rename
# them with something a bit more meaningful.
# Each s is oldname/newname.
for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
do
  old=${s%/*}
  # If this set was downloaded then process it.
  if [ -r "$old/PAGES" ]
  then
    new=${s#*/}.$ext
    mkdir -p $new
    echo "Set $old -> $new"
    # The pages file had - instead of _ so fix it and add the extension.
    for page in `cat $old/PAGES`
    do
      echo "${page%-*}_${page#*-}.$ext"
    done >$new/pages
    for f in `cat $new/pages`
    do
      # Put a tif extension on the tif files.
      cp $old/${old}_B/$f $new/$f.tif
      # Put a uzn extension on the zone files.
      cp $old/${old}_B/${f}Z $new/$f.uzn
      # Cat all the truth files together and put them into a single txt file.
      cat $old/${old}_GT/${f%.$ext}.Z* >$new/$f.txt
    done
  fi
done
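
For example, run from the directory that holds the downloaded UNLV sets
(illustrative paths):

  cd /users/me/ISRI-OCRtk
  /path/to/tesseract-ocr/testing/reorgdata.sh 3B

This turns each downloaded set (3, B, M, N, ...) into a directory such as
bus.3B containing a pages list plus per-page .tif, .uzn and .txt files.
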
testing/reports/1995.bus.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%

testing/reports/1995.doe3.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%

testing/reports/1995.mag.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%

testing/reports/1995.news.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
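
These baseline lines use the same column layout as the new per-set summaries.
runalltests.sh reads them back with cut, whose default field delimiter is the
tab character, so the stored files are assumed to be tab-separated:

  cut -f3 testing/reports/1995.bus.3B.sum   # 1995 character errors (5959)
  cut -f6 testing/reports/1995.bus.3B.sum   # 1995 word errors (1631)
  cut -f9 testing/reports/1995.bus.3B.sum   # 1995 non-stopword errors (1293)
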
testing/runalltests.sh (new executable file, 110 lines)
@ -0,0 +1,110 @@
#!/bin/bash
# File:        runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Author:      Ray Smith
# Created:     Thu Jun 14 08:21:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 2 ]
then
  echo "Usage:$0 unlv-data-dir version-id"
  exit 1
fi
if [ ! -d ccmain ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r ccmain/tesseract -a ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
if [ ! -r testing/unlv/accuracy -a ! -r testing/unlv/accuracy.exe ]
then
  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
  exit 1
fi

# deltapc new old calculates the % change from old to new.
deltapc() {
  awk ' BEGIN {
    printf("%.2f", 100.0*('$1'-'$2')/'$2');
  }'
}

imdir="$1"
vid="$2"
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=testing/reports
testsets="bus.3B doe3.3B mag.3B news.3B"

totalerrs=0
totalwerrs=0
totalnswerrs=0
totalolderrs=0
totaloldwerrs=0
totaloldnswerrs=0
for set in $testsets
do
  if [ -r $imdir/$set/pages ]
  then
    # Run tesseract on all the pages.
    $bindir/runtestset.sh $imdir/$set/pages
    # Count the errors on all the pages.
    $bindir/counttestset.sh $imdir/$set/pages
    # Get the old character, word and nonstop word errors.
    olderrs=`cat testing/reports/1995.$set.sum | cut -f3`
    oldwerrs=`cat testing/reports/1995.$set.sum | cut -f6`
    oldnswerrs=`cat testing/reports/1995.$set.sum | cut -f9`
    # Get the new character, word and nonstop word errors and accuracy.
    cherrs=`head -4 testing/reports/$set.characc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    chacc=`head -5 testing/reports/$set.characc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    wderrs=`head -4 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    wdacc=`head -5 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    nswderrs=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]'`
    nswdacc=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]'`
    # Compute the percent change.
    chdelta=`deltapc $cherrs $olderrs`
    wdelta=`deltapc $wderrs $oldwerrs`
    nswdelta=`deltapc $nswderrs $oldnswerrs`
    sumfile=$rdir/$vid.$set.sum
    echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\
 $wdelta% $nswderrs $nswdacc $nswdelta%" >$sumfile
    # Sum totals over all the testsets.
    let totalerrs=totalerrs+cherrs
    let totalwerrs=totalwerrs+wderrs
    let totalnswerrs=totalnswerrs+nswderrs
    let totalolderrs=totalolderrs+olderrs
    let totaloldwerrs=totaloldwerrs+oldwerrs
    let totaloldnswerrs=totaloldnswerrs+oldnswerrs
  fi
done
# Compute the grand total percent change.
chdelta=`deltapc $totalerrs $totalolderrs`
wdelta=`deltapc $totalwerrs $totaloldwerrs`
nswdelta=`deltapc $totalnswerrs $totaloldnswerrs`
tfile=$rdir/$vid.total.sum
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
 - $wdelta% $totalnswerrs - $nswdelta%" >$tfile
cat $rdir/1995.*.sum $rdir/$vid.*.sum >$rdir/$vid.summary
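
A note on the arithmetic: deltapc prints 100*(new-old)/old, so a call such as
(illustrative numbers)

  deltapc 97 100    # prints -3.00

means 3% fewer errors than the 1995 baseline; negative deltas are
improvements.
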
testing/runtestset.sh (new executable file, 61 lines)
@ -0,0 +1,61 @@
#!/bin/bash
# File:        runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author:      Ray Smith
# Created:     Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 1 ]
then
  echo "Usage:$0 pagesfile"
  exit 1
fi
if [ ! -d ccmain ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r ccmain/tesseract ]
then
  if [ ! -r tesseract.exe ]
  then
    echo "Please build tesseract before running $0"
    exit 1
  else
    tess="./tesseract.exe"
  fi
else
  tess="ccmain/tesseract"
  export TESSDATA_PREFIX=$PWD/
fi

pages=$1

imdir=${pages%/pages}
setname=${imdir##*/}
resdir=testing/results/$setname
echo "Testing on set $setname in directory $imdir to $resdir"
mkdir -p $resdir
while read page dir
do
  # A pages file may be a list of files with subdirs, or just a plain list
  # of files, so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
# echo "$srcdir/$page.tif"
  $tess $srcdir/$page.tif $resdir/$page nobatch unlv
done <$pages
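
For example (illustrative path):

  testing/runtestset.sh /users/me/ISRI-OCRtk/bus.3B/pages

This runs tesseract with the unlv config on every page in the set and leaves
the per-page output in testing/results/bus.3B/, where counttestset.sh expects
to find the .txt files it compares against the ground truth.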