API/output changes to produce UNLV-style Latin-1 output, plus test scripts

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@86 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-06-07 09:52:40 +08:00 · 2007-07-18 01:11:18 +00:00 · 2007-07-18 01:11:18 +00:00 · 627368df42
commit 627368df42
parent eeaca1beba
27 changed files with 1424 additions and 442 deletions
--- a/ccmain/applybox.cpp
+++ b/ccmain/applybox.cpp
@ -24,20 +24,22 @@ what measures we are interested in.
 /* #define SECURE_NAMES done in secnames.h when necessary*/

 #include "mfcpch.h"
-#include          "applybox.h"
-#include          <ctype.h>
-#include          <string.h>
+#include "applybox.h"
+#include <ctype.h>
+#include <string.h>
 #ifdef __UNIX__
-#include          <assert.h>
-#include                    <errno.h>
+#include <assert.h>
+#include <errno.h>
 #endif
-#include          "mainblk.h"
-#include                   "genblob.h"
-#include                   "fixxht.h"
-#include          "control.h"
-#include          "tessbox.h"
-#include          "globals.h"
-#include          "secname.h"
+#include "mainblk.h"
+#include "genblob.h"
+#include "fixxht.h"
+#include "control.h"
+#include "tessbox.h"
+#include "globals.h"
+#include "secname.h"
+#include "unichar.h"
+#include "matchdefs.h"

 #define SECURE_NAMES
 #ifndef SECURE_NAMES
@ -47,10 +49,13 @@ what measures we are interested in.
 #define EXTERN
 EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
 EXTERN INT_VAR (applybox_debug, 0, "Debug level");
-EXTERN STRING_VAR (applybox_test_exclusions, "|",
+EXTERN STRING_VAR (applybox_test_exclusions, "",
 "Chars ignored for testing");
 EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

+// The unicharset used during box training
+static UNICHARSET unicharset_boxes;
+
 /*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
@ -89,7 +94,7 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
  INT16 boxfile_lineno = 0;
  INT16 boxfile_charno = 0;
  BOX box;                       //boxfile box
-  char ch[2];                    //correct ch from boxfile
+  UNICHAR_ID uch_id;             //correct ch from boxfile
  ROW *row;
  ROW *prev_row = NULL;
  INT16 prev_box_right = MAX_INT16;
@ -100,15 +105,20 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
  INT16 labels_ok;
  INT16 rows_ok;
  INT16 bad_blobs;
-  INT16 tgt_char_counts[128];    //No. of box samples
+  INT16 tgt_char_counts[MAX_NUM_CLASSES];    //No. of box samples
  //      INT16                                   labelled_char_counts[128];      //No. of unique labelled samples
  INT16 i;
  INT16 rebalance_count = 0;
-  char min_char;
+  UNICHAR_ID min_uch_id;
  INT16 min_samples;
  INT16 final_labelled_blob_count;

-  for (i = 0; i < 128; i++)
+  // Clean the unichar set
+  unicharset_boxes.clear();
+  // Space character needed to represent NIL classification
+  unicharset_boxes.unichar_insert(" ");
+
+  for (i = 0; i < MAX_NUM_CLASSES; i++)
    tgt_char_counts[i] = 0;

  FILE* box_file;
@ -120,11 +130,10 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
      filename.string(), errno);
  }

-  ch[1] = '\0';
  clear_any_old_text(block_list);
-  while (read_next_box (box_file, &box, &ch[0])) {
+  while (read_next_box (box_file, &box, &uch_id)) {
    box_count++;
-    tgt_char_counts[ch[0]]++;
+    tgt_char_counts[uch_id]++;
    row = find_row_of_box (block_list, box, block_id, row_id);
    if (box.left () < prev_box_right) {
      boxfile_lineno++;
@ -135,14 +144,16 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks

    if (row == NULL) {
      box_failures++;
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+      report_failed_box (boxfile_lineno, boxfile_charno, box,
+                         unicharset_boxes.id_to_unichar(uch_id),
        "FAILURE! box overlaps no blobs or blobs in multiple rows");
    }
    else {
      if ((box.left () >= prev_box_right) && (row != prev_row))
-        report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
          "WARNING! false row break");
-      box_failures += resegment_box (row, box, ch, block_id, row_id,
+      box_failures += resegment_box (row, box, uch_id, block_id, row_id,
        boxfile_lineno, boxfile_charno);
      prev_row = row;
    }
@ -154,7 +165,7 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
          bad_blobs,
          tgt_char_counts,
          rebalance_count,
-          min_char,
+          &min_uch_id,
          min_samples,
          final_labelled_blob_count);
  tprintf ("APPLY_BOXES:\n");
@ -163,7 +174,8 @@ void apply_boxes(BLOCK_LIST *block_list    //real blocks
    labels_ok, rows_ok);
  tprintf ("   Box failures detected:		%6d\n", box_failures);
  tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
-  tprintf ("   \"%c\" has fewest samples:%6d\n", min_char, min_samples);
+  tprintf ("   \"%s\" has fewest samples:%6d\n",
+           unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
  tprintf ("				Total unlabelled words:   %6d\n",
    bad_blobs);
  tprintf ("				Final labelled words:     %6d\n",
@ -194,7 +206,7 @@ void clear_any_old_text(                        //remove correct text

 BOOL8 read_next_box(FILE* box_file,  //
                    BOX *box,
-                    char *ch) {
+                    UNICHAR_ID *uch_id) {
  char buff[256];                //boxfile read buffer
  char *buffptr = buff;
  STRING box_filename;
@ -204,23 +216,38 @@ BOOL8 read_next_box(FILE* box_file,  //
  INT32 x_max;
  INT32 y_max;
  INT32 count = 0;
+  char uch[256];

  while (!feof (box_file)) {
    fgets (buff, sizeof (buff) - 1, box_file);
    line++;

+    buffptr = buff;
+    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
+    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
+      buffptr += 3;  // Skip unicode file designation.
    /* Check for blank lines in box file */
-    for (buffptr = buff; isspace (*buffptr); buffptr++)
-      ;
+    while (isspace (*buffptr))
+      buffptr++;
    if (*buffptr != '\0') {
      count =
-        sscanf (buff,
-        "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
-        INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
+        sscanf (buffptr,
+        "%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
+        INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
      if (count != 5) {
        tprintf ("Box file format error on line %i ignored\n", line);
      }
      else {
+        if (!unicharset_boxes.contains_unichar(uch))
+        {
+          unicharset_boxes.unichar_insert(uch);
+          if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
+            tprintf("Error: Size of unicharset of boxes is \
+greater than MAX_NUM_CLASSES\n");
+            exit(1);
+          }
+        }
+        *uch_id = unicharset_boxes.unichar_to_id(uch);
        *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
        return TRUE;             //read a box ok
      }
@ -314,7 +341,7 @@ ROW *find_row_of_box(                         //
 INT16 resegment_box(  //
                    ROW *row,
                    BOX box,
-                    char *ch,
+                    UNICHAR_ID uch_id,
                    INT16 block_id,
                    INT16 row_id,
                    INT16 boxfile_lineno,
@ -358,7 +385,7 @@ INT16 resegment_box(  //
                  if (applybox_debug > 4)
                    report_failed_box (boxfile_lineno,
                      boxfile_charno,
-                      box, ch,
+                      box, unicharset_boxes.id_to_unichar(uch_id),
                      "FAILURE! box overlaps blob in labelled word");
                }
                if (applybox_debug > 4)
@ -375,7 +402,7 @@ INT16 resegment_box(  //
                if (new_word == NULL) {
                                 /* Make a new word with a single blob */
                  new_word = word->shallow_copy ();
-                  new_word->set_text (ch);
+                  new_word->set_text (unicharset_boxes.id_to_unichar(uch_id));
                  if (polyg)
                    new_blob = new PBLOB;
                  else
@ -414,63 +441,75 @@ INT16 resegment_box(  //
    word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
    baseline = row->base_line (word_x_centre);

-    if (STRING (chs_caps_ht).contains (ch[0]) &&
-      (new_word_box.top () <
-    baseline + (1 + applybox_error_band) * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! caps-ht char didn't ascend");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_odd_top).contains (ch[0]) &&
-      (new_word_box.top () <
-    baseline + (1 - applybox_error_band) * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! Odd top char below xht");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_x_ht).contains (ch[0]) &&
-      ((new_word_box.top () >
-      baseline + (1 + applybox_error_band) * row->x_height ()) ||
-      (new_word_box.top () <
-    baseline + (1 - applybox_error_band) * row->x_height ()))) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! x-ht char didn't have top near xht");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
-      ((new_word_box.bottom () <
-      baseline - applybox_error_band * row->x_height ()) ||
-      (new_word_box.bottom () >
-    baseline + applybox_error_band * row->x_height ()))) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! non ambig BL char didnt have bottom near baseline");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_odd_bot).contains (ch[0]) &&
-      (new_word_box.bottom () >
-    baseline + applybox_error_band * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-        "FAILURE! Odd bottom char above baseline");
-      new_word->set_text ("");
-      return 1;
-    }
-    if (STRING (chs_desc).contains (ch[0]) &&
-      (new_word_box.bottom () >
-    baseline - applybox_error_band * row->x_height ())) {
-      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+#if 0
+    if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
+      if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.top () <
+           baseline + (1 + applybox_error_band) * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! caps-ht char didn't ascend");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.top () <
+           baseline + (1 - applybox_error_band) * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! Odd top char below xht");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          ((new_word_box.top () >
+            baseline + (1 + applybox_error_band) * row->x_height ()) ||
+           (new_word_box.top () <
+            baseline + (1 - applybox_error_band) * row->x_height ()))) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! x-ht char didn't have top near xht");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_non_ambig_bl).contains
+          (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          ((new_word_box.bottom () <
+            baseline - applybox_error_band * row->x_height ()) ||
+           (new_word_box.bottom () >
+            baseline + applybox_error_band * row->x_height ()))) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! non ambig BL char didnt have bottom near baseline");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.bottom () >
+           baseline + applybox_error_band * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
+                           "FAILURE! Odd bottom char above baseline");
+        new_word->set_text ("");
+        return 1;
+      }
+      if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+          (new_word_box.bottom () >
+           baseline - applybox_error_band * row->x_height ())) {
+        report_failed_box (boxfile_lineno, boxfile_charno, box,
+                           unicharset_boxes.id_to_unichar(uch_id),
        "FAILURE! Descender doesn't descend");
-      new_word->set_text ("");
-      return 1;
+        new_word->set_text ("");
+        return 1;
+      }
    }
+#endif
    return 0;
  }
  else {
-    report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-      "FAILURE! Couldn't find any blobs");
+    report_failed_box (boxfile_lineno, boxfile_charno, box,
+                       unicharset_boxes.id_to_unichar(uch_id),
+                       "FAILURE! Couldn't find any blobs");
    return 1;
  }
 }
@ -492,7 +531,7 @@ void tidy_up(                         //
             INT16 &unlabelled_words,
             INT16 *tgt_char_counts,
             INT16 &rebalance_count,
-             char &min_char,
+             UNICHAR_ID *min_uch_id,
             INT16 &min_samples,
             INT16 &final_labelled_blob_count) {
  BLOCK_IT block_it(block_list);
@ -507,16 +546,16 @@ void tidy_up(                         //
  BOOL8 row_ok;
  BOOL8 rebalance_needed = FALSE;
                                 //No. of unique labelled samples
-  INT16 labelled_char_counts[128];
+  INT16 labelled_char_counts[MAX_NUM_CLASSES];
  INT16 i;
-  char ch;
-  char prev_ch = '\0';
+  UNICHAR_ID uch_id;
+  UNICHAR_ID prev_uch_id = -1;
  BOOL8 at_dupe_of_prev_word;
  ROW *prev_row = NULL;
  INT16 left;
  INT16 prev_left = -1;

-  for (i = 0; i < 128; i++)
+  for (i = 0; i < MAX_NUM_CLASSES; i++)
    labelled_char_counts[i] = 0;

  ok_char_count = 0;
@ -556,7 +595,7 @@ void tidy_up(                         //
              block_idx, row_idx, all_row_idx);

          ok_char_count++;
-          labelled_char_counts[*word->text ()]++;
+          labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++;
          row_ok = TRUE;
        }
      }
@ -571,24 +610,24 @@ void tidy_up(                         //
  }

  min_samples = 9999;
-  for (i = 0; i < 128; i++) {
+  for (i = 0; i < unicharset_boxes.size(); i++) {
    if (tgt_char_counts[i] > labelled_char_counts[i]) {
      if (labelled_char_counts[i] <= 1) {
        tprintf
-          ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
-          labelled_char_counts[i], (char) i, tgt_char_counts[i]);
+          ("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d\n",
+          labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);
      }
      else {
        rebalance_needed = TRUE;
        if (applybox_debug > 0)
          tprintf
-            ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
-            (char) i, tgt_char_counts[i], labelled_char_counts[i]);
+            ("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",
+            unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);
      }
    }
    if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
      min_samples = labelled_char_counts[i];
-      min_char = (char) i;
+      *min_uch_id = i;
    }
  }

@ -605,33 +644,36 @@ void tidy_up(                         //
        !word_it.cycled_list (); word_it.forward ()) {
          word = word_it.data ();
          left = word->bounding_box ().left ();
-          ch = *word->text ();
+          if (*word->text () != '\0')
+            uch_id = unicharset_boxes.unichar_to_id(word->text ());
+          else
+            uch_id = -1;
          at_dupe_of_prev_word = ((row == prev_row) &&
            (left = prev_left) &&
-            (ch == prev_ch));
-          if ((ch != '\0') &&
-            (labelled_char_counts[ch] > 1) &&
-            (tgt_char_counts[ch] > labelled_char_counts[ch]) &&
+            (uch_id == prev_uch_id));
+          if ((uch_id != -1) &&
+            (labelled_char_counts[uch_id] > 1) &&
+            (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
          (!at_dupe_of_prev_word)) {
            /* Duplicate the word to rebalance the labelled samples */
            if (applybox_debug > 9) {
-              tprintf ("Duping \"%c\" from ", ch);
+              tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
              word->bounding_box ().print ();
            }
            duplicate_word = new WERD;
            *duplicate_word = *word;
            word_it.add_after_then_move (duplicate_word);
            rebalance_count++;
-            labelled_char_counts[ch]++;
+            labelled_char_counts[uch_id]++;
          }
          prev_row = row;
          prev_left = left;
-          prev_ch = ch;
+          prev_uch_id = uch_id;
        }
      }
    }
    rebalance_needed = FALSE;
-    for (i = 0; i < 128; i++) {
+    for (i = 0; i < unicharset_boxes.size(); i++) {
      if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
      (labelled_char_counts[i] > 1)) {
        rebalance_needed = TRUE;
@ -653,7 +695,7 @@ void tidy_up(                         //
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
-        if ((strlen (word->text ()) == 1) &&
+        if ((strlen (word->text ()) > 0) &&
          (word->gblob_list ()->length () == 1))
          final_labelled_blob_count++;
      }
@ -665,7 +707,7 @@ void tidy_up(                         //
 void report_failed_box(INT16 boxfile_lineno,
                       INT16 boxfile_charno,
                       BOX box,
-                       char *box_ch,
+                       const char *box_ch,
                       const char *err_msg) {
  if (applybox_debug > 4)
    tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
@ -687,10 +729,9 @@ void apply_box_training(BLOCK_LIST *block_list) {
  PBLOB_IT blob_it;
  DENORM denorm;
  INT16 count = 0;
-  char ch[2];
-
-  ch[1] = '\0';
+  char unichar[UNICHAR_LEN + 1];

+  unichar[UNICHAR_LEN] = '\0';
  tprintf ("Generating training data\n");
  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
@ -701,23 +742,22 @@ void apply_box_training(BLOCK_LIST *block_list) {
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
-        if ((strlen (word->text ()) == 1) &&
+        if ((strlen (word->text ()) > 0) &&
        (word->gblob_list ()->length () == 1)) {
-          /* Here is a word with a single char label and a single blob so train on it */
+          /* Here is a word with a single unichar label and a single blob so train on it */
          bln_word =
            make_bln_copy (word, row, row->x_height (), &denorm);
          blob_it.set_to_list (bln_word->blob_list ());
-          ch[0] = *word->text ();
+          strncpy(unichar, word->text (), UNICHAR_LEN);
          tess_training_tester (blob_it.data (),
                                 //single blob
            &denorm, TRUE,       //correct
-            ch,                  //correct ASCII char
-            1,                   //ASCII length
+            unichar,             //correct character
+            strlen(unichar),     //character length
            NULL);
          copy_outword = *(bln_word);
          copy_outword.baseline_denormalise (&denorm);
          blob_it.set_to_list (copy_outword.blob_list ());
-          ch[0] = *word->text ();
          delete bln_word;
          count++;
        }
@ -793,7 +833,7 @@ void apply_box_testing(BLOCK_LIST *block_list) {
            choice list, outword blob lists and best_choice string are the same
            length. A TESS screw up is indicated by a blank filled or 0 length string.
          */
-          if ((best_choice->string ().length () == 0) ||
+          if ((best_choice->lengths ().length () == 0) ||
            (strspn (best_choice->string ().string (), " ") ==
          best_choice->string ().length ())) {
            rej_count++;
@ -804,22 +844,22 @@ void apply_box_testing(BLOCK_LIST *block_list) {
            #endif
          }
          else {
-            if ((best_choice->string ().length () !=
+            if ((best_choice->lengths ().length () !=
              outword->blob_list ()->length ()) ||
-              (best_choice->string ().length () !=
+              (best_choice->lengths ().length () !=
            blob_choices.length ())) {
              tprintf
                ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
                best_choice->string ().string (),
-                best_choice->string ().length (),
+                best_choice->lengths ().length (),
                outword->blob_list ()->length (),
                blob_choices.length ());
            }
-            ASSERT_HOST (best_choice->string ().length () ==
+            ASSERT_HOST (best_choice->lengths ().length () ==
              outword->blob_list ()->length ());
-            ASSERT_HOST (best_choice->string ().length () ==
+            ASSERT_HOST (best_choice->lengths ().length () ==
              blob_choices.length ());
-            fix_quotes ((char *) best_choice->string ().string (),
+            fix_quotes (best_choice,
                                 //turn to double
              outword, &blob_choices);
            if (strcmp (best_choice->string ().string (), ch) != 0) {
--- a/ccmain/baseapi.cpp
+++ b/ccmain/baseapi.cpp
@ -27,6 +27,7 @@
 #include "applybox.h"
 #include "pgedit.h"
 #include "varabled.h"
+#include "output.h"
 #include "adaptmatch.h"

 BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
@ -37,6 +38,8 @@ BOOL_VAR(tessedit_train_from_boxes, FALSE,
 // Minimum sensible image size to be worth running tesseract.
 const int kMinRectSize = 10;

+static STRING input_file = "noname.tif";
+
 // Start tesseract.
 // The datapath must be the name of the data directory or some other file
 // in which the data directory resides (for instance argv[0].)
@ -70,6 +73,12 @@ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
  return result;
 }

+// Set the name of the input file (stored in the file-static input_file).
+// Needed only for training and loading a UNLV zone file.
+void TessBaseAPI::SetInputName(const char* name) {
+  input_file = name;  // FindLines passes input_file to pgeditor_read_file.
+}
+
 // Recognize a rectangle from an image and return the result as a string.
 // May be called many times for a single Init.
 // Currently has no error checking.
@ -96,6 +105,52 @@ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
  return RecognizeToString();
 }

+// Variant of TesseractRect that produces a box file as output instead of
+// plain text.
+// imagedata, bytes_per_pixel, bytes_per_line, left, top, width and height
+// are forwarded unchanged to CopyImageToTesseract.
+// imageheight is used only to compute imageheight - (top + height), which is
+// handed to TesseractToBoxText together with left.
+// NOTE(review): that value looks like the rectangle's bottom edge measured
+// from the bottom of the full image - confirm against TesseractToBoxText.
+// Returns NULL without doing any work when width or height is below
+// kMinRectSize.
+char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
+                                      int bytes_per_pixel,
+                                      int bytes_per_line,
+                                      int left, int top,
+                                      int width, int height,
+                                      int imageheight) {
+  const bool too_small = width < kMinRectSize || height < kMinRectSize;
+  if (too_small) {
+    return NULL;  // Rectangle too small to be worth recognizing.
+  }
+
+  // Load (copy/threshold) the rectangle into the tesseract global page_image.
+  CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
+                       left, top, width, height);
+
+  // Build the block list from the loaded image.
+  BLOCK_LIST blocks;
+  FindLines(&blocks);
+
+  // Main recognition pass; NULL is passed for the monitor argument.
+  PAGE_RES* results = Recognize(&blocks, NULL);
+
+  const int bottom_offset = imageheight - (top + height);
+  return TesseractToBoxText(results, left, bottom_offset);
+}
+
+// Recognize a rectangle from an image and return whatever TesseractToUNLV
+// produces for the recognition result.
+// All arguments are forwarded unchanged to CopyImageToTesseract.
+// Returns NULL without doing any work when width or height is below
+// kMinRectSize.
+char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
+                                     int bytes_per_pixel,
+                                     int bytes_per_line,
+                                     int left, int top,
+                                     int width, int height) {
+  const bool too_small = width < kMinRectSize || height < kMinRectSize;
+  if (too_small) {
+    return NULL;  // Rectangle too small to be worth recognizing.
+  }
+
+  // Load (copy/threshold) the rectangle into the tesseract global page_image.
+  CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
+                       left, top, width, height);
+
+  // Build the block list from the loaded image.
+  BLOCK_LIST blocks;
+  FindLines(&blocks);
+
+  // Main recognition pass; NULL is passed for the monitor argument.
+  PAGE_RES* results = Recognize(&blocks, NULL);
+
+  return TesseractToUNLV(results);
+}
+
 // Call between pages or documents etc to free up memory and forget
 // adaptive data.
 void TessBaseAPI::ClearAdaptiveClassifier() {
@ -326,7 +381,7 @@ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
  image.capture(const_cast<unsigned char*>(imagedata),
                bytes_per_line*8, top + height, 1);
  page_image.create(width, height, 1);
-  copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false);
+  copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
 }

 // Low-level function to recognize the current global image to a string.
@ -343,7 +398,6 @@ char* TessBaseAPI::RecognizeToString() {

 // Find lines from the image making the BLOCK_LIST.
 void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
-  STRING input_file = "noname.tif";
  // The following call creates a full-page block and then runs connected
  // component analysis and text line creation.
  pgeditor_read_file(input_file, block_list);
@ -369,21 +423,32 @@ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
  return page_res;
 }

+// Compute an upper bound on the size of the output text for page_res.
+// The bound starts at 2 and, for every word that has a best_choice, grows by
+// the length of the choice string plus 1, plus 1 more for each entry of the
+// word's reject_map that is flagged as rejected.
+// Words whose best_choice is NULL contribute nothing.
+int TessBaseAPI::TextLength(PAGE_RES* page_res) {
+  PAGE_RES_IT it(page_res);
+  int length = 2;
+
+  // Walk every word on the page, accumulating its worst-case contribution.
+  it.restart_page();
+  while (it.word() != NULL) {
+    WERD_RES* word_res = it.word();
+    WERD_CHOICE* best = word_res->best_choice;
+    if (best != NULL) {
+      // The text itself plus one extra character per word.
+      length += best->string().length() + 1;
+      // One extra character for each rejected position.
+      for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
+        length += word_res->reject_map[pos].rejected() ? 1 : 0;
+      }
+    }
+    it.forward();
+  }
+  return length;
+}
+
 // Make a text string from the internal data structures.
 // The input page_res is deleted.
 char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
  if (page_res != NULL) {
-    int total_length = 2;
+    int total_length = TextLength(page_res);
    PAGE_RES_IT   page_res_it(page_res);
-    // Iterate over the data structures to extract the recognition result.
-    for (page_res_it.restart_page(); page_res_it.word () != NULL;
-         page_res_it.forward()) {
-      WERD_RES *word = page_res_it.word();
-      WERD_CHOICE* choice = word->best_choice;
-      if (choice != NULL) {
-        total_length += choice->string().length() + 1;
-      }
-    }
    char* result = new char[total_length];
    char* ptr = result;
    for (page_res_it.restart_page(); page_res_it.word () != NULL;
@ -406,3 +471,207 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
  }
  return NULL;
 }
+
+// Write the box-file text for one word into word_str and return the number
+// of chars written (not counting the NUL that the last sprintf leaves
+// behind; nothing at all is written when 0 is returned).
+// Each blob of the word produces one line of the form
+//   <utf8 bytes of the classification unit> <left> <bottom> <right> <top>\n
+// with left/bottom added to the blob coordinates.
+// word:     the recognized word; its outword is copied and denormalized.
+// row:      currently unused.
+// left, bottom: offsets added to the x and y box coordinates respectively.
+// word_str: destination buffer. The caller must make it large enough; no
+//           bounds checking is possible here because the size is not passed.
+static int ConvertWordToBoxText(WERD_RES *word,
+                                ROW_RES* row,
+                                int left,
+                                int bottom,
+                                char* word_str) {
+  // TextLength() reserves no space for words without a best_choice, so
+  // such words must produce no output (and must not be dereferenced).
+  if (word->best_choice == NULL)
+    return 0;
+  const STRING& text = word->best_choice->string();
+  const STRING& lengths = word->best_choice->lengths();
+
+  // Copy the output word and denormalize it back to image coords.
+  WERD copy_outword;
+  copy_outword = *(word->outword);
+  copy_outword.baseline_denormalise(&word->denorm);
+  PBLOB_IT blob_it;
+  blob_it.set_to_list(copy_outword.blob_list());
+
+  // There should be one lengths entry per blob. Clamp to the shorter of
+  // the two so a mismatch cannot index past the end of lengths.
+  int length = copy_outword.blob_list()->length();
+  if (length > lengths.length())
+    length = lengths.length();
+
+  int output_size = 0;
+  int offset = 0;  // Byte offset of the current unit within text.
+  for (int index = 0; index < length; ++index, blob_it.forward()) {
+    const int unit_bytes = lengths[index];
+    PBLOB* blob = blob_it.data();
+    BOX blob_box = blob->bounding_box();
+    if (word->tess_failed ||
+        blob_box.left() < 0 ||
+        blob_box.right() > page_image.get_xsize() ||
+        blob_box.bottom() < 0 ||
+        blob_box.top() > page_image.get_ysize()) {
+      // Bounding boxes can be illegal when tess fails on a word.
+      blob_box = word->word->bounding_box();  // Use original word as backup.
+      tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
+              blob_box.left(), blob_box.bottom(),
+              blob_box.right(), blob_box.top());
+    }
+
+    // A single classification unit can be composed of several UTF-8
+    // characters. Append each of them to the result.
+    for (int sub = 0; sub < unit_bytes; ++sub) {
+      char ch = text[offset + sub];
+      // Tesseract uses space for recognition failure. Fix to a reject
+      // character, '~' so we don't create illegal box files.
+      if (ch == ' ')
+        ch = '~';
+      word_str[output_size++] = ch;
+    }
+    // sprintf returns the number of chars it wrote, so there is no need
+    // to rescan the output with strlen.
+    output_size += sprintf(word_str + output_size, " %d %d %d %d\n",
+                           blob_box.left() + left, blob_box.bottom() + bottom,
+                           blob_box.right() + left, blob_box.top() + bottom);
+    offset += unit_bytes;
+  }
+  return output_size;
+}
+
+// Multiplier for TextLength: assumes each output character needs at most
+// 4 numbers of 5 digits, each preceded by a space, plus the newline and
+// the original character = 4*(5+1)+2.
+const int kMaxCharsPerChar = 26;
+
+// Build box-file text (as needed for training) from the recognition
+// results in page_res. left and bottom are passed through to
+// ConvertWordToBoxText for every word.
+// Returns a new[]-allocated, NUL-terminated string, or NULL when page_res
+// is NULL. A non-NULL page_res is deleted before returning.
+char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
+                                      int left, int bottom) {
+  if (page_res == NULL)
+    return NULL;
+
+  // Size the buffer from the plain-text bound scaled by the per-character
+  // box-line allowance.
+  const int buffer_size = TextLength(page_res) * kMaxCharsPerChar;
+  PAGE_RES_IT it(page_res);
+  char* box_text = new char[buffer_size];
+  char* write_pos = box_text;
+
+  it.restart_page();
+  while (it.word() != NULL) {
+    WERD_RES* current = it.word();
+    // Each call appends this word's lines and reports how far to advance.
+    write_pos += ConvertWordToBoxText(current, it.row(), left, bottom,
+                                      write_pos);
+    it.forward();
+  }
+  *write_pos = '\0';
+
+  delete page_res;
+  return box_text;
+}
+
+// Make a text string from the internal data structures.
+// The input page_res is deleted. The text string is converted
+// to UNLV-format: Latin-1 with specific reject and suspect codes.
+// (The comment above describes TessBaseAPI::TesseractToUNLV, which is
+// defined after the constants below.)
+
+// Character written to the output in place of anything unrecognized.
+const char kUnrecognized = '~';
+// Conversion table for non-latin characters.
+// Maps characters out of the latin set into the latin set.
+// kUniChs and kLatinChs are parallel arrays: entry j of kUniChs is replaced
+// by entry j of kLatinChs. Both lists end with a 0 terminator.
+// TODO(rays) incorporate this translation into unicharset.
+// Code points: euro sign, left/right double quote, left/right single quote,
+// bullet, em dash.
+const int kUniChs[] = {
+  0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
+};
+// Latin chars corresponding to the unicode chars above:
+// cent sign, '"', '"', '\'', '\'', middle dot, '-'.
+const int kLatinChs[] = {
+  0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
+};
+
+// Convert the recognition results in page_res to UNLV-style text: one byte
+// per character, where code points above 0xff that have no entry in the
+// kUniChs/kLatinChs tables are replaced by kUnrecognized.
+// Output conventions implemented below:
+//  - kUnrecognized ('~') is written for tilde-crunched words and for any
+//    character that is ' ', '~' or '|' in the best choice string.
+//  - '^' is written immediately before each rejected character.
+//  - words are separated by a single space, a newline follows each word
+//    flagged W_EOL, and one extra newline terminates the text.
+// Returns a new[]-allocated, NUL-terminated string, or NULL when page_res
+// is NULL. A non-NULL page_res is deleted before returning.
+// NOTE(review): the buffer is sized by TextLength(), which counts nothing
+// for words without a best_choice; confirm that crunched words can never
+// write more than TextLength() budgets for them.
+char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
+  bool tilde_crunch_written = false;
+  bool last_char_was_newline = true;
+  bool last_char_was_tilde = false;
+
+  if (page_res != NULL) {
+    int total_length = TextLength(page_res);
+    PAGE_RES_IT   page_res_it(page_res);
+    char* result = new char[total_length];
+    char* ptr = result;
+    for (page_res_it.restart_page(); page_res_it.word () != NULL;
+         page_res_it.forward()) {
+      WERD_RES *word = page_res_it.word();
+      // Process the current word.
+      if (word->unlv_crunch_mode != CR_NONE) {
+        if (word->unlv_crunch_mode != CR_DELETE &&
+            (!tilde_crunch_written ||
+             (word->unlv_crunch_mode == CR_KEEP_SPACE &&
+              word->word->space () > 0 &&
+              !word->word->flag (W_FUZZY_NON) &&
+              !word->word->flag (W_FUZZY_SP)))) {
+          if (!word->word->flag (W_BOL) &&
+              word->word->space () > 0 &&
+              !word->word->flag (W_FUZZY_NON) &&
+              !word->word->flag (W_FUZZY_SP)) {
+            /* Write a space to separate from preceding good text */
+            *ptr++ = ' ';
+            last_char_was_tilde = false;
+          }
+          if (!last_char_was_tilde) {
+            // Write a reject char.
+            last_char_was_tilde = true;
+            *ptr++ = kUnrecognized;
+            tilde_crunch_written = true;
+            last_char_was_newline = false;
+          }
+        }
+      } else {
+        // NORMAL PROCESSING of non tilde crunched words.
+        tilde_crunch_written = false;
+
+        if (last_char_was_tilde &&
+            word->word->space () == 0 &&
+            (word->best_choice->string ()[0] == ' ')) {
+          /* Prevent adjacent tilde across words - we know that adjacent tildes within
+             words have been removed */
+          // The buffers are shifted down by one char in place. Source and
+          // destination overlap, which strcpy does not permit, so memmove
+          // is used. strlen(p) bytes starting at p + 1 include the NUL.
+          char* p = (char *) word->best_choice->string().string ();
+          memmove (p, p + 1, strlen (p));   //shuffle up
+          p = (char *) word->best_choice->lengths().string ();
+          memmove (p, p + 1, strlen (p));   //shuffle up
+          word->reject_map.remove_pos (0);
+          PBLOB_IT blob_it = word->outword->blob_list ();
+          delete blob_it.extract ();   //get rid of reject blob
+        }
+
+        if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
+          ensure_rep_chars_are_consistent(word);
+
+        set_unlv_suspects(word);
+        const char* wordstr = word->best_choice->string().string();
+        if (wordstr[0] != 0) {
+          if (!last_char_was_newline)
+            *ptr++ = ' ';
+          else
+            last_char_was_newline = false;
+          // offset walks the bytes of wordstr; i walks classification
+          // units, each lengths[i] bytes long.
+          int offset = 0;
+          const STRING& lengths = word->best_choice->lengths();
+          int length = lengths.length();
+          for (int i = 0; i < length; offset += lengths[i++]) {
+            if (wordstr[offset] == ' ' ||
+                wordstr[offset] == '~' ||
+                wordstr[offset] == '|') {
+              *ptr++ = kUnrecognized;
+              last_char_was_tilde = true;
+            } else {
+              // Mark a rejected char by a preceding '^'.
+              if (word->reject_map[i].rejected())
+                *ptr++ = '^';
+              UNICHAR ch(wordstr + offset, lengths[i]);
+              int uni_ch = ch.first_uni();
+              // Substitute a latin equivalent where the table has one.
+              for (int j = 0; kUniChs[j] != 0; ++j) {
+                if (kUniChs[j] == uni_ch) {
+                  uni_ch = kLatinChs[j];
+                  break;
+                }
+              }
+              if (uni_ch <= 0xff) {
+                *ptr++ = static_cast<char>(uni_ch);
+                last_char_was_tilde = false;
+              } else {
+                // Does not fit in one byte.
+                *ptr++ = kUnrecognized;
+                last_char_was_tilde = true;
+              }
+            }
+          }
+        }
+      }
+      if (word->word->flag(W_EOL) && !last_char_was_newline) {
+        /* Add a new line output */
+        *ptr++ = '\n';
+        tilde_crunch_written = false;
+        last_char_was_newline = true;
+        last_char_was_tilde = false;
+      }
+    }
+    *ptr++ = '\n';
+    *ptr = '\0';
+    delete page_res;
+    return result;
+  }
+  return NULL;
+}
+
+
--- a/ccmain/baseapi.h
+++ b/ccmain/baseapi.h
@ -20,8 +20,6 @@
 #ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
 #define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__

-#include <string>
-
 class PAGE_RES;
 class BLOCK_LIST;

@ -56,6 +54,10 @@ class TessBaseAPI {
                              const char* language, const char* configfile,
                              bool numeric_mode, int argc, char* argv[]);

+  // Set the name of the input file. Needed only for training and
+  // reading a UNLV zone file.
+  static void SetInputName(const char* name);
+
  // Recognize a rectangle from an image and return the result as a string.
  // May be called many times for a single Init.
  // Currently has no error checking.
@ -71,6 +73,19 @@ class TessBaseAPI {
                             int bytes_per_pixel,
                             int bytes_per_line,
                             int left, int top, int width, int height);
+  // As TesseractRect but produces a box file as output.
+  // Image height is needed as well as rect height, since output y-coords
+  // will be relative to the bottom of the image.
+  static char* TesseractRectBoxes(const unsigned char* imagedata,
+                                  int bytes_per_pixel,
+                                  int bytes_per_line,
+                                  int left, int top, int width, int height,
+                                  int imageheight);
+  // As TesseractRect but produces UNLV-style output.
+  static char* TesseractRectUNLV(const unsigned char* imagedata,
+                                 int bytes_per_pixel,
+                                 int bytes_per_line,
+                                 int left, int top, int width, int height);

  // Call between pages or documents etc to free up memory and forget
  // adaptive data.
@ -153,8 +168,18 @@ class TessBaseAPI {
  static PAGE_RES* Recognize(BLOCK_LIST* block_list,
                             struct ETEXT_STRUCT* monitor);

+  // Return the maximum length that the output text string might occupy.
+  static int TextLength(PAGE_RES* page_res);
  // Convert (and free) the internal data structures into a text string.
  static char* TesseractToText(PAGE_RES* page_res);
+  // Make a text string from the internal data structures.
+  // The input page_res is deleted.
+  // The text string takes the form of a box file as needed for training.
+  static char* TesseractToBoxText(PAGE_RES* page_res, int left, int bottom);
+  // Make a text string from the internal data structures.
+  // The input page_res is deleted. The text string is converted
+  // to UNLV-format: Latin-1 with specific reject and suspect codes.
+  static char* TesseractToUNLV(PAGE_RES* page_res);
 };

 #endif  // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
--- a/ccmain/output.cpp
+++ b/ccmain/output.cpp
@ -35,6 +35,7 @@
 #include          "docqual.h"
 #include          "output.h"
 #include "bestfirst.h"
+#include "globals.h"

 #define EXTERN

@ -55,12 +56,12 @@ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
 "Write block separators in output");
 EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
 "Write raw stuff to name.raw");
-EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
+EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
 EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
 "Return ratings in IPEOCRAPI data");
-EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
+EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
 "Write .txt to .etx map file");
-EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
+EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
 "Write repetition char code");
 EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
 EXTERN STRING_EVAR (unrecognised_char, "|",
@ -106,7 +107,6 @@ INT32 pixels_to_pts(               //convert coords
  return (INT32) (pts + 0.5);    //round it
 }

-
 void output_pass(  //Tess output pass //send to api
                 PAGE_RES_IT &page_res_it,
                 BOOL8 write_to_shm,
@ -119,8 +119,7 @@ void output_pass(  //Tess output pass //send to api

  if (tessedit_write_txt_map)
    txt_mapfile = open_outfile (".map");
-  if (tessedit_write_unlv)
-    unlv_file = open_outfile (".unlv");
+
  page_res_it.restart_page ();
  block_of_last_word = NULL;
  while (page_res_it.word () != NULL) {
@ -189,7 +188,6 @@ void output_pass(  //Tess output pass //send to api
  }
 }

-
 /*************************************************************************
 * write_results()
 *
@ -211,9 +209,10 @@ void write_results(                           //output a word
                  ) {
                                 //word to do
  WERD_RES *word = page_res_it.word ();
-  WERD_CHOICE *ep_choice;        //ep format
+//   WERD_CHOICE *ep_choice;        //ep format
  STRING repetition_code;
  const STRING *wordstr;
+  STRING wordstr_lengths;
  const char *text;
  int i;
  char unrecognised = STRING (unrecognised_char)[0];
@ -312,15 +311,12 @@ void write_results(                           //output a word
    if (tessedit_write_output && !NO_BLOCK)
      fprintf (textfile, "%s", txt_chs);

-    if (tessedit_write_unlv)
-      fprintf (unlv_file, "%s", txt_chs);
-
    if (tessedit_write_txt_map)
      fprintf (txt_mapfile, "%s", map_chs);

                                 //terminate string
    ep_chars[ep_chars_index] = '\0';
-    word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
+    word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);

    if (force_eol)
      empty_block = TRUE;
@ -345,6 +341,8 @@ void write_results(                           //output a word
       words have been removed */
    ptr = (char *) word->best_choice->string ().string ();
    strcpy (ptr, ptr + 1);       //shuffle up
+    ptr = (char *) word->best_choice->lengths ().string ();
+    strcpy (ptr, ptr + 1);       //shuffle up
    word->reject_map.remove_pos (0);
    blob_it = word->outword->blob_list ();
    delete blob_it.extract ();   //get rid of reject blob
@ -354,8 +352,10 @@ void write_results(                           //output a word
    last_char_was_tilde = FALSE;
  else {
    if (word->reject_map.length () > 0) {
-      if (word->best_choice->string ()[word->reject_map.length () - 1] ==
-        ' ')
+      for (i = 0, ptr = (char *) word->best_choice->string().string();
+           i < word->reject_map.length () - 1; ++i)
+        ptr += word->best_choice->lengths()[i];
+      if (*ptr == ' ')
        last_char_was_tilde = TRUE;
      else
        last_char_was_tilde = FALSE;
@ -365,7 +365,7 @@ void write_results(                           //output a word
    /* else it is unchanged as there are no output chars */
  }

-  ptr = (char *) word->best_choice->string ().string ();
+  ptr = (char *) word->best_choice->lengths ().string ();
  ASSERT_HOST (strlen (ptr) == word->reject_map.length ());

  if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
@ -379,21 +379,26 @@ void write_results(                           //output a word
      dict_word (word->best_choice->string ().string ()));
  }

+#if 0
  if (tessedit_write_unlv) {
    write_unlv_text(word);
  }
+#endif

  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    repetition_code = "|^~R";
-    repetition_code += get_rep_char (word);
+    wordstr_lengths = "\001\001\001\001";
+    repetition_code += unicharset.id_to_unichar(get_rep_char (word));
+    wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
    wordstr = &repetition_code;
  }
  else {
    wordstr = &(word->best_choice->string ());
+    wordstr_lengths = word->best_choice->lengths ();
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      text = wordstr->string ();
-      for (i = 0; text[i] != '\0'; i++) {
+      for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
        if (word->reject_map[i].rejected ())
          word->reject_map[i].setrej_minimal_rej_accept ();
      }
@ -401,8 +406,8 @@ void write_results(                           //output a word
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      text = wordstr->string ();
-      for (i = 0; text[i] != '\0'; i++) {
-        if ((text[i] != ' ') && word->reject_map[i].rejected ())
+      for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
+        if ((*text != ' ') && word->reject_map[i].rejected ())
          word->reject_map[i].setrej_minimal_rej_accept ();
      }
    }
@ -410,8 +415,9 @@ void write_results(                           //output a word

  if (write_to_shm)
    write_shm_text (word, page_res_it.block ()->block,
-      page_res_it.row (), *wordstr);
+      page_res_it.row (), *wordstr, wordstr_lengths);

+#if 0
  if (tessedit_write_output)
    write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);

@ -424,12 +430,12 @@ void write_results(                           //output a word

  ep_choice = make_epaper_choice (word, newline_type);
  word->ep_choice = ep_choice;
+#endif

-  character_count += word->best_choice->string ().length ();
+  character_count += word->best_choice->lengths ().length ();
  word_count++;
 }

-
 /**********************************************************************
 * make_epaper_choice
 *
@ -437,6 +443,7 @@ void write_results(                           //output a word
 * determine whether each blob should be rejected.
 **********************************************************************/

+#if 0
 WERD_CHOICE *make_epaper_choice(                   //convert one word
                                WERD_RES *word,    //word to do
                                char newline_type  //type of newline
@ -482,7 +489,8 @@ WERD_CHOICE *make_epaper_choice(                   //convert one word
  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    strcpy (word_string + index, "|^~R");
    index += 4;
-    word_string[index++] = get_rep_char (word);
+    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
+    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
  }
  else {
    if (!blob_it.empty ())
@ -537,7 +545,7 @@ WERD_CHOICE *make_epaper_choice(                   //convert one word
  ASSERT_HOST (strlen (word_string) == index);
  return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
 }
-
+#endif

 /**********************************************************************
 * make_reject
@ -653,6 +661,7 @@ char determine_newline_type(                   //test line ends
 * to the given file.
 **********************************************************************/

+#if 0
 void write_cooked_text(                     //write output
                       WERD *word,          //word to do
                       const STRING &text,  //text to write
@ -749,6 +758,7 @@ void write_cooked_text(                     //write output
  if (status != 0)
    WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif


 /**********************************************************************
@ -761,7 +771,8 @@ void write_shm_text(                    //write output
                    WERD_RES *word,     //word to do
                    BLOCK *block,       //block it is from
                    ROW_RES *row,       //row it is from
-                    const STRING &text  //text to write
+                    const STRING &text, //text to write
+                    const STRING &text_lengths
                   ) {
  INT32 index;                   //char counter
  INT32 index2;                  //char counter
@ -777,6 +788,8 @@ void write_shm_text(                    //write output
  WERD copy_outword;             // copy to denorm
  UINT32 rating;                 //of char
  BOOL8 lineend;                 //end of line
+  int offset;
+  int offset2;

                                 //point size
  ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
@ -786,13 +799,14 @@ void write_shm_text(                    //write output
  copy_outword = *(word->outword);
  copy_outword.baseline_denormalise (&word->denorm);
  blob_it.set_to_list (copy_outword.blob_list ());
-  length = text.length ();
+  length = text_lengths.length ();

  if (length > 0) {
    blanks = word->word->space ();
    if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
      blanks = 1;
-    for (index = 0; index < length; index++, blob_it.forward ()) {
+    for (index = 0, offset = 0; index < length;
+         offset += text_lengths[index++], blob_it.forward ()) {
      blob = blob_it.data ();
      blob_box = blob->bounding_box ();

@ -804,7 +818,7 @@ void write_shm_text(                    //write output
      if (tessedit_write_ratings)
        rating = (UINT32) (-word->best_choice->certainty () / 0.035);
      else if (tessedit_zero_rejection)
-        rating = text[index] == ' ' ? 100 : 0;
+        rating = text[offset] == ' ' ? 100 : 0;
      else
        rating = word->reject_map[index].accepted ()? 0 : 100;
      if (rating > 255)
@ -819,22 +833,41 @@ void write_shm_text(                    //write output

      lineend = word->word->flag (W_EOL) && index == length - 1;
      if (word->word->flag (W_EOL) && tessedit_zero_rejection
-      && index < length - 1 && text[index + 1] == ' ') {
-        for (index2 = index + 1; index2 < length && text[index2] == ' ';
-          index2++);
+      && index < length - 1 && text[index + text_lengths[index]] == ' ') {
+        for (index2 = index + 1, offset2 = offset + text_lengths[index];
+             index2 < length && text[offset2] == ' ';
+             offset2 += text_lengths[index2++]);
        if (index2 == length)
          lineend = TRUE;
      }

-      if (!tessedit_zero_rejection || text[index] != ' '
+      if (!tessedit_zero_rejection || text[offset] != ' '
      || tessedit_word_for_word) {
                                 //confidence
-        ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
-          ptsize,                //point size
-          blanks, enhancement,   //enhancement
-          OCR_CDIR_LEFT_RIGHT,
-          OCR_LDIR_DOWN_RIGHT,
-          lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        if (text[offset] == ' ') {
+        ocr_append_char (unrecognised,
+                         blob_box.left (), blob_box.right (),
+                         page_image.get_ysize () - 1 - blob_box.top (),
+                         page_image.get_ysize () - 1 - blob_box.bottom (),
+                         font, (UINT8) rating,
+                         ptsize,                //point size
+                         blanks, enhancement,   //enhancement
+                         OCR_CDIR_LEFT_RIGHT,
+                         OCR_LDIR_DOWN_RIGHT,
+                         lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        } else {
+          for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
+            ocr_append_char (text[offset + suboffset],
+                             blob_box.left (), blob_box.right (),
+                             page_image.get_ysize () - 1 - blob_box.top (),
+                             page_image.get_ysize () - 1 - blob_box.bottom (),
+                             font, (UINT8) rating,
+                             ptsize,                //point size
+                             blanks, enhancement,   //enhancement
+                             OCR_CDIR_LEFT_RIGHT,
+                             OCR_LDIR_DOWN_RIGHT,
+                             lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        }
        blanks = 0;
      }

@ -863,13 +896,17 @@ void write_shm_text(                    //write output
    lineend = word->word->flag (W_EOL);

                                 //font index
-    ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
-      rating,                    //confidence
-      ptsize,                    //point size
-      blanks, enhancement,       //enhancement
-      OCR_CDIR_LEFT_RIGHT,
-      OCR_LDIR_DOWN_RIGHT,
-      lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+    ocr_append_char (unrecognised,
+                     blob_box.left (), blob_box.right (),
+                     page_image.get_ysize () - 1 - blob_box.top (),
+                     page_image.get_ysize () - 1 - blob_box.bottom (),
+                     font,
+                     rating,                    //confidence
+                     ptsize,                    //point size
+                     blanks, enhancement,       //enhancement
+                     OCR_CDIR_LEFT_RIGHT,
+                     OCR_LDIR_DOWN_RIGHT,
+                     lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
  }
 }

@ -888,6 +925,7 @@ void write_shm_text(                    //write output
 * newdiff needs etx files!
 **********************************************************************/

+#if 0
 void write_map(                //output a map file
               FILE *mapfile,  //mapfile to write to
               WERD_RES *word) {
@ -937,6 +975,7 @@ void write_map(                //output a map file
  if (status != 0)
    WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
 }
+#endif


 /*************************************************************************
@ -957,6 +996,7 @@ FILE *open_outfile(  //open .map & .unlv file
 }


+#if 0
 void write_unlv_text(WERD_RES *word) {
  const char *wordstr;

@ -1015,6 +1055,7 @@ void write_unlv_text(WERD_RES *word) {
  if (status != 0)
    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif


 /*************************************************************************
@ -1022,21 +1063,24 @@ void write_unlv_text(WERD_RES *word) {
 * Return the first accepted character from the repetition string. This is the
 * character which is repeated - as determined earlier by fix_rep_char()
 *************************************************************************/
-char get_rep_char(  // what char is repeated?
-                  WERD_RES *word) {
+UNICHAR_ID get_rep_char(WERD_RES *word) {  // what char is repeated?
  int i;
+  int offset;

-  for (i = 0;
+  for (i = 0, offset = 0;
    ((i < word->reject_map.length ()) &&
-    (word->reject_map[i].rejected ())); i++);
+    (word->reject_map[i].rejected ()));
+       offset += word->best_choice->lengths()[i++]);
  if (i < word->reject_map.length ())
-    return word->best_choice->string ()[i];
+    return unicharset.unichar_to_id(word->best_choice->string().string()
+                                    + offset,
+                                    word->best_choice->lengths()[i]);
  else
-    return STRING (unrecognised_char)[0];
+    return unicharset.unichar_to_id(unrecognised_char.string());
 }

-
 void ensure_rep_chars_are_consistent(WERD_RES *word) {
+#if 0
  char rep_char = get_rep_char (word);
  char *ptr;

@ -1045,8 +1089,24 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
    if (*ptr != rep_char)
      *ptr = rep_char;
  }
-}
+#endif

+#if 0
+  UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
+  int i;
+  char *ptr;
+  STRING consistent_string;
+  STRING consistent_string_lengths;
+
+  ptr = (char *) word->best_choice->string ().string ();
+  for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
+    consistent_string += unicharset.id_to_unichar(rep_char);
+    consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
+  }
+  word->best_choice->string() = consistent_string;
+  word->best_choice->lengths() = consistent_string_lengths;
+#endif
+}

 /*************************************************************************
 * SUSPECT LEVELS
@ -1062,7 +1122,9 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
 void set_unlv_suspects(WERD_RES *word) {
  int len = word->reject_map.length ();
  int i;
+  int offset;
  const char *ptr;
+  const char *lengths = word->best_choice->lengths ().string ();
  float rating_per_ch;

  ptr = word->best_choice->string ().string ();
@ -1080,10 +1142,12 @@ void set_unlv_suspects(WERD_RES *word) {

  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

-  if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
+  if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
+                               suspect_short_words)) {
    /* Unreject alphas in dictionary words */
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && isalpha (ptr[i]))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () &&
+          unicharset.get_isalpha (ptr + offset, lengths[i]))
        word->reject_map[i].setrej_minimal_rej_accept ();
    }
  }
@ -1095,8 +1159,8 @@ void set_unlv_suspects(WERD_RES *word) {

  if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && (ptr[i] != ' '))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
        word->reject_map[i].setrej_minimal_rej_accept ();
    }
  }
@ -1130,9 +1194,11 @@ void set_unlv_suspects(WERD_RES *word) {
    }
  }

-  if ((acceptable_word_string (word->best_choice->string ().string ())
+  if ((acceptable_word_string (word->best_choice->string ().string (),
+                               word->best_choice->lengths ().string ())
    != AC_UNACCEPTABLE) ||
-  acceptable_number_string (word->best_choice->string ().string ())) {
+  acceptable_number_string (word->best_choice->string ().string (),
+                            word->best_choice->lengths ().string ())) {
    if (word->reject_map.length () > suspect_short_words) {
      for (i = 0; i < len; i++) {
        if (word->reject_map[i].rejected () &&
@ -1149,11 +1215,12 @@ void set_unlv_suspects(WERD_RES *word) {


 INT16 count_alphas(  //how many alphas
-                   const char *s) {
+                   const char *s,
+                   const char *lengths) {
  int count = 0;

-  for (; *s != '\0'; s++) {
-    if (isalpha (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths))
      count++;
  }
  return count;
@ -1161,36 +1228,43 @@ INT16 count_alphas(  //how many alphas


 INT16 count_alphanums(  //how many alphanums
-                      const char *s) {
+                      const char *s,
+                      const char *lengths) {
  int count = 0;

-  for (; *s != '\0'; s++) {
-    if (isalnum (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths) ||
+        unicharset.get_isdigit(s, *lengths))
      count++;
  }
  return count;
 }


-BOOL8 acceptable_number_string(const char *s) {
+BOOL8 acceptable_number_string(const char *s,
+                               const char *lengths) {
  BOOL8 prev_digit = FALSE;

-  if (*s == '(')
+  if (*lengths == 1 && *s == '(')
    s++;

-  if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
+  if (*lengths == 1 &&
+      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
    s++;

-  for (; *s != '\0'; s++) {
-    if (isdigit (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isdigit (s, *lengths))
      prev_digit = TRUE;
-    else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
-      prev_digit = FALSE;
    else if (prev_digit &&
-      (*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
+             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
+      prev_digit = FALSE;
+    else if (prev_digit && *lengths == 1 &&
+             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
      return TRUE;
    else if (prev_digit &&
-      (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
+             *lengths == 1 && (*s == '%') &&
+             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
+             (*(s + *lengths + *(lengths + 1)) == '\0'))
      return TRUE;
    else
      return FALSE;
--- a/ccmain/tesseractmain.cpp
+++ b/ccmain/tesseractmain.cpp
@ -31,7 +31,9 @@
 #include "stderr.h"
 #include "notdll.h"
 #include "mainblk.h"
+#include "output.h"
 #include "globals.h"
+#include "blread.h"
 #include "tfacep.h"
 #include "callnet.h"

@ -40,7 +42,10 @@
 #define API_CONFIG      "configs/api_config"
 #define EXTERN

+EXTERN BOOL_VAR (tessedit_create_boxfile, FALSE, "Output text with boxes");
 EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read");
+EXTERN INT_VAR (tessedit_serial_unlv, 0,
+                "0->Whole page, 1->serial no adapt, 2->serial with adapt");
 EXTERN BOOL_VAR (tessedit_write_images, FALSE,
 "Capture the image from the IPE");
 EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file");
@ -63,15 +68,30 @@ int main(int argc, char **argv) {

  if (argc < 3) {
    USAGE.error (argv[0], EXIT,
-      "%s imagename outputbase [configfile [[+|-]varfile]...]\n", argv[0]);
+      "%s imagename outputbase [-l lang] [configfile [[+|-]varfile]...]\n",
+      argv[0]);
+  }
+  // Find the required language.
+  const char* lang = "eng";
+  int arg = 3;
+  if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
+    lang = argv[4];
+    arg = 5;
+  }
+  // Find the basename of the input file.
+  STRING infile(argv[1]);
+  const char* lastdot = strrchr(argv[1], '.');
+  if (lastdot != NULL) {
+    infile[lastdot - argv[1]] = '\0';
  }

-  if (argc == 3)
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  NULL, false, 0, argv + 2);
+  if (argc == arg)
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  NULL, false, 0, argv + arg);
  else
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  argv[3], false, argc - 4, argv + 4);
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  argv[arg], false,
+                                  argc - arg - 1, argv + arg + 1);

  tprintf ("Tesseract Open Source OCR Engine\n");

@ -92,20 +112,70 @@ int main(int argc, char **argv) {
      argv[1]);
  }
 #endif
+  STRING text_out;
  int bytes_per_line = check_legal_image_size(image.get_xsize(),
                                              image.get_ysize(),
                                              image.get_bpp());
-  char* text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
-                                          bytes_per_line, 0, 0,
-                                          image.get_xsize(), image.get_ysize());
+  if (tessedit_serial_unlv == 0) {
+    TessBaseAPI::SetInputName(argv[1]);
+    char* text;
+    if (tessedit_create_boxfile)
+      text = TessBaseAPI::TesseractRectBoxes(image.get_buffer(),
+                                             image.get_bpp()/8,
+                                             bytes_per_line, 0, 0,
+                                             image.get_xsize(),
+                                             image.get_ysize(),
+                                             image.get_ysize());
+    else if (tessedit_write_unlv)
+      text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                            image.get_bpp()/8,
+                                            bytes_per_line, 0, 0,
+                                            image.get_xsize(),
+                                            image.get_ysize());
+    else
+      text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
+                                        bytes_per_line, 0, 0,
+                                        image.get_xsize(), image.get_ysize());
+    text_out = text;
+    delete [] text;
+  } else {
+    BLOCK_LIST blocks;
+    STRING filename = argv[1];
+    int len = filename.length();
+    if (len > 4 && filename[len - 4] == '.') {
+      filename[len - 4] = '\0';
+    }
+    if (!read_unlv_file(filename, image.get_xsize(), image.get_ysize(),
+                        &blocks)) {
+      fprintf(stderr, "Error: Must have a unlv zone file %s to read!\n",
+              filename.string());
+      return 1;
+    }
+    BLOCK_IT b_it = &blocks;
+    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+      BLOCK* block = b_it.data();
+      BOX box = block->bounding_box();
+      char* text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                                  image.get_bpp()/8,
+                                                  bytes_per_line,
+                                                  box.left(),
+                                                  image.get_ysize() - box.top(),
+                                                  box.width(),
+                                                  box.height());
+      text_out += text;
+      delete [] text;
+      if (tessedit_serial_unlv == 1)
+        TessBaseAPI::ClearAdaptiveClassifier();
+    }
+  }
+
  outfile = argv[2];
  outfile += ".txt";
  FILE* fp = fopen(outfile.string(), "w");
  if (fp != NULL) {
-    fwrite(text, 1, strlen(text), fp);
+    fwrite(text_out.string(), 1, text_out.length(), fp);
    fclose(fp);
  }
-  delete [] text;
  TessBaseAPI::End();

  return 0;                      //Normal exit
--- a/ccstruct/blread.cpp
+++ b/ccstruct/blread.cpp
@ -527,7 +527,9 @@ BOOL8 read_unlv_file(                    //print list of sides
  else {
    while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
                                 //make rect block
-      block = new BLOCK (name.string (), TRUE, 0, 0, (INT16) x, (INT16) (ysize - 1 - y - height), (INT16) (x + width), (INT16) (ysize - 1 - y));
+      block = new BLOCK (name.string (), TRUE, 0, 0,
+                         (INT16) x, (INT16) (ysize - y - height),
+                         (INT16) (x + width), (INT16) (ysize - y));
                                 //on end of list
      block_it.add_to_end (block);
    }
--- a/cutil/tordvars.cpp
+++ b/cutil/tordvars.cpp
@ -63,7 +63,7 @@ make_toggle_var (debug_8, 0, make_debug_8, 6, 8, toggle_debug_8, "Debug #8");
 make_toggle_var (display_ratings, 0, make_display_ratings,
 6, 9, toggle_ratings, "Ratings display");

-make_toggle_var (display_text, 1, make_display_text,
+make_toggle_var (display_text, 0, make_display_text,
 6, 10, toggle_text, "Display Text");

 make_toggle_var (show_bold, 1, make_show_bold,
--- a/tessdata/configs/makebox
+++ b/tessdata/configs/makebox
@ -0,0 +1 @@
+tessedit_create_boxfile 1
--- a/tessdata/configs/unlv
+++ b/tessdata/configs/unlv
@ -0,0 +1,3 @@
+tessedit_write_unlv 1
+tessedit_write_output 0
+tessedit_write_txt_map 0
--- a/tessdata/tessconfigs/batch
+++ b/tessdata/tessconfigs/batch
@ -1,78 +1,2 @@
-#################################################
-# Adaptive Matcher Using PreAdapted Templates
-#################################################
-
-acts_fx                 0x800
-acts_ocr                0x20
-
-RatingScale             30.0
-CertaintyScale          20.0
-
-#EnableMatcher				0
-#CurrentFx					2
-MinSlope                 0.414213562
-MaxSlope                 2.414213562
-#ExtremityMode            1
-NormMethod               1
-EnableAdaptiveMatcher	1
-
-NormAdjMidpoint			32.0
-NormAdjCurl					2.0
-
-MinNormScaleX				0.0
-MaxNormScaleX				0.325
-MinNormScaleY				0.0
-MaxNormScaleY				0.325
-
-BuiltInTemplatesFile		tessdata/inttemp
-BuiltInCutoffsFile		tessdata/pffmtable
-
-EnableLearning				0
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-ReliableConfigThreshold	2
-MinNumPermClasses			3
-
-#EnableStopper				1
-GoodAdaptiveMatch			0.125
-GreatAdaptiveMatch		0.0
-
-EnableIntFX					1
-EnableNewAdaptRules		1
-################################################################################
-#
-# File:         marks/configs/knobs
-# Description:  Control variables for 'marks' code
-# Author:       Mark Seaman, OCR Technology
-# Created:      Wed Feb 27 11:27:27 1991
-# Modified:     Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language:     Text
-# Package:      N/A
-# Status:       Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges            1
-
-save_doc_words          1
-doc_dict_enable         1
-ClassPrunerThreshold			229
-ClassPrunerMultiplier		15
-IntThetaFudge					128
-CPCutoffStrength				0.15
-EvidenceTableBits				9
-IntEvidenceTruncBits			14
-SEExponentialMultiplier		0
-SimilarityCenter				0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-EnableLearning				1
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-
-#save_errors             0
+# No content needed as all defaults are correct.

--- a/tessdata/tessconfigs/batch.nochop
+++ b/tessdata/tessconfigs/batch.nochop
@ -0,0 +1,2 @@
+chop_enable 0
+enable_assoc 0
--- a/tessdata/tessconfigs/matdemo
+++ b/tessdata/tessconfigs/matdemo
@ -2,80 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################

-acts_fx                 0x800
-acts_ocr                0x20
-
-RatingScale             30.0
-CertaintyScale          20.0
-
-#EnableMatcher				0
-#CurrentFx					2
-EnableAdaptiveMatcher	1
-
-NormAdjMidpoint			32.0
-NormAdjCurl					2.0
-
-MinNormScaleX				0.0
-MaxNormScaleX				0.325
-MinNormScaleY				0.0
-MaxNormScaleY				0.325
-
-BuiltInTemplatesFile		tessdata/inttemp
-BuiltInCutoffsFile		tessdata/pffmtable
-
-EnableLearning				0
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-ReliableConfigThreshold	2
-MinNumPermClasses			3
-
-#EnableStopper				1
-GoodAdaptiveMatch			0.125
-GreatAdaptiveMatch		0.0
-
-EnableIntFX					1
-EnableNewAdaptRules		1
 EnableAdaptiveDebugger   1
 MatchDebugFlags         6
 MatcherDebugLevel       1
-################################################################################
-#
-# File:         marks/configs/knobs
-# Description:  Control variables for 'marks' code
-# Author:       Mark Seaman, OCR Technology
-# Created:      Wed Feb 27 11:27:27 1991
-# Modified:     Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language:     Text
-# Package:      N/A
-# Status:       Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges            1
-
-save_doc_words          1
-doc_dict_enable         1
-ClassPrunerThreshold			229
-ClassPrunerMultiplier		15
-IntThetaFudge					128
-CPCutoffStrength				0.15
-EvidenceTableBits				9
-IntEvidenceTruncBits			14
-SEExponentialMultiplier		0
-SimilarityCenter				0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-display_splits          0
-display_all_words       0
-display_all_blobs       0
-display_segmentations   0
-EnableLearning				1
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-
-#save_errors             0
-
--- a/tessdata/tessconfigs/msdemo
+++ b/tessdata/tessconfigs/msdemo
@ -0,0 +1,13 @@
+#################################################
+# Adaptive Matcher Using PreAdapted Templates
+#################################################
+
+EnableAdaptiveDebugger   1
+MatchDebugFlags         6
+MatcherDebugLevel       1
+
+display_splits          0
+display_all_words       1
+display_all_blobs       1
+display_segmentations   2
+display_ratings			1
--- a/tessdata/tessconfigs/nobatch
+++ b/tessdata/tessconfigs/nobatch
@ -0,0 +1,2 @@
+display_text 0
+
--- a/tessdata/tessconfigs/segdemo
+++ b/tessdata/tessconfigs/segdemo
@ -2,70 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################

-acts_fx                 0x800
-acts_ocr                0x20
-
-RatingScale             30.0
-CertaintyScale          20.0
-
-#EnableMatcher				0
-#CurrentFx					2
-EnableAdaptiveMatcher	1
-
-NormAdjMidpoint			32.0
-NormAdjCurl					2.0
-
-MinNormScaleX				0.0
-MaxNormScaleX				0.325
-MinNormScaleY				0.0
-MaxNormScaleY				0.325
-
-BuiltInTemplatesFile		tessdata/inttemp
-BuiltInCutoffsFile		tessdata/pffmtable
-
-EnableLearning				0
-SaveAdaptedTemplates		0
-UsePreAdaptedTemplates	0
-ReliableConfigThreshold	2
-MinNumPermClasses			3
-
-#EnableStopper				1
-GoodAdaptiveMatch			0.125
-GreatAdaptiveMatch		0.0
-
-EnableIntFX					1
-EnableNewAdaptRules		1
-################################################################################
-#
-# File:         marks/configs/knobs
-# Description:  Control variables for 'marks' code
-# Author:       Mark Seaman, OCR Technology
-# Created:      Wed Feb 27 11:27:27 1991
-# Modified:     Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language:     Text
-# Package:      N/A
-# Status:       Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges            1
-
-save_doc_words          1
-doc_dict_enable         1
-ClassPrunerThreshold			229
-ClassPrunerMultiplier		15
-IntThetaFudge					128
-CPCutoffStrength				0.15
-EvidenceTableBits				9
-IntEvidenceTruncBits			14
-SEExponentialMultiplier		0
-SimilarityCenter				0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
 display_splits          0
 display_all_words       1
 display_all_blobs       1
--- a/testing/Makefile
+++ b/testing/Makefile
@ -0,0 +1,185 @@
+# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+
+SHELL = /bin/sh
+
+srcdir = .
+top_srcdir = ..
+
+prefix = /usr/local
+exec_prefix = ${prefix}
+
+bindir = ${exec_prefix}/bin
+sbindir = ${exec_prefix}/sbin
+libexecdir = ${exec_prefix}/libexec
+datadir = ${prefix}/share
+sysconfdir = ${prefix}/etc
+sharedstatedir = ${prefix}/com
+localstatedir = ${prefix}/var
+libdir = ${exec_prefix}/lib
+infodir = ${prefix}/info
+mandir = ${prefix}/man
+includedir = ${prefix}/include/tesseract
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/
+pkglibdir = $(libdir)/
+pkgincludedir = $(includedir)/
+
+top_builddir = ..
+
+ACLOCAL = aclocal-1.4
+AUTOCONF = autoconf
+AUTOMAKE = automake-1.4
+AUTOHEADER = autoheader
+
+INSTALL = /usr/bin/install -c
+INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS)
+INSTALL_DATA = ${INSTALL} -m 644
+INSTALL_SCRIPT = ${INSTALL}
+transform = s,x,x,
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+host_alias = 
+host_triplet = x86_64-unknown-linux-gnu
+CC = gcc
+CXX = g++
+HAVE_LIB = @HAVE_LIB@
+LIB = @LIB@
+LTLIB = @LTLIB@
+MAINT = #
+MAKEINFO = /home/rays/src/opensrc/tesseract-ocr/config/missing makeinfo
+PACKAGE = 
+PACKAGE_DATE = 07/2007
+PACKAGE_NAME = tesseract
+PACKAGE_VERSION = 2.00
+PACKAGE_YEAR = 2007
+RANLIB = ranlib
+VERSION = 
+
+EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
+mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
+CONFIG_HEADER = ../config_auto.h
+CONFIG_CLEAN_FILES = 
+DIST_COMMON =  README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
+
+TAR = tar
+GZIP_ENV = --best
+all: all-redirect
+.SUFFIXES:
+$(srcdir)/Makefile.in: # Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4) 
+	cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
+
+Makefile: $(srcdir)/Makefile.in  $(top_builddir)/config.status $(BUILT_SOURCES)
+	cd $(top_builddir) \
+	  && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+tags: TAGS
+TAGS:
+
+
+distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
+
+subdir = testing
+
+distdir: $(DISTFILES)
+	here=`cd $(top_builddir) && pwd`; \
+	top_distdir=`cd $(top_distdir) && pwd`; \
+	distdir=`cd $(distdir) && pwd`; \
+	cd $(top_srcdir) \
+	  && $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
+	$(mkinstalldirs) $(distdir)/reports
+	@for file in $(DISTFILES); do \
+	  d=$(srcdir); \
+	  if test -d $$d/$$file; then \
+	    cp -pr $$d/$$file $(distdir)/$$file; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || ln $$d/$$file $(distdir)/$$file 2> /dev/null \
+	    || cp -p $$d/$$file $(distdir)/$$file || :; \
+	  fi; \
+	done
+info-am:
+info: info-am
+dvi-am:
+dvi: dvi-am
+check-am: all-am
+check: check-am
+installcheck-am:
+installcheck: installcheck-am
+install-exec-am:
+install-exec: install-exec-am
+
+install-data-am:
+install-data: install-data-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-am
+uninstall-am:
+uninstall: uninstall-am
+all-am: Makefile
+all-redirect: all-am
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
+installdirs:
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-rm -f Makefile $(CONFIG_CLEAN_FILES)
+	-rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+mostlyclean-am:  mostlyclean-generic
+
+mostlyclean: mostlyclean-am
+
+clean-am:  clean-generic mostlyclean-am
+
+clean: clean-am
+
+distclean-am:  distclean-generic clean-am
+
+distclean: distclean-am
+
+maintainer-clean-am:  maintainer-clean-generic distclean-am
+	@echo "This command is intended for maintainers to use;"
+	@echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-am
+
+.PHONY: tags distdir info-am info dvi-am dvi check check-am \
+installcheck-am installcheck install-exec-am install-exec \
+install-data-am install-data install-am install uninstall-am uninstall \
+all-redirect all-am all installdirs mostlyclean-generic \
+distclean-generic clean-generic maintainer-clean-generic clean \
+mostlyclean distclean maintainer-clean
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
--- a/testing/Makefile.am
+++ b/testing/Makefile.am
@ -0,0 +1,2 @@
+
+EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
--- a/testing/Makefile.in
+++ b/testing/Makefile.in
@ -0,0 +1,185 @@
+# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS)
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+host_alias = @host_alias@
+host_triplet = @host@
+CC = @CC@
+CXX = @CXX@
+HAVE_LIB = @HAVE_LIB@
+LIB = @LIB@
+LTLIB = @LTLIB@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+PACKAGE = @PACKAGE@
+PACKAGE_DATE = @PACKAGE_DATE@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PACKAGE_YEAR = @PACKAGE_YEAR@
+RANLIB = @RANLIB@
+VERSION = @VERSION@
+
+EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
+mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
+CONFIG_HEADER = ../config_auto.h
+CONFIG_CLEAN_FILES = 
+DIST_COMMON =  README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
+
+TAR = tar
+GZIP_ENV = --best
+all: all-redirect
+.SUFFIXES:
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4) 
+	cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
+
+Makefile: $(srcdir)/Makefile.in  $(top_builddir)/config.status $(BUILT_SOURCES)
+	cd $(top_builddir) \
+	  && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+tags: TAGS
+TAGS:
+
+
+distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
+
+subdir = testing
+
+distdir: $(DISTFILES)
+	here=`cd $(top_builddir) && pwd`; \
+	top_distdir=`cd $(top_distdir) && pwd`; \
+	distdir=`cd $(distdir) && pwd`; \
+	cd $(top_srcdir) \
+	  && $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
+	$(mkinstalldirs) $(distdir)/reports
+	@for file in $(DISTFILES); do \
+	  d=$(srcdir); \
+	  if test -d $$d/$$file; then \
+	    cp -pr $$d/$$file $(distdir)/$$file; \
+	  else \
+	    test -f $(distdir)/$$file \
+	    || ln $$d/$$file $(distdir)/$$file 2> /dev/null \
+	    || cp -p $$d/$$file $(distdir)/$$file || :; \
+	  fi; \
+	done
+info-am:
+info: info-am
+dvi-am:
+dvi: dvi-am
+check-am: all-am
+check: check-am
+installcheck-am:
+installcheck: installcheck-am
+install-exec-am:
+install-exec: install-exec-am
+
+install-data-am:
+install-data: install-data-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-am
+uninstall-am:
+uninstall: uninstall-am
+all-am: Makefile
+all-redirect: all-am
+install-strip:
+	$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
+installdirs:
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-rm -f Makefile $(CONFIG_CLEAN_FILES)
+	-rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+mostlyclean-am:  mostlyclean-generic
+
+mostlyclean: mostlyclean-am
+
+clean-am:  clean-generic mostlyclean-am
+
+clean: clean-am
+
+distclean-am:  distclean-generic clean-am
+
+distclean: distclean-am
+
+maintainer-clean-am:  maintainer-clean-generic distclean-am
+	@echo "This command is intended for maintainers to use;"
+	@echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-am
+
+.PHONY: tags distdir info-am info dvi-am dvi check check-am \
+installcheck-am installcheck install-exec-am install-exec \
+install-data-am install-data install-am install uninstall-am uninstall \
+all-redirect all-am all installdirs mostlyclean-generic \
+distclean-generic clean-generic maintainer-clean-generic clean \
+mostlyclean distclean maintainer-clean
+
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
--- a/testing/README
+++ b/testing/README
@ -0,0 +1,43 @@
+How to run UNLV tests.
+
+The scripts in this directory make it possible to duplicate the tests
+published in the Fourth Annual Test of OCR Accuracy.
+See http://www.isri.unlv.edu/downloads/AT-1995.pdf
+but first you have to get the tools and data from UNLV:
+
+Step 1: To download the images, go to
+http://www.isri.unlv.edu/ISRI/OCRtk
+and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz.
+
+Step 2: extract the files. It doesn't really matter where
+in your filesystem you put them, but they must go under a common
+root so you have directories 3, B, M and N in, for example,
+/users/me/ISRI-OCRtk.
+
+Step 3: Reorganize the files.
+The lack of tif extensions on the images is inconvenient, so there
+is a script to reorganize the data to match the rest of the test
+scripts.
+cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run
+/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B
+This makes directories doe3.3B, bus.3B, mag.3B and news.3B.
+You can now get rid of 3, B, M, and N unless you want to get some of the
+other scanning resolutions out of them.
+
+Step 4: Download the ISRI toolkit from:
+http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
+
+Step 5: If the prebuilt binaries in the toolkit's bin directory work for
+you, copy them to tesseract-ocr/testing/unlv;
+otherwise build the tools yourself and put the results there.
+
+Step 6: cd back to your main tesseract-ocr dir and build tesseract.
+
+Step 7: run testing/runalltests.sh with the root data dir and testname:
+testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
+and go to the gym, have lunch etc.
+
+Step 8: There should be a file
+testing/reports/tess2.0.summary that contains the final summarized accuracy
+report and comparison with the 1995 results.
+
--- a/testing/counttestset.sh
+++ b/testing/counttestset.sh
@ -0,0 +1,61 @@
+#!/bin/bash
+# File:        counttestset.sh
+# Description: Script to count the errors on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 11:58:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 1 ]  # Exactly one argument: the path to a set's pages file.
+then
+  echo "Usage:$0 pagesfile"
+  exit 1
+fi
+if [ ! -d ccmain ]  # All paths below are relative to the source root.
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r testing/unlv/accuracy ]
+then
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+fi
+pages="$1"
+
+imdir="${pages%/pages}"     # Directory holding the pages file.
+setname="${imdir##*/}"      # Last path component names the test set.
+resdir="testing/results/$setname"
+mkdir -p testing/reports
+echo "Counting on set $setname in directory $imdir to $resdir"
+accfiles=()  # Per-page report files, kept in arrays so that paths
+wafiles=()   # containing spaces survive being passed to the summarizers.
+while read -r page dir
+do
+  if [ "$dir" ]
+  then
+     srcdir="$imdir/$dir"
+  else
+     srcdir="$imdir"
+  fi
+  # Each tool is given $srcdir/$page.txt, then $resdir/$page.txt, then a report.
+  # Count character errors.
+  testing/unlv/accuracy "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.acc"
+  accfiles+=("$resdir/$page.acc")
+  # Count word errors.
+  testing/unlv/wordacc "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.wa"
+  wafiles+=("$resdir/$page.wa")
+done <"$pages"
+testing/unlv/accsum "${accfiles[@]}" >"testing/reports/$setname.characc"
+testing/unlv/wordaccsum "${wafiles[@]}" >"testing/reports/$setname.wordacc"
+
+
--- a/testing/reorgdata.sh
+++ b/testing/reorgdata.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+
+if [ $# -ne 1 ]
+then
+    echo "Usage:$0 scantype"
+    echo "UNLV data comes in several scan types:"
+    echo "3B=300 dpi binary"
+    echo "3A=adaptive thresholded 300 dpi"
+    echo "3G=300 dpi grey"
+    echo "4B=400dpi binary"
+    echo "2B=200dpi binary"
+    echo "For now we only use 3B"
+    exit 1
+fi
+ext=$1  #scan type; used as the new directory suffix and the page extension
+
+#There are several test sets without meaningful names, so rename
+#them with something a bit more meaningful.
+#Each s is oldname/newname
+for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
+do
+    old=${s%/*}  #part before the slash: the downloaded directory name
+    #if this set was downloaded then process it.
+    if [ -r "$old/PAGES" ]
+    then
+	new=${s#*/}.$ext  #part after the slash plus the scan type, e.g. doe3.3B
+	mkdir -p $new
+    	echo "Set $old -> $new"
+	#The pages file had - instead of _ so fix it and add the extension.
+	for page in `cat $old/PAGES`
+	do
+    	    echo "${page%-*}_${page#*-}.$ext"
+	done >$new/pages
+	for f in `cat $new/pages`
+	do
+    	    #Put a tif extension on the tif files.
+	    cp $old/${old}_B/$f $new/$f.tif  #NOTE(review): _B is fixed whatever the scantype - confirm
+	    #Put a uzn extension on the zone files.
+	    cp $old/${old}_B/${f}Z $new/$f.uzn
+	    #Cat all the truth files together and put into a single txt file.
+	    cat $old/${old}_GT/${f%.$ext}.Z* >$new/$f.txt  #${f%.$ext} drops the extension added above
+	done
+    fi
+done
--- a/testing/reports/1995.bus.3B.sum
+++ b/testing/reports/1995.bus.3B.sum
@ -0,0 +1 @@
+1995	bus.3B	5959	98.14%	0.00%	1631	96.83%	0.00%	1293	95.73%	0.00%
--- a/testing/reports/1995.doe3.3B.sum
+++ b/testing/reports/1995.doe3.3B.sum
@ -0,0 +1 @@
+1995	doe3.3B	36349	97.52%	0.00%	7826	96.34%	0.00%	7042	94.87%	0.00%
--- a/testing/reports/1995.mag.3B.sum
+++ b/testing/reports/1995.mag.3B.sum
@ -0,0 +1 @@
+1995	mag.3B	15043	97.74%	0.00%	4566	96.01%	0.00%	3379	94.99%	0.00%
--- a/testing/reports/1995.news.3B.sum
+++ b/testing/reports/1995.news.3B.sum
@ -0,0 +1 @@
+1995	news.3B	6432	98.69%	0.00%	1946	97.68%	0.00%	1502	96.94%	0.00%
--- a/testing/runalltests.sh
+++ b/testing/runalltests.sh
@ -0,0 +1,110 @@
+#!/bin/bash
+# File:        runalltests.sh
+# Description: Script to run a set of UNLV test sets.
+# Author:      Ray Smith
+# Created:     Thu Jun 14 08:21:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 2 ]  # Needs the UNLV data root and an id used to name this run's reports.
+then
+   echo "Usage:$0 unlv-data-dir version-id"
+   exit 1
+fi
+if [ ! -d ccmain ]  # Paths below are relative to the tesseract-ocr root.
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r ccmain/tesseract -a ! -r tesseract.exe ]  # Either binary will do.
+then
+  echo "Please build tesseract before running $0"
+  exit 1
+fi
+if [ ! -r testing/unlv/accuracy -a ! -r testing/unlv/accuracy.exe ]
+then
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+fi
+
+#deltapc new old prints the %change from old to new (0.00 when old is 0 or empty)
+deltapc() {
+awk -v newv="$1" -v oldv="$2" ' BEGIN {
+printf("%.2f", (oldv+0 == 0) ? 0 : 100.0*(newv-oldv)/oldv);
+}'
+}
+
+imdir="$1"
+vid="$2"
+bindir=${0%/*}  # Directory part of $0; the sibling scripts are run from there.
+if [ "$bindir" = "$0" ]  # No slash in $0, so nothing was stripped.
+then
+    bindir="./"
+fi
+rdir=testing/reports
+testsets="bus.3B doe3.3B mag.3B news.3B"
+
+totalerrs=0
+totalwerrs=0
+totalnswerrs=0
+totalolderrs=0
+totaloldwerrs=0
+totaloldnswerrs=0
+for set in $testsets
+do
+    if [ -r $imdir/$set/pages ]  # Sets without a pages file are skipped.
+    then
+	# Run tesseract on all the pages.
+	$bindir/runtestset.sh $imdir/$set/pages
+	# Count the errors on all the pages.
+	$bindir/counttestset.sh $imdir/$set/pages
+	# Get the old character, word and nonstop word errors (fields 3, 6 and 9).
+	olderrs=`cat testing/reports/1995.$set.sum | cut -f3`
+	oldwerrs=`cat testing/reports/1995.$set.sum | cut -f6`
+	oldnswerrs=`cat testing/reports/1995.$set.sum | cut -f9`
+	# New errors are on report line 4, accuracy on line 5; nonstop: 2nd Total line.
+	cherrs=`head -4 testing/reports/$set.characc |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]'`
+	chacc=`head -5 testing/reports/$set.characc |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]'`
+	wderrs=`head -4 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]'`
+	wdacc=`head -5 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
+	    tr -d '[:blank:]'`
+	nswderrs=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
+	    cut -c10-17 |tr -d '[:blank:]'`
+	nswdacc=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
+	    cut -c19-26 |tr -d '[:blank:]'`
+	# Compute the percent change relative to the 1995 figures.
+	chdelta=`deltapc $cherrs $olderrs`
+	wdelta=`deltapc $wderrs $oldwerrs`
+	nswdelta=`deltapc $nswderrs $oldnswerrs`
+	sumfile=$rdir/$vid.$set.sum
+	echo "$vid	$set	$cherrs	$chacc	$chdelta%	$wderrs	$wdacc\
+	$wdelta%	$nswderrs	$nswdacc	$nswdelta%" >$sumfile
+	# Sum totals over all the testsets.
+	let totalerrs=totalerrs+cherrs
+	let totalwerrs=totalwerrs+wderrs
+	let totalnswerrs=totalnswerrs+nswderrs
+	let totalolderrs=totalolderrs+olderrs
+	let totaloldwerrs=totaloldwerrs+oldwerrs
+	let totaloldnswerrs=totaloldnswerrs+oldnswerrs
+    fi
+done
+# Compute grand total percent change.
+chdelta=`deltapc $totalerrs $totalolderrs`
+wdelta=`deltapc $totalwerrs $totaloldwerrs`
+nswdelta=`deltapc $totalnswerrs $totaloldnswerrs `
+tfile=$rdir/$vid.total.sum
+echo "$vid	Total	$totalerrs	-	$chdelta%	$totalwerrs\
+	-	$wdelta%	$totalnswerrs	-	$nswdelta%" >$tfile
+cat $rdir/1995.*.sum $rdir/$vid.*.sum >$rdir/$vid.summary  # 1995 rows, then this run's.
--- a/testing/runtestset.sh
+++ b/testing/runtestset.sh
@ -0,0 +1,84 @
+#!/bin/bash
+# File:        runtestset.sh
+# Description: Script to run tesseract on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 10:13:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Usage: runtestset.sh pagesfile
+#   pagesfile is expected to be <imdir>/pages, where the last component of
+#   <imdir> is the set name. Each line holds a page name, optionally followed
+#   by a subdirectory of <imdir> containing <page>.tif. Tesseract is run on
+#   every page with testing/results/<setname>/<page> as its output base.
+
+if [ $# -ne 1 ]
+then
+  echo "Usage:$0 pagesfile"
+  exit 1
+fi
+if [ ! -d ccmain ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+# Prefer ccmain/tesseract; otherwise fall back to tesseract.exe in the root.
+if [ ! -r ccmain/tesseract ]
+then
+  if [ ! -r tesseract.exe ]
+  then
+    echo "Please build tesseract before running $0"
+    exit 1
+  else
+    tess="./tesseract.exe"
+  fi
+else
+  tess="ccmain/tesseract"
+  # TESSDATA_PREFIX is only set when the in-tree ccmain build is used.
+  export TESSDATA_PREFIX="$PWD/"
+fi
+
+pages=$1
+if [ ! -r "$pages" ]
+then
+  echo "Cannot read pages file $pages"
+  exit 1
+fi
+
+# Strip the trailing /pages to get the image directory; its last path
+# component is the set name.
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=testing/results/$setname
+echo "Testing on set $setname in directory $imdir to $resdir"
+mkdir -p "$resdir"
+# -r keeps backslashes literal; the -n test still processes a final line
+# that lacks a trailing newline.
+while read -r page dir || [ -n "$page" ]
+do
+  # Skip blank lines rather than running tesseract on "$srcdir/.tif".
+  if [ -z "$page" ]
+  then
+    continue
+  fi
+  # A pages file may be a list of files with subdirs or maybe just
+  # a plain list of files so accommodate both.
+  if [ "$dir" ]
+  then
+     srcdir="$imdir/$dir"
+  else
+     srcdir="$imdir"
+  fi
+#  echo "$srcdir/$page.tif"
+  # stdin is redirected so tesseract cannot consume the rest of the page list.
+  "$tess" "$srcdir/$page.tif" "$resdir/$page" nobatch unlv </dev/null
+done <"$pages"
				`@ -0,0 +1,2 @@`

				`EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum`
				`@ -0,0 +1 @@`
				`1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%`
				`@ -0,0 +1 @@`
				`1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%`
				`@ -0,0 +1 @@`
				`1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%`
				`@ -0,0 +1 @@`
				`1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%`