Fixed the extern C mismatches properly.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@82 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2024-11-27 20:59:36 +08:00 · 2007-07-18 01:00:54 +00:00 · 2007-07-18 01:00:54 +00:00 · 1943de9aa9
commit 1943de9aa9
parent f4baca27c8
9 changed files with 344 additions and 296 deletions
--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
@ -50,6 +50,7 @@
 #include          "notdll.h"
 #include "tordvars.h"
 #include "adaptmatch.h"
+#include "globals.h"

 #define MIN_FONT_ROW_COUNT  8
 #define MAX_XHEIGHT_DIFF  3
@ -148,18 +149,9 @@ EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
 EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");

 extern int MatcherDebugLevel;
-extern "C" { extern int display_ratings; }
+extern int display_ratings;
 extern int number_debug;
 extern int adjust_debug;
-/*
-extern "C" {
-  extern int 	MatcherDebugLevel;
-  extern int 	display_ratings;
-  extern int	number_debug;
-  extern int	adjust_debug;
-//	extern int 	LearningDebugLevel;
- };
-*/
 FILE *choice_file = NULL;        //Choice file ptr

 CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
@ -569,8 +561,8 @@ if (dopasses==1) return;

 ////changed by jetsoft
 //needed for dll to output memory structure
-  if ((dopasses==0 || dopasses==2) && monitor)
-	output_pass (page_res_it,true, target_word_box);
+  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
+	output_pass(page_res_it, ocr_char_space() > 0, target_word_box);
 // end jetsoft

 }
@ -620,34 +612,33 @@ void classify_word_pass1(                 //recog one word
    tess_default_matcher,
    word->raw_choice, &blob_choices,
    word->outword);
-
  /*
     Test for TESS screw up on word. Recog_word has already ensured that the
     choice list, outword blob lists and best_choice string are the same
     length. A TESS screw up is indicated by a blank filled or 0 length string.
   */
-  if ((word->best_choice->string ().length () == 0) ||
+  if ((word->best_choice->lengths ().length () == 0) ||
    (strspn (word->best_choice->string ().string (), " ") ==
  word->best_choice->string ().length ())) {
    word->done = FALSE;          //Try again on pass2 - adaption may help
    word->tess_failed = TRUE;
-    word->reject_map.initialise (word->best_choice->string ().length ());
+    word->reject_map.initialise (word->best_choice->lengths ().length ());
    word->reject_map.rej_word_tess_failure ();
  }
  else {
    word->tess_failed = FALSE;
-    if ((word->best_choice->string ().length () !=
+    if ((word->best_choice->lengths ().length () !=
      word->outword->blob_list ()->length ()) ||
-    (word->best_choice->string ().length () != blob_choices.length ())) {
+    (word->best_choice->lengths ().length () != blob_choices.length ())) {
      tprintf
        ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
        word->best_choice->string ().string (),
-        word->best_choice->string ().length (),
+        word->best_choice->lengths ().length (),
        word->outword->blob_list ()->length (), blob_choices.length ());
    }
-    ASSERT_HOST (word->best_choice->string ().length () ==
+    ASSERT_HOST (word->best_choice->lengths ().length () ==
      word->outword->blob_list ()->length ());
-    ASSERT_HOST (word->best_choice->string ().length () ==
+    ASSERT_HOST (word->best_choice->lengths ().length () ==
      blob_choices.length ());

    /*
@ -664,12 +655,12 @@ void classify_word_pass1(                 //recog one word
      fix_rep_char(word);
    }
    else {
-      fix_quotes ((char *) word->best_choice->string ().string (),
+      fix_quotes (word->best_choice,
      //turn to double
        word->outword, &blob_choices);
      if (tessedit_fix_hyphens)
                                 //turn 2 to 1
-        fix_hyphens ((char *) word->best_choice->string ().string (), word->outword, &blob_choices);
+        fix_hyphens (word->best_choice, word->outword, &blob_choices);
      record_certainty (word->best_choice->certainty (), 1);
      //accounting

@ -692,7 +683,7 @@ void classify_word_pass1(                 //recog one word
          rejmap = NULL;
        else {
          ASSERT_HOST (word->reject_map.length () ==
-            word->best_choice->string ().length ());
+            word->best_choice->lengths ().length ());

          for (index = 0; index < word->reject_map.length (); index++) {
            if (adapt_ok || word->reject_map[index].accepted ())
@ -704,7 +695,9 @@ void classify_word_pass1(                 //recog one word
        }

                                 //adapt to it
-        tess_adapter (word->outword, &word->denorm, word->best_choice->string ().string (), word->raw_choice->string ().string (), rejmap);
+        tess_adapter (word->outword, &word->denorm,
+                      *word->best_choice,
+                      *word->raw_choice, rejmap);
      }

      if (tessedit_enable_doc_dict)
@ -712,10 +705,12 @@ void classify_word_pass1(                 //recog one word
      set_word_fonts(word, &blob_choices);
    }
  }
+#if 0
  if (tessedit_print_text) {
    write_cooked_text (bln_word, word->best_choice->string (),
      word->done, FALSE, stdout);
  }
+#endif
  delete bln_word;
  blob_choices.deep_clear ();
 }
@ -898,10 +893,12 @@ void classify_word_pass2(  //word to do
 #endif

  set_global_subloc_code(SUBLOC_NORM);
+#if 0
  if (tessedit_print_text) {
    write_cooked_text (word->outword, word->best_choice->string (),
      word->done, done_this_pass, stdout);
  }
+#endif
  check_debug_pt (word, 50);
 }

@ -971,18 +968,18 @@ void match_word_pass2(                 //recog one word
    //              tprintf("Empty word produced\n");
  }
  else {
-    if ((word->best_choice->string ().length () !=
+    if ((word->best_choice->lengths ().length () !=
      word->outword->blob_list ()->length ()) ||
-    (word->best_choice->string ().length () != blob_choices.length ())) {
+    (word->best_choice->lengths ().length () != blob_choices.length ())) {
      tprintf
        ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
        word->best_choice->string ().string (),
-        word->best_choice->string ().length (),
+        word->best_choice->lengths ().length (),
        word->outword->blob_list ()->length (), blob_choices.length ());
    }
-    ASSERT_HOST (word->best_choice->string ().length () ==
+    ASSERT_HOST (word->best_choice->lengths ().length () ==
      word->outword->blob_list ()->length ());
-    ASSERT_HOST (word->best_choice->string ().length () ==
+    ASSERT_HOST (word->best_choice->lengths ().length () ==
      blob_choices.length ());

    word->tess_failed = FALSE;
@ -990,29 +987,29 @@ void match_word_pass2(                 //recog one word
      fix_rep_char(word);
    }
    else {
-      fix_quotes ((char *) word->best_choice->string ().string (),
+      fix_quotes (word->best_choice,
        word->outword, &blob_choices);
      if (tessedit_fix_hyphens)
-        fix_hyphens ((char *) word->best_choice->string ().string (),
+        fix_hyphens (word->best_choice,
          word->outword, &blob_choices);
      /* Dont trust fix_quotes! - though I think I've fixed the bug */
-      if ((word->best_choice->string ().length () !=
-        word->outword->blob_list ()->length ()) ||
-        (word->best_choice->string ().length () !=
-      blob_choices.length ())) {
+      if ((word->best_choice->lengths ().length () !=
+           word->outword->blob_list ()->length ()) ||
+          (word->best_choice->lengths ().length () !=
+           blob_choices.length ())) {
        #ifndef SECURE_NAMES
        tprintf
          ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
-          word->best_choice->string ().string (),
-          word->best_choice->string ().length (),
-          word->outword->blob_list ()->length (),
-          blob_choices.length ());
+           word->best_choice->string ().string (),
+           word->best_choice->lengths ().length (),
+           word->outword->blob_list ()->length (),
+           blob_choices.length ());
        #endif

      }
-      ASSERT_HOST (word->best_choice->string ().length () ==
+      ASSERT_HOST (word->best_choice->lengths ().length () ==
        word->outword->blob_list ()->length ());
-      ASSERT_HOST (word->best_choice->string ().length () ==
+      ASSERT_HOST (word->best_choice->lengths ().length () ==
        blob_choices.length ());

      word->tess_accepted = tess_acceptable_word (word->best_choice,
@ -1039,7 +1036,7 @@ void fix_rep_char(                //Repeated char word
                 ) {
  struct REP_CH
  {
-    char ch;
+    char ch[UNICHAR_LEN + 1];
    int count;
  };

@ -1048,19 +1045,25 @@ void fix_rep_char(                //Repeated char word
  int rep_ch_count = 0;          //how many unique chs
  const char *word_str;          //the repeated chs
  int i, j;
+  int offset;
  int total = 0;
  int max = 0;
-  char maxch = ' ';              //Most common char
+  char *maxch = NULL;              //Most common char

  word_str = word->best_choice->string ().string ();
-  word_len = strlen (word_str);
+  word_len = word->best_choice->lengths ().length ();;
  rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH));
-  for (i = 0; i < word_len; i++) {
-    for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++);
+  for (i = 0, offset = 0; i < word_len;
+       offset += word->best_choice->lengths()[i++]) {
+    for (j = 0; j < rep_ch_count &&
+             strncmp(rep_ch[j].ch, word_str + offset,
+                     word->best_choice->lengths()[i]) != 0; j++);
    if (j < rep_ch_count)
      rep_ch[j].count++;
    else {
-      rep_ch[rep_ch_count].ch = word_str[i];
+      strncpy(rep_ch[rep_ch_count].ch, word_str + offset,
+              word->best_choice->lengths()[i]);
+      rep_ch[rep_ch_count].ch[word->best_choice->lengths()[i]] = '\0';
      rep_ch[rep_ch_count].count = 1;
      rep_ch_count++;
    }
@ -1068,7 +1071,7 @@ void fix_rep_char(                //Repeated char word

  for (j = 0; j < rep_ch_count; j++) {
    total += rep_ch[j].count;
-    if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) {
+    if ((rep_ch[j].count > max) && (*rep_ch[j].ch != ' ')) {
      max = rep_ch[j].count;
      maxch = rep_ch[j].ch;
    }
@ -1078,26 +1081,47 @@ void fix_rep_char(                //Repeated char word
  free_mem(rep_ch);

  word->reject_map.initialise (word_len);
-  for (i = 0; i < word_len; i++) {
-    if (word_str[i] != maxch)
+  for (i = 0, offset = 0; i < word_len;
+       offset += word->best_choice->lengths()[i++]) {
+    if (strncmp(word_str + offset, maxch,
+                word->best_choice->lengths()[i]) != 0)
                                 //rej unrecognised blobs
      word->reject_map[i].setrej_bad_repetition ();
  }
  word->done = TRUE;
 }

+// TODO(tkielbus) Decide between keeping this behavior here or modifying the
+// training data.
+
+// Helper for fix_quotes: tests whether the character that starts at
+// signed_str and occupies length bytes of UTF-8 is a single quote mark.
+// Returns non-zero for an ASCII apostrophe or backtick, U+2018 or U+2019.
+static int is_simple_quote(const char* signed_str, int length) {
+  const unsigned char* str = reinterpret_cast<const unsigned char*>(signed_str);
+   //unsigned view lets bytes >= 0x80 compare equal to the hex constants below
+  return (length == 1 && (*str == '\'' || *str == '`')) ||  // 1-byte ASCII quotes
+      //3-byte UTF-8 curved single quotes
+      (length == 3 && ((*str == 0xe2 &&
+                        *(str + 1) == 0x80 &&
+                        *(str + 2) == 0x98) ||  // e2 80 98 = U+2018, left quote
+                       (*str == 0xe2 &&
+                        *(str + 1) == 0x80 &&
+                        *(str + 2) == 0x99)));  // e2 80 99 = U+2019, right quote
+}

 /**********************************************************************
 * fix_quotes
 *
 * Change pairs of quotes to double quotes.
 **********************************************************************/
-
 void fix_quotes(               //make double quotes
-                char *string,  //string to fix
+                WERD_CHOICE *choice,  //choice to fix
                WERD *word,    //word to do //char choices
                BLOB_CHOICE_LIST_CLIST *blob_choices) {
-  char *ptr;                     //string ptr
+  char *str = (char *) choice->string().string();//string ptr
+  int i;
+  int offset;
                                 //blobs
  PBLOB_IT blob_it = word->blob_list ();
                                 //choices
@ -1105,12 +1129,20 @@ void fix_quotes(               //make double quotes
  BLOB_CHOICE_IT it1;            //first choices
  BLOB_CHOICE_IT it2;            //second choices

-  for (ptr = string;
-  *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
-    if ((*ptr == '\'' || *ptr == '`')
-    && (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) {
-      *ptr = '"';                //turn to double
-      strcpy (ptr + 1, ptr + 2); //shuffle up
+  for (i = 0, offset = 0; str[offset] != '\0';
+       offset += choice->lengths()[i++],
+           blob_it.forward (), choice_it.forward ()) {
+    if (str[offset + choice->lengths()[i]] != '\0' &&
+        is_simple_quote(str + offset, choice->lengths()[i]) &&
+        is_simple_quote(str + offset + choice->lengths()[i],
+                        choice->lengths()[i + 1])) {
+      str[offset] = '"';                //turn to double
+      strcpy (str + offset + 1,
+              str + offset + choice->lengths()[i] +
+              choice->lengths()[i + 1]); //shuffle up
+      choice->lengths()[i] = 1;
+      strcpy ((char*) choice->lengths().string() + i + 1,
+              choice->lengths().string() + i + 2);
      merge_blobs (blob_it.data (), blob_it.data_relative (1));
      blob_it.forward ();
      delete blob_it.extract (); //get rid of spare
@ -1138,12 +1170,13 @@ void fix_quotes(               //make double quotes
 * Change pairs of hyphens to a single hyphen if the bounding boxes touch
 * Typically a long dash which has been segmented.
 **********************************************************************/
-
 void fix_hyphens(               //crunch double hyphens
-                 char *string,  //string to fix
+                 WERD_CHOICE *choice,  //choice to fix
                 WERD *word,    //word to do //char choices
                 BLOB_CHOICE_LIST_CLIST *blob_choices) {
-  char *ptr;                     //string ptr
+  char *str = (char *) choice->string().string();//string ptr
+  int i;
+  int offset;
                                 //blobs
  PBLOB_IT blob_it = word->blob_list ();
                                 //choices
@ -1151,14 +1184,20 @@ void fix_hyphens(               //crunch double hyphens
  BLOB_CHOICE_IT it1;            //first choices
  BLOB_CHOICE_IT it2;            //second choices

-  for (ptr = string;
-  *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
-    if ((*ptr == '-' || *ptr == '~') &&
-      (*(ptr + 1) == '-' || *(ptr + 1) == '~') &&
+  for (i = 0, offset = 0; str[offset] != '\0';
+  offset += choice->lengths()[i++],
+           blob_it.forward (), choice_it.forward ()) {
+    if ((str[offset] == '-' || str[offset] == '~') &&
+      (str[offset + choice->lengths()[i]] == '-' ||
+       str[offset + choice->lengths()[i]] == '~') &&
      (blob_it.data ()->bounding_box ().right () >=
    blob_it.data_relative (1)->bounding_box ().left ())) {
-      *ptr = '-';                //turn to single hyphen
-      strcpy (ptr + 1, ptr + 2); //shuffle up
+      str[offset] = '-';                //turn to single hyphen
+      strcpy (str + offset + choice->lengths()[i],
+              str + offset + choice->lengths()[i] +
+              choice->lengths()[i + 1]); //shuffle up
+      strcpy ((char*) choice->lengths().string() + i + 1,
+              choice->lengths().string() + i + 2);
      merge_blobs (blob_it.data (), blob_it.data_relative (1));
      blob_it.forward ();
      delete blob_it.extract (); //get rid of spare
@ -1249,11 +1288,9 @@ void choice_dump_tester(                           //dump chars in word
  it.set_to_list (ratings);
  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
    blob_choice = it.data ();
-    if ((blob_choice->char_class () >= '!') &&
-      (blob_choice->char_class () <= '~'))
-      fprintf (choice_file, "\t%c\t%f\t%f",
-        blob_choice->char_class (),
-        blob_choice->rating (), blob_choice->certainty ());
+    fprintf (choice_file, "\t%s\t%f\t%f",
+             blob_choice->unichar (),
+             blob_choice->rating (), blob_choice->certainty ());
  }
  fprintf (choice_file, "\n");
 }
@ -1290,33 +1327,37 @@ WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) {
 }


-ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
+ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
+                                            const char *lengths) {
  int i = 0;
+  int offset = 0;
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

-  if (strlen (s) > 20)
+  if (strlen (lengths) > 20)
    return word_type;

  /* Single Leading punctuation char*/

-  if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i])))
-    i++;
+  if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))
+    offset += lengths[i++];
  leading_punct_count = i;

  /* Initial cap */
-  while (isupper (s[i])) {
-    i++;
+  while ((s[offset] != '\0') &&
+         unicharset.get_isupper(s + offset, lengths[i])) {
+    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1)
    word_type = AC_UPPER_CASE;
  else {
    /* Lower case word, possibly with an initial cap */
-    while (islower (s[i])) {
-      i++;
+    while ((s[offset] != '\0') &&
+           unicharset.get_islower (s + offset, lengths[i])) {
+      offset += lengths[i++];
    }
    if (i - leading_punct_count < quality_min_initial_alphas_reqd)
      goto not_a_word;
@ -1324,11 +1365,13 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
    Allow a single hyphen in a lower case word
    - dont trust upper case - I've seen several cases of "H" -> "I-I"
    */
-    if (s[i] == '-') {
-      hyphen_pos = i++;
-      if (s[i] != '\0') {
-        while (islower (s[i])) {
-          i++;
+    if (lengths[i] == 1 && s[offset] == '-') {
+      hyphen_pos = i;
+      offset += lengths[i++];
+      if (s[offset] != '\0') {
+        while ((s[offset] != '\0') &&
+               unicharset.get_islower(s + offset, lengths[i])) {
+          offset += lengths[i++];
        }
        if (i < hyphen_pos + 3)
          goto not_a_word;
@ -1336,8 +1379,11 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
    }
    else {
      /* Allow "'s" in NON hyphenated lower case words */
-      if ((s[i] == '\'') && (s[i + 1] == 's'))
-        i += 2;
+      if (lengths[i] == 1 && (s[offset] == '\'') &&
+          lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
+        offset += lengths[i++];
+        offset += lengths[i++];
+      }
    }
    if (upper_count > 0)
      word_type = AC_INITIAL_CAP;
@ -1346,13 +1392,15 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
  }

  /* Up to two different, constrained trailing punctuation chars */
-  if ((s[i] != '\0') && (STRING (chs_trailing_punct1).contains (s[i])))
-    i++;
-  if ((s[i] != '\0') &&
-    (s[i - 1] != s[i]) && (STRING (chs_trailing_punct2).contains (s[i])))
-    i++;
+  if (lengths[i] == 1 && (s[offset] != '\0') &&
+      (STRING (chs_trailing_punct1).contains (s[offset])))
+    offset += lengths[i++];
+  if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&
+    (s[offset - lengths[i - 1]] != s[offset]) &&
+      (STRING (chs_trailing_punct2).contains (s[offset])))
+    offset += lengths[i++];

-  if (s[i] != '\0')
+  if (s[offset] != '\0')
    word_type = AC_UNACCEPTABLE;

  not_a_word:
@ -1360,17 +1408,26 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    i = 0;
-    if (isupper (s[0])) {
+    offset = 0;
+    if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {
      word_type = AC_UC_ABBREV;
-      while ((s[i] != '\0') && isupper (s[i]) && (s[i + 1] == '.'))
-        i += 2;
+      while ((s[offset] != '\0') &&
+             unicharset.get_isupper(s + offset, lengths[i]) &&
+             (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
+        offset += lengths[i++];
+        offset += lengths[i++];
+      }
    }
-    else if (islower (s[0])) {
+    else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {
      word_type = AC_LC_ABBREV;
-      while ((s[i] != '\0') && islower (s[i]) && (s[i + 1] == '.'))
-        i += 2;
+      while ((s[offset] != '\0') &&
+             unicharset.get_islower(s + offset, lengths[i]) &&
+             (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
+        offset += lengths[i++];
+        offset += lengths[i++];
+      }
    }
-    if (s[i] != '\0')
+    if (s[offset] != '\0')
      word_type = AC_UNACCEPTABLE;
  }

@ -1478,7 +1535,8 @@ void set_word_fonts(                 //good chars in word
                    WERD_RES *word,  //word to adapt to //detailed results
                    BLOB_CHOICE_LIST_CLIST *blob_choices) {
  INT32 index;                   //char index
-  char choice_char;              //char from word
+  INT32 offset;                  //char offset
+  char choice_char[UNICHAR_LEN + 1];    //char from word
  INT8 config;                   //font of char
                                 //character iterator
  BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
@ -1517,16 +1575,19 @@ void set_word_fonts(                 //good chars in word

  word->italic = 0;
  word->bold = 0;
-  for (char_it.mark_cycle_pt (), index = 0;
-  !char_it.cycled_list (); char_it.forward (), index++) {
-    choice_char = word->best_choice->string ()[index];
+  for (char_it.mark_cycle_pt (), index = 0, offset = 0;
+  !char_it.cycled_list (); char_it.forward (),
+           offset += word->best_choice->lengths()[index++]) {
+    strncpy(choice_char, word->best_choice->string ().string() + offset,
+            word->best_choice->lengths()[index]);
+    choice_char[word->best_choice->lengths()[index]] = '\0';
    choice_it.set_to_list (char_it.data ());
    for (choice_it.mark_cycle_pt (); !choice_it.cycled_list ();
-    choice_it.forward ()) {
-      if (choice_it.data ()->char_class () == choice_char) {
+         choice_it.forward ()) {
+      if (strcmp(choice_it.data ()->unichar (), choice_char) == 0) {
        config = choice_it.data ()->config ();
        if (tessedit_debug_fonts)
-          tprintf ("%c(%d=%d%c%c)",
+          tprintf ("%s(%d=%d%c%c)",
            choice_char, config, (config & 31) >> 2,
            config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
        if (config != -1) {
--- a/ccmain/tfacep.h
+++ b/ccmain/tfacep.h
@ -46,66 +46,8 @@
 typedef void (*TESS_TESTER) (TBLOB *, BOOL8, char *, INT32, LIST);
 typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *, void *, TEXTROW *);

-extern "C"
-{
-  /*
-  int							start_recog(				//Real main in C
-  int							argc,
-  char						*argv[]);
-  void						program_editup2(			//afterforking part
-  int							argc,
-  char**						argv);
-
-  int							end_recog(					//Real main in C
-  int							argc,
-  char						*argv[]);
-  void						set_interactive_pass();
-  void						set_pass1();
-  void						set_pass2();
-  //ARRAY						cc_recog(TWERD*,TESS_CHOICE*,TESS_CHOICE*,TESS_TESTER,
-  //										TESS_TESTER);*/
-  //void                                          wo_learn_blob(TBLOB*,TEXTROW*,char*,INT32);
-  //LIST                                  AdaptiveClassifier(TBLOB*,TBLOB*,TEXTROW*);
-  //void                                          LearnBlob(TBLOB*,TEXTROW*,char*,INT32);
-  //TWERD                                         *newword();
-  //TBLOB                                         *newblob();
-  //TESSLINE                                      *newoutline();
-  //EDGEPT                                                *newedgept();
-  //void                                          oldedgept(EDGEPT*);
-  //void                                          destroy_nodes(void*,void (*)(void*));
-  //TESS_LIST                                     *append_choice(TESS_LIST*,char*,double,double,char);
-  //void                                          fix_quotes (char*);
-  //void                                          record_certainty(double,int);
-  //int                                                   AcceptableResult(A_CHOICE*,A_CHOICE*);
-  //int                                                   AdaptableWord(TWERD*,const char*,const char*);
-  //void                                          delete_word(TWERD*);
-  //void                                          free_blob(TBLOB*);
-  //void                                          add_document_word(A_CHOICE*);
-  //void                                          AdaptToWord(TWERD*,TEXTROW*,const char*,const char*,const char*);
-  //void                                          SaveBadWord(const char*,double);
-  //void                                          free_choice(TESS_CHOICE*);
-  //TWERD                                         *newword();
-  //TBLOB                                         *newblob();
-  //void                                          free_blob(                                      //free a blob
-  //      TBLOB                                           *blob);                                         //blob to free
-
-  //int                                                   dict_word( const char* );
-
-  //extern int                                    tess_cn_matching;
-  //extern int                                    tess_bn_matching;
-  //extern int                                    last_word_on_line;
-  extern TEXTROW normalized_row;
-  //extern TESS_MATCHER                   blob_matchers[];
-  //extern FILE                                   *rawfile;
-  //extern FILE                                   *textfile;
-  //extern int                                    character_count;
-  //extern int                                    word_count;
-  //extern int                                    enable_assoc;
-  //extern int                                    chop_enable;
-  //extern int                                    permute_only_top;
-  extern int display_ratings;
-
-};
+extern TEXTROW normalized_row;
+extern int display_ratings;

 #if 0
 #define strsave(s)    \
--- a/ccutil/tessopt.cpp
+++ b/ccutil/tessopt.cpp
@ -23,8 +23,8 @@
 #include          "tessopt.h"
 #include          "notdll.h"     //must be last include

-int optind;
-char *optarg;
+int tessoptind;
+char *tessoptarg;

 /**********************************************************************
 * tessopt
@ -37,22 +37,22 @@ INT32 argc,                      //arg count
 char *argv[],                    //args
 const char *arglist                    //string of arg chars
 ) {
-  char *arg;                     //arg char
+  const char *arg;                     //arg char

-  if (optind == 0)
-    optind = 1;
-  if (optind < argc && argv[optind][0] == '-') {
-    arg = strchr (arglist, argv[optind][1]);
+  if (tessoptind == 0)
+    tessoptind = 1;
+  if (tessoptind < argc && argv[tessoptind][0] == '-') {
+    arg = strchr (arglist, argv[tessoptind][1]);
    if (arg == NULL || *arg == ':')
      return '?';                //dud option
-    optind++;
-    optarg = argv[optind];
+    tessoptind++;
+    tessoptarg = argv[tessoptind];
    if (arg[1] == ':') {
-      if (argv[optind - 1][2] != '\0')
+      if (argv[tessoptind - 1][2] != '\0')
                                 //immediately after
-        optarg = argv[optind - 1] + 2;
+        tessoptarg = argv[tessoptind - 1] + 2;
      else
-        optind++;
+        tessoptind++;
    }
    return *arg;
  }
--- a/ccutil/tessopt.h
+++ b/ccutil/tessopt.h
@ -20,8 +20,8 @@
 #include          "host.h"
 #include          "notdll.h"     //must be last include

-extern int optind;
-extern char *optarg;
+extern int tessoptind;
+extern char *tessoptarg;

 int tessopt (                     //parse args
 INT32 argc,                      //arg count
--- a/classify/extern.h
+++ b/classify/extern.h
@ -28,7 +28,7 @@
 */

 #ifdef __cplusplus
-#define EXTERN extern "C"
+#define EXTERN extern
 #else
 #define EXTERN extern
 #endif
--- a/cutil/globals.h
+++ b/cutil/globals.h
@ -39,14 +39,12 @@
 extern TBLOB *pageblobs;         /*first blob on page */
 extern TEXTBLOCK *pageblocks;    /*first block on page */
                                 /*class definitions */
-extern char classes[CLASSIZE][CLASSLENGTH];
+/* extern char classes[CLASSIZE][CLASSLENGTH]; */
 extern int resolution;           /*scanner res in dpi */
 extern int acts[MAXPROC];        /*action flags */
 extern int debugs[MAXPROC];      /*debug flags */
 extern int plots[MAXPROC];       /*plot flags */
 extern int corners[4];           /*corners of scan window */
-extern int optind;               /*option index */
-extern char *optarg;             /*option argument */
                                 /*image file name */
 extern char imagefile[FILENAMESIZE];
                                 /* main directory */
--- a/training/cnTraining.cpp
+++ b/training/cnTraining.cpp
@ -37,6 +37,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
+#include "unichar.h"

 #define MAXNAMESIZE	80
 #define MAX_NUM_SAMPLES	10000
@ -219,21 +220,34 @@ int main (
 	ParseArguments (argc, argv);
 	while ((PageName = GetNextFilename()) != NULL)
 	{
-		printf ("\nReading %s ...", PageName);
+		printf ("Reading %s ...\n", PageName);
 		TrainingPage = Efopen (PageName, "r");
 		ReadTrainingSamples (TrainingPage, &CharList);
 		fclose (TrainingPage);
 		//WriteTrainingSamples (Directory, CharList);
 	}
+        printf("Clustering ...\n");
 	pCharList = CharList;
 	iterate(pCharList)
 	{
-		//Cluster
-		CharSample = (LABELEDLIST) first_node (pCharList);
-		printf ("\nClustering %s ...", CharSample->Label);
-		Clusterer = SetUpForClustering(CharSample);
-		ProtoList = ClusterSamples(Clusterer, &Config);
-		AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
+          //Cluster
+          CharSample = (LABELEDLIST) first_node (pCharList);
+          //printf ("\nClustering %s ...", CharSample->Label);
+          Clusterer = SetUpForClustering(CharSample);
+          float SavedMinSamples = Config.MinSamples;
+          while (Config.MinSamples > 0.001) {
+            ProtoList = ClusterSamples(Clusterer, &Config);
+            if (NumberOfProtos(ProtoList, 1, 0) > 0)
+              break;
+            else {
+              Config.MinSamples *= 0.95;
+              printf("0 significant protos for %s."
+                     " Retrying clustering with MinSamples = %f%%\n",
+                     CharSample->Label, Config.MinSamples);
+            }
+          }
+          Config.MinSamples = SavedMinSamples;
+          AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
 	}
 	FreeTrainingSamples (CharList);
 	WriteNormProtos (Directory, NormProtoList, Clusterer);
@ -262,7 +276,7 @@ void ParseArguments(
 **		ShowSignificantProtos	flag controlling proto display
 **		ShowInsignificantProtos	flag controlling proto display
 **		Config			current clustering parameters
-**		optarg, optind		defined by tessopt sys call
+**		tessoptarg, tessoptind		defined by tessopt sys call
 **		Argc, Argv		global copies of argc and argv
 **	Operation:
 **		This routine parses the command line arguments that were
@ -287,7 +301,6 @@ void ParseArguments(
 	int		Option;
 	int		ParametersRead;
 	BOOL8		Error;
-	extern char	*optarg;

 	Error = FALSE;
 	Argc = argc;
@ -297,48 +310,48 @@ void ParseArguments(
 		switch ( Option )
 		{
 		case 'n':
-      sscanf(optarg,"%d", &ParametersRead);
+      sscanf(tessoptarg,"%d", &ParametersRead);
 			ShowInsignificantProtos = ParametersRead;
 			break;
 		case 'p':
-      sscanf(optarg,"%d", &ParametersRead);
+      sscanf(tessoptarg,"%d", &ParametersRead);
 			ShowSignificantProtos = ParametersRead;
 			break;
 		case 'd':
 			ShowAllSamples = FALSE;
 			break;
 		case 'C':
-			ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) );
+			ParametersRead = sscanf( tessoptarg, "%lf", &(Config.Confidence) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.Confidence > 1 ) Config.Confidence = 1;
 			else if ( Config.Confidence < 0 ) Config.Confidence = 0;
 			break;
 		case 'I':
-			ParametersRead = sscanf( optarg, "%f", &(Config.Independence) );
+			ParametersRead = sscanf( tessoptarg, "%f", &(Config.Independence) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.Independence > 1 ) Config.Independence = 1;
 			else if ( Config.Independence < 0 ) Config.Independence = 0;
 			break;
 		case 'M':
-			ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) );
+			ParametersRead = sscanf( tessoptarg, "%f", &(Config.MinSamples) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
 			else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
 			break;
 		case 'B':
-			ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) );
+			ParametersRead = sscanf( tessoptarg, "%f", &(Config.MaxIllegal) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
 			else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
 			break;
 		case 'R':
-			ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy );
+			ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
 			else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
 			break;
 		case 'S':
-			switch ( optarg[0] )
+			switch ( tessoptarg[0] )
 			{
 			case 's': Config.ProtoStyle = spherical; break;
 			case 'e': Config.ProtoStyle = elliptical; break;
@ -348,10 +361,10 @@ void ParseArguments(
 			}
 			break;
 			case 'D':
-				Directory = optarg;
+				Directory = tessoptarg;
 				break;
 			case 'N':
-				if (sscanf (optarg, "%d", &MaxNumSamples) != 1 ||
+				if (sscanf (tessoptarg, "%d", &MaxNumSamples) != 1 ||
 					MaxNumSamples <= 0)
 					Error = TRUE;
 				break;
@ -375,7 +388,7 @@ char *GetNextFilename ()
 /*
 **	Parameters: none
 **	Globals:
-**		optind			defined by tessopt sys call
+**		tessoptind			defined by tessopt sys call
 **		Argc, Argv		global copies of argc and argv
 **	Operation:
 **		This routine returns the next command line argument.  If
@ -388,8 +401,8 @@ char *GetNextFilename ()
 */

 {
-	if (optind < Argc)
-		return (Argv [optind++]);
+	if (tessoptind < Argc)
+		return (Argv [tessoptind++]);
 	else
 		return (NULL);

@ -417,32 +430,32 @@ void ReadTrainingSamples (
 */

 {
-	char			CharName[MAXNAMESIZE];
+	char		unichar[UNICHAR_LEN + 1];
 	LABELEDLIST	CharSample;
 	FEATURE_SET	FeatureSamples;
-	CHAR_DESC		CharDesc;
-	int			Type, i;
+	CHAR_DESC	CharDesc;
+	int		Type, i;

-	while (fscanf (File, "%s %s", FontName, CharName) == 2) {
-		CharSample = FindList (*TrainingSamples, CharName);
-		if (CharSample == NULL) {
-			CharSample = NewLabeledList (CharName);
-			*TrainingSamples = push (*TrainingSamples, CharSample);
-		}
-		CharDesc = ReadCharDescription (File);
-		Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
-		FeatureSamples = FeaturesOfType(CharDesc, Type);
-    for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
-      FEATURE f = FeatureSamples->Features[feature];
-      for (int dim =0; dim < f->Type->NumParams; ++dim)
-        f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
-    }
-		CharSample->List = push (CharSample->List, FeatureSamples);
-		for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
-			if (Type != i)
-				FreeFeatureSet (FeaturesOfType (CharDesc, i));
-		free (CharDesc);
-    }
+	while (fscanf (File, "%s %s", FontName, unichar) == 2) {
+          CharSample = FindList (*TrainingSamples, unichar);
+          if (CharSample == NULL) {
+            CharSample = NewLabeledList (unichar);
+            *TrainingSamples = push (*TrainingSamples, CharSample);
+          }
+          CharDesc = ReadCharDescription (File);
+          Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
+          FeatureSamples = FeaturesOfType(CharDesc, Type);
+          for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
+            FEATURE f = FeatureSamples->Features[feature];
+            for (int dim =0; dim < f->Type->NumParams; ++dim)
+              f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
+          }
+          CharSample->List = push (CharSample->List, FeatureSamples);
+          for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
+            if (Type != i)
+              FreeFeatureSet (FeaturesOfType (CharDesc, i));
+          free (CharDesc);
+        }
 }	// ReadTrainingSamples

 /*---------------------------------------------------------------------------*/
@ -606,7 +619,6 @@ void WriteNormProtos (
 	char		Filename[MAXNAMESIZE];
 	LABELEDLIST LabeledProto;
 	int N;
-	char Label;

 	strcpy (Filename, "");
 	if (Directory != NULL)
@ -623,9 +635,17 @@ void WriteNormProtos (
 	{
 		LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
 		N = NumberOfProtos(LabeledProto->List,
-			ShowSignificantProtos, ShowInsignificantProtos);
-		Label = NameToChar(LabeledProto->Label);
-		fprintf(File, "\n%c %d\n", Label, N);
+		ShowSignificantProtos, ShowInsignificantProtos);
+                if (N < 1) {
+                  printf ("\nError! Not enough protos for %s: %d protos"
+                          " (%d significant protos"
+                          ", %d insignificant protos)\n",
+                          LabeledProto->Label, N,
+                          NumberOfProtos(LabeledProto->List, 1, 0),
+                          NumberOfProtos(LabeledProto->List, 0, 1));
+                  exit(1);
+                }
+		fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
 		WriteProtos(File, Clusterer->SampleSize, LabeledProto->List,
 			ShowSignificantProtos, ShowInsignificantProtos);
 	}
--- a/training/mfTraining.cpp
+++ b/training/mfTraining.cpp
@ -44,6 +44,9 @@
 #include "intproto.h"
 #include "variables.h"
 #include "freelist.h"
+#include "efio.h"
+#include "danerror.h"
+#include "globals.h"

 #include <string.h>
 #include <stdio.h>
@ -73,7 +76,6 @@ typedef MERGE_CLASS_NODE* MERGE_CLASS;

 // Rounds x to the nearest multiple of frag, using floor(q + .5) on the
 // quotient. Both arguments are parenthesised so that expression arguments
 // such as round(a + b, step) expand with the intended precedence.
 // NOTE(review): frag is used as a divisor - callers presumably guard
 // against a zero value; confirm at the call sites.
 #define round(x,frag) (floor((x)/(frag)+.5)*(frag))

-
 /**----------------------------------------------------------------------------
 					Public Function Prototypes
 ----------------------------------------------------------------------------**/
@ -164,21 +166,7 @@ void Normalize (
 void SetUpForFloat2Int(
 	LIST LabeledClassList);

-void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) {
-  FILE* fp = Efopen(filename, "wb");
-  /* then write out each class */
-  for (int i = 0; i < NumClassesIn (Templates); i++) {
-    int MaxLength = 0;
-    INT_CLASS Class = ClassForIndex (Templates, i);
-    for (int ConfigId = 0; ConfigId < NumIntConfigsIn (Class); ConfigId++) {
-      if (LengthForConfigId (Class, ConfigId) > MaxLength)
-        MaxLength = LengthForConfigId (Class, ConfigId);
-    }
-    fprintf(fp, "%c %d\n", ClassIdForIndex(Templates, i), MaxLength);
-  }
-  fclose(fp);
-}
-
+void WritePFFMTable(INT_TEMPLATES Templates, const char* filename);

 //--------------Global Data Definitions and Declarations--------------
 static char FontName[MAXNAMESIZE];
@ -200,6 +188,9 @@ static CLUSTERCONFIG Config =

 static FLOAT32 RoundingAccuracy = 0.0;

+// The unicharset used during mftraining
+static UNICHARSET unicharset_mftraining;
+
 /*----------------------------------------------------------------------------
 						Public Code
 -----------------------------------------------------------------------------*/
@ -260,12 +251,17 @@ int main (
 	LIST pCharList, pProtoList;
 	char Filename[MAXNAMESIZE];

+        // Clean the unichar set
+        unicharset_mftraining.clear();
+        // Space character needed to represent NIL classification
+        unicharset_mftraining.unichar_insert(" ");
+
 	ParseArguments (argc, argv);
 	InitFastTrainerVars ();
 	InitSubfeatureVars ();
 	while ((PageName = GetNextFilename()) != NULL)
 	{
-		printf ("\nReading %s ...", PageName);
+		printf ("Reading %s ...\n", PageName);
 		TrainingPage = Efopen (PageName, "r");
 		CharList = ReadTrainingSamples (TrainingPage);
 		fclose (TrainingPage);
@ -275,7 +271,7 @@ int main (
 		{
 			//Cluster
 			CharSample = (LABELEDLIST) first_node (pCharList);
-			printf ("\nClustering %s ...", CharSample->Label);
+// 			printf ("\nClustering %s ...", CharSample->Label);
 			Clusterer = SetUpForClustering(CharSample);
 			ProtoList = ClusterSamples(Clusterer, &Config);
 			//WriteClusteredTrainingSamples (Directory, ProtoList, Clusterer, CharSample);
@ -320,14 +316,13 @@ int main (
 			FreeProtoList (&ProtoList);
 		}
 		FreeTrainingSamples (CharList);
-		printf ("\n");
 	}
 	//WriteMergedTrainingSamples(Directory,ClassList);
 	WriteMicrofeat(Directory, ClassList);
 	InitIntProtoVars ();
 	InitPrototypes ();
 	SetUpForFloat2Int(ClassList);
-	IntTemplates = CreateIntTemplates(TrainingData);
+	IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining);
 	strcpy (Filename, "");
 	if (Directory != NULL)
 	{
@ -340,11 +335,18 @@ int main (
 #else
 	OutFile = Efopen (Filename, "wb");
 #endif
-	WriteIntTemplates(OutFile, IntTemplates);
+	WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining);
 	fclose (OutFile);
-  // Now create pffmtable.
-  WritePFFMTable(IntTemplates, "pffmtable");
-	printf ("\nDone!\n"); /**/
+	strcpy (Filename, "");
+	if (Directory != NULL)
+	{
+		strcat (Filename, Directory);
+		strcat (Filename, "/");
+	}
+	strcat (Filename, "pffmtable");
+        // Now create pffmtable.
+        WritePFFMTable(IntTemplates, Filename);
+	printf ("Done!\n"); /**/
 	FreeLabeledClassList (ClassList);
  return 0;
 }	/* main */
@ -367,7 +369,7 @@ char	**argv)
 **		ShowSignificantProtos	flag controlling proto display
 **		ShowInsignificantProtos	flag controlling proto display
 **		Config			current clustering parameters
-**		optarg, optind		defined by tessopt sys call
+**		tessoptarg, tessoptind		defined by tessopt sys call
 **		Argc, Argv		global copies of argc and argv
 **	Operation:
 **		This routine parses the command line arguments that were
@ -392,7 +394,6 @@ char	**argv)
 	int		Option;
 	int		ParametersRead;
 	BOOL8		Error;
-	extern char	*optarg;

 	Error = FALSE;
 	Argc = argc;
@ -411,37 +412,37 @@ char	**argv)
 			ShowAllSamples = FALSE;
 			break;
 		case 'C':
-			ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) );
+			ParametersRead = sscanf( tessoptarg, "%lf", &(Config.Confidence) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.Confidence > 1 ) Config.Confidence = 1;
 			else if ( Config.Confidence < 0 ) Config.Confidence = 0;
 			break;
 		case 'I':
-			ParametersRead = sscanf( optarg, "%f", &(Config.Independence) );
+			ParametersRead = sscanf( tessoptarg, "%f", &(Config.Independence) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.Independence > 1 ) Config.Independence = 1;
 			else if ( Config.Independence < 0 ) Config.Independence = 0;
 			break;
 		case 'M':
-			ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) );
+			ParametersRead = sscanf( tessoptarg, "%f", &(Config.MinSamples) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
 			else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
 			break;
 		case 'B':
-			ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) );
+			ParametersRead = sscanf( tessoptarg, "%f", &(Config.MaxIllegal) );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
 			else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
 			break;
 		case 'R':
-			ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy );
+			ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy );
 			if ( ParametersRead != 1 ) Error = TRUE;
 			else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
 			else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
 			break;
 		case 'S':
-			switch ( optarg[0] )
+			switch ( tessoptarg[0] )
 			{
 			case 's': Config.ProtoStyle = spherical; break;
 			case 'e': Config.ProtoStyle = elliptical; break;
@ -451,10 +452,10 @@ char	**argv)
 			}
 			break;
 			case 'D':
-				Directory = optarg;
+				Directory = tessoptarg;
 				break;
 			case 'N':
-				if (sscanf (optarg, "%d", &MaxNumSamples) != 1 ||
+				if (sscanf (tessoptarg, "%d", &MaxNumSamples) != 1 ||
 					MaxNumSamples <= 0)
 					Error = TRUE;
 				break;
@ -478,7 +479,7 @@ char *GetNextFilename ()
 /*
 **	Parameters: none
 **	Globals:
-**		optind			defined by tessopt sys call
+**		tessoptind			defined by tessopt sys call
 **		Argc, Argv		global copies of argc and argv
 **	Operation:
 **		This routine returns the next command line argument.  If
@ -491,8 +492,8 @@ char *GetNextFilename ()
 */

 {
-	if (optind < Argc)
-		return (Argv [optind++]);
+	if (tessoptind < Argc)
+		return (Argv [tessoptind++]);
 	else
 		return (NULL);

@ -519,33 +520,41 @@ LIST ReadTrainingSamples (
 */

 {
-	char			CharName[MAXNAMESIZE];
-	LABELEDLIST	CharSample;
-  FEATURE_SET FeatureSamples;
+	char			unichar[UNICHAR_LEN + 1];
+	LABELEDLIST             CharSample;
+        FEATURE_SET             FeatureSamples;
 	LIST			TrainingSamples = NIL;
 	CHAR_DESC		CharDesc;
 	int			Type, i;

-	while (fscanf (File, "%s %s", FontName, CharName) == 2) {
-		CharSample = FindList (TrainingSamples, CharName);
+	while (fscanf (File, "%s %s", FontName, unichar) == 2) {
+          if (!unicharset_mftraining.contains_unichar(unichar)) {
+            unicharset_mftraining.unichar_insert(unichar);
+            if (unicharset_mftraining.size() > MAX_NUM_CLASSES) {
+              cprintf("Error: Size of unicharset of mftraining is "
+                      "greater than MAX_NUM_CLASSES\n");
+              exit(1);
+            }
+          }
+		CharSample = FindList (TrainingSamples, unichar);
 		if (CharSample == NULL) {
-			CharSample = NewLabeledList (CharName);
+			CharSample = NewLabeledList (unichar);
 			TrainingSamples = push (TrainingSamples, CharSample);
 		}
 		CharDesc = ReadCharDescription (File);
 		Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
 		FeatureSamples = FeaturesOfType(CharDesc, Type);
-    for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
-      FEATURE f = FeatureSamples->Features[feature];
-      for (int dim =0; dim < f->Type->NumParams; ++dim)
-        f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
-    }
+                for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
+                  FEATURE f = FeatureSamples->Features[feature];
+                  for (int dim =0; dim < f->Type->NumParams; ++dim)
+                    f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
+                }
 		CharSample->List = push (CharSample->List, FeatureSamples);
 		for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
-			if (Type != i)
-				FreeFeatureSet (FeaturesOfType (CharDesc, i));
+                  if (Type != i)
+                    FreeFeatureSet (FeaturesOfType (CharDesc, i));
 		free (CharDesc);
-    }
+        }
 	return (TrainingSamples);

 }	/* ReadTrainingSamples */
@ -843,7 +852,7 @@ void WriteProtos(
 	int i;
 	PROTO Proto;

-	fprintf(File, "%c\n", NameToChar(MergeClass->Label));
+	fprintf(File, "%s\n", MergeClass->Label);
 	fprintf(File, "%d\n", NumProtosIn(MergeClass->Class));
 	for(i=0; i < NumProtosIn(MergeClass->Class); i++)
 	{
@ -900,7 +909,7 @@ void FreeTrainingSamples (
 	LIST		FeatureList;


-	printf ("\nFreeTrainingSamples...");
+// 	printf ("FreeTrainingSamples...\n");
 	iterate (CharList) 		/* iterate thru all of the fonts */
 	{
 		CharSample = (LABELEDLIST) first_node (CharList);
@ -1161,12 +1170,13 @@ void SetUpForFloat2Int(
 	BIT_VECTOR		NewConfig;
 	BIT_VECTOR		OldConfig;

-	printf("Float2Int ...");
+// 	printf("Float2Int ...\n");

 	iterate(LabeledClassList)
 	{
 		MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
-		Class = &TrainingData[NameToChar(MergeClass->Label)];
+		Class = &TrainingData[unicharset_mftraining.unichar_to_id(
+                                          MergeClass->Label)];
 		NumProtos = NumProtosIn(MergeClass->Class);
 		NumConfigs = NumConfigsIn(MergeClass->Class);

@ -1204,3 +1214,20 @@ void SetUpForFloat2Int(
 		}
 	}
 } // SetUpForFloat2Int
+
+/*--------------------------------------------------------------------------*/
+// Writes one "<unichar> <length>" line per class of Templates to the file
+// named by filename (opened with Efopen in "wb" mode). <length> is the
+// largest LengthForConfigId value over the class's configs, or 0 when the
+// class has no configs. The unichar text is looked up in
+// unicharset_mftraining from the class id at each index.
+void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) {
+  FILE* table_file = Efopen(filename, "wb");
+  for (int class_index = 0; class_index < NumClassesIn (Templates);
+       class_index++) {
+    INT_CLASS int_class = ClassForIndex (Templates, class_index);
+    // Running maximum of the config lengths for this class.
+    int longest = 0;
+    int config = 0;
+    while (config < NumIntConfigsIn (int_class)) {
+      const int length = LengthForConfigId (int_class, config);
+      if (length > longest)
+        longest = length;
+      config++;
+    }
+    const char* label = unicharset_mftraining.id_to_unichar(
+        ClassIdForIndex(Templates, class_index));
+    fprintf(table_file, "%s %d\n", label, longest);
+  }
+  fclose(table_file);
+} // WritePFFMTable
--- a/training/unicharset_extractor.cpp
+++ b/training/unicharset_extractor.cpp
@ -52,8 +52,8 @@ int main(int argc, char** argv) {
  while ((option = tessopt(argc, argv, "D" )) != EOF) {
    switch (option) {
      case 'D':
-        output_directory = optarg;
-        ++optind;
+        output_directory = tessoptarg;
+        ++tessoptind;
        break;
    }
  }
@ -64,12 +64,12 @@ int main(int argc, char** argv) {
  unicharset_file_name += kUnicharsetFileName;

  // Load box files
-  for (; optind < argc; ++optind) {
-    printf("Extracting unicharset from %s\n", argv[optind]);
+  for (; tessoptind < argc; ++tessoptind) {
+    printf("Extracting unicharset from %s\n", argv[tessoptind]);

-    FILE* box_file = fopen(argv[optind], "r");
+    FILE* box_file = fopen(argv[tessoptind], "r");
    if (box_file == NULL) {
-      printf("Cannot open box file %s\n", argv[optind]);
+      printf("Cannot open box file %s\n", argv[tessoptind]);
      return -1;
    }