Changes to ccutil for 3.00

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@305 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-01-18 14:41:36 +08:00 · 2009-07-11 02:50:24 +00:00 · 2009-07-11 02:50:24 +00:00 · d8b1456dd5
commit d8b1456dd5
parent b47efd2cc4
32 changed files with 4234 additions and 261 deletions
--- a/ccutil/Makefile.am
+++ b/ccutil/Makefile.am
@ -1,19 +1,30 @@
 SUBDIRS =
 AM_CXXFLAGS = -DTESSDATA_PREFIX=@datadir@/

+EXTRA_DIST = ccutil.vcproj mfcpch.cpp scanutils.cpp scanutils.h
+
 include_HEADERS = \
-    basedir.h bits16.h clst.h debugwin.h elst2.h elst.h errcode.h \
-    fileerr.h tessopt.h globaloc.h hashfn.h host.h hosthplb.h lsterr.h \
-    mainblk.h memblk.h memryerr.h memry.h mfcpch.h ndminx.h notdll.h \
-    nwmain.h ocrclass.h ocrshell.h platform.h secname.h serialis.h \
-    stderr.h strngs.h tessclas.h tprintf.h varable.h \
-    mfcpch.cpp scanutils.cpp scanutils.h unichar.h \
-    unicharmap.h unicharset.h boxread.h
+    ambigs.h basedir.h bits16.h boxread.h \
+    callback.h ccutil.h clst.h \
+    debugwin.h elst2.h elst.h errcode.h \
+    fileerr.h genericvector.h globaloc.h \
+    hashfn.h helpers.h host.h hosthplb.h lsterr.h \
+    mainblk.h memblk.h memry.h memryerr.h mfcpch.h \
+    ndminx.h notdll.h nwmain.h \
+    ocrclass.h ocrshell.h platform.h qrsequence.h \
+    secname.h serialis.h stderr.h strngs.h \
+    tessclas.h tessdatamanager.h tessopt.h tordvars.h tprintf.h \
+    unichar.h unicharmap.h unicharset.h unicity_table.h \
+    varable.h

 lib_LIBRARIES = libtesseract_ccutil.a
 libtesseract_ccutil_a_SOURCES = \
-    basedir.cpp bits16.cpp clst.cpp debugwin.cpp elst.cpp \
-    elst2.cpp errcode.cpp globaloc.cpp hashfn.cpp mainblk.cpp \
-    memblk.cpp memry.cpp ocrshell.cpp serialis.cpp strngs.cpp \
-    tprintf.cpp varable.cpp unichar.cpp tessopt.cpp \
-    unicharmap.cpp unicharset.cpp boxread.cpp
+    ambigs.cpp basedir.cpp bits16.cpp boxread.cpp \
+    ccutil.cpp clst.cpp debugwin.cpp \
+    elst2.cpp elst.cpp errcode.cpp \
+    globaloc.cpp hashfn.cpp \
+    mainblk.cpp memblk.cpp memry.cpp ocrshell.cpp \
+    serialis.cpp strngs.cpp \
+    tessdatamanager.cpp tessopt.cpp tordvars.cpp tprintf.cpp \
+    unichar.cpp unicharmap.cpp unicharset.cpp \
+    varable.cpp
--- a/ccutil/Makefile.in
+++ b/ccutil/Makefile.in
@ -57,14 +57,15 @@ AR = ar
 ARFLAGS = cru
 libtesseract_ccutil_a_AR = $(AR) $(ARFLAGS)
 libtesseract_ccutil_a_LIBADD =
-am_libtesseract_ccutil_a_OBJECTS = basedir.$(OBJEXT) bits16.$(OBJEXT) \
-	clst.$(OBJEXT) debugwin.$(OBJEXT) elst.$(OBJEXT) \
-	elst2.$(OBJEXT) errcode.$(OBJEXT) globaloc.$(OBJEXT) \
+am_libtesseract_ccutil_a_OBJECTS = ambigs.$(OBJEXT) basedir.$(OBJEXT) \
+	bits16.$(OBJEXT) boxread.$(OBJEXT) ccutil.$(OBJEXT) \
+	clst.$(OBJEXT) debugwin.$(OBJEXT) elst2.$(OBJEXT) \
+	elst.$(OBJEXT) errcode.$(OBJEXT) globaloc.$(OBJEXT) \
 	hashfn.$(OBJEXT) mainblk.$(OBJEXT) memblk.$(OBJEXT) \
 	memry.$(OBJEXT) ocrshell.$(OBJEXT) serialis.$(OBJEXT) \
-	strngs.$(OBJEXT) tprintf.$(OBJEXT) varable.$(OBJEXT) \
-	unichar.$(OBJEXT) tessopt.$(OBJEXT) unicharmap.$(OBJEXT) \
-	unicharset.$(OBJEXT) boxread.$(OBJEXT)
+	strngs.$(OBJEXT) tessdatamanager.$(OBJEXT) tessopt.$(OBJEXT) \
+	tordvars.$(OBJEXT) tprintf.$(OBJEXT) unichar.$(OBJEXT) \
+	unicharmap.$(OBJEXT) unicharset.$(OBJEXT) varable.$(OBJEXT)
 libtesseract_ccutil_a_OBJECTS = $(am_libtesseract_ccutil_a_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/config/depcomp
@ -196,22 +197,32 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 SUBDIRS = 
 AM_CXXFLAGS = -DTESSDATA_PREFIX=@datadir@/
+EXTRA_DIST = ccutil.vcproj mfcpch.cpp scanutils.cpp scanutils.h
 include_HEADERS = \
-    basedir.h bits16.h clst.h debugwin.h elst2.h elst.h errcode.h \
-    fileerr.h tessopt.h globaloc.h hashfn.h host.h hosthplb.h lsterr.h \
-    mainblk.h memblk.h memryerr.h memry.h mfcpch.h ndminx.h notdll.h \
-    nwmain.h ocrclass.h ocrshell.h platform.h secname.h serialis.h \
-    stderr.h strngs.h tessclas.h tprintf.h varable.h \
-    mfcpch.cpp scanutils.cpp scanutils.h unichar.h \
-    unicharmap.h unicharset.h boxread.h
+    ambigs.h basedir.h bits16.h boxread.h \
+    callback.h ccutil.h clst.h \
+    debugwin.h elst2.h elst.h errcode.h \
+    fileerr.h genericvector.h globaloc.h \
+    hashfn.h helpers.h host.h hosthplb.h lsterr.h \
+    mainblk.h memblk.h memry.h memryerr.h mfcpch.h \
+    ndminx.h notdll.h nwmain.h \
+    ocrclass.h ocrshell.h platform.h qrsequence.h \
+    secname.h serialis.h stderr.h strngs.h \
+    tessclas.h tessdatamanager.h tessopt.h tordvars.h tprintf.h \
+    unichar.h unicharmap.h unicharset.h unicity_table.h \
+    varable.h

 lib_LIBRARIES = libtesseract_ccutil.a
 libtesseract_ccutil_a_SOURCES = \
-    basedir.cpp bits16.cpp clst.cpp debugwin.cpp elst.cpp \
-    elst2.cpp errcode.cpp globaloc.cpp hashfn.cpp mainblk.cpp \
-    memblk.cpp memry.cpp ocrshell.cpp serialis.cpp strngs.cpp \
-    tprintf.cpp varable.cpp unichar.cpp tessopt.cpp \
-    unicharmap.cpp unicharset.cpp boxread.cpp
+    ambigs.cpp basedir.cpp bits16.cpp boxread.cpp \
+    ccutil.cpp clst.cpp debugwin.cpp \
+    elst2.cpp elst.cpp errcode.cpp \
+    globaloc.cpp hashfn.cpp \
+    mainblk.cpp memblk.cpp memry.cpp ocrshell.cpp \
+    serialis.cpp strngs.cpp \
+    tessdatamanager.cpp tessopt.cpp tordvars.cpp tprintf.cpp \
+    unichar.cpp unicharmap.cpp unicharset.cpp \
+    varable.cpp

 all: all-recursive

@ -286,9 +297,11 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c

+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ambigs.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/basedir.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bits16.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/boxread.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccutil.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clst.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/debugwin.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/elst.Po@am__quote@
@ -302,7 +315,9 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ocrshell.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serialis.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strngs.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessdatamanager.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessopt.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tordvars.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tprintf.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unichar.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unicharmap.Po@am__quote@
--- a/ccutil/ambigs.cpp
+++ b/ccutil/ambigs.cpp
@ -0,0 +1,254 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ambigs.cpp
+// Description: Functions for dealing with ambiguities
+//              (training and recognition).
+// Author:      Daria Antonova
+// Created:     Mon Feb 5 11:26:43 PDT 2009
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "ambigs.h"
+#include "helpers.h"
+
+// Verbosity of ambiguity loading. Any non-zero value announces that the
+// ambigs file is being read and reports malformed lines; values above 2 also
+// echo every input line and dump the resulting tables
+// (see UnicharAmbigs::LoadUnicharAmbigs()).
+INT_VAR(global_ambigs_debug_level, 0, "Debug level for unichar ambiguities");
+// When set, LoadUnicharAmbigs() additionally records 1-1 DEFINITE_AMBIG
+// entries in one_to_one_definite_ambigs_.
+BOOL_VAR(use_definite_ambigs_for_classifier, 0,
+         "Use definite ambiguities when running character classifier");
+
+namespace tesseract {
+
+// Builds an empty spec: both id arrays hold only the INVALID_UNICHAR_ID
+// terminator, no correct ngram is assigned and the type is NOT_AMBIG.
+AmbigSpec::AmbigSpec()
+    : correct_ngram_id(INVALID_UNICHAR_ID),
+      type(NOT_AMBIG),
+      wrong_ngram_size(0) {
+  wrong_ngram[0] = INVALID_UNICHAR_ID;
+  correct_fragments[0] = INVALID_UNICHAR_ID;
+}
+
+ELISTIZE(AmbigSpec);
+
+// Reads ambiguity specifications from AmbigFile, starting at its current
+// position, and fills replace_ambigs_, dang_ambigs_ and (when
+// use_definite_ambigs_for_classifier is set) one_to_one_definite_ambigs_.
+//   AmbigFile   open stream positioned at the first byte of the ambigs data.
+//   end_offset  reading stops once ftell() reaches this offset; a negative
+//               value means read until end of file.
+//   unicharset  used to resolve unichars; InsertIntoTable() also adds the
+//               correct ngrams and their fragments to it.
+// An optional first line beginning with 'v' gives the file format version;
+// without it the version is 0. Lines that fail to parse are skipped.
+void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile, inT64 end_offset,
+                                      UNICHARSET *unicharset) {
+  int i;
+  // Give every table one (initially NULL) slot per unichar id known so far.
+  for (i = 0; i < unicharset->size(); ++i) {
+    replace_ambigs_.push_back(NULL);
+    dang_ambigs_.push_back(NULL);
+    one_to_one_definite_ambigs_.push_back(NULL);
+  }
+  if (global_ambigs_debug_level) tprintf("Reading ambiguities\n");
+
+  int TestAmbigPartSize;
+  int ReplacementAmbigPartSize;
+  // Maximum line size:
+  //   10 for sizes of ambigs, tabs, ambig type and newline
+  //   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
+  // The space for buffer is allocated on the heap to avoid
+  // GCC frame size warning.
+  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
+  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
+  char *buffer = new char[kBufferSize];
+  char ReplacementString[kMaxAmbigStringSize];
+  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
+  int line_num = 0;
+  int type = NOT_AMBIG;
+
+  // Remember where the ambigs data begins. The stream is not necessarily
+  // positioned at the start of the file (presumably end_offset exists because
+  // the data can be a section of a larger file).
+  const long start_offset = ftell(AmbigFile);
+
+  // Determine the version of the ambigs file.
+  int version = 0;
+  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
+              strlen(buffer) > 0);
+  if (*buffer == 'v') {
+    version = static_cast<int>(strtol(buffer+1, NULL, 10));
+    ++line_num;
+  } else {
+    // No version line: the line just read is data, so return to where the
+    // ambigs data began. A plain rewind() would jump to offset 0 of the whole
+    // file, which is only right when the data starts there; it is kept as the
+    // fallback for streams whose position could not be saved or restored.
+    if (start_offset < 0 || fseek(AmbigFile, start_offset, SEEK_SET) != 0) {
+      rewind(AmbigFile);
+    }
+  }
+  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
+         fgets(buffer, kBufferSize, AmbigFile) != NULL) {
+    chomp_string(buffer);
+    if (global_ambigs_debug_level > 2) tprintf("read line %s\n", buffer);
+    ++line_num;
+    if (!ParseAmbiguityLine(line_num, version, *unicharset, buffer,
+                            &TestAmbigPartSize, TestUnicharIds,
+                            &ReplacementAmbigPartSize,
+                            ReplacementString, &type)) continue;
+    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
+    // Only REPLACE_AMBIG entries go to replace_ambigs_; every other type is
+    // stored in dang_ambigs_. The table takes ownership of ambig_spec.
+    AmbigSpec *ambig_spec = new AmbigSpec();
+    InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
+                    TestAmbigPartSize, TestUnicharIds,
+                    ReplacementAmbigPartSize, ReplacementString, type,
+                    ambig_spec, unicharset);
+
+    // Update one_to_one_definite_ambigs_.
+    if (use_definite_ambigs_for_classifier && TestAmbigPartSize == 1 &&
+        ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
+      if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
+        one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
+      }
+      one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
+          ambig_spec->correct_ngram_id);
+    }
+  }
+  delete[] buffer;
+  // Print what was read from the input file.
+  if (global_ambigs_debug_level > 2) {
+    for (int tbl = 0; tbl < 2; ++tbl) {
+      const UnicharAmbigsVector &print_table =
+        (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
+      for (i = 0; i < print_table.size(); ++i) {
+        AmbigSpec_LIST *lst = print_table[i];
+        if (lst == NULL) continue;
+        if (!lst->empty()) {
+          tprintf("%s Ambiguities for %s:\n",
+                  (tbl == 0) ? "Replaceable" : "Dangerous",
+                  unicharset->debug_str(i).string());
+        }
+        AmbigSpec_IT lst_it(lst);
+        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
+          AmbigSpec *ambig_spec = lst_it.data();
+          tprintf("wrong_ngram:");
+          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
+          tprintf("correct_fragments:");
+          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
+        }
+      }
+    }
+  }
+}
+
+// Parses one line of the ambigs file, which has the form
+//   <n> <n wrong unichars> <m> <m correct unichars> [<type>]
+// with fields separated by kAmbigDelimiters; <type> is read only when
+// version > 0.
+//   line_num   used only in diagnostics.
+//   buffer     the line to parse; it is modified by strtok_r().
+// On success returns true with:
+//   *TestAmbigPartSize / TestUnicharIds  the n wrong unichar ids, terminated
+//                                        by INVALID_UNICHAR_ID;
+//   *ReplacementAmbigPartSize / ReplacementString  m and the m correct
+//                                        unichars concatenated;
+//   *type                                written only when version > 0.
+// Returns false on a malformed line, an unknown unichar or a part larger than
+// MAX_AMBIG_SIZE; the outputs may then be partially written.
+bool UnicharAmbigs::ParseAmbiguityLine(
+    int line_num, int version, const UNICHARSET &unicharset,
+    char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
+    int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
+  int i;
+  char *token;
+  char *next_token;
+  // sscanf() must convert exactly one field: comparing against 1 (rather than
+  // testing for non-zero) also rejects an EOF return. The size itself has to
+  // be checked through the pointer.
+  if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
+      sscanf(token, "%d", TestAmbigPartSize) != 1 ||
+      *TestAmbigPartSize <= 0) {
+    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
+    return false;
+  }
+  if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
+    tprintf("Too many unichars in ambiguity on line %d\n", line_num);
+    return false;
+  }
+  for (i = 0; i < *TestAmbigPartSize; ++i) {
+    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
+    if (!unicharset.contains_unichar(token)) {
+      if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
+      break;
+    }
+    TestUnicharIds[i] = unicharset.unichar_to_id(token);
+  }
+  // i is at most MAX_AMBIG_SIZE here, so the terminator fits in an array of
+  // MAX_AMBIG_SIZE + 1 ids.
+  TestUnicharIds[i] = INVALID_UNICHAR_ID;
+
+  if (i != *TestAmbigPartSize ||
+      !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
+      sscanf(token, "%d", ReplacementAmbigPartSize) != 1 ||
+      *ReplacementAmbigPartSize <= 0) {
+    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
+    return false;
+  }
+  if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
+    tprintf("Too many unichars in ambiguity on line %d\n", line_num);
+    return false;
+  }
+  ReplacementString[0] = '\0';
+  for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
+    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
+    // Validate before appending, so that an unknown (and possibly very long)
+    // token is never copied into the caller's fixed-size buffer.
+    if (!unicharset.contains_unichar(token)) {
+      if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
+      break;
+    }
+    strcat(ReplacementString, token);
+  }
+  if (i != *ReplacementAmbigPartSize) {
+    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
+    return false;
+  }
+  if (version > 0) {
+    // The next field being true indicates that the ambiguity should
+    // always be substituted (e.g. '' should always be changed to ").
+    // For such "certain" n -> m ambigs tesseract will insert character
+    // fragments for the n pieces in the unicharset. AmbigsFound()
+    // will then replace the incorrect ngram with the character
+    // fragments of the correct character (or ngram if m > 1).
+    // Note that if m > 1, an ngram will be inserted into the
+    // modified word, not the individual unigrams. Tesseract
+    // has limited support for ngram unichar (e.g. dawg permuter).
+    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
+        sscanf(token, "%d", type) != 1) {
+      if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
+      return false;
+    }
+  }
+  return true;
+}
+
+// Fills in ambig_spec for the ambiguity TestUnicharIds -> ReplacementString
+// and adds it, in sorted position, to the list at table[TestUnicharIds[0]].
+// As a side effect the correct ngram and, when the wrong part has more than
+// one unichar, its character fragments are inserted into unicharset.
+void UnicharAmbigs::InsertIntoTable(
+    UnicharAmbigsVector &table, int TestAmbigPartSize,
+    UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
+    const char *ReplacementString, int type,
+    AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
+  // A 1-1 ambiguity whose two sides have the same to_lower() id is recorded
+  // as CASE_AMBIG, whatever type was requested.
+  const bool one_to_one =
+    TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1;
+  const bool same_when_lowered = one_to_one &&
+    unicharset->to_lower(TestUnicharIds[0]) ==
+    unicharset->to_lower(unicharset->unichar_to_id(ReplacementString));
+  ambig_spec->type =
+    same_when_lowered ? CASE_AMBIG : static_cast<AmbigType>(type);
+
+  ambig_spec->wrong_ngram_size =
+    UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);
+
+  // NoDangerousAmbig() builds its ambig_blob_choices vector with a constant
+  // number of unichar positions, so an n->m ambiguity is represented by n
+  // character fragments of the correct ngram, one per position (e.g. given
+  // "vvvvw" and vvvv->ww, v and |ww|0|4 go to position 0, v and |ww|1|4 to
+  // position 1 and so on). dawg_permute_and_select() reassembles the correct
+  // ngram from the fragments.
+
+  // The unicharset code assumes that the "base" ngram is inserted before any
+  // of its fragments, so the correct ngram goes in first.
+  unicharset->unichar_insert(ReplacementString);
+  const UNICHAR_ID correct_id = unicharset->unichar_to_id(ReplacementString);
+  ambig_spec->correct_ngram_id = correct_id;
+  if (ReplacementAmbigPartSize > 1) unicharset->set_isngram(correct_id, true);
+
+  // One entry per wrong unichar: the correct ngram itself when there is only
+  // one position, otherwise the fragment belonging to that position.
+  int frag = 0;
+  while (frag < TestAmbigPartSize) {
+    if (TestAmbigPartSize == 1) {
+      ambig_spec->correct_fragments[frag] = correct_id;
+    } else {
+      STRING frag_str = CHAR_FRAGMENT::to_string(
+          ReplacementString, frag, TestAmbigPartSize);
+      unicharset->unichar_insert(frag_str.string());
+      ambig_spec->correct_fragments[frag] =
+        unicharset->unichar_to_id(frag_str.string());
+    }
+    ++frag;
+  }
+  ambig_spec->correct_fragments[frag] = INVALID_UNICHAR_ID;
+
+  // File the spec under the first wrong unichar, creating that list on first
+  // use; add_sorted() keeps each list ordered by wrong_ngram.
+  const UNICHAR_ID first_id = TestUnicharIds[0];
+  if (table[first_id] == NULL) table[first_id] = new AmbigSpec_LIST();
+  table[first_id]->add_sorted(AmbigSpec::compare_ambig_specs, ambig_spec);
+}
+
+}  // namespace tesseract
--- a/ccutil/ambigs.h
+++ b/ccutil/ambigs.h
@ -0,0 +1,186 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ambigs.h
+// Description: Constants, flags, functions for dealing with
+//              ambiguities (training and recognition).
+// Author:      Daria Antonova
+// Created:     Mon Aug 23 11:26:43 PDT 2008
+//
+// (C) Copyright 2008, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_AMBIGS_H_
+#define TESSERACT_CCUTIL_AMBIGS_H_
+
+#include "elst.h"
+#include "tprintf.h"
+#include "unichar.h"
+#include "unicharset.h"
+#include "genericvector.h"
+
+#define MAX_AMBIG_SIZE    10
+
+extern INT_VAR_H(global_ambigs_debug_level, 0,
+                 "Debug level for unichar ambiguities");
+extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
+                  "Use definite ambiguities when running character classifier");
+
+namespace tesseract {
+
+// NOTE(review): not referenced by the code visible in this change; purpose
+// unconfirmed.
+static const int kUnigramAmbigsBufferSize = 1000;
+// A single space. NOTE(review): not referenced by the code visible in this
+// change.
+static const char kAmbigNgramSeparator[] = { ' ', '\0' };
+// Field separators (tab and space) used when tokenizing a line of the ambigs
+// file.
+static const char kAmbigDelimiters[] = "\t ";
+// tprintf format for a malformed line; takes the line number.
+static const char kIllegalMsg[] =
+  "Illegal ambiguity specification on line %d\n";
+// tprintf format for a unichar missing from the unicharset; takes the token.
+static const char kIllegalUnicharMsg[] =
+  "Illegal unichar %s in ambiguity specification\n";
+
+// Kind of ambiguity between a wrongly recognized ngram and the correct one.
+// The numeric values are significant: the integer type field read from the
+// ambigs file is cast directly to this enum.
+enum AmbigType {
+  NOT_AMBIG,        // the ngram pair is not ambiguous
+  REPLACE_AMBIG,    // OCRed ngram should always be substituted with correct
+  DEFINITE_AMBIG,   // add correct ngram to the classifier results (1-1)
+  SIMILAR_AMBIG,    // use pairwise classifier for OCRed/correct pair (1-1)
+  CASE_AMBIG,       // this is a case ambiguity (1-1)
+
+  AMBIG_TYPE_COUNT  // number of enum entries
+};
+
+// Helpers for arrays of UNICHAR_IDs whose end is marked by an
+// INVALID_UNICHAR_ID element.
+class UnicharIdArrayUtils {
+ public:
+  // Lexicographic comparison of two INVALID_UNICHAR_ID-terminated arrays.
+  // The first position where the ids differ decides: -1 if array1's id is
+  // smaller, 1 if it is larger. If one array is a proper prefix of the other,
+  // the shorter one orders first (-1 when that is array1, 1 otherwise).
+  // Returns 0 when both arrays are identical.
+  static inline int compare(const UNICHAR_ID array1[],
+                            const UNICHAR_ID array2[]) {
+    for (int i = 0; ; ++i) {
+      const UNICHAR_ID id1 = array1[i];
+      const UNICHAR_ID id2 = array2[i];
+      const bool end1 = (id1 == INVALID_UNICHAR_ID);
+      const bool end2 = (id2 == INVALID_UNICHAR_ID);
+      if (end1 || end2) {
+        if (end1 && end2) return 0;
+        return end1 ? -1 : 1;
+      }
+      if (id1 != id2) return id1 < id2 ? -1 : 1;
+    }
+  }
+
+  // Copies the ids from src to dst, terminator included, and returns the
+  // number of ids copied (not counting the terminator).
+  // src must be terminated by INVALID_UNICHAR_ID and dst must have room for
+  // every element of src plus the terminator.
+  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
+    int count = 0;
+    while (src[count] != INVALID_UNICHAR_ID) {
+      dst[count] = src[count];
+      ++count;
+    }
+    dst[count] = INVALID_UNICHAR_ID;
+    return count;
+  }
+
+  // Prints the unichars for the ids in array, followed by the numeric ids in
+  // parentheses; an empty array is shown as "[Empty]".
+  // array must be terminated by INVALID_UNICHAR_ID.
+  static inline void print(const UNICHAR_ID array[],
+                           const UNICHARSET &unicharset) {
+    int i;
+    if (array[0] == INVALID_UNICHAR_ID) tprintf("[Empty]");
+    for (i = 0; array[i] != INVALID_UNICHAR_ID; ++i) {
+      tprintf("%s ", unicharset.id_to_unichar(array[i]));
+    }
+    tprintf("( ");
+    for (i = 0; array[i] != INVALID_UNICHAR_ID; ++i) {
+      tprintf("%d ", array[i]);
+    }
+    tprintf(")\n");
+  }
+};
+
+// One ambiguity: a wrongly recognized ngram together with the correct ngram
+// for it. Instances live in AmbigSpec_LISTs, each of which holds the ambigs
+// whose wrong ngram starts with the same unichar (e.g. r->t rn->m rr1->m).
+class AmbigSpec : public ELIST_LINK {
+ public:
+  AmbigSpec();
+  ~AmbigSpec() {}
+
+  // Ordering callback for AmbigSpec_LISTs. spec1 and spec2 each point to an
+  // AmbigSpec pointer; the result is UnicharIdArrayUtils::compare() applied
+  // to the two wrong_ngram arrays. Example of wrong_ngram vectors in a sorted
+  // AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9 8 1].
+  static int compare_ambig_specs(const void *spec1, const void *spec2) {
+    const AmbigSpec * const *lhs =
+      static_cast<const AmbigSpec * const *>(spec1);
+    const AmbigSpec * const *rhs =
+      static_cast<const AmbigSpec * const *>(spec2);
+    return UnicharIdArrayUtils::compare((*lhs)->wrong_ngram,
+                                        (*rhs)->wrong_ngram);
+  }
+
+  // Ids of the wrong ngram, terminated by INVALID_UNICHAR_ID.
+  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
+  // One id per wrong unichar position (the correct ngram or a fragment of
+  // it), terminated by INVALID_UNICHAR_ID.
+  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
+  // Id of the whole correct ngram.
+  UNICHAR_ID correct_ngram_id;
+  AmbigType type;
+  // Number of ids in wrong_ngram, terminator excluded.
+  int wrong_ngram_size;
+};
+ELISTIZEH(AmbigSpec);
+
+// AMBIG_TABLE[i] stores a set of ambiguities whose
+// wrong ngram starts with unichar id i.
+typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
+typedef GenericVector<UNICHAR_ID> UnicharIdVector;
+
+// Owns the ambiguity tables built from an ambigs file.
+class UnicharAmbigs {
+ public:
+  UnicharAmbigs() {}
+  // Frees every list / vector the tables point to.
+  ~UnicharAmbigs() {
+    replace_ambigs_.delete_data_pointers();
+    dang_ambigs_.delete_data_pointers();
+    one_to_one_definite_ambigs_.delete_data_pointers();
+  }
+
+  const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
+  const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
+
+  // Fills in two ambiguity tables (replaceable and dangerous) with information
+  // read from the ambigs file. An ambiguity table is an array of lists.
+  // The array is indexed by a class id. Each entry in the table provides
+  // a list of potential ambiguities which can start with the corresponding
+  // character. For example the ambiguity "rn -> m", would be located in the
+  // table at index of unicharset.unichar_to_id('r').
+  // In addition, definite 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded
+  // in one_to_one_definite_ambigs_. This vector is also indexed by the class
+  // id of the wrong part of the ambiguity and each entry contains a vector of
+  // unichar ids that are ambiguous to it.
+  // Reading starts at the current position of ambigs_file and stops at
+  // end_offset (or at end of file if end_offset is negative).
+  void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
+                         UNICHARSET *unicharset);
+
+  // Returns the definite 1-1 ambigs recorded for unichar_id, or NULL if there
+  // are none. The table is sized from the unicharset as it was when loading
+  // started, while loading itself adds ngrams and fragments to the
+  // unicharset, so ids outside the table (including negative ones) are
+  // answered with NULL instead of being used as an index.
+  const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
+    if (unichar_id < 0 ||
+        unichar_id >= one_to_one_definite_ambigs_.size()) return NULL;
+    return one_to_one_definite_ambigs_[unichar_id];
+  }
+
+ private:
+
+  // Parses one line of the ambigs file into its wrong part, replacement part
+  // and (for version > 0) type. Returns false if the line is malformed.
+  bool ParseAmbiguityLine(int line_num, int version,
+                          const UNICHARSET &unicharset, char *buffer,
+                          int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
+                          int *ReplacementAmbigPartSize,
+                          char *ReplacementString, int *type);
+  // Fills in ambig_spec from the parsed parts, inserts the correct ngram and
+  // its fragments into unicharset, and adds ambig_spec to table.
+  void InsertIntoTable(UnicharAmbigsVector &table,
+                       int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
+                       int ReplacementAmbigPartSize,
+                       const char *ReplacementString, int type,
+                       AmbigSpec *ambig_spec, UNICHARSET *unicharset);
+  UnicharAmbigsVector dang_ambigs_;
+  UnicharAmbigsVector replace_ambigs_;
+  GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_CCUTIL_AMBIGS_H_
--- a/ccutil/basedir.cpp
+++ b/ccutil/basedir.cpp
@ -22,8 +22,6 @@
 #ifdef __UNIX__
 #include          <unistd.h>
 #include                    <fcntl.h>
-#else
-#include          <io.h>
 #endif
 #include          <stdlib.h>
 #include          "basedir.h"
@ -103,7 +101,7 @@ DLLSYM inT8 getpath(                   //get dir name of code
      return -1;
    }
  } else {
-    strcpy(directory, code);
+    strncpy(directory, code, MAX_PATH - 1);
  }
  while ((path_end = strchr (directory, '\\')) != NULL)
    *path_end = '/';
--- a/ccutil/callback.h
+++ b/ccutil/callback.h
--- a/ccutil/ccutil.cpp
+++ b/ccutil/ccutil.cpp
@ -0,0 +1,48 @@
+// Copyright 2008 Google Inc. All Rights Reserved.
+// Author: scharron@google.com (Samuel Charron)
+
+#include "ccutil.h"
+
+namespace tesseract {
+// Constructs the object, setting up its two variable members through the
+// BOOL_MEMBER / STRING_MEMBER macros (defined elsewhere) with the default
+// values and help strings given here. The body has nothing else to do.
+CCUtil::CCUtil()
+    : //// mainblk.* /////////////////////////////////////////////////////
+      BOOL_MEMBER(m_print_variables, FALSE,
+                  "Print initial values of all variables"),
+      STRING_MEMBER(m_data_sub_dir,
+                  "tessdata/", "Directory for data files")
+      ////////////////////////////////////////////////////////////////////
+      {
+
+}
+
+// Nothing to release explicitly; the members clean up after themselves.
+CCUtil::~CCUtil() {}
+
+
+// Creates the underlying OS mutex: a pthread mutex with default attributes,
+// or on Windows an unnamed, initially unowned mutex object.
+CCUtilMutex::CCUtilMutex() {
+#ifndef WIN32
+  pthread_mutex_init(&mutex_, NULL);
+#else
+  mutex_ = CreateMutex(NULL, FALSE, NULL);
+#endif
+}
+
+// Blocks until the mutex has been acquired. The result of the underlying
+// call is not checked.
+void CCUtilMutex::Lock() {
+#ifndef WIN32
+  pthread_mutex_lock(&mutex_);
+#else
+  WaitForSingleObject(mutex_, INFINITE);
+#endif
+}
+
+// Releases the mutex. The result of the underlying call is not checked.
+void CCUtilMutex::Unlock() {
+#ifndef WIN32
+  pthread_mutex_unlock(&mutex_);
+#else
+  ReleaseMutex(mutex_);
+#endif
+}
+
+
+CCUtilMutex tprintfMutex;
+} // namespace tesseract
--- a/ccutil/ccutil.h
+++ b/ccutil/ccutil.h
@ -0,0 +1,83 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ccutil.h
+// Description: ccutil class.
+// Author:      Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_CCUTIL_H__
+#define TESSERACT_CCUTIL_CCUTIL_H__
+
+#include "ambigs.h"
+#include "errcode.h"
+#include "strngs.h"
+#include "tessdatamanager.h"
+#include "varable.h"
+#include "unicharset.h"
+
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <pthread.h>
+#include <semaphore.h>
+#endif
+
+namespace tesseract {
+
+// Thin wrapper around the platform mutex: a pthread mutex, or a Win32 mutex
+// HANDLE when WIN32 is defined. The class declares no destructor, so the
+// underlying OS object is never explicitly destroyed.
+class CCUtilMutex {
+ public:
+  // Creates the underlying mutex.
+  CCUtilMutex();
+
+  // Acquires the mutex, waiting for it if necessary.
+  void Lock();
+  // Releases the mutex.
+  void Unlock();
+
+ private:
+#ifndef WIN32
+  pthread_mutex_t mutex_;
+#else
+  HANDLE mutex_;
+#endif
+};
+
+
+// Bundles state that this change moves into ccutil: data/image paths, two
+// variables declared through the *_VAR_H macros, the language, and the
+// tessdata manager, unicharset and ambiguity tables.
+// NOTE(review): member order determines construction order; keep it in sync
+// with the initializer list in ccutil.cpp.
+class CCUtil {
+ public:
+  CCUtil();
+  ~CCUtil();
+
+ public:
+  // NOTE(review): the definition is not part of this view; presumably it
+  // derives the members below from the program and image names - confirm.
+  void main_setup(
+                  const char *argv0,        // program name
+                  const char *basename      // name of image
+                 );
+ public:
+  STRING datadir;        // dir for data files
+  STRING imagebasename;  // name of image
+
+  // Variable members; their defaults and help strings are repeated in the
+  // constructor's initializer list.
+  BOOL_VAR_H (m_print_variables, FALSE,
+                   "Print initial values of all variables");
+  STRING_VAR_H (m_data_sub_dir, "tessdata/", "Directory for data files");
+  STRING lang;
+  STRING language_data_path_prefix;
+  TessdataManager tessdata_manager;
+  UNICHARSET unicharset;
+  UnicharAmbigs unichar_ambigs;
+  STRING imagefile;  // image file name
+  STRING directory;  // main directory
+};
+
+extern CCUtilMutex tprintfMutex;
+}  // namespace tesseract
+
+#endif  // TESSERACT_CCUTIL_CCUTIL_H__
--- a/ccutil/ccutil.vcproj
+++ b/ccutil/ccutil.vcproj
@ -0,0 +1,819 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="ccutil"
+	ProjectGUID="{DF2FA86F-A663-4805-AED7-2F81D9EAC796}"
+	RootNamespace="ccutil"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="4"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__MSW32__;_CRT_SECURE_NO_WARNINGS"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="1"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="4"
+			CharacterSet="2"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__MSW32__;_CRT_SECURE_NO_WARNINGS"
+				RuntimeLibrary="0"
+				EnableFunctionLevelLinking="true"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath=".\ambigs.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\basedir.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\bits16.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\boxread.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\ccutil.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\clst.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\debugwin.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\elst.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\elst2.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\errcode.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\globaloc.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\hashfn.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\mainblk.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\memblk.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\memry.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\mfcpch.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="1"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="1"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\ocrshell.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\serialis.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\strngs.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\tessdatamanager.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\tessopt.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\tordvars.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\tprintf.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath=".\unichar.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\unicharmap.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\unicharset.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\varable.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						UsePrecompiledHeader="2"
+						PrecompiledHeaderThrough="mfcpch.h"
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath=".\ambigs.h"
+				>
+			</File>
+			<File
+				RelativePath=".\basedir.h"
+				>
+			</File>
+			<File
+				RelativePath=".\bits16.h"
+				>
+			</File>
+			<File
+				RelativePath=".\boxread.h"
+				>
+			</File>
+			<File
+				RelativePath=".\callback.h"
+				>
+			</File>
+			<File
+				RelativePath=".\ccutil.h"
+				>
+			</File>
+			<File
+				RelativePath=".\clst.h"
+				>
+			</File>
+			<File
+				RelativePath=".\debugwin.h"
+				>
+			</File>
+			<File
+				RelativePath=".\elst.h"
+				>
+			</File>
+			<File
+				RelativePath=".\elst2.h"
+				>
+			</File>
+			<File
+				RelativePath=".\errcode.h"
+				>
+			</File>
+			<File
+				RelativePath=".\fileerr.h"
+				>
+			</File>
+			<File
+				RelativePath=".\genericvector.h"
+				>
+			</File>
+			<File
+				RelativePath=".\globaloc.h"
+				>
+			</File>
+			<File
+				RelativePath=".\hashfn.h"
+				>
+			</File>
+			<File
+				RelativePath=".\helpers.h"
+				>
+			</File>
+			<File
+				RelativePath=".\host.h"
+				>
+			</File>
+			<File
+				RelativePath=".\hosthplb.h"
+				>
+			</File>
+			<File
+				RelativePath=".\lsterr.h"
+				>
+			</File>
+			<File
+				RelativePath=".\mainblk.h"
+				>
+			</File>
+			<File
+				RelativePath=".\memblk.h"
+				>
+			</File>
+			<File
+				RelativePath=".\memry.h"
+				>
+			</File>
+			<File
+				RelativePath=".\memryerr.h"
+				>
+			</File>
+			<File
+				RelativePath=".\mfcpch.h"
+				>
+			</File>
+			<File
+				RelativePath=".\ndminx.h"
+				>
+			</File>
+			<File
+				RelativePath=".\notdll.h"
+				>
+			</File>
+			<File
+				RelativePath=".\nwmain.h"
+				>
+			</File>
+			<File
+				RelativePath=".\ocrclass.h"
+				>
+			</File>
+			<File
+				RelativePath=".\ocrshell.h"
+				>
+			</File>
+			<File
+				RelativePath=".\platform.h"
+				>
+			</File>
+			<File
+				RelativePath=".\qrsequence.h"
+				>
+			</File>
+			<File
+				RelativePath=".\scanutils.h"
+				>
+			</File>
+			<File
+				RelativePath=".\secname.h"
+				>
+			</File>
+			<File
+				RelativePath=".\serialis.h"
+				>
+			</File>
+			<File
+				RelativePath=".\stderr.h"
+				>
+			</File>
+			<File
+				RelativePath=".\strngs.h"
+				>
+			</File>
+			<File
+				RelativePath=".\tessclas.h"
+				>
+			</File>
+			<File
+				RelativePath=".\tessdatamanager.h"
+				>
+			</File>
+			<File
+				RelativePath=".\tessopt.h"
+				>
+			</File>
+			<File
+				RelativePath=".\tordvars.h"
+				>
+			</File>
+			<File
+				RelativePath=".\tprintf.h"
+				>
+			</File>
+			<File
+				RelativePath=".\unichar.h"
+				>
+			</File>
+			<File
+				RelativePath=".\unicharmap.h"
+				>
+			</File>
+			<File
+				RelativePath=".\unicharset.h"
+				>
+			</File>
+			<File
+				RelativePath=".\unicity_table.h"
+				>
+			</File>
+			<File
+				RelativePath=".\varable.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/ccutil/clst.h
+++ b/ccutil/clst.h
@ -96,11 +96,11 @@ class DLLSYM CLIST
    void shallow_clear();  //clear list but dont
    //delete data elements

-    BOOL8 empty() {  //is list empty?
+    bool empty() {  //is list empty?
      return !last;
    }

-    BOOL8 singleton() {
+    bool singleton() {
      return last != NULL ? (last == last->next) : FALSE;
    }

@ -165,13 +165,13 @@ class DLLSYM CLIST_ITERATOR
  CLIST_LINK *prev;              //prev element
  CLIST_LINK *current;           //current element
  CLIST_LINK *next;              //next element
-  BOOL8 ex_current_was_last;     //current extracted
+  bool ex_current_was_last;     //current extracted
  //was end of list
-  BOOL8 ex_current_was_cycle_pt; //current extracted
+  bool ex_current_was_cycle_pt; //current extracted
  //was cycle point
  CLIST_LINK *cycle_pt;          //point we are cycling
  //the list to.
-  BOOL8 started_cycling;         //Have we moved off
+  bool started_cycling;         //Have we moved off
  //the start?

  CLIST_LINK *extract_sublist(                            //from this current...
@ -229,7 +229,7 @@ class DLLSYM CLIST_ITERATOR

    void mark_cycle_pt();  //remember current

-    BOOL8 empty() {  //is list empty?
+    bool empty() {  //is list empty?
    #ifndef NDEBUG
      if (!list)
        NO_LIST.error ("CLIST_ITERATOR::empty", ABORT, NULL);
@ -237,15 +237,15 @@ class DLLSYM CLIST_ITERATOR
      return list->empty ();
    }

-    BOOL8 current_extracted() {  //current extracted?
+    bool current_extracted() {  //current extracted?
      return !current;
    }

-    BOOL8 at_first();  //Current is first?
+    bool at_first();  //Current is first?

-    BOOL8 at_last();  //Current is last?
+    bool at_last();  //Current is last?

-    BOOL8 cycled_list();  //Completed a cycle?
+    bool cycled_list();  //Completed a cycle?

    void add_to_end(                  //add at end &
                    void *new_data);  //dont move
@ -695,7 +695,7 @@ inline void CLIST_ITERATOR::mark_cycle_pt() {
 *
 **********************************************************************/

-inline BOOL8 CLIST_ITERATOR::at_first() {
+inline bool CLIST_ITERATOR::at_first() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("CLIST_ITERATOR::at_first", ABORT, NULL);
@ -717,7 +717,7 @@ inline BOOL8 CLIST_ITERATOR::at_first() {
 *
 **********************************************************************/

-inline BOOL8 CLIST_ITERATOR::at_last() {
+inline bool CLIST_ITERATOR::at_last() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("CLIST_ITERATOR::at_last", ABORT, NULL);
@ -739,7 +739,7 @@ inline BOOL8 CLIST_ITERATOR::at_last() {
 *
 **********************************************************************/

-inline BOOL8 CLIST_ITERATOR::cycled_list() {
+inline bool CLIST_ITERATOR::cycled_list() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("CLIST_ITERATOR::cycled_list", ABORT, NULL);
--- a/ccutil/debugwin.cpp
+++ b/ccutil/debugwin.cpp
@ -39,7 +39,6 @@ static LCommander *pCommander = NULL;
                                 //NT implementation
 #if defined(__MSW32__) && !defined(_CONSOLE)

-#include          <io.h>
 #define ID_DEBUG_MSG       32779

 /**********************************************************************
--- a/ccutil/elst.h
+++ b/ccutil/elst.h
@ -141,11 +141,11 @@ class DLLSYM ELIST
                                 //ptr to zapper functn
      void (*zapper) (ELIST_LINK *));

-    BOOL8 empty() {  //is list empty?
+    bool empty() {  //is list empty?
      return !last;
    }

-    BOOL8 singleton() {
+    bool singleton() {
      return last ? (last == last->next) : FALSE;
    }

@ -210,13 +210,13 @@ class DLLSYM ELIST_ITERATOR
  ELIST_LINK *prev;              //prev element
  ELIST_LINK *current;           //current element
  ELIST_LINK *next;              //next element
-  BOOL8 ex_current_was_last;     //current extracted
+  bool ex_current_was_last;     //current extracted
  //was end of list
-  BOOL8 ex_current_was_cycle_pt; //current extracted
+  bool ex_current_was_cycle_pt; //current extracted
  //was cycle point
  ELIST_LINK *cycle_pt;          //point we are cycling
  //the list to.
-  BOOL8 started_cycling;         //Have we moved off
+  bool started_cycling;         //Have we moved off
  //the start?

  ELIST_LINK *extract_sublist(                            //from this current...
@ -274,7 +274,7 @@ class DLLSYM ELIST_ITERATOR

    void mark_cycle_pt();  //remember current

-    BOOL8 empty() {  //is list empty?
+    bool empty() {  //is list empty?
    #ifndef NDEBUG
      if (!list)
        NO_LIST.error ("ELIST_ITERATOR::empty", ABORT, NULL);
@ -282,15 +282,15 @@ class DLLSYM ELIST_ITERATOR
      return list->empty ();
    }

-    BOOL8 current_extracted() {  //current extracted?
+    bool current_extracted() {  //current extracted?
      return !current;
    }

-    BOOL8 at_first();  //Current is first?
+    bool at_first();  //Current is first?

-    BOOL8 at_last();  //Current is last?
+    bool at_last();  //Current is last?

-    BOOL8 cycled_list();  //Completed a cycle?
+    bool cycled_list();  //Completed a cycle?

    void add_to_end(                        //add at end &
                    ELIST_LINK *new_link);  //dont move
@ -728,7 +728,7 @@ inline void ELIST_ITERATOR::mark_cycle_pt() {
 *
 **********************************************************************/

-inline BOOL8 ELIST_ITERATOR::at_first() {
+inline bool ELIST_ITERATOR::at_first() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("ELIST_ITERATOR::at_first", ABORT, NULL);
@ -750,7 +750,7 @@ inline BOOL8 ELIST_ITERATOR::at_first() {
 *
 **********************************************************************/

-inline BOOL8 ELIST_ITERATOR::at_last() {
+inline bool ELIST_ITERATOR::at_last() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("ELIST_ITERATOR::at_last", ABORT, NULL);
@ -772,7 +772,7 @@ inline BOOL8 ELIST_ITERATOR::at_last() {
 *
 **********************************************************************/

-inline BOOL8 ELIST_ITERATOR::cycled_list() {
+inline bool ELIST_ITERATOR::cycled_list() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("ELIST_ITERATOR::cycled_list", ABORT, NULL);
--- a/ccutil/elst2.h
+++ b/ccutil/elst2.h
@ -110,11 +110,11 @@ class DLLSYM ELIST2
      void (*zapper) (ELIST2_LINK *));
    //ptr to zapper functn

-    BOOL8 empty() {  //is list empty?
+    bool empty() {  //is list empty?
      return !last;
    }

-    BOOL8 singleton() {
+    bool singleton() {
      return last ? (last == last->next) : FALSE;
    }

@ -179,13 +179,13 @@ class DLLSYM ELIST2_ITERATOR
  ELIST2_LINK *prev;             //prev element
  ELIST2_LINK *current;          //current element
  ELIST2_LINK *next;             //next element
-  BOOL8 ex_current_was_last;     //current extracted
+  bool ex_current_was_last;     //current extracted
  //was end of list
-  BOOL8 ex_current_was_cycle_pt; //current extracted
+  bool ex_current_was_cycle_pt; //current extracted
  //was cycle point
  ELIST2_LINK *cycle_pt;         //point we are cycling
  //the list to.
-  BOOL8 started_cycling;         //Have we moved off
+  bool started_cycling;         //Have we moved off
  //the start?

  ELIST2_LINK *extract_sublist(                             //from this current...
@ -246,7 +246,7 @@ class DLLSYM ELIST2_ITERATOR

    void mark_cycle_pt();  //remember current

-    BOOL8 empty() {  //is list empty?
+    bool empty() {  //is list empty?
    #ifndef NDEBUG
      if (!list)
        NO_LIST.error ("ELIST2_ITERATOR::empty", ABORT, NULL);
@ -254,15 +254,15 @@ class DLLSYM ELIST2_ITERATOR
      return list->empty ();
    }

-    BOOL8 current_extracted() {  //current extracted?
+    bool current_extracted() {  //current extracted?
      return !current;
    }

-    BOOL8 at_first();  //Current is first?
+    bool at_first();  //Current is first?

-    BOOL8 at_last();  //Current is last?
+    bool at_last();  //Current is last?

-    BOOL8 cycled_list();  //Completed a cycle?
+    bool cycled_list();  //Completed a cycle?

    void add_to_end(                         //add at end &
                    ELIST2_LINK *new_link);  //dont move
@ -750,7 +750,7 @@ inline void ELIST2_ITERATOR::mark_cycle_pt() {
 *
 **********************************************************************/

-inline BOOL8 ELIST2_ITERATOR::at_first() {
+inline bool ELIST2_ITERATOR::at_first() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("ELIST2_ITERATOR::at_first", ABORT, NULL);
@ -772,7 +772,7 @@ inline BOOL8 ELIST2_ITERATOR::at_first() {
 *
 **********************************************************************/

-inline BOOL8 ELIST2_ITERATOR::at_last() {
+inline bool ELIST2_ITERATOR::at_last() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("ELIST2_ITERATOR::at_last", ABORT, NULL);
@ -794,7 +794,7 @@ inline BOOL8 ELIST2_ITERATOR::at_last() {
 *
 **********************************************************************/

-inline BOOL8 ELIST2_ITERATOR::cycled_list() {
+inline bool ELIST2_ITERATOR::cycled_list() {
  #ifndef NDEBUG
  if (!this)
    NULL_OBJECT.error ("ELIST2_ITERATOR::cycled_list", ABORT, NULL);
--- a/ccutil/genericvector.h
+++ b/ccutil/genericvector.h
@ -0,0 +1,398 @@
+///////////////////////////////////////////////////////////////////////
+// File:        genericvector.h
+// Description: Generic vector class
+// Author:      Daria Antonova
+// Created:     Mon Jun 23 11:26:43 PDT 2008
+//
+// (C) Copyright 2007, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_
+#define TESSERACT_CCUTIL_GENERICVECTOR_H_
+
+#include <stdio.h>
+
+#include "callback.h"
+#include "errcode.h"
+
+// Growable array of T. Storage is a heap array (data_) of size_reserved_
+// slots, of which the first size_used_ are in use. Two optional callbacks
+// can be attached: one invoked when clearing elements and one used to
+// compare elements (required by get_index() and contains()).
+template <typename T>
+class GenericVector {
+ public:
+  // Both constructors delegate to init(), which reserves the initial storage.
+  GenericVector() { this->init(kDefaultVectorSize); }
+  GenericVector(int size) { this->init(size); }
+
+  // Copy: initialise with room for other.size() elements, then append the
+  // contents of other through operator+=.
+  GenericVector(const GenericVector& other) {
+    this->init(other.size());
+    this->operator+=(other);
+  }
+  GenericVector<T> &operator+=(const GenericVector& other);
+  GenericVector<T> &operator=(const GenericVector& other);
+
+  virtual ~GenericVector();
+
+  // Reserve some memory.
+  void reserve(int size);
+  // Double the size of the internal array.
+  void double_the_size();
+
+  // Init the object, allocating size memory.
+  void init(int size);
+
+  // Return the size used.
+  int size() const {
+    return size_used_;
+  }
+
+  // Same value as size().
+  int length() const {
+    return size_used_;
+  }
+
+  // Return true if empty.
+  bool empty() const {
+    return size_used_ == 0;
+  }
+
+  // Return the object from an index.
+  // get() asserts that the index is in range; operator[] does not check.
+  T &get(int index) const;
+  T &operator[](int index) const;
+
+  // Return the index of the T object, or -1 if it is not found.
+  // This method NEEDS a compare_callback to be passed to
+  // set_compare_callback.
+  int get_index(T object) const;
+
+  // Return true if T is in the array
+  bool contains(T object) const;
+
+  // Return true if the index is valid
+  // NOTE(review): declared to return T although the result is a boolean
+  // condition; bool looks like the intended return type -- confirm.
+  T contains_index(int index) const;
+
+  // Push an element at the end of the array
+  int push_back(T object);
+  void operator+=(T t);
+
+  // Set the value at the given index
+  void set(T t, int index);
+
+  // Insert t at the given index, push other elements to the right.
+  void insert(T t, int index);
+
+  // Removes an element at the given index and
+  // shifts the remaining elements to the left.
+  void remove(int index);
+
+  // Add a callback to be called to delete the elements when the array has
+  // taken ownership of them.
+  void set_clear_callback(Callback1<T>* cb);
+
+  // Add a callback to be called to compare the elements when needed (contains,
+  // get_index, ...)
+  void set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb);
+
+  // Clear the array, calling the clear callback function if any.
+  // All the owned Callbacks are also deleted.
+  // If you don't want the Callbacks to be deleted, before calling clear, set
+  // the callback to NULL.
+  virtual void clear();
+
+  // Delete objects pointed to by data_[i]
+  void delete_data_pointers();
+
+  // This method clears the current object, then does a shallow copy of
+  // its argument, and finally invalidates its argument.
+  // Callbacks are moved to the current object;
+  void move(GenericVector<T>* from);
+
+  // Read/Write the array to a file. This does _NOT_ read/write the callbacks.
+  // The Callback given must be permanent since they will be called more than
+  // once. The given callback will be deleted at the end.
+  void write(FILE* f, Callback2<FILE*, T const &>* cb);
+  void read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap);
+
+  // Allocates a new array of double the current_size, copies over the
+  // information from data to the new location, deletes data and returns
+  // the pointer to the new larger array.
+  // This function uses memcpy to copy the data, instead of invoking
+  // operator=() for each element like double_the_size() does.
+  // NOTE(review): memcpy is declared in <string.h>, which this header does
+  // not include directly -- confirm that it arrives through another include.
+  static T *double_the_size_memcpy(int current_size, T *data) {
+    T *data_new = new T[current_size * 2];
+    memcpy(data_new, data, sizeof(T) * current_size);
+    delete[] data;
+    return data_new;
+  }
+
+ protected:
+  // We are assuming that the objects generally placed in this
+  // vector are small enough that for efficiency it makes sense
+  // to start with a larger initial size.
+  static const int kDefaultVectorSize = 4;
+  // Number of slots of data_ currently in use.
+  int   size_used_;
+  // Number of slots allocated in data_.
+  int   size_reserved_;
+  // Heap array holding the elements.
+  T*    data_;
+  // Optional callback set through set_clear_callback().
+  Callback1<T>* clear_cb_;
+  // Mutable because Run method is not const
+  mutable ResultCallback2<bool, T const &, T const &>* compare_cb_;
+};
+
+namespace tesseract {
+
+// Equality predicate built on T's operator==: yields true exactly when the
+// two arguments compare equal.
+template <typename T>
+bool cmp_eq(T const & lhs, T const & rhs) {
+  const bool same = (lhs == rhs);
+  return same;
+}
+
+}  // namespace tesseract
+
+// GenericVector variant that installs tesseract::cmp_eq<T> (that is,
+// operator==) as its compare callback at construction time.
+template <typename T>
+class GenericVectorEqEq : public GenericVector<T> {
+ public:
+  // Default-sized vector with the == comparator attached.
+  GenericVectorEqEq() : GenericVector<T>() {
+    this->set_compare_callback(NewPermanentCallback(tesseract::cmp_eq<T>));
+  }
+  // Vector built through GenericVector<T>(size) with the == comparator
+  // attached.
+  GenericVectorEqEq(int size) : GenericVector<T>(size) {
+    this->set_compare_callback(NewPermanentCallback(tesseract::cmp_eq<T>));
+  }
+};
+
+// Puts the vector into its empty state (no storage, no callbacks) and then
+// reserves room for size elements.
+template <typename T>
+void GenericVector<T>::init(int size) {
+  // Callbacks start out unset.
+  compare_cb_ = NULL;
+  clear_cb_ = NULL;
+  // No storage yet. reserve() reads these fields, so they must be set
+  // before it is called.
+  data_ = NULL;
+  size_reserved_ = 0;
+  size_used_ = 0;
+  reserve(size);
+}
+
+// Destruction simply clears the vector. clear() is virtual, but a call made
+// from this destructor runs GenericVector<T>'s own version.
+template <typename T>
+GenericVector<T>::~GenericVector() {
+  this->clear();
+}
+
+// Reserve some memory. If the internal array contains elements, they are
+// copied into the new storage with operator=.
+// Does nothing when size is not positive or when the current capacity is
+// already at least size: the vector never shrinks, and a request that is
+// already satisfied does not reallocate.
+template <typename T>
+void GenericVector<T>::reserve(int size) {
+  if (size <= 0 || size_reserved_ >= size)
+    return;
+  T* new_array = new T[size];
+  for (int i = 0; i < size_used_; ++i)
+    new_array[i] = data_[i];
+  // delete[] on a null pointer is a no-op, so no NULL guard is needed.
+  delete[] data_;
+  data_ = new_array;
+  size_reserved_ = size;
+}
+
+// Grows the reserved storage: to twice the current capacity, or to
+// kDefaultVectorSize when nothing has been reserved yet.
+template <typename T>
+void GenericVector<T>::double_the_size() {
+  if (size_reserved_ != 0) {
+    reserve(2 * size_reserved_);
+    return;
+  }
+  // Doubling zero would stay at zero, so start from the default size.
+  reserve(kDefaultVectorSize);
+}
+
+
+
+// Checked element access: asserts that index lies within [0, size_used_) and
+// returns a reference to that element.
+template <typename T>
+T &GenericVector<T>::get(int index) const {
+  ASSERT_HOST(index >= 0 && index < size_used_);
+  T *element = data_ + index;
+  return *element;
+}
+
+// Unchecked element access: returns a reference to the element at index
+// without validating the index.
+template <typename T>
+T &GenericVector<T>::operator[](int index) const {
+  T *element = data_ + index;
+  return *element;
+}
+
+// Overwrites the element at index with t. The index is asserted to lie
+// within [0, size_used_), so this cannot be used to append.
+template <typename T>
+void GenericVector<T>::set(T t, int index) {
+  ASSERT_HOST(index >= 0 && index < size_used_);
+  T &slot = data_[index];
+  slot = t;
+}
+
+// Shifts the elements from index onwards one place to the right to make
+// space, then stores the given element at index. index may be anywhere in
+// [0, size_used_]; index == size_used_ appends at the end.
+template <typename T>
+void GenericVector<T>::insert(T t, int index) {
+  // <= (not <): inserting at size_used_ is a valid append, and it is the only
+  // position available in an empty vector.
+  ASSERT_HOST(index >= 0 && index <= size_used_);
+  // Make sure there is room for one more element before shifting.
+  if (size_reserved_ == size_used_)
+    double_the_size();
+  // Walk backwards so that no element is overwritten before it is moved.
+  for (int i = size_used_; i > index; --i) {
+    data_[i] = data_[i-1];
+  }
+  data_[index] = t;
+  size_used_++;
+}
+
+// Deletes the element at index (asserted to be in use) by moving every later
+// element one place to the left and shrinking the used size by one.
+template <typename T>
+void GenericVector<T>::remove(int index) {
+  ASSERT_HOST(index >= 0 && index < size_used_);
+  --size_used_;
+  for (int dst = index; dst < size_used_; ++dst) {
+    data_[dst] = data_[dst + 1];
+  }
+}
+
+// Return true if the index is valid, i.e. lies within [0, size_used_).
+// NOTE(review): the return type is T although the body yields a bool; this
+// looks like it was meant to be bool - confirm against the declaration in
+// the class before changing it.
+template <typename T>
+T GenericVector<T>::contains_index(int index) const {
+  return index >= 0 && index < size_used_;
+}
+
+// Linear search: returns the position of the first element in use for which
+// the compare callback reports a match with object, or -1 if there is none.
+// A compare callback must have been set before any element is examined.
+template <typename T>
+int GenericVector<T>::get_index(T object) const {
+  int i = 0;
+  while (i < size_used_) {
+    ASSERT_HOST(compare_cb_ != NULL);
+    if (compare_cb_->Run(object, data_[i]))
+      return i;
+    ++i;
+  }
+  return -1;
+}
+
+// Return true if get_index() finds an element matching object.
+template <typename T>
+bool GenericVector<T>::contains(T object) const {
+  const int position = get_index(object);
+  return position != -1;
+}
+
+// Appends object to the end of the array, growing the storage first when it
+// is full. Returns the index at which the object was stored.
+template <typename T>
+int GenericVector<T>::push_back(T object) {
+  if (size_used_ == size_reserved_)
+    double_the_size();
+  const int index = size_used_;
+  ++size_used_;
+  data_[index] = object;
+  return index;
+}
+
+// Appends a single element; shorthand for push_back(t).
+template <typename T>
+void GenericVector<T>::operator+=(T t) {
+  this->push_back(t);
+}
+
+// Appends a copy of every element of other to this vector and returns *this.
+// Appending a vector to itself duplicates its current contents once.
+template <typename T>
+GenericVector<T> &GenericVector<T>::operator+=(const GenericVector& other) {
+  // Snapshot the count first: when other is *this, every append grows the
+  // vector being iterated and the loop would otherwise never terminate.
+  const int num_to_copy = other.size();
+  for (int i = 0; i < num_to_copy; ++i) {
+    this->operator+=(other.data_[i]);
+  }
+  return *this;
+}
+
+// Replaces the contents of this vector with copies of other's elements and
+// returns *this. The vector is emptied with clear() first, which also
+// deletes this vector's callbacks; they are not copied from other.
+template <typename T>
+GenericVector<T> &GenericVector<T>::operator=(const GenericVector& other) {
+  // Self-assignment must be a no-op: clearing first would destroy the very
+  // elements that are about to be copied.
+  if (this != &other) {
+    this->clear();
+    this->operator+=(other);
+  }
+  return *this;
+}
+
+// Installs the callback that clear() runs on every element in use, for
+// vectors that have taken ownership of their elements. clear() also deletes
+// the callback itself.
+template <typename T>
+void GenericVector<T>::set_clear_callback(Callback1<T>* cb) {
+  this->clear_cb_ = cb;
+}
+
+// Installs the callback that get_index() uses to decide whether two elements
+// match. clear() deletes the callback.
+template <typename T>
+void GenericVector<T>::set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb) {
+  this->compare_cb_ = cb;
+}
+
+// Clear the array: run the clear callback (if any) on every element in use,
+// free the storage, and delete both callbacks. Afterwards the vector is
+// empty, owns no storage and has no callbacks installed.
+template <typename T>
+void GenericVector<T>::clear() {
+  if (size_reserved_ > 0) {
+    if (clear_cb_ != NULL)
+      for (int i = 0; i < size_used_; ++i)
+        clear_cb_->Run(data_[i]);
+    delete[] data_;
+    // Reset the pointer: reserve() deletes a non-NULL data_ the next time it
+    // allocates, so a dangling pointer left here would be freed twice.
+    data_ = NULL;
+    size_used_ = 0;
+    size_reserved_ = 0;
+  }
+  if (clear_cb_ != NULL) {
+    delete clear_cb_;
+    clear_cb_ = NULL;
+  }
+  if (compare_cb_ != NULL) {
+    delete compare_cb_;
+    compare_cb_ = NULL;
+  }
+}
+
+// Calls delete on every non-null element in use. The slots themselves are
+// not reset and size_used_ is left unchanged.
+template <typename T>
+void GenericVector<T>::delete_data_pointers() {
+  int i = 0;
+  while (i < size_used_) {
+    if (data_[i]) delete data_[i];
+    ++i;
+  }
+}
+
+
+// Serializes the vector to f: the reserved size, then the used size (both as
+// raw ints), then each element in use through cb. cb is deleted before
+// returning.
+template <typename T>
+void GenericVector<T>::write(FILE* f, Callback2<FILE*, T const &>* cb) {
+  fwrite(&size_reserved_, sizeof(int), 1, f);
+  fwrite(&size_used_, sizeof(int), 1, f);
+  const T *element = data_;
+  const T *const end = data_ + size_used_;
+  for (; element != end; ++element) {
+    cb->Run(f, *element);
+  }
+  delete cb;
+}
+
+// Deserializes a vector in the layout produced by write(): reads the reserved
+// size and the used size (byte-swapping both when swap is set), makes sure
+// that much storage is reserved, then lets cb fill in each element in place.
+// cb is deleted before returning.
+template <typename T>
+void GenericVector<T>::read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap) {
+  // Start from 0 so that a failed fread() leaves a harmless value behind.
+  uinT32 reserved = 0;
+  fread(&reserved, sizeof(int), 1, f);
+  if (swap)
+    reserved = reverse32(reserved);
+  reserve(reserved);
+  fread(&size_used_, sizeof(int), 1, f);
+  if (swap)
+    size_used_ = reverse32(size_used_);
+  // The counts come straight from the file; refuse any that would make the
+  // loop below write outside the allocated array.
+  ASSERT_HOST(size_used_ >= 0 && size_used_ <= size_reserved_);
+  for (int i = 0; i < size_used_; ++i) {
+    cb->Run(f, data_ + i, swap);
+  }
+  delete cb;
+}
+
+// This method clears the current object, then takes over the storage, sizes
+// and callbacks of its argument (a shallow copy), and finally invalidates the
+// argument by leaving it empty with no storage and no callbacks.
+template <typename T>
+void GenericVector<T>::move(GenericVector<T>* from) {
+  this->clear();
+  // Take over the state of from...
+  this->size_used_ = from->size_used_;
+  this->size_reserved_ = from->size_reserved_;
+  this->data_ = from->data_;
+  this->clear_cb_ = from->clear_cb_;
+  this->compare_cb_ = from->compare_cb_;
+  // ...and leave from empty so that it no longer refers to any of it.
+  from->size_used_ = 0;
+  from->size_reserved_ = 0;
+  from->data_ = NULL;
+  from->clear_cb_ = NULL;
+  from->compare_cb_ = NULL;
+}
+
+#endif  // TESSERACT_CCUTIL_GENERICVECTOR_H_
--- a/ccutil/helpers.h
+++ b/ccutil/helpers.h
@ -0,0 +1,41 @@
+/* -*-C-*-
+ ********************************************************************************
+ *
+ * File:         helpers.h
+ * Description:  General utility functions
+ * Author:       Daria Antonova
+ * Created:      Wed Apr 8 14:37:00 2009
+ * Language:     C
+ * Package:      N/A
+ * Status:       Reusable Software Component
+ *
+ * (c) Copyright 2009, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ ********************************************************************************/
+
+#ifndef TESSERACT_CCUTIL_HELPERS_H_
+#define TESSERACT_CCUTIL_HELPERS_H_
+
+// Remove newline (if any) at the end of the string. The string is modified in
+// place; an empty string is left untouched.
+inline void chomp_string(char *string) {
+  int length = strlen(string);
+  // Guard against length == 0, where index length - 1 would lie before the
+  // start of the buffer.
+  if (length > 0 && string[length - 1] == '\n') {
+    string[length - 1] = '\0';
+  }
+}
+
+// Advance the current pointer of the file if it points to a newline character.
+// Any other character is un-read by seeking one byte backwards. At end of
+// file nothing was consumed, so the position is left where it is.
+inline void SkipNewline(FILE *file) {
+  int next_char = fgetc(file);
+  // Only step back when a character was actually read and it was not the
+  // newline we wanted to skip.
+  if (next_char != EOF && next_char != '\n') fseek(file, -1, SEEK_CUR);
+}
+
+#endif
--- a/ccutil/mainblk.cpp
+++ b/ccutil/mainblk.cpp
@ -22,23 +22,22 @@
 #ifdef __UNIX__
 #include          <unistd.h>
 #include          <signal.h>
-#else
-#include          <io.h>
 #endif
 #include          <stdlib.h>
 #include          "basedir.h"
 #include          "mainblk.h"
+#include          "ccutil.h"

 #define VARDIR        "configs/" /*variables files */
 #define EXTERN

+/*
 EXTERN DLLSYM STRING datadir;    //dir for data files
                                 //name of image
 EXTERN DLLSYM STRING imagebasename;
 EXTERN BOOL_VAR (m_print_variables, FALSE,
 "Print initial values of all variables");
 EXTERN STRING_VAR (m_data_sub_dir, "tessdata/", "Directory for data files");
-/*
 EXTERN INT_VAR (memgrab_size, 0, "Preallocation size for batch use");*/


@ -46,24 +45,17 @@ const ERRCODE NO_PATH =
 "Warning:explicit path for executable will not be used for configs";
 static const ERRCODE USAGE = "Usage";

+namespace tesseract {
 /**********************************************************************
 * main_setup
 *
 * Main for mithras demo program. Read the arguments and set up globals.
 **********************************************************************/

-void main_setup(                 /*main demo program */
+void CCUtil::main_setup(                 /*main demo program */
                const char *argv0,       //program name
-                const char *basename,    //name of image
-                int argc,                /*argument count */
-                const char *const *argv  /*arguments */
+                const char *basename     //name of image
               ) {
-  inT32 arg;                     /*argument */
-  inT32 offset;                  //for flag
-  FILE *fp;                      /*variables file */
-  char flag[2];                  //+/-
-  STRING varfile;                /*name of file */
-
  imagebasename = basename;      /*name of image */

  // TESSDATA_PREFIX Environment variable overrules everything.
@ -93,34 +85,6 @@ void main_setup(                 /*main demo program */
    datadir = getenv("TESSDATA_PREFIX");
  }

-  for (arg = 0; arg < argc; arg++) {
-    if (argv[arg][0] == '+' || argv[arg][0] == '-') {
-      offset = 1;
-      flag[0] = argv[arg][0];
-    }
-    else {
-      offset = 0;
-    }
-    flag[offset] = '\0';
-    varfile = flag;
-                                 /*attempt open */
-    fp = fopen (argv[arg] + offset, "r");
-    if (fp != NULL) {
-      fclose(fp);  /*was only to test */
-    }
-    else {
-      varfile += datadir;
-      varfile += m_data_sub_dir; /*data directory */
-      varfile += VARDIR;         /*variables dir */
-    }
-                                 /*actual name */
-    varfile += argv[arg] + offset;
-    read_variables_file (varfile.string ());
-  }
-
-  if (m_print_variables)
-    print_variables(stdout);  /*print them all */
-
-
  datadir += m_data_sub_dir;     /*data directory */
 }
+}  // namespace tesseract
--- a/ccutil/mainblk.h
+++ b/ccutil/mainblk.h
@ -26,14 +26,15 @@
 extern DLLSYM STRING datadir;    //dir for data files
                                 //name of image
 extern DLLSYM STRING imagebasename;
-extern BOOL_VAR_H (m_print_variables, FALSE,
-"Print initial values of all variables");
-extern STRING_VAR_H (m_data_sub_dir, "data/", "Directory for data files");
-extern INT_VAR_H (memgrab_size, 13000000, "Preallocation size for batch use");
-void main_setup(                         /*main demo program */
-                const char *argv0,       //program name
-                const char *basename,    //name of image
-                int argc,                /*argument count */
-                const char *const *argv  /*arguments */
-               );
+extern BOOL_VAR_H(m_print_variables, FALSE,
+                  "Print initial values of all variables");
+extern STRING_VAR_H(m_data_sub_dir, "data/", "Directory for data files");
+extern INT_VAR_H(memgrab_size, 13000000, "Preallocation size for batch use");
+// > ccutil.h
+//void main_setup(                         /*main demo program */
+//                const char *argv0,       //program name
+//                const char *basename,    //name of image
+//                int argc,                /*argument count */
+//                const char *const *argv  /*arguments */
+//               );
 #endif
--- a/ccutil/platform.h
+++ b/ccutil/platform.h
@ -3,6 +3,14 @@
 #ifdef __MSW32__
 #define SIGNED
 #define snprintf _snprintf
+#define read _read
+#define write _write
+#define close _close
+#define lseek _lseek
+#define open _open
+#define ultoa _ultoa
+#define ltoa _ltoa
+#define strtok_r(s, d, p) strtok(s, d)
 #if (_MSC_VER <= 1400)
 #define vsnprintf _vsnprintf
 #endif
--- a/ccutil/qrsequence.h
+++ b/ccutil/qrsequence.h
@ -0,0 +1,80 @@
+///////////////////////////////////////////////////////////////////////
+// File:        qrsequence.h
+// Description: Quasi-random sequence generator class.
+// Author:      Ranjith Unnikrishnan
+// Created:     Wed May 20 2009
+//
+// Class to generate a (deterministic) quasi-random Van der Corput sequence that
+// covers the interval [0,N) without repetition.
+//
+// The sequence is generated by reversing the base-2 representation of the
+// sequence of natural numbers {0, 1,... M-1}, where M is 2^{num_bits_} and
+// num_bits is the minimum number of bits required to represent N. If a reversed
+// numbers is >= N it is rejected and the next natural number is considered
+// until a valid output number is found.
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+// by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, either express or implied.  See the License for the specific
+// language governing permissions and limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_QRSEQUENCE_H_
+#define TESSERACT_CCUTIL_QRSEQUENCE_H_
+
+#include <math.h>
+
+// Generates the integers of [0, N) exactly once each, in the order obtained
+// by bit-reversing the natural numbers 0, 1, 2, ... over num_bits_ bits and
+// dropping every reversed value that is >= N.
+class QRSequenceGenerator {
+ public:
+  // Object is initialized with the size of the output range.
+  // num_bits_ is derived from N as ceil(log2(N)).
+  explicit QRSequenceGenerator(int N) : N_(N), next_num_(0) {
+    num_bits_ = static_cast<int>(ceil(log(static_cast<double>(N)) / log(2.0)));
+  }
+
+  // Main worker method that retrieves the next number in the sequence.
+  // Returns kInvalidVal (-1) once every value in [0, N) has been handed out,
+  // i.e. if called more than N times after object initialization.
+  int GetVal() {
+    const int kInvalidVal = -1;
+    const int kMaxNaturalNumberValue = 1 << num_bits_;
+    // Walk the natural numbers, skipping those whose bit-reversal falls
+    // outside [0, N), and return the first one that fits.
+    while (next_num_ < kMaxNaturalNumberValue) {
+      const int n = GetBinaryReversedInteger(next_num_++);
+      if (n < N_) return n;
+    }
+    // Candidates exhausted. Returning here (rather than testing next_num_
+    // after the loop) ensures that a trailing rejected value, which is >= N,
+    // is never handed to the caller.
+    return kInvalidVal;
+  }
+
+ protected:
+  // Outputs the integer formed by reversing the bits of the input integer. Only
+  // the lowest num_bits_ bits of the input integer are reversed.
+  int GetBinaryReversedInteger(int in_val) const {
+    int bit_pos = num_bits_;
+    int out_val = 0;
+    while(bit_pos--) {
+      // Set the value of the last bit.
+      out_val |= (in_val & 0x1);
+      if (bit_pos > 0) {
+        // Left-shift output value to prepare for storing the next bit.
+        out_val <<= 1;
+      }
+      // Right-shift input value to prepare for retrieving the next bit.
+      in_val >>= 1;
+    }
+    return out_val;
+  }
+  // Size of the output range: generated values lie in [0, N_).
+  int N_;
+  // Next number to be considered for reversal and output.
+  int next_num_;
+  // number of bits required to represent the numbers of the sequence
+  int num_bits_;
+};
+
+#endif  // TESSERACT_CCUTIL_QRSEQUENCE_H_
--- a/ccutil/scanutils.cpp
+++ b/ccutil/scanutils.cpp
@ -31,6 +31,7 @@
 #include <fcntl.h>

 #include "scanutils.h"
+#include "tprintf.h"

 enum Flags {
  FL_SPLAT  = 0x01,   // Drop the value, do not assign
@ -45,6 +46,7 @@ enum Ranks {
  RANK_INT  = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,
+  RANK_PTR      = INT_MAX // Special value used for pointers
  RANK_PTR      = 3 // Special value used for pointers
 };

@ -183,7 +185,7 @@ double strtofloat(const char* s)
 {
  int minus = 0;
  int v = 0;
-  int d, c;
+  int d;
  int k = 1;
  int w = 0;

@ -243,7 +245,7 @@ int vfscanf(FILE* stream, const char *format, va_list ap)
    ST_MATCH,         // Main state of %[ sequence
    ST_MATCH_RANGE,   // After - in a %[ sequence
  } state = ST_NORMAL;
-  char *oarg, *sarg = NULL;    // %s %c or %[ string argument
+  char *sarg = NULL;    // %s %c or %[ string argument
  enum Bail bail = BAIL_NONE;
  int sign;
  int converted = 0;    // Successful conversions
--- a/ccutil/scanutils.h
+++ b/ccutil/scanutils.h
@ -25,6 +25,7 @@
 #include <stddef.h>
 #include <stdio.h>
 #include <klibc/extern.h>
+#include <sys/stat.h>

 // Attempts to parse the given file stream s as an integer of the base
 // 'base'. Returns the first successfully parsed integer as a uintmax_t, or
--- a/ccutil/tessdatamanager.cpp
+++ b/ccutil/tessdatamanager.cpp
@ -0,0 +1,203 @@
+///////////////////////////////////////////////////////////////////////
+// File:        tessdatamanager.cpp
+// Description: Functions to handle loading/combining tesseract data files.
+// Author:      Daria Antonova
+// Created:     Wed Jun 03 11:26:43 PST 2009
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tessdatamanager.h"
+
+#include <stdio.h>
+
+#include "serialis.h"
+#include "strngs.h"
+#include "tprintf.h"
+#include "varable.h"
+
+// Definitions of the variables declared extern in tessdatamanager.h.
+// Per-dawg load switches; each one defaults to true.
+BOOL_VAR(global_load_system_dawg, true, "Load system word dawg.");
+BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg.");
+BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns.");
+BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns.");
+
+// Any non-zero value makes the TessdataManager functions that test it print
+// their debug output.
+INT_VAR(global_tessdata_manager_debug_level, 0,
+        "Debug level for TessdataManager functions.");
+
+namespace tesseract {
+
+// Opens the combined data file and reads its header: a 32-bit entry count
+// followed by that many 64-bit offsets. A count larger than
+// kMaxNumTessdataEntries is taken to mean that the file was written with the
+// opposite byte order, in which case the count and the offsets are
+// byte-swapped. Exits the program if the file cannot be opened.
+void TessdataManager::Init(const char *data_file_name) {
+  int i;
+  data_file_ = fopen(data_file_name, "rb");
+  if (data_file_ == NULL) {
+    tprintf("Error openning data file %s\n", data_file_name);
+    exit(1);
+  }
+  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
+  bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
+  if (swap) {
+    actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
+  }
+  // A negative count would be converted to a huge element count by fread()
+  // and overrun offset_table_, so reject it along with too-large counts.
+  ASSERT_HOST(actual_tessdata_num_entries_ >= 0 &&
+              actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
+  fread(offset_table_, sizeof(inT64),
+        actual_tessdata_num_entries_, data_file_);
+  if (swap) {
+    for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
+      offset_table_[i] = reverse64(offset_table_[i]);
+    }
+  }
+  if (global_tessdata_manager_debug_level) {
+    tprintf("TessdataManager loaded %d types of tesseract data files.\n",
+            actual_tessdata_num_entries_);
+    for (i = 0; i < actual_tessdata_num_entries_; ++i) {
+      tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
+    }
+  }
+}
+
+// Opens the file named language_data_path_prefix + file_suffix for reading,
+// in text mode when text_file is set and in binary mode otherwise. Exits the
+// program if a required file cannot be opened; for an optional file the
+// result of fopen() is returned as is, so it is NULL on failure.
+FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
+                                  const char *file_suffix, bool required_file,
+                                  bool text_file) {
+  STRING file_name = language_data_path_prefix;
+  file_name += file_suffix;
+  const char *open_mode = text_file ? "r" : "rb";
+  FILE *file_ptr = fopen(file_name.string(), open_mode);
+  if (file_ptr == NULL && required_file) {
+    tprintf("Error openning required file %s\n", file_name.string());
+    exit(1);
+  }
+  return file_ptr;
+}
+
+// Copies everything from the current position of input_file to output_file in
+// 1024-byte chunks. When newline_end is set, asserts that the last byte
+// copied was a newline.
+void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
+                               bool newline_end) {
+  const int kBufferSize = 1024;
+  char chunk[kBufferSize];
+  char last_char = 0x0;
+  for (;;) {
+    const int bytes_read = fread(chunk, sizeof(char), kBufferSize, input_file);
+    if (bytes_read == 0) break;  // end of input (or read error)
+    fwrite(chunk, sizeof(char), bytes_read, output_file);
+    last_char = chunk[bytes_read - 1];
+  }
+  if (newline_end) ASSERT_HOST(last_char == '\n');
+}
+
+// Bundles the individual data files found at language_data_path_prefix into
+// the single file output_filename. The output starts with a 32-bit entry
+// count and a table of TESSDATA_NUM_ENTRIES 64-bit offsets (-1 for components
+// that are absent), followed by the components themselves.
+// The unicharset is mandatory, and so are pffmtable and normproto whenever
+// inttemp is present; every other component is optional.
+// Exits the program if the output file or a mandatory input cannot be opened.
+void TessdataManager::CombineDataFiles(
+    const char *language_data_path_prefix,
+    const char *output_filename) {
+  FILE *file_ptr;
+  int i;
+  inT64 offset_table[TESSDATA_NUM_ENTRIES];
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
+  FILE *output_file = fopen(output_filename, "wb");
+  // Without this check an unwritable output path crashes in fseek() below.
+  if (output_file == NULL) {
+    tprintf("Error opening output file %s\n", output_filename);
+    exit(1);
+  }
+  // Leave some space for recording the offset_table.
+  fseek(output_file,
+        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
+
+  // Record language-specific tesseract config file.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kLangConfigFileSuffix, false, true);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+  }
+
+  // Record unicharset.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kUnicharsetFileSuffix, true, true);
+  offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
+  CopyFile(file_ptr, output_file, true);
+  fclose(file_ptr);
+
+  // Record ambiguities.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kAmbigsFileSuffix, false, true);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_AMBIGS] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+  }
+
+  // Record inttemp.
+  file_ptr =
+    GetFilePtr(language_data_path_prefix,
+               kBuiltInTemplatesFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_INTTEMP] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+
+    // Record pffmtable.
+    file_ptr = GetFilePtr(language_data_path_prefix,
+                          kBuiltInCutoffsFileSuffix, true, true);
+    offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+
+    // Record normproto.
+    file_ptr = GetFilePtr(language_data_path_prefix,
+                          kNormProtoFileSuffix, true, true);
+    offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
+    CopyFile(file_ptr, output_file, true);
+    fclose(file_ptr);
+  }
+
+  // Record dawgs.
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kPuncDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kSystemDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kNumberDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  file_ptr = GetFilePtr(language_data_path_prefix,
+                        kFreqDawgFileSuffix, false, false);
+  if (file_ptr != NULL) {
+    offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
+    CopyFile(file_ptr, output_file, false);
+    fclose(file_ptr);
+  }
+
+  // Go back and fill in the header now that every offset is known.
+  fseek(output_file, 0, SEEK_SET);
+  inT32 num_entries = TESSDATA_NUM_ENTRIES;
+  fwrite(&num_entries, sizeof(inT32), 1, output_file);
+  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
+  fclose(output_file);
+
+  tprintf("TessdataManager combined tesseract data files.\n");
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+    tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
+  }
+}
+
+}  // namespace tesseract
--- a/ccutil/tessdatamanager.h
+++ b/ccutil/tessdatamanager.h
@ -0,0 +1,165 @@
+///////////////////////////////////////////////////////////////////////
+// File:        tessdatamanager.h
+// Description: Functions to handle loading/combining tesseract data files.
+// Author:      Daria Antonova
+// Created:     Wed Jun 03 11:26:43 PST 2009
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
+#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
+
+#include <stdio.h>
+#include "host.h"
+#include "tprintf.h"
+#include "varable.h"
+
+extern BOOL_VAR_H(global_load_punc_dawg, true,
+                  "Load dawg with punctuation patterns.");
+extern BOOL_VAR_H(global_load_system_dawg, true, "Load system word dawg.");
+extern BOOL_VAR_H(global_load_number_dawg, true,
+                  "Load dawg with number patterns.");
+extern BOOL_VAR_H(global_load_freq_dawg, true, "Load frequent word dawg.");
+
+extern INT_VAR_H(global_tessdata_manager_debug_level, 0,
+                 "Debug level for TessdataManager functions.");
+
+static const char kTrainedDataSuffix[] = "traineddata";
+
+static const char kLangConfigFileSuffix[] = "config";
+static const char kUnicharsetFileSuffix[] = "unicharset";
+static const char kAmbigsFileSuffix[] = "unicharambigs";
+static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
+static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
+static const char kNormProtoFileSuffix[] = "normproto";
+static const char kPuncDawgFileSuffix[] = "punc-dawg";
+static const char kSystemDawgFileSuffix[] = "word-dawg";
+static const char kNumberDawgFileSuffix[] = "number-dawg";
+static const char kFreqDawgFileSuffix[] = "freq-dawg";
+
+namespace tesseract {
+
+enum TessdataType {
+  TESSDATA_LANG_CONFIG,  // 0
+  TESSDATA_UNICHARSET,   // 1
+  TESSDATA_AMBIGS,       // 2
+  TESSDATA_INTTEMP,      // 3
+  TESSDATA_PFFMTABLE,    // 4
+  TESSDATA_NORMPROTO,    // 5
+  TESSDATA_PUNC_DAWG,    // 6
+  TESSDATA_SYSTEM_DAWG,  // 7
+  TESSDATA_NUMBER_DAWG,  // 8
+  TESSDATA_FREQ_DAWG,    // 9
+
+  TESSDATA_NUM_ENTRIES
+};
+
+// TessdataType could be updated to contain more entries, however
+// we do not expect that number to be astronomically high.
+// In order to automatically detect endianness TessdataManager will
+// flip the bits if actual_tessdata_num_entries_ is larger than
+// kMaxNumTessdataEntries.
+static const int kMaxNumTessdataEntries = 1000;
+
+
+// Gives access to the components stored in a combined tesseract data file
+// through a table of per-component file offsets, and provides the static
+// CombineDataFiles() that builds such a file.
+class TessdataManager {
+ public:
+  // Starts with no data file and every offset marked as absent (-1).
+  TessdataManager() {
+    data_file_ = NULL;
+    actual_tessdata_num_entries_ = 0;
+    for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+      offset_table_[i] = -1;
+    }
+  }
+  // Does not close data_file_; call End() for that.
+  ~TessdataManager() {}
+
+  // Opens the given data file and reads the offset table.
+  void Init(const char *data_file_name);
+
+  // Returns data file pointer.
+  inline FILE *GetDataFilePtr() const { return data_file_; }
+
+  // Returns false if there is no data of the given type.
+  // Otherwise does a seek on the data_file_ to position the pointer
+  // at the start of the data of the given type.
+  inline bool SeekToStart(TessdataType tessdata_type) {
+    if (global_tessdata_manager_debug_level) {
+      // The trailing space keeps "tessdata" and "type" apart once the two
+      // literals are concatenated.
+      tprintf("TessdataManager: seek to offset %lld (start of tessdata "
+              "type %d)\n", offset_table_[tessdata_type], tessdata_type);
+    }
+    if (offset_table_[tessdata_type] < 0) {
+      return false;
+    } else {
+      ASSERT_HOST(fseek(data_file_,
+                        offset_table_[tessdata_type], SEEK_SET) == 0);
+      return true;
+    }
+  }
+  // Returns the end offset for the given tesseract data file type: one less
+  // than the start offset of the next type that is present in the file, or
+  // -1 if no later type is present.
+  inline inT64 GetEndOffset(TessdataType tessdata_type) const {
+    int index = tessdata_type + 1;
+    while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
+      ++index;  // skip tessdata types not present in the combined file
+    }
+    if (global_tessdata_manager_debug_level) {
+      tprintf("TessdataManager: end offset for type %d is %lld\n",
+              tessdata_type,
+              (index == actual_tessdata_num_entries_) ? -1
+              : offset_table_[index]);
+    }
+    return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
+  }
+  // Closes data_file_ (if it was opened by Init()).
+  inline void End() {
+    if (data_file_ != NULL) {
+      fclose(data_file_);
+      data_file_ = NULL;
+    }
+  }
+
+  // Reads all the standard tesseract config and data files for a language
+  // at the given path and bundles them up into one binary data file.
+  static void CombineDataFiles(const char *language_data_path_prefix,
+                               const char *output_filename);
+
+ private:
+
+  // Opens the file whose name is a concatenation of language_data_path_prefix
+  // and file_suffix. Terminates the program if required_file is set to true,
+  // but the file could not be found or opened for reading.
+  // Returns a file pointer to the opened file.
+  static FILE *GetFilePtr(const char *language_data_path_prefix,
+                          const char *file_suffix, bool required_file,
+                          bool text_file);
+
+  // Copies all the bytes in the given input file to the output_file provided.
+  static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end);
+
+  // Each offset_table_[i] contains a file offset in the combined data file
+  // where the data of TessdataType i is stored.
+  inT64 offset_table_[TESSDATA_NUM_ENTRIES];
+  // Actual number of entries in the tessdata table. This value can only be
+  // same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
+  // since then it would be impossible to interpret the type of tessdata at
+  // indices same and higher than TESSDATA_NUM_ENTRIES.
+  // This parameter is used to allow for backward compatibility
+  // when new tessdata types are introduced.
+  inT32 actual_tessdata_num_entries_;
+  FILE *data_file_;  // pointer to the data file.
+};
+
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
--- a/ccutil/tordvars.cpp
+++ b/ccutil/tordvars.cpp
@ -0,0 +1,66 @@
+/* -*-C-*-
+ ********************************************************************************
+ *
+ * File:         tordvars.cpp
+ * Description:  Text Ordering Control Variables
+ * Author:       Mark Seaman, OCR Technology
+ * Created:      Wed Jan 17 12:47:29 1990
+ * Modified:     Tue Jul 30 16:22:40 1991 (Mark Seaman) marks@hpgrlt
+ * Language:     C
+ * Package:      N/A
+ * Status:       Experimental (Do Not Distribute)
+ *
+ * (c) Copyright 1990, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *********************************************************************************/
+/*----------------------------------------------------------------------
+              I n c l u d e s
+----------------------------------------------------------------------*/
+#include "mfcpch.h"
+
+#include  <stdio.h>
+
+#include "varable.h"
+
+/*----------------------------------------------------------------------
+              V a r i a b l e s
+----------------------------------------------------------------------*/
+// File handles shared through the extern declarations in tordvars.h.
+// Nothing in this file opens or closes them.
+FILE *rawfile;                   /* Text before dictionary */
+FILE *textfile;                  /* Text output file */
+FILE *matcher_fp;                //matcher log
+FILE *correct_fp;                //correct text
+
+// Control variables: each line gives the name, the default value and the
+// help string of one variable.
+BOOL_VAR(tord_write_output, 0, "Text file output");
+
+BOOL_VAR(tord_write_raw_output, 0, "Text before context");
+
+BOOL_VAR(tord_similarity_enable, 0, "Switch for Similarity");
+
+double_VAR(tord_certainty_threshold, -2.25, "Certainty Value");
+
+INT_VAR(tord_num_word_choices, 30, "Number of choices");
+
+BOOL_VAR(tord_blob_skip, 0, "Skip to Next selection");
+
+double_VAR(tord_overlap_threshold, 0.33, "Overlap Threshold");
+
+BOOL_VAR(tord_debug_3, 0, "Textord Debug #3");
+
+BOOL_VAR(tord_debug_5, 0, "Textord Debug #5");
+
+BOOL_VAR(tord_debug_8, 0, "Textord Debug #8");
+
+INT_VAR(tord_display_ratings, 0, "Ratings display");
+
+BOOL_VAR(tord_display_text, 0, "Display Text");
+
+BOOL_VAR(tord_show_bold, 1, "Show Bold Text");
--- a/ccutil/tordvars.h
+++ b/ccutil/tordvars.h
@ -0,0 +1,66 @@
+/* -*-C-*-
+ ********************************************************************************
+ *
+ * File:         tordvars.h
+ * Description:  Text Ordering Control Variables
+ * Author:       Mark Seaman, OCR Technology
+ * Created:      Wed Oct 25 16:33:01 1989
+ * Modified:     Mon Jul  1 14:28:23 1991 (Mark Seaman) marks@hpgrlt
+ * Language:     C
+ * Package:      N/A
+ * Status:       Experimental (Do Not Distribute)
+ *
+ * (c) Copyright 1989, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *********************************************************************************/
+// Include guard named after this file (tordvars.h). The previous macro,
+// TOVARS_H, did not match the file name and would silently suppress this
+// header (or be suppressed by it) if another header picked the same macro.
+#ifndef TORDVARS_H
+#define TORDVARS_H
+
+#include <stdio.h>
+
+#include "varable.h"
+
+/*----------------------------------------------------------------------
+              V a r i a b l e s
+----------------------------------------------------------------------*/
+// Output/log streams; the definitions live in tordvars.cpp.
+extern FILE *rawfile;                       /* Text before dictionary */
+extern FILE *textfile;                      /* Text output file */
+extern FILE *correct_fp;                    //correct text
+extern FILE *matcher_fp;
+
+// Declarations of the tunable parameters defined in tordvars.cpp. The value
+// and description arguments repeat the ones used at the definition.
+extern BOOL_VAR_H(tord_write_output, 0, "Text file output");
+
+extern BOOL_VAR_H(tord_write_raw_output, 0, "Text before context");
+
+extern BOOL_VAR_H(tord_similarity_enable, 0, "Switch for Similarity");
+
+extern double_VAR_H(tord_certainty_threshold, -2.25, "Certainty Value");
+
+extern INT_VAR_H(tord_num_word_choices, 30, "Number of choices");
+
+extern BOOL_VAR_H(tord_blob_skip, 0, "Skip to Next selection");
+
+extern double_VAR_H(tord_overlap_threshold, 0.33, "Overlap Threshold");
+
+extern BOOL_VAR_H(tord_debug_3, 0, "Textord Debug #3");
+
+extern BOOL_VAR_H(tord_debug_5, 0, "Textord Debug #5");
+
+extern BOOL_VAR_H(tord_debug_8, 0, "Textord Debug #8");
+
+extern INT_VAR_H(tord_display_ratings, 0, "Ratings display");
+
+extern BOOL_VAR_H(tord_display_text, 0, "Display Text");
+
+extern BOOL_VAR_H(tord_show_bold, 1, "Show Bold Text");
+
+#endif  // TORDVARS_H
--- a/ccutil/tprintf.cpp
+++ b/ccutil/tprintf.cpp
@ -24,6 +24,7 @@
 #include              "debugwin.h"
 //#include                                      "ipeerr.h"
 #include          "tprintf.h"
+#include          "ccutil.h"

 #define MAX_MSG_LEN     1024

@ -36,6 +37,7 @@ DLLSYM void
 tprintf (                        //Trace printf
 const char *format, ...          //special message
 ) {
+  tesseract::tprintfMutex.Lock();
  va_list args;                  //variable args
  static FILE *debugfp = NULL;   //debug file
                                 //debug window
@ -76,6 +78,7 @@ const char *format, ...          //special message
      fprintf (stderr, "%s", msg);
    }
  }
+  tesseract::tprintfMutex.Unlock();
 }


--- a/ccutil/unichar.h
+++ b/ccutil/unichar.h
@ -21,6 +21,7 @@
 #define TESSERACT_CCUTIL_UNICHAR_H__

 #include <memory.h>
+#include <string.h>

 // Maximum number of characters that can be stored in a UNICHAR. Must be
 // at least 4. Must not exceed 31 without changing the coding of length.
--- a/ccutil/unicharset.cpp
+++ b/ccutil/unicharset.cpp
@ -22,13 +22,16 @@
 #include <stdio.h>
 #include <string.h>

+#include "tprintf.h"
 #include "unichar.h"
 #include "unicharset.h"
+#include "varable.h"

 static const int ISALPHA_MASK = 0x1;
 static const int ISLOWER_MASK = 0x2;
 static const int ISUPPER_MASK = 0x4;
 static const int ISDIGIT_MASK = 0x8;
+static const int ISPUNCTUATION_MASK = 0x10;

 UNICHARSET::UNICHARSET() :
    unichars(NULL),
@ -38,15 +41,20 @@ UNICHARSET::UNICHARSET() :
    script_table(0),
    script_table_size_used(0),
    script_table_size_reserved(0),
-    null_script("NULL")
-{
-}
+    null_script("NULL"),
+    null_sid_(0),
+    common_sid_(0),
+    latin_sid_(0),
+    cyrillic_sid_(0),
+    greek_sid_(0),
+    han_sid_(0) {}

 UNICHARSET::~UNICHARSET() {
  if (size_reserved > 0) {
    for (int i = 0; i < script_table_size_used; ++i)
      delete[] script_table[i];
    delete[] script_table;
+    delete_pointers_in_unichars();
    delete[] unichars;
  }
 }
@ -56,8 +64,10 @@ void UNICHARSET::reserve(int unichars_number) {
    UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
    for (int i = 0; i < size_used; ++i)
      memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
-    for (int j = size_used; j < unichars_number; ++j)
-      unichars_new[j].properties.script = add_script(null_script);
+    for (int j = size_used; j < unichars_number; ++j) {
+      unichars_new[j].properties.script_id = add_script(null_script);
+      unichars_new[j].properties.fragment = NULL;
+    }
    delete[] unichars;
    unichars = unichars_new;
    size_reserved = unichars_number;
@ -66,15 +76,15 @@ void UNICHARSET::reserve(int unichars_number) {

 const UNICHAR_ID
 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
-  assert(ids.contains(unichar_repr));
-  return ids.unichar_to_id(unichar_repr);
+  return ids.contains(unichar_repr) ?
+    ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
 }

 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                           int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
-  assert(ids.contains(unichar_repr, length));
-  return ids.unichar_to_id(unichar_repr, length);
+  return ids.contains(unichar_repr, length) ?
+    ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
 }

 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
@ -102,14 +112,16 @@ int UNICHARSET::step(const char* str) const {
 }

 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
+  if (id == INVALID_UNICHAR_ID) {
+    return INVALID_UNICHAR;
+  }
  assert(id < this->size());
  return unichars[id].representation;
 }

-// Return a STRING containing debug information on the unichar, including
-// the id_to_unichar, its hex unicodes and the properties.
-STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
-  const char* str = id_to_unichar(id);
+// Return a STRING that reformats the utf8 str into the str followed
+// by its hex unicodes.
+STRING UNICHARSET::debug_utf8_str(const char* str) {
  STRING result = str;
  result += " [";
  int step = 1;
@ -128,6 +140,21 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
    result += " ";
  }
  result += "]";
+  return result;
+}
+
+// Return a STRING containing debug information on the unichar, including
+// the id_to_unichar, its hex unicodes and the properties.
+STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
+  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
+  if (fragment) {
+    STRING base = debug_str(fragment->get_unichar());
+    return CHAR_FRAGMENT::to_string(base.string(), fragment->get_pos(),
+                                    fragment->get_total());
+  }
+  const char* str = id_to_unichar(id);
+  if (id == INVALID_UNICHAR_ID) return STRING(str);
+  STRING result = debug_utf8_str(str);
  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
  if (get_isalpha(id)) {
    if (get_islower(id))
@ -141,11 +168,22 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
  if (get_isdigit(id)) {
    result += "0";
  }
+  // Append p if a punctuation symbol.
+  if (get_ispunctuation(id)) {
+    result += "p";
+  }
  return result;
 }

+
+
 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
  if (!ids.contains(unichar_repr)) {
+    if (strlen(unichar_repr) > UNICHAR_LEN) {
+      fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
+              int(strlen(unichar_repr)), unichar_repr);
+      return;
+    }
    if (size_used == size_reserved) {
      if (size_used == 0)
        reserve(8);
@ -158,31 +196,43 @@ void UNICHARSET::unichar_insert(const char* const unichar_repr) {
    this->set_islower(size_used, false);
    this->set_isupper(size_used, false);
    this->set_isdigit(size_used, false);
-    this->set_script(size_used, add_script(null_script));
+    this->set_ispunctuation(size_used, false);
+    this->set_isngram(size_used, false);
+    this->set_script(size_used, null_script);
+    // If the given unichar_repr represents a fragmented character, set
+    // fragment property to a pointer to CHAR_FRAGMENT class instance with
+    // information parsed from the unichar representation. Use the script
+    // of the base unichar for the fragmented character if possible.
+    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
+    this->unichars[size_used].properties.fragment = frag;
+    if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
+      this->unichars[size_used].properties.script_id =
+        this->get_script(frag->get_unichar());
+    }
    this->unichars[size_used].properties.enabled = true;
    ids.insert(unichar_repr, size_used);
    ++size_used;
  }
 }

-bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
+bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
  return ids.contains(unichar_repr);
 }

-bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
+bool UNICHARSET::contains_unichar(const char* const unichar_repr,
+                                  int length) const {
+  if (length == 0) {
+    return false;
+  }
  return ids.contains(unichar_repr, length);
 }

-bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
+bool UNICHARSET::eq(UNICHAR_ID unichar_id,
+                    const char* const unichar_repr) const {
  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
 }

-bool UNICHARSET::save_to_file(const char* filename) const {
-  FILE* file = fopen(filename, "w+");
-
-  if (file == NULL)
-    return false;
-
+bool UNICHARSET::save_to_file(FILE *file) const {
  fprintf(file, "%d\n", this->size());
  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
    unsigned int properties = 0;
@ -195,29 +245,28 @@ bool UNICHARSET::save_to_file(const char* filename) const {
      properties |= ISUPPER_MASK;
    if (this->get_isdigit(id))
      properties |= ISDIGIT_MASK;
+    if (this->get_ispunctuation(id))
+      properties |= ISPUNCTUATION_MASK;

    if (strcmp(this->id_to_unichar(id), " ") == 0)
-      fprintf(file, "%s %x %s\n", "NULL", properties, this->get_script(id));
+      fprintf(file, "%s %x %s %d\n", "NULL", properties,
+              this->get_script_from_script_id(this->get_script(id)),
+              this->get_other_case(id));
    else
-      fprintf(file, "%s %x %s\n", this->id_to_unichar(id), properties,
-              this->get_script(id));
+      fprintf(file, "%s %x %s %d\n", this->id_to_unichar(id), properties,
+              this->get_script_from_script_id(this->get_script(id)),
+              this->get_other_case(id));
  }
-  fclose(file);
  return true;
 }

-bool UNICHARSET::load_from_file(const char* filename) {
-  FILE* file = fopen(filename, "r");
+bool UNICHARSET::load_from_file(FILE *file) {
  int unicharset_size;
  char buffer[256];

-  if (file == NULL)
-    return false;
-
  this->clear();
  if (fgets(buffer, sizeof (buffer), file) == NULL ||
      sscanf(buffer, "%d", &unicharset_size) != 1) {
-    fclose(file);
    return false;
  }
  this->reserve(unicharset_size);
@ -226,11 +275,13 @@ bool UNICHARSET::load_from_file(const char* filename) {
    unsigned int properties;
    char script[64];

+    strcpy(script, null_script);
+    this->unichars[id].properties.other_case = id;
    if (fgets(buffer, sizeof (buffer), file) == NULL ||
-        (sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
-        !(sscanf(buffer, "%s %x", unichar, &properties) == 2 &&
-         strcpy(script, null_script)))) {
-      fclose(file);
+        (sscanf(buffer, "%s %x %63s %d", unichar, &properties,
+                script, &(this->unichars[id].properties.other_case)) != 4 &&
+         sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
+         sscanf(buffer, "%s %x", unichar, &properties) != 2)) {
      return false;
    }
    if (strcmp(unichar, "NULL") == 0)
@ -238,14 +289,23 @@ bool UNICHARSET::load_from_file(const char* filename) {
    else
      this->unichar_insert(unichar);

-    this->set_isalpha(id, properties & ISALPHA_MASK);
-    this->set_islower(id, properties & ISLOWER_MASK);
-    this->set_isupper(id, properties & ISUPPER_MASK);
-    this->set_isdigit(id, properties & ISDIGIT_MASK);
-    this->set_script(id, add_script(script));
+    this->set_isalpha(id, (properties & ISALPHA_MASK) != 0);
+    this->set_islower(id, (properties & ISLOWER_MASK) != 0);
+    this->set_isupper(id, (properties & ISUPPER_MASK) != 0);
+    this->set_isdigit(id, (properties & ISDIGIT_MASK) != 0);
+    this->set_ispunctuation(id, (properties & ISPUNCTUATION_MASK) != 0);
+    this->set_isngram(id, false);
+    this->set_script(id, script);
    this->unichars[id].properties.enabled = true;
  }
-  fclose(file);
+
+  null_sid_ = get_script_id_from_name(null_script);
+  ASSERT_HOST(null_sid_ == 0);
+  common_sid_ = get_script_id_from_name("Common");
+  latin_sid_ = get_script_id_from_name("Latin");
+  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
+  greek_sid_ = get_script_id_from_name("Greek");
+  han_sid_ = get_script_id_from_name("Han");
  return true;
 }

@ -285,10 +345,10 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
  }
 }

-char* UNICHARSET::add_script(const char* script) {
+int UNICHARSET::add_script(const char* script) {
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script, script_table[i]) == 0)
-      return script_table[i];
+      return i;
  }
  if (script_table_size_reserved == 0) {
    script_table_size_reserved = 8;
@ -303,5 +363,51 @@ char* UNICHARSET::add_script(const char* script) {
  }
  script_table[script_table_size_used] = new char[strlen(script) + 1];
  strcpy(script_table[script_table_size_used], script);
-  return script_table[script_table_size_used++];
+  return script_table_size_used++;
+}
+
+// Parses a fragment representation of the form |<unichar>|<pos>|<total>.
+// Returns a newly allocated CHAR_FRAGMENT (owned by the caller) when the
+// whole string matches that form, and NULL otherwise: wrong length or
+// leading character, empty or over-long unichar, a missing separator, an
+// empty numeric field, or trailing characters after <total>.
+CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
+  const char *ptr = string;
+  int len = strlen(string);
+  if (len < kMinLen || *ptr != kSeparator) {
+    return NULL;  // this string can not represent a fragment
+  }
+  ptr++;  // move to the next character
+  int step = 0;
+  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
+    int char_step = UNICHAR::utf8_step(ptr + step);
+    // A non-positive step would leave this loop spinning on the same byte.
+    // NOTE(review): utf8_step presumably reports 0 for an invalid lead
+    // byte -- confirm against unichar.cpp.
+    if (char_step <= 0) {
+      return NULL;  // not valid utf8, so not a fragment
+    }
+    step += char_step;
+  }
+  if (step == 0 || step > UNICHAR_LEN) {
+    return NULL;  // no character for unichar or the character is too long
+  }
+  char unichar[UNICHAR_LEN + 1];
+  strncpy(unichar, ptr, step);
+  unichar[step] = '\0';  // null terminate unichar
+  ptr += step;  // move to the next fragment separator
+  int pos = 0;
+  int total = 0;
+  char *end_ptr = NULL;
+  // First pass reads <pos>, second pass reads <total>.
+  for (int i = 0; i < 2; i++) {
+    if (ptr > string + len || *ptr != kSeparator) {
+      return NULL;  // failed to parse fragment representation
+    }
+    ptr++;  // move to the next character
+    int value = static_cast<int>(strtol(ptr, &end_ptr, 10));
+    if (end_ptr == ptr) {
+      return NULL;  // strtol consumed nothing: the numeric field is empty
+    }
+    if (i == 0) {
+      pos = value;
+    } else {
+      total = value;
+    }
+    ptr = end_ptr;
+  }
+  if (ptr != string + len) {
+    return NULL;  // malformed fragment representation
+  }
+  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
+  fragment->set_all(unichar, pos, total);
+  return fragment;
+}
+
+// Linear search of the script table for script_name; yields the index of
+// the first entry that compares equal, or 0 when there is none.
+int UNICHARSET::get_script_id_from_name(const char* script_name) const {
+  int sid = 0;
+  while (sid < script_table_size_used &&
+         strcmp(script_name, script_table[sid]) != 0) {
+    ++sid;
+  }
+  // Running off the end means "not found"; 0 is always the null_script.
+  return sid < script_table_size_used ? sid : 0;
 }
--- a/ccutil/unicharset.h
+++ b/ccutil/unicharset.h
@ -17,19 +17,110 @@
 //
 ///////////////////////////////////////////////////////////////////////

-#ifndef THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
-#define THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
+#ifndef TESSERACT_CCUTIL_UNICHARSET_H__
+#define TESSERACT_CCUTIL_UNICHARSET_H__

+#include "assert.h"
 #include "strngs.h"
 #include "unichar.h"
 #include "unicharmap.h"
+#include "varable.h"
+
+// Describes one piece of a character that has been split into fragments:
+// the unichar it belongs to, its position, and the total number of pieces.
+class CHAR_FRAGMENT {
+ public:
+  // Minimum number of characters used for fragment representation.
+  static const int kMinLen = 6;
+  // Maximum number of characters used for fragment representation.
+  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
+  // Special character used in representing character fragments.
+  static const char kSeparator = '|';
+  // Maximum number of fragments per character.
+  static const int kMaxChunks = 3;
+
+  // Mutators and accessors.
+  void set_unichar(const char *uch) {
+    // Copy at most UNICHAR_LEN bytes, then terminate in the final slot.
+    strncpy(unichar, uch, UNICHAR_LEN);
+    unichar[UNICHAR_LEN] = '\0';
+  }
+  void set_pos(int p) { pos = p; }
+  void set_total(int t) { total = t; }
+  void set_all(const char *unichar, int pos, int total) {
+    set_unichar(unichar);
+    set_pos(pos);
+    set_total(total);
+  }
+  const char* get_unichar() const { return unichar; }
+  int get_pos() const { return pos; }
+  int get_total() const { return total; }
+
+  // Builds the textual form of a fragment from its three components:
+  // separator, unichar, separator, pos, separator, total.
+  static STRING to_string(const char *unichar, int pos, int total) {
+    // Format the numeric tail first; it is appended after the unichar.
+    char tail[kMaxLen];
+    snprintf(tail, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
+    STRING result = "";
+    result += kSeparator;
+    result += unichar;
+    result += tail;
+    return result;
+  }
+  // Textual form of this fragment.
+  STRING to_string() const {
+    return to_string(unichar, pos, total);
+  }
+
+  // True when this fragment has exactly the given unichar, position
+  // and total.
+  bool equals(const char *other_unichar,
+              int other_pos, int other_total) const {
+    if (strcmp(unichar, other_unichar) != 0) return false;
+    return pos == other_pos && total == other_total;
+  }
+  bool equals(const CHAR_FRAGMENT *other) const {
+    return equals(other->unichar, other->pos, other->total);
+  }
+
+  // True when this fragment directly follows the given one: same unichar,
+  // same total, and a position one greater.
+  // The given pointer must not be NULL.
+  bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
+    if (strcmp(unichar, fragment->unichar) != 0) return false;
+    if (total != fragment->total) return false;
+    return pos == fragment->pos + 1;
+  }
+
+  // True for the first fragment of a character.
+  bool is_beginning() const { return pos == 0; }
+
+  // True for the last fragment of a character.
+  bool is_ending() const { return pos == total - 1; }
+
+  // Decides whether the string encodes a character fragment rather than a
+  // regular character, e.g. |m|1|2 for chunk 1 of 2 of character m.
+  //
+  // On success a newly allocated CHAR_FRAGMENT holding the parsed values is
+  // returned. NULL is returned when the string is not a fragment, or looks
+  // like one but cannot be parsed as such.
+  //
+  // Note: the caller owns the returned object and must deallocate it.
+  static CHAR_FRAGMENT *parse_from_string(const char *str);
+
+ private:
+  char unichar[UNICHAR_LEN + 1];
+  inT16 pos;    // fragment position in the character
+  inT16 total;  // total number of fragments in the character
+};

 // The UNICHARSET class is an utility class for Tesseract that holds the
 // set of characters that are used by the engine. Each character is identified
 // by a unique number, from 0 to (size - 1).
 class UNICHARSET {
 public:
-
  // Create an empty UNICHARSET
  UNICHARSET();

@ -54,20 +145,43 @@ class UNICHARSET {
  // within the UNICHARSET.
  const char* const id_to_unichar(UNICHAR_ID id) const;

+  // Return a STRING that reformats the utf8 str into the str followed
+  // by its hex unicodes.
+  static STRING debug_utf8_str(const char* str);
+
  // Return a STRING containing debug information on the unichar, including
  // the id_to_unichar, its hex unicodes and the properties.
  STRING debug_str(UNICHAR_ID id) const;
+  STRING debug_str(const char * unichar_repr) const {
+    return debug_str(unichar_to_id(unichar_repr));
+  }

  // Add a unichar representation to the set.
  void unichar_insert(const char* const unichar_repr);

+  // Return true if the given unichar id exists within the set, i.e. it is
+  // not the invalid sentinel and lies in [0, size_used).
+  // Relies on the fact that unichar ids are contiguous in the unicharset.
+  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
+    // The explicit lower bound rejects negative ids other than the sentinel
+    // (for instance an other_case value read from a damaged file).
+    return unichar_id != INVALID_UNICHAR_ID &&
+           unichar_id >= 0 && unichar_id < size_used;
+  }
+
  // Return true if the given unichar representation exists within the set.
-  bool contains_unichar(const char* const unichar_repr);
-  bool contains_unichar(const char* const unichar_repr, int length);
+  bool contains_unichar(const char* const unichar_repr) const;
+  bool contains_unichar(const char* const unichar_repr, int length) const;

  // Return true if the given unichar representation corresponds to the given
  // UNICHAR_ID within the set.
-  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr);
+  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
+
+  // Frees every CHAR_FRAGMENT referenced from the used slots of the
+  // unichars array and resets those pointers to NULL.
+  void delete_pointers_in_unichars() {
+    for (int slot = 0; slot < size_used; ++slot) {
+      CHAR_FRAGMENT *&frag = unichars[slot].properties.fragment;
+      if (frag == NULL) continue;  // nothing attached to this slot
+      delete frag;
+      frag = NULL;
+    }
+  }

  // Clear the UNICHARSET (all the previous data is lost).
  void clear() {
@ -78,6 +192,7 @@ class UNICHARSET {
      script_table = 0;
      script_table_size_reserved = 0;
      script_table_size_used = 0;
+      delete_pointers_in_unichars();
      delete[] unichars;
      unichars = 0;
      size_reserved = 0;
@ -94,13 +209,34 @@ class UNICHARSET {
  // Reserve enough memory space for the given number of UNICHARS
  void reserve(int unichars_number);

-  // Save the content of the UNICHARSET to the given file. Return true if the
-  // operation is successful.
-  bool save_to_file(const char* const filename) const;
+  // Opens the file indicated by filename and saves unicharset to that file.
+  // Returns true only if the file could be opened, the content was written
+  // and the file was closed without error.
+  bool save_to_file(const char * const filename) const {
+    FILE* file = fopen(filename, "w+");
+    if (file == NULL) return false;
+    bool result = save_to_file(file);
+    // Buffered output may only reach the disk on close, so a failing fclose
+    // means the saved file cannot be trusted.
+    if (fclose(file) != 0) result = false;
+    return result;
+  }

-  // Load the UNICHARSET from the given file. The previous data is lost. Return
-  // true if the operation is successful.
-  bool load_from_file(const char* const filename);
+  // Saves the content of the UNICHARSET to the given file.
+  // Returns true if the operation is successful.
+  bool save_to_file(FILE *file) const;
+
+  // Convenience overload: opens filename for reading, loads the UNICHARSET
+  // from it (discarding the previous data) and closes the file again.
+  // Returns false if the file cannot be opened, otherwise the result of
+  // the FILE* overload.
+  bool load_from_file(const char* const filename) {
+    bool loaded = false;
+    FILE* stream = fopen(filename, "r");
+    if (stream != NULL) {
+      loaded = load_from_file(stream);
+      fclose(stream);
+    }
+    return loaded;
+  }
+
+  // Loads the UNICHARSET from the given file. The previous data is lost.
+  // Returns true if the operation is successful.
+  bool load_from_file(FILE *file);

  // Set a whitelist and/or blacklist of characters to recognize.
  // An empty or NULL whitelist enables everything (minus any blacklist).
@ -131,10 +267,25 @@ class UNICHARSET {
    unichars[unichar_id].properties.isdigit = value;
  }

+  // Set the ispunctuation property of the given unichar to the given value.
+  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
+    unichars[unichar_id].properties.ispunctuation = value;
+  }
+
+  // Set the isngram property of the given unichar to the given value.
+  void set_isngram(UNICHAR_ID unichar_id, bool value) {
+    unichars[unichar_id].properties.isngram = value;
+  }
+
  // Set the script name of the given unichar to the given value.
  // Value is copied and thus can be a temporary;
  void set_script(UNICHAR_ID unichar_id, const char* value) {
-    unichars[unichar_id].properties.script = add_script(value);
+    unichars[unichar_id].properties.script_id = add_script(value);
+  }
+
+  // Set other_case unichar id in the properties for the given unichar id.
+  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
+    unichars[unichar_id].properties.other_case = other_case;
  }

  // Return the isalpha property of the given unichar.
@ -157,11 +308,44 @@ class UNICHARSET {
    return unichars[unichar_id].properties.isdigit;
  }

+  // Return the ispunctuation property of the given unichar.
+  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.ispunctuation;
+  }
+
+  // Return the isngram property of the given unichar.
+  bool get_isngram(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.isngram;
+  }
+
  // Return the script name of the given unichar.
  // The returned pointer will always be the same for the same script, it's
  // managed by unicharset and thus MUST NOT be deleted
-  const char* get_script(UNICHAR_ID unichar_id) const {
-    return unichars[unichar_id].properties.script;
+  int get_script(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.script_id;
+  }
+
+  // Get other_case unichar id in the properties for the given unichar id.
+  UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.other_case;
+  }
+
+  // Maps a unichar id to its lower-case counterpart: ids already flagged
+  // islower are returned unchanged, all others yield their other_case.
+  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.islower
+        ? unichar_id
+        : unichars[unichar_id].properties.other_case;
+  }
+
+  // Maps a unichar id to its upper-case counterpart: ids already flagged
+  // isupper are returned unchanged, all others yield their other_case.
+  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
+    return unichars[unichar_id].properties.isupper
+        ? unichar_id
+        : unichars[unichar_id].properties.other_case;
+  }
+
+  // Return a pointer to the CHAR_FRAGMENT class if the given
+  // unichar id represents a character fragment, otherwise NULL.
+  // INVALID_UNICHAR_ID also yields NULL: unichar_to_id can return it, and
+  // debug_str asks for the fragment before doing its own sentinel check, so
+  // indexing the unichars array with it must be avoided here.
+  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
+    if (unichar_id == INVALID_UNICHAR_ID) return NULL;
+    return unichars[unichar_id].properties.fragment;
  }

  // Return the isalpha property of the given unichar representation.
@ -184,13 +368,28 @@ class UNICHARSET {
    return get_isdigit(unichar_to_id(unichar_repr));
  }

+  // Return the ispunctuation property of the given unichar representation.
+  bool get_ispunctuation(const char* const unichar_repr) const {
+    return get_ispunctuation(unichar_to_id(unichar_repr));
+  }
+
  // Return the script name of the given unichar representation.
  // The returned pointer will always be the same for the same script, it's
  // managed by unicharset and thus MUST NOT be deleted
-  const char* get_script(const char* const unichar_repr) const {
+  int get_script(const char* const unichar_repr) const {
    return get_script(unichar_to_id(unichar_repr));
  }

+  // Looks up the fragment information for a unichar representation.
+  // Yields NULL for a NULL or empty representation and for one that is not
+  // in the set; otherwise forwards to the id-based overload.
+  const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
+    const bool has_text = unichar_repr != NULL && unichar_repr[0] != '\0';
+    if (!has_text) return NULL;
+    if (!ids.contains(unichar_repr)) return NULL;
+    return get_fragment(unichar_to_id(unichar_repr));
+  }
+
  // Return the isalpha property of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  bool get_isalpha(const char* const unichar_repr,
@ -219,34 +418,82 @@ class UNICHARSET {
    return get_isdigit(unichar_to_id(unichar_repr, length));
  }

+  // Return the ispunctuation property of the given unichar representation.
+  // Only the first length characters from unichar_repr are used.
+  bool get_ispunctuation(const char* const unichar_repr,
+                          int length) const {
+    return get_ispunctuation(unichar_to_id(unichar_repr, length));
+  }
+
  // Return the script name of the given unichar representation.
  // Only the first length characters from unichar_repr are used.
  // The returned pointer will always be the same for the same script, it's
  // managed by unicharset and thus MUST NOT be deleted
-  const char* get_script(const char* const unichar_repr,
-               int length) const {
+  int get_script(const char* const unichar_repr,
+                 int length) const {
    return get_script(unichar_to_id(unichar_repr, length));
  }

+  // Return the (current) number of scripts in the script table
+  int get_script_table_size() const {
+    return script_table_size_used;
+  }
+
+  // Translates a script id into its name. Ids outside the used part of
+  // the script table map to null_script.
+  const char* get_script_from_script_id(int id) const {
+    const bool in_table = id >= 0 && id < script_table_size_used;
+    return in_table ? script_table[id] : null_script;
+  }
+
+  // Returns the id from the name of the script, or 0 if script is not found.
+  // Note that this is an expensive operation since it involves iteratively
+  // comparing strings in the script table.  To avoid dependency on STL, we
+  // won't use a hash.  Instead, the calling function can use this to lookup
+  // and save the ID for relevant scripts for fast comparisons later.
+  int get_script_id_from_name(const char* script_name) const;
+
+  // Return true if the given script is the null script.
+  // The comparison is by content: add_script stores a private copy of each
+  // name, so a name obtained from the script table never has the same
+  // address as null_script even when it spells the same script.
+  bool is_null_script(const char* script) const {
+    if (script == null_script) return true;  // same pointer, trivially equal
+    return script != NULL && strcmp(script, null_script) == 0;
+  }
+
+  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
+  // then the returned id (the script's index in the script table) will be
+  // the same.
+  // The script parameter is copied and thus can be a temporary.
+  int add_script(const char* script);
+
  // Return the enabled property of the given unichar.
  bool get_enabled(UNICHAR_ID unichar_id) const {
    return unichars[unichar_id].properties.enabled;
  }

- private:

-  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
-  // then the returned pointer will be the same.
-  // The script parameter is copied and thus can be a temporary.
-  char* add_script(const char* script);
+  int null_sid() const { return null_sid_; }
+  int common_sid() const { return common_sid_; }
+  int latin_sid() const { return latin_sid_; }
+  int cyrillic_sid() const { return cyrillic_sid_; }
+  int greek_sid() const { return greek_sid_; }
+  int han_sid() const { return han_sid_; }
+
+ private:

  struct UNICHAR_PROPERTIES {
    bool  isalpha;
    bool  islower;
    bool  isupper;
    bool  isdigit;
+    bool  ispunctuation;
+    bool  isngram;
    bool  enabled;
-    char* script;
+    int   script_id;
+    UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
+
+    // Contains meta information about the fragment if a unichar represents
+    // a fragment of a character, otherwise should be set to NULL.
+    // It is assumed that character fragments are added to the unicharset
+    // after the corresponding 'base' characters.
+    CHAR_FRAGMENT *fragment;
  };

  struct UNICHAR_SLOT {
@ -262,6 +509,16 @@ class UNICHARSET {
  int script_table_size_used;
  int script_table_size_reserved;
  const char* null_script;
+
+  // A few convenient script name-to-id mappings, kept without using a hash.
+  // These are initialized when the unicharset file is loaded.  Anything
+  // missing from this list can be looked up using get_script_id_from_name.
+  int null_sid_;
+  int common_sid_;
+  int latin_sid_;
+  int cyrillic_sid_;
+  int greek_sid_;
+  int han_sid_;
 };

-#endif  // THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
+#endif  // TESSERACT_CCUTIL_UNICHARSET_H__
--- a/ccutil/unicity_table.h
+++ b/ccutil/unicity_table.h
@ -0,0 +1,198 @@
+///////////////////////////////////////////////////////////////////////
+// File:        unicity_table.h
+// Description: a class to uniquify objects, manipulating them using integers
+// ids.
+// Author:      Samuel Charron
+//
+// (C) Copyright 2006, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCUTIL_UNICITY_TABLE_H_
+#define TESSERACT_CCUTIL_UNICITY_TABLE_H_
+
+#include "callback.h"
+#include "errcode.h"
+#include "genericvector.h"
+
+// A class to uniquify objects, manipulating them using integers ids.
+// T requirements:
+//   operator= to add an element
+//   default-constructible: allocating the internal table will call the default
+//     constructor.
+template <typename T>
+class UnicityTable {
+ public:
+  UnicityTable();
+  // Clear the structures and deallocate internal structures.
+  ~UnicityTable();
+
+  // Reserve some memory. If there are size or more elements, the table will
+  // then allocate size * 2 elements.
+  void reserve(int size);
+
+  // Return the size used.
+  int size() const;
+
+  // Return the object from an id.
+  T get(int id) const;
+
+  // Return the id of the T object.
+  // This method NEEDS a compare_callback to be passed to
+  // set_compare_callback.
+  int get_id(T object) const;
+
+  // Return true if T is in the table
+  bool contains(T object) const;
+
+  // Return true if the id is valid.
+  // The result is a bool rather than a T: it is a yes/no answer, and a bool
+  // result also works for element types that cannot be built from a bool.
+  bool contains_id(int id) const;
+
+  // Add an element in the table and return its id.
+  int push_back(T object);
+
+  // Add a callback to be called to delete the elements when the table took
+  // their ownership.
+  void set_clear_callback(Callback1<T>* cb);
+
+  // Add a callback to be called to compare the elements when needed (contains,
+  // get_id, ...)
+  void set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb);
+
+  // Clear the table, calling the callback function if any.
+  // All the owned Callbacks are also deleted.
+  // If you don't want the Callbacks to be deleted, before calling clear, set
+  // the callback to NULL.
+  void clear();
+
+  // This method clears the current object, then does a shallow copy of
+  // its argument, and finally invalidates its argument.
+  void move(UnicityTable<T>* from);
+
+  // Read/Write the table to a file. This does _NOT_ read/write the callbacks.
+  // The Callback given must be permanent since they will be called more than
+  // once. The given callback will be deleted at the end.
+  void write(FILE* f, Callback2<FILE*, T const &>* cb);
+  // swap is used to switch the endianness.
+  void read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap);
+
+ private:
+  // Storage for the elements; an element's id is its index in this vector.
+  GenericVector<T> table_;
+  // Mutable because Run method is not const
+  mutable ResultCallback2<bool, T const &, T const &>* compare_cb_;
+};
+
+// A UnicityTable that installs tesseract::cmp_eq<T> as its compare callback
+// on construction.
+template <typename T>
+class UnicityTableEqEq : public UnicityTable<T> {
+ public:
+  UnicityTableEqEq() {
+    UnicityTable<T>::set_compare_callback(
+        NewPermanentCallback(tesseract::cmp_eq<T>));
+  }
+};
+
+// Start with no compare callback registered.
+template <typename T>
+UnicityTable<T>::UnicityTable() :
+  compare_cb_(0) {
+}
+
+
+// Release everything through clear().
+template <typename T>
+UnicityTable<T>::~UnicityTable() {
+  clear();
+}
+
+// Return the size used.
+template <typename T>
+int UnicityTable<T>::size() const {
+  return table_.size();
+}
+
+// Reserve some memory. If there are size or more elements, the table will
+// then allocate size * 2 elements.
+template <typename T>
+void UnicityTable<T>::reserve(int size) {
+  table_.reserve(size);
+}
+
+// Return the object from an id.
+template <typename T>
+T UnicityTable<T>::get(int id) const {
+  return table_.get(id);
+}
+
+// Return true if the id is valid
+template <typename T>
+bool UnicityTable<T>::contains_id(int id) const {
+  return table_.contains_index(id);
+}
+
+// Return the id of the T object.
+// The lookup is delegated to table_.get_index(); contains() and push_back()
+// treat a result of -1 as "not in the table".
+template <typename T>
+int UnicityTable<T>::get_id(T object) const {
+  return table_.get_index(object);
+}
+
+// Report whether the object is already stored, i.e. whether get_id() gives
+// something other than -1 for it.
+template <typename T>
+bool UnicityTable<T>::contains(T object) const {
+  const int found_id = get_id(object);
+  return found_id != -1;
+}
+
+// Add an element in the table.
+// If get_id() already knows the object, that id is returned and nothing is
+// added; otherwise the object is appended and the value returned by
+// table_.push_back() is passed back.
+template <typename T>
+int UnicityTable<T>::push_back(T object) {
+  const int existing_id = get_id(object);
+  if (existing_id != -1) {
+    return existing_id;
+  }
+  return table_.push_back(object);
+}
+
+// Add a callback to be called to delete the elements when the table took
+// their ownership. The callback is simply forwarded to table_.
+template <typename T>
+void UnicityTable<T>::set_clear_callback(Callback1<T>* cb) {
+  table_.set_clear_callback(cb);
+}
+
+// Add a callback to be called to compare the elements when needed (contains,
+// get_id, ...). The callback is forwarded to table_ and the same pointer is
+// also remembered in compare_cb_.
+template <typename T>
+void UnicityTable<T>::set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb) {
+  table_.set_compare_callback(cb);
+  compare_cb_ = cb;
+}
+
+// Clear the table, calling the callback function if any.
+// The contract stated on the class declaration is that the owned callbacks
+// are deleted as part of clearing, so the remembered compare_cb_ is reset too
+// instead of being left pointing at a callback that may no longer exist.
+template <typename T>
+void UnicityTable<T>::clear() {
+  table_.clear();
+  compare_cb_ = 0;
+}
+
+// Write the table to f by forwarding f and the per-element callback cb to
+// table_.write(). The callbacks registered on the table are not written.
+template <typename T>
+void UnicityTable<T>::write(FILE* f, Callback2<FILE*, T const &>* cb) {
+  table_.write(f, cb);
+}
+
+// Read the table from f by forwarding f, the per-element callback cb and the
+// swap flag (used to switch the endianness) to table_.read().
+template <typename T>
+void UnicityTable<T>::read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap) {
+  table_.read(f, cb, swap);
+}
+
+// This method clears the current object, then does a shallow copy of
+// its argument, and finally invalidates its argument.
+// All of that is done by table_.move(); compare_cb_ of either object is not
+// touched by this function.
+// NOTE(review): confirm whether compare_cb_ should follow the moved table.
+template <typename T>
+void UnicityTable<T>::move(UnicityTable<T>* from) {
+  table_.move(&from->table_);
+}
+
+#endif  // TESSERACT_CCUTIL_UNICITY_TABLE_H_
--- a/ccutil/varable.cpp
+++ b/ccutil/varable.cpp
@ -18,13 +18,14 @@
 **********************************************************************/

 #include          "mfcpch.h"     //precompiled headers
+
 #include          <stdio.h>
 #include          <string.h>
 #include          <stdlib.h>
-#include          "tprintf.h"
-//#include                                      "ipeerr.h"
-#include          "varable.h"
+
 #include          "scanutils.h"
+#include          "tprintf.h"
+#include          "varable.h"

 #define PLUS          '+'        //flag states
 #define MINUS         '-'
@ -379,24 +380,23 @@ STRING_VARIABLE_CLIST *STRING_VARIABLE::get_head() {  // access to static
 * Print the entire list of STRING_VARIABLEs.
 **********************************************************************/

-void STRING_VARIABLE::print(FILE *fp  // file to print on
-                           ) {
-                                 // list iterator
-  STRING_VARIABLE_C_IT it = &head;
+void STRING_VARIABLE::print(FILE *fp) {
+  STRING_VARIABLE_C_IT it = &head;  // list iterator
  STRING_VARIABLE *elt;          // current element

+  // Comments aren't allowed with string variables, so the # character can
+  // be part of a string.
+  // Output for stdout goes through tprintf; any other stream is written with
+  // fprintf. After a header line, each variable is printed as "name value".
  if (fp == stdout) {
    tprintf("#Variables of type STRING_VARIABLE:\n");
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      elt = it.data();
-      tprintf("%s #%s %s\n", elt->name, elt->value.string(), elt->info);
+      tprintf("%s %s\n", elt->name, elt->value.string());
    }
  } else {
    fprintf(fp, "#Variables of type STRING_VARIABLE:\n");
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      elt = it.data();
-      fprintf(fp, "%s #%s %s\n",
-        elt->name, elt->value.string(), elt->info);
+      fprintf(fp, "%s %s\n", elt->name, elt->value.string());
    }
  }
 }
@ -519,20 +519,14 @@ void double_VARIABLE::print(FILE *fp  // file to print on
 * Values may have any whitespace after the name and are the rest of line.
 **********************************************************************/

-DLLSYM BOOL8 read_variables_file(const char *file  // name to read
-                                ) {
-  BOOL8 anyerr;                  // true if any error
+DLLSYM BOOL8 read_variables_file(const char *file,  // name to read
+                                 bool global_only   // only set variables
+                                 ) {                // starting with "global_"
  char flag;                     // file flag
-  BOOL8 foundit;                 // found variable
-  inT16 length;                  // length of line
  inT16 nameoffset;              // offset for real name
-  char *valptr;                  // value field
-  char *stringend;               // end of string value
  FILE *fp;                      // file pointer
                                 // iterators
-  char line[MAX_PATH];           // input line

-  anyerr = FALSE;
  if (*file == PLUS) {
    flag = PLUS;                 // file has flag
    nameoffset = 1;
@ -546,54 +540,48 @@ DLLSYM BOOL8 read_variables_file(const char *file  // name to read

  fp = fopen(file + nameoffset, "r");
  if (fp == NULL) {
-    tprintf("read_variables_file:Can't open %s", file + nameoffset);
+    tprintf("read_variables_file: Can't open %s\n", file + nameoffset);
    return TRUE;                 // can't open it
  }
-  while (fgets (line, MAX_PATH, fp)) {
+  // Keep the result in a local so that the file can be closed before
+  // returning; a statement placed after the return would never execute and
+  // would leave fp open.
+  bool anyerr = read_variables_from_fp(fp, -1, global_only);
+  fclose(fp);
+  return anyerr;
+}
+
+// Reads "name value" lines from fp and applies each one with set_variable.
+// Reading stops when fgets fails, or once ftell(fp) has reached end_offset
+// when end_offset is not negative. Lines that begin with '\n' or '#' are
+// skipped. The name ends at the first blank or tab; the value is the rest of
+// the line after the following blanks/tabs.
+// If global_only is true, only variables whose name starts with
+// kGlobalVariablePrefix are set; all other lines are ignored.
+// A variable that cannot be set is reported with tprintf and the process
+// exits with status 1, so the returned error flag is false whenever this
+// function actually returns. The caller keeps ownership of fp.
+bool read_variables_from_fp(FILE *fp, inT64 end_offset, bool global_only) {
+  char line[MAX_PATH];           // input line
+  bool anyerr = false;          // true if any error
+  bool foundit;                 // found variable
+  inT16 length;                  // length of line
+  char *valptr;                  // value field
+
+  while ((end_offset < 0 || ftell(fp) < end_offset) &&
+         fgets(line, MAX_PATH, fp)) {
    if (line[0] != '\n' && line[0] != '#') {
      length = strlen (line);
      if (line[length - 1] == '\n')
        line[length - 1] = '\0';  // cut newline
      for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t';
        valptr++);
-      if (*valptr) {             //found blank
-        *valptr = '\0';          //make name a string
+      if (*valptr) {             // found blank
+        *valptr = '\0';          // make name a string
        do
-
-        valptr++;              //find end of blanks
+          valptr++;              // find end of blanks
        while (*valptr == ' ' || *valptr == '\t');
-
-        if (*valptr && *valptr != '#') {
-                                 //last char in string
-          stringend = valptr + strlen (valptr) - 1;
-          while (stringend != valptr) {
-            while (stringend != valptr
-              && (*stringend == ' ' || *stringend == '\t'))
-              // cut trailing blanks
-              stringend--;
-            stringend[1] = '\0'; // terminate string
-
-            while (stringend != valptr
-              && ((*stringend != ' ' && *stringend != '\t')
-              || stringend[1] != '#'))
-              stringend--;       // find word start
-          }
-        }
      }
-      foundit = set_new_style_variable(line, valptr);
+      // The name must START with the prefix; a substring search would also
+      // accept names that merely contain it somewhere in the middle.
+      if (global_only &&
+          strncmp(line, kGlobalVariablePrefix,
+                  strlen(kGlobalVariablePrefix)) != 0)
+        continue;
+      foundit = set_variable(line, valptr);

      if (!foundit) {
        anyerr = TRUE;         // had an error
-        tprintf("read_variables_file:variable not found: %s\n",
-          line);
+        tprintf("read_variables_file: variable not found: %s\n", line);
+        exit(1);
      }
    }
  }
-  fclose(fp);  // close file
  return anyerr;
 }

-bool set_new_style_variable(const char *variable, const char* value) {
+bool set_variable(const char *variable, const char* value) {
  INT_VARIABLE_C_IT int_it = &INT_VARIABLE::head;
  BOOL_VARIABLE_C_IT BOOL_it = &BOOL_VARIABLE::head;
  STRING_VARIABLE_C_IT STRING_it = &STRING_VARIABLE::head;
@ -606,10 +594,7 @@ bool set_new_style_variable(const char *variable, const char* value) {
       STRING_it.forward());
  if (!STRING_it.cycled_list()) {
    foundit = true;          // found the varaible
-    if (*value == '\0')
-      STRING_it.data()->set_value((char *) NULL);  // No value.
-    else
-      STRING_it.data()->set_value(value);  // set its value
+    STRING_it.data()->set_value(value);  // set its value
  }

  if (*value) {
@ -624,7 +609,7 @@ bool set_new_style_variable(const char *variable, const char* value) {
      int_it.data()->set_value(intval);  // set its value.
    }
    for (BOOL_it.mark_cycle_pt();
-         !BOOL_it.cycled_list () && strcmp(variable, BOOL_it.data()->name);
+         !BOOL_it.cycled_list() && strcmp(variable, BOOL_it.data()->name);
         BOOL_it.forward());
    if (!BOOL_it.cycled_list()) {
      if (*value == 'T' || *value == 't' ||
--- a/ccutil/varable.h
+++ b/ccutil/varable.h
@ -21,18 +21,27 @@
 #define           VARABLE_H

 #include          <stdio.h>
+
 #include          "clst.h"
 #include          "strngs.h"

 class DLLSYM INT_VARIABLE;

-                                 //read the file
-extern DLLSYM BOOL8 read_variables_file(const char *file  //name to read
-                                       );
-bool set_new_style_variable(const char *variable, const char* value);
-                                 //print all vars
-extern DLLSYM void print_variables(FILE *fp  //file to print on
-                                  );
+// Read config file.
+extern DLLSYM BOOL8 read_variables_file(
+    const char *file,   // filename to read
+    bool global_only);  // only set variables starting with "global_"
+
+// Read variables from the given file pointer (stop at end_offset).
+bool read_variables_from_fp(FILE *fp, inT64 end_offset, bool global_only);
+
+// Set a variable to have the given value.
+bool set_variable(const char *variable, const char* value);
+
+// Print variables to a file.
+extern DLLSYM void print_variables(FILE *fp);
+
+const char kGlobalVariablePrefix[] = "global_";

 CLISTIZEH (INT_VARIABLE)
 class DLLSYM INT_VAR_FROM
@ -57,7 +66,7 @@ class DLLSYM INT_VARIABLE
  friend class INT_VAR_TO;
  friend class INT_VAR_FROM;
                                 //for setting values
-  friend bool set_new_style_variable(const char *variable, const char* value);
+  friend bool set_variable(const char *variable, const char* value);

  public:
    INT_VARIABLE(inT32 v,               // initial value
@ -124,7 +133,7 @@ class DLLSYM BOOL_VARIABLE {
  friend class BOOL_VAR_FROM;
  friend class BOOL_VAR_TO;
                                 //for setting values
-  friend bool set_new_style_variable(const char *variable, const char* value);
+  friend bool set_variable(const char *variable, const char* value);

  public:
    BOOL_VARIABLE(                       //constructor
@ -197,7 +206,7 @@ class DLLSYM STRING_VARIABLE
  friend class STRING_VAR_TO;
  friend class STRING_VAR_FROM;
                                 //for setting values
-  friend bool set_new_style_variable(const char *variable, const char* value);
+  friend bool set_variable(const char *variable, const char* value);

  public:
    STRING_VARIABLE(                       //constructor
@ -274,7 +283,7 @@ class DLLSYM double_VARIABLE
  friend class double_VAR_TO;
  friend class double_VAR_FROM;
                                 //for setting values
-  friend bool set_new_style_variable(const char *variable, const char* value);
+  friend bool set_variable(const char *variable, const char* value);

  public:
    double_VARIABLE(                       //constructor