Major internationalization improvements

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith 2008-02-01 00:21:49 +00:00
parent aa55810b6b
commit 2a678305c6
17 changed files with 616 additions and 44 deletions

View File

@ -24,7 +24,7 @@
// Maximum number of characters that can be stored in a UNICHAR. Must be
// at least 4. Must not exceed 31 without changing the coding of length.
#define UNICHAR_LEN 8
#define UNICHAR_LEN 24
// A UNICHAR_ID is the unique id of a unichar.
typedef int UNICHAR_ID;

View File

@ -19,7 +19,7 @@
#include <assert.h>
#include "unichar.h"
#include "host.h"
#include "unicharmap.h"
UNICHARMAP::UNICHARMAP() :
@ -135,6 +135,22 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}
// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != NULL && *current_char != '\0') {
if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
return current_char + 1 - unichar_repr;
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return 0;
}
void UNICHARMAP::clear() {
if (nodes != 0)
{

View File

@ -56,6 +56,10 @@ class UNICHARMAP {
// used. The length MUST be non-zero.
bool contains(const char* const unichar_repr, int length) const;
// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int minmatch(const char* const unichar_repr) const;
// Clear the UNICHARMAP. All previous data is lost.
void clear();

View File

@ -44,11 +44,10 @@ UNICHARSET::~UNICHARSET() {
}
void UNICHARSET::reserve(int unichars_number) {
if (unichars_number > size_reserved)
{
if (unichars_number > size_reserved) {
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
for (int i = 0; i < size_used; ++i)
memcpy(&unichars_new[i], &unichars[i], sizeof (UNICHAR_SLOT));
memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
delete[] unichars;
unichars = unichars_new;
size_reserved = unichars_number;
@ -68,6 +67,30 @@ const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
return ids.unichar_to_id(unichar_repr, length);
}
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
// is both a short and a long match to the string, return the length that
// ensures there is a legal match after it.
// Returns 0 if the string is empty or no prefix of it is a legal unichar.
int UNICHARSET::step(const char* str) const {
// Find the length of the first matching unicharset member.
int minlength = ids.minmatch(str);
if (minlength == 0)
return 0; // Empty string or illegal char.
// Try successively longer matching prefixes until one is found that also
// leaves a legal unichar (or the end of the string) immediately after it.
int goodlength = minlength;
while (goodlength <= UNICHAR_LEN) {
if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
return goodlength; // This length works!
// The next char is illegal so find the next usable length.
do {
++goodlength;
} while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
!ids.contains(str, goodlength));
// NOTE(review): if the inner loop stopped at the terminating NUL without
// ids.contains(str, goodlength) succeeding, the next outer iteration still
// returns goodlength -- presumably an intentional best-effort fallback;
// confirm against callers before relying on it.
}
// Search to find a subsequent legal char failed so return the minlength.
return minlength;
}
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
assert(id < this->size());
return unichars[id].representation;
@ -75,8 +98,7 @@ const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
if (!ids.contains(unichar_repr)) {
if (size_used == size_reserved)
{
if (size_used == size_reserved) {
if (size_used == 0)
reserve(8);
else
@ -84,6 +106,11 @@ void UNICHARSET::unichar_insert(const char* const unichar_repr) {
}
strcpy(unichars[size_used].representation, unichar_repr);
this->set_isalpha(size_used, false);
this->set_islower(size_used, false);
this->set_isupper(size_used, false);
this->set_isdigit(size_used, false);
this->unichars[size_used].properties.enabled = true;
ids.insert(unichar_repr, size_used);
++size_used;
}
@ -93,6 +120,10 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
return ids.contains(unichar_repr);
}
// Return true if the first length bytes of unichar_repr form a unichar
// representation present in this set. The bytes need not be NUL-terminated
// at position length.
bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
return ids.contains(unichar_repr, length);
}
// Return true if the stored representation of unichar_id is exactly the
// string unichar_repr. unichar_id must be a valid id (id_to_unichar asserts
// id < size()).
bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
}
@ -135,8 +166,7 @@ bool UNICHARSET::load_from_file(const char* filename) {
this->clear();
if (fgets(buffer, sizeof (buffer), file) == NULL ||
sscanf(buffer, "%d", &unicharset_size) != 1)
{
sscanf(buffer, "%d", &unicharset_size) != 1) {
fclose(file);
return false;
}
@ -146,8 +176,7 @@ bool UNICHARSET::load_from_file(const char* filename) {
unsigned int properties;
if (fgets(buffer, sizeof (buffer), file) == NULL ||
sscanf(buffer, "%s %x", unichar, &properties) != 2)
{
sscanf(buffer, "%s %x", unichar, &properties) != 2) {
fclose(file);
return false;
}
@ -160,7 +189,45 @@ bool UNICHARSET::load_from_file(const char* filename) {
this->set_islower(id, properties & ISLOWER_MASK);
this->set_isupper(id, properties & ISUPPER_MASK);
this->set_isdigit(id, properties & ISDIGIT_MASK);
this->unichars[id].properties.enabled = true;
}
fclose(file);
return true;
}
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
const char* whitelist) {
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
// Set everything to default
for (int ch = 0; ch < size_used; ++ch)
unichars[ch].properties.enabled = def_enabled;
int ch_step;
if (!def_enabled) {
// Enable the whitelist.
for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
ch_step = step(whitelist + w_ind);
if (ch_step > 0) {
UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
unichars[u_id].properties.enabled = true;
} else {
ch_step = 1;
}
}
}
if (blacklist != NULL && blacklist[0] != '\0') {
// Disable the blacklist.
for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
ch_step = step(blacklist + b_ind);
if (ch_step > 0) {
UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
unichars[u_id].properties.enabled = false;
} else {
ch_step = 1;
}
}
}
}

View File

@ -43,6 +43,12 @@ class UNICHARSET {
const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
int length) const;
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
// is both a short and a long match to the string, return the length that
// ensures there is a legal match after it.
int step(const char* str) const;
// Return the unichar representation corresponding to the given UNICHAR_ID
// within the UNICHARSET.
const char* const id_to_unichar(UNICHAR_ID id) const;
@ -52,6 +58,7 @@ class UNICHARSET {
// Return true if the given unichar representation exists within the set.
bool contains_unichar(const char* const unichar_repr);
bool contains_unichar(const char* const unichar_repr, int length);
// Return true if the given unichar representation corresponds to the given
// UNICHAR_ID within the set.
@ -84,6 +91,15 @@ class UNICHARSET {
// true if the operation is successful.
bool load_from_file(const char* const filename);
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
// The blacklist overrides the whitelist.
// Each list is a string of utf8 character strings. Boundaries between
// unicharset units are worked out automatically, and characters not in
// the unicharset are silently ignored.
void set_black_and_whitelist(const char* blacklist, const char* whitelist);
// Set the isalpha property of the given unichar to the given value.
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
unichars[unichar_id].properties.isalpha = value;
@ -172,6 +188,11 @@ class UNICHARSET {
return get_isdigit(unichar_to_id(unichar_repr, length));
}
// Return the enabled property of the given unichar.
bool get_enabled(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.enabled;
}
private:
struct UNICHAR_PROPERTIES {
@ -179,6 +200,7 @@ class UNICHARSET {
bool islower;
bool isupper;
bool isdigit;
bool enabled;
};
struct UNICHAR_SLOT {

View File

@ -59,6 +59,10 @@
/* define pad used to snap near horiz/vertical protos to horiz/vertical */
#define HV_TOLERANCE (0.0025) /* approx 0.9 degrees */
const int kInputSize = 16;
//extern int input_unicode[kInputSize];
int input_unicode[kInputSize];
typedef enum
{ StartSwitch, EndSwitch, LastSwitch }
SWITCH_TYPE;
@ -872,6 +876,7 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
int i, j, x, y, z;
int nread;
int unicharset_size;
int version_id = 0;
INT_TEMPLATES Templates;
CLASS_PRUNER Pruner;
INT_CLASS Class;
@ -900,6 +905,12 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
unicharset_size, unicharset.size());
exit(1);
}
if (Templates->NumClasses < 0) {
// This file has a version id!
version_id = -Templates->NumClasses;
if (fread(&Templates->NumClasses, sizeof(int), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
}
for (i = 0; i < unicharset_size; ++i) {
if (fread(&Templates->IndexFor[i], sizeof(CLASS_INDEX), 1, File) != 1)
cprintf("Bad read of inttemp!\n");
@ -944,10 +955,13 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
for (j = 0; j <= MAX_NUM_PROTO_SETS; ++j) {
int junk;
if (fread(&junk, sizeof(junk), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
if (version_id == 0) {
// Only version 0 writes 5 pointless pointers to the file.
for (j = 0; j < 5; ++j) {
int junk;
if (fread(&junk, sizeof(junk), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
}
}
for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
if (fread(&Class->ConfigLengths[j], sizeof(UINT16), 1, File) != 1)
@ -1072,11 +1086,13 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
int i, j;
INT_CLASS Class;
int unicharset_size = target_unicharset.size();
int version_id = -1; // Turns positive on reading.
/* first write the high level template struct */
fwrite((char *) &unicharset_size, sizeof (int), 1, File);
fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
fwrite((char *) &version_id, sizeof (int), 1, File);
fwrite((char *) &Templates->NumClassPruners, sizeof (int), 1, File);
fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
fwrite((char *) &Templates->IndexFor[0], sizeof (CLASS_INDEX),
unicharset_size, File);
fwrite((char *) &Templates->ClassIdFor[0], sizeof (CLASS_ID),
@ -1092,7 +1108,12 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
Class = ClassForIndex (Templates, i);
/* first write out the high level struct for the class */
fwrite ((char *) Class, sizeof (INT_CLASS_STRUCT), 1, File);
fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
fwrite(&Class->ConfigLengths[j], sizeof(UINT16), 1, File);
}
/* then write out the proto lengths */
fwrite ((char *) (Class->ProtoLengths), sizeof (UINT8),
@ -1546,7 +1567,7 @@ FLOAT32 AnglePad, PROTO Proto, TABLE_FILLER * Filler)
else {
/* diagonal proto */
if (Angle > 0.0 && Angle < 0.25 || Angle > 0.5 && Angle < 0.75) {
if ((Angle > 0.0 && Angle < 0.25) || (Angle > 0.5 && Angle < 0.75)) {
/* rising diagonal proto */
Angle *= 2.0 * PI;
Cos = fabs (cos (Angle));
@ -1736,17 +1757,19 @@ void RenderIntProto(void *window,
Xmin = Ymin = NUM_PP_BUCKETS;
Xmax = Ymax = 0;
for (Bucket = 0; Bucket < NUM_PP_BUCKETS; Bucket++) {
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex])
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex]) {
if (Bucket < Xmin)
Xmin = Bucket;
else if (Bucket > Xmax)
Xmax = Bucket;
else if (Bucket > Xmax)
Xmax = Bucket;
}
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex])
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex]) {
if (Bucket < Ymin)
Ymin = Bucket;
else if (Bucket > Ymax)
Ymax = Bucket;
else if (Bucket > Ymax)
Ymax = Bucket;
}
}
X = (Xmin + Xmax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;
Y = (Ymin + Ymax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;

View File

@ -39,7 +39,7 @@
#define MAX_PROTO_INDEX 24
#define BITS_PER_WERD (8 * sizeof (UINT32))
#define MAX_NUM_CONFIGS 32
#define MAX_NUM_PROTOS 256
#define MAX_NUM_PROTOS 512
#define PROTOS_PER_PROTO_SET 64
#define MAX_NUM_PROTO_SETS (MAX_NUM_PROTOS / PROTOS_PER_PROTO_SET)
#define NUM_PP_PARAMS 3

View File

@ -31,6 +31,7 @@
#include "emalloc.h"
#include "freelist.h"
#include "callcpp.h"
#include "tprintf.h"
#include "adaptmatch.h"
#include "scanutils.h"
#include "globals.h"
@ -122,6 +123,10 @@ int AddProtoToClass(CLASS_TYPE Class) {
}
NewProto = NumProtosIn (Class);
NumProtosIn (Class)++;
if (NumProtosIn(Class) > MAX_NUM_PROTOS) {
tprintf("Ouch! number of protos = %d, vs max of %d!",
NumProtosIn(Class), MAX_NUM_PROTOS);
}
return (NewProto);
}

View File

@ -32,6 +32,21 @@
#include <string.h>
#include <math.h>
// Initialize probability_in_context to point to a default implementation (a
// main program can override this).
PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;
// Default no-op implementation of the context-probability hook: ignores all
// of its arguments and reports a probability of 0.0. A main program can
// install a real language model by reassigning probability_in_context.
double def_probability_in_context(const char* context,
                                  int context_bytes,
                                  const char* character,
                                  int character_bytes) {
  // Discard every parameter explicitly to avoid unused-parameter warnings.
  (void) context; (void) context_bytes;
  (void) character; (void) character_bytes;
  return 0.0;
}
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
@ -85,8 +100,15 @@ int punctuation_ok(const char *word, const char *lengths) {
for (x = 0; x < 5; x++)
punctuation_types[x] = 0;
// check for un-supported symbols
for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
// a un-supported symbol
if (!unicharset.contains_unichar (word + offset, lengths[x])) {
return -1;
}
}
for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
if (unicharset.get_isalpha (word + offset, lengths[x])) {
if (trailing &&
!(unicharset.get_isalpha (word + offset - lengths[x - 1], lengths[x - 1])

View File

@ -42,6 +42,18 @@ int case_ok(const char *word, const char *lengths);
void write_choice_line();
typedef double (*PROBABILITY_IN_CONTEXT_FUNCTION)(const char* context,
int context_bytes,
const char* character,
int character_bytes);
extern PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context;
extern double def_probability_in_context(const char* context,
int context_bytes,
const char* character,
int character_bytes);
/*
#if defined(__STDC__) || defined(__cplusplus)
# define _ARGS(s) s

View File

@ -32,7 +32,8 @@
#endif
#include "dawg.h"
#include "cutil.h"
#include "callcpp.h"
#include "tprintf.h"
#include "freelist.h"
#include "context.h"
#include "strngs.h"
#include "emalloc.h"
@ -297,10 +298,9 @@ void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node) {
/**********************************************************************
* read_squished_dawg
*
* Read the DAWG from a file
* Read the DAWG from a file and return it. Must be freed with memfree.
**********************************************************************/
void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
INT32 max_num_edges) {
EDGE_ARRAY read_squished_dawg(const char *filename) {
FILE *file;
EDGE_REF edge;
INT32 num_edges = 0;
@ -308,8 +308,6 @@ void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
if (debug) print_string ("read_debug");
clear_all_edges(dawg, edge, max_num_edges);
#ifdef __UNIX__
file = open_file (filename, "r");
#else
@ -317,23 +315,27 @@ void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
#endif
fread (&num_edges, sizeof (INT32), 1, file);
num_edges = ntohl(num_edges);
if (num_edges > max_num_edges || num_edges < 0) {
cprintf("Error: trying to read a DAWG '%s' that contains \
%d edges while the maximum is %d.\n", filename, num_edges, max_num_edges);
if (num_edges > MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE || num_edges < 0) {
tprintf("(ENDIAN)Error: trying to read a DAWG '%s' that contains "
"%d edges while the maximum is %d.\n",
filename, num_edges, MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE);
exit(1);
}
UINT32 *dawg_32 = (UINT32*) Emalloc(num_edges * sizeof (UINT32));
fread(&dawg_32[0], sizeof (UINT32), num_edges, file);
fclose(file);
EDGE_ARRAY dawg = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges);
for (edge = 0; edge < num_edges; ++edge)
dawg[edge] = ntohl(dawg_32[edge]);
Efree(dawg_32);
for (edge = 0; edge < num_edges; ++edge)
for (edge = 0; edge < num_edges; ++edge)
if (last_edge (dawg, edge)) node_count++;
return dawg;
}

View File

@ -336,8 +336,7 @@ INT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node);
void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node);
void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
INT32 max_num_edges);
EDGE_ARRAY read_squished_dawg(const char *filename);
INT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, INT32 char_index);

View File

@ -28,7 +28,7 @@
/* define the maximum number of classes defined for any matcher
and the maximum class id for any matcher. This must be changed
if more different classes need to be classified */
#define MAX_NUM_CLASSES 256
#define MAX_NUM_CLASSES 8192
#define MAX_CLASS_ID (MAX_NUM_CLASSES - 1)
/* a CLASS_ID is the ascii character to be associated with a class */

358
dict/permngram.cpp Normal file
View File

@ -0,0 +1,358 @@
///////////////////////////////////////////////////////////////////////
// File: permngram.cpp
// Description: Character n-gram permuter
// Author: Thomas Kielbus
// Created: Wed Sep 12 11:26:43 PDT 2007
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "const.h"
#include "permngram.h"
#include "permnum.h"
#include "debug.h"
#include "permute.h"
#include "dawg.h"
#include "tordvars.h"
#include "stopper.h"
#include "globals.h"
#include "context.h"
#include <math.h>
#include <ctype.h>
// Ratio to control the relative importance of the classifier and the ngram
// in the final score of a classification unit. Must be >= 0 and <= 1.
// A value of 1.0 uses only the shape classifier score.
// A value of 0.0 uses only the ngram score.
double_VAR(classifier_score_ngram_score_ratio,
0.7,
"");
// Rating adjustment multiplier for words not in the DAWG. Must be >= 1.
double_VAR(non_dawg_prefix_rating_adjustment,
1.5,
"");
// HypothesisPrefix represents a word prefix during the search of the
// character-level n-gram model based permuter.
// It holds the data needed to create the corresponding A_CHOICE.
// Note that the string stored in the _word data member always begin with a
// space character. This is used by the n-gram model to score the word.
// HypothesisPrefix also contains the node in the DAWG that is reached when
// searching for the corresponding prefix.
class HypothesisPrefix {
public:
// Creates the empty root prefix used to seed the search.
HypothesisPrefix();
// Extends prefix by one character choice; end_of_word marks the final
// character position and dawg is the dictionary used to validate prefixes.
HypothesisPrefix(const HypothesisPrefix& prefix,
A_CHOICE* choice,
bool end_of_word,
EDGE_ARRAY dawg);
// Combined classifier + n-gram rating; lower is better.
double rating() const {return rating_;}
double certainty() const {return certainty_;}
// The utf8 word text; always begins with a space character (see above).
const char* word() const {return word_;}
// Byte length of each appended character, indexed per character position.
const char* unichar_lengths() const {return unichar_lengths_;}
// Per-character certainties, parallel to unichar_lengths().
const float* certainty_array() const {return certainty_array_;}
// True while the prefix is still a valid path in the DAWG.
bool is_dawg_prefix() const {return is_dawg_prefix_;}
NODE_REF dawg_node() const {return dawg_node_;}
private:
double rating_;
double certainty_;
// +2 = one leading space plus the terminating NUL.
char word_[UNICHAR_LEN * MAX_WERD_LENGTH + 2];
char unichar_lengths_[MAX_WERD_LENGTH + 1];
float certainty_array_[MAX_WERD_LENGTH + 1];
NODE_REF dawg_node_;
bool is_dawg_prefix_;
};
// HypothesisPrefix is the class used as nodes in HypothesisPrefixLists
typedef HypothesisPrefix HypothesisPrefixListNode;
// HypothesisPrefixList maintains a sorted list of HypothesisPrefixes. The size
// is bounded by the argument given to the constructor.
// For the sake of simplicity, current implementation is not as efficient as it
// could be. The list is represented by a static array of pointers to its
// elements. All nodes are stored in positions from 0 to (size() - 1).
class HypothesisPrefixList {
public:
HypothesisPrefixList(int size_bound);
~HypothesisPrefixList();
// Inserts node in rating order, taking ownership; the node may be deleted
// immediately if the list is full and its rating is too poor.
void add_node(HypothesisPrefix* node);
int size() const {return _size;}
// Deletes every contained node and empties the list.
void clear();
// Returns the index-th best (lowest-rating) node; index must be < size().
const HypothesisPrefix& node(int index) {return *_list_nodes[index];}
private:
// _size_bound owned slots; the first _size are non-NULL and sorted by
// increasing rating.
HypothesisPrefix** _list_nodes;
int _size_bound;
int _size;
};
// Return the classifier_score_ngram_score_ratio for a given choice string.
// The classification decision for characters like comma and period should
// be based only on shape rather than on shape and n-gram score.
// Return 1.0 for them, the default classifier_score_ngram_score_ratio
// otherwise.
static double get_classifier_score_ngram_score_ratio(const char* choice);
// Permute the given char_choices using a character level n-gram model and
// return the best word choice found.
// This is performed by maintaining a HypothesisPrefixList of HypothesisPrefixes.
// For each character position, each possible character choice is appended to
// the best current prefixes to create the list of best prefixes at the next
// character position.
A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
float rating_limit,
EDGE_ARRAY dawg) {
// Words longer than MAX_WERD_LENGTH cannot fit in HypothesisPrefix's
// fixed-size buffers, so they are rejected outright (else branch below).
if (array_count (char_choices) <= MAX_WERD_LENGTH) {
CHOICES choices;
int char_index_max = array_count(char_choices);
// Two bounded beam lists, swapped each iteration: current_list holds the
// best prefixes of the current length, next_list collects extensions.
HypothesisPrefixList list_1(20);
HypothesisPrefixList list_2(20);
HypothesisPrefixList* current_list = &list_1;
HypothesisPrefixList* next_list = &list_2;
// Seed the search with the empty prefix.
HypothesisPrefix* initial_node = new HypothesisPrefix();
current_list->add_node(initial_node);
for (int char_index = 0; char_index < char_index_max; ++char_index) {
iterate_list(choices, (CHOICES) array_index(char_choices, char_index)) {
A_CHOICE* choice = (A_CHOICE *) first_node(choices);
for (int node_index = 0;
node_index < current_list->size();
++node_index) {
// Append this choice to the current node
HypothesisPrefix* new_node = new HypothesisPrefix(
current_list->node(node_index),
choice,
char_index == char_index_max - 1,
dawg);
// next_list takes ownership of new_node and may delete it at once
// if the beam is full and the rating is too poor.
next_list->add_node(new_node);
}
}
// Clear current list and switch lists
current_list->clear();
HypothesisPrefixList* temp_list = current_list;
current_list = next_list;
next_list = temp_list;
// Give up if the current best rating is worse than rating_limit
if (current_list->node(0).rating() > rating_limit)
return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
}
// The lists are rating-sorted, so node(0) is the best hypothesis.
const HypothesisPrefix& best_word = current_list->node(0);
// word() always begins with a space, so skip one byte when building the
// returned A_CHOICE.
A_CHOICE* best_choice = new_choice (best_word.word() + 1,
best_word.unichar_lengths(),
best_word.rating(),
best_word.certainty(), -1,
valid_word(best_word.word() + 1) ?
SYSTEM_DAWG_PERM : TOP_CHOICE_PERM);
LogNewWordChoice(best_choice, best_word.is_dawg_prefix() ?
1.0 : non_dawg_prefix_rating_adjustment,
const_cast<float*>(best_word.certainty_array()));
return best_choice;
} else {
// Word too long for the fixed buffers: return an empty worst-rated choice.
return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
}
}
// The classification decision for comma and period should rest on shape
// alone, so they get a ratio of 1.0 (pure classifier score); every other
// choice uses the configurable default ratio.
double get_classifier_score_ngram_score_ratio(const char* choice) {
  const bool shape_only =
      strcmp(choice, ",") == 0 || strcmp(choice, ".") == 0;
  return shape_only ? 1.0 : classifier_score_ngram_score_ratio;
}
// Initial HypothesisPrefix constructor: builds the root of the search, an
// empty prefix whose word is just the leading space the n-gram model
// expects, positioned at the DAWG root node.
HypothesisPrefix::HypothesisPrefix()
    : rating_(0),
      certainty_(MAXFLOAT),
      dawg_node_(0),
      is_dawg_prefix_(true) {
  strcpy(word_, " ");
  unichar_lengths_[0] = '\0';
}
// Main constructor to create a new HypothesisPrefix by appending a character
// choice (A_CHOICE) to an existing HypothesisPrefix. This constructor takes
// care of copying the original prefix's data members, appends the character
// choice to the word and updates its rating using a character-level n-gram
// model. The state in the DAWG is also updated.
HypothesisPrefix::HypothesisPrefix(const HypothesisPrefix& prefix,
A_CHOICE* choice,
bool end_of_word,
EDGE_ARRAY dawg) {
char* word_ptr = word_;
const char* prefix_word_ptr = prefix.word_;
// Copy first space character
*(word_ptr++) = *(prefix_word_ptr++);
// Copy existing word, unichar_lengths, certainty_array
int char_index;
for (char_index = 0;
prefix.unichar_lengths_[char_index] != '\0';
++char_index) {
// Copy each byte of this character individually.
for (int char_subindex = 0;
char_subindex < prefix.unichar_lengths_[char_index];
++char_subindex) {
*(word_ptr++) = *(prefix_word_ptr++);
}
unichar_lengths_[char_index] = prefix.unichar_lengths_[char_index];
certainty_array_[char_index] = prefix.certainty_array_[char_index];
}
// If choice is empty, use a space character instead
const char* class_string_choice = *class_string(choice) == '\0' ?
" " : class_string(choice);
// Update certainty: the word certainty is the worst (minimum) of its
// characters' certainties.
certainty_ = min(prefix.certainty_, class_certainty(choice));
// Append choice to the word
strcpy(word_ptr, class_string_choice);
unichar_lengths_[char_index] = strlen(class_string_choice);
unichar_lengths_[char_index + 1] = '\0';
// Append choice certainty to the certainty array
certainty_array_[char_index] = class_certainty(choice);
// Copy DAWG node state
dawg_node_ = prefix.dawg_node_;
is_dawg_prefix_ = prefix.is_dawg_prefix_;
// Verify DAWG and update dawg_node_ if the current prefix is already valid
if (is_dawg_prefix_) {
for (int char_subindex = 0;
class_string_choice[char_subindex] != '\0';
++char_subindex) {
// Verify each byte of the appended character. Note that word_ptr points
// to the first byte so (word_ptr - (word_ + 1)) is the index of the first
// new byte in the string that starts at (word_ + 1).
int current_byte_index = word_ptr - (word_ + 1) + char_subindex;
if(!letter_is_okay(dawg, &dawg_node_, current_byte_index, '\0',
word_ + 1, end_of_word &&
class_string_choice[char_subindex + 1] == '\0')) {
// The extended word is not a dictionary prefix: mark it and stop.
dawg_node_ = NO_EDGE;
is_dawg_prefix_ = false;
break;
}
}
}
// Copy the prefix rating
rating_ = prefix.rating_;
// Compute rating of current character
double probability = probability_in_context(prefix.word_, -1,
class_string_choice, -1);
// If last character of the word, take the following space into account
if (end_of_word)
probability *= probability_in_context(word_, -1, " ", -1);
double local_classifier_score_ngram_score_ratio =
get_classifier_score_ngram_score_ratio(class_string_choice);
double classifier_rating = class_probability(choice);
// N-gram rating is the information content in bits: -log2(probability).
double ngram_rating = -log(probability) / log(2.0);
// Blend the classifier and n-gram scores by the (possibly per-character)
// ratio; 1.0 means classifier-only.
double mixed_rating =
local_classifier_score_ngram_score_ratio * classifier_rating +
(1 - local_classifier_score_ngram_score_ratio) * ngram_rating;
// If the current word is not a valid prefix, adjust the rating of the
// character being appended. If it used to be a valid prefix, compensate for
// previous adjustments.
if (!is_dawg_prefix_) {
if (prefix.is_dawg_prefix_)
rating_ *= non_dawg_prefix_rating_adjustment;
mixed_rating *= non_dawg_prefix_rating_adjustment;
}
// Update rating by adding the rating of the character being appended.
rating_ += mixed_rating;
}
// Construct an empty HypothesisPrefixList whose capacity is fixed at
// size_bound nodes; every slot starts out NULL.
HypothesisPrefixList::HypothesisPrefixList(int size_bound)
    : _size_bound(size_bound),
      _size(0) {
  _list_nodes = new HypothesisPrefix*[_size_bound];
  for (int slot = 0; slot < _size_bound; ++slot)
    _list_nodes[slot] = NULL;
}
// Tear down the list: clear() frees every node still owned, then the
// slot array itself is released.
HypothesisPrefixList::~HypothesisPrefixList() {
  clear();
  delete[] _list_nodes;
}
// Add a node to the HypothesisPrefixList. Maintains the sorted list property.
// Note that the HypothesisPrefixList takes ownership of the given node and
// might delete it if needed. It must therefore have been allocated on the heap.
void HypothesisPrefixList::add_node(HypothesisPrefix* node) {
// Detect nodes that have a worse rating than the current maximum and treat
// them separately.
if (_size > 0 && _list_nodes[_size - 1]->rating() < node->rating()) {
if (_size == _size_bound) {
// The list is already full. This node will not be added
delete node;
} else {
// The list is not full. Add the node at the last position.
_list_nodes[_size] = node;
++_size;
}
return;
}
// Find the correct position: first slot whose rating is not better than
// the new node's (or the first empty slot).
int node_index_target = 0;
while (node_index_target < _size_bound &&
_list_nodes[node_index_target] != NULL &&
_list_nodes[node_index_target]->rating() < node->rating()) {
++node_index_target;
}
if (node_index_target >= _size_bound) {
// No slot available: the node is discarded (ownership was transferred).
delete node;
} else {
// Move next states by 1. Starting from the last one.
int node_index_move = _size - 1;
while (node_index_move >= node_index_target) {
// The node falling off the end of a full list is deleted.
if (node_index_move == _size_bound - 1)
delete _list_nodes[node_index_move];
else
_list_nodes[node_index_move + 1] = _list_nodes[node_index_move];
_list_nodes[node_index_move] = NULL;
--node_index_move;
}
// Insert new node
_list_nodes[node_index_target] = node;
// Increment size if it has changed
if (_size < _size_bound)
++_size;
}
}
// Delete all contained nodes and set the size of the HypothesisPrefixList to 0
void HypothesisPrefixList::clear() {
for (int i = 0; i < _size_bound && _list_nodes[i] != NULL; ++i) {
delete _list_nodes[i];
_list_nodes[i] = NULL;
}
_size = 0;
}

33
dict/permngram.h Normal file
View File

@ -0,0 +1,33 @@
///////////////////////////////////////////////////////////////////////
// File: permngram.h
// Description: Character n-gram permuter
// Author: Thomas Kielbus
// Created: Wed Sep 12 11:26:42 PDT 2007
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef PERMNGRAM_H
#define PERMNGRAM_H
#include "choicearr.h"
#include "dawg.h"
// Permute the given char_choices using a character level n-gram model and
// return the best word choice found. The given dawg is used to determine
// which choices are contained in the dictionary.
A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
float rating_limit,
EDGE_ARRAY dawg);
#endif // PERMNGRAM_H

View File

@ -437,9 +437,9 @@ void DebugWordChoices() {
char LabelString[80];
if (StopperDebugLevel >= 1 ||
WordToDebug && BestChoices &&
(WordToDebug && BestChoices &&
StringSameAs (WordToDebug, WordToDebug_lengths,
(VIABLE_CHOICE) first_node (BestChoices))) {
(VIABLE_CHOICE) first_node (BestChoices)))) {
if (BestRawChoice)
PrintViableChoice (stderr, "\nBest Raw Choice: ", BestRawChoice);
@ -731,11 +731,12 @@ FLOAT32 AdjustFactor, float Certainties[]) {
NewChoice = NULL;
Choices = BestChoices;
iterate(Choices) {
if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first_node (Choices)))
if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first_node (Choices))) {
if (class_probability (Choice) < BestRating (Choices))
NewChoice = (VIABLE_CHOICE) first_node (Choices);
else
return;
else
return;
}
}
if (NewChoice) {
@ -1087,11 +1088,14 @@ AMBIG_TABLE *FillAmbigTable() {
TestString_lengths[0] = 0;
ReplacementString[0] = '\0';
ReplacementString_lengths[0] = 0;
bool illegal_char = false;
for (i = 0; i < AmbigPartSize; ++i) {
fscanf (AmbigFile, "%s", buffer);
strcat(TestString, buffer);
lengths[0] = strlen(buffer);
strcat(TestString_lengths, lengths);
if (!unicharset.contains_unichar(buffer))
illegal_char = true;
}
fscanf (AmbigFile, "%d", &AmbigPartSize);
for (i = 0; i < AmbigPartSize; ++i) {
@ -1099,11 +1103,16 @@ AMBIG_TABLE *FillAmbigTable() {
strcat(ReplacementString, buffer);
lengths[0] = strlen(buffer);
strcat(ReplacementString_lengths, lengths);
if (!unicharset.contains_unichar(buffer))
illegal_char = true;
}
if (strlen (TestString_lengths) > MAX_AMBIG_SIZE ||
strlen (ReplacementString_lengths) > MAX_AMBIG_SIZE)
DoError (0, "Illegal ambiguity specification!");
if (illegal_char) {
continue;
}
AmbigSpec = (AMBIG_SPEC *) Emalloc (sizeof (AMBIG_SPEC));

View File

@ -187,7 +187,7 @@ void add_word_to_dawg(EDGE_ARRAY dawg,
break;
}
if (edges_in_node (dawg, last_node) + last_node == the_next_node) {
cprintf ("Node collision at %d\n", the_next_node);
//cprintf ("Node collision at %d\n", the_next_node);
the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
max_num_edges, reserved_edges);
if (the_next_node == 0) {