mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-08-06 13:56:47 +08:00
Major internationalization improvements
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
aa55810b6b
commit
2a678305c6
@ -24,7 +24,7 @@
|
||||
|
||||
// Maximum number of characters that can be stored in a UNICHAR. Must be
|
||||
// at least 4. Must not exceed 31 without changing the coding of length.
|
||||
#define UNICHAR_LEN 8
|
||||
#define UNICHAR_LEN 24
|
||||
|
||||
// A UNICHAR_ID is the unique id of a unichar.
|
||||
typedef int UNICHAR_ID;
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
#include <assert.h>
|
||||
#include "unichar.h"
|
||||
|
||||
#include "host.h"
|
||||
#include "unicharmap.h"
|
||||
|
||||
UNICHARMAP::UNICHARMAP() :
|
||||
@ -135,6 +135,22 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
|
||||
}
|
||||
|
||||
// Return the minimum number of characters that must be used from this string
|
||||
// to obtain a match in the UNICHARMAP.
|
||||
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
|
||||
const char* current_char = unichar_repr;
|
||||
UNICHARMAP_NODE* current_nodes = nodes;
|
||||
|
||||
while (current_nodes != NULL && *current_char != '\0') {
|
||||
if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
|
||||
return current_char + 1 - unichar_repr;
|
||||
current_nodes =
|
||||
current_nodes[static_cast<unsigned char>(*current_char)].children;
|
||||
++current_char;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void UNICHARMAP::clear() {
|
||||
if (nodes != 0)
|
||||
{
|
||||
|
@ -56,6 +56,10 @@ class UNICHARMAP {
|
||||
// used. The length MUST be non-zero.
|
||||
bool contains(const char* const unichar_repr, int length) const;
|
||||
|
||||
// Return the minimum number of characters that must be used from this string
|
||||
// to obtain a match in the UNICHARMAP.
|
||||
int minmatch(const char* const unichar_repr) const;
|
||||
|
||||
// Clear the UNICHARMAP. All previous data is lost.
|
||||
void clear();
|
||||
|
||||
|
@ -44,11 +44,10 @@ UNICHARSET::~UNICHARSET() {
|
||||
}
|
||||
|
||||
void UNICHARSET::reserve(int unichars_number) {
|
||||
if (unichars_number > size_reserved)
|
||||
{
|
||||
if (unichars_number > size_reserved) {
|
||||
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
|
||||
for (int i = 0; i < size_used; ++i)
|
||||
memcpy(&unichars_new[i], &unichars[i], sizeof (UNICHAR_SLOT));
|
||||
memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
|
||||
delete[] unichars;
|
||||
unichars = unichars_new;
|
||||
size_reserved = unichars_number;
|
||||
@ -68,6 +67,30 @@ const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
|
||||
return ids.unichar_to_id(unichar_repr, length);
|
||||
}
|
||||
|
||||
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
||||
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
|
||||
// is both a short and a long match to the string, return the length that
|
||||
// ensures there is a legal match after it.
|
||||
int UNICHARSET::step(const char* str) const {
|
||||
// Find the length of the first matching unicharset member.
|
||||
int minlength = ids.minmatch(str);
|
||||
if (minlength == 0)
|
||||
return 0; // Empty string or illegal char.
|
||||
|
||||
int goodlength = minlength;
|
||||
while (goodlength <= UNICHAR_LEN) {
|
||||
if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
|
||||
return goodlength; // This length works!
|
||||
// The next char is illegal so find the next usable length.
|
||||
do {
|
||||
++goodlength;
|
||||
} while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
|
||||
!ids.contains(str, goodlength));
|
||||
}
|
||||
// Search to find a subsequent legal char failed so return the minlength.
|
||||
return minlength;
|
||||
}
|
||||
|
||||
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
|
||||
assert(id < this->size());
|
||||
return unichars[id].representation;
|
||||
@ -75,8 +98,7 @@ const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
|
||||
|
||||
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
||||
if (!ids.contains(unichar_repr)) {
|
||||
if (size_used == size_reserved)
|
||||
{
|
||||
if (size_used == size_reserved) {
|
||||
if (size_used == 0)
|
||||
reserve(8);
|
||||
else
|
||||
@ -84,6 +106,11 @@ void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
||||
}
|
||||
|
||||
strcpy(unichars[size_used].representation, unichar_repr);
|
||||
this->set_isalpha(size_used, false);
|
||||
this->set_islower(size_used, false);
|
||||
this->set_isupper(size_used, false);
|
||||
this->set_isdigit(size_used, false);
|
||||
this->unichars[size_used].properties.enabled = true;
|
||||
ids.insert(unichar_repr, size_used);
|
||||
++size_used;
|
||||
}
|
||||
@ -93,6 +120,10 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
|
||||
return ids.contains(unichar_repr);
|
||||
}
|
||||
|
||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
|
||||
return ids.contains(unichar_repr, length);
|
||||
}
|
||||
|
||||
// Return true if the representation stored for unichar_id is byte-for-byte
// equal to the NUL-terminated string unichar_repr.
bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
  const char* const stored_repr = this->id_to_unichar(unichar_id);
  return !strcmp(stored_repr, unichar_repr);
}
|
||||
@ -135,8 +166,7 @@ bool UNICHARSET::load_from_file(const char* filename) {
|
||||
|
||||
this->clear();
|
||||
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
||||
sscanf(buffer, "%d", &unicharset_size) != 1)
|
||||
{
|
||||
sscanf(buffer, "%d", &unicharset_size) != 1) {
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
@ -146,8 +176,7 @@ bool UNICHARSET::load_from_file(const char* filename) {
|
||||
unsigned int properties;
|
||||
|
||||
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
||||
sscanf(buffer, "%s %x", unichar, &properties) != 2)
|
||||
{
|
||||
sscanf(buffer, "%s %x", unichar, &properties) != 2) {
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
@ -160,7 +189,45 @@ bool UNICHARSET::load_from_file(const char* filename) {
|
||||
this->set_islower(id, properties & ISLOWER_MASK);
|
||||
this->set_isupper(id, properties & ISUPPER_MASK);
|
||||
this->set_isdigit(id, properties & ISDIGIT_MASK);
|
||||
this->unichars[id].properties.enabled = true;
|
||||
}
|
||||
fclose(file);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Set a whitelist and/or blacklist of characters to recognize.
|
||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||
// An empty or NULL blacklist disables nothing.
|
||||
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
||||
const char* whitelist) {
|
||||
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
|
||||
// Set everything to default
|
||||
for (int ch = 0; ch < size_used; ++ch)
|
||||
unichars[ch].properties.enabled = def_enabled;
|
||||
int ch_step;
|
||||
if (!def_enabled) {
|
||||
// Enable the whitelist.
|
||||
for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
|
||||
ch_step = step(whitelist + w_ind);
|
||||
if (ch_step > 0) {
|
||||
UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
|
||||
unichars[u_id].properties.enabled = true;
|
||||
} else {
|
||||
ch_step = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (blacklist != NULL && blacklist[0] != '\0') {
|
||||
// Disable the blacklist.
|
||||
for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
|
||||
ch_step = step(blacklist + b_ind);
|
||||
if (ch_step > 0) {
|
||||
UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
|
||||
unichars[u_id].properties.enabled = false;
|
||||
} else {
|
||||
ch_step = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -43,6 +43,12 @@ class UNICHARSET {
|
||||
const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
|
||||
int length) const;
|
||||
|
||||
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
||||
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
|
||||
// is both a short and a long match to the string, return the length that
|
||||
// ensures there is a legal match after it.
|
||||
int step(const char* str) const;
|
||||
|
||||
// Return the unichar representation corresponding to the given UNICHAR_ID
|
||||
// within the UNICHARSET.
|
||||
const char* const id_to_unichar(UNICHAR_ID id) const;
|
||||
@ -52,6 +58,7 @@ class UNICHARSET {
|
||||
|
||||
// Return true if the given unichar representation exists within the set.
|
||||
bool contains_unichar(const char* const unichar_repr);
|
||||
bool contains_unichar(const char* const unichar_repr, int length);
|
||||
|
||||
// Return true if the given unichar representation corresponds to the given
|
||||
// UNICHAR_ID within the set.
|
||||
@ -84,6 +91,15 @@ class UNICHARSET {
|
||||
// true if the operation is successful.
|
||||
bool load_from_file(const char* const filename);
|
||||
|
||||
// Set a whitelist and/or blacklist of characters to recognize.
|
||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||
// An empty or NULL blacklist disables nothing.
|
||||
// The blacklist overrides the whitelist.
|
||||
// Each list is a string of utf8 character strings. Boundaries between
|
||||
// unicharset units are worked out automatically, and characters not in
|
||||
// the unicharset are silently ignored.
|
||||
void set_black_and_whitelist(const char* blacklist, const char* whitelist);
|
||||
|
||||
// Set the isalpha property of the given unichar to the given value.
|
||||
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
|
||||
unichars[unichar_id].properties.isalpha = value;
|
||||
@ -172,6 +188,11 @@ class UNICHARSET {
|
||||
return get_isdigit(unichar_to_id(unichar_repr, length));
|
||||
}
|
||||
|
||||
// Return the enabled property of the given unichar.
|
||||
bool get_enabled(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.enabled;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
struct UNICHAR_PROPERTIES {
|
||||
@ -179,6 +200,7 @@ class UNICHARSET {
|
||||
bool islower;
|
||||
bool isupper;
|
||||
bool isdigit;
|
||||
bool enabled;
|
||||
};
|
||||
|
||||
struct UNICHAR_SLOT {
|
||||
|
@ -59,6 +59,10 @@
|
||||
/* define pad used to snap near horiz/vertical protos to horiz/vertical */
|
||||
#define HV_TOLERANCE (0.0025) /* approx 0.9 degrees */
|
||||
|
||||
const int kInputSize = 16;
|
||||
//extern int input_unicode[kInputSize];
|
||||
int input_unicode[kInputSize];
|
||||
|
||||
typedef enum
|
||||
{ StartSwitch, EndSwitch, LastSwitch }
|
||||
SWITCH_TYPE;
|
||||
@ -872,6 +876,7 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
|
||||
int i, j, x, y, z;
|
||||
int nread;
|
||||
int unicharset_size;
|
||||
int version_id = 0;
|
||||
INT_TEMPLATES Templates;
|
||||
CLASS_PRUNER Pruner;
|
||||
INT_CLASS Class;
|
||||
@ -900,6 +905,12 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
|
||||
unicharset_size, unicharset.size());
|
||||
exit(1);
|
||||
}
|
||||
if (Templates->NumClasses < 0) {
|
||||
// This file has a version id!
|
||||
version_id = -Templates->NumClasses;
|
||||
if (fread(&Templates->NumClasses, sizeof(int), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
}
|
||||
for (i = 0; i < unicharset_size; ++i) {
|
||||
if (fread(&Templates->IndexFor[i], sizeof(CLASS_INDEX), 1, File) != 1)
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
@ -944,10 +955,13 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
|
||||
fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
|
||||
fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
for (j = 0; j <= MAX_NUM_PROTO_SETS; ++j) {
|
||||
int junk;
|
||||
if (fread(&junk, sizeof(junk), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
if (version_id == 0) {
|
||||
// Only version 0 writes 5 pointless pointers to the file.
|
||||
for (j = 0; j < 5; ++j) {
|
||||
int junk;
|
||||
if (fread(&junk, sizeof(junk), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
}
|
||||
}
|
||||
for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
|
||||
if (fread(&Class->ConfigLengths[j], sizeof(UINT16), 1, File) != 1)
|
||||
@ -1072,11 +1086,13 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
|
||||
int i, j;
|
||||
INT_CLASS Class;
|
||||
int unicharset_size = target_unicharset.size();
|
||||
int version_id = -1; // Turns positive on reading.
|
||||
|
||||
/* first write the high level template struct */
|
||||
fwrite((char *) &unicharset_size, sizeof (int), 1, File);
|
||||
fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
|
||||
fwrite((char *) &version_id, sizeof (int), 1, File);
|
||||
fwrite((char *) &Templates->NumClassPruners, sizeof (int), 1, File);
|
||||
fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
|
||||
fwrite((char *) &Templates->IndexFor[0], sizeof (CLASS_INDEX),
|
||||
unicharset_size, File);
|
||||
fwrite((char *) &Templates->ClassIdFor[0], sizeof (CLASS_ID),
|
||||
@ -1092,7 +1108,12 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
|
||||
Class = ClassForIndex (Templates, i);
|
||||
|
||||
/* first write out the high level struct for the class */
|
||||
fwrite ((char *) Class, sizeof (INT_CLASS_STRUCT), 1, File);
|
||||
fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
|
||||
fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
|
||||
fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
|
||||
for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
|
||||
fwrite(&Class->ConfigLengths[j], sizeof(UINT16), 1, File);
|
||||
}
|
||||
|
||||
/* then write out the proto lengths */
|
||||
fwrite ((char *) (Class->ProtoLengths), sizeof (UINT8),
|
||||
@ -1546,7 +1567,7 @@ FLOAT32 AnglePad, PROTO Proto, TABLE_FILLER * Filler)
|
||||
else {
|
||||
/* diagonal proto */
|
||||
|
||||
if (Angle > 0.0 && Angle < 0.25 || Angle > 0.5 && Angle < 0.75) {
|
||||
if ((Angle > 0.0 && Angle < 0.25) || (Angle > 0.5 && Angle < 0.75)) {
|
||||
/* rising diagonal proto */
|
||||
Angle *= 2.0 * PI;
|
||||
Cos = fabs (cos (Angle));
|
||||
@ -1736,17 +1757,19 @@ void RenderIntProto(void *window,
|
||||
Xmin = Ymin = NUM_PP_BUCKETS;
|
||||
Xmax = Ymax = 0;
|
||||
for (Bucket = 0; Bucket < NUM_PP_BUCKETS; Bucket++) {
|
||||
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex])
|
||||
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex]) {
|
||||
if (Bucket < Xmin)
|
||||
Xmin = Bucket;
|
||||
else if (Bucket > Xmax)
|
||||
Xmax = Bucket;
|
||||
else if (Bucket > Xmax)
|
||||
Xmax = Bucket;
|
||||
}
|
||||
|
||||
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex])
|
||||
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex]) {
|
||||
if (Bucket < Ymin)
|
||||
Ymin = Bucket;
|
||||
else if (Bucket > Ymax)
|
||||
Ymax = Bucket;
|
||||
else if (Bucket > Ymax)
|
||||
Ymax = Bucket;
|
||||
}
|
||||
}
|
||||
X = (Xmin + Xmax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;
|
||||
Y = (Ymin + Ymax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;
|
||||
|
@ -39,7 +39,7 @@
|
||||
#define MAX_PROTO_INDEX 24
|
||||
#define BITS_PER_WERD (8 * sizeof (UINT32))
|
||||
#define MAX_NUM_CONFIGS 32
|
||||
#define MAX_NUM_PROTOS 256
|
||||
#define MAX_NUM_PROTOS 512
|
||||
#define PROTOS_PER_PROTO_SET 64
|
||||
#define MAX_NUM_PROTO_SETS (MAX_NUM_PROTOS / PROTOS_PER_PROTO_SET)
|
||||
#define NUM_PP_PARAMS 3
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include "emalloc.h"
|
||||
#include "freelist.h"
|
||||
#include "callcpp.h"
|
||||
#include "tprintf.h"
|
||||
#include "adaptmatch.h"
|
||||
#include "scanutils.h"
|
||||
#include "globals.h"
|
||||
@ -122,6 +123,10 @@ int AddProtoToClass(CLASS_TYPE Class) {
|
||||
}
|
||||
NewProto = NumProtosIn (Class);
|
||||
NumProtosIn (Class)++;
|
||||
if (NumProtosIn(Class) > MAX_NUM_PROTOS) {
|
||||
tprintf("Ouch! number of protos = %d, vs max of %d!",
|
||||
NumProtosIn(Class), MAX_NUM_PROTOS);
|
||||
}
|
||||
return (NewProto);
|
||||
}
|
||||
|
||||
|
@ -32,6 +32,21 @@
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
// Initialize probability_in_context to point to a default implementation (a
|
||||
// main program can override this).
|
||||
PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;
|
||||
|
||||
// Default implementation behind probability_in_context. It ignores all of
// its arguments and always reports a probability of 0.0.
double def_probability_in_context(const char* /* context */,
                                  int /* context_bytes */,
                                  const char* /* character */,
                                  int /* character_bytes */) {
  const double kDefaultProbability = 0.0;
  return kDefaultProbability;
}
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
V a r i a b l e s
|
||||
----------------------------------------------------------------------*/
|
||||
@ -85,8 +100,15 @@ int punctuation_ok(const char *word, const char *lengths) {
|
||||
for (x = 0; x < 5; x++)
|
||||
punctuation_types[x] = 0;
|
||||
|
||||
// check for un-supported symbols
|
||||
for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
|
||||
// a un-supported symbol
|
||||
if (!unicharset.contains_unichar (word + offset, lengths[x])) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
|
||||
if (unicharset.get_isalpha (word + offset, lengths[x])) {
|
||||
if (trailing &&
|
||||
!(unicharset.get_isalpha (word + offset - lengths[x - 1], lengths[x - 1])
|
||||
|
@ -42,6 +42,18 @@ int case_ok(const char *word, const char *lengths);
|
||||
|
||||
void write_choice_line();
|
||||
|
||||
typedef double (*PROBABILITY_IN_CONTEXT_FUNCTION)(const char* context,
|
||||
int context_bytes,
|
||||
const char* character,
|
||||
int character_bytes);
|
||||
|
||||
extern PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context;
|
||||
|
||||
extern double def_probability_in_context(const char* context,
|
||||
int context_bytes,
|
||||
const char* character,
|
||||
int character_bytes);
|
||||
|
||||
/*
|
||||
#if defined(__STDC__) || defined(__cplusplus)
|
||||
# define _ARGS(s) s
|
||||
|
@ -32,7 +32,8 @@
|
||||
#endif
|
||||
#include "dawg.h"
|
||||
#include "cutil.h"
|
||||
#include "callcpp.h"
|
||||
#include "tprintf.h"
|
||||
#include "freelist.h"
|
||||
#include "context.h"
|
||||
#include "strngs.h"
|
||||
#include "emalloc.h"
|
||||
@ -297,10 +298,9 @@ void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node) {
|
||||
/**********************************************************************
|
||||
* read_squished_dawg
|
||||
*
|
||||
* Read the DAWG from a file
|
||||
* Read the DAWG from a file and return it. Must be freed with memfree.
|
||||
**********************************************************************/
|
||||
void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
|
||||
INT32 max_num_edges) {
|
||||
EDGE_ARRAY read_squished_dawg(const char *filename) {
|
||||
FILE *file;
|
||||
EDGE_REF edge;
|
||||
INT32 num_edges = 0;
|
||||
@ -308,8 +308,6 @@ void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
|
||||
|
||||
if (debug) print_string ("read_debug");
|
||||
|
||||
clear_all_edges(dawg, edge, max_num_edges);
|
||||
|
||||
#ifdef __UNIX__
|
||||
file = open_file (filename, "r");
|
||||
#else
|
||||
@ -317,23 +315,27 @@ void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
|
||||
#endif
|
||||
fread (&num_edges, sizeof (INT32), 1, file);
|
||||
num_edges = ntohl(num_edges);
|
||||
if (num_edges > max_num_edges || num_edges < 0) {
|
||||
cprintf("Error: trying to read a DAWG '%s' that contains \
|
||||
%d edges while the maximum is %d.\n", filename, num_edges, max_num_edges);
|
||||
if (num_edges > MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE || num_edges < 0) {
|
||||
tprintf("(ENDIAN)Error: trying to read a DAWG '%s' that contains "
|
||||
"%d edges while the maximum is %d.\n",
|
||||
filename, num_edges, MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
UINT32 *dawg_32 = (UINT32*) Emalloc(num_edges * sizeof (UINT32));
|
||||
fread(&dawg_32[0], sizeof (UINT32), num_edges, file);
|
||||
fclose(file);
|
||||
EDGE_ARRAY dawg = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges);
|
||||
|
||||
for (edge = 0; edge < num_edges; ++edge)
|
||||
dawg[edge] = ntohl(dawg_32[edge]);
|
||||
|
||||
Efree(dawg_32);
|
||||
|
||||
for (edge = 0; edge < num_edges; ++edge)
|
||||
for (edge = 0; edge < num_edges; ++edge)
|
||||
if (last_edge (dawg, edge)) node_count++;
|
||||
|
||||
return dawg;
|
||||
}
|
||||
|
||||
|
||||
|
@ -336,8 +336,7 @@ INT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node);
|
||||
|
||||
void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node);
|
||||
|
||||
void read_squished_dawg(const char *filename, EDGE_ARRAY dawg,
|
||||
INT32 max_num_edges);
|
||||
EDGE_ARRAY read_squished_dawg(const char *filename);
|
||||
|
||||
INT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, INT32 char_index);
|
||||
|
||||
|
@ -28,7 +28,7 @@
|
||||
/* define the maximum number of classes defined for any matcher
|
||||
and the maximum class id for any matcher. This must be changed
|
||||
if more different classes need to be classified */
|
||||
#define MAX_NUM_CLASSES 256
|
||||
#define MAX_NUM_CLASSES 8192
|
||||
#define MAX_CLASS_ID (MAX_NUM_CLASSES - 1)
|
||||
|
||||
/* a CLASS_ID is the ascii character to be associated with a class */
|
||||
|
358
dict/permngram.cpp
Normal file
358
dict/permngram.cpp
Normal file
@ -0,0 +1,358 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: permngram.cpp
|
||||
// Description: Character n-gram permuter
|
||||
// Author: Thomas Kielbus
|
||||
// Created: Wed Sep 12 11:26:43 PDT 2007
|
||||
//
|
||||
// (C) Copyright 2007, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "const.h"
|
||||
#include "permngram.h"
|
||||
#include "permnum.h"
|
||||
#include "debug.h"
|
||||
#include "permute.h"
|
||||
#include "dawg.h"
|
||||
#include "tordvars.h"
|
||||
#include "stopper.h"
|
||||
#include "globals.h"
|
||||
#include "context.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <ctype.h>
|
||||
|
||||
// Ratio to control the relative importance of the classifier and the ngram
|
||||
// in the final score of a classification unit. Must be >= 0 and <= 1.
|
||||
// A value of 1.0 uses only the shape classifier score.
|
||||
// A value of 0.0 uses only the ngram score.
|
||||
double_VAR(classifier_score_ngram_score_ratio,
|
||||
0.7,
|
||||
"");
|
||||
|
||||
// Rating adjustment multiplier for words not in the DAWG. Must be >= 1.
|
||||
double_VAR(non_dawg_prefix_rating_adjustment,
|
||||
1.5,
|
||||
"");
|
||||
|
||||
// HypothesisPrefix represents a word prefix during the search of the
|
||||
// character-level n-gram model based permuter.
|
||||
// It holds the data needed to create the corresponding A_CHOICE.
|
||||
// Note that the string stored in the word_ data member always begins with a
|
||||
// space character. This is used by the n-gram model to score the word.
|
||||
// HypothesisPrefix also contains the node in the DAWG that is reached when
|
||||
// searching for the corresponding prefix.
|
||||
class HypothesisPrefix {
|
||||
public:
|
||||
HypothesisPrefix();
|
||||
HypothesisPrefix(const HypothesisPrefix& prefix,
|
||||
A_CHOICE* choice,
|
||||
bool end_of_word,
|
||||
EDGE_ARRAY dawg);
|
||||
|
||||
double rating() const {return rating_;}
|
||||
double certainty() const {return certainty_;}
|
||||
const char* word() const {return word_;}
|
||||
const char* unichar_lengths() const {return unichar_lengths_;}
|
||||
const float* certainty_array() const {return certainty_array_;}
|
||||
bool is_dawg_prefix() const {return is_dawg_prefix_;}
|
||||
NODE_REF dawg_node() const {return dawg_node_;}
|
||||
|
||||
private:
|
||||
double rating_;
|
||||
double certainty_;
|
||||
char word_[UNICHAR_LEN * MAX_WERD_LENGTH + 2];
|
||||
char unichar_lengths_[MAX_WERD_LENGTH + 1];
|
||||
float certainty_array_[MAX_WERD_LENGTH + 1];
|
||||
NODE_REF dawg_node_;
|
||||
bool is_dawg_prefix_;
|
||||
};
|
||||
|
||||
// HypothesisPrefix is the class used as nodes in HypothesisPrefixLists
|
||||
typedef HypothesisPrefix HypothesisPrefixListNode;
|
||||
|
||||
// HypothesisPrefixList maintains a sorted list of HypothesisPrefixes. The size
|
||||
// is bounded by the argument given to the constructor.
|
||||
// For the sake of simplicity, current implementation is not as efficient as it
|
||||
// could be. The list is represented by a static array of pointers to its
|
||||
// elements. All nodes are stored in positions from 0 to (size() - 1).
|
||||
class HypothesisPrefixList {
|
||||
public:
|
||||
HypothesisPrefixList(int size_bound);
|
||||
~HypothesisPrefixList();
|
||||
|
||||
void add_node(HypothesisPrefix* node);
|
||||
int size() const {return _size;}
|
||||
void clear();
|
||||
const HypothesisPrefix& node(int index) {return *_list_nodes[index];}
|
||||
|
||||
private:
|
||||
HypothesisPrefix** _list_nodes;
|
||||
int _size_bound;
|
||||
int _size;
|
||||
};
|
||||
|
||||
// Return the classifier_score_ngram_score_ratio for a given choice string.
|
||||
// The classification decision for characters like comma and period should
|
||||
// be based only on shape rather than on shape and n-gram score.
|
||||
// Return 1.0 for them, the default classifier_score_ngram_score_ratio
|
||||
// otherwise.
|
||||
static double get_classifier_score_ngram_score_ratio(const char* choice);
|
||||
|
||||
// Permute the given char_choices using a character level n-gram model and
|
||||
// return the best word choice found.
|
||||
// This is performed by maintaining a HypothesisPrefixList of HypothesisPrefixes.
|
||||
// For each character position, each possible character choice is appended to
|
||||
// the best current prefixes to create the list of best prefixes at the next
|
||||
// character position.
|
||||
A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
|
||||
float rating_limit,
|
||||
EDGE_ARRAY dawg) {
|
||||
if (array_count (char_choices) <= MAX_WERD_LENGTH) {
|
||||
CHOICES choices;
|
||||
int char_index_max = array_count(char_choices);
|
||||
HypothesisPrefixList list_1(20);
|
||||
HypothesisPrefixList list_2(20);
|
||||
HypothesisPrefixList* current_list = &list_1;
|
||||
HypothesisPrefixList* next_list = &list_2;
|
||||
HypothesisPrefix* initial_node = new HypothesisPrefix();
|
||||
current_list->add_node(initial_node);
|
||||
for (int char_index = 0; char_index < char_index_max; ++char_index) {
|
||||
iterate_list(choices, (CHOICES) array_index(char_choices, char_index)) {
|
||||
A_CHOICE* choice = (A_CHOICE *) first_node(choices);
|
||||
for (int node_index = 0;
|
||||
node_index < current_list->size();
|
||||
++node_index) {
|
||||
// Append this choice to the current node
|
||||
HypothesisPrefix* new_node = new HypothesisPrefix(
|
||||
current_list->node(node_index),
|
||||
choice,
|
||||
char_index == char_index_max - 1,
|
||||
dawg);
|
||||
next_list->add_node(new_node);
|
||||
}
|
||||
}
|
||||
// Clear current list and switch lists
|
||||
current_list->clear();
|
||||
HypothesisPrefixList* temp_list = current_list;
|
||||
current_list = next_list;
|
||||
next_list = temp_list;
|
||||
|
||||
// Give up if the current best rating is worse than rating_limit
|
||||
if (current_list->node(0).rating() > rating_limit)
|
||||
return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
|
||||
}
|
||||
const HypothesisPrefix& best_word = current_list->node(0);
|
||||
A_CHOICE* best_choice = new_choice (best_word.word() + 1,
|
||||
best_word.unichar_lengths(),
|
||||
best_word.rating(),
|
||||
best_word.certainty(), -1,
|
||||
valid_word(best_word.word() + 1) ?
|
||||
SYSTEM_DAWG_PERM : TOP_CHOICE_PERM);
|
||||
LogNewWordChoice(best_choice, best_word.is_dawg_prefix() ?
|
||||
1.0 : non_dawg_prefix_rating_adjustment,
|
||||
const_cast<float*>(best_word.certainty_array()));
|
||||
return best_choice;
|
||||
} else {
|
||||
return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
|
||||
}
|
||||
}
|
||||
|
||||
double get_classifier_score_ngram_score_ratio(const char* choice) {
|
||||
if (!strcmp(",", choice) ||
|
||||
!strcmp(".", choice))
|
||||
return 1.0;
|
||||
else
|
||||
return classifier_score_ngram_score_ratio;
|
||||
}
|
||||
|
||||
// Initial HypothesisPrefix constructor used to create the first state of the
|
||||
// search.
|
||||
HypothesisPrefix::HypothesisPrefix() {
|
||||
rating_ = 0;
|
||||
certainty_ = MAXFLOAT;
|
||||
strcpy(word_, " ");
|
||||
unichar_lengths_[0] = '\0';
|
||||
dawg_node_ = 0;
|
||||
is_dawg_prefix_ = true;
|
||||
}
|
||||
|
||||
// Main constructor to create a new HypothesisPrefix by appending a character
|
||||
// choice (A_CHOICE) to an existing HypothesisPrefix. This constructor takes
|
||||
// care of copying the original prefix's data members, appends the character
|
||||
// choice to the word and updates its rating using a character-level n-gram
|
||||
// model. The state in the DAWG is also updated.
|
||||
HypothesisPrefix::HypothesisPrefix(const HypothesisPrefix& prefix,
                                   A_CHOICE* choice,
                                   bool end_of_word,
                                   EDGE_ARRAY dawg) {
  // Parameters:
  //   prefix      - hypothesis being extended; its word, unichar lengths,
  //                 certainties, rating and DAWG state are the starting point.
  //   choice      - classifier choice appended as the next unichar.
  //   end_of_word - true when choice is the last character of the word.
  //   dawg        - dictionary handed to letter_is_okay().
  char* dest = word_;
  const char* src = prefix.word_;

  // The word starts with a leading space character; carry it over first.
  *dest = *src;
  ++dest;
  ++src;

  // Duplicate the prefix one unichar at a time: its bytes, its byte length
  // and its certainty. unichar_lengths_ is terminated by a zero entry.
  int unichar_count = 0;
  while (prefix.unichar_lengths_[unichar_count] != '\0') {
    int bytes_copied = 0;
    while (bytes_copied < prefix.unichar_lengths_[unichar_count]) {
      *dest = *src;
      ++dest;
      ++src;
      ++bytes_copied;
    }
    unichar_lengths_[unichar_count] = prefix.unichar_lengths_[unichar_count];
    certainty_array_[unichar_count] = prefix.certainty_array_[unichar_count];
    ++unichar_count;
  }

  // An empty choice string is represented by a single space.
  const char* appended_text;
  if (*class_string(choice) == '\0')
    appended_text = " ";
  else
    appended_text = class_string(choice);

  // The hypothesis certainty is the lowest certainty seen so far.
  certainty_ = min(prefix.certainty_, class_certainty(choice));

  // Append the choice to the word and record its length and certainty.
  strcpy(dest, appended_text);
  unichar_lengths_[unichar_count] = strlen(appended_text);
  unichar_lengths_[unichar_count + 1] = '\0';
  certainty_array_[unichar_count] = class_certainty(choice);

  // Start from the DAWG state of the prefix.
  dawg_node_ = prefix.dawg_node_;
  is_dawg_prefix_ = prefix.is_dawg_prefix_;

  // While the hypothesis is still a valid dictionary prefix, feed every byte
  // of the appended unichar to the DAWG. dest points at the first appended
  // byte, so (dest - (word_ + 1)) is its index within the string that starts
  // at (word_ + 1), i.e. the word without its leading space.
  const int first_new_byte = dest - (word_ + 1);
  for (int offset = 0;
       is_dawg_prefix_ && appended_text[offset] != '\0';
       ++offset) {
    // Only the very last byte of the last character can end the word.
    const bool closes_word =
        end_of_word && appended_text[offset + 1] == '\0';
    if (!letter_is_okay(dawg, &dawg_node_, first_new_byte + offset, '\0',
                        word_ + 1, closes_word)) {
      // Setting the flag to false also terminates the loop.
      dawg_node_ = NO_EDGE;
      is_dawg_prefix_ = false;
    }
  }

  // Begin with the rating accumulated by the prefix.
  rating_ = prefix.rating_;

  // Probability of the appended character given the prefix.
  double probability = probability_in_context(prefix.word_, -1,
                                              appended_text, -1);

  // The last character of a word also accounts for the space that follows.
  if (end_of_word) {
    probability *= probability_in_context(word_, -1, " ", -1);
  }

  const double classifier_weight =
      get_classifier_score_ngram_score_ratio(appended_text);
  const double classifier_rating = class_probability(choice);
  // Negative base-2 logarithm of the n-gram probability.
  const double ngram_rating = -log(probability) / log(2.0);
  double appended_rating =
      classifier_weight * classifier_rating +
      (1 - classifier_weight) * ngram_rating;

  // A hypothesis that is not a valid DAWG prefix has the rating of the
  // appended character adjusted. If validity was lost with this very
  // character, the rating inherited from the prefix is adjusted as well.
  if (!is_dawg_prefix_) {
    if (prefix.is_dawg_prefix_) {
      rating_ *= non_dawg_prefix_rating_adjustment;
    }
    appended_rating *= non_dawg_prefix_rating_adjustment;
  }

  // Add the rating of the appended character to the total.
  rating_ += appended_rating;
}
|
||||
|
||||
// Create an empty HypothesisPrefixList. Its maximum size is set to the given
|
||||
// bound.
|
||||
HypothesisPrefixList::HypothesisPrefixList(int size_bound):
|
||||
_size_bound(size_bound),
|
||||
_size(0) {
|
||||
_list_nodes = new HypothesisPrefix*[_size_bound];
|
||||
for (int i = 0; i < _size_bound; ++i)
|
||||
_list_nodes[i] = NULL;
|
||||
}
|
||||
|
||||
// Destroy a HypothesisPrefixList all contained nodes are deleted as well.
|
||||
HypothesisPrefixList::~HypothesisPrefixList() {
|
||||
this->clear();
|
||||
delete[] _list_nodes;
|
||||
}
|
||||
|
||||
// Add a node to the HypothesisPrefixList. Maintains the sorted list property.
|
||||
// Note that the HypothesisPrefixList takes ownership of the given node and
|
||||
// might delete it if needed. It must therefore have been allocated on the heap.
|
||||
void HypothesisPrefixList::add_node(HypothesisPrefix* node) {
|
||||
// Detect nodes that have a worst rating that the current maximum and treat
|
||||
// them separately.
|
||||
if (_size > 0 && _list_nodes[_size - 1]->rating() < node->rating()) {
|
||||
if (_size == _size_bound) {
|
||||
// The list is already full. This node will not be added
|
||||
delete node;
|
||||
} else {
|
||||
// The list is not full. Add the node at the last position.
|
||||
_list_nodes[_size] = node;
|
||||
++_size;
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Find the correct position
|
||||
int node_index_target = 0;
|
||||
while (node_index_target < _size_bound &&
|
||||
_list_nodes[node_index_target] != NULL &&
|
||||
_list_nodes[node_index_target]->rating() < node->rating()) {
|
||||
++node_index_target;
|
||||
}
|
||||
if (node_index_target >= _size_bound) {
|
||||
delete node;
|
||||
} else {
|
||||
// Move next states by 1. Starting from the last one.
|
||||
int node_index_move = _size - 1;
|
||||
while (node_index_move >= node_index_target) {
|
||||
if (node_index_move == _size_bound - 1)
|
||||
delete _list_nodes[node_index_move];
|
||||
else
|
||||
_list_nodes[node_index_move + 1] = _list_nodes[node_index_move];
|
||||
_list_nodes[node_index_move] = NULL;
|
||||
--node_index_move;
|
||||
}
|
||||
// Insert new node
|
||||
_list_nodes[node_index_target] = node;
|
||||
// Increment size if it has changed
|
||||
if (_size < _size_bound)
|
||||
++_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Delete all contained nodes and set the size of the HypothesisPrefixList to 0
|
||||
void HypothesisPrefixList::clear() {
|
||||
for (int i = 0; i < _size_bound && _list_nodes[i] != NULL; ++i) {
|
||||
delete _list_nodes[i];
|
||||
_list_nodes[i] = NULL;
|
||||
}
|
||||
_size = 0;
|
||||
}
|
33
dict/permngram.h
Normal file
33
dict/permngram.h
Normal file
@ -0,0 +1,33 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: permngram.h
|
||||
// Description: Character n-gram permuter
|
||||
// Author: Thomas Kielbus
|
||||
// Created: Wed Sep 12 11:26:42 PDT 2007
|
||||
//
|
||||
// (C) Copyright 2007, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef PERMNGRAM_H
|
||||
#define PERMNGRAM_H
|
||||
|
||||
#include "choicearr.h"
|
||||
#include "dawg.h"
|
||||
|
||||
// Permute the given char_choices using a character level n-gram model and
|
||||
// return the best word choice found. The given dawg is used to determine
|
||||
// which choices are contained in the dictionary.
|
||||
A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
|
||||
float rating_limit,
|
||||
EDGE_ARRAY dawg);
|
||||
|
||||
#endif // PERMNGRAM_H
|
@ -437,9 +437,9 @@ void DebugWordChoices() {
|
||||
char LabelString[80];
|
||||
|
||||
if (StopperDebugLevel >= 1 ||
|
||||
WordToDebug && BestChoices &&
|
||||
(WordToDebug && BestChoices &&
|
||||
StringSameAs (WordToDebug, WordToDebug_lengths,
|
||||
(VIABLE_CHOICE) first_node (BestChoices))) {
|
||||
(VIABLE_CHOICE) first_node (BestChoices)))) {
|
||||
if (BestRawChoice)
|
||||
PrintViableChoice (stderr, "\nBest Raw Choice: ", BestRawChoice);
|
||||
|
||||
@ -731,11 +731,12 @@ FLOAT32 AdjustFactor, float Certainties[]) {
|
||||
NewChoice = NULL;
|
||||
Choices = BestChoices;
|
||||
iterate(Choices) {
|
||||
if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first_node (Choices)))
|
||||
if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first_node (Choices))) {
|
||||
if (class_probability (Choice) < BestRating (Choices))
|
||||
NewChoice = (VIABLE_CHOICE) first_node (Choices);
|
||||
else
|
||||
return;
|
||||
else
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (NewChoice) {
|
||||
@ -1087,11 +1088,14 @@ AMBIG_TABLE *FillAmbigTable() {
|
||||
TestString_lengths[0] = 0;
|
||||
ReplacementString[0] = '\0';
|
||||
ReplacementString_lengths[0] = 0;
|
||||
bool illegal_char = false;
|
||||
for (i = 0; i < AmbigPartSize; ++i) {
|
||||
fscanf (AmbigFile, "%s", buffer);
|
||||
strcat(TestString, buffer);
|
||||
lengths[0] = strlen(buffer);
|
||||
strcat(TestString_lengths, lengths);
|
||||
if (!unicharset.contains_unichar(buffer))
|
||||
illegal_char = true;
|
||||
}
|
||||
fscanf (AmbigFile, "%d", &AmbigPartSize);
|
||||
for (i = 0; i < AmbigPartSize; ++i) {
|
||||
@ -1099,11 +1103,16 @@ AMBIG_TABLE *FillAmbigTable() {
|
||||
strcat(ReplacementString, buffer);
|
||||
lengths[0] = strlen(buffer);
|
||||
strcat(ReplacementString_lengths, lengths);
|
||||
if (!unicharset.contains_unichar(buffer))
|
||||
illegal_char = true;
|
||||
}
|
||||
|
||||
if (strlen (TestString_lengths) > MAX_AMBIG_SIZE ||
|
||||
strlen (ReplacementString_lengths) > MAX_AMBIG_SIZE)
|
||||
DoError (0, "Illegal ambiguity specification!");
|
||||
if (illegal_char) {
|
||||
continue;
|
||||
}
|
||||
|
||||
AmbigSpec = (AMBIG_SPEC *) Emalloc (sizeof (AMBIG_SPEC));
|
||||
|
||||
|
@ -187,7 +187,7 @@ void add_word_to_dawg(EDGE_ARRAY dawg,
|
||||
break;
|
||||
}
|
||||
if (edges_in_node (dawg, last_node) + last_node == the_next_node) {
|
||||
cprintf ("Node collision at %d\n", the_next_node);
|
||||
//cprintf ("Node collision at %d\n", the_next_node);
|
||||
the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
|
||||
max_num_edges, reserved_edges);
|
||||
if (the_next_node == 0) {
|
||||
|
Loading…
Reference in New Issue
Block a user