2007-03-08 04:03:40 +08:00
|
|
|
/* -*-C-*-
|
|
|
|
********************************************************************************
|
|
|
|
*
|
|
|
|
* File: trie.c (Formerly trie.c)
|
|
|
|
* Description: Functions to build a trie data structure.
|
|
|
|
* Author: Mark Seaman, OCR Technology
|
|
|
|
* Created: Fri Oct 16 14:37:00 1987
|
|
|
|
* Modified: Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt
|
|
|
|
* Language: C
|
|
|
|
* Package: N/A
|
|
|
|
* Status: Reusable Software Component
|
|
|
|
*
|
|
|
|
* (c) Copyright 1987, Hewlett-Packard Company.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
*
|
|
|
|
*********************************************************************************/
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
I n c l u d e s
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
#include "trie.h"
|
|
|
|
#include "callcpp.h"
|
|
|
|
|
|
|
|
#ifdef __UNIX__
|
|
|
|
#include <assert.h>
|
|
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
V a r i a b l e s
|
|
|
|
----------------------------------------------------------------------*/
|
2008-04-22 08:42:51 +08:00
|
|
|
static inT32 move_counter = 0;
|
|
|
|
static inT32 new_counter = 0;
|
|
|
|
static inT32 edge_counter = 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 max_new_attempts = 0;
|
2007-07-18 09:05:40 +08:00
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
F u n c t i o n s
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
/**********************************************************************
|
|
|
|
* add_edge_linkage
|
|
|
|
*
|
|
|
|
* Add a single edge linkage to between the two nodes. This function
|
|
|
|
* can be used to add either forward or backward links.
|
|
|
|
**********************************************************************/
|
|
|
|
void add_edge_linkage(EDGE_ARRAY dawg,
|
|
|
|
NODE_REF node1,
|
|
|
|
NODE_REF node2,
|
2007-07-18 09:05:40 +08:00
|
|
|
EDGE_RECORD direction,
|
2008-08-15 06:44:46 +08:00
|
|
|
int character,
|
2007-07-18 09:05:40 +08:00
|
|
|
EDGE_RECORD word_end) {
|
2007-03-08 04:03:40 +08:00
|
|
|
EDGE_REF edge1 = node1;
|
|
|
|
EDGE_REF edge2;
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 num_edges = edges_in_node (dawg, node1);
|
2008-08-15 06:44:46 +08:00
|
|
|
inT64 last_one;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
word_end = (word_end ? WERD_END_FLAG : 0);
|
|
|
|
|
|
|
|
if (num_edges == 0) { /* No edges yet */
|
|
|
|
direction = ((direction == FORWARD_EDGE) ? DIRECTION_FLAG : 0);
|
|
|
|
link_edge (dawg, edge1, node2, character,
|
|
|
|
LAST_FLAG | direction | word_end);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* Forward links */
|
|
|
|
if (direction == FORWARD_EDGE) {
|
|
|
|
last_one = (forward_edge (dawg, edge1) ? 0 : LAST_FLAG);
|
|
|
|
if (debug)
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("moving edges (nodes = " REFFORMAT ", " REFFORMAT \
|
|
|
|
", num = %d)\n",
|
2007-03-08 04:03:40 +08:00
|
|
|
edge1, edge1+1, num_edges);
|
|
|
|
copy_edges (dawg, edge1, edge1+1, num_edges);
|
|
|
|
link_edge (dawg, edge1, node2, character,
|
|
|
|
last_one | DIRECTION_FLAG | word_end);
|
|
|
|
}
|
|
|
|
else { /* Backward links */
|
|
|
|
|
|
|
|
if (forward_edge (dawg, edge1))
|
2007-05-16 08:44:44 +08:00
|
|
|
edge_loop(dawg, edge1);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/* Existing back edges */
|
|
|
|
if (backward_edge (dawg,edge1)) {
|
|
|
|
num_edges = 0;
|
|
|
|
edge2 = edge1;
|
|
|
|
do { num_edges++; }
|
2007-05-16 08:44:44 +08:00
|
|
|
edge_loop(dawg, edge2);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (debug)
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("moving edges (nodes = " REFFORMAT ", " REFFORMAT \
|
|
|
|
", num = %d)\n",
|
2007-03-08 04:03:40 +08:00
|
|
|
edge1, edge1+1, num_edges);
|
|
|
|
copy_edges (dawg, edge1, edge1+1, num_edges);
|
2007-05-16 08:44:44 +08:00
|
|
|
link_edge(dawg, edge1, node2, character, word_end);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
else { /* First back edge */
|
|
|
|
link_edge (dawg, edge1, node2, character,
|
|
|
|
LAST_FLAG | word_end);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* add_new_edge
|
|
|
|
*
|
|
|
|
* Add an edge between two nodes in the DAWG. Link the nodes both ways.
|
|
|
|
**********************************************************************/
|
2007-08-31 02:16:00 +08:00
|
|
|
bool add_new_edge(EDGE_ARRAY dawg,
|
2007-03-08 04:03:40 +08:00
|
|
|
NODE_REF *node1,
|
|
|
|
NODE_REF *node2,
|
2008-08-15 06:44:46 +08:00
|
|
|
int character,
|
2007-07-18 09:05:40 +08:00
|
|
|
EDGE_RECORD word_end,
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 max_num_edges,
|
|
|
|
inT32 reserved_edges) {
|
2007-03-08 04:03:40 +08:00
|
|
|
int direction;
|
|
|
|
|
|
|
|
if (debug)
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("add_new_edge (nodes = " REFFORMAT ", " REFFORMAT \
|
|
|
|
", ch = '%c', end = %d)\n",
|
2007-03-08 04:03:40 +08:00
|
|
|
*node1, *node2, character, word_end);
|
|
|
|
edge_counter++;
|
|
|
|
|
2007-08-31 02:16:00 +08:00
|
|
|
if (!move_node_if_needed(dawg, node1, max_num_edges, reserved_edges))
|
|
|
|
return false;
|
|
|
|
if (!move_node_if_needed(dawg, node2, max_num_edges, reserved_edges))
|
|
|
|
return false;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
direction = (int) FORWARD_EDGE;
|
2007-05-16 08:44:44 +08:00
|
|
|
add_edge_linkage(dawg, *node1, *node2, direction, character, word_end);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
direction = (int) BACKWARD_EDGE;
|
2007-05-16 08:44:44 +08:00
|
|
|
add_edge_linkage(dawg, *node2, *node1, direction, character, word_end);
|
2007-08-31 02:16:00 +08:00
|
|
|
return true;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2008-08-21 01:47:05 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* add_word_ending
|
|
|
|
*
|
|
|
|
* Set the word ending flags in an already existing edge pair.
|
|
|
|
* Return true on success.
|
|
|
|
**********************************************************************/
|
|
|
|
bool add_word_ending(EDGE_ARRAY dawg,
|
|
|
|
EDGE_REF edge,
|
|
|
|
NODE_REF the_next_node,
|
|
|
|
int ch) {
|
|
|
|
EDGE_REF other_edge = the_next_node;
|
|
|
|
// Find the backward link from the_next_node back with ch.
|
|
|
|
if (forward_edge(dawg, other_edge))
|
|
|
|
edge_loop(dawg, other_edge);
|
|
|
|
if (backward_edge(dawg, other_edge)) {
|
|
|
|
other_edge = edge_char_of(dawg, other_edge, ch, false);
|
|
|
|
if (other_edge != NO_EDGE) {
|
|
|
|
// Mark both directions as end of word.
|
|
|
|
dawg[edge] |= (WERD_END_FLAG << FLAG_START_BIT);
|
|
|
|
dawg[other_edge] |= (WERD_END_FLAG << FLAG_START_BIT);
|
|
|
|
return true; // Success.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false; // Failed.
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* add_word_to_dawg
|
|
|
|
*
|
|
|
|
* Add in a word by creating the necessary nodes and edges.
|
|
|
|
**********************************************************************/
|
|
|
|
void add_word_to_dawg(EDGE_ARRAY dawg,
|
2008-08-15 06:44:46 +08:00
|
|
|
const char *string,
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 max_num_edges,
|
|
|
|
inT32 reserved_edges) {
|
2007-03-08 04:03:40 +08:00
|
|
|
EDGE_REF edge;
|
|
|
|
NODE_REF last_node = 0;
|
|
|
|
NODE_REF the_next_node;
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 i;
|
|
|
|
inT32 still_finding_chars = TRUE;
|
|
|
|
inT32 word_end = FALSE;
|
2007-08-31 02:16:00 +08:00
|
|
|
bool add_failed = false;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-07-18 09:05:40 +08:00
|
|
|
if (debug) cprintf("Adding word %s\n", string);
|
2007-03-08 04:03:40 +08:00
|
|
|
for (i=0; i<strlen(string)-1; i++) {
|
2008-08-15 06:44:46 +08:00
|
|
|
unsigned char ch = case_sensative ? string[i] : tolower(string[i]);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (still_finding_chars) {
|
2008-08-15 06:44:46 +08:00
|
|
|
edge = edge_char_of(dawg, last_node, ch, word_end);
|
2007-07-18 09:05:40 +08:00
|
|
|
if (debug) cprintf ("exploring edge = " REFFORMAT "\n", edge);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (edge == NO_EDGE)
|
|
|
|
still_finding_chars = FALSE;
|
|
|
|
else
|
|
|
|
if (next_node (dawg, edge) == 0) {
|
|
|
|
word_end = TRUE;
|
|
|
|
still_finding_chars = FALSE;
|
2008-08-15 06:44:46 +08:00
|
|
|
remove_edge (dawg, last_node, 0, ch, word_end);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
last_node = next_node (dawg, edge);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (! still_finding_chars) {
|
2007-07-18 09:05:40 +08:00
|
|
|
the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
|
|
|
|
max_num_edges, reserved_edges);
|
2007-08-31 02:16:00 +08:00
|
|
|
if (the_next_node == 0) {
|
|
|
|
add_failed = true;
|
|
|
|
break;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
if (edges_in_node (dawg, last_node) + last_node == the_next_node) {
|
2008-02-01 08:21:49 +08:00
|
|
|
//cprintf ("Node collision at %d\n", the_next_node);
|
2007-07-18 09:05:40 +08:00
|
|
|
the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
|
|
|
|
max_num_edges, reserved_edges);
|
2007-08-31 02:16:00 +08:00
|
|
|
if (the_next_node == 0) {
|
|
|
|
add_failed = true;
|
|
|
|
break;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2008-08-15 06:44:46 +08:00
|
|
|
if (!add_new_edge (dawg, &last_node, &the_next_node, ch,
|
|
|
|
word_end, max_num_edges, reserved_edges)) {
|
2007-08-31 02:16:00 +08:00
|
|
|
add_failed = true;
|
|
|
|
break;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
word_end = FALSE;
|
|
|
|
if (debug)
|
|
|
|
cprintf ("adding node = %ld\n", the_next_node);
|
|
|
|
last_node = the_next_node;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
the_next_node = 0;
|
2008-08-15 06:44:46 +08:00
|
|
|
unsigned char ch = case_sensative ? string[i] : tolower(string[i]);
|
2008-08-21 01:47:05 +08:00
|
|
|
if (still_finding_chars &&
|
|
|
|
(edge = edge_char_of(dawg, last_node, ch, false))!= NO_EDGE &&
|
|
|
|
(the_next_node = next_node(dawg, edge)) != 0) {
|
|
|
|
// An extension of this word already exists in the trie, so we
|
|
|
|
// only have to add the ending flags in both directions.
|
|
|
|
if (!add_word_ending(dawg, edge, the_next_node, ch))
|
|
|
|
cprintf("Unable to find backward edge for subword ending! %s\n", string);
|
|
|
|
} else {
|
|
|
|
if (!add_failed &&
|
|
|
|
!add_new_edge(dawg, &last_node, &the_next_node, ch,
|
|
|
|
TRUE, max_num_edges, reserved_edges))
|
|
|
|
add_failed = true;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (edges_in_node (dawg, 0) > reserved_edges) {
|
|
|
|
cprintf ("error: Not enough room in root node, %d\n",
|
|
|
|
edges_in_node (dawg, 0));
|
2007-08-31 02:16:00 +08:00
|
|
|
add_failed = true;
|
|
|
|
}
|
|
|
|
if (add_failed) {
|
|
|
|
cprintf ("Re-initializing document dictionary...\n");
|
|
|
|
initialize_dawg(dawg,max_num_edges);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* initialize_dawg
|
|
|
|
*
|
|
|
|
* Initialize the DAWG data structure for further used. Reset each of
|
|
|
|
* the edge cells to NO_EDGE.
|
|
|
|
**********************************************************************/
|
2008-04-22 08:42:51 +08:00
|
|
|
void initialize_dawg(EDGE_ARRAY dawg, inT32 max_num_edges) {
|
|
|
|
inT32 x;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-05-16 08:44:44 +08:00
|
|
|
|
|
|
|
//changed by jetsoft
|
|
|
|
// these values were not getting changes.
|
|
|
|
|
|
|
|
move_counter = 0;
|
|
|
|
new_counter = 0;
|
|
|
|
edge_counter = 0;
|
|
|
|
// end jetsoft
|
|
|
|
|
|
|
|
clear_all_edges(dawg, x, max_num_edges);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
2007-08-31 02:16:00 +08:00
|
|
|
* move_node_if_needed
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
2007-08-31 02:16:00 +08:00
|
|
|
* Move the node location if there is a need to do it because there is
|
|
|
|
* not enough room to add the new edges.
|
2007-03-08 04:03:40 +08:00
|
|
|
**********************************************************************/
|
2007-08-31 02:16:00 +08:00
|
|
|
bool move_node_if_needed(EDGE_ARRAY dawg,
|
|
|
|
NODE_REF* node,
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 max_num_edges,
|
|
|
|
inT32 reserved_edges) {
|
2007-08-31 02:16:00 +08:00
|
|
|
if (room_in_node(dawg, *node))
|
|
|
|
return true;
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
NODE_REF this_new_node;
|
|
|
|
EDGE_REF edge;
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 num_edges = edges_in_node (dawg, *node);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (debug)
|
2007-08-31 02:16:00 +08:00
|
|
|
print_dawg_node(dawg, *node);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-07-18 09:05:40 +08:00
|
|
|
this_new_node = new_dawg_node (dawg, num_edges + EDGE_NUM_MARGIN,
|
|
|
|
max_num_edges, reserved_edges);
|
2007-08-31 02:16:00 +08:00
|
|
|
if (this_new_node == 0)
|
|
|
|
return false;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (debug)
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("move_node (from = " REFFORMAT ", to = " REFFORMAT
|
|
|
|
", num = %d)\n",
|
2007-08-31 02:16:00 +08:00
|
|
|
*node, this_new_node, num_edges);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-08-31 02:16:00 +08:00
|
|
|
move_edges(dawg, *node, this_new_node, num_edges);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (debug)
|
2007-05-16 08:44:44 +08:00
|
|
|
print_dawg_node(dawg, this_new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
edge = this_new_node;
|
|
|
|
if (forward_edge (dawg, edge)) {
|
|
|
|
do {
|
2007-08-31 02:16:00 +08:00
|
|
|
relocate_edge (dawg, next_node (dawg, edge), *node, this_new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
} edge_loop (dawg, edge);
|
|
|
|
}
|
|
|
|
if (backward_edge (dawg, edge)) {
|
|
|
|
do {
|
2007-08-31 02:16:00 +08:00
|
|
|
relocate_edge (dawg, next_node (dawg, edge), *node, this_new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
} edge_loop (dawg, edge);
|
|
|
|
}
|
|
|
|
|
|
|
|
move_counter++;
|
2007-08-31 02:16:00 +08:00
|
|
|
*node = this_new_node;
|
|
|
|
return true;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* new_dawg_node
|
|
|
|
*
|
|
|
|
* Create a space within the DAWG data structure to house a node that
|
|
|
|
* consists of the requested number of edges.
|
|
|
|
**********************************************************************/
|
|
|
|
NODE_REF new_dawg_node(EDGE_ARRAY dawg,
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 num_edges,
|
|
|
|
inT32 max_num_edges,
|
|
|
|
inT32 reserved_edges) {
|
|
|
|
inT32 i;
|
|
|
|
inT32 n;
|
|
|
|
inT32 edge_index;
|
|
|
|
inT32 edge_collision;
|
2007-07-18 09:05:40 +08:00
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
/* Try several times */
|
|
|
|
for (i=0; i<NUM_PLACEMENT_ATTEMPTS; i++) {
|
|
|
|
|
|
|
|
edge_index = reserved_edges +
|
2007-07-18 09:05:40 +08:00
|
|
|
long_rand (max_num_edges - MAX_WERD_LENGTH - reserved_edges);
|
2007-03-08 04:03:40 +08:00
|
|
|
edge_collision = FALSE;
|
|
|
|
/* Find N adjacent slots */
|
|
|
|
for (n=0; n<num_edges && !edge_collision; n++) {
|
|
|
|
if (edge_occupied (dawg, edge_index + n))
|
|
|
|
edge_collision = TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (! edge_collision) {
|
|
|
|
new_counter++;
|
|
|
|
//? max_new_attempts = max (max_new_attempts, i);
|
|
|
|
return (edge_index);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("DAWG Table is too full, nodes = %d, edges = %d, moves %d\n",
|
2007-08-31 02:16:00 +08:00
|
|
|
new_counter, edge_counter, move_counter);
|
2007-03-08 04:03:40 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-07-18 09:05:40 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* print_dawg_map
|
|
|
|
*
|
|
|
|
* Print the contents of one of the nodes in the DAWG.
|
|
|
|
**********************************************************************/
|
2008-04-22 08:42:51 +08:00
|
|
|
void print_dawg_map (EDGE_ARRAY dawg, inT32 max_num_edges) {
|
2007-07-18 09:05:40 +08:00
|
|
|
EDGE_REF edge = 0;
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 counter = 0;
|
2007-07-18 09:05:40 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
if (edge_occupied (dawg, edge))
|
|
|
|
printf (".");
|
|
|
|
else
|
|
|
|
printf (" ");
|
|
|
|
if (counter % 100 == 0) fflush (stdout);
|
|
|
|
} while (edge++ < max_num_edges);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* read_full_dawg
|
|
|
|
*
|
|
|
|
* Read this DAWG in from a file. The nodes in this DAWG contain both
|
|
|
|
* forward and backward edges. There are gaps in between the nodes.
|
|
|
|
**********************************************************************/
|
|
|
|
void read_full_dawg (const char *filename,
|
|
|
|
EDGE_ARRAY dawg,
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 max_num_edges) {
|
2007-07-18 09:05:40 +08:00
|
|
|
FILE *file;
|
|
|
|
EDGE_REF node_index;
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 num_edges;
|
|
|
|
inT32 node_count;
|
|
|
|
inT32 error_occured = FALSE;
|
2007-07-18 09:05:40 +08:00
|
|
|
|
|
|
|
if (debug) print_string ("read_dawg");
|
|
|
|
|
|
|
|
clear_all_edges (dawg, node_index, max_num_edges);
|
|
|
|
|
|
|
|
file = open_file (filename, "rb");
|
|
|
|
|
2008-04-22 08:42:51 +08:00
|
|
|
fread (&node_count, sizeof (inT32), 1, file);
|
2007-07-18 09:05:40 +08:00
|
|
|
|
|
|
|
while (node_count-- > 0) {
|
|
|
|
|
|
|
|
fread (&node_index, sizeof (EDGE_REF), 1, file);
|
2008-04-22 08:42:51 +08:00
|
|
|
fread (&num_edges, sizeof (inT32), 1, file);
|
2007-07-18 09:05:40 +08:00
|
|
|
|
|
|
|
assert (node_index + num_edges < max_num_edges);
|
|
|
|
fread (&dawg[node_index], sizeof (EDGE_RECORD), num_edges, file);
|
|
|
|
|
|
|
|
if (debug > 2) {
|
|
|
|
print_dawg_node (dawg, node_index);
|
|
|
|
new_line ();
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
fclose (file);
|
|
|
|
if (error_occured) exit (1);
|
|
|
|
}
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* read_word_list
|
|
|
|
*
|
|
|
|
* Read the requested file (containing a list of words) and add all
|
|
|
|
* the words to the DAWG.
|
|
|
|
**********************************************************************/
|
2007-05-16 08:44:44 +08:00
|
|
|
void read_word_list(const char *filename,
|
2007-03-08 04:03:40 +08:00
|
|
|
EDGE_ARRAY dawg,
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 max_num_edges,
|
|
|
|
inT32 reserved_edges) {
|
2007-07-18 09:05:40 +08:00
|
|
|
FILE *word_file;
|
|
|
|
char string [CHARS_PER_LINE];
|
|
|
|
int word_count = 0;
|
|
|
|
int old_debug = debug;
|
|
|
|
if (debug > 0 && debug < 3)
|
|
|
|
debug = 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
word_file = open_file (filename, "r");
|
|
|
|
|
2007-05-16 08:44:44 +08:00
|
|
|
initialize_dawg(dawg, max_num_edges);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
|
|
|
|
string [strlen (string) - 1] = (char) 0;
|
2007-07-18 09:05:40 +08:00
|
|
|
++word_count;
|
|
|
|
if (debug && word_count % 10000 == 0)
|
|
|
|
cprintf("Read %d words so far\n", word_count);
|
|
|
|
if (string[0] != '\0' /* strlen (string) */) {
|
2008-08-21 01:47:05 +08:00
|
|
|
if (!word_in_dawg(dawg, string)) {
|
|
|
|
add_word_to_dawg(dawg, string, max_num_edges, reserved_edges);
|
|
|
|
if (!word_in_dawg(dawg, string)) {
|
|
|
|
cprintf("error: word not in DAWG after adding it '%s'\n", string);
|
|
|
|
return;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2007-07-18 09:05:40 +08:00
|
|
|
debug = old_debug;
|
|
|
|
if (debug)
|
|
|
|
cprintf("Read %d words total.\n", word_count);
|
2007-05-16 08:44:44 +08:00
|
|
|
fclose(word_file);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* relocate_edge
|
|
|
|
*
|
|
|
|
* The location of a node has moved, so an edge entry in another node
|
|
|
|
* must be changed. Change the value of this edge entry to match the
|
|
|
|
* new location of the node.
|
|
|
|
**********************************************************************/
|
|
|
|
void relocate_edge(EDGE_ARRAY dawg,
|
|
|
|
NODE_REF node,
|
|
|
|
NODE_REF old_node,
|
|
|
|
NODE_REF new_node) {
|
|
|
|
EDGE_REF edge;
|
|
|
|
|
2007-07-18 09:05:40 +08:00
|
|
|
if (debug) cprintf ("relocate (" REFFORMAT ", " REFFORMAT " ==> " \
|
|
|
|
REFFORMAT ")\n", node, old_node, new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
edge = node;
|
|
|
|
if (forward_edge (dawg, edge)) {
|
|
|
|
do {
|
|
|
|
if (next_node (dawg, edge) == old_node) {
|
|
|
|
if (debug)
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("forward assign (" REFFORMAT ", " REFFORMAT " ==> " \
|
|
|
|
REFFORMAT ")\n", edge, old_node, new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-05-16 08:44:44 +08:00
|
|
|
set_next_edge(dawg, edge, new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
} edge_loop (dawg, edge);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (backward_edge (dawg, edge)) {
|
|
|
|
do {
|
|
|
|
if (next_node (dawg, edge) == old_node) {
|
|
|
|
if (debug)
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("backward assign (" REFFORMAT ", " REFFORMAT " ==> " \
|
|
|
|
REFFORMAT ")\n", edge, old_node, new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-05-16 08:44:44 +08:00
|
|
|
set_next_edge(dawg, edge, new_node);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
2007-05-16 08:44:44 +08:00
|
|
|
edge_loop(dawg, edge);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* remove_edge
|
|
|
|
*
|
|
|
|
* Add a single edge linkage to between the two nodes. This function
|
|
|
|
* can be used to add either forward or backward links.
|
|
|
|
**********************************************************************/
|
|
|
|
void remove_edge(EDGE_ARRAY dawg,
|
|
|
|
NODE_REF node1,
|
|
|
|
NODE_REF node2,
|
2008-08-15 06:44:46 +08:00
|
|
|
int character,
|
2007-07-18 09:05:40 +08:00
|
|
|
EDGE_RECORD word_end) {
|
2007-05-16 08:44:44 +08:00
|
|
|
remove_edge_linkage(dawg, node1, node2, FORWARD_EDGE, character, word_end);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-05-16 08:44:44 +08:00
|
|
|
remove_edge_linkage(dawg, node2, node1, BACKWARD_EDGE, character, word_end);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* remove_edge_linkage
|
|
|
|
*
|
|
|
|
* Remove this edge reference from this node. Move the edge entries in
|
|
|
|
* this node to fill the gap.
|
|
|
|
**********************************************************************/
|
|
|
|
void remove_edge_linkage(EDGE_ARRAY dawg,
|
|
|
|
NODE_REF node,
|
|
|
|
NODE_REF next,
|
2007-07-18 09:05:40 +08:00
|
|
|
EDGE_RECORD direction,
|
2008-08-15 06:44:46 +08:00
|
|
|
int character,
|
2007-07-18 09:05:40 +08:00
|
|
|
EDGE_RECORD word_end) {
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 forward_edges;
|
|
|
|
inT32 num_edges;
|
2007-07-18 09:05:40 +08:00
|
|
|
NODE_REF e = node;
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 last_flag;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
forward_edges = num_forward_edges (dawg, node);
|
|
|
|
num_edges = edges_in_node (dawg, node);
|
|
|
|
|
|
|
|
for (e=node; e<node+num_edges; e++) {
|
|
|
|
/* Is this the edge*/
|
|
|
|
if ((edge_letter (dawg, e) == character) &&
|
|
|
|
((direction == FORWARD_EDGE) ?
|
|
|
|
forward_edge(dawg,e) : backward_edge(dawg,e)) &&
|
|
|
|
(next_node (dawg, e) == next) &&
|
|
|
|
(word_end ? end_of_word(dawg,e) : (! end_of_word(dawg,e)))) {
|
|
|
|
|
|
|
|
if (debug)
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("remove (edge = " REFFORMAT ", character is '%c')\n",
|
2007-03-08 04:03:40 +08:00
|
|
|
e, edge_letter (dawg, e));
|
|
|
|
|
|
|
|
/* Delete the slot */
|
|
|
|
last_flag = last_edge (dawg, e);
|
2007-05-16 08:44:44 +08:00
|
|
|
set_empty_edge(dawg, e);
|
2007-03-08 04:03:40 +08:00
|
|
|
move_edges (dawg, e+1, e, num_edges+node-e-1);
|
|
|
|
/* Restore 'last' flag */
|
|
|
|
if (direction == FORWARD_EDGE) {
|
|
|
|
if ((forward_edges - 1) &&
|
|
|
|
(forward_edges - 1 == e - node)) {
|
|
|
|
set_last_flag (dawg, e - 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if ((num_edges - forward_edges - 1) &&
|
|
|
|
(num_edges - 1 == e - node)) {
|
|
|
|
set_last_flag (dawg, e - 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (debug)
|
2007-05-16 08:44:44 +08:00
|
|
|
print_dawg_node(dawg, node);
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2007-07-18 09:05:40 +08:00
|
|
|
cprintf ("error: Could not find the edge to remove, %d char = '%c'\n",
|
|
|
|
next, character);
|
2007-05-16 08:44:44 +08:00
|
|
|
print_dawg_node(dawg, node);
|
2007-03-08 04:03:40 +08:00
|
|
|
exit (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* room_in_node
|
|
|
|
*
|
|
|
|
* Check to see if there is enough room left in this node for one more
|
|
|
|
* edge link. This may be a forward or backward link.
|
|
|
|
**********************************************************************/
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 room_in_node(EDGE_ARRAY dawg, NODE_REF node) {
|
2007-03-08 04:03:40 +08:00
|
|
|
EDGE_REF edge = node;
|
|
|
|
|
|
|
|
if (edge_occupied (dawg, edge + edges_in_node (dawg, node))) {
|
|
|
|
return (FALSE);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return (TRUE);
|
|
|
|
}
|
|
|
|
}
|
2007-07-18 09:05:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* write_full_dawg
|
|
|
|
*
|
|
|
|
* Write the DAWG out to a file
|
|
|
|
**********************************************************************/
|
|
|
|
void write_full_dawg (const char *filename, EDGE_ARRAY dawg,
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 max_num_edges) {
|
2007-07-18 09:05:40 +08:00
|
|
|
FILE *file;
|
|
|
|
EDGE_REF edge;
|
2008-04-22 08:42:51 +08:00
|
|
|
inT32 num_edges;
|
|
|
|
inT32 node_count = 0;
|
2007-07-18 09:05:40 +08:00
|
|
|
NODE_REF node;
|
|
|
|
|
|
|
|
if (debug) print_string ("write_full_dawg");
|
|
|
|
|
|
|
|
for (edge=0; edge<max_num_edges; edge++) {
|
|
|
|
/* Count nodes links */
|
|
|
|
if (forward_edge (dawg, edge)) {
|
|
|
|
node_count++;
|
|
|
|
if (forward_edge (dawg, edge)) edge_loop (dawg, edge);
|
|
|
|
if (backward_edge (dawg, edge)) edge_loop (dawg, edge);
|
|
|
|
edge--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
file = open_file (filename, "wb");
|
2008-04-22 08:42:51 +08:00
|
|
|
fwrite (&node_count, sizeof (inT32), 1, file);
|
2007-07-18 09:05:40 +08:00
|
|
|
|
|
|
|
node_count = 0;
|
|
|
|
for (edge=0; edge<max_num_edges; edge++) {
|
|
|
|
/* Write all edges */
|
|
|
|
if (forward_edge (dawg, edge)) {
|
|
|
|
node = edge;
|
|
|
|
num_edges = edges_in_node (dawg, edge);
|
|
|
|
|
|
|
|
assert ((node + num_edges < max_num_edges) && (num_edges > 0));
|
|
|
|
|
|
|
|
fwrite (&edge, sizeof (EDGE_REF), 1, file);
|
2008-04-22 08:42:51 +08:00
|
|
|
fwrite (&num_edges, sizeof (inT32), 1, file);
|
2007-07-18 09:05:40 +08:00
|
|
|
fwrite (&edge_of (dawg,edge), sizeof (EDGE_RECORD), num_edges, file);
|
|
|
|
|
|
|
|
node_count++;
|
|
|
|
|
|
|
|
if (debug) {
|
|
|
|
printf ("Writing node index=" REFFORMAT
|
|
|
|
", node_count=%d, edges=%d\n",
|
|
|
|
node, node_count, num_edges);
|
|
|
|
|
|
|
|
for (edge = node; edge < node + num_edges; edge++) {
|
|
|
|
printf ("Writing index=" REFFORMAT ", old_node=" REFFORMAT \
|
|
|
|
", letter=%c, flags=%d\n",
|
|
|
|
node, next_node (dawg,edge),
|
|
|
|
(char) edge_letter(dawg,edge),
|
|
|
|
(char) edge_flags (dawg,edge));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
edge = node + num_edges - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fclose (file);
|
|
|
|
}
|