/* -*-C-*- ******************************************************************************** * * File: dawg.c (Formerly dawg.c) * Description: Use a Directed Accyclic Word Graph * Author: Mark Seaman, OCR Technology * Created: Fri Oct 16 14:37:00 1987 * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt * Language: C * Package: N/A * Status: Reusable Software Component * * (c) Copyright 1987, Hewlett-Packard Company. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * *********************************************************************************/ /*---------------------------------------------------------------------- I n c l u d e s ----------------------------------------------------------------------*/ #ifdef __MSW32__ #include #else #include #endif #include "dawg.h" #include "cutil.h" #include "tprintf.h" #include "freelist.h" #include "context.h" #include "strngs.h" #include "emalloc.h" /*---------------------------------------------------------------------- V a r i a b l e s ----------------------------------------------------------------------*/ INT32 debug = 0; INT32 case_sensative = 1; /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ /********************************************************************** * edge_char_of * * Return the edge that corresponds to the letter out of this node. **********************************************************************/ EDGE_REF edge_char_of(EDGE_ARRAY dawg, NODE_REF node, int character, int word_end) { EDGE_REF edge = node; if (! case_sensative) character = tolower (character); if (edge_occupied (dawg, edge)) { do { if ((edge_letter (dawg, edge) == character) && (! word_end || end_of_word(dawg,edge))) return (edge); } edge_loop (dawg, edge); } return (NO_EDGE); } /********************************************************************** * edges_in_node * * Count the number of edges in this node in the DAWG. This includes * both forward and back links. **********************************************************************/ INT32 edges_in_node(EDGE_ARRAY dawg, NODE_REF node) { EDGE_REF edge = node; if (edge_occupied (dawg, edge)) { edge_loop(dawg, edge); if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { edge_loop(dawg, edge); return (edge - node); } else { return (edge - node); } } else { return (edge - node); } } /* * Initialize letter_is_okay to point to default implmentation (a main * program can override this). */ LETTER_OK_FUNC letter_is_okay = &def_letter_is_okay; /********************************************************************** * def_letter_is_okay * * Default way to check this letter in light of the current state. If * everything is still OK then return TRUE. **********************************************************************/ // TODO(tkielbus) Change the prevchar argument to make it unicode safe. // We might want to get rid of def_letter_is_okay at some point though. INT32 def_letter_is_okay(EDGE_ARRAY dawg, NODE_REF *node, INT32 char_index, char prevchar, const char *word, INT32 word_end) { EDGE_REF edge; STRING dummy_word(word); // Auto-deleting string fixes memory leak. STRING word_single_lengths; //Lengths of single UTF-8 characters of the word. const char *ptr; for (ptr = word; *ptr != '\0';) { int step = UNICHAR::utf8_step(ptr); if (step == 0) return FALSE; word_single_lengths += step; ptr += step; } if (*node == NO_EDGE) { /* Trailing punctuation */ if (trailing_punc (dummy_word [char_index]) && (!trailing_punc (prevchar) || punctuation_ok(dummy_word.string(), word_single_lengths.string())>=0)) return (TRUE); else return (FALSE); } else { /* Leading punctuation */ if (*node == 0 && char_index != 0 && // TODO(tkielbus) Replace islalpha by unicode versions. // However the lengths information is not available at this point in the // code. We will probably get rid of the dictionaries at some point anyway. isalpha (dummy_word [char_index]) && ! leading_punc (dummy_word [char_index-1]) && dummy_word [char_index-1] != '-') { return (FALSE); } } /* Handle compund words */ #if 0 if (dummy_word [char_index] == '-') { if (char_index>0 && !word_end && word [char_index-1] == '-' && word [char_index+1] == '-') return FALSE; /*not allowed*/ dummy_word [char_index] = (char) 0; if (word_in_dawg (dawg, dummy_word.string())) { dummy_word [char_index] = '-'; *node = 0; return (TRUE); } else { dummy_word [char_index] = '-'; return (FALSE); } } #endif /* Check the DAWG */ edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end); if (edge != NO_EDGE) { /* Normal edge in DAWG */ if (case_sensative || case_is_okay (dummy_word, char_index)) { //next_node (dawg, edge); *node = next_node(dawg, edge); if (*node == 0) *node = NO_EDGE; return (TRUE); } else { return (FALSE); } } else { /* Leading punctuation */ if (leading_punc (word [char_index]) && (char_index == 0 || leading_punc (dummy_word [char_index-1]))) { *node = 0; if (leading_punc (prevchar) || punctuation_ok (word, word_single_lengths.string())>=0) return (TRUE); else return FALSE; } /* Trailing punctuation */ if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) { *node = NO_EDGE; return (TRUE); } return (FALSE); } } /********************************************************************** * num_forward_edges * * Count and return the number of forward edges for this node. **********************************************************************/ INT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node) { EDGE_REF edge = node; INT32 num = 0; if (forward_edge (dawg, edge)) { do { num++; } edge_loop (dawg, edge); } return (num); } /********************************************************************** * print_dawg_node * * Print the contents of one of the nodes in the DAWG. **********************************************************************/ void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node) { EDGE_REF edge = node; const char *forward_string = "FORWARD"; const char *backward_string = " "; const char *last_string = "LAST"; const char *not_last_string = " "; const char *eow_string = "EOW"; const char *not_eow_string = " "; const char *direction; const char *is_last; const char *eow; char ch; if (edge_occupied (dawg, edge)) { do { if (forward_edge (dawg, edge)) direction = forward_string; else direction = backward_string; if (last_edge (dawg, edge)) is_last = last_string; else is_last = not_last_string; if (end_of_word (dawg, edge)) eow = eow_string; else eow = not_eow_string; ch = edge_letter (dawg, edge); cprintf (REFFORMAT " : next = " REFFORMAT ", char = '%c', %s %s %s\n", edge, next_node (dawg, edge), ch, direction, is_last, eow); if (edge - node > MAX_NODE_EDGES_DISPLAY) return; } edge_loop (dawg, edge); if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) { do { if (forward_edge (dawg, edge)) direction = forward_string; else direction = backward_string; if (last_edge (dawg, edge)) is_last = last_string; else is_last = not_last_string; if (end_of_word (dawg, edge)) eow = eow_string; else eow = not_eow_string; ch = edge_letter (dawg, edge); cprintf (REFFORMAT " : next = " REFFORMAT ", char = '%c', %s %s %s\n", edge, next_node (dawg, edge), ch, direction, is_last, eow); if (edge - node > MAX_NODE_EDGES_DISPLAY) return; } edge_loop (dawg, edge); } } else { cprintf (REFFORMAT " : no edges in this node\n", node); } cprintf("\n"); } /********************************************************************** * read_squished_dawg * * Read the DAWG from a file and return it. Must be freed with memfree. **********************************************************************/ EDGE_ARRAY read_squished_dawg(const char *filename) { FILE *file; EDGE_REF edge; INT32 num_edges = 0; INT32 node_count = 0; if (debug) print_string ("read_debug"); #ifdef __UNIX__ file = open_file (filename, "r"); #else file = open_file (filename, "rb"); #endif fread (&num_edges, sizeof (INT32), 1, file); num_edges = ntohl(num_edges); if (num_edges > MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE || num_edges < 0) { tprintf("(ENDIAN)Error: trying to read a DAWG '%s' that contains " "%d edges while the maximum is %d.\n", filename, num_edges, MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE); exit(1); } UINT32 *dawg_32 = (UINT32*) Emalloc(num_edges * sizeof (UINT32)); fread(&dawg_32[0], sizeof (UINT32), num_edges, file); fclose(file); EDGE_ARRAY dawg = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges); for (edge = 0; edge < num_edges; ++edge) dawg[edge] = ntohl(dawg_32[edge]); Efree(dawg_32); for (edge = 0; edge < num_edges; ++edge) if (last_edge (dawg, edge)) node_count++; return dawg; } /********************************************************************** * verify_trailing_punct * * Make sure that there is a valid transition from the word core to a * string of trailing puntuation. TRUE is returned if everything is * OK. **********************************************************************/ INT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, INT32 char_index) { char last_char; char *first_char; if (trailing_punc (word [char_index])) { last_char = word [char_index]; word [char_index] = (char) 0; for (first_char = word; leading_punc (first_char[0]); first_char++); if (word_in_dawg (dawg, first_char)) { word [char_index] = last_char; return (TRUE); } word [char_index] = last_char; } return (FALSE); } /********************************************************************** * word_in_dawg * * Test to see if the word can be found in the DAWG. **********************************************************************/ INT32 word_in_dawg(EDGE_ARRAY dawg, const char *string) { NODE_REF node = 0; INT32 i; INT32 length; length=strlen(string); if (length==0) return FALSE; for (i=0; i 1) { print_dawg_node(dawg, node); new_line(); } if (! letter_is_okay (dawg, &node, i, '\0', string, (string[i+1]==0))) { return (FALSE); } } return (TRUE); }