/* -*-C-*- ******************************************************************************** * * File: lookdawg.cpp * Description: Look up words in a Directed Accyclic Word Graph * Author: Mark Seaman, OCR Technology * Created: Fri Oct 16 14:37:00 1987 * Modified: Thu Jul 25 17:09:55 1991 (Mark Seaman) marks@hpgrlt * Language: C * Package: N/A * Status: Reusable Software Component * * (c) Copyright 1987, Hewlett-Packard Company, all rights reserved. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * ******************************************************************************** */ /* ---------------------------------------------------------------------- I n c l u d e s ---------------------------------------------------------------------- */ #include "lookdawg.h" #include "cutil.h" #include "trie.h" #ifdef __UNIX__ #include #endif /* ---------------------------------------------------------------------- V a r i a b l e s ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- F u n c t i o n s ---------------------------------------------------------------------- */ /********************************************************************** * check_for_words * * Check the DAWG for the words that are listed in the requested file. * A file name of NULL will cause the words to be read from stdin. **********************************************************************/ void check_for_words (EDGE_ARRAY dawg, char *filename) { FILE *word_file; char string [CHARS_PER_LINE]; int misses = 0; word_file = open_file (filename, "r"); if (filename == NULL) { printf ("? "); fflush (stdout); } while (fgets (string, CHARS_PER_LINE, word_file) != NULL) { string [strlen (string) - 1] = (char) 0; if (strlen (string)) { if (debug) { debug=0; if (! word_in_dawg (dawg, string)) { puts (string); if (filename == NULL) { debug = 1; word_in_dawg (dawg, string); } } debug = 1; } else { if (!match_words (dawg, string, 0, 0)) ++misses; } } if (filename == NULL) { printf ("? "); fflush (stdout); } } fclose (word_file); // Make sure the user sees this with fprintf instead of tprintf. fprintf(stderr, "Number of lost words=%d\n", misses); } #if 0 /********************************************************************** * main * * Test the DAWG functions. **********************************************************************/ int main (argc, argv) int argc; char **argv; { inT32 max_num_edges = 700000; EDGE_ARRAY dawg; int argnum = 1; int show_nodes = FALSE; dawg = (EDGE_ARRAY) malloc (sizeof (EDGE_RECORD) * max_num_edges); if (dawg == NULL) { printf ("error: Could not allocate enough memory for DAWG "); printf ("(%ld,%03ld bytes needed)\n", sizeof (EDGE_RECORD) * max_num_edges / 1000, sizeof (EDGE_RECORD) * max_num_edges % 1000); exit (1); } if (! strcmp (argv[argnum], "-v")) { show_nodes = TRUE; argnum++; } if (strcmp (argv[argnum], "-f")) { read_squished_dawg (argv[argnum++], dawg, max_num_edges); } else { argnum++; read_full_dawg (argv[argnum++], dawg, max_num_edges); } printf ("argc = %d\n", argc); print_int ("argnum", argnum); print_string (argv[argnum]); if (argc < argnum + 1) { printf ("Type in words to search for: (use * for wildcard)\n"); debug = show_nodes; check_for_words (dawg, NULL); new_line (); } else { print_lost_words (dawg, argv[argnum]); } } #endif /********************************************************************** * match_words * * Match all of the words that are specified with this string. The *'s * in this string are wildcards. **********************************************************************/ bool match_words (EDGE_ARRAY dawg, char *string, inT32 index, NODE_REF node) { EDGE_REF edge; inT32 word_end; if (string[index] == '*') { bool any_matched = false; edge = node; do { string[index] = edge_letter (dawg, edge); if (match_words (dawg, string, index, node)) any_matched = true; } edge_loop (dawg, edge); string[index] = '*'; return any_matched; } else { word_end = (string[index+1] == (char) 0); edge = edge_char_of(dawg, node, static_cast(string[index]), word_end); if (edge != NO_EDGE) { /* Normal edge in DAWG */ node = next_node (dawg, edge); if (word_end) { printf ("%s\n", string); return true; } else if (node != 0) { return match_words (dawg, string, index+1, node); } } } return false; } /********************************************************************** * print_lost_words * * Check the DAWG for the words that are listed in the requested file. * A file name of NULL will cause the words to be read from stdin. Print * each of the words that can not be found in the DAWG. **********************************************************************/ void print_lost_words (EDGE_ARRAY dawg, char *filename) { FILE *word_file; char string [CHARS_PER_LINE]; word_file = open_file (filename, "r"); while (fgets (string, CHARS_PER_LINE, word_file) != NULL) { string [strlen (string) - 1] = (char) 0; if (strlen (string)) { if (! word_in_dawg (dawg, string)) { puts (string); } } } fclose (word_file); }