/* -*-C-*-
********************************************************************************
*
* File:         lookdawg.cpp
* Description:  Look up words in a Directed Acyclic Word Graph
* Author:       Mark Seaman, OCR Technology
* Created:      Fri Oct 16 14:37:00 1987
* Modified:     Thu Jul 25 17:09:55 1991 (Mark Seaman) marks@hpgrlt
* Language:     C
* Package:      N/A
* Status:       Reusable Software Component
*
* (c) Copyright 1987, Hewlett-Packard Company, all rights reserved.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
********************************************************************************
*/


/*
----------------------------------------------------------------------
                     I n c l u d e s
----------------------------------------------------------------------
*/

#include "lookdawg.h"

#include "cutil.h"
#include "trie.h"
#ifdef __UNIX__
#include <assert.h>
#endif

/*
----------------------------------------------------------------------
                     V a r i a b l e s
----------------------------------------------------------------------
*/

/*
----------------------------------------------------------------------
                     F u n c t i o n s
----------------------------------------------------------------------
*/

/**********************************************************************
* check_for_words
*
* Check the DAWG for the words that are listed in the requested file.
* A file name of NULL will cause the words to be read from stdin; in
* that case a "? " prompt is written to stdout before each line is read.
*
* Each non-empty line is treated as one word:
*   - When the global "debug" flag is set, the word is looked up with
*     word_in_dawg while debug is temporarily cleared.  Words that are
*     not found are printed, and when reading with a NULL file name the
*     lookup is repeated with debug switched on.  debug is left set to 1
*     afterwards.
*   - Otherwise the word is passed to match_words, starting at index 0
*     and node 0.
*
* Only a trailing newline is removed from each line, so the last line
* of a file that lacks a final newline (or a line cut short because it
* did not fit in CHARS_PER_LINE) keeps all of its characters.
**********************************************************************/

void check_for_words (EDGE_ARRAY dawg,
                      char       *filename) {
  FILE       *word_file;
  char       string [CHARS_PER_LINE];
  size_t     length;

  word_file = open_file (filename, "r");

  if (filename == NULL) {
    printf ("? ");
    fflush (stdout);
  }

  while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
    /* fgets keeps the newline when one was read; strip it only if it is
       really there.  Blindly clearing string[strlen - 1] would chop a
       real letter off an unterminated last line and would write before
       the buffer when the line is empty. */
    length = strlen (string);
    if (length > 0 && string [length - 1] == '\n') {
      string [--length] = (char) 0;
    }

    if (length > 0) {
      if (debug) {
        /* Do the first lookup quietly. */
        debug=0;
        if (! word_in_dawg (dawg, string)) {
          puts (string);
          if (filename == NULL) {
            /* Repeat the failed lookup with debug enabled. */
            debug = 1;
            word_in_dawg (dawg, string);
          }
        }
        debug = 1;
      }
      else {
        match_words (dawg, string, 0, 0);
      }
    }

    if (filename == NULL) {
      printf ("? ");
      fflush (stdout);
    }
  }
  /* NOTE(review): with a NULL file name this closes whatever stream
     open_file returned (presumably stdin) -- confirm that is intended. */
  fclose (word_file);
}

#if 0
/**********************************************************************
* main
*
* Test the DAWG functions.
*
* NOTE: this function sits inside an "#if 0" region, so it is not
* compiled.
*
* Argument handling, as written below:
*   - an optional leading "-v" sets show_nodes;
*   - the next argument is either a file name handed to
*     read_squished_dawg, or "-f" followed by a file name handed to
*     read_full_dawg;
*   - if no further argument remains, words are read through
*     check_for_words with a NULL file name (debug = show_nodes);
*     otherwise the remaining argument is handed to print_lost_words.
*
* NOTE(review): argv is indexed without first checking argc, and no
* value is returned from main -- worth fixing if this is ever revived.
**********************************************************************/

int main (argc, argv)
   int  argc;
   char **argv;
{
   inT32       max_num_edges  = 700000;
   EDGE_ARRAY  dawg;
   int         argnum = 1;         /* index of the next argument to consume */
   int         show_nodes = FALSE; /* set by "-v"; copied into debug below */

   /* Allocate room for max_num_edges edge records. */
   dawg = (EDGE_ARRAY) malloc (sizeof (EDGE_RECORD) * max_num_edges);
   if (dawg == NULL) {
      /* Report the requested size as "<thousands>,<remainder>" bytes.
         NOTE(review): the arguments are sizeof-based expressions printed
         with %ld -- confirm the types match on the target platform. */
      printf ("error: Could not allocate enough memory for DAWG  ");
      printf ("(%ld,%03ld bytes needed)\n",
	      sizeof (EDGE_RECORD) * max_num_edges / 1000,
	      sizeof (EDGE_RECORD) * max_num_edges % 1000);
      exit (1);
   }

   /* Optional "-v" flag. */
   if (! strcmp (argv[argnum], "-v")) {
      show_nodes = TRUE;
      argnum++;
   }

   /* strcmp is non-zero when the argument is NOT "-f": the argument is
      then taken as the file name for read_squished_dawg.  With "-f" the
      following argument is the file name for read_full_dawg. */
   if (strcmp  (argv[argnum], "-f")) {
      read_squished_dawg  (argv[argnum++], dawg, max_num_edges);
   }
   else {
      argnum++;
      read_full_dawg  (argv[argnum++], dawg, max_num_edges);
   }

   /* Diagnostic output of the argument state.
      NOTE(review): when no word file was given, argnum may equal argc
      here, so argv[argnum] would be the terminating null pointer. */
   printf ("argc = %d\n", argc);
   print_int ("argnum", argnum);
   print_string (argv[argnum]);

   if (argc < argnum + 1) {
      /* No word file argument left: prompt for words via check_for_words
         with a NULL file name. */
      printf ("Type in words to search for:          (use * for wildcard)\n");
      debug = show_nodes;
      check_for_words (dawg, NULL);
      new_line ();
   }
   else {
      /* A word file was named: list its words that are not in the DAWG. */
      print_lost_words (dawg, argv[argnum]);
   }
}
#endif

/**********************************************************************
* match_words
*
* Print every word in the DAWG that fits the pattern held in string,
* starting the search at position index of the pattern and at the given
* node.  A '*' in the pattern is a wildcard standing for one letter: it
* is replaced in turn by the letter of each edge of the current node,
* and the search is repeated for each substitution.  The '*' is put
* back before returning, so the caller's pattern is unchanged once the
* call completes.  Each complete match is written to stdout on its own
* line.
**********************************************************************/

void match_words (EDGE_ARRAY  dawg,
                  char        *string,
                  inT32         index,
                  NODE_REF    node) {
  EDGE_REF   candidate;
  NODE_REF   following;
  inT32      at_last_letter;

  if (string[index] != '*') {
    /* Ordinary letter: follow the matching edge, if there is one. */
    at_last_letter = (string[index + 1] == '\0');
    candidate = edge_char_of (dawg, node, string[index], at_last_letter);
    if (candidate == NO_EDGE) {
      return;
    }

    following = next_node (dawg, candidate);
    if (at_last_letter) {
      /* The whole pattern has been matched. */
      printf ("%s\n", string);
      return;
    }
    if (following != 0) {
      match_words (dawg, string, index + 1, following);
    }
    return;
  }

  /* Wildcard: try the letter of every edge of this node in turn, then
     restore the '*' for the caller. */
  candidate = node;
  do {
    string[index] = edge_letter (dawg, candidate);
    match_words (dawg, string, index, node);
  } edge_loop (dawg, candidate);
  string[index] = '*';
}


/**********************************************************************
* print_lost_words
*
* Check the DAWG for the words that are listed in the requested file.
* A file name of NULL will cause the words to be read from stdin. Print
* each of the words that can not be found in the DAWG.
*
* Each line of the file is taken as one word; empty lines are skipped.
* Only a trailing newline is removed from each line, so the last line
* of a file that lacks a final newline (or a line cut short because it
* did not fit in CHARS_PER_LINE) keeps all of its characters.
**********************************************************************/

void print_lost_words (EDGE_ARRAY dawg,
                       char       *filename) {
  FILE       *word_file;
  char       string [CHARS_PER_LINE];
  size_t     length;

  word_file = open_file (filename, "r");

  while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
    /* fgets keeps the newline when one was read; strip it only if it is
       really there.  Blindly clearing string[strlen - 1] would chop a
       real letter off an unterminated last line and would write before
       the buffer when the line is empty. */
    length = strlen (string);
    if (length > 0 && string [length - 1] == '\n') {
      string [--length] = (char) 0;
    }

    if (length > 0) {
      if (! word_in_dawg (dawg, string)) {
        puts (string);
      }
    }
  }
  fclose (word_file);
}