tesseract/training/wordlist2dawg.cpp

///////////////////////////////////////////////////////////////////////
// File:        wordlist2dawg.cpp
// Description: Program to generate a DAWG from a word list file
// Author:      Thomas Kielbus
// Created:     Thu May 10 18:11:42 PDT 2007
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

// Given a file that contains a list of words (one word per line) this program
// generates the corresponding squished DAWG file.

#include <stdio.h>

#include "classify.h"
#include "dawg.h"
#include "dict.h"
#include "emalloc.h"
#include "freelist.h"
#include "helpers.h"
#include "serialis.h"
#include "trie.h"
#include "unicharset.h"

// Edge limit (ten million) handed to the constructor of every Trie built by
// this tool.
static const int kMaxNumEdges = 10 * 1000 * 1000;

// Entry point. The command line selects one of three modes:
//
//   wordlist2dawg word_list_file dawg_file unicharset_file
//       Reads the word list into a Trie, reduces it to a SquishedDawg and
//       writes it to dawg_file (nothing is written if the dawg is empty).
//
//   wordlist2dawg -t word_list_file dawg_file unicharset_file
//       Loads an existing SquishedDawg from dawg_file and runs
//       check_for_words() on it with the given word list.
//
//   wordlist2dawg -l min_len max_len word_list_file dawg_file unicharset_file
//       Places the words of each length in [min_len, max_len] in their own
//       Trie and writes the resulting dawgs to dawg_file through
//       Dict::WriteFixedLengthDawgs().
//
// Returns 1 on invalid arguments or if the unicharset cannot be loaded, and
// 0 when the selected mode runs to completion. All other failures terminate
// the process through exit(1).
int main(int argc, char** argv) {
  // Only meaningful in -l mode; initialized so they never hold garbage.
  int min_word_length = 0;
  int max_word_length = 0;
  if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
        (argc == 7 && strcmp(argv[1], "-l") == 0 &&
         sscanf(argv[2], "%d", &min_word_length) == 1 &&
         sscanf(argv[3], "%d", &max_word_length) == 1))) {
    printf("Usage: %s [-t | -l min_len max_len] word_list_file"
           " dawg_file unicharset_file\n", argv[0]);
    return 1;
  }
  // Reject length ranges the -l mode cannot handle: a negative min_len makes
  // the dawg count passed to WriteFixedLengthDawgs() larger than the number
  // of dawgs actually stored in dawg_vec, and max_len < min_len yields a
  // zero or negative count.
  if (argc == 7 &&
      (min_word_length < 0 || max_word_length < min_word_length)) {
    printf("Invalid word length range [%d, %d]:"
           " expected 0 <= min_len <= max_len\n",
           min_word_length, max_word_length);
    return 1;
  }
  tesseract::Classify *classify = new tesseract::Classify();
  // Skip the option flag (and, for -l, its two length arguments) so that the
  // three file names are picked up by the same code in every mode.
  int argv_index = 0;
  if (argc == 5) ++argv_index;
  if (argc == 7) argv_index += 3;
  const char* wordlist_filename = argv[++argv_index];
  const char* dawg_filename = argv[++argv_index];
  const char* unicharset_file = argv[++argv_index];
  tprintf("Loading unicharset from '%s'\n", unicharset_file);
  if (!classify->getDict().getUnicharset().load_from_file(unicharset_file)) {
    tprintf("Failed to load unicharset from '%s'\n", unicharset_file);
    delete classify;
    return 1;
  }
  const UNICHARSET &unicharset = classify->getDict().getUnicharset();
  if (argc == 4) {
    // Default mode: word list -> Trie -> SquishedDawg -> dawg_file.
    tesseract::Trie trie(
        // the first 3 arguments are not used in this case
        tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
        kMaxNumEdges, unicharset.size(),
        classify->getDict().dawg_debug_level);
    tprintf("Reading word list from '%s'\n", wordlist_filename);
    if (!trie.read_word_list(wordlist_filename, unicharset)) {
      tprintf("Failed to read word list from '%s'\n", wordlist_filename);
      exit(1);
    }
    tprintf("Reducing Trie to SquishedDawg\n");
    tesseract::SquishedDawg *dawg = trie.trie_to_dawg();
    if (dawg != NULL && dawg->NumEdges() > 0) {
      tprintf("Writing squished DAWG to '%s'\n", dawg_filename);
      dawg->write_squished_dawg(dawg_filename);
    } else {
      tprintf("Dawg is empty, skip producing the output file\n");
    }
    delete dawg;
  } else if (argc == 5) {
    // -t mode: dawg_file is an input here; it is loaded and checked against
    // the word list.
    tprintf("Loading dawg DAWG from '%s'\n", dawg_filename);
    tesseract::SquishedDawg words(
        dawg_filename,
        // these 3 arguments are not used in this case
        tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
        classify->getDict().dawg_debug_level);
    tprintf("Checking word list from '%s'\n", wordlist_filename);
    words.check_for_words(wordlist_filename, unicharset, true);
  } else if (argc == 7) {
    // -l mode: place words of different lengths in separate Dawgs.
    char str[CHARS_PER_LINE];
    FILE *word_file = fopen(wordlist_filename, "rb");
    if (word_file == NULL) {
      tprintf("Failed to open wordlist file %s\n", wordlist_filename);
      exit(1);
    }
    FILE *dawg_file = fopen(dawg_filename, "wb");
    if (dawg_file == NULL) {
      tprintf("Failed to open dawg output file %s\n", dawg_filename);
      fclose(word_file);
      exit(1);
    }
    tprintf("Reading word list from '%s'\n", wordlist_filename);
    // trie_vec[k] collects the words of length min_word_length + k.
    GenericVector<tesseract::Trie *> trie_vec;
    int i;
    for (i = min_word_length; i <= max_word_length; ++i) {
      trie_vec.push_back(new tesseract::Trie(
          // the first 3 arguments are not used in this case
          tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
          kMaxNumEdges, unicharset.size(),
          classify->getDict().dawg_debug_level));
    }
    // NOTE: fgets() reads at most CHARS_PER_LINE - 1 characters per call, so
    // a longer input line is processed as several separate "words".
    while (fgets(str, CHARS_PER_LINE, word_file) != NULL) {
      chomp_string(str);  // remove newline
      WERD_CHOICE word(str, unicharset);
      // Words outside the requested length range, or containing an invalid
      // unichar id, are silently skipped.
      if (word.length() >= min_word_length &&
          word.length() <= max_word_length &&
          !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
        tesseract::Trie *curr_trie = trie_vec[word.length()-min_word_length];
        if (!curr_trie->word_in_dawg(word)) {
          curr_trie->add_word_to_dawg(word);
          if (classify->getDict().dawg_debug_level > 1) {
            tprintf("Added word %s of length %d\n", str, word.length());
          }
          // Sanity check: the word must be found right after insertion.
          if (!curr_trie->word_in_dawg(word)) {
            tprintf("Error: word '%s' not in DAWG after adding it\n", str);
            exit(1);
          }
        }
      }
    }
    fclose(word_file);
    tprintf("Writing fixed length dawgs to '%s'\n", dawg_filename);
    // dawg_vec is indexed by word length: entries below min_word_length are
    // NULL placeholders, the rest are the reduced tries.
    GenericVector<tesseract::SquishedDawg *> dawg_vec;
    for (i = 0; i <= max_word_length; ++i) {
      dawg_vec.push_back(i < min_word_length ? NULL :
                         trie_vec[i-min_word_length]->trie_to_dawg());
    }
    tesseract::Dict::WriteFixedLengthDawgs(
        dawg_vec, max_word_length - min_word_length + 1,
        classify->getDict().dawg_debug_level, dawg_file);
    fclose(dawg_file);
    dawg_vec.delete_data_pointers();
    trie_vec.delete_data_pointers();
  } else {  // should never get here
    tprintf("Invalid command-line options\n");
    exit(1);
  }
  delete classify;
  return 0;
}
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`///////////////////////////////////////////////////////////////////////`
			`// File: wordlist2dawg.cpp`
			`// Description: Program to generate a DAWG from a word list file`
			`// Author: Thomas Kielbus`
			`// Created: Thu May 10 18:11:42 PDT 2007`
			`//`
			`// (C) Copyright 2006, Google Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`//`
			`///////////////////////////////////////////////////////////////////////`

			`// Given a file that contains a list of words (one word per line) this program`
			`// generates the corresponding squished DAWG file.`

			`#include <stdio.h>`

Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`#include "classify.h"`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`#include "dawg.h"`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`#include "dict.h"`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`#include "emalloc.h"`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`#include "freelist.h"`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`#include "helpers.h"`
			`#include "serialis.h"`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`#include "trie.h"`
			`#include "unicharset.h"`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`static const int kMaxNumEdges = 10000000;`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`int main(int argc, char** argv) {`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`int min_word_length;`
			`int max_word_length;`
			`if (!(argc == 4 \|\| (argc == 5 && strcmp(argv[1], "-t") == 0) \|\|`
			`(argc == 7 && strcmp(argv[1], "-l") == 0 &&`
			`sscanf(argv[2], "%d", &min_word_length) == 1 &&`
			`sscanf(argv[3], "%d", &max_word_length) == 1))) {`
			`printf("Usage: %s [-t \| -l min_len max_len] word_list_file"`
			`" dawg_file unicharset_file", argv[0]);`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`return 1;`
			`}`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`tesseract::Classify *classify = new tesseract::Classify();`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`int argv_index = 0;`
			`if (argc == 5) ++argv_index;`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`if (argc == 7) argv_index += 3;`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`const char* wordlist_filename = argv[++argv_index];`
			`const char* dawg_filename = argv[++argv_index];`
			`const char* unicharset_file = argv[++argv_index];`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Loading unicharset from '%s'\n", unicharset_file);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`if (!classify->getDict().getUnicharset().load_from_file(unicharset_file)) {`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`tprintf("Failed to load unicharset from '%s'\n", unicharset_file);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`delete classify;`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`return 1;`
			`}`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`const UNICHARSET &unicharset = classify->getDict().getUnicharset();`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`if (argc == 4) {`
			`tesseract::Trie trie(`
			`// the first 3 arguments are not used in this case`
			`tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`kMaxNumEdges, unicharset.size(),`
			`classify->getDict().dawg_debug_level);`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Reading word list from '%s'\n", wordlist_filename);`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`if (!trie.read_word_list(wordlist_filename, unicharset)) {`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Failed to read word list from '%s'\n", wordlist_filename);`
Fixed name collision with jpeg library git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@162 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-04-22 08:39:51 +08:00			`exit(1);`
			`}`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Reducing Trie to SquishedDawg\n");`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`tesseract::SquishedDawg *dawg = trie.trie_to_dawg();`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`if (dawg != NULL && dawg->NumEdges() > 0) {`
			`tprintf("Writing squished DAWG to '%s'\n", dawg_filename);`
Fixed bug with empty dawgs git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@534 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-30 09:01:13 +08:00			`dawg->write_squished_dawg(dawg_filename);`
			`} else {`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Dawg is empty, skip producing the output file\n");`
Fixed bug with empty dawgs git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@534 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-30 09:01:13 +08:00			`}`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`delete dawg;`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`} else if (argc == 5) {`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Loading dawg DAWG from '%s'\n", dawg_filename);`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`tesseract::SquishedDawg words(`
			`dawg_filename,`
			`// these 3 arguments are not used in this case`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,`
			`classify->getDict().dawg_debug_level);`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Checking word list from '%s'\n", wordlist_filename);`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`words.check_for_words(wordlist_filename, unicharset, true);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`} else if (argc == 7) {`
			`// Place words of different lengths in separate Dawgs.`
			`char str[CHARS_PER_LINE];`
show page 0 for multipage tiff; Windows: use binary mode for fopen (issue 70); autotools: fixed cutil/Makefile.am, improved tessdata/Makefile.am; git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@604 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-08-12 05:42:13 +08:00			`FILE *word_file = fopen(wordlist_filename, "rb");`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`if (word_file == NULL) {`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Failed to open wordlist file %s\n", wordlist_filename);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`exit(1);`
			`}`
			`FILE *dawg_file = fopen(dawg_filename, "wb");`
			`if (dawg_file == NULL) {`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Failed to open dawg output file %s\n", dawg_filename);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`exit(1);`
			`}`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Reading word list from '%s'\n", wordlist_filename);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`GenericVector<tesseract::Trie *> trie_vec;`
			`int i;`
			`for (i = min_word_length; i <= max_word_length; ++i) {`
			`trie_vec.push_back(new tesseract::Trie(`
			`// the first 3 arguments are not used in this case`
			`tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,`
			`kMaxNumEdges, unicharset.size(),`
			`classify->getDict().dawg_debug_level));`
			`}`
			`while (fgets(str, CHARS_PER_LINE, word_file) != NULL) {`
			`chomp_string(str); // remove newline`
			`WERD_CHOICE word(str, unicharset);`
			`if (word.length() >= min_word_length &&`
			`word.length() <= max_word_length &&`
			`!word.contains_unichar_id(INVALID_UNICHAR_ID)) {`
			`tesseract::Trie *curr_trie = trie_vec[word.length()-min_word_length];`
			`if (!curr_trie->word_in_dawg(word)) {`
			`curr_trie->add_word_to_dawg(word);`
			`if (classify->getDict().dawg_debug_level > 1) {`
			`tprintf("Added word %s of length %d\n", str, word.length());`
			`}`
			`if (!curr_trie->word_in_dawg(word)) {`
			`tprintf("Error: word '%s' not in DAWG after adding it\n", str);`
			`exit(1);`
			`}`
			`}`
			`}`
			`}`
			`fclose(word_file);`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Writing fixed length dawgs to '%s'\n", dawg_filename);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`GenericVector<tesseract::SquishedDawg *> dawg_vec;`
			`for (i = 0; i <= max_word_length; ++i) {`
			`dawg_vec.push_back(i < min_word_length ? NULL :`
			`trie_vec[i-min_word_length]->trie_to_dawg());`
			`}`
			`tesseract::Dict::WriteFixedLengthDawgs(`
			`dawg_vec, max_word_length - min_word_length + 1,`
			`classify->getDict().dawg_debug_level, dawg_file);`
			`fclose(dawg_file);`
			`dawg_vec.delete_data_pointers();`
			`trie_vec.delete_data_pointers();`
			`} else { // should never get here`
Various fixes, including memory leak in fixspace, font labels on output, removed some annoying debug output, fixes to initialization of parameters, general cleanup, and added Hindi git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@573 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2011-03-22 05:48:58 +08:00			`tprintf("Invalid command-line options\n");`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`exit(1);`
Fixed name collision with jpeg library git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@162 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-04-22 08:39:51 +08:00			`}`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`delete classify;`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`return 0;`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`}`