tesseract/dict/context.cpp

/* -*-C-*-
 ********************************************************************************
 *
 * File:        context.c  (Formerly context.c)
 * Description:  Context checking functions
 * Author:       Mark Seaman, OCR Technology
 * Created:      Thu Feb 15 11:18:24 1990
 * Modified:     Tue Jul  9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
 * Language:     C
 * Package:      N/A
 * Status:       Experimental (Do Not Distribute)
 *
 * (c) Copyright 1990, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *********************************************************************************/
#include "context.h"
#include "tordvars.h"
#include "callcpp.h"
#include "globals.h"

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <math.h>

// Initialize probability_in_context to point to a default implementation (a
// main program can override this).
PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;

double def_probability_in_context(const char* context,
                                  int context_bytes,
                                  const char* character,
                                  int character_bytes) {
  (void) context;
  (void) context_bytes;
  (void) character;
  (void) character_bytes;
  return 0.0;
}

/*----------------------------------------------------------------------
              V a r i a b l e s
----------------------------------------------------------------------*/
static FILE *choice_file = NULL; /* File to save choices */

/*----------------------------------------------------------------------
              F u n c t i o n s
----------------------------------------------------------------------*/
/**********************************************************************
 * close_choices
 *
 * Close the choices file.
 **********************************************************************/
void close_choices() {
  if (choice_file)
    fclose(choice_file);
}


/**********************************************************************
 * fix_quotes
 *
 * Fix up two single quote to make them two double quotes.
 **********************************************************************/
void fix_quotes(char *str) {
  int i;
  for (i = 0; i < strlen (str); i++) {

    if (((str[i] == '\'') || (str[i] == '`')) &&
    ((str[i + 1] == '\'') || (str[i + 1] == '`'))) {
      str[i] = '\"';
      strcpy (str + i + 1, str + i + 2);
    }
  }
}


/**********************************************************************
 * punctuation_ok
 *
 * Check a string to see if it matches a set of punctuation rules.
 **********************************************************************/
int punctuation_ok(const char *word, const char *lengths) {
  int punctuation_types[5];
  int trailing = 0;
  int num_puncts = 0;
  register int x;
  int offset;
  UNICHAR_ID ch_id;

  for (x = 0; x < 5; x++)
    punctuation_types[x] = 0;

  // check for un-supported symbols
  for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
    // a un-supported symbol
    if (!unicharset.contains_unichar (word + offset, lengths[x])) {
      return -1;
    }
  }

  for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {
    if (unicharset.get_isalpha (word + offset, lengths[x])) {
      if (trailing &&
        !(unicharset.get_isalpha (word + offset - lengths[x - 1], lengths[x - 1])
#if 0
          ||
        (word[x - 1] == '\'' &&
        (word[x] == 's' || word[x] == 'd' || word[x] == 'l')) ||
        (word[x - 1] == '-')
#endif
          ))
        return (-1);
      trailing = 1;
    }
    else {
      ch_id = unicharset.unichar_to_id(word + offset, lengths[x]);

      if (unicharset.eq(ch_id, ".") && trailing) {
        if (punctuation_types[0])
          return (-1);
        (punctuation_types[0])++;
      }

      else if (((unicharset.eq(ch_id, "{")) ||
                (unicharset.eq(ch_id, "[")) ||
                (unicharset.eq(ch_id, "("))) && !trailing) {
        if (punctuation_types[1])
          return (-1);
        (punctuation_types[1])++;
      }

      else if (((unicharset.eq(ch_id, "}")) ||
                (unicharset.eq(ch_id, "]")) ||
                (unicharset.eq(ch_id, ")"))) && trailing) {
        if (punctuation_types[2])
          return (-1);
        (punctuation_types[2])++;
      }

      else if (((unicharset.eq(ch_id, ":")) ||
                (unicharset.eq(ch_id, ";")) ||
                (unicharset.eq(ch_id, "!")) ||
                (unicharset.eq(ch_id, "-")) ||
                (unicharset.eq(ch_id, ",")) ||
                (unicharset.eq(ch_id, "?"))) && trailing) {
        if (punctuation_types[3])
          return (-1);
        (punctuation_types[3])++;
        if (unicharset.eq(ch_id, "-"))
          punctuation_types[3] = 0;
      }

      else if (x < strlen(lengths) - 1 &&
               ((unicharset.eq(ch_id, "`")) ||
               (unicharset.eq(ch_id, "\"")) ||
               (unicharset.eq(ch_id, "\'")))) {
        UNICHAR_ID ch_id2 = unicharset.unichar_to_id(word + offset + lengths[x],
                                                     lengths[x + 1]);
        if ((unicharset.eq(ch_id2, "`")) ||
            (unicharset.eq(ch_id2, "\'"))) {
          offset += lengths[x++];
        }
        (punctuation_types[4])++;
        if (punctuation_types[4] > 2)
          return (-1);
      }

      else if (!unicharset.get_isdigit (ch_id))
        return (-1);
    }
  }

  for (x = 0; x < 5; x++) {
    if (punctuation_types[x])
      num_puncts++;
  }

  return (num_puncts);
}


/**********************************************************************
 * case_ok
 *
 * Check a string to see if it matches a set of lexical rules.
 **********************************************************************/
int case_ok(const char *word, const char *lengths) {
  static int case_state_table[6][4] = { {
                                 /*  0. Begining of word         */
    /*    P   U   L   D                                     */
    /* -1. Error on case            */
      0, 1, 5, 4
    },
    {                            /*  1. After initial capital    */
      0, 3, 2, 4
    },
    {                            /*  2. After lower case         */
      0, -1, 2, -1
    },
    {                            /*  3. After upper case         */
      0, 3, -1, 4
    },
    {                            /*  4. After a digit            */
      0, -1, -1, 4
    },
    {                            /*  5. After initial lower case */
      5, -1, 2, -1
    },
  };

  register int last_state = 0;
  register int state = 0;
  register int x;
  int offset;
  UNICHAR_ID ch_id;

  for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {

    ch_id = unicharset.unichar_to_id(word + offset, lengths[x]);
    if (unicharset.get_islower (ch_id))
      state = case_state_table[state][2];
    else if (unicharset.get_isupper (ch_id))
      state = case_state_table[state][1];
    else if (unicharset.get_isdigit (ch_id))
      state = case_state_table[state][3];
    else
      state = case_state_table[state][0];

    if (debug_3)
      cprintf ("Case state = %d, char = %s\n", state,
               unicharset.id_to_unichar(ch_id));
    if (state == -1) {
                                 /* Handle ACCRONYMs */
#if 0
      if (word[x] == 's' &&
        !isalpha (word[x + 1]) && !isdigit (word[x + 1]))
        state = last_state;
      else
#endif
        return (FALSE);
    }

    last_state = state;
  }
  return state != 5;             /*single lower is bad */
}


/**********************************************************************
 * write_choice_line
 *
 * Write a blank line to the choices file.  This will indicate that
 * there is a new word that is following.
 **********************************************************************/
void write_choice_line() {
  if (choice_file) {
    fprintf (choice_file, "\n");
    fflush(choice_file);
  }
}
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`/* --C--`
			`********************************************************************************`
			`*`
			`* File: context.c (Formerly context.c)`
			`* Description: Context checking functions`
			`* Author: Mark Seaman, OCR Technology`
			`* Created: Thu Feb 15 11:18:24 1990`
			`* Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt`
			`* Language: C`
			`* Package: N/A`
			`* Status: Experimental (Do Not Distribute)`
			`*`
			`* (c) Copyright 1990, Hewlett-Packard Company.`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*`
			`*********************************************************************************/`
			`#include "context.h"`
			`#include "tordvars.h"`
			`#include "callcpp.h"`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`#include "globals.h"`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
			`#include <stdio.h>`
			`#include <ctype.h>`
			`#include <string.h>`
			`#include <math.h>`

Major internationalization improvements git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-02-01 08:21:49 +08:00			`// Initialize probability_in_context to point to a default implementation (a`
			`// main program can override this).`
			`PROBABILITY_IN_CONTEXT_FUNCTION probability_in_context = &def_probability_in_context;`

			`double def_probability_in_context(const char* context,`
			`int context_bytes,`
			`const char* character,`
			`int character_bytes) {`
			`(void) context;`
			`(void) context_bytes;`
			`(void) character;`
			`(void) character_bytes;`
			`return 0.0;`
			`}`

top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`/*----------------------------------------------------------------------`
			`V a r i a b l e s`
			`----------------------------------------------------------------------*/`
			`static FILE choice_file = NULL; / File to save choices */`

			`/*----------------------------------------------------------------------`
			`F u n c t i o n s`
			`----------------------------------------------------------------------*/`
			`/**********************************************************************`
			`* close_choices`
			`*`
			`* Close the choices file.`
			`**********************************************************************/`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`void close_choices() {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (choice_file)`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`fclose(choice_file);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`


			`/**********************************************************************`
			`* fix_quotes`
			`*`
			`* Fix up two single quote to make them two double quotes.`
			`**********************************************************************/`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`void fix_quotes(char *str) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`int i;`
			`for (i = 0; i < strlen (str); i++) {`

			if (((str[i] == '\'') \|\| (str[i] == '`')) &&
			((str[i + 1] == '\'') \|\| (str[i + 1] == '`'))) {
			`str[i] = '\"';`
			`strcpy (str + i + 1, str + i + 2);`
			`}`
			`}`
			`}`


			`/**********************************************************************`
			`* punctuation_ok`
			`*`
			`* Check a string to see if it matches a set of punctuation rules.`
			`**********************************************************************/`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`int punctuation_ok(const char word, const char lengths) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`int punctuation_types[5];`
			`int trailing = 0;`
			`int num_puncts = 0;`
			`register int x;`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`int offset;`
			`UNICHAR_ID ch_id;`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
			`for (x = 0; x < 5; x++)`
			`punctuation_types[x] = 0;`

Major internationalization improvements git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-02-01 08:21:49 +08:00			`// check for un-supported symbols`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {`
Major internationalization improvements git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-02-01 08:21:49 +08:00			`// a un-supported symbol`
			`if (!unicharset.contains_unichar (word + offset, lengths[x])) {`
			`return -1;`
			`}`
			`}`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
Major internationalization improvements git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-02-01 08:21:49 +08:00			`for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`if (unicharset.get_isalpha (word + offset, lengths[x])) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (trailing &&`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`!(unicharset.get_isalpha (word + offset - lengths[x - 1], lengths[x - 1])`
			`#if 0`
			`\|\|`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`(word[x - 1] == '\'' &&`
			`(word[x] == 's' \|\| word[x] == 'd' \|\| word[x] == 'l')) \|\|`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`(word[x - 1] == '-')`
			`#endif`
			`))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`return (-1);`
			`trailing = 1;`
			`}`
			`else {`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`ch_id = unicharset.unichar_to_id(word + offset, lengths[x]);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`if (unicharset.eq(ch_id, ".") && trailing) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (punctuation_types[0])`
			`return (-1);`
			`(punctuation_types[0])++;`
			`}`

Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`else if (((unicharset.eq(ch_id, "{")) \|\|`
			`(unicharset.eq(ch_id, "[")) \|\|`
			`(unicharset.eq(ch_id, "("))) && !trailing) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (punctuation_types[1])`
			`return (-1);`
			`(punctuation_types[1])++;`
			`}`

Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`else if (((unicharset.eq(ch_id, "}")) \|\|`
			`(unicharset.eq(ch_id, "]")) \|\|`
			`(unicharset.eq(ch_id, ")"))) && trailing) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (punctuation_types[2])`
			`return (-1);`
			`(punctuation_types[2])++;`
			`}`

Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`else if (((unicharset.eq(ch_id, ":")) \|\|`
			`(unicharset.eq(ch_id, ";")) \|\|`
			`(unicharset.eq(ch_id, "!")) \|\|`
			`(unicharset.eq(ch_id, "-")) \|\|`
			`(unicharset.eq(ch_id, ",")) \|\|`
			`(unicharset.eq(ch_id, "?"))) && trailing) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (punctuation_types[3])`
			`return (-1);`
			`(punctuation_types[3])++;`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`if (unicharset.eq(ch_id, "-"))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`punctuation_types[3] = 0;`
			`}`

Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`else if (x < strlen(lengths) - 1 &&`
			((unicharset.eq(ch_id, "`")) \|\|
			`(unicharset.eq(ch_id, "\"")) \|\|`
			`(unicharset.eq(ch_id, "\'")))) {`
			`UNICHAR_ID ch_id2 = unicharset.unichar_to_id(word + offset + lengths[x],`
			`lengths[x + 1]);`
			if ((unicharset.eq(ch_id2, "`")) \|\|
			`(unicharset.eq(ch_id2, "\'"))) {`
			`offset += lengths[x++];`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
			`(punctuation_types[4])++;`
			`if (punctuation_types[4] > 2)`
			`return (-1);`
			`}`

Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`else if (!unicharset.get_isdigit (ch_id))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`return (-1);`
			`}`
			`}`

			`for (x = 0; x < 5; x++) {`
			`if (punctuation_types[x])`
			`num_puncts++;`
			`}`

			`return (num_puncts);`
			`}`


			`/**********************************************************************`
			`* case_ok`
			`*`
			`* Check a string to see if it matches a set of lexical rules.`
			`**********************************************************************/`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`int case_ok(const char word, const char lengths) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`static int case_state_table[6][4] = { {`
			`/* 0. Begining of word */`
			`/* P U L D */`
			`/* -1. Error on case */`
			`0, 1, 5, 4`
			`},`
			`{ /* 1. After initial capital */`
			`0, 3, 2, 4`
			`},`
			`{ /* 2. After lower case */`
			`0, -1, 2, -1`
			`},`
			`{ /* 3. After upper case */`
			`0, 3, -1, 4`
			`},`
			`{ /* 4. After a digit */`
			`0, -1, -1, 4`
			`},`
			`{ /* 5. After initial lower case */`
			`5, -1, 2, -1`
			`},`
			`};`

			`register int last_state = 0;`
			`register int state = 0;`
			`register int x;`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`int offset;`
			`UNICHAR_ID ch_id;`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`for (x = 0, offset = 0; x < strlen (lengths); offset += lengths[x++]) {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`ch_id = unicharset.unichar_to_id(word + offset, lengths[x]);`
			`if (unicharset.get_islower (ch_id))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`state = case_state_table[state][2];`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`else if (unicharset.get_isupper (ch_id))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`state = case_state_table[state][1];`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`else if (unicharset.get_isdigit (ch_id))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`state = case_state_table[state][3];`
			`else`
			`state = case_state_table[state][0];`

			`if (debug_3)`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`cprintf ("Case state = %d, char = %s\n", state,`
			`unicharset.id_to_unichar(ch_id));`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (state == -1) {`
			`/* Handle ACCRONYMs */`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`#if 0`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (word[x] == 's' &&`
			`!isalpha (word[x + 1]) && !isdigit (word[x + 1]))`
			`state = last_state;`
			`else`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`#endif`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`return (FALSE);`
			`}`

			`last_state = state;`
			`}`
			`return state != 5; /single lower is bad /`
			`}`


			`/**********************************************************************`
			`* write_choice_line`
			`*`
			`* Write a blank line to the choices file. This will indicate that`
			`* there is a new word that is following.`
			`**********************************************************************/`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`void write_choice_line() {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`if (choice_file) {`
			`fprintf (choice_file, "\n");`
Remaining changes for Unicodeization project git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:15:07 +08:00			`fflush(choice_file);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
			`}`