tesseract/dict/dawg.h
2008-02-01 00:21:49 +00:00

395 lines
13 KiB
C

/* -*-C-*-
********************************************************************************
*
* File: dawg.h (Formerly dawg.h)
* Description:
* Author: Mark Seaman, SW Productivity
* Created: Fri Oct 16 14:37:00 1987
* Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Reusable Software Component
*
* (c) Copyright 1987, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
#ifndef DAWG_H
#define DAWG_H
/*----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------*/
#include <ctype.h>
#include "general.h"
/*----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------*/
/* #define MAX_WERD_LENGTH (INT32) 40 */
/* #define MAX_NODE_EDGES_DISPLAY (INT32) 100 */
/* #define LAST_FLAG (INT32) 1 */
/* #define DIRECTION_FLAG (INT32) 2 */
/* #define WERD_END_FLAG (INT32) 4 */
/* #define LETTER_START_BIT 0 */
/* #define FLAG_START_BIT 8 */
/* #define NEXT_EDGE_START_BIT 11 */
/* #define NO_EDGE (INT32) 0x001fffff */
/* #define NEXT_EDGE_MASK (INT32) 0xfffff800 */
/* #define FLAGS_MASK (INT32) 0x00000700 */
/* #define LETTER_MASK (INT32) 0x000000ff */
/* #define REFFORMAT "%d" */
/* typedef UINT32 EDGE_RECORD; */
/* typedef EDGE_RECORD *EDGE_ARRAY; */
/* typedef INT32 EDGE_REF; */
/* typedef INT32 NODE_REF; */
/*
 * Limits and bit-layout constants for the 64-bit edge record.
 *
 * LAST_FLAG, DIRECTION_FLAG and WERD_END_FLAG are the three flag bits
 * before shifting; the accessor macros in this header shift them left by
 * FLAG_START_BIT.  The *_START_BIT values give the lowest bit of each
 * field: letter at bit 0, flags at bit 8, next-node link at bit 11.
 *
 * Every cast expression is wrapped in parentheses so that each macro
 * expands to a single primary expression (for example
 * `sizeof MAX_WERD_LENGTH` would not parse otherwise).
 */
#define MAX_WERD_LENGTH        ((INT64) 40)
#define MAX_NODE_EDGES_DISPLAY ((INT64) 100)
#define LAST_FLAG              ((INT64) 1)
#define DIRECTION_FLAG         ((INT64) 2)
#define WERD_END_FLAG          ((INT64) 4)
#define LETTER_START_BIT       0
#define FLAG_START_BIT         8
#define NEXT_EDGE_START_BIT    11
/*
 * Field masks for a 64-bit edge record.
 *
 *   LETTER_MASK     bits 0-7
 *   FLAGS_MASK      bits 8-10
 *   NEXT_EDGE_MASK  bits 11-63
 *   NO_EDGE         the low 53 bits set, i.e. the largest value that fits
 *                   in the next-edge field (presumably the "no such edge"
 *                   sentinel -- confirm against the callers)
 *
 * The __MSW32__ branch spells the constants with the Microsoft `i64`
 * suffix; every other build uses the standard `ll` suffix.  Each body is
 * fully parenthesized so it behaves as one primary expression.
 */
#ifdef __MSW32__
#define NO_EDGE        ((INT64) 0x001fffffffffffffi64)
#define NEXT_EDGE_MASK ((INT64) 0xfffffffffffff800i64)
#define FLAGS_MASK     ((INT64) 0x0000000000000700i64)
#define LETTER_MASK    ((INT64) 0x00000000000000ffi64)
#else
#define NO_EDGE        ((INT64) 0x001fffffffffffffll)
#define NEXT_EDGE_MASK ((INT64) 0xfffffffffffff800ll)
#define FLAGS_MASK     ((INT64) 0x0000000000000700ll)
#define LETTER_MASK    ((INT64) 0x00000000000000ffll)
#endif
/* Edge-count limit for a squished DAWG file, going by the name.
   NOTE(review): where this limit is enforced is not visible in this
   header -- confirm in the reader. */
#define MAX_NUM_EDGES_IN_SQUISHED_DAWG_FILE 2000000
/* printf conversion for EDGE_REF / NODE_REF values (both INT64).
   NOTE(review): "%lld" assumes INT64 is `long long` -- confirm in
   general.h. */
#define REFFORMAT "%lld"
/* One DAWG edge packed into an unsigned 64-bit word; the letter, flags
   and next-node link are extracted with LETTER_MASK, FLAGS_MASK and
   NEXT_EDGE_MASK. */
typedef UINT64 EDGE_RECORD;
/* Pointer to edge records; the accessor macros index it as edges[e]. */
typedef EDGE_RECORD *EDGE_ARRAY;
/* Signed 64-bit edge reference (presumably an index into an EDGE_ARRAY). */
typedef INT64 EDGE_REF;
/* Signed 64-bit node reference (presumably the index of a node's first
   edge -- TODO confirm against the implementation). */
typedef INT64 NODE_REF;
/*---------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
/* Defined in another translation unit.  Presumably selects case-sensitive
   dictionary matching -- confirm at the definition.
   NOTE(review): "sensative" is a misspelling of "sensitive"; renaming it
   would require touching the definition and every user. */
extern INT32 case_sensative;
/* Defined in another translation unit.  Presumably a debug-output level
   or switch -- confirm at the definition. */
extern INT32 debug;
/*----------------------------------------------------------------------
M a c r o s
----------------------------------------------------------------------*/
/**********************************************************************
 * edge_of
 *
 * Yield the edge record stored at index e of the edge array.  The
 * expansion is an lvalue, so it can be read from or assigned to.
 **********************************************************************/
#define edge_of(edges,e) ((edges)[(e)])
/**********************************************************************
 * print_edge
 *
 * Print one line describing a single edge of the DAWG: its index, the
 * node it leads to, its letter, and the FORWARD / LAST / EOW markers.
 *
 * The edge index and the next-node value are 64-bit quantities in this
 * header, so both are cast to long long and printed with %lld; passing
 * them to %d is undefined behaviour.  The `edge` argument is expanded
 * several times, so it must not have side effects.
 **********************************************************************/
#define print_edge(dawg,edge)                                     \
  printf ("%7lld : next = %7lld, char = '%c', %s %s %s\n",        \
          (long long) (edge),                                     \
          (long long) next_node (dawg, edge),                     \
          edge_letter (dawg, edge),                               \
          (forward_edge (dawg, edge) ? "FORWARD" : " "),          \
          (last_edge (dawg, edge) ? "LAST" : " "),                \
          (end_of_word (dawg, edge) ? "EOW" : ""))
/**********************************************************************
 * next_node
 *
 * Extract the next-node link of edge e: the bits selected by
 * NEXT_EDGE_MASK, shifted down by NEXT_EDGE_START_BIT.
 **********************************************************************/
#define next_node(edges,e) \
  ((NEXT_EDGE_MASK & (edges)[(e)]) >> NEXT_EDGE_START_BIT)
/**********************************************************************
 * set_next_edge
 *
 * Store `value` in the next-node field of edge e, leaving the letter
 * and flag bits untouched.  Bits of `value` that do not fit in the
 * field are discarded by the mask.  The expression yields the updated
 * edge record.
 *
 * `value` is parenthesized before shifting so that arguments such as
 * `a | b` are shifted as a whole.  `e` is expanded twice and must not
 * have side effects.
 **********************************************************************/
#define set_next_edge(edges,e,value)                              \
  ((edges)[e] = ((edges)[e] & (~NEXT_EDGE_MASK)) |                \
                (((value) << NEXT_EDGE_START_BIT) & NEXT_EDGE_MASK))
/**********************************************************************
 * empty_edge_spot
 *
 * Non-zero when slot e holds the "empty" marker, i.e. its record is
 * exactly NEXT_EDGE_MASK.
 **********************************************************************/
#define empty_edge_spot(edges,e) (NEXT_EDGE_MASK == (edges)[(e)])
/**********************************************************************
 * set_empty_edge
 *
 * Mark slot e as unoccupied by overwriting its record with the
 * "empty" marker NEXT_EDGE_MASK.  Yields the value stored.
 **********************************************************************/
#define set_empty_edge(edges,e) ((edges)[(e)] = NEXT_EDGE_MASK)
/**********************************************************************
 * clear_all_edges
 *
 * Mark every slot in [0, max_num_edges) of the DAWG as empty, using
 * `edge` as the loop variable (it must be a modifiable lvalue and is
 * left equal to max_num_edges afterwards).
 *
 * The arguments are parenthesized so that an expression passed as
 * max_num_edges is compared as a whole.  `max_num_edges` is re-evaluated
 * on every iteration.
 *
 * NOTE: the expansion is a bare `for` statement that already ends in
 * a semicolon (kept for compatibility with existing callers); do not
 * use it as the unbraced body of an if/else.
 **********************************************************************/
#define clear_all_edges(dawg,edge,max_num_edges)                  \
  for ((edge) = 0; (edge) < (max_num_edges); (edge)++)            \
    set_empty_edge (dawg, edge);
/**********************************************************************
 * edge_occupied
 *
 * Non-zero when slot e holds a real edge, i.e. its record differs
 * from the "empty" marker NEXT_EDGE_MASK.
 **********************************************************************/
#define edge_occupied(edges,e) (NEXT_EDGE_MASK != (edges)[(e)])
/**********************************************************************
 * edge_flags
 *
 * Extract the flag bits of edge e: the bits selected by FLAGS_MASK,
 * shifted down by FLAG_START_BIT so they can be compared with
 * LAST_FLAG, DIRECTION_FLAG and WERD_END_FLAG.
 **********************************************************************/
#define edge_flags(edges,e) \
  ((FLAGS_MASK & (edges)[(e)]) >> FLAG_START_BIT)
/**********************************************************************
 * edge_letter
 *
 * Extract the letter stored in edge e of the edge array, as a char.
 **********************************************************************/
#define edge_letter(edges,e) \
  ((char) ((LETTER_MASK & (edges)[(e)]) >> LETTER_START_BIT))
/**********************************************************************
 * letter_of_edge
 *
 * Extract the letter from an edge record passed by value, as a char.
 * Unlike edge_letter, the argument is the record itself rather than an
 * array plus index.  The argument is parenthesized so that an
 * expression is masked as a whole.
 **********************************************************************/
#define letter_of_edge(edge) \
  ((char) (((edge) & LETTER_MASK) >> LETTER_START_BIT))
/**********************************************************************
 * last_edge
 *
 * Non-zero when edge e carries the LAST flag, marking the final edge
 * of a sequence (this applies to both the forward and the backward
 * part).  The result is the masked bit itself, not a normalized 0/1.
 **********************************************************************/
#define last_edge(edges,e) \
  ((LAST_FLAG << FLAG_START_BIT) & (edges)[(e)])
/**********************************************************************
 * end_of_word
 *
 * Non-zero when edge e carries the word-end flag.  The result is the
 * masked bit itself, not a normalized 0/1.
 **********************************************************************/
#define end_of_word(edges,e) \
  ((WERD_END_FLAG << FLAG_START_BIT) & (edges)[(e)])
/**********************************************************************
 * forward_edge
 *
 * 1 when slot e is occupied and its direction flag is set (a forward
 * edge), 0 otherwise.  `e` is expanded twice and must not have side
 * effects.
 **********************************************************************/
#define forward_edge(edges,e)                                          \
  ((((edges)[(e)] & (DIRECTION_FLAG << FLAG_START_BIT)) != 0) &&       \
   edge_occupied (edges, e))
/**********************************************************************
 * backward_edge
 *
 * 1 when slot e is occupied and its direction flag is clear (a
 * backward edge), 0 otherwise.  The occupancy test matters here because
 * the "empty" marker also has the direction bit clear.  `e` is expanded
 * twice and must not have side effects.
 **********************************************************************/
#define backward_edge(edges,e)                                         \
  ((((edges)[(e)] & (DIRECTION_FLAG << FLAG_START_BIT)) == 0) &&       \
   edge_occupied (edges, e))
/**********************************************************************
 * edge_loop
 *
 * Loop header that advances e along a run of edges (the original note
 * describes this as looping over the edges in the forward direction).
 *
 * Each test looks at the LAST flag of the edge at the current e and
 * then post-increments e.  The statement following the macro therefore
 * runs with e already moved to the next edge, and the loop ends once
 * the edge just tested had its LAST flag set; e has been incremented
 * past that edge when the loop exits.  e must be a modifiable lvalue.
 *
 * This macro can be used in the following way:
 *
 *     edge_loop (dawg, edge) {
 *       ... edge now indexes the edge after the one just tested ...
 *     }
 *********************************************************************/
#define edge_loop(edges,e) \
while (! last_edge (edges,e++))
/**********************************************************************
 * case_is_okay
 *
 * Check the case of character i of `word` against the characters
 * before it.  Yields FALSE when
 *   - word[i] is upper case and word[i-1] is lower case, or
 *   - word[i] is lower case, word[i-1] is upper case, i > 1 and
 *     word[i-2] is alphabetic;
 * yields TRUE otherwise, including always for i == 0.
 *
 * Characters are converted to unsigned char before being handed to the
 * <ctype.h> classifiers, which have undefined behaviour for negative
 * char values.  Both arguments are parenthesized so expressions can be
 * passed safely; they are expanded several times and must not have
 * side effects.
 **********************************************************************/
// TODO(tkielbus) Replace isalpha, islower & isupper by unicode versions.
// However the lengths information is not available at this point in the
// code. We will probably get rid of the dictionaries at some point anyway.
#define case_is_okay(word,i)                                           \
  ((i) ?                                                               \
   ((isupper ((unsigned char) (word)[(i)]) &&                          \
     islower ((unsigned char) (word)[(i) - 1])) ?                      \
    FALSE :                                                            \
    ((islower ((unsigned char) (word)[(i)]) &&                         \
      isupper ((unsigned char) (word)[(i) - 1]) &&                     \
      (i) > 1 && isalpha ((unsigned char) (word)[(i) - 2])) ?          \
     FALSE :                                                           \
     TRUE)) :                                                          \
   TRUE)
/**********************************************************************
 * trailing_punc
 *
 * Check for trailing punctuation: non-zero when ch is one of
 *   } : ; - ] ! ? ` , . ) " '
 *
 * ch is parenthesized so an expression argument is compared as a
 * whole; it is expanded once per comparison and must not have side
 * effects.
 **********************************************************************/
#define trailing_punc(ch)     \
  (((ch) == '}' ) ||          \
   ((ch) == ':' ) ||          \
   ((ch) == ';' ) ||          \
   ((ch) == '-' ) ||          \
   ((ch) == ']' ) ||          \
   ((ch) == '!' ) ||          \
   ((ch) == '?' ) ||          \
   ((ch) == '`' ) ||          \
   ((ch) == ',' ) ||          \
   ((ch) == '.' ) ||          \
   ((ch) == ')' ) ||          \
   ((ch) == '\"' ) ||         \
   ((ch) == '\'' ))
/**********************************************************************
 * leading_punc
 *
 * Check for leading punctuation: non-zero when ch is one of
 *   " ( { [ ` '
 *
 * ch is parenthesized so an expression argument is compared as a
 * whole; it is expanded once per comparison and must not have side
 * effects.
 **********************************************************************/
#define leading_punc(ch)      \
  (((ch) == '\"' ) ||         \
   ((ch) == '(' ) ||          \
   ((ch) == '{' ) ||          \
   ((ch) == '[' ) ||          \
   ((ch) == '`' ) ||          \
   ((ch) == '\'' ))
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
/*
 * Look up an edge of `node` by `character` and `word_end`.
 * NOTE(review): the exact matching rule and the value returned when no
 * edge matches (presumably NO_EDGE) are not visible in this header --
 * confirm in the implementation.
 */
EDGE_REF edge_char_of(EDGE_ARRAY dawg,
NODE_REF node,
int character,
int word_end);
/* Presumably the number of edges belonging to `node` -- confirm. */
INT32 edges_in_node(EDGE_ARRAY dawg, NODE_REF node);
/*
 * Default letter-acceptance check; its signature is the one described
 * by LETTER_OK_FUNC below.  `node` is passed by pointer, so the callee
 * may update it.
 * NOTE(review): the meaning of the return value and of `prevchar` is
 * not visible in this header -- confirm in the implementation.
 */
INT32 def_letter_is_okay(EDGE_ARRAY dawg,
NODE_REF *node,
INT32 char_index,
char prevchar,
const char *word,
INT32 word_end);
/*
 * Allow for externally provided letter_is_okay.
 *
 * LETTER_OK_FUNC is a pointer to a function with the same parameter
 * list and return type as def_letter_is_okay; letter_is_okay is the
 * global hook of that type, defined in another translation unit.
 */
typedef INT32 (*LETTER_OK_FUNC)(EDGE_ARRAY, NODE_REF*, INT32, char, const char*,
INT32);
extern LETTER_OK_FUNC letter_is_okay;
/* Presumably the number of forward-direction edges of `node` -- confirm. */
INT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node);
/* Presumably prints the edges of `node` for debugging -- confirm. */
void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node);
/*
 * Read a squished DAWG from `filename` and return its edge array.
 * NOTE(review): ownership of the returned array and the behaviour on a
 * read failure are not visible in this header -- confirm.
 */
EDGE_ARRAY read_squished_dawg(const char *filename);
/*
 * NOTE(review): semantics are not visible in this header.  Unlike the
 * other prototypes, `word` is taken as non-const char * -- confirm
 * whether the callee modifies it.
 */
INT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, INT32 char_index);
/* Presumably non-zero when `string` is a word stored in `dawg` -- confirm. */
INT32 word_in_dawg(EDGE_ARRAY dawg, const char *string);
/*
#if defined(__STDC__) || defined(__cplusplus) || MAC_OR_DOS
# define _ARGS(s) s
#else
# define _ARGS(s) ()
#endif*/
/* dawg.c
EDGE_REF edge_char_of
_ARGS((EDGE_ARRAY dawg,
NODE_REF node,
int character,
int word_end));
INT32 edges_in_node
_ARGS((EDGE_ARRAY dawg,
NODE_REF node));
INT32 def_letter_is_okay
_ARGS((EDGE_ARRAY dawg,
NODE_REF *node,
INT32 char_index,
char *word,
INT32 word_end));
INT32 num_forward_edges
_ARGS((EDGE_ARRAY dawg,
NODE_REF node));
void print_dawg_node
_ARGS((EDGE_ARRAY dawg,
NODE_REF node));
void read_squished_dawg
_ARGS((char *filename,
EDGE_ARRAY dawg,
INT32 max_num_edges));
INT32 verify_trailing_punct
_ARGS((EDGE_ARRAY dawg,
char *word,
INT32 char_index));
INT32 word_in_dawg
_ARGS((EDGE_ARRAY dawg,
char *string));
#undef _ARGS
*/
#endif