tesseract/dict/dawg.h
theraysmith 3a13d80d24 Changes to dict for 3.00
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2009-07-11 02:20:33 +00:00

475 lines
18 KiB
C

/* -*-C-*-
********************************************************************************
*
* File: dawg.h (Formerly dawg.h)
* Description: Definition of a class that represents Directed Accyclic Word
* Graph (DAWG), functions to build and manipulate the DAWG.
* Author: Mark Seaman, SW Productivity
* Created: Fri Oct 16 14:37:00 1987
* Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Reusable Software Component
*
* (c) Copyright 1987, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
#ifndef DAWG_H
#define DAWG_H
/*----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------*/
#include "elst.h"
#include "general.h"
#include "ratngs.h"
#include "varable.h"
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
extern INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info, to"
" 2 for more details, to 3 to see all the debug messages");
#ifdef __MSW32__
#define NO_EDGE (inT64) 0xffffffffffffffffi64
#else
#define NO_EDGE (inT64) 0xffffffffffffffffll
#endif
/*----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------*/
class UNICHARSET;
typedef uinT64 EDGE_RECORD;
typedef EDGE_RECORD *EDGE_ARRAY;
typedef inT64 EDGE_REF;
typedef inT64 NODE_REF;
typedef EDGE_REF *NODE_MAP;
namespace tesseract {
struct NodeChild {
UNICHAR_ID unichar_id;
EDGE_REF edge_ref;
NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
};
typedef GenericVector<NodeChild> NodeChildVector;
typedef GenericVector<int> SuccessorList;
typedef GenericVector<SuccessorList *> SuccessorListsVector;
enum DawgType {
DAWG_TYPE_PUNCTUATION,
DAWG_TYPE_PREFIX,
DAWG_TYPE_ROOT,
DAWG_TYPE_WORD,
DAWG_TYPE_SUFFIX,
DAWG_TYPE_NUMBER,
DAWG_TYPE_COUNT // number of enum entries
};
/*----------------------------------------------------------------------
C o n s t a n t s
----------------------------------------------------------------------*/
#define FORWARD_EDGE (inT32) 0
#define BACKWARD_EDGE (inT32) 1
#define MAX_NODE_EDGES_DISPLAY (inT64) 100
#define LAST_FLAG (inT64) 1
#define DIRECTION_FLAG (inT64) 2
#define WERD_END_FLAG (inT64) 4
#define LETTER_START_BIT 0
#define NUM_FLAG_BITS 3
#define REFFORMAT "%lld"
// Set kBeginningDawgsType[i] to true if a Dawg of
// DawgType i can contain the beginning of a word.
static const bool kBeginningDawgsType[] = {1, 1, 0, 1, 0, 1 };
static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
{ 0, 1, 0, 1, 0, 0 }, // for DAWG_TYPE_PUNCTUATION
{ 0, 0, 1, 1, 0, 0 }, // for DAWG_TYPE_PREFIX
{ 0, 0, 0, 0, 1, 0 }, // for DAWG_TYPE_ROOT
{ 1, 0, 0, 0, 0, 0 }, // for DAWG_TYPE_WORD
{ 1, 0, 0, 0, 0, 0 }, // for DAWG_TYPE_SUFFIX
{ 0, 0, 0, 0, 0, 0 } // for DAWG_TYPE_NUMBER
};
static const char kWildcard[] = "*";
/*----------------------------------------------------------------------
C l a s s e s a n d S t r u c t s
----------------------------------------------------------------------*/
//
// Abstract class (an interface) that declares methods needed by the
// various tesseract classes to operate on SquishedDawg and Trie objects.
//
// This class initializes all the edge masks (since their usage by
// SquishedDawg and Trie is identical) and implements simple accessors
// for each of the fields encoded in an EDGE_RECORD.
// This class also implements word_in_dawg() and check_for_words()
// (since they use only the public methods of SquishedDawg and Trie
// classes that are inherited from the Dawg base class).
//
class Dawg {
public:
// Magic number to determine endianness when reading the Dawg from file.
static const inT16 kDawgMagicNumber = 42;
// A special unichar id that indicates that any appropriate pattern
// (e.g.dicitonary word, 0-9 digit, etc) can be inserted instead
// Used for expressing patterns in punctuation and number Dawgs.
static const UNICHAR_ID kPatternUnicharID = 0;
inline DawgType type() const { return type_; }
inline const STRING &lang() const { return lang_; }
inline PermuterType permuter() const { return perm_; }
virtual ~Dawg() {};
// Returns true if the given word is in the Dawg.
bool word_in_dawg(const WERD_CHOICE &word) const;
// Checks the Dawg for the words that are listed in the requested file.
// Returns the number of words in the given file missing from the Dawg.
int check_for_words(const char *filename,
const UNICHARSET &unicharset,
bool enable_wildcard) const;
// Pure virtual function that should be implemented by the derived classes.
// Returns the edge that corresponds to the letter out of this node.
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
bool word_end) const = 0;
// Fills the given NodeChildVector with all the unichar ids (and the
// corresponding EDGE_REFs) for which there is an edge out of this node.
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
// Returns the next node visited by following the edge
// indicated by the given EDGE_REF.
virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
// Returns true if the edge indicated by the given EDGE_REF
// marks the end of a word.
virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
// Prints the contents of the node indicated by the given NODE_REF.
// At most max_num_edges will be printed.
virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
protected:
Dawg() {}
// Returns the next node visited by following this edge.
inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
}
// Returns the direction flag of this edge.
inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
BACKWARD_EDGE : FORWARD_EDGE;
}
// Returns true if this edge marks the end of a word.
inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
}
// Returns UNICHAR_ID recorded in this edge.
inline UNICHAR_ID unichar_id_from_edge_rec(
const EDGE_RECORD &edge_rec) const {
return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
}
// Sets the next node link for this edge in the Dawg.
inline void set_next_node_in_edge_rec(
EDGE_RECORD *edge_rec, EDGE_REF value) {
*edge_rec &= (~next_node_mask_);
*edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
}
// Sets this edge record to be the last one in a sequence of edges.
inline void set_last_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
*edge_rec |= (LAST_FLAG << flag_start_bit_);
}
// Sequentially compares the given values of unichar ID, next node
// and word end marker with the values in the given EDGE_RECORD.
// Returns: 1 if at any step the given input value exceeds
// that of edge_rec (and all the values already
// checked are the same)
// 0 if edge_rec_match() returns true
// -1 otherwise
inline int given_greater_than_edge_rec(NODE_REF next_node,
bool word_end,
UNICHAR_ID unichar_id,
const EDGE_RECORD &edge_rec) const {
UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
curr_word_end, curr_unichar_id)) return 0;
if (unichar_id > curr_unichar_id) return 1;
if (unichar_id == curr_unichar_id) {
if (next_node > curr_next_node) return 1;
if (next_node == curr_next_node) {
if (word_end > curr_word_end) return 1;
}
}
return -1;
}
// Returns true if all the values are equal (any value matches
// next_node if next_node == NO_EDGE, any value matches word_end
// if word_end is false).
inline bool edge_rec_match(NODE_REF next_node,
bool word_end,
UNICHAR_ID unichar_id,
NODE_REF other_next_node,
bool other_word_end,
UNICHAR_ID other_unichar_id) const {
return ((unichar_id == other_unichar_id) &&
(next_node == NO_EDGE || next_node == other_next_node) &&
(!word_end || (word_end == other_word_end)));
}
// Sets type_, lang_, perm_, unicharset_size_.
// Initializes the values of various masks from unicharset_size_.
void init(DawgType type, const STRING &lang,
PermuterType perm, int unicharset_size);
// Matches all of the words that are represented by this string.
// If wilcard is set to something other than INVALID_UNICHAR_ID,
// the *'s in this string are interpreted as wildcards.
// WERD_CHOICE param is not passed by const so that wildcard searches
// can modify it and work without having to copy WERD_CHOICEs.
bool match_words(WERD_CHOICE *word, inT32 index,
NODE_REF node, UNICHAR_ID wildcard) const;
// Member Variables.
DawgType type_;
STRING lang_;
// Permuter code that should be used if the word is found in this Dawg.
PermuterType perm_;
// Variables to construct various edge masks. Formerly:
// #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64
// #define FLAGS_MASK (inT64) 0x0000000700000000i64
// #define LETTER_MASK (inT64) 0x00000000ffffffffi64
int unicharset_size_;
int flag_start_bit_;
int next_node_start_bit_;
uinT64 next_node_mask_;
uinT64 flags_mask_;
uinT64 letter_mask_;
};
//
// DawgInfo struct and DawgInfoVector class are used for
// storing information about the current Dawg search state.
//
struct DawgInfo {
DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
bool operator==(const DawgInfo &other) {
return (this->dawg_index == other.dawg_index &&
this->ref == other.ref);
}
int dawg_index;
EDGE_REF ref;
};
class DawgInfoVector : public GenericVector<DawgInfo> {
public:
// Overload destructor, since clear() does not delete data_[] any more.
~DawgInfoVector() {
if (size_reserved_ > 0) {
delete[] data_;
size_used_ = 0;
size_reserved_ = 0;
}
}
// Overload clear() in order to avoid allocating/deallocating memory
// when clearing the vector and re-inserting entries into it later.
void clear() { size_used_ = 0; }
// Adds an entry for the given dawg_index with the given node to the vec.
// Returns false if the same entry already exists in the vector,
// true otherwise.
inline bool add_unique(const DawgInfo &new_info, const char *debug_msg) {
for (int i = 0; i < size_used_; ++i) {
if (data_[i] == new_info) return false;
}
push_back(new_info);
if (dawg_debug_level) {
tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
new_info.dawg_index, new_info.ref);
}
return true;
}
// Removes an entry that equals to the given DawgInfo.
// This function assumes that the entries in the vector are unique.
// Returns true if an entry was found and removed.
inline bool remove(const DawgInfo &info) {
for (int i = 0; i < size_used_; ++i) {
if (data_[i] == info) {
for (int j = i + 1; j < size_used_; ++j) {
data_[j-1] = data_[j];
}
size_used_--;
return true;
}
}
return false;
}
};
//
// Concrete class that can operate on a compacted (squished) Dawg (read,
// search and write to file). This class is read-only in the sense that
// new words can not be added to an instance of SquishedDawg.
// The underlying representation of the nodes and edges in SquishedDawg
// is stored as a contiguous EDGE_ARRAY (read from file or given as an
// argument to the constructor).
//
class SquishedDawg : public Dawg {
public:
SquishedDawg(FILE *file, DawgType type,
const STRING &lang, PermuterType perm) {
read_squished_dawg(file, type, lang, perm);
num_forward_edges_in_node0 = num_forward_edges(0);
}
SquishedDawg(const char* filename, DawgType type,
const STRING &lang, PermuterType perm) {
FILE *file = fopen(filename, "rb");
if (file == NULL) {
tprintf("Failed to open dawg file %s\n", filename);
exit(1);
}
read_squished_dawg(file, type, lang, perm);
num_forward_edges_in_node0 = num_forward_edges(0);
fclose(file);
}
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
const STRING &lang, PermuterType perm, int unicharset_size) :
edges_(edges), num_edges_(num_edges) {
init(type, lang, perm, unicharset_size);
num_forward_edges_in_node0 = num_forward_edges(0);
if (dawg_debug_level > 3) print_all("SquishedDawg:");
}
~SquishedDawg();
// Returns the edge that corresponds to the letter out of this node.
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
bool word_end) const;
// Fills the given NodeChildVector with all the unichar ids (and the
// corresponding EDGE_REFs) for which there is an edge out of this node.
void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
EDGE_REF edge = node;
if (!edge_occupied(edge) || edge == NO_EDGE) return;
assert(forward_edge(edge)); // we don't expect any backward edges to
do { // be present when this funciton is called
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
} while (!last_edge(edge++));
}
// Returns the next node visited by following the edge
// indicated by the given EDGE_REF.
NODE_REF next_node(EDGE_REF edge) const {
return next_node_from_edge_rec((edges_[edge]));
}
// Returns true if the edge indicated by the given EDGE_REF
// marks the end of a word.
bool end_of_word(EDGE_REF edge_ref) const {
return end_of_word_from_edge_rec((edges_[edge_ref]));
}
// Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
return unichar_id_from_edge_rec((edges_[edge_ref]));
}
// Prints the contents of the node indicated by the given NODE_REF.
// At most max_num_edges will be printed.
void print_node(NODE_REF node, int max_num_edges) const;
// Writes the squished/reduced Dawg to a file.
void write_squished_dawg(const char *filename);
private:
// Sets the next node link for this edge.
inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
}
// Sets the edge to be empty.
inline void set_empty_edge(EDGE_REF edge_ref) {
(edges_[edge_ref] = next_node_mask_);
}
// Goes through all the edges and clears each one out.
inline void clear_all_edges() {
for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
}
// Clears the last flag of this edge.
inline void clear_last_flag(EDGE_REF edge_ref) {
(edges_[edge_ref] &= ~(LAST_FLAG << flag_start_bit_));
}
// Returns true if this edge is in the forward direction.
inline bool forward_edge(EDGE_REF edge_ref) const {
return (edge_occupied(edge_ref) &&
(FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
}
// Returns true if this edge is in the backward direction.
inline bool backward_edge(EDGE_REF edge_ref) const {
return (edge_occupied(edge_ref) &&
(BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
}
// Returns true if the edge spot in this location is occupied.
inline bool edge_occupied(EDGE_REF edge_ref) const {
return (edges_[edge_ref] != next_node_mask_);
}
// Returns true if this edge is the last edge in a sequence.
inline bool last_edge(EDGE_REF edge_ref) const {
return (edges_[edge_ref] & (LAST_FLAG << flag_start_bit_)) != 0;
}
// Counts and returns the number of forward edges in this node.
inT32 num_forward_edges(NODE_REF node) const;
// Reads SquishedDawg from a file.
void read_squished_dawg(FILE *file, DawgType type,
const STRING &lang, PermuterType perm);
// Prints the contents of an edge indicated by the given EDGE_REF.
void print_edge(EDGE_REF edge) const;
// Prints the contents of the SquishedDawg.
void print_all(const char* msg) {
tprintf("\n__________________________\n%s\n", msg);
for (int i = 0; i < num_edges_; ++i) print_edge(i);
tprintf("__________________________\n");
}
// Constructs a mapping from the memory node indices to disk node indices.
NODE_MAP build_node_map(inT32 *num_nodes) const;
// Member variables.
EDGE_ARRAY edges_;
int num_edges_;
int num_forward_edges_in_node0;
};
} // namespace tesseract
#endif