tesseract/cube/word_list_lang_model.h

91 lines
3.5 KiB
C++

/**********************************************************************
* File: word_list_lang_model.h
* Description: Declaration of the Word List Language Model Class
* Author: Ahmad Abdulkader
* Created: 2008
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
// The WordListLangModel class abstracts a language model that is based on
// a list of words. It inherits from the LangModel abstract class
// Besides providing the methods inherited from the LangModel abstract class,
// the class provided methods to add new strings to the Language Model:
// AddString & AddString32
#ifndef WORD_LIST_LANG_MODEL_H
#define WORD_LIST_LANG_MODEL_H
#include <vector>
#include "cube_reco_context.h"
#include "lang_model.h"
#include "tess_lang_mod_edge.h"
namespace tesseract {
class Trie;
class WordListLangModel : public LangModel {
public:
explicit WordListLangModel(CubeRecoContext *cntxt);
~WordListLangModel();
// Returns an edge pointer to the Root
LangModEdge *Root();
// Returns the edges that fan-out of the specified edge and their count
LangModEdge **GetEdges(CharAltList *alt_list,
LangModEdge *edge,
int *edge_cnt);
// Returns is a sequence of 32-bit characters are valid within this language
// model or net. And EndOfWord flag is specified. If true, the sequence has
// to end on a valid word. The function also optionally returns the list
// of language model edges traversed to parse the string
bool IsValidSequence(const char_32 *sequence,
bool eow_flag,
LangModEdge **edges);
bool IsLeadingPunc(char_32 ch) { return false; } // not yet implemented
bool IsTrailingPunc(char_32 ch) { return false; } // not yet implemented
bool IsDigit(char_32 ch) { return false; } // not yet implemented
// Adds a new UTF-8 string to the language model
bool AddString(const char *char_ptr);
// Adds a new UTF-32 string to the language model
bool AddString32(const char_32 *char_32_ptr);
// Compute all the variants of a 32-bit string in terms of the class-ids.
// This is needed for languages that have ligatures. A word can then have
// more than one spelling in terms of the class-ids.
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset,
string_32 str32,
vector<WERD_CHOICE *> *word_variants);
private:
// constants needed to configure the language model
static const int kMaxEdge = 512;
static const int kMaxDawgEdges = 20000;
CubeRecoContext *cntxt_;
Trie *dawg_;
bool init_;
// Initialize the language model
bool Init();
// Cleanup
void Cleanup();
// Recursive helper function for WordVariants().
static void WordVariants(
const CharSet &char_set,
string_32 prefix_str32, WERD_CHOICE *word_so_far,
string_32 str32,
vector<WERD_CHOICE *> *word_variants);
};
} // tesseract
#endif // WORD_LIST_LANG_MODEL_H