tesseract/cube/tess_lang_model.h

143 lines
5.4 KiB
C
Raw Normal View History

/**********************************************************************
* File: tess_lang_model.h
* Description: Declaration of the Tesseract Language Model Class
* Author: Ahmad Abdulkader
* Created: 2008
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESS_LANG_MODEL_H
#define TESS_LANG_MODEL_H
#include <string>
#include "char_altlist.h"
#include "cube_reco_context.h"
#include "cube_tuning_params.h"
#include "dict.h"
#include "lang_model.h"
#include "tessdatamanager.h"
#include "tess_lang_mod_edge.h"
namespace tesseract {
// Row count of TessLangModel::num_state_machine_ and length of
// TessLangModel::num_max_repeat_.
static const int kStateCnt = 4;
// Column count of TessLangModel::num_state_machine_ and length of
// TessLangModel::literal_str_.
static const int kNumLiteralCnt = 5;
// Tesseract-backed implementation of the LangModel interface used by cube.
// Declares edge fan-out generation, sequence validation and a few
// character-class queries; see the individual members below.
//
// An instance deletes word_dawgs_ in its destructor, so instances must not
// be copied (copy operations are declared private and left undefined).
class TessLangModel : public LangModel {
 public:
  // NOTE(review): the constructor body is not visible from this header.
  // lm_params is presumably the text handed to LoadLangModelElements() and
  // cntxt presumably ends up in cntxt_ -- confirm against the .cpp.
  TessLangModel(const string &lm_params,
                const string &data_file_path,
                bool load_system_dawg,
                TessdataManager *tessdata_manager,
                CubeRecoContext *cntxt);

  // Releases word_dawgs_ when it is set: first the objects its elements
  // point to (delete_data_pointers), then the vector itself.
  ~TessLangModel() {
    if (word_dawgs_ != NULL) {
      word_dawgs_->delete_data_pointers();
      delete word_dawgs_;
    }
  }

  // Root edge accessor. This implementation has no root edge object and
  // always returns NULL.
  TessLangModEdge *Root() {
    return NULL;
  }

  // The general fan-out generation function. Returns the list of edges
  // fanning-out of the specified edge and stores their count in *edge_cnt.
  // If an AltList is specified, only the class-ids with a minimum cost are
  // considered.
  LangModEdge **GetEdges(CharAltList *alt_list,
                         LangModEdge *edge,
                         int *edge_cnt);

  // Determines if a sequence of 32-bit chars is valid in this language model
  // starting from the root. If the eow_flag is ON, also checks for
  // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the
  // last edge through it.
  bool IsValidSequence(const char_32 *sequence, bool eow_flag,
                       LangModEdge **final_edge = NULL);

  // Character-class queries on a single 32-bit character.
  bool IsLeadingPunc(char_32 ch);
  bool IsTrailingPunc(char_32 ch);
  bool IsDigit(char_32 ch);

  // Edits *lm_str in place.
  // NOTE(review): which characters count as "invalid" is decided in the
  // .cpp and is not visible from this header.
  void RemoveInvalidCharacters(string *lm_str);

 private:
  // Copying is disabled: the destructor deletes word_dawgs_, so a
  // member-wise copy would leave two objects deleting the same vector.
  // Declared but intentionally never defined (pre-C++11 idiom).
  TessLangModel(const TessLangModel &);
  TessLangModel &operator=(const TessLangModel &);

  // static LM state machines
  static const Dawg *ood_dawg_;
  static const Dawg *number_dawg_;
  static const int num_state_machine_[kStateCnt][kNumLiteralCnt];
  static const int num_max_repeat_[kStateCnt];

  // word_dawgs_ should only be loaded if cube has its own version of the
  // unicharset (different from the one used by tesseract) and therefore
  // can not use the dawgs loaded for tesseract (since the unichar ids
  // encoded in the dawgs differ).
  DawgVector *word_dawgs_;

  static int max_edge_;
  static int max_ood_shape_cost_;

  // Remaining language model elements needed by cube. These get loaded from
  // the .lm file.
  string lead_punc_;
  string trail_punc_;
  string num_lead_punc_;
  string num_trail_punc_;
  string operators_;
  string digits_;
  string alphas_;

  // String of characters in RHS of each line of <lang>.cube.lm.
  // Each element is hard-coded to correspond to a specific token type
  // (see LoadLangModelElements).
  string *literal_str_[kNumLiteralCnt];

  // Recognition context needed to access language properties
  // (case, cursive,..)
  CubeRecoContext *cntxt_;
  bool has_case_;

  // Computes the edges that fan out of an edge ref, writing them into
  // edge_array.
  // NOTE(review): the int result is presumably the number of edges
  // written -- confirm against the .cpp.
  int FanOut(CharAltList *alt_list,
             const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
             const char_32 *str, bool root_flag, LangModEdge **edge_array);

  // Generates edges from a NULL-terminated string
  // (used for punctuation, operators and digits).
  int Edges(const char *strng, const Dawg *dawg,
            EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
            LangModEdge **edge_array);

  // Generates the edges fanning-out from an edge in the number state machine.
  int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array);

  // Generates OOD edges.
  int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref,
               EDGE_REF edge_ref_mask, LangModEdge **edge_array);

  // Cleans up an edge array holding edge_cnt entries.
  void FreeEdges(int edge_cnt, LangModEdge **edge_array);

  // Determines if a sequence of 32-bit chars is valid in this language model
  // starting from the specified edge. If the eow_flag is ON, also checks for
  // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the
  // last edge through it.
  bool IsValidSequence(LangModEdge *edge, const char_32 *sequence,
                       bool eow_flag, LangModEdge **final_edge);

  // Parses language model elements from the given string, which should
  // have been loaded from <lang>.cube.lm file, e.g. in CubeRecoContext.
  bool LoadLangModelElements(const string &lm_params);

  // Returns the number of word Dawgs in the language model.
  int NumDawgs() const;

  // Returns the dawg with the given index from either the dawgs
  // stored by the Tesseract object, or the word_dawgs_.
  const Dawg *GetDawg(int index) const;
};
} // tesseract
#endif // TESS_LANG_MODEL_H