mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-13 07:59:04 +08:00
143 lines
5.4 KiB
C
143 lines
5.4 KiB
C
|
/**********************************************************************
 * File: tess_lang_model.h
 * Description: Declaration of the Tesseract Language Model Class
 * Author: Ahmad Abdulkader
 * Created: 2008
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
||
|
|
||
|
#ifndef TESS_LANG_MODEL_H
|
||
|
#define TESS_LANG_MODEL_H
|
||
|
|
||
|
#include <string>
|
||
|
|
||
|
#include "char_altlist.h"
|
||
|
#include "cube_reco_context.h"
|
||
|
#include "cube_tuning_params.h"
|
||
|
#include "dict.h"
|
||
|
#include "lang_model.h"
|
||
|
#include "tessdatamanager.h"
|
||
|
#include "tess_lang_mod_edge.h"
|
||
|
|
||
|
namespace tesseract {
|
||
|
|
||
|
// Table dimensions used by TessLangModel.
// kStateCnt sizes the first dimension of num_state_machine_ and the
// num_max_repeat_ array.
const int kStateCnt = 4;
// kNumLiteralCnt sizes the second dimension of num_state_machine_ and the
// literal_str_ array.
const int kNumLiteralCnt = 5;
|
||
|
|
||
|
class TessLangModel : public LangModel {
|
||
|
public:
|
||
|
TessLangModel(const string &lm_params,
|
||
|
const string &data_file_path,
|
||
|
bool load_system_dawg,
|
||
|
TessdataManager *tessdata_manager,
|
||
|
CubeRecoContext *cntxt);
|
||
|
~TessLangModel() {
|
||
|
if (word_dawgs_ != NULL) {
|
||
|
word_dawgs_->delete_data_pointers();
|
||
|
delete word_dawgs_;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// returns a pointer to the root of the language model
|
||
|
inline TessLangModEdge *Root() {
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
// The general fan-out generation function. Returns the list of edges
|
||
|
// fanning-out of the specified edge and their count. If an AltList is
|
||
|
// specified, only the class-ids with a minimum cost are considered
|
||
|
LangModEdge **GetEdges(CharAltList *alt_list,
|
||
|
LangModEdge *edge,
|
||
|
int *edge_cnt);
|
||
|
// Determines if a sequence of 32-bit chars is valid in this language model
|
||
|
// starting from the root. If the eow_flag is ON, also checks for
|
||
|
// a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
|
||
|
// edge
|
||
|
bool IsValidSequence(const char_32 *sequence, bool eow_flag,
|
||
|
LangModEdge **final_edge = NULL);
|
||
|
bool IsLeadingPunc(char_32 ch);
|
||
|
bool IsTrailingPunc(char_32 ch);
|
||
|
bool IsDigit(char_32 ch);
|
||
|
|
||
|
void RemoveInvalidCharacters(string *lm_str);
|
||
|
private:
|
||
|
// static LM state machines
|
||
|
static const Dawg *ood_dawg_;
|
||
|
static const Dawg *number_dawg_;
|
||
|
static const int num_state_machine_[kStateCnt][kNumLiteralCnt];
|
||
|
static const int num_max_repeat_[kStateCnt];
|
||
|
// word_dawgs_ should only be loaded if cube has its own version of the
|
||
|
// unicharset (different from the one used by tesseract) and therefore
|
||
|
// can not use the dawgs loaded for tesseract (since the unichar ids
|
||
|
// encoded in the dawgs differ).
|
||
|
DawgVector *word_dawgs_;
|
||
|
|
||
|
static int max_edge_;
|
||
|
static int max_ood_shape_cost_;
|
||
|
|
||
|
// remaining language model elements needed by cube. These get loaded from
|
||
|
// the .lm file
|
||
|
string lead_punc_;
|
||
|
string trail_punc_;
|
||
|
string num_lead_punc_;
|
||
|
string num_trail_punc_;
|
||
|
string operators_;
|
||
|
string digits_;
|
||
|
string alphas_;
|
||
|
// String of characters in RHS of each line of <lang>.cube.lm
|
||
|
// Each element is hard-coded to correspond to a specific token type
|
||
|
// (see LoadLangModelElements)
|
||
|
string *literal_str_[kNumLiteralCnt];
|
||
|
// Recognition context needed to access language properties
|
||
|
// (case, cursive,..)
|
||
|
CubeRecoContext *cntxt_;
|
||
|
bool has_case_;
|
||
|
|
||
|
// computes and returns the edges that fan out of an edge ref
|
||
|
int FanOut(CharAltList *alt_list,
|
||
|
const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
|
||
|
const char_32 *str, bool root_flag, LangModEdge **edge_array);
|
||
|
// generate edges from an NULL terminated string
|
||
|
// (used for punctuation, operators and digits)
|
||
|
int Edges(const char *strng, const Dawg *dawg,
|
||
|
EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
|
||
|
LangModEdge **edge_array);
|
||
|
// Generate the edges fanning-out from an edge in the number state machine
|
||
|
int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array);
|
||
|
// Generate OOD edges
|
||
|
int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref,
|
||
|
EDGE_REF edge_ref_mask, LangModEdge **edge_array);
|
||
|
// Cleanup an edge array
|
||
|
void FreeEdges(int edge_cnt, LangModEdge **edge_array);
|
||
|
// Determines if a sequence of 32-bit chars is valid in this language model
|
||
|
// starting from the specified edge. If the eow_flag is ON, also checks for
|
||
|
// a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
|
||
|
// edge
|
||
|
bool IsValidSequence(LangModEdge *edge, const char_32 *sequence,
|
||
|
bool eow_flag, LangModEdge **final_edge);
|
||
|
// Parse language model elements from the given string, which should
|
||
|
// have been loaded from <lang>.cube.lm file, e.g. in CubeRecoContext
|
||
|
bool LoadLangModelElements(const string &lm_params);
|
||
|
|
||
|
// Returns the number of word Dawgs in the language model.
|
||
|
int NumDawgs() const;
|
||
|
|
||
|
// Returns the dawgs with the given index from either the dawgs
|
||
|
// stored by the Tesseract object, or the word_dawgs_.
|
||
|
const Dawg *GetDawg(int index) const;
|
||
|
};
|
||
|
} // tesseract
|
||
|
|
||
|
#endif // TESS_LANG_MODEL_H
|