mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-11 23:19:04 +08:00
79 lines
2.9 KiB
C
79 lines
2.9 KiB
C
|
/**********************************************************************
|
||
|
* File: lang_model.h
|
||
|
* Description: Declaration of the Language Model Base Class
|
||
|
* Author: Ahmad Abdulkader
|
||
|
* Created: 2007
|
||
|
*
|
||
|
* (C) Copyright 2008, Google Inc.
|
||
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
** you may not use this file except in compliance with the License.
|
||
|
** You may obtain a copy of the License at
|
||
|
** http://www.apache.org/licenses/LICENSE-2.0
|
||
|
** Unless required by applicable law or agreed to in writing, software
|
||
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
** See the License for the specific language governing permissions and
|
||
|
** limitations under the License.
|
||
|
*
|
||
|
**********************************************************************/
|
||
|
|
||
|
// The LangModel class abstracts a state machine that is modeled as a Trie
|
||
|
// structure. The state machine models the language being recognized by the OCR
|
||
|
// Engine.
|
||
|
// This is an abstract class that is to be inherited by any language model
|
||
|
|
||
|
#ifndef LANG_MODEL_H
|
||
|
#define LANG_MODEL_H
|
||
|
|
||
|
#include "lang_mod_edge.h"
|
||
|
#include "char_altlist.h"
|
||
|
#include "char_set.h"
|
||
|
#include "tuning_params.h"
|
||
|
|
||
|
namespace tesseract {
|
||
|
class LangModel {
|
||
|
public:
|
||
|
LangModel() {
|
||
|
ood_enabled_ = true;
|
||
|
numeric_enabled_ = true;
|
||
|
word_list_enabled_ = true;
|
||
|
punc_enabled_ = true;
|
||
|
}
|
||
|
virtual ~LangModel() {}
|
||
|
|
||
|
// Returns an edge pointer to the Root
|
||
|
virtual LangModEdge *Root() = 0;
|
||
|
// Returns the edges that fan-out of the specified edge and their count
|
||
|
virtual LangModEdge **GetEdges(CharAltList *alt_list,
|
||
|
LangModEdge *parent_edge,
|
||
|
int *edge_cnt) = 0;
|
||
|
// Returns is a sequence of 32-bit characters are valid within this language
|
||
|
// model or net. And EndOfWord flag is specified. If true, the sequence has
|
||
|
// to end on a valid word. The function also optionally returns the list
|
||
|
// of language model edges traversed to parse the string
|
||
|
virtual bool IsValidSequence(const char_32 *str, bool eow_flag,
|
||
|
LangModEdge **edge_array = NULL) = 0;
|
||
|
virtual bool IsLeadingPunc(char_32 ch) = 0;
|
||
|
virtual bool IsTrailingPunc(char_32 ch) = 0;
|
||
|
virtual bool IsDigit(char_32 ch) = 0;
|
||
|
|
||
|
// accessor functions
|
||
|
inline bool OOD() { return ood_enabled_; }
|
||
|
inline bool Numeric() { return numeric_enabled_; }
|
||
|
inline bool WordList() { return word_list_enabled_; }
|
||
|
inline bool Punc() { return punc_enabled_; }
|
||
|
inline void SetOOD(bool ood) { ood_enabled_ = ood; }
|
||
|
inline void SetNumeric(bool numeric) { numeric_enabled_ = numeric; }
|
||
|
inline void SetWordList(bool word_list) { word_list_enabled_ = word_list; }
|
||
|
inline void SetPunc(bool punc_enabled) { punc_enabled_ = punc_enabled; }
|
||
|
|
||
|
protected:
|
||
|
bool ood_enabled_;
|
||
|
bool numeric_enabled_;
|
||
|
bool word_list_enabled_;
|
||
|
bool punc_enabled_;
|
||
|
};
|
||
|
}
|
||
|
|
||
|
#endif // LANG_MODEL_H
|