mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-23 06:57:50 +08:00
07ca24aeaf
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1044 d0cd1f9f-072b-0410-8dd7-cf729c803f20
206 lines
6.2 KiB
C++
206 lines
6.2 KiB
C++
/**********************************************************************
|
|
* File: word_list_lang_model.cpp
|
|
* Description: Implementation of the Word List Language Model Class
|
|
* Author: Ahmad Abdulkader
|
|
* Created: 2008
|
|
*
|
|
* (C) Copyright 2008, Google Inc.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
#include "word_list_lang_model.h"
|
|
#include "cube_utils.h"
|
|
|
|
#include "ratngs.h"
|
|
#include "trie.h"
|
|
|
|
namespace tesseract {
|
|
WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) {
|
|
cntxt_ = cntxt;
|
|
dawg_ = NULL;
|
|
init_ = false;
|
|
}
|
|
|
|
WordListLangModel::~WordListLangModel() {
|
|
Cleanup();
|
|
}
|
|
|
|
// Cleanup
|
|
void WordListLangModel::Cleanup() {
|
|
if (dawg_ != NULL) {
|
|
delete dawg_;
|
|
dawg_ = NULL;
|
|
}
|
|
init_ = false;
|
|
}
|
|
|
|
// Initialize the language model
|
|
bool WordListLangModel::Init() {
|
|
if (init_ == true) {
|
|
return true;
|
|
}
|
|
// The last parameter to the Trie constructor (the debug level) is set to
|
|
// false for now, until Cube has a way to express its preferred debug level.
|
|
dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
|
|
cntxt_->CharacterSet()->ClassCount(), false);
|
|
if (dawg_ == NULL) {
|
|
return false;
|
|
}
|
|
init_ = true;
|
|
return true;
|
|
}
|
|
|
|
// return a pointer to the root
|
|
LangModEdge * WordListLangModel::Root() {
|
|
return NULL;
|
|
}
|
|
|
|
// return the edges emerging from the current state
|
|
LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
|
|
LangModEdge *edge,
|
|
int *edge_cnt) {
|
|
// initialize if necessary
|
|
if (init_ == false) {
|
|
if (Init() == false) {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
(*edge_cnt) = 0;
|
|
|
|
EDGE_REF edge_ref;
|
|
|
|
TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
|
|
|
|
if (tess_lm_edge == NULL) {
|
|
edge_ref = 0;
|
|
} else {
|
|
edge_ref = tess_lm_edge->EndEdge();
|
|
|
|
// advance node
|
|
edge_ref = dawg_->next_node(edge_ref);
|
|
if (edge_ref == 0) {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// allocate memory for edges
|
|
LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
|
|
if (edge_array == NULL) {
|
|
return NULL;
|
|
}
|
|
|
|
// now get all the emerging edges
|
|
(*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
|
|
edge_array + (*edge_cnt));
|
|
|
|
return edge_array;
|
|
}
|
|
|
|
// returns true if the char_32 is supported by the language model
|
|
// TODO(ahmadab) currently not implemented
|
|
bool WordListLangModel::IsValidSequence(const char_32 *sequence,
|
|
bool terminal, LangModEdge **edges) {
|
|
return false;
|
|
}
|
|
|
|
// Recursive helper function for WordVariants().
|
|
void WordListLangModel::WordVariants(const CharSet &char_set,
|
|
string_32 prefix_str32,
|
|
WERD_CHOICE *word_so_far,
|
|
string_32 str32,
|
|
vector<WERD_CHOICE *> *word_variants) {
|
|
int str_len = str32.length();
|
|
if (str_len == 0) {
|
|
if (word_so_far->length() > 0) {
|
|
word_variants->push_back(new WERD_CHOICE(*word_so_far));
|
|
}
|
|
} else {
|
|
// Try out all the possible prefixes of the str32.
|
|
for (int len = 1; len <= str_len; len++) {
|
|
// Check if prefix is supported in character set.
|
|
string_32 str_pref32 = str32.substr(0, len);
|
|
int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
|
|
str_pref32.c_str()));
|
|
if (class_id <= 0) {
|
|
continue;
|
|
} else {
|
|
string_32 new_prefix_str32 = prefix_str32 + str_pref32;
|
|
string_32 new_str32 = str32.substr(len);
|
|
word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
|
|
WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
|
|
word_variants);
|
|
word_so_far->remove_last_unichar_id();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Compute all the variants of a 32-bit string in terms of the class-ids
|
|
// This is needed for languages that have ligatures. A word can then have more
|
|
// than one spelling in terms of the class-ids
|
|
void WordListLangModel::WordVariants(const CharSet &char_set,
|
|
const UNICHARSET *uchset, string_32 str32,
|
|
vector<WERD_CHOICE *> *word_variants) {
|
|
for (int i = 0; i < word_variants->size(); i++) {
|
|
delete (*word_variants)[i];
|
|
}
|
|
word_variants->clear();
|
|
string_32 prefix_str32;
|
|
WERD_CHOICE word_so_far(uchset);
|
|
WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
|
|
}
|
|
|
|
// add a new UTF-8 string to the lang model
|
|
bool WordListLangModel::AddString(const char *char_ptr) {
|
|
if (!init_ && !Init()) { // initialize if necessary
|
|
return false;
|
|
}
|
|
|
|
string_32 str32;
|
|
CubeUtils::UTF8ToUTF32(char_ptr, &str32);
|
|
if (str32.length() < 1) {
|
|
return false;
|
|
}
|
|
return AddString32(str32.c_str());
|
|
}
|
|
|
|
// add a new UTF-32 string to the lang model
|
|
bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
|
|
if (char_32_ptr == NULL) {
|
|
return false;
|
|
}
|
|
// get all the word variants
|
|
vector<WERD_CHOICE *> word_variants;
|
|
WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
|
|
char_32_ptr, &word_variants);
|
|
|
|
if (word_variants.size() > 0) {
|
|
// find the shortest variant
|
|
int shortest_word = 0;
|
|
for (int word = 1; word < word_variants.size(); word++) {
|
|
if (word_variants[shortest_word]->length() >
|
|
word_variants[word]->length()) {
|
|
shortest_word = word;
|
|
}
|
|
}
|
|
// only add the shortest grapheme interpretation of string to the word list
|
|
dawg_->add_word_to_dawg(*word_variants[shortest_word]);
|
|
}
|
|
for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
|
|
return true;
|
|
}
|
|
|
|
}
|