tesseract/cube/word_list_lang_model.cpp

201 lines
6.0 KiB
C++

/**********************************************************************
* File: word_list_lang_model.cpp
* Description: Implementation of the Word List Language Model Class
* Author: Ahmad Abdulkader
* Created: 2008
*
* (C) Copyright 2008, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <string>
#include <vector>
#include "word_list_lang_model.h"
#include "cube_utils.h"
#include "ratngs.h"
#include "trie.h"
namespace tesseract {
WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) {
cntxt_ = cntxt;
dawg_ = NULL;
init_ = false;
}
WordListLangModel::~WordListLangModel() {
Cleanup();
}
// Cleanup
void WordListLangModel::Cleanup() {
if (dawg_ != NULL) {
delete dawg_;
dawg_ = NULL;
}
init_ = false;
}
// Initialize the language model
bool WordListLangModel::Init() {
if (init_ == true) {
return true;
}
// The last parameter to the Trie constructor (the debug level) is set to
// false for now, until Cube has a way to express its preferred debug level.
dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
WordListLangModel::kMaxDawgEdges,
cntxt_->CharacterSet()->ClassCount(), false);
if (dawg_ == NULL) {
return false;
}
init_ = true;
return true;
}
// return a pointer to the root
LangModEdge * WordListLangModel::Root() {
return NULL;
}
// return the edges emerging from the current state
LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
LangModEdge *edge,
int *edge_cnt) {
// initialize if necessary
if (init_ == false) {
if (Init() == false) {
return false;
}
}
(*edge_cnt) = 0;
EDGE_REF edge_ref;
TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
if (tess_lm_edge == NULL) {
edge_ref = 0;
} else {
edge_ref = tess_lm_edge->EndEdge();
// advance node
edge_ref = dawg_->next_node(edge_ref);
if (edge_ref == 0) {
return 0;
}
}
// allocate memory for edges
LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
if (edge_array == NULL) {
return NULL;
}
// now get all the emerging edges
(*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
edge_array + (*edge_cnt));
return edge_array;
}
// returns true if the char_32 is supported by the language model
// TODO(ahmadab) currently not implemented
bool WordListLangModel::IsValidSequence(const char_32 *sequence,
bool terminal, LangModEdge **edges) {
return false;
}
// Recursive helper function for WordVariants().
void WordListLangModel::WordVariants(const CharSet &char_set,
string_32 prefix_str32,
WERD_CHOICE *word_so_far,
string_32 str32,
vector<WERD_CHOICE> *word_variants) {
int str_len = str32.length();
if (str_len == 0) {
if (word_so_far->length() > 0) {
word_variants->push_back(*word_so_far);
}
} else {
// Try out all the possible prefixes of the str32.
for (int len = 1; len <= str_len; len++) {
// Check if prefix is supported in character set.
string_32 str_pref32 = str32.substr(0, len);
int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
str_pref32.c_str()));
if (class_id <= 0) {
continue;
} else {
string_32 new_prefix_str32 = prefix_str32 + str_pref32;
string_32 new_str32 = str32.substr(len);
word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
word_variants);
word_so_far->remove_last_unichar_id();
}
}
}
}
// Compute all the variants of a 32-bit string in terms of the class-ids
// This is needed for languages that have ligatures. A word can then have more
// than one spelling in terms of the class-ids
void WordListLangModel::WordVariants(const CharSet &char_set, string_32 str32,
vector<WERD_CHOICE> *word_variants) {
word_variants->clear();
string_32 prefix_str32;
WERD_CHOICE word_so_far;
WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
}
// add a new UTF-8 string to the lang model
bool WordListLangModel::AddString(const char *char_ptr) {
if (!init_ && !Init()) { // initialize if necessary
return false;
}
string_32 str32;
CubeUtils::UTF8ToUTF32(char_ptr, &str32);
if (str32.length() < 1) {
return false;
}
return AddString32(str32.c_str());
}
// add a new UTF-32 string to the lang model
bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
if (char_32_ptr == NULL) {
return false;
}
// get all the word variants
vector<WERD_CHOICE> word_variants;
WordVariants(*(cntxt_->CharacterSet()), char_32_ptr, &word_variants);
if (word_variants.size() > 0) {
// find the shortest variant
int shortest_word = 0;
for (int word = 1; word < word_variants.size(); word++) {
if (word_variants[shortest_word].length() >
word_variants[word].length()) {
shortest_word = word;
}
}
// only add the shortest grapheme interpretation of string to the word list
dawg_->add_word_to_dawg(word_variants[shortest_word]);
}
return true;
}
}