2008-02-01 08:21:49 +08:00
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
// File: permngram.cpp
|
|
|
|
// Description: Character n-gram permuter
|
|
|
|
// Author: Thomas Kielbus
|
|
|
|
// Created: Wed Sep 12 11:26:43 PDT 2007
|
|
|
|
//
|
|
|
|
// (C) Copyright 2007, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
//
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#include "const.h"
|
|
|
|
#include "permngram.h"
|
|
|
|
#include "permute.h"
|
|
|
|
#include "dawg.h"
|
|
|
|
#include "tordvars.h"
|
|
|
|
#include "stopper.h"
|
|
|
|
#include "globals.h"
|
|
|
|
#include "context.h"
|
2009-07-11 10:20:33 +08:00
|
|
|
#include "ndminx.h"
|
|
|
|
#include "dict.h"
|
|
|
|
#include "conversion.h"
|
2008-02-01 08:21:49 +08:00
|
|
|
|
|
|
|
#include <math.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
|
|
|
|
// Ratio to control the relative importance of the classifier and the ngram
|
|
|
|
// in the final score of a classification unit. Must be >= 0 and <= 1.
|
|
|
|
// A value of 1.0 uses only the shape classifier score.
|
|
|
|
// A value of 0.0 uses only the ngram score.
|
|
|
|
double_VAR(classifier_score_ngram_score_ratio,
|
|
|
|
0.7,
|
|
|
|
"");
|
|
|
|
|
|
|
|
// Rating adjustment multiplier for words not in the DAWG. Must be >= 1.
|
|
|
|
double_VAR(non_dawg_prefix_rating_adjustment,
|
|
|
|
1.5,
|
|
|
|
"");
|
|
|
|
|
|
|
|
// HypothesisPrefix represents a word prefix during the search of the
|
|
|
|
// character-level n-gram model based permuter.
|
|
|
|
// It holds the data needed to create the corresponding A_CHOICE.
|
|
|
|
// Note that the string stored in the _word data member always begin with a
|
|
|
|
// space character. This is used by the n-gram model to score the word.
|
|
|
|
// HypothesisPrefix also contains the node in the DAWG that is reached when
|
|
|
|
// searching for the corresponding prefix.
|
|
|
|
class HypothesisPrefix {
|
|
|
|
public:
|
|
|
|
HypothesisPrefix();
|
|
|
|
HypothesisPrefix(const HypothesisPrefix& prefix,
|
|
|
|
A_CHOICE* choice,
|
|
|
|
bool end_of_word,
|
2009-07-11 10:20:33 +08:00
|
|
|
const tesseract::Dawg *dawg,
|
|
|
|
tesseract::Dict* dict);
|
2008-02-01 08:21:49 +08:00
|
|
|
|
|
|
|
double rating() const {return rating_;}
|
|
|
|
double certainty() const {return certainty_;}
|
|
|
|
const char* word() const {return word_;}
|
|
|
|
const char* unichar_lengths() const {return unichar_lengths_;}
|
|
|
|
const float* certainty_array() const {return certainty_array_;}
|
|
|
|
bool is_dawg_prefix() const {return is_dawg_prefix_;}
|
|
|
|
NODE_REF dawg_node() const {return dawg_node_;}
|
|
|
|
|
|
|
|
private:
|
|
|
|
double rating_;
|
|
|
|
double certainty_;
|
|
|
|
char word_[UNICHAR_LEN * MAX_WERD_LENGTH + 2];
|
|
|
|
char unichar_lengths_[MAX_WERD_LENGTH + 1];
|
|
|
|
float certainty_array_[MAX_WERD_LENGTH + 1];
|
|
|
|
NODE_REF dawg_node_;
|
|
|
|
bool is_dawg_prefix_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// HypothesisPrefix is the class used as nodes in HypothesisPrefixLists
|
|
|
|
typedef HypothesisPrefix HypothesisPrefixListNode;
|
|
|
|
|
|
|
|
// HypothesisPrefixList maintains a sorted list of HypothesisPrefixes. The size
|
|
|
|
// is bounded by the argument given to the constructor.
|
|
|
|
// For the sake of simplicity, current implementation is not as efficient as it
|
|
|
|
// could be. The list is represented by a static array of pointers to its
|
|
|
|
// elements. All nodes are stored in positions from 0 to (size() - 1).
|
|
|
|
class HypothesisPrefixList {
|
|
|
|
public:
|
|
|
|
HypothesisPrefixList(int size_bound);
|
|
|
|
~HypothesisPrefixList();
|
|
|
|
|
|
|
|
void add_node(HypothesisPrefix* node);
|
|
|
|
int size() const {return _size;}
|
|
|
|
void clear();
|
|
|
|
const HypothesisPrefix& node(int index) {return *_list_nodes[index];}
|
|
|
|
|
|
|
|
private:
|
|
|
|
HypothesisPrefix** _list_nodes;
|
|
|
|
int _size_bound;
|
|
|
|
int _size;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Return the classifier_score_ngram_score_ratio for a given choice string.
|
|
|
|
// The classification decision for characters like comma and period should
|
|
|
|
// be based only on shape rather than on shape and n-gram score.
|
|
|
|
// Return 1.0 for them, the default classifier_score_ngram_score_ratio
|
|
|
|
// otherwise.
|
|
|
|
static double get_classifier_score_ngram_score_ratio(const char* choice);
|
|
|
|
|
|
|
|
// Permute the given char_choices using a character level n-gram model and
|
|
|
|
// return the best word choice found.
|
|
|
|
// This is performed by maintaining a HypothesisPrefixList of HypothesisPrefixes.
|
|
|
|
// For each character position, each possible character choice is appended to
|
|
|
|
// the best current prefixes to create the list of best prefixes at the next
|
|
|
|
// character position.
|
2009-07-11 10:20:33 +08:00
|
|
|
namespace tesseract {
|
|
|
|
A_CHOICE *Dict::ngram_permute_and_select(CHOICES_LIST char_choices,
|
|
|
|
float rating_limit,
|
|
|
|
const Dawg *dawg) {
|
2008-02-01 08:21:49 +08:00
|
|
|
if (array_count (char_choices) <= MAX_WERD_LENGTH) {
|
|
|
|
CHOICES choices;
|
|
|
|
int char_index_max = array_count(char_choices);
|
|
|
|
HypothesisPrefixList list_1(20);
|
|
|
|
HypothesisPrefixList list_2(20);
|
|
|
|
HypothesisPrefixList* current_list = &list_1;
|
|
|
|
HypothesisPrefixList* next_list = &list_2;
|
|
|
|
HypothesisPrefix* initial_node = new HypothesisPrefix();
|
|
|
|
current_list->add_node(initial_node);
|
|
|
|
for (int char_index = 0; char_index < char_index_max; ++char_index) {
|
|
|
|
iterate_list(choices, (CHOICES) array_index(char_choices, char_index)) {
|
|
|
|
A_CHOICE* choice = (A_CHOICE *) first_node(choices);
|
|
|
|
for (int node_index = 0;
|
|
|
|
node_index < current_list->size();
|
|
|
|
++node_index) {
|
|
|
|
// Append this choice to the current node
|
|
|
|
HypothesisPrefix* new_node = new HypothesisPrefix(
|
|
|
|
current_list->node(node_index),
|
|
|
|
choice,
|
|
|
|
char_index == char_index_max - 1,
|
2009-07-11 10:20:33 +08:00
|
|
|
dawg, this);
|
2008-02-01 08:21:49 +08:00
|
|
|
next_list->add_node(new_node);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Clear current list and switch lists
|
|
|
|
current_list->clear();
|
|
|
|
HypothesisPrefixList* temp_list = current_list;
|
|
|
|
current_list = next_list;
|
|
|
|
next_list = temp_list;
|
|
|
|
|
|
|
|
// Give up if the current best rating is worse than rating_limit
|
|
|
|
if (current_list->node(0).rating() > rating_limit)
|
|
|
|
return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
|
|
|
|
}
|
|
|
|
const HypothesisPrefix& best_word = current_list->node(0);
|
|
|
|
A_CHOICE* best_choice = new_choice (best_word.word() + 1,
|
|
|
|
best_word.unichar_lengths(),
|
|
|
|
best_word.rating(),
|
|
|
|
best_word.certainty(), -1,
|
|
|
|
valid_word(best_word.word() + 1) ?
|
|
|
|
SYSTEM_DAWG_PERM : TOP_CHOICE_PERM);
|
|
|
|
LogNewWordChoice(best_choice, best_word.is_dawg_prefix() ?
|
|
|
|
1.0 : non_dawg_prefix_rating_adjustment,
|
2009-07-11 10:20:33 +08:00
|
|
|
const_cast<float*>(best_word.certainty_array()),
|
|
|
|
getUnicharset());
|
2008-02-01 08:21:49 +08:00
|
|
|
return best_choice;
|
|
|
|
} else {
|
|
|
|
return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
|
|
|
|
}
|
|
|
|
}
|
2009-07-11 10:20:33 +08:00
|
|
|
} // namespace tesseract
|
2008-02-01 08:21:49 +08:00
|
|
|
|
|
|
|
double get_classifier_score_ngram_score_ratio(const char* choice) {
|
|
|
|
if (!strcmp(",", choice) ||
|
|
|
|
!strcmp(".", choice))
|
|
|
|
return 1.0;
|
|
|
|
else
|
|
|
|
return classifier_score_ngram_score_ratio;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Initial HypothesisPrefix constructor used to create the first state of the
|
|
|
|
// search.
|
|
|
|
HypothesisPrefix::HypothesisPrefix() {
|
|
|
|
rating_ = 0;
|
|
|
|
certainty_ = MAXFLOAT;
|
|
|
|
strcpy(word_, " ");
|
|
|
|
unichar_lengths_[0] = '\0';
|
|
|
|
dawg_node_ = 0;
|
|
|
|
is_dawg_prefix_ = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Main constructor to create a new HypothesisPrefix by appending a character
|
|
|
|
// choice (A_CHOICE) to an existing HypothesisPrefix. This constructor takes
|
|
|
|
// care of copying the original prefix's data members, appends the character
|
|
|
|
// choice to the word and updates its rating using a character-level n-gram
|
|
|
|
// model. The state in the DAWG is also updated.
|
|
|
|
HypothesisPrefix::HypothesisPrefix(const HypothesisPrefix& prefix,
|
|
|
|
A_CHOICE* choice,
|
|
|
|
bool end_of_word,
|
2009-07-11 10:20:33 +08:00
|
|
|
const tesseract::Dawg *dawg,
|
|
|
|
tesseract::Dict* dict) {
|
2008-02-01 08:21:49 +08:00
|
|
|
char* word_ptr = word_;
|
|
|
|
const char* prefix_word_ptr = prefix.word_;
|
|
|
|
|
|
|
|
// Copy first space character
|
|
|
|
*(word_ptr++) = *(prefix_word_ptr++);
|
|
|
|
|
|
|
|
// Copy existing word, unichar_lengths, certainty_array
|
|
|
|
int char_index;
|
|
|
|
for (char_index = 0;
|
|
|
|
prefix.unichar_lengths_[char_index] != '\0';
|
|
|
|
++char_index) {
|
|
|
|
for (int char_subindex = 0;
|
|
|
|
char_subindex < prefix.unichar_lengths_[char_index];
|
|
|
|
++char_subindex) {
|
|
|
|
*(word_ptr++) = *(prefix_word_ptr++);
|
|
|
|
}
|
|
|
|
unichar_lengths_[char_index] = prefix.unichar_lengths_[char_index];
|
|
|
|
certainty_array_[char_index] = prefix.certainty_array_[char_index];
|
|
|
|
}
|
|
|
|
|
|
|
|
// If choice is empty, use a space character instead
|
|
|
|
const char* class_string_choice = *class_string(choice) == '\0' ?
|
|
|
|
" " : class_string(choice);
|
|
|
|
|
|
|
|
// Update certainty
|
2009-07-11 10:20:33 +08:00
|
|
|
certainty_ = MIN(prefix.certainty_, class_certainty(choice));
|
2008-02-01 08:21:49 +08:00
|
|
|
|
|
|
|
// Apprend choice to the word
|
|
|
|
strcpy(word_ptr, class_string_choice);
|
|
|
|
unichar_lengths_[char_index] = strlen(class_string_choice);
|
|
|
|
unichar_lengths_[char_index + 1] = '\0';
|
|
|
|
|
|
|
|
// Append choice certainty to the certainty array
|
|
|
|
certainty_array_[char_index] = class_certainty(choice);
|
|
|
|
|
|
|
|
// Copy DAWG node state
|
|
|
|
dawg_node_ = prefix.dawg_node_;
|
|
|
|
is_dawg_prefix_ = prefix.is_dawg_prefix_;
|
|
|
|
|
|
|
|
// Verify DAWG and update dawg_node_ if the current prefix is already valid
|
|
|
|
if (is_dawg_prefix_) {
|
|
|
|
for (int char_subindex = 0;
|
|
|
|
class_string_choice[char_subindex] != '\0';
|
|
|
|
++char_subindex) {
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
// TODO(daria): update this code (and the rest of ngram permuter code
|
|
|
|
// to deal with unichar ids, make use of the new parallel dawg search
|
|
|
|
// and use WERD_CHOICE, BLOB_CHOICE_LIST_VECTOR instead of the deprecated
|
|
|
|
// A_CHOICE.
|
|
|
|
tprintf("Error: ngram permuter functionality is not available\n");
|
|
|
|
exit(1);
|
|
|
|
|
2008-02-01 08:21:49 +08:00
|
|
|
// Verify each byte of the appended character. Note that word_ptr points
|
|
|
|
// to the first byte so (word_ptr - (word_ + 1)) is the index of the first
|
|
|
|
// new byte in the string that starts at (word_ + 1).
|
2009-07-11 10:20:33 +08:00
|
|
|
/*
|
2008-02-01 08:21:49 +08:00
|
|
|
int current_byte_index = word_ptr - (word_ + 1) + char_subindex;
|
2009-07-11 10:20:33 +08:00
|
|
|
if (!(dict->*dict->letter_is_okay_)(
|
|
|
|
dawg, &dawg_node_, current_byte_index, word_ + 1,
|
|
|
|
end_of_word && class_string_choice[char_subindex + 1] == '\0')) {
|
2008-02-01 08:21:49 +08:00
|
|
|
dawg_node_ = NO_EDGE;
|
|
|
|
is_dawg_prefix_ = false;
|
|
|
|
break;
|
|
|
|
}
|
2009-07-11 10:20:33 +08:00
|
|
|
*/
|
2008-02-01 08:21:49 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copy the prefix rating
|
|
|
|
rating_ = prefix.rating_;
|
|
|
|
|
|
|
|
// Compute rating of current character
|
|
|
|
double probability = probability_in_context(prefix.word_, -1,
|
|
|
|
class_string_choice, -1);
|
|
|
|
|
|
|
|
// If last character of the word, take the following space into account
|
|
|
|
if (end_of_word)
|
|
|
|
probability *= probability_in_context(word_, -1, " ", -1);
|
|
|
|
|
|
|
|
double local_classifier_score_ngram_score_ratio =
|
|
|
|
get_classifier_score_ngram_score_ratio(class_string_choice);
|
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
double classifier_rating = class_rating(choice);
|
2008-02-01 08:21:49 +08:00
|
|
|
double ngram_rating = -log(probability) / log(2.0);
|
|
|
|
double mixed_rating =
|
|
|
|
local_classifier_score_ngram_score_ratio * classifier_rating +
|
|
|
|
(1 - local_classifier_score_ngram_score_ratio) * ngram_rating;
|
|
|
|
|
|
|
|
// If the current word is not a valid prefix, adjust the rating of the
|
|
|
|
// character being appended. If it used to be a valid prefix, compensate for
|
|
|
|
// previous adjustments.
|
|
|
|
if (!is_dawg_prefix_) {
|
|
|
|
if (prefix.is_dawg_prefix_)
|
|
|
|
rating_ *= non_dawg_prefix_rating_adjustment;
|
|
|
|
mixed_rating *= non_dawg_prefix_rating_adjustment;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update rating by adding the rating of the character being appended.
|
|
|
|
rating_ += mixed_rating;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create an empty HypothesisPrefixList. Its maximum size is set to the given
|
|
|
|
// bound.
|
|
|
|
HypothesisPrefixList::HypothesisPrefixList(int size_bound):
|
|
|
|
_size_bound(size_bound),
|
|
|
|
_size(0) {
|
|
|
|
_list_nodes = new HypothesisPrefix*[_size_bound];
|
|
|
|
for (int i = 0; i < _size_bound; ++i)
|
|
|
|
_list_nodes[i] = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Destroy a HypothesisPrefixList all contained nodes are deleted as well.
|
|
|
|
HypothesisPrefixList::~HypothesisPrefixList() {
|
|
|
|
this->clear();
|
|
|
|
delete[] _list_nodes;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add a node to the HypothesisPrefixList. Maintains the sorted list property.
|
|
|
|
// Note that the HypothesisPrefixList takes ownership of the given node and
|
|
|
|
// might delete it if needed. It must therefore have been allocated on the heap.
|
|
|
|
void HypothesisPrefixList::add_node(HypothesisPrefix* node) {
|
|
|
|
// Detect nodes that have a worst rating that the current maximum and treat
|
|
|
|
// them separately.
|
|
|
|
if (_size > 0 && _list_nodes[_size - 1]->rating() < node->rating()) {
|
|
|
|
if (_size == _size_bound) {
|
|
|
|
// The list is already full. This node will not be added
|
|
|
|
delete node;
|
|
|
|
} else {
|
|
|
|
// The list is not full. Add the node at the last position.
|
|
|
|
_list_nodes[_size] = node;
|
|
|
|
++_size;
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// Find the correct position
|
|
|
|
int node_index_target = 0;
|
|
|
|
while (node_index_target < _size_bound &&
|
|
|
|
_list_nodes[node_index_target] != NULL &&
|
|
|
|
_list_nodes[node_index_target]->rating() < node->rating()) {
|
|
|
|
++node_index_target;
|
|
|
|
}
|
|
|
|
if (node_index_target >= _size_bound) {
|
|
|
|
delete node;
|
|
|
|
} else {
|
|
|
|
// Move next states by 1. Starting from the last one.
|
|
|
|
int node_index_move = _size - 1;
|
|
|
|
while (node_index_move >= node_index_target) {
|
|
|
|
if (node_index_move == _size_bound - 1)
|
|
|
|
delete _list_nodes[node_index_move];
|
|
|
|
else
|
|
|
|
_list_nodes[node_index_move + 1] = _list_nodes[node_index_move];
|
|
|
|
_list_nodes[node_index_move] = NULL;
|
|
|
|
--node_index_move;
|
|
|
|
}
|
|
|
|
// Insert new node
|
|
|
|
_list_nodes[node_index_target] = node;
|
|
|
|
// Increment size if it has changed
|
|
|
|
if (_size < _size_bound)
|
|
|
|
++_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete all contained nodes and set the size of the HypothesisPrefixList to 0
|
|
|
|
void HypothesisPrefixList::clear() {
|
|
|
|
for (int i = 0; i < _size_bound && _list_nodes[i] != NULL; ++i) {
|
|
|
|
delete _list_nodes[i];
|
|
|
|
_list_nodes[i] = NULL;
|
|
|
|
}
|
|
|
|
_size = 0;
|
|
|
|
}
|