mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-19 03:54:10 +08:00
520077bd41
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@164 d0cd1f9f-072b-0410-8dd7-cf729c803f20
409 lines
14 KiB
C++
409 lines
14 KiB
C++
/* -*-C-*-
|
|
********************************************************************************
|
|
*
|
|
* File: permdawg.c (Formerly permdawg.c)
|
|
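* Description: DAWG (dictionary) driven word permuter: builds words from character choices and adjusts their ratings.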
|
|
* Author: Mark Seaman, OCR Technology
|
|
* Created: Fri Oct 16 14:37:00 1987
|
|
* Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
|
|
* Language: C
|
|
* Package: N/A
|
|
* Status: Reusable Software Component
|
|
*
|
|
* (c) Copyright 1987, Hewlett-Packard Company.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
*********************************************************************************/
|
|
/*----------------------------------------------------------------------
|
|
I n c l u d e s
|
|
----------------------------------------------------------------------*/
|
|
#include "permdawg.h"
|
|
#include "debug.h"
|
|
#include "hyphen.h"
|
|
#include "permute.h"
|
|
#include "tordvars.h"
|
|
#include "context.h"
|
|
#include "stopper.h"
|
|
#include "freelist.h"
|
|
#include "globals.h"
|
|
#include "tprintf.h"
|
|
#include "cutil.h"
|
|
#include "dawg.h"
|
|
#include <ctype.h>
|
|
|
|
/*----------------------------------------------------------------------
|
|
T y p e s
|
|
----------------------------------------------------------------------*/
|
|
/* Default value of the freq_word variable declared below
   ("Freq word adjustment"). */
#define FREQ_WERD 1.0
|
|
/* Default value of the good_word variable declared below
   ("Good word adjustment"). */
#define GOOD_WERD 1.1
|
|
/* Default value of the ok_word variable declared below
   ("Bad word adjustment"). */
#define OK_WERD 1.3125
|
|
/* NOTE(review): not referenced anywhere in the visible part of this file;
   presumably a size limit for the frequent-word DAWG - confirm. */
#define MAX_FREQ_EDGES 1500
|
|
/* Sentinel stored in rating_array slots that have not been rated yet
   (written by dawg_permute_and_select, tested by append_next_choice). */
#define NO_RATING -1
|
|
|
|
/*----------------------------------------------------------------------
|
|
V a r i a b l e s
|
|
----------------------------------------------------------------------*/
|
|
/* Frequent-word DAWG: loaded by init_permdawg(), released by end_permdawg(),
   queried by adjust_word() and test_freq_words(). */
static EDGE_ARRAY frequent_words;
|
|
/* Multiplicative pruning margin; set to ok_word / good_word at the start of
   every dawg_permute_and_select() call. */
static float rating_margin;
|
|
/* Additive slack used together with rating_margin in the early-pruning test
   of append_next_choice(). */
static float rating_pad = 5.0;
|
|
|
|
/* NOTE(review): the make_*_var macros are defined outside this file; they
   presumably declare the named variable with the given default plus an
   installer function (make_...) that init_permdawg_vars() calls - confirm. */
/* dawg_debug: debug-output switch tested throughout this file; default 0. */
make_toggle_var (dawg_debug, 0, make_dawg_debug,
|
|
8, 10, set_dawg_debug, "DAWG Debug ");
|
|
|
|
/* ok_word: rating multiplier adjust_word() applies when the case or
   punctuation check fails (its description string reads "Bad word"). */
make_float_var (ok_word, OK_WERD, make_ok_word,
|
|
8, 17, set_ok_word, "Bad word adjustment");
|
|
|
|
/* good_word: rating multiplier adjust_word() applies to words that pass the
   case/punctuation checks but are not treated as frequent words. */
make_float_var (good_word, GOOD_WERD, make_good_word,
|
|
8, 18, set_good_word, "Good word adjustment");
|
|
|
|
/* freq_word: rating multiplier adjust_word() applies to words found in the
   frequent-word DAWG. */
make_float_var (freq_word, FREQ_WERD, make_freq_word,
|
|
8, 19, set_freq_word, "Freq word adjustment");
|
|
|
|
/*----------------------------------------------------------------------
|
|
F u n c t i o n s
|
|
----------------------------------------------------------------------*/
|
|
/**********************************************************************
|
|
* adjust_word
|
|
*
|
|
* Assign an adjusted value to a string that is a word. The value
|
|
* that this word choice has is based on case and punctuation rules.
|
|
**********************************************************************/
|
|
void adjust_word(A_CHOICE *best_choice, float *certainty_array) {
|
|
char *this_word;
|
|
int punct_status;
|
|
float adjust_factor;
|
|
|
|
if (adjust_debug)
|
|
tprintf ("%s %4.2f ",
|
|
class_string (best_choice), class_probability (best_choice));
|
|
|
|
this_word = class_string (best_choice);
|
|
punct_status = punctuation_ok (this_word, class_lengths (best_choice));
|
|
|
|
class_probability (best_choice) += RATING_PAD;
|
|
if (case_ok (this_word, class_lengths (best_choice))
|
|
&& punct_status != -1) {
|
|
if (punct_status < 1 && word_in_dawg (frequent_words, this_word)) {
|
|
class_probability (best_choice) *= freq_word;
|
|
class_permuter (best_choice) = FREQ_DAWG_PERM;
|
|
adjust_factor = freq_word;
|
|
if (adjust_debug)
|
|
tprintf(", F, %4.2f ", freq_word);
|
|
}
|
|
else {
|
|
class_probability (best_choice) *= good_word;
|
|
adjust_factor = good_word;
|
|
if (adjust_debug)
|
|
tprintf(", %4.2f ", good_word);
|
|
}
|
|
}
|
|
else {
|
|
class_probability (best_choice) *= ok_word;
|
|
adjust_factor = ok_word;
|
|
if (adjust_debug) {
|
|
if (!case_ok (this_word, class_lengths (best_choice)))
|
|
tprintf(", C");
|
|
if (punctuation_ok (this_word, class_lengths (best_choice)) == -1)
|
|
tprintf(", P");
|
|
tprintf(", %4.2f ", ok_word);
|
|
}
|
|
}
|
|
|
|
class_probability (best_choice) -= RATING_PAD;
|
|
|
|
LogNewWordChoice(best_choice, adjust_factor, certainty_array);
|
|
|
|
if (adjust_debug)
|
|
tprintf(" --> %4.2f\n", class_probability (best_choice));
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* append_next_choice
|
|
*
|
|
* Check to see whether or not the next choice is worth appending to
|
|
* the string being generated. If so then keep going deeper into the
|
|
* word.
|
|
**********************************************************************/
|
|
void append_next_choice( /*previous option */
|
|
EDGE_ARRAY dawg,
|
|
NODE_REF node,
|
|
char permuter,
|
|
char *word,
|
|
char unichar_lengths[],
|
|
int unichar_offsets[],
|
|
CHOICES_LIST choices,
|
|
int char_index,
|
|
A_CHOICE *this_choice,
|
|
const char *prevchar,
|
|
float *limit,
|
|
float rating,
|
|
float certainty,
|
|
float *rating_array,
|
|
float *certainty_array,
|
|
int word_ending,
|
|
int last_word,
|
|
CHOICES *result) {
|
|
A_CHOICE *better_choice;
|
|
/* Add new character */
|
|
strcpy(word + unichar_offsets[char_index], class_string (this_choice));
|
|
|
|
unichar_lengths[char_index] = strlen(class_string (this_choice));
|
|
unichar_lengths[char_index + 1] = 0;
|
|
unichar_offsets[char_index + 1] = unichar_offsets[char_index] +
|
|
unichar_lengths[char_index];
|
|
if (word[unichar_offsets[char_index]] == '\0') {
|
|
word[unichar_offsets[char_index]] = ' ';
|
|
word[unichar_offsets[char_index] + 1] = '\0';
|
|
unichar_lengths[char_index] = 1;
|
|
unichar_lengths[char_index + 1] = 0;
|
|
unichar_offsets[char_index + 1] = unichar_offsets[char_index] +
|
|
unichar_lengths[char_index];
|
|
}
|
|
certainty_array[char_index] = class_certainty (this_choice);
|
|
|
|
rating += class_probability (this_choice);
|
|
certainty = min (class_certainty (this_choice), certainty);
|
|
|
|
if (rating_array[char_index] == NO_RATING) {
|
|
/* Prune bad subwords */
|
|
rating_array[char_index] = rating;
|
|
}
|
|
else {
|
|
if (rating_array[char_index] * rating_margin + rating_pad < rating) {
|
|
if (dawg_debug) {
|
|
tprintf("early pruned word rating=%4.2f, limit=%4.2f",
|
|
rating, *limit);
|
|
print_word_string(word);
|
|
tprintf("\n");
|
|
}
|
|
|
|
return;
|
|
}
|
|
}
|
|
|
|
/* Deal with hyphens */
|
|
if (word_ending && last_word && word[unichar_offsets[char_index]] == '-' &&
|
|
char_index > 0) {
|
|
*limit = rating;
|
|
if (dawg_debug)
|
|
tprintf("new hyphen choice = %s\n", word);
|
|
better_choice = new_choice (word, unichar_lengths, rating, certainty, -1, permuter);
|
|
adjust_word(better_choice, certainty_array);
|
|
push_on(*result, better_choice);
|
|
set_hyphen_word(word, unichar_lengths, unichar_offsets, rating, node);
|
|
}
|
|
/* Look up char in DAWG */
|
|
else {
|
|
int sub_offset = 0;
|
|
NODE_REF node_saved = node;
|
|
while (sub_offset < unichar_lengths[char_index] &&
|
|
letter_is_okay (dawg, &node, unichar_offsets[char_index] +
|
|
sub_offset, *prevchar, word, word_ending &&
|
|
sub_offset == unichar_lengths[char_index] - 1))
|
|
++sub_offset;
|
|
if (sub_offset == unichar_lengths[char_index]) {
|
|
/* Add a new word choice */
|
|
if (word_ending) {
|
|
if (dawg_debug == 1)
|
|
tprintf("new choice = %s\n", word);
|
|
*limit = rating;
|
|
|
|
better_choice = new_choice (hyphen_tail (word), unichar_lengths +
|
|
hyphen_base_size(),
|
|
rating, certainty,
|
|
-1, permuter);
|
|
adjust_word (better_choice, &certainty_array[hyphen_base_size ()]);
|
|
push_on(*result, better_choice);
|
|
}
|
|
else {
|
|
/* Search the next letter */
|
|
JOIN_ON (*result,
|
|
dawg_permute (dawg, node, permuter,
|
|
choices, char_index + 1, limit,
|
|
word, unichar_lengths, unichar_offsets, rating, certainty,
|
|
rating_array, certainty_array, last_word));
|
|
}
|
|
} else {
|
|
if (dawg_debug == 1) {
|
|
tprintf("letter not OK at char %d, index %d + sub index %d/%d\n",
|
|
char_index, unichar_offsets[char_index],
|
|
sub_offset, unichar_lengths[char_index]);
|
|
tprintf("Word");
|
|
print_word_string(word);
|
|
tprintf("\nRejected tail");
|
|
print_word_string(word + unichar_offsets[char_index]);
|
|
tprintf("\n");
|
|
}
|
|
if (node != 0)
|
|
node = node_saved;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* dawg_permute
|
|
*
|
|
* Permute all the valid words that can be created with this starting
|
|
* point. The node (in the DAWG) and the word string define a base
|
|
* from which to start adding the remaining character choices.
|
|
**********************************************************************/
|
|
CHOICES dawg_permute(EDGE_ARRAY dawg,
|
|
NODE_REF node,
|
|
char permuter,
|
|
CHOICES_LIST choices,
|
|
int char_index,
|
|
float *limit,
|
|
char *word,
|
|
char unichar_lengths[],
|
|
int unichar_offsets[],
|
|
float rating,
|
|
float certainty,
|
|
float *rating_array,
|
|
float *certainty_array,
|
|
int last_word) {
|
|
CHOICES result = NIL;
|
|
CHOICES c;
|
|
char *prevchar;
|
|
int word_ending = FALSE;
|
|
|
|
if (dawg_debug) {
|
|
tprintf("dawg_permute (node=" REFFORMAT ", char_index=%d, limit=%f, word=",
|
|
node, char_index, *limit);
|
|
print_word_string(word);
|
|
tprintf(", rating=%4.2f, certainty=%4.2f)\n",
|
|
rating, certainty);
|
|
}
|
|
|
|
/* Check for EOW */
|
|
if (1 + char_index == array_count (choices) + hyphen_base_size ())
|
|
word_ending = TRUE;
|
|
|
|
if (char_index < array_count (choices) + hyphen_base_size ()) {
|
|
prevchar = NULL;
|
|
iterate_list (c,
|
|
(CHOICES) array_index (choices,
|
|
char_index - hyphen_base_size ())) {
|
|
append_next_choice (dawg, node, permuter, word, unichar_lengths,
|
|
unichar_offsets, choices, char_index,
|
|
(A_CHOICE *) first_node (c),
|
|
prevchar != NULL ? prevchar : "", limit,
|
|
rating, certainty, rating_array, certainty_array,
|
|
word_ending, last_word, &result);
|
|
prevchar = best_string (c);
|
|
}
|
|
}
|
|
if (result && (dawg_debug == 1))
|
|
print_choices ("dawg_permute", result);
|
|
return (result);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* dawg_permute_and_select
|
|
*
|
|
* Use a DAWG type data structure to enumerate all the valid strings
|
|
* in some gramar. Compare each of the choices against the best choice
|
|
* so far. Update the best choice if needed.
|
|
**********************************************************************/
|
|
void dawg_permute_and_select(const char *string,
|
|
EDGE_ARRAY dawg,
|
|
char permuter,
|
|
CHOICES_LIST character_choices,
|
|
A_CHOICE *best_choice,
|
|
inT16 system_words) {
|
|
CHOICES result = NIL;
|
|
char word[UNICHAR_LEN * MAX_WERD_LENGTH + 1];
|
|
char unichar_lengths[MAX_WERD_LENGTH + 1];
|
|
int unichar_offsets[MAX_WERD_LENGTH + 1];
|
|
float certainty_array[MAX_WERD_LENGTH + 1];
|
|
float rating_array[MAX_WERD_LENGTH + 1];
|
|
float rating;
|
|
int char_index;
|
|
NODE_REF dawg_node = 0;
|
|
|
|
/* Pruning margin ratio */
|
|
rating_margin = ok_word / good_word;
|
|
|
|
word[0] = '\0';
|
|
unichar_lengths[0] = 0;
|
|
unichar_offsets[0] = 0;
|
|
rating = class_probability (best_choice);
|
|
|
|
for (char_index = 0; char_index < MAX_WERD_LENGTH + 1; char_index++)
|
|
rating_array[char_index] = NO_RATING;
|
|
char_index = 0;
|
|
|
|
if (!is_last_word () && hyphen_string) {
|
|
strcpy(word, hyphen_string);
|
|
strcpy(unichar_lengths, hyphen_unichar_lengths);
|
|
memcpy(unichar_offsets, hyphen_unichar_offsets,
|
|
(hyphen_base_size()) * sizeof (int));
|
|
unichar_offsets[hyphen_base_size()] =
|
|
unichar_offsets[hyphen_base_size() - 1] +
|
|
unichar_lengths[hyphen_base_size() - 1];
|
|
char_index = strlen (hyphen_unichar_lengths);
|
|
if (system_words)
|
|
dawg_node = hyphen_state;
|
|
}
|
|
|
|
result = dawg_permute (dawg, dawg_node, permuter, character_choices,
|
|
char_index, &rating, word, unichar_lengths, unichar_offsets, 0.0, 0.0,
|
|
rating_array, certainty_array, is_last_word ());
|
|
|
|
if (display_ratings && result) {
|
|
print_choices(string, result);
|
|
}
|
|
|
|
while (result != NIL) {
|
|
if (best_probability (result) < class_probability (best_choice)) {
|
|
clone_choice (best_choice, first_node (result));
|
|
}
|
|
free_choice (first_node (result));
|
|
pop_off(result);
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* init_permdawg
|
|
*
|
|
* Initialize the variables needed by this file.
|
|
**********************************************************************/
|
|
void init_permdawg_vars() {
|
|
make_dawg_debug();
|
|
make_ok_word();
|
|
make_good_word();
|
|
make_freq_word();
|
|
}
|
|
|
|
void init_permdawg() {
|
|
STRING name;
|
|
name = language_data_path_prefix;
|
|
name += "freq-dawg";
|
|
frequent_words = read_squished_dawg(name.string());
|
|
}
|
|
|
|
/* Release the frequent-word DAWG and clear the pointer so that it is
   not left dangling. */
void end_permdawg() {
  memfree(frequent_words);
  frequent_words = NULL;
}
|
|
|
|
/**********************************************************************
|
|
* test_freq_words()
|
|
*
|
|
* Tests a word against the frequent word dawg
|
|
**********************************************************************/
|
|
int test_freq_words(const char *word) {
|
|
return (word_in_dawg (frequent_words, word));
|
|
}
|