tesseract/dict/permute.cpp
theraysmith 0371d16fe1 Fixed compiler warnings
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@141 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2008-02-01 00:41:25 +00:00

1708 lines
44 KiB
C++

/* -*-C-*-
********************************************************************************
*
* File: permute.cpp (Formerly permute.c)
* Description: Handle the new ratings choices for Wise Owl
* Author: Mark Seaman, OCR Technology
* Created: Fri Sep 22 14:05:51 1989
* Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1989, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
/*----------------------------------------------------------------------
I n c l u d e s
---------------------------------------------------------------------*/
#include "permute.h"
#include "globals.h"
#include "permdawg.h"
#include "debug.h"
#include "tordvars.h"
#include "hyphen.h"
#include "stopper.h"
#include "trie.h"
#include "context.h"
#include "permnum.h"
#include "freelist.h"
#include "callcpp.h"
#include "permngram.h"
#include <math.h>
// Incremented once per call to permute_characters().
int permutation_count; // Used in metrics.cpp.
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
// TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant
// (or make it dynamic)
// NOTE(review): MAX_NUM_EDGES is not referenced anywhere in this file.
#define MAX_NUM_EDGES 2000000
// Number of EDGE_RECORDs allocated for each of document_words and
// pending_words in init_permute(); also passed to add_word_to_dawg().
#define MAX_DOC_EDGES 250000
// Passed to add_word_to_dawg() together with MAX_DOC_EDGES.
#define RESERVED_DOC_EDGES 10000
// Number of EDGE_RECORDs allocated for user_words in init_permute().
#define MAX_USER_EDGES 50000
// Passed to read_word_list() together with MAX_USER_EDGES.
#define USER_RESERVED_EDGES 2000
/* Weights for adjustment */
// Default for `non_word`: rating multiplier applied by adjust_non_word()
// when the case and punctuation checks pass.
#define NON_WERD 1.25
// Default for `garbage`: rating multiplier applied by adjust_non_word()
// when either check fails.
#define GARBAGE_STRING 1.5
// permute_top_choice() returns NULL for words with more choice lists than
// this, and sizes its local buffers from it.
#define MAX_PERM_LENGTH 128
// Words seen by add_document_word() that were not yet trusted; a word that is
// already present here is promoted to document_words on a later call.
EDGE_ARRAY pending_words;
// Words accepted from the current document by add_document_word().
EDGE_ARRAY document_words;
// Loaded by init_permute() from <language_data_path_prefix>"user-words".
EDGE_ARRAY user_words;
// Loaded by init_permute() from <language_data_path_prefix>"word-dawg".
// Doubles as the "initialised" flag tested by init_permute()/end_permute().
EDGE_ARRAY word_dawg;
// Enables cprintf tracing in adjust_non_word().
make_toggle_var (adjust_debug, 0, make_adjust_debug,
8, 13, set_adjust_debug, "Adjustment Debug");
// Enables tracing in permute_compound_words() and permute_subword().
make_toggle_var (compound_debug, 0, make_compound_debug,
8, 14, set_compound_debug, "Compound Debug");
make_float_var (non_word, NON_WERD, make_non_word,
8, 20, set_non_word, "Non-word adjustment");
make_float_var (garbage, GARBAGE_STRING, make_garbage,
8, 21, set_garbage, "Garbage adjustment");
// When set, add_document_word() appends each accepted word to the file
// named <imagefile>".doc".
make_toggle_var (save_doc_words, 0, make_doc_words,
8, 22, set_doc_words, "Save Document Words ");
// When clear, add_document_word() does nothing.
make_toggle_var (doc_dict_enable, 1, make_doc_dict,
8, 25, set_doc_dict, "Enable Document Dictionary ");
/* PREV DEFAULT 0 */
// When set, permute_all() returns the result of ngram_permute_and_select()
// and permute_top_choice() only updates the raw choice.
BOOL_VAR(ngram_permuter_activated, FALSE,
"Activate character-level n-gram-based permuter");
// When non-zero, permute_all() returns the permute_top_choice() result
// without consulting the word, number or compound permuters.
int permute_only_top = 0;
#if 0
//0x0=.
static INT32 bigram_counts[256][3] = { {
0, 0, 0
},
{ //0x1=.
0, 0, 0
},
{ //0x2=.
0, 0, 0
},
{ //0x3=.
0, 0, 0
},
{ //0x4=.
0, 0, 0
},
{ //0x5=.
0, 0, 0
},
{ //0x6=.
0, 0, 0
},
{ //0x7=.
0, 0, 0
},
{ //0x8=.
0, 0, 0
},
{ //0x9=.
0, 0, 0
},
{ //0xa=.
93, 28, 0
},
{ //0xb=.
0, 0, 0
},
{ //0xc=.
0, 0, 0
},
{ //0xd=.
0, 0, 0
},
{ //0xe=.
0, 0, 0
},
{ //0xf=.
0, 0, 0
},
{ //0x10=.
0, 0, 0
},
{ //0x11=.
0, 0, 0
},
{ //0x12=.
0, 0, 0
},
{ //0x13=.
0, 0, 0
},
{ //0x14=.
0, 0, 0
},
{ //0x15=.
0, 0, 0
},
{ //0x16=.
0, 0, 0
},
{ //0x17=.
0, 0, 0
},
{ //0x18=.
0, 0, 0
},
{ //0x19=.
0, 0, 0
},
{ //0x1a=.
0, 0, 0
},
{ //0x1b=.
0, 0, 0
},
{ //0x1c=.
0, 0, 0
},
{ //0x1d=.
0, 0, 0
},
{ //0x1e=.
0, 0, 0
},
{ //0x1f=.
0, 0, 0
},
{ //0x20=
324, 377, 2
},
{ //0x21=!
2, 1, 0
},
{ //0x22="
2, 1, 0
},
{ //0x23=#
1, 0, 1
},
{ //0x24=$
2, 1, 0
},
{ //0x25=%
2, 0, 0
},
{ //0x26=&
2, 1, 0
},
{ //0x27='
1, 21, 8
},
{ //0x28=(
2, 1, 0
},
{ //0x29=)
19, 0, 0
},
{ //0x2a=*
2, 1, 0
},
{ //0x2b=+
1, 0, 0
},
{ //0x2c=,
75, 4, 0
},
{ //0x2d=-
52, 7, 0
},
{ //0x2e=.
190, 16, 3
},
{ //0x2f=/
53, 2, 0
},
{ //0x30=0
399, 0, 0
},
{ //0x31=1
220, 0, 0
},
{ //0x32=2
226, 0, 0
},
{ //0x33=3
128, 0, 0
},
{ //0x34=4
147, 0, 0
},
{ //0x35=5
179, 0, 1
},
{ //0x36=6
173, 0, 0
},
{ //0x37=7
115, 0, 0
},
{ //0x38=8
107, 0, 0
},
{ //0x39=9
934, 0, 1
},
{ //0x3a=:
27, 0, 1
},
{ //0x3b=;
2, 1, 0
},
{ //0x3c=<
2, 1, 0
},
{ //0x3d==
2, 1, 0
},
{ //0x3e=>
2, 1, 0
},
{ //0x3f=?
2, 1, 0
},
{ //0x40=@
2, 1, 0
},
{ //0x41=A
3, 1, 0
},
{ //0x42=B
1, 73, 0
},
{ //0x43=C
1, 6, 0
},
{ //0x44=D
1, 24, 0
},
{ //0x45=E
1, 2, 0
},
{ //0x46=F
1, 19, 0
},
{ //0x47=G
1, 2, 0
},
{ //0x48=H
3, 2, 1
},
{ //0x49=I
0, 68, 0
},
{ //0x4a=J
1, 2, 0
},
{ //0x4b=K
1, 2, 0
},
{ //0x4c=L
1, 82, 0
},
{ //0x4d=M
10, 10, 0
},
{ //0x4e=N
3, 239, 0
},
{ //0x4f=O
1, 10, 0
},
{ //0x50=P
0, 1, 3
},
{ //0x51=Q
2, 3, 0
},
{ //0x52=R
1, 43, 0
},
{ //0x53=S
1, 53, 0
},
{ //0x54=T
2, 18, 0
},
{ //0x55=U
1, 2, 0
},
{ //0x56=V
1, 17, 0
},
{ //0x57=W
1, 5, 0
},
{ //0x58=X
1, 6, 0
},
{ //0x59=Y
1, 2, 0
},
{ //0x5a=Z
1, 2, 0
},
{ //0x5b=[
2, 1, 0
},
{ //0x5c=backslash
2, 1, 0
},
{ //0x5d=]
2, 1, 0
},
{ //0x5e=^
2, 1, 0
},
{ //0x5f=_
2, 1, 0
},
{ //0x60=`
1, 0, 2
},
{ //0x61=a
0, 0, 671
},
{ //0x62=b
0, 1, 16
},
{ //0x63=c
0, 2, 1
},
{ //0x64=d
0, 14, 0
},
{ //0x65=e
0, 0, 763
},
{ //0x66=f
0, 186, 0
},
{ //0x67=g
0, 2, 1
},
{ //0x68=h
0, 2, 1
},
{ //0x69=i
0, 0, 818
},
{ //0x6a=j
0, 2, 1
},
{ //0x6b=k
0, 4, 1
},
{ //0x6c=l
0, 26, 3
},
{ //0x6d=m
0, 69, 0
},
{ //0x6e=n
0, 885, 0
},
{ //0x6f=o
0, 17, 722
},
{ //0x70=p
0, 1, 5
},
{ //0x71=q
2, 1, 0
},
{ //0x72=r
0, 21, 0
},
{ //0x73=s
3, 49, 0
},
{ //0x74=t
0, 219, 5
},
{ //0x75=u
0, 0, 56
},
{ //0x76=v
0, 4, 0
},
{ //0x77=w
0, 2, 1
},
{ //0x78=x
0, 2, 1
},
{ //0x79=y
0, 1, 23
},
{ //0x7a=z
0, 2, 1
},
{ //0x7b={
2, 1, 0
},
{ //0x7c=|
59, 0, 3
},
{ //0x7d=}
2, 1, 0
},
{ //0x7e=~
2, 1, 0
},
{ //0x7f=.
0, 0, 0
},
{ //0x80=.
0, 0, 0
},
{ //0x81=.
0, 0, 0
},
{ //0x82=.
0, 0, 0
},
{ //0x83=.
0, 0, 0
},
{ //0x84=.
0, 0, 0
},
{ //0x85=.
0, 0, 0
},
{ //0x86=.
0, 0, 0
},
{ //0x87=.
0, 0, 0
},
{ //0x88=.
0, 0, 0
},
{ //0x89=.
0, 0, 0
},
{ //0x8a=.
0, 0, 0
},
{ //0x8b=.
0, 0, 0
},
{ //0x8c=.
0, 0, 0
},
{ //0x8d=.
0, 0, 0
},
{ //0x8e=.
0, 0, 0
},
{ //0x8f=.
0, 0, 0
},
{ //0x90=.
0, 0, 0
},
{ //0x91=.
0, 0, 0
},
{ //0x92=.
0, 0, 0
},
{ //0x93=.
0, 0, 0
},
{ //0x94=.
0, 0, 0
},
{ //0x95=.
0, 0, 0
},
{ //0x96=.
0, 0, 0
},
{ //0x97=.
0, 0, 0
},
{ //0x98=.
0, 0, 0
},
{ //0x99=.
0, 0, 0
},
{ //0x9a=.
0, 0, 0
},
{ //0x9b=.
0, 0, 0
},
{ //0x9c=.
0, 0, 0
},
{ //0x9d=.
0, 0, 0
},
{ //0x9e=.
0, 0, 0
},
{ //0x9f=.
0, 0, 0
},
{ //0xa0=.
0, 0, 0
},
{ //0xa1=.
0, 0, 0
},
{ //0xa2=.
0, 0, 0
},
{ //0xa3=.
0, 0, 0
},
{ //0xa4=.
0, 0, 0
},
{ //0xa5=.
0, 0, 0
},
{ //0xa6=.
0, 0, 0
},
{ //0xa7=.
0, 0, 0
},
{ //0xa8=.
0, 0, 0
},
{ //0xa9=.
0, 0, 0
},
{ //0xaa=.
0, 0, 0
},
{ //0xab=.
0, 0, 0
},
{ //0xac=.
0, 0, 0
},
{ //0xad=.
0, 0, 0
},
{ //0xae=.
0, 0, 0
},
{ //0xaf=.
0, 0, 0
},
{ //0xb0=.
0, 0, 0
},
{ //0xb1=.
0, 0, 0
},
{ //0xb2=.
0, 0, 0
},
{ //0xb3=.
0, 0, 0
},
{ //0xb4=.
0, 0, 0
},
{ //0xb5=.
0, 0, 0
},
{ //0xb6=.
0, 0, 0
},
{ //0xb7=.
0, 0, 0
},
{ //0xb8=.
0, 0, 0
},
{ //0xb9=.
0, 0, 0
},
{ //0xba=.
0, 0, 0
},
{ //0xbb=.
0, 0, 0
},
{ //0xbc=.
0, 0, 0
},
{ //0xbd=.
0, 0, 0
},
{ //0xbe=.
0, 0, 0
},
{ //0xbf=.
0, 0, 0
},
{ //0xc0=.
0, 0, 0
},
{ //0xc1=.
0, 0, 0
},
{ //0xc2=.
0, 0, 0
},
{ //0xc3=.
0, 0, 0
},
{ //0xc4=.
0, 0, 0
},
{ //0xc5=.
0, 0, 0
},
{ //0xc6=.
0, 0, 0
},
{ //0xc7=.
0, 0, 0
},
{ //0xc8=.
0, 0, 0
},
{ //0xc9=.
0, 0, 0
},
{ //0xca=.
0, 0, 0
},
{ //0xcb=.
0, 0, 0
},
{ //0xcc=.
0, 0, 0
},
{ //0xcd=.
0, 0, 0
},
{ //0xce=.
0, 0, 0
},
{ //0xcf=.
0, 0, 0
},
{ //0xd0=.
0, 0, 0
},
{ //0xd1=.
0, 0, 0
},
{ //0xd2=.
0, 0, 0
},
{ //0xd3=.
0, 0, 0
},
{ //0xd4=.
0, 0, 0
},
{ //0xd5=.
0, 0, 0
},
{ //0xd6=.
0, 0, 0
},
{ //0xd7=.
0, 0, 0
},
{ //0xd8=.
0, 0, 0
},
{ //0xd9=.
0, 0, 0
},
{ //0xda=.
0, 0, 0
},
{ //0xdb=.
0, 0, 0
},
{ //0xdc=.
0, 0, 0
},
{ //0xdd=.
0, 0, 0
},
{ //0xde=.
0, 0, 0
},
{ //0xdf=.
0, 0, 0
},
{ //0xe0=.
0, 0, 0
},
{ //0xe1=.
0, 0, 0
},
{ //0xe2=.
0, 0, 0
},
{ //0xe3=.
0, 0, 0
},
{ //0xe4=.
0, 0, 0
},
{ //0xe5=.
0, 0, 0
},
{ //0xe6=.
0, 0, 0
},
{ //0xe7=.
0, 0, 0
},
{ //0xe8=.
0, 0, 0
},
{ //0xe9=.
0, 0, 0
},
{ //0xea=.
0, 0, 0
},
{ //0xeb=.
0, 0, 0
},
{ //0xec=.
0, 0, 0
},
{ //0xed=.
0, 0, 0
},
{ //0xee=.
0, 0, 0
},
{ //0xef=.
0, 0, 0
},
{ //0xf0=.
0, 0, 0
},
{ //0xf1=.
0, 0, 0
},
{ //0xf2=.
0, 0, 0
},
{ //0xf3=.
0, 0, 0
},
{ //0xf4=.
0, 0, 0
},
{ //0xf5=.
0, 0, 0
},
{ //0xf6=.
0, 0, 0
},
{ //0xf7=.
0, 0, 0
},
{ //0xf8=.
0, 0, 0
},
{ //0xf9=.
0, 0, 0
},
{ //0xfa=.
0, 0, 0
},
{ //0xfb=.
0, 0, 0
},
{ //0xfc=.
0, 0, 0
},
{ //0xfd=.
0, 0, 0
},
{ //0xfe=.
0, 0, 0
},
{ //0xff=.
0, 0, 0
},
};
#endif
//extern "C" double permuter_pending_threshold;
/* Similarity matcher values: read by good_choice() only when
   similarity_enable is set. */
/* Factor applied to a choice's rating when deriving its certainty. */
#define SIM_CERTAINTY_SCALE -10.0
/* Constant added to the scaled rating when deriving a certainty. */
#define SIM_CERTAINTY_OFFSET -10.0
/* Worst E*L product to stop on: good_choice() rejects a choice when
   (rating + 1) * certainty is greater than this value. */
#define SIMILARITY_FLOOR 100.0
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
/**********************************************************************
 * good_choice
 *
 * Return TRUE if the choice is confident enough to be accepted, FALSE
 * otherwise (including when choice is NULL).
 *
 * With similarity_enable set, the choice is rejected outright when
 * (rating + 1) * certainty exceeds SIMILARITY_FLOOR; otherwise a
 * certainty is derived from the rating alone.  Without it, the stored
 * certainty is used.  Either way the value must be strictly greater
 * than certainty_threshold.
 **********************************************************************/
int good_choice(A_CHOICE *choice) {
  float certainty;

  if (choice == NULL)
    return FALSE;

  if (!similarity_enable) {
    certainty = class_certainty (choice);
  }
  else {
    if ((class_probability (choice) + 1) * class_certainty (choice) >
        SIMILARITY_FLOOR)
      return FALSE;
    certainty = SIM_CERTAINTY_OFFSET +
                class_probability (choice) * SIM_CERTAINTY_SCALE;
  }

  return (certainty > certainty_threshold) ? TRUE : FALSE;
}
/**********************************************************************
 * add_document_word
 *
 * Add a word found on this document to the document specific
 * dictionary.
 *
 * Nothing is done when an external dictionary is in use
 * (letter_is_okay has been replaced), when doc_dict_enable is clear,
 * when the word is already valid, when the current word is ambiguous,
 * or when the word has fewer than two unichars.
 *
 * A word that is not a good_choice(), or that has exactly two
 * unichars, must first pass through pending_words: on first sight it
 * is (at most) added to pending_words and the function returns; only
 * when it is already pending is it added to document_words.
 * Two-unichar words are only made pending when both unichars are
 * upper case.
 *
 * If save_doc_words is set, the accepted word is also appended to the
 * file <imagefile>.doc.
 **********************************************************************/
void add_document_word(A_CHOICE *best_choice) {
  char filename[CHARS_PER_LINE];
  FILE *doc_word_file;
  char *string = class_string (best_choice);
  char *lengths = class_lengths (best_choice);
  int stringlen = strlen (lengths);  // length of word in unichars

  // Skip if using external dictionary.
  if (letter_is_okay != &def_letter_is_okay)
    return;

  if (!doc_dict_enable
      || valid_word (string) || CurrentWordAmbig () || stringlen < 2)
    return;

  if (!good_choice (best_choice) || stringlen == 2) {
    if (class_certainty (best_choice) < permuter_pending_threshold)
      return;

    if (!word_in_dawg (pending_words, string)) {
      if (stringlen > 2 ||
          (stringlen >= 2 && unicharset.get_isupper (string, lengths[0]) &&
           unicharset.get_isupper (string + lengths[0], lengths[1])))
        add_word_to_dawg(pending_words,
                         string,
                         MAX_DOC_EDGES,
                         RESERVED_DOC_EDGES);
      return;
    }
  }

  if (save_doc_words) {
    // sizeof(".doc") counts the terminating NUL, so this verifies that the
    // complete file name fits before anything is copied into the buffer.
    if (strlen (imagefile) + sizeof (".doc") <= sizeof (filename)) {
      strcpy(filename, imagefile);
      strcat (filename, ".doc");
      doc_word_file = open_file (filename, "a");
      if (doc_word_file != NULL) {
        fprintf (doc_word_file, "%s\n", string);
        fclose(doc_word_file);
      }
    }
    else {
      cprintf ("add_document_word: image file name too long, "
               "word not saved\n");
    }
  }

  add_word_to_dawg(document_words, string, MAX_DOC_EDGES, RESERVED_DOC_EDGES);
}
/**********************************************************************
 * adjust_non_word
 *
 * Penalise the rating of a string that is not a dictionary word.
 *
 * The rating is padded by RATING_PAD, multiplied by `non_word` when
 * both the case and punctuation checks pass or by `garbage` when
 * either fails, and then the pad is removed again.  The adjusted
 * choice is reported through LogNewWordChoice() together with the
 * factor used and the per-character certainties.
 **********************************************************************/
void adjust_non_word(A_CHOICE *best_choice, float certainties[]) {
  char *text;
  float adjust_factor;

  if (adjust_debug)
    cprintf ("%s %4.2f ",
             class_string (best_choice), class_probability (best_choice));

  text = class_string (best_choice);
  class_probability (best_choice) += RATING_PAD;

  if (!case_ok (text, class_lengths (best_choice))
      || punctuation_ok (text, class_lengths (best_choice)) == -1) {
    /* Bad case pattern or bad punctuation: treat as garbage. */
    class_probability (best_choice) *= garbage;
    adjust_factor = garbage;
    if (adjust_debug) {
      /* Report which of the two checks failed. */
      if (!case_ok (text, class_lengths (best_choice)))
        cprintf (", C");
      if (punctuation_ok (text, class_lengths (best_choice)) == -1)
        cprintf (", P");
      cprintf (", %4.2f ", garbage);
    }
  }
  else {
    /* Plausible but unknown word. */
    class_probability (best_choice) *= non_word;
    adjust_factor = non_word;
    if (adjust_debug)
      cprintf (", %4.2f ", non_word);
  }

  class_probability (best_choice) -= RATING_PAD;
  LogNewWordChoice(best_choice, adjust_factor, certainties);

  if (adjust_debug)
    cprintf (" --> %4.2f\n", class_probability (best_choice));
}
/**********************************************************************
 * init_permute_vars
 *
 * Run the installers named in this file's make_toggle_var and
 * make_float_var declarations, then initialise the variables of the
 * dawg and number permuters.
 **********************************************************************/
void init_permute_vars() {
  /* Variables declared in this file. */
  make_adjust_debug();
  make_compound_debug();
  make_non_word();
  make_garbage();
  make_doc_words();
  make_doc_dict();

  /* Variables owned by the other permuters. */
  init_permdawg_vars();
  init_permnum();
}
/**********************************************************************
 * init_permute
 *
 * Load the system dictionary, allocate the document and pending
 * dictionaries and read the user word list.  If the permuter is
 * already initialised (word_dawg is non-NULL) it is torn down first.
 **********************************************************************/
void init_permute() {
  if (word_dawg != NULL)
    end_permute();
  init_permdawg();

  STRING name;

  /* System dictionary. */
  name = language_data_path_prefix;
  name += "word-dawg";
  word_dawg = read_squished_dawg(name.string());

  /* Per-document dictionaries start out empty. */
  document_words =
    (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
  initialize_dawg(document_words, MAX_DOC_EDGES);

  pending_words =
    (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
  initialize_dawg(pending_words, MAX_DOC_EDGES);

  /* User dictionary is filled from the user word list. */
  user_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_USER_EDGES);
  name = language_data_path_prefix;
  name += "user-words";
  read_word_list(name.string(), user_words,
                 MAX_USER_EDGES, USER_RESERVED_EDGES);
}
/**********************************************************************
 * end_permute
 *
 * Release the four dictionaries and shut down the dawg permuter.
 * word_dawg doubles as the "initialised" flag, so a second call (or a
 * call before init_permute) does nothing.
 **********************************************************************/
void end_permute() {
  if (word_dawg != NULL) {
    memfree(word_dawg);
    word_dawg = NULL;

    memfree(document_words);
    document_words = NULL;

    memfree(pending_words);
    pending_words = NULL;

    memfree(user_words);
    user_words = NULL;

    end_permdawg();
  }
}
/**********************************************************************
 * keep_better_choice
 *
 * Helper for permute_all.  Compare the current best choice with a new
 * candidate, free the loser and return the winner.  The candidate
 * loses when it is NULL, when it carries no string, or when the
 * incumbent's rating is strictly lower than the candidate's.
 **********************************************************************/
static A_CHOICE *keep_better_choice(A_CHOICE *incumbent,
                                    A_CHOICE *candidate) {
  if (candidate == NULL
      || class_string (candidate) == NULL
      || class_probability (incumbent) < class_probability (candidate)) {
    free_choice(candidate);
    return incumbent;
  }
  free_choice(incumbent);
  return candidate;
}

/**********************************************************************
 * permute_all
 *
 * Permute all the characters together using all of the different types
 * of permuters/selectors available.  Each of the characters must have
 * a non-NIL choice list.
 *
 * The top-choice permuter always runs first (it also updates
 * raw_choice).  Then:
 *   - if ngram_permuter_activated is set, the n-gram permuter's result
 *     is returned directly;
 *   - if the top-choice permuter produced nothing, NULL is returned;
 *   - if permute_only_top is set, the top-choice result is returned;
 *   - otherwise the dictionary permuter (only when some character is
 *     alphabetic and the word is at most MAX_WERD_LENGTH long), the
 *     number permuter and the compound-word permuter are tried in
 *     turn, and the choice with the lowest rating is kept.
 *
 * Every losing choice is freed here; the returned choice is freed by
 * the caller (see permute_characters).
 **********************************************************************/
A_CHOICE *permute_all(CHOICES_LIST char_choices,
                      float rating_limit,
                      A_CHOICE *raw_choice) {
  A_CHOICE *result;
  BOOL8 any_alpha;

  result = permute_top_choice (char_choices, rating_limit, raw_choice,
                               &any_alpha);

  if (ngram_permuter_activated)
    return ngram_permute_and_select(char_choices, rating_limit, word_dawg);

  if (result == NULL)
    return (NULL);

  if (permute_only_top)
    return result;

  if (any_alpha && array_count (char_choices) <= MAX_WERD_LENGTH) {
    result = keep_better_choice (result,
                                 permute_words (char_choices, rating_limit));
  }

  result = keep_better_choice (result,
                               number_permute_and_select (char_choices,
                                                          rating_limit));

  result = keep_better_choice (result,
                               permute_compound_words (char_choices,
                                                       rating_limit));

  return (result);
}
/**********************************************************************
 * permute_characters
 *
 * Run permute_all on these characters and, if the result has a
 * strictly lower rating than best_choice, copy it into best_choice.
 * The result returned by permute_all is always freed here.  Each call
 * increments the global permutation_count.
 **********************************************************************/
void permute_characters(CHOICES_LIST char_choices,
                        float limit,
                        A_CHOICE *best_choice,
                        A_CHOICE *raw_choice) {
  A_CHOICE *candidate;

  permutation_count++;         /* Global counter */

  candidate = permute_all (char_choices, limit, raw_choice);
  if (candidate != NULL &&
      class_probability (best_choice) > class_probability (candidate)) {
    clone_choice(best_choice, candidate);
  }
  free_choice(candidate);

  if (display_ratings) {
    cprintf ("permute_characters: %-15s %4.2f %4.2f\n",
             class_string (best_choice),
             class_probability (best_choice), class_certainty (best_choice));
  }
}
/**********************************************************************
 * permute_compound_words
 *
 * Try to read the word as a compound: split it wherever the top choice
 * for a character starts with '-' or '/', permute each part on its own
 * with permute_subword, and join the parts and separators back
 * together.
 *
 * Returns:
 *   - a COMPOUND_PERM choice when at least one separator was found,
 *     something follows the last separator and the accumulated rating
 *     is within rating_limit;
 *   - a string-less choice with rating MAX_FLOAT32 when the word has
 *     more than MAX_WERD_LENGTH characters;
 *   - NULL otherwise.
 **********************************************************************/
A_CHOICE *permute_compound_words(CHOICES_LIST character_choices,
                                 float rating_limit) {
  A_CHOICE *first_choice;
  A_CHOICE *best_choice = NULL;
  char word[UNICHAR_LEN * MAX_WERD_LENGTH + 1];
  char unichar_lengths[MAX_WERD_LENGTH + 1];
  float rating = 0;
  float certainty = 10000;
  char char_choice;
  int x;
  int first_index = 0;            /* start of the current sub-word */
  char *ptr;

  word[0] = '\0';
  unichar_lengths[0] = 0;

  if (array_count (character_choices) > MAX_WERD_LENGTH) {
    return (new_choice (NULL, NULL, MAX_FLOAT32, -MAX_FLOAT32, -1, NO_PERM));
  }

  array_loop(character_choices, x) {
    first_choice =
      (A_CHOICE *) first_node ((CHOICES) array_value (character_choices, x));
    ptr = class_string (first_choice);
    char_choice = ptr != NULL ? *ptr : '\0';

    /* A separator only splits when something precedes it in the current
       sub-word; a leading separator is left for permute_subword. */
    if (x > first_index && (char_choice == '-' || char_choice == '/')) {
      if (compound_debug)
        cprintf ("Hyphenated word found\n");

      permute_subword (character_choices, rating_limit,
                       first_index, x - 1, word, unichar_lengths,
                       &rating, &certainty);
      if (rating > rating_limit)
        break;

      first_index = x + 1;

      /* Append the separator itself. */
      strcat(word, class_string (first_choice));
      /* Explicit cast: a size_t inside a braced char initializer is a
         narrowing conversion, which C++11 rejects. */
      char length[] = {(char) strlen(class_string (first_choice)), 0};
      /* Append at the terminator.  Starting the scan at index x would read
         uninitialised bytes whenever fewer than x lengths have been
         stored so far. */
      strcat(unichar_lengths, length);

      rating += class_probability (first_choice);
      certainty = min (class_certainty (first_choice), certainty);
    }
  }

  /* Handle whatever follows the last separator. */
  if (first_index > 0 && first_index < x && rating <= rating_limit) {
    permute_subword (character_choices, rating_limit,
                     first_index, x - 1, word, unichar_lengths,
                     &rating, &certainty);
    best_choice = new_choice (word, unichar_lengths, rating,
                              certainty, -1, COMPOUND_PERM);
  }
  return (best_choice);
}
/**********************************************************************
 * permute_subword
 *
 * Permute a part of a compound word; this subword is bounded by
 * hyphens and the start and end of the word.  Call the standard word
 * permute function on a set of choices covering only part of the
 * original word.  When it is done reclaim the memory that was used in
 * the exercise.
 *
 * character_choices  choices for the whole word
 * rating_limit       limit handed on to permute_all
 * start, end         inclusive index range of this subword
 * word               in/out: text accumulated so far; the result for
 *                    this subword is appended
 * unichar_lengths    in/out: byte length of each unichar in word
 * rating             in/out: this subword's rating is added, or the
 *                    value is set to MAX_FLOAT32 on failure
 * certainty          in/out: lowered to this subword's certainty if
 *                    that is smaller
 *
 * Characters in the range whose top choice starts with '-' or '/' are
 * copied straight to the output; the remaining ones are permuted
 * together.  Choice accumulation is disabled for the duration of the
 * call.
 **********************************************************************/
void permute_subword(CHOICES_LIST character_choices,
                     float rating_limit,
                     int start,
                     int end,
                     char *word,
                     char unichar_lengths[],
                     float *rating,
                     float *certainty) {
  int x;
  A_CHOICE *best_choice = NULL;
  A_CHOICE raw_choice;
  CHOICES_LIST subchoices;
  CHOICES choices;
  char this_char;
  char *ptr;

  DisableChoiceAccum();

  /* Scratch raw choice for permute_all to fill in; released below. */
  raw_choice.string = NULL;
  raw_choice.lengths = NULL;
  raw_choice.rating = MAX_INT16;
  raw_choice.certainty = -MAX_INT16;

  subchoices = new_choice_list ();
  for (x = start; x <= end; x++) {
    choices = (CHOICES) array_value (character_choices, x);
    ptr = best_string (choices);
    this_char = ptr != NULL ? *ptr : '\0';
    if (this_char != '-' && this_char != '/') {
      subchoices = array_push (subchoices, choices);
    } else {
      /* ptr is non-NULL here because this_char is '-' or '/'. */
      strcat(word, ptr);
      /* Explicit cast: a size_t inside a braced char initializer is a
         narrowing conversion, which C++11 rejects. */
      char length[] = {(char) strlen(ptr), 0};
      /* Append at the terminator rather than scanning from index x, which
         may lie beyond the end of the lengths stored so far. */
      strcat(unichar_lengths, length);
    }
  }

  if (array_count (subchoices)) {
    if (compound_debug)
      dawg_debug = TRUE;
    best_choice = permute_all (subchoices, rating_limit, &raw_choice);
    if (compound_debug)
      dawg_debug = FALSE;

    if (best_choice && class_string (best_choice)) {
      strcat (word, class_string (best_choice));
      strcat (unichar_lengths, class_lengths (best_choice));
      *rating += class_probability (best_choice);
      *certainty = min (class_certainty (best_choice), *certainty);
    }
    else {
      *rating = MAX_FLOAT32;
    }
  }
  else {
    /* Nothing but separators in this range. */
    *rating = MAX_FLOAT32;
  }

  free_choice_list(subchoices);
  if (best_choice)
    free_choice(best_choice);

  if (compound_debug && *rating < MAX_FLOAT32) {
    cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
             word, *rating, *certainty);
  }

  if (raw_choice.string)
    strfree(raw_choice.string);
  if (raw_choice.lengths)
    strfree(raw_choice.lengths);

  EnableChoiceAccum();
}
/**********************************************************************
 * permute_top_choice
 *
 * Return the top choice for each character as the choice for the word.
 * In addition a choice is created for the best lower and upper case
 * non-words.  In each character position the best lower (or upper) case
 * character is substituted for the best overall character.
 *
 * character_choices  one choice list per character position
 * rating_limit       give up (return NULL) once the summed top-choice
 *                    rating exceeds this
 * raw_choice         in/out: replaced by the plain top-choice word when
 *                    that has a lower rating than the current content
 * any_alpha          optional out: set TRUE if any examined choice is
 *                    alphabetic, FALSE otherwise
 *
 * Where the top choice is "l" and equally rated "1"/"I" alternatives
 * follow it, choose_il1 picks among them from the neighbouring
 * characters.
 *
 * Returns the best (lowest rated after adjust_non_word) of the top,
 * lower-case and upper-case candidates.  Returns NULL when there are
 * more than MAX_PERM_LENGTH positions, when a position has an empty
 * choice list, when the rating limit is exceeded, or when
 * ngram_permuter_activated is set (raw_choice is still updated in
 * that last case).
 **********************************************************************/
A_CHOICE *permute_top_choice(CHOICES_LIST character_choices,
                             float rating_limit,
                             A_CHOICE *raw_choice,
                             BOOL8 *any_alpha) {
  CHOICES char_list;
  A_CHOICE *first_choice;
  A_CHOICE *best_choice;
  A_CHOICE *other_choice;
  const char *ptr;
  const char *first_char;          //first choice
  const char *second_char;         //second choice
  const char *third_char;          //third choice
  char prev_char[UNICHAR_LEN + 1]; //prev in word
  const char *next_char = "";      //next in word
  const char *next_next_char = ""; //after next next in word

  char word[UNICHAR_LEN * MAX_PERM_LENGTH + 1];
  char capital_word[UNICHAR_LEN * MAX_PERM_LENGTH + 1];
  char lower_word[UNICHAR_LEN * MAX_PERM_LENGTH + 1];

  char word_lengths[MAX_PERM_LENGTH + 1];
  char capital_word_lengths[MAX_PERM_LENGTH + 1];
  char lower_word_lengths[MAX_PERM_LENGTH + 1];

  int x;
  int x_word = 0;                  // byte offsets into the three words
  int x_capital_word = 0;
  int x_lower_word = 0;
  BOOL8 char_alpha;
  float rating = 0;
  float upper_rating = 0;
  float lower_rating = 0;
  float first_rating = 0;

  float certainty = 10000;
  float upper_certainty = 10000;
  float lower_certainty = 10000;

  float certainties[MAX_PERM_LENGTH + 1];
  float lower_certainties[MAX_PERM_LENGTH + 1];
  float upper_certainties[MAX_PERM_LENGTH + 1];

  CHOICES this_char;
  const char* ch;
  INT8 lower_done;
  INT8 upper_done;

  prev_char[0] = '\0';

  if (any_alpha != NULL)
    *any_alpha = FALSE;

  if (array_count (character_choices) > MAX_PERM_LENGTH) {
    return (NULL);
  }

  array_loop(character_choices, x) {
    /* Look ahead one and two positions for choose_il1.  An empty list
       there is treated like a blank; it is reported when the loop
       reaches that position. */
    if (x + 1 < array_count (character_choices)) {
      char_list = (CHOICES) array_value (character_choices, x + 1);
      first_choice = (A_CHOICE *) first_node (char_list);
      ptr = first_choice != NULL ? class_string (first_choice) : NULL;
      next_char = (ptr != NULL && *ptr != '\0') ? ptr : " ";
    }
    else
      next_char = "";

    if (x + 2 < array_count (character_choices)) {
      char_list = (CHOICES) array_value (character_choices, x + 2);
      first_choice = (A_CHOICE *) first_node (char_list);
      ptr = first_choice != NULL ? class_string (first_choice) : NULL;
      next_next_char = (ptr != NULL && *ptr != '\0') ? ptr : " ";
    }
    else
      next_next_char = "";

    char_list = (CHOICES) array_value (character_choices, x);
    first_choice = (A_CHOICE *) first_node (char_list);

    /* This check must come before first_choice is dereferenced below. */
    if (first_choice == NULL) {
      cprintf ("Permuter giving up due to null choices list");
      word[x_word] = ' ';
      word[x_word + 1] = '$';
      word[x_word + 2] = '\0';
      cprintf (" word=%s\n", word);
      return (NULL);
    }

    /* Start all three words with the top choice (a blank if it has no
       text). */
    ptr = class_string (first_choice);
    if (ptr != NULL && *ptr != '\0')
    {
      strcpy(word + x_word, ptr);
      word_lengths[x] = strlen(ptr);

      strcpy(capital_word + x_capital_word, ptr);
      capital_word_lengths[x] = strlen(ptr);

      strcpy(lower_word + x_lower_word, ptr);
      lower_word_lengths[x] = strlen(ptr);
    }
    else
    {
      word[x_word] = ' ';
      word_lengths[x] = 1;

      capital_word[x_capital_word] = ' ';
      capital_word_lengths[x] = 1;

      lower_word[x_lower_word] = ' ';
      lower_word_lengths[x] = 1;
    }

    first_char = (ptr != NULL && *ptr != '\0') ? ptr : " ";
    first_rating = class_probability (first_choice);
    upper_rating += class_probability (first_choice);
    lower_rating += class_probability (first_choice);
    lower_certainty = min (class_certainty (first_choice), lower_certainty);
    upper_certainty = min (class_certainty (first_choice), upper_certainty);

    certainties[x] = class_certainty (first_choice);
    lower_certainties[x] = class_certainty (first_choice);
    upper_certainties[x] = class_certainty (first_choice);

    lower_done = FALSE;
    upper_done = FALSE;
    char_alpha = FALSE;
    second_char = "";
    third_char = "";

    iterate_list(this_char, char_list) {
      ptr = best_string (this_char);
      ch = ptr != NULL ? ptr : "";

      /* "l" followed by equally rated "1"/"I": let choose_il1 decide. */
      if (strcmp(ch, "l") == 0 && rest (this_char) != NULL
          && best_probability (rest (this_char)) == first_rating) {
        ptr = best_string (rest (this_char));
        if (ptr != NULL && (strcmp(ptr, "1") == 0 || strcmp(ptr, "I") == 0)) {
          second_char = ptr;
          this_char = rest (this_char);
          if (rest (this_char) != NULL
              && best_probability (rest (this_char)) == first_rating) {
            ptr = best_string (rest (this_char));
            if (ptr != NULL &&
                (strcmp(ptr, "1") == 0 || strcmp(ptr, "I") == 0)) {
              third_char = ptr;
              this_char = rest (this_char);
            }
          }
          ch = choose_il1 (first_char, second_char, third_char,
                           prev_char, next_char, next_next_char);
          if (strcmp(ch, "l") != 0 && word_lengths[x] == 1 &&
              word[x_word] == 'l') {
            word[x_word] = *ch;
            lower_word[x_lower_word] = *ch;
            capital_word[x_capital_word] = *ch;
          }
        }
      }

      if (ch != NULL && *ch != '\0') {
        /* Find lower case; an upper-case letter is also accepted in the
           first position. */
        if (!lower_done && (unicharset.get_islower(ch) ||
                            (unicharset.get_isupper(ch) && x == 0))) {
          strcpy(lower_word + x_lower_word, ch);
          lower_word_lengths[x] = strlen(ch);
          lower_rating += best_probability (this_char);
          lower_rating -= class_probability (first_choice);
          lower_certainty = min (best_certainty (this_char), lower_certainty);
          lower_certainties[x] = best_certainty (this_char);
          lower_done = TRUE;
        }
        /* Find upper case */
        if (!upper_done && unicharset.get_isupper(ch)) {
          strcpy(capital_word + x_capital_word, ch);
          capital_word_lengths[x] = strlen(ch);
          upper_rating += best_probability (this_char);
          upper_rating -= class_probability (first_choice);
          upper_certainty = min (best_certainty (this_char), upper_certainty);
          upper_certainties[x] = best_certainty (this_char);
          upper_done = TRUE;
        }
        if (!char_alpha && unicharset.get_isalpha(ch))
          char_alpha = TRUE;
        if (lower_done && upper_done)
          break;
      }
    }

    if (char_alpha && any_alpha != NULL)
      *any_alpha = TRUE;

    rating += class_probability (first_choice);
    if (rating > rating_limit)
      return (NULL);

    certainty = min (class_certainty (first_choice), certainty);

    /* Remember this position's final text for the next choose_il1 call. */
    strncpy(prev_char, word + x_word, word_lengths[x]);
    prev_char[word_lengths[x]] = '\0';

    x_word += word_lengths[x];
    x_capital_word += capital_word_lengths[x];
    x_lower_word += lower_word_lengths[x];
  }

  word[x_word] = '\0';
  word_lengths[x] = 0;

  capital_word[x_capital_word] = '\0';
  capital_word_lengths[x] = 0;

  lower_word[x_lower_word] = '\0';
  lower_word_lengths[x] = 0;

  /* The unadjusted top-choice word becomes the raw choice if it is better. */
  if (rating < class_probability (raw_choice)) {
    if (class_string (raw_choice))
      strfree (class_string (raw_choice));
    if (class_lengths (raw_choice))
      strfree (class_lengths (raw_choice));

    class_probability (raw_choice) = rating;
    class_certainty (raw_choice) = certainty;
    class_string (raw_choice) = strsave (word);
    class_lengths (raw_choice) = strsave (word_lengths);
    class_permuter (raw_choice) = TOP_CHOICE_PERM;

    LogNewRawChoice (raw_choice, 1.0, certainties);
  }

  if (ngram_permuter_activated)
    return NULL;

  best_choice = new_choice (word, word_lengths,
                            rating, certainty, -1, TOP_CHOICE_PERM);
  adjust_non_word(best_choice, certainties);

  other_choice = new_choice (lower_word, lower_word_lengths,
                             lower_rating, lower_certainty,
                             -1, LOWER_CASE_PERM);
  adjust_non_word(other_choice, lower_certainties);
  if (class_probability (best_choice) > class_probability (other_choice)) {
    clone_choice(best_choice, other_choice);
  }
  free_choice(other_choice);

  other_choice = new_choice (capital_word, capital_word_lengths,
                             upper_rating, upper_certainty,
                             -1, UPPER_CASE_PERM);
  adjust_non_word(other_choice, upper_certainties);
  if (class_probability (best_choice) > class_probability (other_choice)) {
    clone_choice(best_choice, other_choice);
  }
  free_choice(other_choice);

  return (best_choice);
}
/* Helpers for choose_il1: each is true only for a non-empty string of the
   given character class, so the unicharset is never queried with a
   zero-length string. */
static bool il1_nonempty_isupper(const char *str, int length) {
  return length != 0 && unicharset.get_isupper (str, length);
}

static bool il1_nonempty_islower(const char *str, int length) {
  return length != 0 && unicharset.get_islower (str, length);
}

static bool il1_nonempty_isdigit(const char *str, int length) {
  return length != 0 && unicharset.get_isdigit (str, length);
}

static bool il1_nonempty_isalpha(const char *str, int length) {
  return length != 0 && unicharset.get_isalpha (str, length);
}

/**********************************************************************
 * choose_il1
 *
 * Choose between the candidate il1 chars.
 *
 * Only acts when first_char starts with 'l' and second_char is not
 * empty.  "I" replaces it when an upper-case neighbour is present on
 * one side and the other side is neither lower case nor a digit.
 * Otherwise, if "1" is among the alternatives, "1" replaces it when a
 * digit is adjacent (or follows an 'l'), or when no lower-case letter
 * is adjacent (an "st" suffix is allowed) and the character is not a
 * lone neighbour of a quote at the edge of the word.
 *
 * Returns first_char, second_char, or the literal "1".
 **********************************************************************/
const char* choose_il1(const char *first_char,        //first choice
                       const char *second_char,       //second choice
                       const char *third_char,        //third choice
                       const char *prev_char,         //prev in word
                       const char *next_char,         //next in word
                       const char *next_next_char) {  //after next next in word
  INT32 type1;                   //1/I/l type of first choice
  INT32 type2;                   //1/I/l type of second choice
  INT32 type3;                   //1/I/l type of third choice

  const int prev_len = strlen(prev_char);
  const int next_len = strlen(next_char);
  const int next_next_len = strlen(next_next_char);

  if (*first_char != 'l' || *second_char == '\0')
    return first_char;

  if (*second_char == 'I'
      && ((il1_nonempty_isupper (prev_char, prev_len) &&
           !il1_nonempty_islower (next_char, next_len) &&
           !il1_nonempty_isdigit (next_char, next_len)) ||
          (il1_nonempty_isupper (next_char, next_len) &&
           !il1_nonempty_islower (prev_char, prev_len) &&
           !il1_nonempty_isdigit (prev_char, prev_len)))) {
    first_char = second_char;    //override
  }
  else if (*second_char == '1' || *third_char == '1') {
    if (il1_nonempty_isdigit (next_char, next_len) ||
        il1_nonempty_isdigit (prev_char, prev_len) ||
        (*next_char == 'l' &&
         il1_nonempty_isdigit (next_next_char, next_next_len))) {
      first_char = "1";
    }
    else if (!il1_nonempty_islower (prev_char, prev_len) &&
             (!il1_nonempty_islower (next_char, next_len) ||
              (*next_char == 's' && *next_next_char == 't'))) {
      /* Leave a lone character next to a quote alone. */
      if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
          && ((*next_char != '\'' && *next_char != '`')
              || *prev_char != '\0')) {
        first_char = "1";
      }
    }
  }

  if (*first_char == 'l' && *next_char != '\0' &&
      !il1_nonempty_isalpha (prev_char, prev_len)) {
    /* Classify each candidate as 0 ('1'), 1 ('I') or 2 ('l').  These
       values are only read by the disabled bigram code below. */
    type1 = 2;

    type2 = (*second_char == '1') ? 0
          : (*second_char == 'I') ? 1
          : (*second_char == 'l') ? 2
          : type1;

    type3 = (*third_char == '1') ? 0
          : (*third_char == 'I') ? 1
          : (*third_char == 'l') ? 2
          : type1;

#if 0
    if (bigram_counts[*next_char][type2] >
    bigram_counts[*next_char][type1]) {
      first_char = second_char;
      type1 = type2;
    }
    if (bigram_counts[*next_char][type3] >
    bigram_counts[*next_char][type1]) {
      first_char = third_char;
    }
#endif
  }
  return first_char;
}
/**********************************************************************
 * permute_words
 *
 * Permute all the characters together using the dawgs to prune all
 * but the valid words.  The system, document and user dictionaries
 * are searched in that order, each updating the same best choice.
 *
 * Always returns a newly created choice.  When the word (including
 * any hyphen base) is longer than MAX_WERD_LENGTH no search is made
 * and the choice's rating is set to MAX_FLOAT32.
 **********************************************************************/
A_CHOICE *permute_words(CHOICES_LIST char_choices, float rating_limit) {
  A_CHOICE *best_choice =
    new_choice (NULL, NULL, rating_limit, -MAX_FLOAT32, -1, NO_PERM);

  if (hyphen_base_size() + array_count (char_choices) > MAX_WERD_LENGTH) {
    class_probability (best_choice) = MAX_FLOAT32;
    return (best_choice);
  }

  dawg_permute_and_select ("system words:", word_dawg, SYSTEM_DAWG_PERM,
                           char_choices, best_choice, TRUE);

  dawg_permute_and_select ("document_words", document_words,
                           DOC_DAWG_PERM, char_choices, best_choice,
                           FALSE);

  dawg_permute_and_select ("user words", user_words, USER_DAWG_PERM,
                           char_choices, best_choice, FALSE);

  return (best_choice);
}
/**********************************************************************
 * valid_word
 *
 * Check all the DAWGs to see if this word is in any of them.
 *
 * Returns the permuter code of the first dictionary that contains the
 * word, checked in the order system, document, user; returns NO_PERM
 * when none of them does.
 **********************************************************************/
int valid_word(const char *string) {
  if (word_in_dawg (word_dawg, string))
    return (SYSTEM_DAWG_PERM);

  if (word_in_dawg (document_words, string))
    return (DOC_DAWG_PERM);

  if (word_in_dawg (user_words, string))
    return (USER_DAWG_PERM);

  return (NO_PERM);
}