mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 06:53:36 +08:00
0371d16fe1
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@141 d0cd1f9f-072b-0410-8dd7-cf729c803f20
1708 lines
44 KiB
C++
1708 lines
44 KiB
C++
/* -*-C-*-
|
|
********************************************************************************
|
|
*
|
|
* File: permute.c (Formerly permute.c)
|
|
* Description: Handle the new ratings choices for Wise Owl
|
|
* Author: Mark Seaman, OCR Technology
|
|
* Created: Fri Sep 22 14:05:51 1989
|
|
* Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
|
|
* Language: C
|
|
* Package: N/A
|
|
* Status: Experimental (Do Not Distribute)
|
|
*
|
|
* (c) Copyright 1989, Hewlett-Packard Company.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
*********************************************************************************/
|
|
/*----------------------------------------------------------------------
|
|
I n c l u d e s
|
|
---------------------------------------------------------------------*/
|
|
#include "permute.h"
|
|
#include "globals.h"
|
|
#include "permdawg.h"
|
|
#include "debug.h"
|
|
#include "tordvars.h"
|
|
#include "hyphen.h"
|
|
#include "stopper.h"
|
|
#include "trie.h"
|
|
#include "context.h"
|
|
#include "permnum.h"
|
|
#include "freelist.h"
|
|
#include "callcpp.h"
|
|
#include "permngram.h"
|
|
|
|
#include <math.h>
|
|
|
|
int permutation_count; // Used in metrics.cpp.
|
|
/*----------------------------------------------------------------------
|
|
V a r i a b l e s
|
|
----------------------------------------------------------------------*/
|
|
// TODO(tkielbus) Choose a value for the MAX_NUM_EDGES constant
|
|
// (or make it dynamic)
|
|
#define MAX_NUM_EDGES 2000000
|
|
#define MAX_DOC_EDGES 250000
|
|
#define RESERVED_DOC_EDGES 10000
|
|
#define MAX_USER_EDGES 50000
|
|
#define USER_RESERVED_EDGES 2000
|
|
/* Weights for adjustment */
|
|
#define NON_WERD 1.25
|
|
#define GARBAGE_STRING 1.5
|
|
#define MAX_PERM_LENGTH 128
|
|
|
|
EDGE_ARRAY pending_words;
|
|
EDGE_ARRAY document_words;
|
|
EDGE_ARRAY user_words;
|
|
EDGE_ARRAY word_dawg;
|
|
|
|
make_toggle_var (adjust_debug, 0, make_adjust_debug,
|
|
8, 13, set_adjust_debug, "Adjustment Debug");
|
|
|
|
make_toggle_var (compound_debug, 0, make_compound_debug,
|
|
8, 14, set_compound_debug, "Compound Debug");
|
|
|
|
make_float_var (non_word, NON_WERD, make_non_word,
|
|
8, 20, set_non_word, "Non-word adjustment");
|
|
|
|
make_float_var (garbage, GARBAGE_STRING, make_garbage,
|
|
8, 21, set_garbage, "Garbage adjustment");
|
|
|
|
make_toggle_var (save_doc_words, 0, make_doc_words,
|
|
8, 22, set_doc_words, "Save Document Words ");
|
|
|
|
make_toggle_var (doc_dict_enable, 1, make_doc_dict,
|
|
8, 25, set_doc_dict, "Enable Document Dictionary ");
|
|
/* PREV DEFAULT 0 */
|
|
|
|
BOOL_VAR(ngram_permuter_activated, FALSE,
|
|
"Activate character-level n-gram-based permuter");
|
|
|
|
int permute_only_top = 0;
|
|
|
|
#if 0
|
|
//0x0=.
|
|
static INT32 bigram_counts[256][3] = { {
|
|
0, 0, 0
|
|
},
|
|
{ //0x1=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x2=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x3=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x4=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x5=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x6=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x7=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x8=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x9=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa=.
|
|
93, 28, 0
|
|
},
|
|
{ //0xb=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x10=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x11=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x12=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x13=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x14=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x15=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x16=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x17=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x18=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x19=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x1a=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x1b=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x1c=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x1d=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x1e=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x1f=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x20=
|
|
324, 377, 2
|
|
},
|
|
{ //0x21=!
|
|
2, 1, 0
|
|
},
|
|
{ //0x22="
|
|
2, 1, 0
|
|
},
|
|
{ //0x23=#
|
|
1, 0, 1
|
|
},
|
|
{ //0x24=$
|
|
2, 1, 0
|
|
},
|
|
{ //0x25=%
|
|
2, 0, 0
|
|
},
|
|
{ //0x26=&
|
|
2, 1, 0
|
|
},
|
|
{ //0x27='
|
|
1, 21, 8
|
|
},
|
|
{ //0x28=(
|
|
2, 1, 0
|
|
},
|
|
{ //0x29=)
|
|
19, 0, 0
|
|
},
|
|
{ //0x2a=*
|
|
2, 1, 0
|
|
},
|
|
{ //0x2b=+
|
|
1, 0, 0
|
|
},
|
|
{ //0x2c=,
|
|
75, 4, 0
|
|
},
|
|
{ //0x2d=-
|
|
52, 7, 0
|
|
},
|
|
{ //0x2e=.
|
|
190, 16, 3
|
|
},
|
|
{ //0x2f=/
|
|
53, 2, 0
|
|
},
|
|
{ //0x30=0
|
|
399, 0, 0
|
|
},
|
|
{ //0x31=1
|
|
220, 0, 0
|
|
},
|
|
{ //0x32=2
|
|
226, 0, 0
|
|
},
|
|
{ //0x33=3
|
|
128, 0, 0
|
|
},
|
|
{ //0x34=4
|
|
147, 0, 0
|
|
},
|
|
{ //0x35=5
|
|
179, 0, 1
|
|
},
|
|
{ //0x36=6
|
|
173, 0, 0
|
|
},
|
|
{ //0x37=7
|
|
115, 0, 0
|
|
},
|
|
{ //0x38=8
|
|
107, 0, 0
|
|
},
|
|
{ //0x39=9
|
|
934, 0, 1
|
|
},
|
|
{ //0x3a=:
|
|
27, 0, 1
|
|
},
|
|
{ //0x3b=;
|
|
2, 1, 0
|
|
},
|
|
{ //0x3c=<
|
|
2, 1, 0
|
|
},
|
|
{ //0x3d==
|
|
2, 1, 0
|
|
},
|
|
{ //0x3e=>
|
|
2, 1, 0
|
|
},
|
|
{ //0x3f=?
|
|
2, 1, 0
|
|
},
|
|
{ //0x40=@
|
|
2, 1, 0
|
|
},
|
|
{ //0x41=A
|
|
3, 1, 0
|
|
},
|
|
{ //0x42=B
|
|
1, 73, 0
|
|
},
|
|
{ //0x43=C
|
|
1, 6, 0
|
|
},
|
|
{ //0x44=D
|
|
1, 24, 0
|
|
},
|
|
{ //0x45=E
|
|
1, 2, 0
|
|
},
|
|
{ //0x46=F
|
|
1, 19, 0
|
|
},
|
|
{ //0x47=G
|
|
1, 2, 0
|
|
},
|
|
{ //0x48=H
|
|
3, 2, 1
|
|
},
|
|
{ //0x49=I
|
|
0, 68, 0
|
|
},
|
|
{ //0x4a=J
|
|
1, 2, 0
|
|
},
|
|
{ //0x4b=K
|
|
1, 2, 0
|
|
},
|
|
{ //0x4c=L
|
|
1, 82, 0
|
|
},
|
|
{ //0x4d=M
|
|
10, 10, 0
|
|
},
|
|
{ //0x4e=N
|
|
3, 239, 0
|
|
},
|
|
{ //0x4f=O
|
|
1, 10, 0
|
|
},
|
|
{ //0x50=P
|
|
0, 1, 3
|
|
},
|
|
{ //0x51=Q
|
|
2, 3, 0
|
|
},
|
|
{ //0x52=R
|
|
1, 43, 0
|
|
},
|
|
{ //0x53=S
|
|
1, 53, 0
|
|
},
|
|
{ //0x54=T
|
|
2, 18, 0
|
|
},
|
|
{ //0x55=U
|
|
1, 2, 0
|
|
},
|
|
{ //0x56=V
|
|
1, 17, 0
|
|
},
|
|
{ //0x57=W
|
|
1, 5, 0
|
|
},
|
|
{ //0x58=X
|
|
1, 6, 0
|
|
},
|
|
{ //0x59=Y
|
|
1, 2, 0
|
|
},
|
|
{ //0x5a=Z
|
|
1, 2, 0
|
|
},
|
|
{ //0x5b=[
|
|
2, 1, 0
|
|
},
|
|
{ //0x5c=backslash
|
|
2, 1, 0
|
|
},
|
|
{ //0x5d=]
|
|
2, 1, 0
|
|
},
|
|
{ //0x5e=^
|
|
2, 1, 0
|
|
},
|
|
{ //0x5f=_
|
|
2, 1, 0
|
|
},
|
|
{ //0x60=`
|
|
1, 0, 2
|
|
},
|
|
{ //0x61=a
|
|
0, 0, 671
|
|
},
|
|
{ //0x62=b
|
|
0, 1, 16
|
|
},
|
|
{ //0x63=c
|
|
0, 2, 1
|
|
},
|
|
{ //0x64=d
|
|
0, 14, 0
|
|
},
|
|
{ //0x65=e
|
|
0, 0, 763
|
|
},
|
|
{ //0x66=f
|
|
0, 186, 0
|
|
},
|
|
{ //0x67=g
|
|
0, 2, 1
|
|
},
|
|
{ //0x68=h
|
|
0, 2, 1
|
|
},
|
|
{ //0x69=i
|
|
0, 0, 818
|
|
},
|
|
{ //0x6a=j
|
|
0, 2, 1
|
|
},
|
|
{ //0x6b=k
|
|
0, 4, 1
|
|
},
|
|
{ //0x6c=l
|
|
0, 26, 3
|
|
},
|
|
{ //0x6d=m
|
|
0, 69, 0
|
|
},
|
|
{ //0x6e=n
|
|
0, 885, 0
|
|
},
|
|
{ //0x6f=o
|
|
0, 17, 722
|
|
},
|
|
{ //0x70=p
|
|
0, 1, 5
|
|
},
|
|
{ //0x71=q
|
|
2, 1, 0
|
|
},
|
|
{ //0x72=r
|
|
0, 21, 0
|
|
},
|
|
{ //0x73=s
|
|
3, 49, 0
|
|
},
|
|
{ //0x74=t
|
|
0, 219, 5
|
|
},
|
|
{ //0x75=u
|
|
0, 0, 56
|
|
},
|
|
{ //0x76=v
|
|
0, 4, 0
|
|
},
|
|
{ //0x77=w
|
|
0, 2, 1
|
|
},
|
|
{ //0x78=x
|
|
0, 2, 1
|
|
},
|
|
{ //0x79=y
|
|
0, 1, 23
|
|
},
|
|
{ //0x7a=z
|
|
0, 2, 1
|
|
},
|
|
{ //0x7b={
|
|
2, 1, 0
|
|
},
|
|
{ //0x7c=|
|
|
59, 0, 3
|
|
},
|
|
{ //0x7d=}
|
|
2, 1, 0
|
|
},
|
|
{ //0x7e=~
|
|
2, 1, 0
|
|
},
|
|
{ //0x7f=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x80=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x81=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x82=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x83=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x84=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x85=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x86=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x87=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x88=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x89=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x8a=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x8b=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x8c=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x8d=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x8e=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x8f=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x90=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x91=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x92=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x93=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x94=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x95=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x96=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x97=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x98=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x99=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x9a=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x9b=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x9c=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x9d=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x9e=.
|
|
0, 0, 0
|
|
},
|
|
{ //0x9f=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa0=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa1=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa2=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa3=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa4=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa5=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa6=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa7=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa8=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xa9=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xaa=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xab=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xac=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xad=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xae=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xaf=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb0=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb1=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb2=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb3=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb4=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb5=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb6=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb7=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb8=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xb9=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xba=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xbb=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xbc=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xbd=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xbe=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xbf=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc0=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc1=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc2=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc3=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc4=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc5=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc6=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc7=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc8=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xc9=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xca=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xcb=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xcc=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xcd=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xce=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xcf=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd0=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd1=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd2=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd3=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd4=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd5=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd6=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd7=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd8=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xd9=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xda=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xdb=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xdc=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xdd=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xde=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xdf=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe0=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe1=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe2=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe3=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe4=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe5=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe6=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe7=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe8=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xe9=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xea=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xeb=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xec=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xed=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xee=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xef=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf0=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf1=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf2=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf3=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf4=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf5=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf6=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf7=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf8=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xf9=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xfa=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xfb=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xfc=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xfd=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xfe=.
|
|
0, 0, 0
|
|
},
|
|
{ //0xff=.
|
|
0, 0, 0
|
|
},
|
|
};
|
|
#endif
|
|
|
|
//extern "C" double permuter_pending_threshold;
|
|
|
|
/* Similarity matcher values */
|
|
#define SIM_CERTAINTY_SCALE -10.0
|
|
/* Similarity matcher values */
|
|
#define SIM_CERTAINTY_OFFSET -10.0
|
|
/* Worst E*L product to stop on */
|
|
#define SIMILARITY_FLOOR 100.0
|
|
/*----------------------------------------------------------------------
|
|
F u n c t i o n s
|
|
----------------------------------------------------------------------*/
|
|
|
|
/**********************************************************************
|
|
* good_choice
|
|
*
|
|
* Return TRUE if a good answer is found for the unknown blob rating.
|
|
**********************************************************************/
|
|
int good_choice(A_CHOICE *choice) {
|
|
register float certainty;
|
|
if (choice == NULL)
|
|
return (FALSE);
|
|
if (similarity_enable) {
|
|
if ((class_probability (choice) + 1) * class_certainty (choice) >
|
|
SIMILARITY_FLOOR)
|
|
return (FALSE);
|
|
certainty =
|
|
SIM_CERTAINTY_OFFSET +
|
|
class_probability (choice) * SIM_CERTAINTY_SCALE;
|
|
}
|
|
|
|
else {
|
|
certainty = class_certainty (choice);
|
|
}
|
|
if (certainty > certainty_threshold) {
|
|
return (TRUE);
|
|
}
|
|
|
|
else {
|
|
return (FALSE);
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* add_document_word
|
|
*
|
|
* Add a word found on this document to the document specific
|
|
* dictionary.
|
|
**********************************************************************/
|
|
void add_document_word(A_CHOICE *best_choice) {
|
|
char filename[CHARS_PER_LINE];
|
|
FILE *doc_word_file;
|
|
char *string;
|
|
char *lengths;
|
|
int stringlen; //length of word
|
|
|
|
string = class_string (best_choice);
|
|
lengths = class_lengths (best_choice);
|
|
stringlen = strlen (lengths);
|
|
|
|
// Skip if using external dictionary.
|
|
if (letter_is_okay != &def_letter_is_okay) return;
|
|
|
|
if (!doc_dict_enable
|
|
|| valid_word (string) || CurrentWordAmbig () || stringlen < 2)
|
|
return;
|
|
|
|
if (!good_choice (best_choice) || stringlen == 2) {
|
|
if (class_certainty (best_choice) < permuter_pending_threshold)
|
|
return;
|
|
if (!word_in_dawg (pending_words, string)) {
|
|
if (stringlen > 2 ||
|
|
(stringlen >= 2 && unicharset.get_isupper (string, lengths[0]) &&
|
|
unicharset.get_isupper (string + lengths[0], lengths[1])))
|
|
add_word_to_dawg(pending_words,
|
|
string,
|
|
MAX_DOC_EDGES,
|
|
RESERVED_DOC_EDGES);
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (save_doc_words) {
|
|
strcpy(filename, imagefile);
|
|
strcat (filename, ".doc");
|
|
doc_word_file = open_file (filename, "a");
|
|
fprintf (doc_word_file, "%s\n", string);
|
|
fclose(doc_word_file);
|
|
}
|
|
add_word_to_dawg(document_words, string, MAX_DOC_EDGES, RESERVED_DOC_EDGES);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* adjust_non_word
|
|
*
|
|
* Assign an adjusted value to a string that is a non-word. The value
|
|
* that this word choice has is based on case and punctuation rules.
|
|
**********************************************************************/
|
|
void
|
|
adjust_non_word (A_CHOICE * best_choice, float certainties[]) {
|
|
char *this_word;
|
|
float adjust_factor;
|
|
|
|
if (adjust_debug)
|
|
cprintf ("%s %4.2f ",
|
|
class_string (best_choice), class_probability (best_choice));
|
|
|
|
this_word = class_string (best_choice);
|
|
|
|
class_probability (best_choice) += RATING_PAD;
|
|
if (case_ok (this_word, class_lengths (best_choice))
|
|
&& punctuation_ok (this_word, class_lengths (best_choice)) != -1) {
|
|
class_probability (best_choice) *= non_word;
|
|
adjust_factor = non_word;
|
|
if (adjust_debug)
|
|
cprintf (", %4.2f ", non_word);
|
|
}
|
|
else {
|
|
class_probability (best_choice) *= garbage;
|
|
adjust_factor = garbage;
|
|
if (adjust_debug) {
|
|
if (!case_ok (this_word, class_lengths (best_choice)))
|
|
cprintf (", C");
|
|
if (punctuation_ok (this_word, class_lengths (best_choice)) == -1)
|
|
cprintf (", P");
|
|
cprintf (", %4.2f ", garbage);
|
|
}
|
|
}
|
|
|
|
class_probability (best_choice) -= RATING_PAD;
|
|
|
|
LogNewWordChoice(best_choice, adjust_factor, certainties);
|
|
|
|
if (adjust_debug)
|
|
cprintf (" --> %4.2f\n", class_probability (best_choice));
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* init_permute
|
|
*
|
|
* Initialize anything that needs to be set up for the permute
|
|
* functions.
|
|
**********************************************************************/
|
|
void init_permute_vars() {
|
|
make_adjust_debug();
|
|
make_compound_debug();
|
|
make_non_word();
|
|
make_garbage();
|
|
make_doc_words();
|
|
make_doc_dict();
|
|
|
|
init_permdawg_vars();
|
|
init_permnum();
|
|
}
|
|
|
|
void init_permute() {
|
|
if (word_dawg != NULL)
|
|
end_permute();
|
|
init_permdawg();
|
|
STRING name;
|
|
name = language_data_path_prefix;
|
|
name += "word-dawg";
|
|
word_dawg = read_squished_dawg(name.string());
|
|
|
|
document_words =
|
|
(EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
|
|
initialize_dawg(document_words, MAX_DOC_EDGES);
|
|
|
|
pending_words =
|
|
(EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_DOC_EDGES);
|
|
initialize_dawg(pending_words, MAX_DOC_EDGES);
|
|
|
|
user_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) * MAX_USER_EDGES);
|
|
name = language_data_path_prefix;
|
|
name += "user-words";
|
|
read_word_list(name.string(), user_words, MAX_USER_EDGES, USER_RESERVED_EDGES);
|
|
}
|
|
|
|
/**********************************************************************
 * end_permute
 *
 * Free the four dictionaries set up by init_permute, reset their
 * pointers to NULL and shut down the dawg permuter.  A NULL word_dawg
 * marks the module as already shut down, in which case nothing is
 * done, so repeated calls are harmless.
 **********************************************************************/
void end_permute() {
  if (word_dawg != NULL) {
    memfree(word_dawg);
    word_dawg = NULL;

    memfree(document_words);
    document_words = NULL;

    memfree(pending_words);
    pending_words = NULL;

    memfree(user_words);
    user_words = NULL;

    end_permdawg();
  }
}
|
|
|
|
/**********************************************************************
|
|
* permute_all
|
|
*
|
|
* Permute all the characters together using all of the different types
|
|
* of permuters/selectors available. Each of the characters must have
|
|
* a non-NIL choice list.
|
|
**********************************************************************/
|
|
A_CHOICE *permute_all(CHOICES_LIST char_choices,
|
|
float rating_limit,
|
|
A_CHOICE *raw_choice) {
|
|
A_CHOICE *result_1;
|
|
A_CHOICE *result_2 = NULL;
|
|
BOOL8 any_alpha;
|
|
|
|
result_1 = permute_top_choice (char_choices, rating_limit, raw_choice,
|
|
&any_alpha);
|
|
|
|
if (ngram_permuter_activated)
|
|
return ngram_permute_and_select(char_choices, rating_limit, word_dawg);
|
|
|
|
if (result_1 == NULL)
|
|
return (NULL);
|
|
if (permute_only_top)
|
|
return result_1;
|
|
if (any_alpha && array_count (char_choices) <= MAX_WERD_LENGTH) {
|
|
result_2 = permute_words (char_choices, rating_limit);
|
|
if (class_probability (result_1) < class_probability (result_2)
|
|
|| class_string (result_2) == NULL) {
|
|
free_choice(result_2);
|
|
}
|
|
else {
|
|
free_choice(result_1);
|
|
result_1 = result_2;
|
|
}
|
|
}
|
|
|
|
result_2 = number_permute_and_select (char_choices, rating_limit);
|
|
|
|
if (class_probability (result_1) < class_probability (result_2)
|
|
|| class_string (result_2) == NULL) {
|
|
free_choice(result_2);
|
|
}
|
|
else {
|
|
free_choice(result_1);
|
|
result_1 = result_2;
|
|
}
|
|
|
|
result_2 = permute_compound_words (char_choices, rating_limit);
|
|
|
|
if (!result_2 ||
|
|
class_probability (result_1) < class_probability (result_2)
|
|
|| class_string (result_2) == NULL) {
|
|
free_choice(result_2);
|
|
}
|
|
else {
|
|
free_choice(result_1);
|
|
result_1 = result_2;
|
|
}
|
|
|
|
return (result_1);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* permute_characters
|
|
*
|
|
* Permute these characters together according to each of the different
|
|
* permuters that are enabled.
|
|
**********************************************************************/
|
|
void permute_characters(CHOICES_LIST char_choices,
|
|
float limit,
|
|
A_CHOICE *best_choice,
|
|
A_CHOICE *raw_choice) {
|
|
A_CHOICE *this_choice;
|
|
|
|
permutation_count++; /* Global counter */
|
|
|
|
this_choice = permute_all (char_choices, limit, raw_choice);
|
|
|
|
if (this_choice &&
|
|
class_probability (this_choice) < class_probability (best_choice)) {
|
|
clone_choice(best_choice, this_choice);
|
|
}
|
|
free_choice(this_choice);
|
|
|
|
if (display_ratings)
|
|
cprintf ("permute_characters: %-15s %4.2f %4.2f\n",
|
|
class_string (best_choice),
|
|
class_probability (best_choice), class_certainty (best_choice));
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* permute_compound_word
|
|
*
|
|
* Return the top choice for each character as the choice for the word.
|
|
**********************************************************************/
|
|
A_CHOICE *permute_compound_words(CHOICES_LIST character_choices,
|
|
float rating_limit) {
|
|
A_CHOICE *first_choice;
|
|
A_CHOICE *best_choice = NULL;
|
|
char word[UNICHAR_LEN * MAX_WERD_LENGTH + 1];
|
|
char unichar_lengths[MAX_WERD_LENGTH + 1];
|
|
float rating = 0;
|
|
float certainty = 10000;
|
|
char char_choice;
|
|
int x;
|
|
int first_index = 0;
|
|
char *ptr;
|
|
|
|
word[0] = '\0';
|
|
unichar_lengths[0] = 0;
|
|
|
|
if (array_count (character_choices) > MAX_WERD_LENGTH) {
|
|
return (new_choice (NULL, NULL, MAX_FLOAT32, -MAX_FLOAT32, -1, NO_PERM));
|
|
}
|
|
|
|
array_loop(character_choices, x) {
|
|
|
|
first_choice =
|
|
(A_CHOICE *) first_node ((CHOICES) array_value (character_choices, x));
|
|
|
|
ptr = class_string (first_choice);
|
|
char_choice = ptr != NULL ? *ptr : '\0';
|
|
if (x > first_index && (char_choice == '-' || char_choice == '/')) {
|
|
if (compound_debug)
|
|
cprintf ("Hyphenated word found\n");
|
|
|
|
permute_subword (character_choices, rating_limit,
|
|
first_index, x - 1, word, unichar_lengths,
|
|
&rating, &certainty);
|
|
|
|
if (rating > rating_limit)
|
|
break;
|
|
first_index = x + 1;
|
|
|
|
strcat(word, class_string (first_choice));
|
|
char length[] = {strlen(class_string (first_choice)), 0};
|
|
strcat(unichar_lengths + x, length);
|
|
rating += class_probability (first_choice);
|
|
certainty = min (class_certainty (first_choice), certainty);
|
|
}
|
|
}
|
|
|
|
if (first_index > 0 && first_index < x && rating <= rating_limit) {
|
|
permute_subword (character_choices, rating_limit,
|
|
first_index, x - 1, word, unichar_lengths,
|
|
&rating, &certainty);
|
|
|
|
best_choice = new_choice (word, unichar_lengths, rating,
|
|
certainty, -1, COMPOUND_PERM);
|
|
}
|
|
return (best_choice);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* permute_subword
|
|
*
|
|
* Permute a part of a compound word this subword is bounded by hyphens
|
|
* and the start and end of the word. Call the standard word permute
|
|
* function on a set of choices covering only part of the original
|
|
* word. When it is done reclaim the memory that was used in the
|
|
* excercise.
|
|
**********************************************************************/
|
|
void permute_subword(CHOICES_LIST character_choices,
|
|
float rating_limit,
|
|
int start,
|
|
int end,
|
|
char *word,
|
|
char unichar_lengths[],
|
|
float *rating,
|
|
float *certainty) {
|
|
int x;
|
|
A_CHOICE *best_choice = NULL;
|
|
A_CHOICE raw_choice;
|
|
CHOICES_LIST subchoices;
|
|
CHOICES choices;
|
|
char this_char;
|
|
char *ptr;
|
|
|
|
DisableChoiceAccum();
|
|
raw_choice.string = NULL;
|
|
raw_choice.lengths = NULL;
|
|
raw_choice.rating = MAX_INT16;
|
|
raw_choice.certainty = -MAX_INT16;
|
|
|
|
subchoices = new_choice_list ();
|
|
for (x = start; x <= end; x++) {
|
|
choices = (CHOICES) array_value (character_choices, x);
|
|
ptr = best_string (choices);
|
|
this_char = ptr != NULL ? *ptr : '\0';
|
|
if (this_char != '-' && this_char != '/') {
|
|
subchoices = array_push (subchoices, choices);
|
|
} else {
|
|
const char* str = best_string(choices);
|
|
strcat(word, str);
|
|
char length[] = {strlen(str), 0};
|
|
strcat(unichar_lengths + x, length);
|
|
}
|
|
}
|
|
|
|
if (array_count (subchoices)) {
|
|
if (compound_debug)
|
|
dawg_debug = TRUE;
|
|
best_choice = permute_all (subchoices, rating_limit, &raw_choice);
|
|
if (compound_debug)
|
|
dawg_debug = FALSE;
|
|
|
|
if (best_choice && class_string (best_choice)) {
|
|
strcat (word, class_string (best_choice));
|
|
strcat (unichar_lengths, class_lengths (best_choice));
|
|
*rating += class_probability (best_choice);
|
|
*certainty = min (class_certainty (best_choice), *certainty);
|
|
}
|
|
else {
|
|
*rating = MAX_FLOAT32;
|
|
}
|
|
}
|
|
else {
|
|
*rating = MAX_FLOAT32;
|
|
}
|
|
|
|
free_choice_list(subchoices);
|
|
if (best_choice)
|
|
free_choice(best_choice);
|
|
|
|
if (compound_debug && *rating < MAX_FLOAT32) {
|
|
cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
|
|
word, *rating, *certainty);
|
|
}
|
|
if (raw_choice.string)
|
|
strfree(raw_choice.string);
|
|
if (raw_choice.lengths)
|
|
strfree(raw_choice.lengths);
|
|
|
|
EnableChoiceAccum();
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* permute_top_choice
|
|
*
|
|
* Return the top choice for each character as the choice for the word.
|
|
* In addition a choice is created for the best lower and upper case
|
|
* non-words. In each character position the best lower (or upper) case
|
|
* character is substituted for the best overall character.
|
|
**********************************************************************/
|
|
A_CHOICE *permute_top_choice(CHOICES_LIST character_choices,
|
|
float rating_limit,
|
|
A_CHOICE *raw_choice,
|
|
BOOL8 *any_alpha) {
|
|
CHOICES char_list;
|
|
A_CHOICE *first_choice;
|
|
A_CHOICE *best_choice;
|
|
A_CHOICE *other_choice;
|
|
const char *ptr;
|
|
const char *first_char; //first choice
|
|
const char *second_char; //second choice
|
|
const char *third_char; //third choice
|
|
char prev_char[UNICHAR_LEN + 1]; //prev in word
|
|
const char *next_char = ""; //next in word
|
|
const char *next_next_char = ""; //after next next in word
|
|
|
|
char word[UNICHAR_LEN * MAX_PERM_LENGTH + 1];
|
|
char capital_word[UNICHAR_LEN * MAX_PERM_LENGTH + 1];
|
|
char lower_word[UNICHAR_LEN * MAX_PERM_LENGTH + 1];
|
|
|
|
char word_lengths[MAX_PERM_LENGTH + 1];
|
|
char capital_word_lengths[MAX_PERM_LENGTH + 1];
|
|
char lower_word_lengths[MAX_PERM_LENGTH + 1];
|
|
|
|
int x;
|
|
int x_word = 0;
|
|
int x_capital_word = 0;
|
|
int x_lower_word = 0;
|
|
BOOL8 char_alpha;
|
|
|
|
float rating = 0;
|
|
float upper_rating = 0;
|
|
float lower_rating = 0;
|
|
float first_rating = 0;
|
|
|
|
float certainty = 10000;
|
|
float upper_certainty = 10000;
|
|
float lower_certainty = 10000;
|
|
|
|
float certainties[MAX_PERM_LENGTH + 1];
|
|
float lower_certainties[MAX_PERM_LENGTH + 1];
|
|
float upper_certainties[MAX_PERM_LENGTH + 1];
|
|
|
|
register CHOICES this_char;
|
|
register const char* ch;
|
|
register INT8 lower_done;
|
|
register INT8 upper_done;
|
|
|
|
prev_char[0] = '\0';
|
|
|
|
if (any_alpha != NULL)
|
|
*any_alpha = FALSE;
|
|
|
|
if (array_count (character_choices) > MAX_PERM_LENGTH) {
|
|
return (NULL);
|
|
}
|
|
|
|
array_loop(character_choices, x) {
|
|
if (x + 1 < array_count (character_choices)) {
|
|
char_list = (CHOICES) array_value (character_choices, x + 1);
|
|
first_choice = (A_CHOICE *) first_node (char_list);
|
|
|
|
ptr = class_string (first_choice);
|
|
next_char = (ptr != NULL && *ptr != '\0') ? ptr : " ";
|
|
}
|
|
else
|
|
next_char = "";
|
|
if (x + 2 < array_count (character_choices)) {
|
|
char_list = (CHOICES) array_value (character_choices, x + 2);
|
|
first_choice = (A_CHOICE *) first_node (char_list);
|
|
|
|
ptr = class_string (first_choice);
|
|
next_next_char = (ptr != NULL && *ptr != '\0') ? ptr : " ";
|
|
}
|
|
else
|
|
next_next_char = "";
|
|
|
|
char_list = (CHOICES) array_value (character_choices, x);
|
|
first_choice = (A_CHOICE *) first_node (char_list);
|
|
|
|
ptr = class_string (first_choice);
|
|
if (ptr != NULL && *ptr != '\0')
|
|
{
|
|
strcpy(word + x_word, ptr);
|
|
word_lengths[x] = strlen(ptr);
|
|
|
|
strcpy(capital_word + x_capital_word, ptr);
|
|
capital_word_lengths[x] = strlen(ptr);
|
|
|
|
strcpy(lower_word + x_lower_word, ptr);
|
|
lower_word_lengths[x] = strlen(ptr);
|
|
}
|
|
else
|
|
{
|
|
word[x_word] = ' ';
|
|
word_lengths[x] = 1;
|
|
|
|
capital_word[x_capital_word] = ' ';
|
|
capital_word_lengths[x] = 1;
|
|
|
|
lower_word[x_lower_word] = ' ';
|
|
lower_word_lengths[x] = 1;
|
|
}
|
|
|
|
first_char = (ptr != NULL && *ptr != '\0') ? ptr : " ";
|
|
first_rating = class_probability (first_choice);
|
|
upper_rating += class_probability (first_choice);
|
|
lower_rating += class_probability (first_choice);
|
|
lower_certainty = min (class_certainty (first_choice), lower_certainty);
|
|
upper_certainty = min (class_certainty (first_choice), upper_certainty);
|
|
|
|
certainties[x] = class_certainty (first_choice);
|
|
lower_certainties[x] = class_certainty (first_choice);
|
|
upper_certainties[x] = class_certainty (first_choice);
|
|
|
|
lower_done = FALSE;
|
|
upper_done = FALSE;
|
|
char_alpha = FALSE;
|
|
second_char = "";
|
|
third_char = "";
|
|
iterate_list(this_char, char_list) {
|
|
ptr = best_string (this_char);
|
|
ch = ptr != NULL ? ptr : "";
|
|
if (strcmp(ch, "l") == 0 && rest (this_char) != NULL
|
|
&& best_probability (rest (this_char)) == first_rating) {
|
|
ptr = best_string (rest (this_char));
|
|
if (ptr != NULL && (strcmp(ptr, "1") == 0 || strcmp(ptr, "I") == 0)) {
|
|
second_char = ptr;
|
|
this_char = rest (this_char);
|
|
if (rest (this_char) != NULL
|
|
&& best_probability (rest (this_char)) == first_rating) {
|
|
ptr = best_string (rest (this_char));
|
|
if (ptr != NULL && (strcmp(ptr, "1") == 0 || strcmp(ptr, "I") == 0)) {
|
|
third_char = ptr;
|
|
this_char = rest (this_char);
|
|
}
|
|
}
|
|
ch = choose_il1 (first_char, second_char, third_char,
|
|
prev_char, next_char, next_next_char);
|
|
if (strcmp(ch, "l") != 0 && word_lengths[x] == 1 &&
|
|
word[x_word] == 'l') {
|
|
word[x_word] = *ch;
|
|
lower_word[x_lower_word] = *ch;
|
|
capital_word[x_capital_word] = *ch;
|
|
}
|
|
}
|
|
}
|
|
if (ch != NULL && *ch != '\0') {
|
|
/* Find lower case */
|
|
if (!lower_done && (unicharset.get_islower(ch) ||
|
|
(unicharset.get_isupper(ch) && x == 0))) {
|
|
strcpy(lower_word + x_lower_word, ch);
|
|
lower_word_lengths[x] = strlen(ch);
|
|
lower_rating += best_probability (this_char);
|
|
lower_rating -= class_probability (first_choice);
|
|
lower_certainty = min (best_certainty (this_char), lower_certainty);
|
|
lower_certainties[x] = best_certainty (this_char);
|
|
lower_done = TRUE;
|
|
}
|
|
/* Find upper case */
|
|
if (!upper_done && unicharset.get_isupper(ch)) {
|
|
strcpy(capital_word + x_capital_word, ch);
|
|
capital_word_lengths[x] = strlen(ch);
|
|
upper_rating += best_probability (this_char);
|
|
upper_rating -= class_probability (first_choice);
|
|
upper_certainty = min (best_certainty (this_char), upper_certainty);
|
|
upper_certainties[x] = best_certainty (this_char);
|
|
upper_done = TRUE;
|
|
}
|
|
if (!char_alpha && unicharset.get_isalpha(ch))
|
|
char_alpha = TRUE;
|
|
if (lower_done && upper_done)
|
|
break;
|
|
}
|
|
}
|
|
if (char_alpha && any_alpha != NULL)
|
|
*any_alpha = TRUE;
|
|
|
|
if (first_choice == NULL) {
|
|
cprintf ("Permuter giving up due to null choices list");
|
|
word[x_word + 1] = '$';
|
|
word[x_word + 2] = '\0';
|
|
word_lengths[x + 1] = 1;
|
|
word_lengths[x + 2] = 0;
|
|
cprintf (" word=%s\n", word);
|
|
return (NULL);
|
|
}
|
|
|
|
rating += class_probability (first_choice);
|
|
if (rating > rating_limit)
|
|
return (NULL);
|
|
|
|
certainty = min (class_certainty (first_choice), certainty);
|
|
|
|
strncpy(prev_char, word + x_word, word_lengths[x]);
|
|
prev_char[word_lengths[x]] = '\0';
|
|
|
|
x_word += word_lengths[x];
|
|
x_capital_word += capital_word_lengths[x];
|
|
x_lower_word += lower_word_lengths[x];
|
|
}
|
|
|
|
word[x_word] = '\0';
|
|
word_lengths[x] = 0;
|
|
|
|
capital_word[x_capital_word] = '\0';
|
|
capital_word_lengths[x] = 0;
|
|
|
|
lower_word[x_lower_word] = '\0';
|
|
lower_word_lengths[x] = 0;
|
|
|
|
if (rating < class_probability (raw_choice)) {
|
|
if (class_string (raw_choice))
|
|
strfree (class_string (raw_choice));
|
|
if (class_lengths (raw_choice))
|
|
strfree (class_lengths (raw_choice));
|
|
|
|
class_probability (raw_choice) = rating;
|
|
class_certainty (raw_choice) = certainty;
|
|
class_string (raw_choice) = strsave (word);
|
|
class_lengths (raw_choice) = strsave (word_lengths);
|
|
class_permuter (raw_choice) = TOP_CHOICE_PERM;
|
|
|
|
LogNewRawChoice (raw_choice, 1.0, certainties);
|
|
}
|
|
|
|
if (ngram_permuter_activated)
|
|
return NULL;
|
|
|
|
best_choice = new_choice (word, word_lengths,
|
|
rating, certainty, -1, TOP_CHOICE_PERM);
|
|
adjust_non_word(best_choice, certainties);
|
|
|
|
other_choice = new_choice (lower_word, lower_word_lengths,
|
|
lower_rating, lower_certainty,
|
|
-1, LOWER_CASE_PERM);
|
|
adjust_non_word(other_choice, lower_certainties);
|
|
if (class_probability (best_choice) > class_probability (other_choice)) {
|
|
clone_choice(best_choice, other_choice);
|
|
}
|
|
free_choice(other_choice);
|
|
|
|
other_choice = new_choice (capital_word, capital_word_lengths,
|
|
upper_rating, upper_certainty,
|
|
-1, UPPER_CASE_PERM);
|
|
adjust_non_word(other_choice, upper_certainties);
|
|
if (class_probability (best_choice) > class_probability (other_choice)) {
|
|
clone_choice(best_choice, other_choice);
|
|
}
|
|
free_choice(other_choice);
|
|
return (best_choice);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* choose_il1
|
|
*
|
|
* Choose between the candidate il1 chars.
|
|
**********************************************************************/
|
|
const char* choose_il1(const char *first_char, //first choice
|
|
const char *second_char, //second choice
|
|
const char *third_char, //third choice
|
|
const char *prev_char, //prev in word
|
|
const char *next_char, //next in word
|
|
const char *next_next_char) { //after next next in word
|
|
INT32 type1; //1/I/l type of first choice
|
|
INT32 type2; //1/I/l type of second choice
|
|
INT32 type3; //1/I/l type of third choice
|
|
|
|
int first_char_length = strlen(first_char);
|
|
int prev_char_length = strlen(prev_char);
|
|
int next_char_length = strlen(next_char);
|
|
int next_next_char_length = strlen(next_next_char);
|
|
|
|
if (*first_char == 'l' && *second_char != '\0') {
|
|
if (*second_char == 'I'
|
|
&& (((prev_char_length != 0 &&
|
|
unicharset.get_isupper (prev_char, prev_char_length)) &&
|
|
(next_char_length == 0 ||
|
|
!unicharset.get_islower (next_char, next_char_length)) &&
|
|
(next_char_length == 0 ||
|
|
!unicharset.get_isdigit (next_char, next_char_length))) ||
|
|
((next_char_length != 0 &&
|
|
unicharset.get_isupper (next_char, next_char_length)) &&
|
|
(prev_char_length == 0 ||
|
|
!unicharset.get_islower (prev_char, prev_char_length)) &&
|
|
(prev_char_length == 0 ||
|
|
!unicharset.get_isdigit (prev_char, prev_char_length)))))
|
|
first_char = second_char; //override
|
|
else if (*second_char == '1' || *third_char == '1') {
|
|
if ((next_char_length != 0 &&
|
|
unicharset.get_isdigit (next_char, next_char_length)) ||
|
|
(prev_char_length != 0 &&
|
|
unicharset.get_isdigit (prev_char, prev_char_length))
|
|
|| (*next_char == 'l' &&
|
|
(next_next_char_length != 0 &&
|
|
unicharset.get_isdigit (next_next_char, next_next_char_length)))) {
|
|
first_char = "1";
|
|
first_char_length = 1;
|
|
}
|
|
else if ((prev_char_length == 0 ||
|
|
!unicharset.get_islower (prev_char, prev_char_length)) &&
|
|
((next_char_length == 0 ||
|
|
!unicharset.get_islower (next_char, next_char_length)) ||
|
|
(*next_char == 's' &&
|
|
*next_next_char == 't'))) {
|
|
if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
|
|
&& ((*next_char != '\'' && *next_char != '`')
|
|
|| *prev_char != '\0')) {
|
|
first_char = "1";
|
|
first_char_length = 1;
|
|
}
|
|
}
|
|
}
|
|
if (*first_char == 'l' && *next_char != '\0' &&
|
|
(prev_char_length == 0 ||
|
|
!unicharset.get_isalpha (prev_char, prev_char_length))) {
|
|
type1 = 2;
|
|
|
|
if (*second_char == '1')
|
|
type2 = 0;
|
|
else if (*second_char == 'I')
|
|
type2 = 1;
|
|
else if (*second_char == 'l')
|
|
type2 = 2;
|
|
else
|
|
type2 = type1;
|
|
|
|
if (*third_char == '1')
|
|
type3 = 0;
|
|
else if (*third_char == 'I')
|
|
type3 = 1;
|
|
else if (*third_char == 'l')
|
|
type3 = 2;
|
|
else
|
|
type3 = type1;
|
|
|
|
#if 0
|
|
if (bigram_counts[*next_char][type2] >
|
|
bigram_counts[*next_char][type1]) {
|
|
first_char = second_char;
|
|
type1 = type2;
|
|
}
|
|
if (bigram_counts[*next_char][type3] >
|
|
bigram_counts[*next_char][type1]) {
|
|
first_char = third_char;
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
return first_char;
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* permute_words
|
|
*
|
|
* Permute all the characters together using the dawg to prune all
|
|
* but the valid words.
|
|
**********************************************************************/
|
|
A_CHOICE *permute_words(CHOICES_LIST char_choices, float rating_limit) {
|
|
A_CHOICE *best_choice;
|
|
|
|
best_choice = new_choice (NULL, NULL, rating_limit, -MAX_FLOAT32, -1, NO_PERM);
|
|
|
|
if (hyphen_base_size() + array_count (char_choices) > MAX_WERD_LENGTH) {
|
|
class_probability (best_choice) = MAX_FLOAT32;
|
|
}
|
|
else {
|
|
|
|
dawg_permute_and_select ("system words:", word_dawg, SYSTEM_DAWG_PERM,
|
|
char_choices, best_choice, TRUE);
|
|
|
|
dawg_permute_and_select ("document_words", document_words,
|
|
DOC_DAWG_PERM, char_choices, best_choice,
|
|
FALSE);
|
|
|
|
dawg_permute_and_select ("user words", user_words, USER_DAWG_PERM,
|
|
char_choices, best_choice, FALSE);
|
|
}
|
|
|
|
return (best_choice);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* valid_word
|
|
*
|
|
* Check all the DAWGs to see if this word is in any of them.
|
|
**********************************************************************/
|
|
int valid_word(const char *string) {
|
|
int result = NO_PERM;
|
|
|
|
if (word_in_dawg (word_dawg, string))
|
|
result = SYSTEM_DAWG_PERM;
|
|
else {
|
|
if (word_in_dawg (document_words, string))
|
|
result = DOC_DAWG_PERM;
|
|
else if (word_in_dawg (user_words, string))
|
|
result = USER_DAWG_PERM;
|
|
}
|
|
return (result);
|
|
}
|
|
|