2007-03-08 04:03:40 +08:00
|
|
|
/* -*-C-*-
|
|
|
|
********************************************************************************
|
|
|
|
*
|
|
|
|
* File: context.c (Formerly context.c)
|
|
|
|
* Description: Context checking functions
|
|
|
|
* Author: Mark Seaman, OCR Technology
|
|
|
|
* Created: Thu Feb 15 11:18:24 1990
|
|
|
|
* Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
|
|
|
|
* Language: C
|
|
|
|
* Package: N/A
|
|
|
|
* Status: Experimental (Do Not Distribute)
|
|
|
|
*
|
|
|
|
* (c) Copyright 1990, Hewlett-Packard Company.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
*
|
|
|
|
*********************************************************************************/
|
2009-07-11 10:20:33 +08:00
|
|
|
|
|
|
|
#include "dict.h"
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "tprintf.h"
|
2009-07-11 10:20:33 +08:00
|
|
|
#include "unicharset.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:20:33 +08:00
|
|
|
namespace tesseract {
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Thresholds used by Dict::absolute_garbage().
//
// Words with fewer unichars than this are never reported as garbage.
static const int kMinAbsoluteGarbageWordLength = 10;
// A sufficiently long word is reported as garbage when the fraction of its
// unichars that are alphabetic or digits is strictly below this value.
static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
|
|
|
|
|
|
|
|
// State-transition table driving the capitalization check in Dict::case_ok().
//
// Row    = current state (see the per-row labels below).
// Column = class of the next character, as selected by Dict::case_ok():
//   [0] P - none of upper / lower / digit (presumably punctuation)
//   [1] U - upper case
//   [2] L - lower case
//   [3] D - digit
// Entry  = next state, or -1 to signal a case error.
const int case_state_table[6][4] = {
    /*                                  P   U   L   D */
    /* 0. Beginning of word        */ { 0,  1,  5,  4},
    /* 1. After initial capital    */ { 0,  3,  2,  4},
    /* 2. After lower case         */ { 0, -1,  2, -1},
    /* 3. After upper case         */ { 0,  3, -1,  4},
    /* 4. After a digit            */ { 0, -1, -1,  4},
    /* 5. After initial lower case */ { 5, -1,  2, -1},
};
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
|
|
|
|
int last_state = 0;
|
|
|
|
int state = 0;
|
|
|
|
int x;
|
2009-07-11 10:20:33 +08:00
|
|
|
for (x = 0; x < word.length(); ++x) {
|
|
|
|
UNICHAR_ID ch_id = word.unichar_id(x);
|
2008-04-22 08:42:51 +08:00
|
|
|
if (unicharset.get_isupper(ch_id))
|
2007-03-08 04:03:40 +08:00
|
|
|
state = case_state_table[state][1];
|
2009-07-11 10:20:33 +08:00
|
|
|
else if (unicharset.get_islower(ch_id))
|
2008-04-22 08:42:51 +08:00
|
|
|
state = case_state_table[state][2];
|
|
|
|
else if (unicharset.get_isdigit(ch_id))
|
2007-03-08 04:03:40 +08:00
|
|
|
state = case_state_table[state][3];
|
|
|
|
else
|
|
|
|
state = case_state_table[state][0];
|
2010-11-24 02:34:14 +08:00
|
|
|
if (state == -1) return false;
|
2007-03-08 04:03:40 +08:00
|
|
|
last_state = state;
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return state != 5; // single lower is bad
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
bool Dict::absolute_garbage(const WERD_CHOICE &word,
|
|
|
|
const UNICHARSET &unicharset) {
|
|
|
|
if (word.length() < kMinAbsoluteGarbageWordLength) return false;
|
|
|
|
int num_alphanum = 0;
|
|
|
|
for (int x = 0; x < word.length(); ++x) {
|
|
|
|
num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
|
|
|
|
unicharset.get_isdigit(word.unichar_id(x)));
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return (static_cast<float>(num_alphanum) /
|
|
|
|
static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
|
|
} // namespace tesseract
|