mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
Fixed the extern C mismatches properly.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@82 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
f4baca27c8
commit
1943de9aa9
@ -50,6 +50,7 @@
|
||||
#include "notdll.h"
|
||||
#include "tordvars.h"
|
||||
#include "adaptmatch.h"
|
||||
#include "globals.h"
|
||||
|
||||
#define MIN_FONT_ROW_COUNT 8
|
||||
#define MAX_XHEIGHT_DIFF 3
|
||||
@ -148,18 +149,9 @@ EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
|
||||
EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");
|
||||
|
||||
extern int MatcherDebugLevel;
|
||||
extern "C" { extern int display_ratings; }
|
||||
extern int display_ratings;
|
||||
extern int number_debug;
|
||||
extern int adjust_debug;
|
||||
/*
|
||||
extern "C" {
|
||||
extern int MatcherDebugLevel;
|
||||
extern int display_ratings;
|
||||
extern int number_debug;
|
||||
extern int adjust_debug;
|
||||
// extern int LearningDebugLevel;
|
||||
};
|
||||
*/
|
||||
FILE *choice_file = NULL; //Choice file ptr
|
||||
|
||||
CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
|
||||
@ -569,8 +561,8 @@ if (dopasses==1) return;
|
||||
|
||||
////changed by jetsoft
|
||||
//needed for dll to output memory structure
|
||||
if ((dopasses==0 || dopasses==2) && monitor)
|
||||
output_pass (page_res_it,true, target_word_box);
|
||||
if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
|
||||
output_pass(page_res_it, ocr_char_space() > 0, target_word_box);
|
||||
// end jetsoft
|
||||
|
||||
}
|
||||
@ -620,34 +612,33 @@ void classify_word_pass1( //recog one word
|
||||
tess_default_matcher,
|
||||
word->raw_choice, &blob_choices,
|
||||
word->outword);
|
||||
|
||||
/*
|
||||
Test for TESS screw up on word. Recog_word has already ensured that the
|
||||
choice list, outword blob lists and best_choice string are the same
|
||||
length. A TESS screw up is indicated by a blank filled or 0 length string.
|
||||
*/
|
||||
if ((word->best_choice->string ().length () == 0) ||
|
||||
if ((word->best_choice->lengths ().length () == 0) ||
|
||||
(strspn (word->best_choice->string ().string (), " ") ==
|
||||
word->best_choice->string ().length ())) {
|
||||
word->done = FALSE; //Try again on pass2 - adaption may help
|
||||
word->tess_failed = TRUE;
|
||||
word->reject_map.initialise (word->best_choice->string ().length ());
|
||||
word->reject_map.initialise (word->best_choice->lengths ().length ());
|
||||
word->reject_map.rej_word_tess_failure ();
|
||||
}
|
||||
else {
|
||||
word->tess_failed = FALSE;
|
||||
if ((word->best_choice->string ().length () !=
|
||||
if ((word->best_choice->lengths ().length () !=
|
||||
word->outword->blob_list ()->length ()) ||
|
||||
(word->best_choice->string ().length () != blob_choices.length ())) {
|
||||
(word->best_choice->lengths ().length () != blob_choices.length ())) {
|
||||
tprintf
|
||||
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
||||
word->best_choice->string ().string (),
|
||||
word->best_choice->string ().length (),
|
||||
word->best_choice->lengths ().length (),
|
||||
word->outword->blob_list ()->length (), blob_choices.length ());
|
||||
}
|
||||
ASSERT_HOST (word->best_choice->string ().length () ==
|
||||
ASSERT_HOST (word->best_choice->lengths ().length () ==
|
||||
word->outword->blob_list ()->length ());
|
||||
ASSERT_HOST (word->best_choice->string ().length () ==
|
||||
ASSERT_HOST (word->best_choice->lengths ().length () ==
|
||||
blob_choices.length ());
|
||||
|
||||
/*
|
||||
@ -664,12 +655,12 @@ void classify_word_pass1( //recog one word
|
||||
fix_rep_char(word);
|
||||
}
|
||||
else {
|
||||
fix_quotes ((char *) word->best_choice->string ().string (),
|
||||
fix_quotes (word->best_choice,
|
||||
//turn to double
|
||||
word->outword, &blob_choices);
|
||||
if (tessedit_fix_hyphens)
|
||||
//turn 2 to 1
|
||||
fix_hyphens ((char *) word->best_choice->string ().string (), word->outword, &blob_choices);
|
||||
fix_hyphens (word->best_choice, word->outword, &blob_choices);
|
||||
record_certainty (word->best_choice->certainty (), 1);
|
||||
//accounting
|
||||
|
||||
@ -692,7 +683,7 @@ void classify_word_pass1( //recog one word
|
||||
rejmap = NULL;
|
||||
else {
|
||||
ASSERT_HOST (word->reject_map.length () ==
|
||||
word->best_choice->string ().length ());
|
||||
word->best_choice->lengths ().length ());
|
||||
|
||||
for (index = 0; index < word->reject_map.length (); index++) {
|
||||
if (adapt_ok || word->reject_map[index].accepted ())
|
||||
@ -704,7 +695,9 @@ void classify_word_pass1( //recog one word
|
||||
}
|
||||
|
||||
//adapt to it
|
||||
tess_adapter (word->outword, &word->denorm, word->best_choice->string ().string (), word->raw_choice->string ().string (), rejmap);
|
||||
tess_adapter (word->outword, &word->denorm,
|
||||
*word->best_choice,
|
||||
*word->raw_choice, rejmap);
|
||||
}
|
||||
|
||||
if (tessedit_enable_doc_dict)
|
||||
@ -712,10 +705,12 @@ void classify_word_pass1( //recog one word
|
||||
set_word_fonts(word, &blob_choices);
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
if (tessedit_print_text) {
|
||||
write_cooked_text (bln_word, word->best_choice->string (),
|
||||
word->done, FALSE, stdout);
|
||||
}
|
||||
#endif
|
||||
delete bln_word;
|
||||
blob_choices.deep_clear ();
|
||||
}
|
||||
@ -898,10 +893,12 @@ void classify_word_pass2( //word to do
|
||||
#endif
|
||||
|
||||
set_global_subloc_code(SUBLOC_NORM);
|
||||
#if 0
|
||||
if (tessedit_print_text) {
|
||||
write_cooked_text (word->outword, word->best_choice->string (),
|
||||
word->done, done_this_pass, stdout);
|
||||
}
|
||||
#endif
|
||||
check_debug_pt (word, 50);
|
||||
}
|
||||
|
||||
@ -971,18 +968,18 @@ void match_word_pass2( //recog one word
|
||||
// tprintf("Empty word produced\n");
|
||||
}
|
||||
else {
|
||||
if ((word->best_choice->string ().length () !=
|
||||
if ((word->best_choice->lengths ().length () !=
|
||||
word->outword->blob_list ()->length ()) ||
|
||||
(word->best_choice->string ().length () != blob_choices.length ())) {
|
||||
(word->best_choice->lengths ().length () != blob_choices.length ())) {
|
||||
tprintf
|
||||
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
||||
word->best_choice->string ().string (),
|
||||
word->best_choice->string ().length (),
|
||||
word->best_choice->lengths ().length (),
|
||||
word->outword->blob_list ()->length (), blob_choices.length ());
|
||||
}
|
||||
ASSERT_HOST (word->best_choice->string ().length () ==
|
||||
ASSERT_HOST (word->best_choice->lengths ().length () ==
|
||||
word->outword->blob_list ()->length ());
|
||||
ASSERT_HOST (word->best_choice->string ().length () ==
|
||||
ASSERT_HOST (word->best_choice->lengths ().length () ==
|
||||
blob_choices.length ());
|
||||
|
||||
word->tess_failed = FALSE;
|
||||
@ -990,29 +987,29 @@ void match_word_pass2( //recog one word
|
||||
fix_rep_char(word);
|
||||
}
|
||||
else {
|
||||
fix_quotes ((char *) word->best_choice->string ().string (),
|
||||
fix_quotes (word->best_choice,
|
||||
word->outword, &blob_choices);
|
||||
if (tessedit_fix_hyphens)
|
||||
fix_hyphens ((char *) word->best_choice->string ().string (),
|
||||
fix_hyphens (word->best_choice,
|
||||
word->outword, &blob_choices);
|
||||
/* Dont trust fix_quotes! - though I think I've fixed the bug */
|
||||
if ((word->best_choice->string ().length () !=
|
||||
word->outword->blob_list ()->length ()) ||
|
||||
(word->best_choice->string ().length () !=
|
||||
blob_choices.length ())) {
|
||||
if ((word->best_choice->lengths ().length () !=
|
||||
word->outword->blob_list ()->length ()) ||
|
||||
(word->best_choice->lengths ().length () !=
|
||||
blob_choices.length ())) {
|
||||
#ifndef SECURE_NAMES
|
||||
tprintf
|
||||
("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
||||
word->best_choice->string ().string (),
|
||||
word->best_choice->string ().length (),
|
||||
word->outword->blob_list ()->length (),
|
||||
blob_choices.length ());
|
||||
word->best_choice->string ().string (),
|
||||
word->best_choice->lengths ().length (),
|
||||
word->outword->blob_list ()->length (),
|
||||
blob_choices.length ());
|
||||
#endif
|
||||
|
||||
}
|
||||
ASSERT_HOST (word->best_choice->string ().length () ==
|
||||
ASSERT_HOST (word->best_choice->lengths ().length () ==
|
||||
word->outword->blob_list ()->length ());
|
||||
ASSERT_HOST (word->best_choice->string ().length () ==
|
||||
ASSERT_HOST (word->best_choice->lengths ().length () ==
|
||||
blob_choices.length ());
|
||||
|
||||
word->tess_accepted = tess_acceptable_word (word->best_choice,
|
||||
@ -1039,7 +1036,7 @@ void fix_rep_char( //Repeated char word
|
||||
) {
|
||||
struct REP_CH
|
||||
{
|
||||
char ch;
|
||||
char ch[UNICHAR_LEN + 1];
|
||||
int count;
|
||||
};
|
||||
|
||||
@ -1048,19 +1045,25 @@ void fix_rep_char( //Repeated char word
|
||||
int rep_ch_count = 0; //how many unique chs
|
||||
const char *word_str; //the repeated chs
|
||||
int i, j;
|
||||
int offset;
|
||||
int total = 0;
|
||||
int max = 0;
|
||||
char maxch = ' '; //Most common char
|
||||
char *maxch = NULL; //Most common char
|
||||
|
||||
word_str = word->best_choice->string ().string ();
|
||||
word_len = strlen (word_str);
|
||||
word_len = word->best_choice->lengths ().length ();;
|
||||
rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH));
|
||||
for (i = 0; i < word_len; i++) {
|
||||
for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++);
|
||||
for (i = 0, offset = 0; i < word_len;
|
||||
offset += word->best_choice->lengths()[i++]) {
|
||||
for (j = 0; j < rep_ch_count &&
|
||||
strncmp(rep_ch[j].ch, word_str + offset,
|
||||
word->best_choice->lengths()[i]) != 0; j++);
|
||||
if (j < rep_ch_count)
|
||||
rep_ch[j].count++;
|
||||
else {
|
||||
rep_ch[rep_ch_count].ch = word_str[i];
|
||||
strncpy(rep_ch[rep_ch_count].ch, word_str + offset,
|
||||
word->best_choice->lengths()[i]);
|
||||
rep_ch[rep_ch_count].ch[word->best_choice->lengths()[i]] = '\0';
|
||||
rep_ch[rep_ch_count].count = 1;
|
||||
rep_ch_count++;
|
||||
}
|
||||
@ -1068,7 +1071,7 @@ void fix_rep_char( //Repeated char word
|
||||
|
||||
for (j = 0; j < rep_ch_count; j++) {
|
||||
total += rep_ch[j].count;
|
||||
if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) {
|
||||
if ((rep_ch[j].count > max) && (*rep_ch[j].ch != ' ')) {
|
||||
max = rep_ch[j].count;
|
||||
maxch = rep_ch[j].ch;
|
||||
}
|
||||
@ -1078,26 +1081,47 @@ void fix_rep_char( //Repeated char word
|
||||
free_mem(rep_ch);
|
||||
|
||||
word->reject_map.initialise (word_len);
|
||||
for (i = 0; i < word_len; i++) {
|
||||
if (word_str[i] != maxch)
|
||||
for (i = 0, offset = 0; i < word_len;
|
||||
offset += word->best_choice->lengths()[i++]) {
|
||||
if (strncmp(word_str + offset, maxch,
|
||||
word->best_choice->lengths()[i]) != 0)
|
||||
//rej unrecognised blobs
|
||||
word->reject_map[i].setrej_bad_repetition ();
|
||||
}
|
||||
word->done = TRUE;
|
||||
}
|
||||
|
||||
// TODO(tkielbus) Decide between keeping this behavior here or modifying the
|
||||
// training data.
|
||||
|
||||
// Utility function for fix_quotes.
// Returns non-zero if the character that starts at signed_str and occupies
// `length` bytes of UTF-8 is a simple (single) quote character, 0 otherwise.
// Recognised quotes are:
//   - the 1-byte ASCII apostrophe (') and backtick (`);
//   - the 3-byte curved quotes U+2018 (E2 80 98) and U+2019 (E2 80 99).
// Only the first `length` bytes of signed_str are examined; any other value
// of `length` yields 0 without reading the string.
static int is_simple_quote(const char* signed_str, int length) {
  if (length == 1) {
    // Standard 1 byte quotes.
    return *signed_str == '\'' || *signed_str == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes. The bytes are compared as unsigned values
    // because plain char may be signed, in which case 0xe2 could never
    // compare equal to it.
    const unsigned char* str = (const unsigned char*) signed_str;
    return str[0] == 0xe2 && str[1] == 0x80 &&
           (str[2] == 0x98 || str[2] == 0x99);
  }
  return 0;
}
|
||||
|
||||
/**********************************************************************
|
||||
* fix_quotes
|
||||
*
|
||||
* Change pairs of quotes to double quotes.
|
||||
**********************************************************************/
|
||||
|
||||
void fix_quotes( //make double quotes
|
||||
char *string, //string to fix
|
||||
WERD_CHOICE *choice, //choice to fix
|
||||
WERD *word, //word to do //char choices
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
char *ptr; //string ptr
|
||||
char *str = (char *) choice->string().string();//string ptr
|
||||
int i;
|
||||
int offset;
|
||||
//blobs
|
||||
PBLOB_IT blob_it = word->blob_list ();
|
||||
//choices
|
||||
@ -1105,12 +1129,20 @@ void fix_quotes( //make double quotes
|
||||
BLOB_CHOICE_IT it1; //first choices
|
||||
BLOB_CHOICE_IT it2; //second choices
|
||||
|
||||
for (ptr = string;
|
||||
*ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
|
||||
if ((*ptr == '\'' || *ptr == '`')
|
||||
&& (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) {
|
||||
*ptr = '"'; //turn to double
|
||||
strcpy (ptr + 1, ptr + 2); //shuffle up
|
||||
for (i = 0, offset = 0; str[offset] != '\0';
|
||||
offset += choice->lengths()[i++],
|
||||
blob_it.forward (), choice_it.forward ()) {
|
||||
if (str[offset + choice->lengths()[i]] != '\0' &&
|
||||
is_simple_quote(str + offset, choice->lengths()[i]) &&
|
||||
is_simple_quote(str + offset + choice->lengths()[i],
|
||||
choice->lengths()[i + 1])) {
|
||||
str[offset] = '"'; //turn to double
|
||||
strcpy (str + offset + 1,
|
||||
str + offset + choice->lengths()[i] +
|
||||
choice->lengths()[i + 1]); //shuffle up
|
||||
choice->lengths()[i] = 1;
|
||||
strcpy ((char*) choice->lengths().string() + i + 1,
|
||||
choice->lengths().string() + i + 2);
|
||||
merge_blobs (blob_it.data (), blob_it.data_relative (1));
|
||||
blob_it.forward ();
|
||||
delete blob_it.extract (); //get rid of spare
|
||||
@ -1138,12 +1170,13 @@ void fix_quotes( //make double quotes
|
||||
* Change pairs of hyphens to a single hyphen if the bounding boxes touch
|
||||
* Typically a long dash which has been segmented.
|
||||
**********************************************************************/
|
||||
|
||||
void fix_hyphens( //crunch double hyphens
|
||||
char *string, //string to fix
|
||||
WERD_CHOICE *choice, //choice to fix
|
||||
WERD *word, //word to do //char choices
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
char *ptr; //string ptr
|
||||
char *str = (char *) choice->string().string();//string ptr
|
||||
int i;
|
||||
int offset;
|
||||
//blobs
|
||||
PBLOB_IT blob_it = word->blob_list ();
|
||||
//choices
|
||||
@ -1151,14 +1184,20 @@ void fix_hyphens( //crunch double hyphens
|
||||
BLOB_CHOICE_IT it1; //first choices
|
||||
BLOB_CHOICE_IT it2; //second choices
|
||||
|
||||
for (ptr = string;
|
||||
*ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
|
||||
if ((*ptr == '-' || *ptr == '~') &&
|
||||
(*(ptr + 1) == '-' || *(ptr + 1) == '~') &&
|
||||
for (i = 0, offset = 0; str[offset] != '\0';
|
||||
offset += choice->lengths()[i++],
|
||||
blob_it.forward (), choice_it.forward ()) {
|
||||
if ((str[offset] == '-' || str[offset] == '~') &&
|
||||
(str[offset + choice->lengths()[i]] == '-' ||
|
||||
str[offset + choice->lengths()[i]] == '~') &&
|
||||
(blob_it.data ()->bounding_box ().right () >=
|
||||
blob_it.data_relative (1)->bounding_box ().left ())) {
|
||||
*ptr = '-'; //turn to single hyphen
|
||||
strcpy (ptr + 1, ptr + 2); //shuffle up
|
||||
str[offset] = '-'; //turn to single hyphen
|
||||
strcpy (str + offset + choice->lengths()[i],
|
||||
str + offset + choice->lengths()[i] +
|
||||
choice->lengths()[i + 1]); //shuffle up
|
||||
strcpy ((char*) choice->lengths().string() + i + 1,
|
||||
choice->lengths().string() + i + 2);
|
||||
merge_blobs (blob_it.data (), blob_it.data_relative (1));
|
||||
blob_it.forward ();
|
||||
delete blob_it.extract (); //get rid of spare
|
||||
@ -1249,11 +1288,9 @@ void choice_dump_tester( //dump chars in word
|
||||
it.set_to_list (ratings);
|
||||
for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
|
||||
blob_choice = it.data ();
|
||||
if ((blob_choice->char_class () >= '!') &&
|
||||
(blob_choice->char_class () <= '~'))
|
||||
fprintf (choice_file, "\t%c\t%f\t%f",
|
||||
blob_choice->char_class (),
|
||||
blob_choice->rating (), blob_choice->certainty ());
|
||||
fprintf (choice_file, "\t%s\t%f\t%f",
|
||||
blob_choice->unichar (),
|
||||
blob_choice->rating (), blob_choice->certainty ());
|
||||
}
|
||||
fprintf (choice_file, "\n");
|
||||
}
|
||||
@ -1290,33 +1327,37 @@ WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) {
|
||||
}
|
||||
|
||||
|
||||
ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
|
||||
ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
|
||||
const char *lengths) {
|
||||
int i = 0;
|
||||
int offset = 0;
|
||||
int leading_punct_count;
|
||||
int upper_count = 0;
|
||||
int hyphen_pos = -1;
|
||||
ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
|
||||
|
||||
if (strlen (s) > 20)
|
||||
if (strlen (lengths) > 20)
|
||||
return word_type;
|
||||
|
||||
/* Single Leading punctuation char*/
|
||||
|
||||
if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i])))
|
||||
i++;
|
||||
if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))
|
||||
offset += lengths[i++];
|
||||
leading_punct_count = i;
|
||||
|
||||
/* Initial cap */
|
||||
while (isupper (s[i])) {
|
||||
i++;
|
||||
while ((s[offset] != '\0') &&
|
||||
unicharset.get_isupper(s + offset, lengths[i])) {
|
||||
offset += lengths[i++];
|
||||
upper_count++;
|
||||
}
|
||||
if (upper_count > 1)
|
||||
word_type = AC_UPPER_CASE;
|
||||
else {
|
||||
/* Lower case word, possibly with an initial cap */
|
||||
while (islower (s[i])) {
|
||||
i++;
|
||||
while ((s[offset] != '\0') &&
|
||||
unicharset.get_islower (s + offset, lengths[i])) {
|
||||
offset += lengths[i++];
|
||||
}
|
||||
if (i - leading_punct_count < quality_min_initial_alphas_reqd)
|
||||
goto not_a_word;
|
||||
@ -1324,11 +1365,13 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
|
||||
Allow a single hyphen in a lower case word
|
||||
- dont trust upper case - I've seen several cases of "H" -> "I-I"
|
||||
*/
|
||||
if (s[i] == '-') {
|
||||
hyphen_pos = i++;
|
||||
if (s[i] != '\0') {
|
||||
while (islower (s[i])) {
|
||||
i++;
|
||||
if (lengths[i] == 1 && s[offset] == '-') {
|
||||
hyphen_pos = i;
|
||||
offset += lengths[i++];
|
||||
if (s[offset] != '\0') {
|
||||
while ((s[offset] != '\0') &&
|
||||
unicharset.get_islower(s + offset, lengths[i])) {
|
||||
offset += lengths[i++];
|
||||
}
|
||||
if (i < hyphen_pos + 3)
|
||||
goto not_a_word;
|
||||
@ -1336,8 +1379,11 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
|
||||
}
|
||||
else {
|
||||
/* Allow "'s" in NON hyphenated lower case words */
|
||||
if ((s[i] == '\'') && (s[i + 1] == 's'))
|
||||
i += 2;
|
||||
if (lengths[i] == 1 && (s[offset] == '\'') &&
|
||||
lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
|
||||
offset += lengths[i++];
|
||||
offset += lengths[i++];
|
||||
}
|
||||
}
|
||||
if (upper_count > 0)
|
||||
word_type = AC_INITIAL_CAP;
|
||||
@ -1346,13 +1392,15 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
|
||||
}
|
||||
|
||||
/* Up to two different, constrained trailing punctuation chars */
|
||||
if ((s[i] != '\0') && (STRING (chs_trailing_punct1).contains (s[i])))
|
||||
i++;
|
||||
if ((s[i] != '\0') &&
|
||||
(s[i - 1] != s[i]) && (STRING (chs_trailing_punct2).contains (s[i])))
|
||||
i++;
|
||||
if (lengths[i] == 1 && (s[offset] != '\0') &&
|
||||
(STRING (chs_trailing_punct1).contains (s[offset])))
|
||||
offset += lengths[i++];
|
||||
if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&
|
||||
(s[offset - lengths[i - 1]] != s[offset]) &&
|
||||
(STRING (chs_trailing_punct2).contains (s[offset])))
|
||||
offset += lengths[i++];
|
||||
|
||||
if (s[i] != '\0')
|
||||
if (s[offset] != '\0')
|
||||
word_type = AC_UNACCEPTABLE;
|
||||
|
||||
not_a_word:
|
||||
@ -1360,17 +1408,26 @@ ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
|
||||
if (word_type == AC_UNACCEPTABLE) {
|
||||
/* Look for abbreviation string */
|
||||
i = 0;
|
||||
if (isupper (s[0])) {
|
||||
offset = 0;
|
||||
if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {
|
||||
word_type = AC_UC_ABBREV;
|
||||
while ((s[i] != '\0') && isupper (s[i]) && (s[i + 1] == '.'))
|
||||
i += 2;
|
||||
while ((s[offset] != '\0') &&
|
||||
unicharset.get_isupper(s + offset, lengths[i]) &&
|
||||
(lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
|
||||
offset += lengths[i++];
|
||||
offset += lengths[i++];
|
||||
}
|
||||
}
|
||||
else if (islower (s[0])) {
|
||||
else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {
|
||||
word_type = AC_LC_ABBREV;
|
||||
while ((s[i] != '\0') && islower (s[i]) && (s[i + 1] == '.'))
|
||||
i += 2;
|
||||
while ((s[offset] != '\0') &&
|
||||
unicharset.get_islower(s + offset, lengths[i]) &&
|
||||
(lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {
|
||||
offset += lengths[i++];
|
||||
offset += lengths[i++];
|
||||
}
|
||||
}
|
||||
if (s[i] != '\0')
|
||||
if (s[offset] != '\0')
|
||||
word_type = AC_UNACCEPTABLE;
|
||||
}
|
||||
|
||||
@ -1478,7 +1535,8 @@ void set_word_fonts( //good chars in word
|
||||
WERD_RES *word, //word to adapt to //detailed results
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
INT32 index; //char index
|
||||
char choice_char; //char from word
|
||||
INT32 offset; //char offset
|
||||
char choice_char[UNICHAR_LEN + 1]; //char from word
|
||||
INT8 config; //font of char
|
||||
//character iterator
|
||||
BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
|
||||
@ -1517,16 +1575,19 @@ void set_word_fonts( //good chars in word
|
||||
|
||||
word->italic = 0;
|
||||
word->bold = 0;
|
||||
for (char_it.mark_cycle_pt (), index = 0;
|
||||
!char_it.cycled_list (); char_it.forward (), index++) {
|
||||
choice_char = word->best_choice->string ()[index];
|
||||
for (char_it.mark_cycle_pt (), index = 0, offset = 0;
|
||||
!char_it.cycled_list (); char_it.forward (),
|
||||
offset += word->best_choice->lengths()[index++]) {
|
||||
strncpy(choice_char, word->best_choice->string ().string() + offset,
|
||||
word->best_choice->lengths()[index]);
|
||||
choice_char[word->best_choice->lengths()[index]] = '\0';
|
||||
choice_it.set_to_list (char_it.data ());
|
||||
for (choice_it.mark_cycle_pt (); !choice_it.cycled_list ();
|
||||
choice_it.forward ()) {
|
||||
if (choice_it.data ()->char_class () == choice_char) {
|
||||
choice_it.forward ()) {
|
||||
if (strcmp(choice_it.data ()->unichar (), choice_char) == 0) {
|
||||
config = choice_it.data ()->config ();
|
||||
if (tessedit_debug_fonts)
|
||||
tprintf ("%c(%d=%d%c%c)",
|
||||
tprintf ("%s(%d=%d%c%c)",
|
||||
choice_char, config, (config & 31) >> 2,
|
||||
config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
|
||||
if (config != -1) {
|
||||
|
@ -46,66 +46,8 @@
|
||||
typedef void (*TESS_TESTER) (TBLOB *, BOOL8, char *, INT32, LIST);
|
||||
typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *, void *, TEXTROW *);
|
||||
|
||||
extern "C"
|
||||
{
|
||||
/*
|
||||
int start_recog( //Real main in C
|
||||
int argc,
|
||||
char *argv[]);
|
||||
void program_editup2( //afterforking part
|
||||
int argc,
|
||||
char** argv);
|
||||
|
||||
int end_recog( //Real main in C
|
||||
int argc,
|
||||
char *argv[]);
|
||||
void set_interactive_pass();
|
||||
void set_pass1();
|
||||
void set_pass2();
|
||||
//ARRAY cc_recog(TWERD*,TESS_CHOICE*,TESS_CHOICE*,TESS_TESTER,
|
||||
// TESS_TESTER);*/
|
||||
//void wo_learn_blob(TBLOB*,TEXTROW*,char*,INT32);
|
||||
//LIST AdaptiveClassifier(TBLOB*,TBLOB*,TEXTROW*);
|
||||
//void LearnBlob(TBLOB*,TEXTROW*,char*,INT32);
|
||||
//TWERD *newword();
|
||||
//TBLOB *newblob();
|
||||
//TESSLINE *newoutline();
|
||||
//EDGEPT *newedgept();
|
||||
//void oldedgept(EDGEPT*);
|
||||
//void destroy_nodes(void*,void (*)(void*));
|
||||
//TESS_LIST *append_choice(TESS_LIST*,char*,double,double,char);
|
||||
//void fix_quotes (char*);
|
||||
//void record_certainty(double,int);
|
||||
//int AcceptableResult(A_CHOICE*,A_CHOICE*);
|
||||
//int AdaptableWord(TWERD*,const char*,const char*);
|
||||
//void delete_word(TWERD*);
|
||||
//void free_blob(TBLOB*);
|
||||
//void add_document_word(A_CHOICE*);
|
||||
//void AdaptToWord(TWERD*,TEXTROW*,const char*,const char*,const char*);
|
||||
//void SaveBadWord(const char*,double);
|
||||
//void free_choice(TESS_CHOICE*);
|
||||
//TWERD *newword();
|
||||
//TBLOB *newblob();
|
||||
//void free_blob( //free a blob
|
||||
// TBLOB *blob); //blob to free
|
||||
|
||||
//int dict_word( const char* );
|
||||
|
||||
//extern int tess_cn_matching;
|
||||
//extern int tess_bn_matching;
|
||||
//extern int last_word_on_line;
|
||||
extern TEXTROW normalized_row;
|
||||
//extern TESS_MATCHER blob_matchers[];
|
||||
//extern FILE *rawfile;
|
||||
//extern FILE *textfile;
|
||||
//extern int character_count;
|
||||
//extern int word_count;
|
||||
//extern int enable_assoc;
|
||||
//extern int chop_enable;
|
||||
//extern int permute_only_top;
|
||||
extern int display_ratings;
|
||||
|
||||
};
|
||||
extern TEXTROW normalized_row;
|
||||
extern int display_ratings;
|
||||
|
||||
#if 0
|
||||
#define strsave(s) \
|
||||
|
@ -23,8 +23,8 @@
|
||||
#include "tessopt.h"
|
||||
#include "notdll.h" //must be last include
|
||||
|
||||
int optind;
|
||||
char *optarg;
|
||||
int tessoptind;
|
||||
char *tessoptarg;
|
||||
|
||||
/**********************************************************************
|
||||
* tessopt
|
||||
@ -37,22 +37,22 @@ INT32 argc, //arg count
|
||||
char *argv[], //args
|
||||
const char *arglist //string of arg chars
|
||||
) {
|
||||
char *arg; //arg char
|
||||
const char *arg; //arg char
|
||||
|
||||
if (optind == 0)
|
||||
optind = 1;
|
||||
if (optind < argc && argv[optind][0] == '-') {
|
||||
arg = strchr (arglist, argv[optind][1]);
|
||||
if (tessoptind == 0)
|
||||
tessoptind = 1;
|
||||
if (tessoptind < argc && argv[tessoptind][0] == '-') {
|
||||
arg = strchr (arglist, argv[tessoptind][1]);
|
||||
if (arg == NULL || *arg == ':')
|
||||
return '?'; //dud option
|
||||
optind++;
|
||||
optarg = argv[optind];
|
||||
tessoptind++;
|
||||
tessoptarg = argv[tessoptind];
|
||||
if (arg[1] == ':') {
|
||||
if (argv[optind - 1][2] != '\0')
|
||||
if (argv[tessoptind - 1][2] != '\0')
|
||||
//immediately after
|
||||
optarg = argv[optind - 1] + 2;
|
||||
tessoptarg = argv[tessoptind - 1] + 2;
|
||||
else
|
||||
optind++;
|
||||
tessoptind++;
|
||||
}
|
||||
return *arg;
|
||||
}
|
||||
|
@ -20,8 +20,8 @@
|
||||
#include "host.h"
|
||||
#include "notdll.h" //must be last include
|
||||
|
||||
extern int optind;
|
||||
extern char *optarg;
|
||||
extern int tessoptind;
|
||||
extern char *tessoptarg;
|
||||
|
||||
int tessopt ( //parse args
|
||||
INT32 argc, //arg count
|
||||
|
@ -28,7 +28,7 @@
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
#define EXTERN extern "C"
|
||||
#define EXTERN extern
|
||||
#else
|
||||
#define EXTERN extern
|
||||
#endif
|
||||
|
@ -39,14 +39,12 @@
|
||||
extern TBLOB *pageblobs; /*first blob on page */
|
||||
extern TEXTBLOCK *pageblocks; /*first block on page */
|
||||
/*class definitions */
|
||||
extern char classes[CLASSIZE][CLASSLENGTH];
|
||||
/* extern char classes[CLASSIZE][CLASSLENGTH]; */
|
||||
extern int resolution; /*scanner res in dpi */
|
||||
extern int acts[MAXPROC]; /*action flags */
|
||||
extern int debugs[MAXPROC]; /*debug flags */
|
||||
extern int plots[MAXPROC]; /*plot flags */
|
||||
extern int corners[4]; /*corners of scan window */
|
||||
extern int optind; /*option index */
|
||||
extern char *optarg; /*option argument */
|
||||
/*image file name */
|
||||
extern char imagefile[FILENAMESIZE];
|
||||
/* main directory */
|
||||
|
@ -37,6 +37,7 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include "unichar.h"
|
||||
|
||||
#define MAXNAMESIZE 80
|
||||
#define MAX_NUM_SAMPLES 10000
|
||||
@ -219,21 +220,34 @@ int main (
|
||||
ParseArguments (argc, argv);
|
||||
while ((PageName = GetNextFilename()) != NULL)
|
||||
{
|
||||
printf ("\nReading %s ...", PageName);
|
||||
printf ("Reading %s ...\n", PageName);
|
||||
TrainingPage = Efopen (PageName, "r");
|
||||
ReadTrainingSamples (TrainingPage, &CharList);
|
||||
fclose (TrainingPage);
|
||||
//WriteTrainingSamples (Directory, CharList);
|
||||
}
|
||||
printf("Clustering ...\n");
|
||||
pCharList = CharList;
|
||||
iterate(pCharList)
|
||||
{
|
||||
//Cluster
|
||||
CharSample = (LABELEDLIST) first_node (pCharList);
|
||||
printf ("\nClustering %s ...", CharSample->Label);
|
||||
Clusterer = SetUpForClustering(CharSample);
|
||||
ProtoList = ClusterSamples(Clusterer, &Config);
|
||||
AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
|
||||
//Cluster
|
||||
CharSample = (LABELEDLIST) first_node (pCharList);
|
||||
//printf ("\nClustering %s ...", CharSample->Label);
|
||||
Clusterer = SetUpForClustering(CharSample);
|
||||
float SavedMinSamples = Config.MinSamples;
|
||||
while (Config.MinSamples > 0.001) {
|
||||
ProtoList = ClusterSamples(Clusterer, &Config);
|
||||
if (NumberOfProtos(ProtoList, 1, 0) > 0)
|
||||
break;
|
||||
else {
|
||||
Config.MinSamples *= 0.95;
|
||||
printf("0 significant protos for %s."
|
||||
" Retrying clustering with MinSamples = %f%%\n",
|
||||
CharSample->Label, Config.MinSamples);
|
||||
}
|
||||
}
|
||||
Config.MinSamples = SavedMinSamples;
|
||||
AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
|
||||
}
|
||||
FreeTrainingSamples (CharList);
|
||||
WriteNormProtos (Directory, NormProtoList, Clusterer);
|
||||
@ -262,7 +276,7 @@ void ParseArguments(
|
||||
** ShowSignificantProtos flag controlling proto display
|
||||
** ShowInsignificantProtos flag controlling proto display
|
||||
** Config current clustering parameters
|
||||
** optarg, optind defined by tessopt sys call
|
||||
** tessoptarg, tessoptind defined by tessopt sys call
|
||||
** Argc, Argv global copies of argc and argv
|
||||
** Operation:
|
||||
** This routine parses the command line arguments that were
|
||||
@ -287,7 +301,6 @@ void ParseArguments(
|
||||
int Option;
|
||||
int ParametersRead;
|
||||
BOOL8 Error;
|
||||
extern char *optarg;
|
||||
|
||||
Error = FALSE;
|
||||
Argc = argc;
|
||||
@ -297,48 +310,48 @@ void ParseArguments(
|
||||
switch ( Option )
|
||||
{
|
||||
case 'n':
|
||||
sscanf(optarg,"%d", &ParametersRead);
|
||||
sscanf(tessoptarg,"%d", &ParametersRead);
|
||||
ShowInsignificantProtos = ParametersRead;
|
||||
break;
|
||||
case 'p':
|
||||
sscanf(optarg,"%d", &ParametersRead);
|
||||
sscanf(tessoptarg,"%d", &ParametersRead);
|
||||
ShowSignificantProtos = ParametersRead;
|
||||
break;
|
||||
case 'd':
|
||||
ShowAllSamples = FALSE;
|
||||
break;
|
||||
case 'C':
|
||||
ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) );
|
||||
ParametersRead = sscanf( tessoptarg, "%lf", &(Config.Confidence) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.Confidence > 1 ) Config.Confidence = 1;
|
||||
else if ( Config.Confidence < 0 ) Config.Confidence = 0;
|
||||
break;
|
||||
case 'I':
|
||||
ParametersRead = sscanf( optarg, "%f", &(Config.Independence) );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &(Config.Independence) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.Independence > 1 ) Config.Independence = 1;
|
||||
else if ( Config.Independence < 0 ) Config.Independence = 0;
|
||||
break;
|
||||
case 'M':
|
||||
ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &(Config.MinSamples) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
|
||||
else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
|
||||
break;
|
||||
case 'B':
|
||||
ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &(Config.MaxIllegal) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
|
||||
else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
|
||||
break;
|
||||
case 'R':
|
||||
ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
|
||||
else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
|
||||
break;
|
||||
case 'S':
|
||||
switch ( optarg[0] )
|
||||
switch ( tessoptarg[0] )
|
||||
{
|
||||
case 's': Config.ProtoStyle = spherical; break;
|
||||
case 'e': Config.ProtoStyle = elliptical; break;
|
||||
@ -348,10 +361,10 @@ void ParseArguments(
|
||||
}
|
||||
break;
|
||||
case 'D':
|
||||
Directory = optarg;
|
||||
Directory = tessoptarg;
|
||||
break;
|
||||
case 'N':
|
||||
if (sscanf (optarg, "%d", &MaxNumSamples) != 1 ||
|
||||
if (sscanf (tessoptarg, "%d", &MaxNumSamples) != 1 ||
|
||||
MaxNumSamples <= 0)
|
||||
Error = TRUE;
|
||||
break;
|
||||
@ -375,7 +388,7 @@ char *GetNextFilename ()
|
||||
/*
|
||||
** Parameters: none
|
||||
** Globals:
|
||||
** optind defined by tessopt sys call
|
||||
** tessoptind defined by tessopt sys call
|
||||
** Argc, Argv global copies of argc and argv
|
||||
** Operation:
|
||||
** This routine returns the next command line argument. If
|
||||
@ -388,8 +401,8 @@ char *GetNextFilename ()
|
||||
*/
|
||||
|
||||
{
|
||||
if (optind < Argc)
|
||||
return (Argv [optind++]);
|
||||
if (tessoptind < Argc)
|
||||
return (Argv [tessoptind++]);
|
||||
else
|
||||
return (NULL);
|
||||
|
||||
@ -417,32 +430,32 @@ void ReadTrainingSamples (
|
||||
*/
|
||||
|
||||
{
|
||||
char CharName[MAXNAMESIZE];
|
||||
char unichar[UNICHAR_LEN + 1];
|
||||
LABELEDLIST CharSample;
|
||||
FEATURE_SET FeatureSamples;
|
||||
CHAR_DESC CharDesc;
|
||||
int Type, i;
|
||||
CHAR_DESC CharDesc;
|
||||
int Type, i;
|
||||
|
||||
while (fscanf (File, "%s %s", FontName, CharName) == 2) {
|
||||
CharSample = FindList (*TrainingSamples, CharName);
|
||||
if (CharSample == NULL) {
|
||||
CharSample = NewLabeledList (CharName);
|
||||
*TrainingSamples = push (*TrainingSamples, CharSample);
|
||||
}
|
||||
CharDesc = ReadCharDescription (File);
|
||||
Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
|
||||
FeatureSamples = FeaturesOfType(CharDesc, Type);
|
||||
for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
|
||||
FEATURE f = FeatureSamples->Features[feature];
|
||||
for (int dim =0; dim < f->Type->NumParams; ++dim)
|
||||
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
|
||||
}
|
||||
CharSample->List = push (CharSample->List, FeatureSamples);
|
||||
for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
|
||||
if (Type != i)
|
||||
FreeFeatureSet (FeaturesOfType (CharDesc, i));
|
||||
free (CharDesc);
|
||||
}
|
||||
while (fscanf (File, "%s %s", FontName, unichar) == 2) {
|
||||
CharSample = FindList (*TrainingSamples, unichar);
|
||||
if (CharSample == NULL) {
|
||||
CharSample = NewLabeledList (unichar);
|
||||
*TrainingSamples = push (*TrainingSamples, CharSample);
|
||||
}
|
||||
CharDesc = ReadCharDescription (File);
|
||||
Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
|
||||
FeatureSamples = FeaturesOfType(CharDesc, Type);
|
||||
for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
|
||||
FEATURE f = FeatureSamples->Features[feature];
|
||||
for (int dim =0; dim < f->Type->NumParams; ++dim)
|
||||
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
|
||||
}
|
||||
CharSample->List = push (CharSample->List, FeatureSamples);
|
||||
for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
|
||||
if (Type != i)
|
||||
FreeFeatureSet (FeaturesOfType (CharDesc, i));
|
||||
free (CharDesc);
|
||||
}
|
||||
} // ReadTrainingSamples
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
@ -606,7 +619,6 @@ void WriteNormProtos (
|
||||
char Filename[MAXNAMESIZE];
|
||||
LABELEDLIST LabeledProto;
|
||||
int N;
|
||||
char Label;
|
||||
|
||||
strcpy (Filename, "");
|
||||
if (Directory != NULL)
|
||||
@ -623,9 +635,17 @@ void WriteNormProtos (
|
||||
{
|
||||
LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
|
||||
N = NumberOfProtos(LabeledProto->List,
|
||||
ShowSignificantProtos, ShowInsignificantProtos);
|
||||
Label = NameToChar(LabeledProto->Label);
|
||||
fprintf(File, "\n%c %d\n", Label, N);
|
||||
ShowSignificantProtos, ShowInsignificantProtos);
|
||||
if (N < 1) {
|
||||
printf ("\nError! Not enough protos for %s: %d protos"
|
||||
" (%d significant protos"
|
||||
", %d insignificant protos)\n",
|
||||
LabeledProto->Label, N,
|
||||
NumberOfProtos(LabeledProto->List, 1, 0),
|
||||
NumberOfProtos(LabeledProto->List, 0, 1));
|
||||
exit(1);
|
||||
}
|
||||
fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
|
||||
WriteProtos(File, Clusterer->SampleSize, LabeledProto->List,
|
||||
ShowSignificantProtos, ShowInsignificantProtos);
|
||||
}
|
||||
|
@ -44,6 +44,9 @@
|
||||
#include "intproto.h"
|
||||
#include "variables.h"
|
||||
#include "freelist.h"
|
||||
#include "efio.h"
|
||||
#include "danerror.h"
|
||||
#include "globals.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
@ -73,7 +76,6 @@ typedef MERGE_CLASS_NODE* MERGE_CLASS;
|
||||
|
||||
#define round(x,frag)(floor(x/frag+.5)*frag)
|
||||
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Public Function Prototypes
|
||||
----------------------------------------------------------------------------**/
|
||||
@ -164,21 +166,7 @@ void Normalize (
|
||||
void SetUpForFloat2Int(
|
||||
LIST LabeledClassList);
|
||||
|
||||
void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) {
|
||||
FILE* fp = Efopen(filename, "wb");
|
||||
/* then write out each class */
|
||||
for (int i = 0; i < NumClassesIn (Templates); i++) {
|
||||
int MaxLength = 0;
|
||||
INT_CLASS Class = ClassForIndex (Templates, i);
|
||||
for (int ConfigId = 0; ConfigId < NumIntConfigsIn (Class); ConfigId++) {
|
||||
if (LengthForConfigId (Class, ConfigId) > MaxLength)
|
||||
MaxLength = LengthForConfigId (Class, ConfigId);
|
||||
}
|
||||
fprintf(fp, "%c %d\n", ClassIdForIndex(Templates, i), MaxLength);
|
||||
}
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void WritePFFMTable(INT_TEMPLATES Templates, const char* filename);
|
||||
|
||||
//--------------Global Data Definitions and Declarations--------------
|
||||
static char FontName[MAXNAMESIZE];
|
||||
@ -200,6 +188,9 @@ static CLUSTERCONFIG Config =
|
||||
|
||||
static FLOAT32 RoundingAccuracy = 0.0;
|
||||
|
||||
// The unicharset used during mftraining
|
||||
static UNICHARSET unicharset_mftraining;
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
Public Code
|
||||
-----------------------------------------------------------------------------*/
|
||||
@ -260,12 +251,17 @@ int main (
|
||||
LIST pCharList, pProtoList;
|
||||
char Filename[MAXNAMESIZE];
|
||||
|
||||
// Clean the unichar set
|
||||
unicharset_mftraining.clear();
|
||||
// Space character needed to represent NIL classification
|
||||
unicharset_mftraining.unichar_insert(" ");
|
||||
|
||||
ParseArguments (argc, argv);
|
||||
InitFastTrainerVars ();
|
||||
InitSubfeatureVars ();
|
||||
while ((PageName = GetNextFilename()) != NULL)
|
||||
{
|
||||
printf ("\nReading %s ...", PageName);
|
||||
printf ("Reading %s ...\n", PageName);
|
||||
TrainingPage = Efopen (PageName, "r");
|
||||
CharList = ReadTrainingSamples (TrainingPage);
|
||||
fclose (TrainingPage);
|
||||
@ -275,7 +271,7 @@ int main (
|
||||
{
|
||||
//Cluster
|
||||
CharSample = (LABELEDLIST) first_node (pCharList);
|
||||
printf ("\nClustering %s ...", CharSample->Label);
|
||||
// printf ("\nClustering %s ...", CharSample->Label);
|
||||
Clusterer = SetUpForClustering(CharSample);
|
||||
ProtoList = ClusterSamples(Clusterer, &Config);
|
||||
//WriteClusteredTrainingSamples (Directory, ProtoList, Clusterer, CharSample);
|
||||
@ -320,14 +316,13 @@ int main (
|
||||
FreeProtoList (&ProtoList);
|
||||
}
|
||||
FreeTrainingSamples (CharList);
|
||||
printf ("\n");
|
||||
}
|
||||
//WriteMergedTrainingSamples(Directory,ClassList);
|
||||
WriteMicrofeat(Directory, ClassList);
|
||||
InitIntProtoVars ();
|
||||
InitPrototypes ();
|
||||
SetUpForFloat2Int(ClassList);
|
||||
IntTemplates = CreateIntTemplates(TrainingData);
|
||||
IntTemplates = CreateIntTemplates(TrainingData, unicharset_mftraining);
|
||||
strcpy (Filename, "");
|
||||
if (Directory != NULL)
|
||||
{
|
||||
@ -340,11 +335,18 @@ int main (
|
||||
#else
|
||||
OutFile = Efopen (Filename, "wb");
|
||||
#endif
|
||||
WriteIntTemplates(OutFile, IntTemplates);
|
||||
WriteIntTemplates(OutFile, IntTemplates, unicharset_mftraining);
|
||||
fclose (OutFile);
|
||||
// Now create pffmtable.
|
||||
WritePFFMTable(IntTemplates, "pffmtable");
|
||||
printf ("\nDone!\n"); /**/
|
||||
strcpy (Filename, "");
|
||||
if (Directory != NULL)
|
||||
{
|
||||
strcat (Filename, Directory);
|
||||
strcat (Filename, "/");
|
||||
}
|
||||
strcat (Filename, "pffmtable");
|
||||
// Now create pffmtable.
|
||||
WritePFFMTable(IntTemplates, Filename);
|
||||
printf ("Done!\n"); /**/
|
||||
FreeLabeledClassList (ClassList);
|
||||
return 0;
|
||||
} /* main */
|
||||
@ -367,7 +369,7 @@ char **argv)
|
||||
** ShowSignificantProtos flag controlling proto display
|
||||
** ShowInsignificantProtos flag controlling proto display
|
||||
** Config current clustering parameters
|
||||
** optarg, optind defined by tessopt sys call
|
||||
** tessoptarg, tessoptind defined by tessopt sys call
|
||||
** Argc, Argv global copies of argc and argv
|
||||
** Operation:
|
||||
** This routine parses the command line arguments that were
|
||||
@ -392,7 +394,6 @@ char **argv)
|
||||
int Option;
|
||||
int ParametersRead;
|
||||
BOOL8 Error;
|
||||
extern char *optarg;
|
||||
|
||||
Error = FALSE;
|
||||
Argc = argc;
|
||||
@ -411,37 +412,37 @@ char **argv)
|
||||
ShowAllSamples = FALSE;
|
||||
break;
|
||||
case 'C':
|
||||
ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) );
|
||||
ParametersRead = sscanf( tessoptarg, "%lf", &(Config.Confidence) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.Confidence > 1 ) Config.Confidence = 1;
|
||||
else if ( Config.Confidence < 0 ) Config.Confidence = 0;
|
||||
break;
|
||||
case 'I':
|
||||
ParametersRead = sscanf( optarg, "%f", &(Config.Independence) );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &(Config.Independence) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.Independence > 1 ) Config.Independence = 1;
|
||||
else if ( Config.Independence < 0 ) Config.Independence = 0;
|
||||
break;
|
||||
case 'M':
|
||||
ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &(Config.MinSamples) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
|
||||
else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
|
||||
break;
|
||||
case 'B':
|
||||
ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &(Config.MaxIllegal) );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
|
||||
else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
|
||||
break;
|
||||
case 'R':
|
||||
ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy );
|
||||
ParametersRead = sscanf( tessoptarg, "%f", &RoundingAccuracy );
|
||||
if ( ParametersRead != 1 ) Error = TRUE;
|
||||
else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
|
||||
else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
|
||||
break;
|
||||
case 'S':
|
||||
switch ( optarg[0] )
|
||||
switch ( tessoptarg[0] )
|
||||
{
|
||||
case 's': Config.ProtoStyle = spherical; break;
|
||||
case 'e': Config.ProtoStyle = elliptical; break;
|
||||
@ -451,10 +452,10 @@ char **argv)
|
||||
}
|
||||
break;
|
||||
case 'D':
|
||||
Directory = optarg;
|
||||
Directory = tessoptarg;
|
||||
break;
|
||||
case 'N':
|
||||
if (sscanf (optarg, "%d", &MaxNumSamples) != 1 ||
|
||||
if (sscanf (tessoptarg, "%d", &MaxNumSamples) != 1 ||
|
||||
MaxNumSamples <= 0)
|
||||
Error = TRUE;
|
||||
break;
|
||||
@ -478,7 +479,7 @@ char *GetNextFilename ()
|
||||
/*
|
||||
** Parameters: none
|
||||
** Globals:
|
||||
** optind defined by tessopt sys call
|
||||
** tessoptind defined by tessopt sys call
|
||||
** Argc, Argv global copies of argc and argv
|
||||
** Operation:
|
||||
** This routine returns the next command line argument. If
|
||||
@ -491,8 +492,8 @@ char *GetNextFilename ()
|
||||
*/
|
||||
|
||||
{
|
||||
if (optind < Argc)
|
||||
return (Argv [optind++]);
|
||||
if (tessoptind < Argc)
|
||||
return (Argv [tessoptind++]);
|
||||
else
|
||||
return (NULL);
|
||||
|
||||
@ -519,33 +520,41 @@ LIST ReadTrainingSamples (
|
||||
*/
|
||||
|
||||
{
|
||||
char CharName[MAXNAMESIZE];
|
||||
LABELEDLIST CharSample;
|
||||
FEATURE_SET FeatureSamples;
|
||||
char unichar[UNICHAR_LEN + 1];
|
||||
LABELEDLIST CharSample;
|
||||
FEATURE_SET FeatureSamples;
|
||||
LIST TrainingSamples = NIL;
|
||||
CHAR_DESC CharDesc;
|
||||
int Type, i;
|
||||
|
||||
while (fscanf (File, "%s %s", FontName, CharName) == 2) {
|
||||
CharSample = FindList (TrainingSamples, CharName);
|
||||
while (fscanf (File, "%s %s", FontName, unichar) == 2) {
|
||||
if (!unicharset_mftraining.contains_unichar(unichar)) {
|
||||
unicharset_mftraining.unichar_insert(unichar);
|
||||
if (unicharset_mftraining.size() > MAX_NUM_CLASSES) {
|
||||
cprintf("Error: Size of unicharset of mftraining is "
|
||||
"greater than MAX_NUM_CLASSES\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
CharSample = FindList (TrainingSamples, unichar);
|
||||
if (CharSample == NULL) {
|
||||
CharSample = NewLabeledList (CharName);
|
||||
CharSample = NewLabeledList (unichar);
|
||||
TrainingSamples = push (TrainingSamples, CharSample);
|
||||
}
|
||||
CharDesc = ReadCharDescription (File);
|
||||
Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
|
||||
FeatureSamples = FeaturesOfType(CharDesc, Type);
|
||||
for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
|
||||
FEATURE f = FeatureSamples->Features[feature];
|
||||
for (int dim =0; dim < f->Type->NumParams; ++dim)
|
||||
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
|
||||
}
|
||||
for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
|
||||
FEATURE f = FeatureSamples->Features[feature];
|
||||
for (int dim =0; dim < f->Type->NumParams; ++dim)
|
||||
f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
|
||||
}
|
||||
CharSample->List = push (CharSample->List, FeatureSamples);
|
||||
for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
|
||||
if (Type != i)
|
||||
FreeFeatureSet (FeaturesOfType (CharDesc, i));
|
||||
if (Type != i)
|
||||
FreeFeatureSet (FeaturesOfType (CharDesc, i));
|
||||
free (CharDesc);
|
||||
}
|
||||
}
|
||||
return (TrainingSamples);
|
||||
|
||||
} /* ReadTrainingSamples */
|
||||
@ -843,7 +852,7 @@ void WriteProtos(
|
||||
int i;
|
||||
PROTO Proto;
|
||||
|
||||
fprintf(File, "%c\n", NameToChar(MergeClass->Label));
|
||||
fprintf(File, "%s\n", MergeClass->Label);
|
||||
fprintf(File, "%d\n", NumProtosIn(MergeClass->Class));
|
||||
for(i=0; i < NumProtosIn(MergeClass->Class); i++)
|
||||
{
|
||||
@ -900,7 +909,7 @@ void FreeTrainingSamples (
|
||||
LIST FeatureList;
|
||||
|
||||
|
||||
printf ("\nFreeTrainingSamples...");
|
||||
// printf ("FreeTrainingSamples...\n");
|
||||
iterate (CharList) /* iterate thru all of the fonts */
|
||||
{
|
||||
CharSample = (LABELEDLIST) first_node (CharList);
|
||||
@ -1161,12 +1170,13 @@ void SetUpForFloat2Int(
|
||||
BIT_VECTOR NewConfig;
|
||||
BIT_VECTOR OldConfig;
|
||||
|
||||
printf("Float2Int ...");
|
||||
// printf("Float2Int ...\n");
|
||||
|
||||
iterate(LabeledClassList)
|
||||
{
|
||||
MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
|
||||
Class = &TrainingData[NameToChar(MergeClass->Label)];
|
||||
Class = &TrainingData[unicharset_mftraining.unichar_to_id(
|
||||
MergeClass->Label)];
|
||||
NumProtos = NumProtosIn(MergeClass->Class);
|
||||
NumConfigs = NumConfigsIn(MergeClass->Class);
|
||||
|
||||
@ -1204,3 +1214,20 @@ void SetUpForFloat2Int(
|
||||
}
|
||||
}
|
||||
} // SetUpForFloat2Int
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) {
|
||||
FILE* fp = Efopen(filename, "wb");
|
||||
/* then write out each class */
|
||||
for (int i = 0; i < NumClassesIn (Templates); i++) {
|
||||
int MaxLength = 0;
|
||||
INT_CLASS Class = ClassForIndex (Templates, i);
|
||||
for (int ConfigId = 0; ConfigId < NumIntConfigsIn (Class); ConfigId++) {
|
||||
if (LengthForConfigId (Class, ConfigId) > MaxLength)
|
||||
MaxLength = LengthForConfigId (Class, ConfigId);
|
||||
}
|
||||
fprintf(fp, "%s %d\n", unicharset_mftraining.id_to_unichar(
|
||||
ClassIdForIndex(Templates, i)), MaxLength);
|
||||
}
|
||||
fclose(fp);
|
||||
} // WritePFFMTable
|
||||
|
@ -52,8 +52,8 @@ int main(int argc, char** argv) {
|
||||
while ((option = tessopt(argc, argv, "D" )) != EOF) {
|
||||
switch (option) {
|
||||
case 'D':
|
||||
output_directory = optarg;
|
||||
++optind;
|
||||
output_directory = tessoptarg;
|
||||
++tessoptind;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -64,12 +64,12 @@ int main(int argc, char** argv) {
|
||||
unicharset_file_name += kUnicharsetFileName;
|
||||
|
||||
// Load box files
|
||||
for (; optind < argc; ++optind) {
|
||||
printf("Extracting unicharset from %s\n", argv[optind]);
|
||||
for (; tessoptind < argc; ++tessoptind) {
|
||||
printf("Extracting unicharset from %s\n", argv[tessoptind]);
|
||||
|
||||
FILE* box_file = fopen(argv[optind], "r");
|
||||
FILE* box_file = fopen(argv[tessoptind], "r");
|
||||
if (box_file == NULL) {
|
||||
printf("Cannot open box file %s\n", argv[optind]);
|
||||
printf("Cannot open box file %s\n", argv[tessoptind]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user