2007-03-08 04:03:40 +08:00
|
|
|
|
/* -*-C-*-
|
|
|
|
|
********************************************************************************
|
|
|
|
|
*
|
|
|
|
|
* File: wordclass.c (Formerly wordclass.c)
|
|
|
|
|
* Description: Word classifier
|
|
|
|
|
* Author: Mark Seaman, OCR Technology
|
|
|
|
|
* Created: Tue Jan 30 14:03:25 1990
|
|
|
|
|
* Modified: Fri Jul 12 16:03:06 1991 (Mark Seaman) marks@hpgrlt
|
|
|
|
|
* Language: C
|
|
|
|
|
* Package: N/A
|
|
|
|
|
* Status: Experimental (Do Not Distribute)
|
|
|
|
|
*
|
|
|
|
|
* (c) Copyright 1990, Hewlett-Packard Company.
|
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
|
** limitations under the License.
|
|
|
|
|
*
|
|
|
|
|
*********************************************************************************/
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
I N C L U D E S
|
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#ifdef __UNIX__
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#include "wordclass.h"
|
|
|
|
|
#include "fxid.h"
|
|
|
|
|
#include "tordvars.h"
|
|
|
|
|
#include "associate.h"
|
|
|
|
|
#include "render.h"
|
|
|
|
|
#include "metrics.h"
|
|
|
|
|
#include "matchtab.h"
|
|
|
|
|
//#include "tfacepp.h"
|
|
|
|
|
#include "permute.h"
|
|
|
|
|
#include "context.h"
|
|
|
|
|
#include "badwords.h"
|
|
|
|
|
#include "callcpp.h"
|
2007-07-18 09:15:07 +08:00
|
|
|
|
#include <assert.h>
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
|
extern TBLOB *newblob();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
Variables
|
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
INT16 first_pass;
|
|
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
C o n s t a n t s
|
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
|
|
#define BOLD_ON "&dB(s3B"
|
|
|
|
|
#define BOLD_OFF "&d@(s0B"
|
|
|
|
|
#define UNDERLINE_ON "&dD"
|
|
|
|
|
#define UNDERLINE_OFF "&d@"
|
|
|
|
|
LIST call_matcher( //call a matcher
|
|
|
|
|
TBLOB *ptblob, //previous
|
|
|
|
|
TBLOB *tessblob, //blob to match
|
|
|
|
|
TBLOB *ntblob, //next
|
|
|
|
|
void *, //unused parameter
|
|
|
|
|
TEXTROW * //always null anyway
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
F u n c t i o n s
|
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
/**********************************************************************
|
|
|
|
|
* classify_blob
|
|
|
|
|
*
|
|
|
|
|
* Classify the this blob if it is not already recorded in the match
|
|
|
|
|
* table. Attempt to recognize this blob as a character. The recognition
|
|
|
|
|
* rating (probability) for this blob will be stored as a part of the
|
|
|
|
|
* blob. This value will also be returned to the caller.
|
|
|
|
|
**********************************************************************/
|
|
|
|
|
CHOICES classify_blob(TBLOB *pblob,
|
|
|
|
|
TBLOB *blob,
|
|
|
|
|
TBLOB *nblob,
|
|
|
|
|
TEXTROW *row,
|
|
|
|
|
int fx,
|
|
|
|
|
const char *string,
|
|
|
|
|
C_COL color,
|
|
|
|
|
STATE *this_state,
|
|
|
|
|
STATE *best_state,
|
|
|
|
|
INT32 pass,
|
|
|
|
|
INT32 blob_index) {
|
|
|
|
|
CHOICES rating;
|
|
|
|
|
INT32 old_index;
|
|
|
|
|
|
|
|
|
|
chars_classified++; /* Global value */
|
|
|
|
|
if (blob_skip)
|
|
|
|
|
return (NIL);
|
|
|
|
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
|
|
|
if (display_all_blobs)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
display_blob(blob, color);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
#endif
|
|
|
|
|
rating = get_match (blob);
|
|
|
|
|
if (rating == NIL) {
|
|
|
|
|
if (pass) {
|
|
|
|
|
old_index = blob_index;
|
|
|
|
|
//?cast to int*
|
|
|
|
|
blob_type = compare_states (best_state, this_state, (int *) &blob_index);
|
2007-07-18 09:15:07 +08:00
|
|
|
|
// TODO(tkielbus) Remove this assert and reactivate code.
|
|
|
|
|
// Manage the blob_answer, word_answer mechanism.
|
|
|
|
|
// (convert or remove because it doesnt seem to be used anymore)
|
|
|
|
|
assert(0);
|
|
|
|
|
#if 0
|
2007-03-08 04:03:40 +08:00
|
|
|
|
blob_answer = word_answer[blob_index];
|
|
|
|
|
if (blob_answer < '!')
|
|
|
|
|
fprintf (matcher_fp,
|
|
|
|
|
"Bad compare states: best state=0x%x%x, this=0x%x%x, bits="
|
|
|
|
|
INT32FORMAT ", index=" INT32FORMAT ", outdex="
|
|
|
|
|
INT32FORMAT ", word=%s\n", best_state->part1,
|
|
|
|
|
best_state->part2, this_state->part1, this_state->part2,
|
|
|
|
|
bits_in_states, old_index, blob_index, word_answer);
|
2007-07-18 09:15:07 +08:00
|
|
|
|
#endif
|
2007-03-08 04:03:40 +08:00
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
blob_type = 0;
|
|
|
|
|
rating = /*(*blob_matchers [fx]) */ (CHOICES) call_matcher (pblob, blob,
|
|
|
|
|
nblob, NULL,
|
|
|
|
|
row);
|
2007-07-18 09:15:07 +08:00
|
|
|
|
put_match(blob, rating);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
|
|
|
if (display_ratings && string)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
print_choices(string, rating);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
if (blob_pause)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
window_wait(blob_window);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
return (rating);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
|
* write_text_files
|
|
|
|
|
*
|
|
|
|
|
* Write an answer to the output file that is the raw guess (without
|
|
|
|
|
* context) directly from the classifier.
|
|
|
|
|
**********************************************************************/
|
|
|
|
|
void write_text_files(TWERD *word,
|
|
|
|
|
char *raw_choice,
|
|
|
|
|
int same_row,
|
|
|
|
|
int good_word,
|
|
|
|
|
int firstpass) {
|
|
|
|
|
int x;
|
|
|
|
|
/* Raw output */
|
|
|
|
|
if (write_raw_output) {
|
|
|
|
|
if (same_row)
|
|
|
|
|
fprintf (rawfile, "\n");
|
|
|
|
|
if (raw_choice && strlen (raw_choice)) {
|
|
|
|
|
fprintf (rawfile, "%s ", raw_choice);
|
2007-07-18 09:15:07 +08:00
|
|
|
|
fflush(rawfile);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
/* Text file output */
|
|
|
|
|
if (write_output) {
|
|
|
|
|
if (same_row)
|
|
|
|
|
fprintf (textfile, "\n");
|
|
|
|
|
if (word->guess && strlen (word->guess)) {
|
|
|
|
|
for (x = 0; x < word->blanks; x++)
|
|
|
|
|
fprintf (textfile, " ");
|
|
|
|
|
if (!firstpass)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
fprintf(textfile, BOLD_ON);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
if (!good_word)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
fprintf(textfile, UNDERLINE_ON);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
fprintf (textfile, "%s", word->guess);
|
|
|
|
|
if (!good_word)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
fprintf(textfile, UNDERLINE_OFF);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
if (!firstpass)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
fprintf(textfile, BOLD_OFF);
|
|
|
|
|
fflush(textfile);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
/* Global counters */
|
|
|
|
|
character_count += (word->guess ? strlen (word->guess) : 0);
|
|
|
|
|
word_count++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
|
* save_answer
|
|
|
|
|
*
|
|
|
|
|
* Write an answer to the output file that is the raw guess (without
|
|
|
|
|
* context) directly from the classifier.
|
|
|
|
|
**********************************************************************/
|
|
|
|
|
void save_answer(TWERD *word,
|
|
|
|
|
TEXTROW *row,
|
|
|
|
|
A_CHOICE *best_choice,
|
|
|
|
|
A_CHOICE *raw_choice,
|
|
|
|
|
int firstpass) {
|
|
|
|
|
static TEXTROW *last_row;
|
|
|
|
|
char raw_answer[CHARS_PER_LINE];
|
|
|
|
|
int answer_already;
|
|
|
|
|
int good_answer;
|
|
|
|
|
char *string = NULL;
|
|
|
|
|
|
|
|
|
|
if (best_choice) {
|
|
|
|
|
good_answer = AcceptableResult (best_choice, raw_choice);
|
|
|
|
|
string = class_string (best_choice);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
good_answer = FALSE;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (firstpass) {
|
|
|
|
|
/* First pass */
|
|
|
|
|
if (string) {
|
|
|
|
|
/* Got answer */
|
2007-07-18 09:15:07 +08:00
|
|
|
|
add_document_word(best_choice);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
word->guess = string;
|
|
|
|
|
fix_quotes (word->guess);
|
|
|
|
|
strcpy (raw_answer, word->guess);
|
|
|
|
|
|
|
|
|
|
record_certainty (class_certainty (best_choice), 1);
|
|
|
|
|
|
|
|
|
|
if (good_answer) {
|
|
|
|
|
record_certainty (class_certainty (best_choice), 2);
|
|
|
|
|
strcat (raw_answer, " ");
|
|
|
|
|
strcat (raw_answer, class_string (raw_choice));
|
|
|
|
|
word->guess = strsave (raw_answer);
|
|
|
|
|
word->guess[strlen (string)] = 0;
|
|
|
|
|
if (string) {
|
2007-07-18 09:15:07 +08:00
|
|
|
|
strfree(string);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
class_string (best_choice) = NULL;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
/* Not good enough */
|
|
|
|
|
if (word->guess)
|
|
|
|
|
strfree (word->guess);
|
|
|
|
|
word->guess = NULL;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
word->guess = NULL;
|
|
|
|
|
raw_answer[0] = '\0';
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
/* Second pass */
|
|
|
|
|
answer_already = (word->guess != NULL);
|
|
|
|
|
if (answer_already) {
|
|
|
|
|
write_text_files (word,
|
|
|
|
|
&word->guess[strlen (word->guess) + 1],
|
|
|
|
|
(row != last_row), TRUE, TRUE);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
/* Required second pass */
|
|
|
|
|
if (string) {
|
|
|
|
|
if (!good_answer && tessedit_save_stats) {
|
|
|
|
|
SaveBadWord (string, class_certainty (best_choice));
|
|
|
|
|
}
|
|
|
|
|
record_certainty (class_certainty (best_choice), 2);
|
|
|
|
|
word->guess = class_string (best_choice);
|
|
|
|
|
fix_quotes (word->guess);
|
|
|
|
|
write_text_files (word, class_string (raw_choice),
|
|
|
|
|
(row != last_row), good_answer, FALSE);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
/* Word Display */
|
|
|
|
|
if (display_text) {
|
|
|
|
|
if (row != last_row)
|
|
|
|
|
cprintf ("\n");
|
|
|
|
|
if (word->guess && strlen (word->guess))
|
|
|
|
|
cprintf ("%s ", word->guess);
|
|
|
|
|
else
|
|
|
|
|
cprintf ("%s ", raw_answer);
|
2007-07-18 09:15:07 +08:00
|
|
|
|
fflush(stdout);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
last_row = row;
|
|
|
|
|
}
|