mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-18 19:39:24 +08:00
570af48b8b
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20
285 lines
8.8 KiB
C++
285 lines
8.8 KiB
C++
/* -*-C-*-
|
||
********************************************************************************
|
||
*
|
||
* File: wordclass.c (Formerly wordclass.c)
|
||
* Description: Word classifier
|
||
* Author: Mark Seaman, OCR Technology
|
||
* Created: Tue Jan 30 14:03:25 1990
|
||
* Modified: Fri Jul 12 16:03:06 1991 (Mark Seaman) marks@hpgrlt
|
||
* Language: C
|
||
* Package: N/A
|
||
* Status: Experimental (Do Not Distribute)
|
||
*
|
||
* (c) Copyright 1990, Hewlett-Packard Company.
|
||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||
** you may not use this file except in compliance with the License.
|
||
** You may obtain a copy of the License at
|
||
** http://www.apache.org/licenses/LICENSE-2.0
|
||
** Unless required by applicable law or agreed to in writing, software
|
||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
** See the License for the specific language governing permissions and
|
||
** limitations under the License.
|
||
*
|
||
*********************************************************************************/
|
||
/*----------------------------------------------------------------------
|
||
I N C L U D E S
|
||
----------------------------------------------------------------------*/
|
||
#include <stdio.h>
|
||
#ifdef __UNIX__
|
||
#include <assert.h>
|
||
#endif
|
||
|
||
#include "wordclass.h"
|
||
#include "fxid.h"
|
||
#include "tordvars.h"
|
||
#include "associate.h"
|
||
#include "render.h"
|
||
#include "metrics.h"
|
||
#include "matchtab.h"
|
||
//#include "tfacepp.h"
|
||
#include "permute.h"
|
||
#include "context.h"
|
||
#include "badwords.h"
|
||
#include "callcpp.h"
|
||
#include <assert.h>
|
||
|
||
extern TBLOB *newblob();
|
||
|
||
/*----------------------------------------------------------------------
|
||
Variables
|
||
----------------------------------------------------------------------*/
|
||
INT16 first_pass;
|
||
|
||
/*----------------------------------------------------------------------
|
||
C o n s t a n t s
|
||
----------------------------------------------------------------------*/
|
||
|
||
#define BOLD_ON "&dB(s3B"
|
||
#define BOLD_OFF "&d@(s0B"
|
||
#define UNDERLINE_ON "&dD"
|
||
#define UNDERLINE_OFF "&d@"
|
||
LIST call_matcher( //call a matcher
|
||
TBLOB *ptblob, //previous
|
||
TBLOB *tessblob, //blob to match
|
||
TBLOB *ntblob, //next
|
||
void *, //unused parameter
|
||
TEXTROW * //always null anyway
|
||
);
|
||
|
||
/*----------------------------------------------------------------------
|
||
F u n c t i o n s
|
||
----------------------------------------------------------------------*/
|
||
/**********************************************************************
|
||
* classify_blob
|
||
*
|
||
* Classify the this blob if it is not already recorded in the match
|
||
* table. Attempt to recognize this blob as a character. The recognition
|
||
* rating (probability) for this blob will be stored as a part of the
|
||
* blob. This value will also be returned to the caller.
|
||
**********************************************************************/
|
||
CHOICES classify_blob(TBLOB *pblob,
|
||
TBLOB *blob,
|
||
TBLOB *nblob,
|
||
TEXTROW *row,
|
||
int fx,
|
||
const char *string,
|
||
C_COL color,
|
||
STATE *this_state,
|
||
STATE *best_state,
|
||
INT32 pass,
|
||
INT32 blob_index) {
|
||
CHOICES rating;
|
||
INT32 old_index;
|
||
|
||
chars_classified++; /* Global value */
|
||
if (blob_skip)
|
||
return (NIL);
|
||
|
||
#ifndef GRAPHICS_DISABLED
|
||
if (display_all_blobs)
|
||
display_blob(blob, color);
|
||
#endif
|
||
rating = get_match (blob);
|
||
if (rating == NIL) {
|
||
if (pass) {
|
||
old_index = blob_index;
|
||
//?cast to int*
|
||
blob_type = compare_states (best_state, this_state, (int *) &blob_index);
|
||
// TODO(tkielbus) Remove this assert and reactivate code.
|
||
// Manage the blob_answer, word_answer mechanism.
|
||
// (convert or remove because it doesnt seem to be used anymore)
|
||
assert(0);
|
||
#if 0
|
||
blob_answer = word_answer[blob_index];
|
||
if (blob_answer < '!')
|
||
fprintf (matcher_fp,
|
||
"Bad compare states: best state=0x%x%x, this=0x%x%x, bits="
|
||
INT32FORMAT ", index=" INT32FORMAT ", outdex="
|
||
INT32FORMAT ", word=%s\n", best_state->part1,
|
||
best_state->part2, this_state->part1, this_state->part2,
|
||
bits_in_states, old_index, blob_index, word_answer);
|
||
#endif
|
||
}
|
||
else
|
||
blob_type = 0;
|
||
rating = /*(*blob_matchers [fx]) */ (CHOICES) call_matcher (pblob, blob,
|
||
nblob, NULL,
|
||
row);
|
||
put_match(blob, rating);
|
||
}
|
||
|
||
#ifndef GRAPHICS_DISABLED
|
||
if (display_ratings && string)
|
||
print_choices(string, rating);
|
||
|
||
if (blob_pause)
|
||
window_wait(blob_window);
|
||
#endif
|
||
|
||
return (rating);
|
||
}
|
||
|
||
|
||
/**********************************************************************
|
||
* write_text_files
|
||
*
|
||
* Write an answer to the output file that is the raw guess (without
|
||
* context) directly from the classifier.
|
||
**********************************************************************/
|
||
void write_text_files(TWERD *word,
|
||
char *raw_choice,
|
||
int same_row,
|
||
int good_word,
|
||
int firstpass) {
|
||
int x;
|
||
/* Raw output */
|
||
if (write_raw_output) {
|
||
if (same_row)
|
||
fprintf (rawfile, "\n");
|
||
if (raw_choice && strlen (raw_choice)) {
|
||
fprintf (rawfile, "%s ", raw_choice);
|
||
fflush(rawfile);
|
||
}
|
||
}
|
||
/* Text file output */
|
||
if (write_output) {
|
||
if (same_row)
|
||
fprintf (textfile, "\n");
|
||
if (word->guess && strlen (word->guess)) {
|
||
for (x = 0; x < word->blanks; x++)
|
||
fprintf (textfile, " ");
|
||
if (!firstpass)
|
||
fprintf(textfile, BOLD_ON);
|
||
if (!good_word)
|
||
fprintf(textfile, UNDERLINE_ON);
|
||
fprintf (textfile, "%s", word->guess);
|
||
if (!good_word)
|
||
fprintf(textfile, UNDERLINE_OFF);
|
||
if (!firstpass)
|
||
fprintf(textfile, BOLD_OFF);
|
||
fflush(textfile);
|
||
}
|
||
}
|
||
/* Global counters */
|
||
character_count += (word->guess ? strlen (word->guess) : 0);
|
||
word_count++;
|
||
}
|
||
|
||
|
||
/**********************************************************************
|
||
* save_answer
|
||
*
|
||
* Write an answer to the output file that is the raw guess (without
|
||
* context) directly from the classifier.
|
||
**********************************************************************/
|
||
void save_answer(TWERD *word,
|
||
TEXTROW *row,
|
||
A_CHOICE *best_choice,
|
||
A_CHOICE *raw_choice,
|
||
int firstpass) {
|
||
static TEXTROW *last_row;
|
||
char raw_answer[CHARS_PER_LINE];
|
||
int answer_already;
|
||
int good_answer;
|
||
char *string = NULL;
|
||
|
||
if (best_choice) {
|
||
good_answer = AcceptableResult (best_choice, raw_choice);
|
||
string = class_string (best_choice);
|
||
}
|
||
else {
|
||
good_answer = FALSE;
|
||
}
|
||
|
||
if (firstpass) {
|
||
/* First pass */
|
||
if (string) {
|
||
/* Got answer */
|
||
add_document_word(best_choice);
|
||
|
||
word->guess = string;
|
||
fix_quotes (word->guess);
|
||
strcpy (raw_answer, word->guess);
|
||
|
||
record_certainty (class_certainty (best_choice), 1);
|
||
|
||
if (good_answer) {
|
||
record_certainty (class_certainty (best_choice), 2);
|
||
strcat (raw_answer, " ");
|
||
strcat (raw_answer, class_string (raw_choice));
|
||
word->guess = strsave (raw_answer);
|
||
word->guess[strlen (string)] = 0;
|
||
if (string) {
|
||
strfree(string);
|
||
class_string (best_choice) = NULL;
|
||
}
|
||
}
|
||
else {
|
||
/* Not good enough */
|
||
if (word->guess)
|
||
strfree (word->guess);
|
||
word->guess = NULL;
|
||
}
|
||
}
|
||
else {
|
||
word->guess = NULL;
|
||
raw_answer[0] = '\0';
|
||
}
|
||
}
|
||
else {
|
||
/* Second pass */
|
||
answer_already = (word->guess != NULL);
|
||
if (answer_already) {
|
||
write_text_files (word,
|
||
&word->guess[strlen (word->guess) + 1],
|
||
(row != last_row), TRUE, TRUE);
|
||
}
|
||
else {
|
||
/* Required second pass */
|
||
if (string) {
|
||
if (!good_answer && tessedit_save_stats) {
|
||
SaveBadWord (string, class_certainty (best_choice));
|
||
}
|
||
record_certainty (class_certainty (best_choice), 2);
|
||
word->guess = class_string (best_choice);
|
||
fix_quotes (word->guess);
|
||
write_text_files (word, class_string (raw_choice),
|
||
(row != last_row), good_answer, FALSE);
|
||
}
|
||
}
|
||
}
|
||
/* Word Display */
|
||
if (display_text) {
|
||
if (row != last_row)
|
||
cprintf ("\n");
|
||
if (word->guess && strlen (word->guess))
|
||
cprintf ("%s ", word->guess);
|
||
else
|
||
cprintf ("%s ", raw_answer);
|
||
fflush(stdout);
|
||
}
|
||
|
||
last_row = row;
|
||
}
|