2007-03-08 04:03:40 +08:00
|
|
|
|
/* -*-C-*-
|
|
|
|
|
********************************************************************************
|
|
|
|
|
*
|
|
|
|
|
* File: wordclass.c (Formerly wordclass.c)
|
|
|
|
|
* Description: Word classifier
|
|
|
|
|
* Author: Mark Seaman, OCR Technology
|
|
|
|
|
* Created: Tue Jan 30 14:03:25 1990
|
|
|
|
|
* Modified: Fri Jul 12 16:03:06 1991 (Mark Seaman) marks@hpgrlt
|
|
|
|
|
* Language: C
|
|
|
|
|
* Package: N/A
|
|
|
|
|
* Status: Experimental (Do Not Distribute)
|
|
|
|
|
*
|
|
|
|
|
* (c) Copyright 1990, Hewlett-Packard Company.
|
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
|
** limitations under the License.
|
|
|
|
|
*
|
|
|
|
|
*********************************************************************************/
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
I N C L U D E S
|
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#ifdef __UNIX__
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#include "wordclass.h"
|
|
|
|
|
#include "fxid.h"
|
|
|
|
|
#include "tordvars.h"
|
|
|
|
|
#include "associate.h"
|
|
|
|
|
#include "render.h"
|
|
|
|
|
#include "metrics.h"
|
|
|
|
|
#include "matchtab.h"
|
|
|
|
|
//#include "tfacepp.h"
|
|
|
|
|
#include "permute.h"
|
|
|
|
|
#include "context.h"
|
|
|
|
|
#include "badwords.h"
|
|
|
|
|
#include "callcpp.h"
|
2007-07-18 09:15:07 +08:00
|
|
|
|
#include <assert.h>
|
2009-07-11 10:46:01 +08:00
|
|
|
|
#include "wordrec.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
|
extern TBLOB *newblob();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
Variables
|
|
|
|
|
----------------------------------------------------------------------*/
|
2008-04-22 08:35:16 +08:00
|
|
|
|
inT16 first_pass;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
C o n s t a n t s
|
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
|
|
#define BOLD_ON "&dB(s3B"
|
|
|
|
|
#define BOLD_OFF "&d@(s0B"
|
|
|
|
|
#define UNDERLINE_ON "&dD"
|
|
|
|
|
#define UNDERLINE_OFF "&d@"
|
|
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
|
F u n c t i o n s
|
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
/**********************************************************************
|
|
|
|
|
* classify_blob
|
|
|
|
|
*
|
|
|
|
|
* Classify this blob if it is not already recorded in the match
|
2009-07-11 10:46:01 +08:00
|
|
|
|
* table. Attempt to recognize this blob as a character. The recognition
|
|
|
|
|
* rating for this blob will be stored as a part of the blob. This value
|
|
|
|
|
* will also be returned to the caller.
|
2007-03-08 04:03:40 +08:00
|
|
|
|
**********************************************************************/
|
2009-07-11 10:46:01 +08:00
|
|
|
|
namespace tesseract {
|
|
|
|
|
BLOB_CHOICE_LIST *Wordrec::classify_blob(TBLOB *pblob,
|
|
|
|
|
TBLOB *blob,
|
|
|
|
|
TBLOB *nblob,
|
|
|
|
|
TEXTROW *row,
|
|
|
|
|
const char *string,
|
|
|
|
|
C_COL color) {
|
|
|
|
|
BLOB_CHOICE_LIST *choices = NULL;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
chars_classified++; /* Global value */
|
2009-07-11 10:46:01 +08:00
|
|
|
|
if (tord_blob_skip)
|
|
|
|
|
return (NULL);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
2009-07-11 10:46:01 +08:00
|
|
|
|
if (wordrec_display_all_blobs)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
display_blob(blob, color);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
#endif
|
2009-07-11 10:46:01 +08:00
|
|
|
|
choices = get_match(blob);
|
|
|
|
|
if (choices == NULL) {
|
|
|
|
|
choices = call_matcher(pblob, blob, nblob, NULL, row);
|
|
|
|
|
put_match(blob, choices);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
}
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
2009-07-11 10:46:01 +08:00
|
|
|
|
if (tord_display_ratings && string)
|
|
|
|
|
print_ratings_list(string, choices, getDict().getUnicharset());
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
2009-07-11 10:46:01 +08:00
|
|
|
|
if (wordrec_blob_pause)
|
2007-07-18 09:15:07 +08:00
|
|
|
|
window_wait(blob_window);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
#endif
|
|
|
|
|
|
2009-07-11 10:46:01 +08:00
|
|
|
|
return (choices);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
}
|
|
|
|
|
|
2009-07-11 10:46:01 +08:00
|
|
|
|
/**********************************************************************
|
|
|
|
|
* update_blob_classifications
|
|
|
|
|
*
|
|
|
|
|
* For each blob in the given word update match_table with the
|
|
|
|
|
* corresponding BLOB_CHOICES_LIST from choices.
|
|
|
|
|
* **********************************************************************/
|
|
|
|
|
void Wordrec::update_blob_classifications(
|
|
|
|
|
TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices) {
|
|
|
|
|
TBLOB *tblob = word->blobs;
|
|
|
|
|
int index = 0;
|
|
|
|
|
for (; tblob != NULL && index < choices.length();
|
|
|
|
|
tblob = tblob->next, index++) {
|
|
|
|
|
add_to_match(tblob, choices.get(index));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace tesseract;
|
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
|
* write_text_files
|
|
|
|
|
*
|
|
|
|
|
* Write an answer to the output file that is the raw guess (without
|
|
|
|
|
* context) directly from the classifier.
|
|
|
|
|
**********************************************************************/
|
|
|
|
|
void write_text_files(TWERD *word,
                      char *raw_choice,
                      int same_row,
                      int good_word,
                      int firstpass) {
  /* Raw classifier output: the guess followed by one space, flushed
     straight away.  Nothing is written for a NULL or empty guess. */
  if (tord_write_raw_output) {
    if (same_row) {
      fputs("\n", rawfile);
    }
    if (raw_choice != NULL && raw_choice[0] != '\0') {
      fprintf(rawfile, "%s ", raw_choice);
      fflush(rawfile);
    }
  }

  /* Text file output: leading blanks, then the guess wrapped in the
     BOLD / UNDERLINE marker strings where applicable. */
  if (tord_write_output) {
    if (same_row) {
      fputs("\n", textfile);
    }
    if (word->guess != NULL && word->guess[0] != '\0') {
      for (int pad = 0; pad < word->blanks; pad++) {
        fputs(" ", textfile);
      }

      /* Bold is emitted when firstpass is zero, underline when
         good_word is zero; the markers close in reverse order. */
      if (!firstpass) {
        fputs(BOLD_ON, textfile);
      }
      if (!good_word) {
        fputs(UNDERLINE_ON, textfile);
      }
      fputs(word->guess, textfile);
      if (!good_word) {
        fputs(UNDERLINE_OFF, textfile);
      }
      if (!firstpass) {
        fputs(BOLD_OFF, textfile);
      }

      fflush(textfile);
    }
  }

  /* Global counters: updated whether or not either output is enabled. */
  if (word->guess) {
    character_count += strlen(word->guess);
  }
  word_count++;
}
|