mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
7e8bd73aea
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@435 d0cd1f9f-072b-0410-8dd7-cf729c803f20
377 lines
16 KiB
C++
377 lines
16 KiB
C++
/**********************************************************************
|
|
* File: tfacepp.cpp (Formerly tface++.c)
|
|
* Description: C++ side of the C/C++ Tess/Editor interface.
|
|
* Author: Ray Smith
|
|
* Created: Thu Apr 23 15:39:23 BST 1992
|
|
*
|
|
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#ifdef _MSC_VER
|
|
#pragma warning(disable:4244) // Conversion warnings
|
|
#pragma warning(disable:4305) // int/float warnings
|
|
#pragma warning(disable:4800) // int/bool warnings
|
|
#endif
|
|
|
|
#include "mfcpch.h"
|
|
#ifdef __UNIX__
|
|
#include <assert.h>
|
|
#endif
|
|
#include "errcode.h"
|
|
#include "ratngs.h"
|
|
#include "reject.h"
|
|
#include "werd.h"
|
|
#include "tfacep.h"
|
|
#include "tstruct.h"
|
|
#include "tfacepp.h"
|
|
#include "tessvars.h"
|
|
#include "globals.h"
|
|
#include "reject.h"
|
|
#include "tesseractclass.h"
|
|
|
|
#define EXTERN
|
|
|
|
EXTERN BOOL_VAR (tessedit_override_permuter, TRUE, "According to dict_word");
|
|
|
|
|
|
#define MAX_UNDIVIDED_LENGTH 24
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
* recog_word
|
|
*
|
|
* Convert the word to tess form and pass it to the tess segmenter.
|
|
* Convert the output back to editor form.
|
|
**********************************************************************/
|
|
namespace tesseract {
|
|
WERD_CHOICE *Tesseract::recog_word( //recog one owrd
|
|
WERD *word, //word to do
|
|
DENORM *denorm, //de-normaliser
|
|
//matcher function
|
|
POLY_MATCHER matcher,
|
|
POLY_TESTER tester, //tester function
|
|
POLY_TESTER trainer, //trainer function
|
|
BOOL8 testing, //true if answer driven
|
|
//raw result
|
|
WERD_CHOICE *&raw_choice,
|
|
//list of blob lists
|
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
|
WERD *&outword //bln word output
|
|
) {
|
|
WERD_CHOICE *word_choice;
|
|
uinT8 perm_type;
|
|
uinT8 real_dict_perm_type;
|
|
|
|
if (word->blob_list ()->empty ()) {
|
|
word_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
|
|
TOP_CHOICE_PERM, unicharset);
|
|
raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
|
|
TOP_CHOICE_PERM, unicharset);
|
|
outword = word->poly_copy (denorm->row ()->x_height ());
|
|
}
|
|
else
|
|
word_choice = recog_word_recursive (word, denorm, matcher, tester,
|
|
trainer, testing, raw_choice,
|
|
blob_choices, outword);
|
|
if ((word_choice->length() != outword->blob_list()->length()) ||
|
|
(word_choice->length() != blob_choices->length())) {
|
|
tprintf
|
|
("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
|
word_choice->debug_string(unicharset).string(),
|
|
word_choice->length(), outword->blob_list()->length(),
|
|
blob_choices->length());
|
|
}
|
|
ASSERT_HOST(word_choice->length() == outword->blob_list()->length());
|
|
ASSERT_HOST(word_choice->length() == blob_choices->length());
|
|
|
|
/* Copy any reject blobs into the outword */
|
|
outword->rej_blob_list()->deep_copy(word->rej_blob_list(), &PBLOB::deep_copy);
|
|
|
|
if (tessedit_override_permuter) {
|
|
/* Override the permuter type if a straight dictionary check disagrees. */
|
|
perm_type = word_choice->permuter();
|
|
if ((perm_type != SYSTEM_DAWG_PERM) &&
|
|
(perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
|
|
real_dict_perm_type = dict_word(*word_choice);
|
|
if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
|
|
(real_dict_perm_type == FREQ_DAWG_PERM) ||
|
|
(real_dict_perm_type == USER_DAWG_PERM)) &&
|
|
(alpha_count(word_choice->unichar_string().string(),
|
|
word_choice->unichar_lengths().string()) > 0)) {
|
|
word_choice->set_permuter (real_dict_perm_type); // use dict perm
|
|
}
|
|
}
|
|
if (tessedit_rejection_debug && perm_type != word_choice->permuter ()) {
|
|
tprintf ("Permuter Type Flipped from %d to %d\n",
|
|
perm_type, word_choice->permuter ());
|
|
}
|
|
}
|
|
assert ((word_choice == NULL) == (raw_choice == NULL));
|
|
return word_choice;
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* recog_word_recursive
|
|
*
|
|
* Convert the word to tess form and pass it to the tess segmenter.
|
|
* Convert the output back to editor form.
|
|
**********************************************************************/
|
|
WERD_CHOICE *
|
|
Tesseract::recog_word_recursive(
|
|
WERD *word, // word to do
|
|
DENORM *denorm, // de-normaliser
|
|
POLY_MATCHER matcher, // matcher function
|
|
POLY_TESTER tester, // tester function
|
|
POLY_TESTER trainer, // trainer function
|
|
BOOL8 testing, // true if answer driven
|
|
WERD_CHOICE *&raw_choice, // raw result
|
|
BLOB_CHOICE_LIST_CLIST *blob_choices, // list of blob lists
|
|
WERD *&outword // bln word output
|
|
) {
|
|
inT32 initial_blob_choice_len;
|
|
inT32 word_length; // no of blobs
|
|
STRING word_string; // converted from tess
|
|
STRING word_string_lengths;
|
|
BLOB_CHOICE_LIST_VECTOR *tess_ratings; // tess results
|
|
TWERD *tessword; // tess format
|
|
BLOB_CHOICE_LIST_C_IT blob_choices_it; // iterator
|
|
|
|
tess_matcher = matcher; // install matcher
|
|
tess_tester = testing ? tester : NULL;
|
|
tess_trainer = testing ? trainer : NULL;
|
|
tess_denorm = denorm;
|
|
tess_word = word;
|
|
// blob_matchers[1]=call_matcher;
|
|
if (word->blob_list ()->length () > MAX_UNDIVIDED_LENGTH) {
|
|
return split_and_recog_word (word, denorm, matcher, tester, trainer,
|
|
testing, raw_choice, blob_choices,
|
|
outword);
|
|
} else {
|
|
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
|
|
WERD_CHOICE *best_choice = new WERD_CHOICE();
|
|
raw_choice = new WERD_CHOICE();
|
|
initial_blob_choice_len = blob_choices->length();
|
|
tessword = make_tess_word (word, NULL);
|
|
tess_ratings = cc_recog(tessword, best_choice, raw_choice,
|
|
testing && tester != NULL,
|
|
testing && trainer != NULL,
|
|
word->flag(W_EOL));
|
|
|
|
outword = make_ed_word (tessword, word); // convert word
|
|
if (outword == NULL) {
|
|
outword = word->poly_copy (denorm->row ()->x_height ());
|
|
}
|
|
delete_word(tessword); // get rid of it
|
|
word_length = outword->blob_list()->length(); // no of blobs
|
|
|
|
// Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
|
|
blob_choices_it.set_to_list(blob_choices);
|
|
for (int i = 0; i < tess_ratings->length(); ++i) {
|
|
blob_choices_it.add_to_end(tess_ratings->get(i));
|
|
}
|
|
delete tess_ratings;
|
|
|
|
// Pad raw_choice with spaces if needed.
|
|
if (raw_choice->length() < word_length) {
|
|
while (raw_choice->length() < word_length) {
|
|
raw_choice->append_unichar_id(space_id, 1, 0.0,
|
|
raw_choice->certainty());
|
|
}
|
|
raw_choice->populate_unichars(unicharset);
|
|
}
|
|
|
|
// Do sanity checks and minor fixes on best_choice.
|
|
if (best_choice->length() > word_length) {
|
|
tprintf("recog_word: Discarded long string \"%s\""
|
|
" (%d characters vs %d blobs)\n",
|
|
best_choice->unichar_string().string (),
|
|
best_choice->length(), word_length);
|
|
best_choice->make_bad(); // should never happen
|
|
tprintf("Word is at (%g,%g)\n",
|
|
denorm->origin(),
|
|
denorm->y(word->bounding_box().bottom(), 0.0));
|
|
}
|
|
if (blob_choices->length() - initial_blob_choice_len != word_length) {
|
|
best_choice->make_bad(); // force rejection
|
|
tprintf ("recog_word: Choices list len:%d; blob lists len:%d\n",
|
|
blob_choices->length(), word_length);
|
|
blob_choices_it.set_to_list(blob_choices); // list of lists
|
|
while (blob_choices->length() - initial_blob_choice_len < word_length) {
|
|
blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
|
|
tprintf("recog_word: Added dummy choice list\n");
|
|
}
|
|
while (blob_choices->length() - initial_blob_choice_len > word_length) {
|
|
blob_choices_it.move_to_last(); // should never happen
|
|
delete blob_choices_it.extract();
|
|
tprintf("recog_word: Deleted choice list\n");
|
|
}
|
|
}
|
|
if (best_choice->length() < word_length) {
|
|
while (best_choice->length() < word_length) {
|
|
best_choice->append_unichar_id(space_id, 1, 0.0,
|
|
best_choice->certainty());
|
|
}
|
|
best_choice->populate_unichars(unicharset);
|
|
}
|
|
|
|
return best_choice;
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* split_and_recog_word
|
|
*
|
|
* Convert the word to tess form and pass it to the tess segmenter.
|
|
* Convert the output back to editor form.
|
|
**********************************************************************/
|
|
|
|
WERD_CHOICE *
|
|
Tesseract::split_and_recog_word( //recog one owrd
|
|
WERD *word, //word to do
|
|
DENORM *denorm, //de-normaliser
|
|
POLY_MATCHER matcher, //matcher function
|
|
POLY_TESTER tester, //tester function
|
|
POLY_TESTER trainer, //trainer function
|
|
BOOL8 testing, //true if answer driven
|
|
//raw result
|
|
WERD_CHOICE *&raw_choice,
|
|
//list of blob lists
|
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
|
WERD *&outword //bln word output
|
|
) {
|
|
// inT32 outword1_len;
|
|
// inT32 outword2_len;
|
|
WERD *first_word; //poly copy of word
|
|
WERD *second_word; //fabricated word
|
|
WERD *outword2; //2nd output word
|
|
PBLOB *blob;
|
|
WERD_CHOICE *result; //return value
|
|
WERD_CHOICE *result2; //output of 2nd word
|
|
WERD_CHOICE *raw_choice2; //raw version of 2nd
|
|
float gap; //blob gap
|
|
float bestgap; //biggest gap
|
|
PBLOB_LIST new_blobs; //list of gathered blobs
|
|
PBLOB_IT blob_it;
|
|
//iterator
|
|
PBLOB_IT new_blob_it = &new_blobs;
|
|
|
|
first_word = word->poly_copy (denorm->row ()->x_height ());
|
|
blob_it.set_to_list (first_word->blob_list ());
|
|
bestgap = (float) -MAX_INT32;
|
|
while (!blob_it.at_last ()) {
|
|
blob = blob_it.data ();
|
|
//gap to next
|
|
gap = (float) blob_it.data_relative(1)->bounding_box().left() -
|
|
blob->bounding_box().right();
|
|
blob_it.forward ();
|
|
if (gap > bestgap) {
|
|
bestgap = gap; //find biggest
|
|
new_blob_it = blob_it; //save position
|
|
}
|
|
}
|
|
//take 2nd half
|
|
new_blobs.assign_to_sublist (&new_blob_it, &blob_it);
|
|
//make it a word
|
|
second_word = new WERD (&new_blobs, 1, NULL);
|
|
ASSERT_HOST (word->blob_list ()->length () ==
|
|
first_word->blob_list ()->length () +
|
|
second_word->blob_list ()->length ());
|
|
|
|
result = recog_word_recursive (first_word, denorm, matcher,
|
|
tester, trainer, testing, raw_choice,
|
|
blob_choices, outword);
|
|
delete first_word; //done that one
|
|
result2 = recog_word_recursive (second_word, denorm, matcher,
|
|
tester, trainer, testing, raw_choice2,
|
|
blob_choices, outword2);
|
|
delete second_word; //done that too
|
|
*result += *result2; //combine ratings
|
|
delete result2;
|
|
*raw_choice += *raw_choice2;
|
|
delete raw_choice2; //finished with it
|
|
// outword1_len= outword->blob_list()->length();
|
|
// outword2_len= outword2->blob_list()->length();
|
|
outword->join_on (outword2); //join words
|
|
delete outword2;
|
|
// if ( outword->blob_list()->length() != outword1_len + outword2_len )
|
|
// tprintf( "Split&Recog: part1len=%d; part2len=%d; combinedlen=%d\n",
|
|
// outword1_len, outword2_len, outword->blob_list()->length() );
|
|
// ASSERT_HOST( outword->blob_list()->length() == outword1_len + outword2_len );
|
|
return result;
|
|
}
|
|
|
|
} // namespace tesseract
|
|
|
|
/**********************************************************************
|
|
* call_tester
|
|
*
|
|
* Called from Tess with a blob in tess form.
|
|
* Convert the blob to editor form.
|
|
* Call the tester setup by the segmenter in tess_tester.
|
|
**********************************************************************/
|
|
#if 0 // dead code
|
|
void call_tester( //call a tester
|
|
const STRING& filename,
|
|
TBLOB *tessblob, //blob to test
|
|
BOOL8 correct_blob, //true if good
|
|
char *text, //source text
|
|
inT32 count, //chars in text
|
|
LIST result //output of matcher
|
|
) {
|
|
PBLOB *blob; //converted blob
|
|
BLOB_CHOICE_LIST ratings; //matcher result
|
|
|
|
blob = make_ed_blob (tessblob);//convert blob
|
|
if (blob == NULL)
|
|
return;
|
|
//make it right type
|
|
convert_choice_list(result, ratings);
|
|
if (tess_tester != NULL)
|
|
(*tess_tester) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
|
|
delete blob; //don't need that now
|
|
}
|
|
#endif
|
|
|
|
/**********************************************************************
|
|
* call_train_tester
|
|
*
|
|
* Called from Tess with a blob in tess form.
|
|
* Convert the blob to editor form.
|
|
* Call the trainer setup by the segmenter in tess_trainer.
|
|
**********************************************************************/
|
|
#if 0 // dead code
|
|
void call_train_tester( //call a tester
|
|
const STRING& filename,
|
|
TBLOB *tessblob, //blob to test
|
|
BOOL8 correct_blob, //true if good
|
|
char *text, //source text
|
|
inT32 count, //chars in text
|
|
LIST result //output of matcher
|
|
) {
|
|
PBLOB *blob; //converted blob
|
|
BLOB_CHOICE_LIST ratings; //matcher result
|
|
|
|
blob = make_ed_blob (tessblob);//convert blob
|
|
if (blob == NULL)
|
|
return;
|
|
//make it right type
|
|
convert_choice_list(result, ratings);
|
|
if (tess_trainer != NULL)
|
|
(*tess_trainer) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
|
|
delete blob; //don't need that now
|
|
}
|
|
#endif
|