2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* File: tessbox.cpp (Formerly tessbox.c)
|
|
|
|
* Description: Black boxed Tess for developing a resaljet.
|
|
|
|
* Author: Ray Smith
|
|
|
|
* Created: Thu Apr 23 11:03:36 BST 1992
|
|
|
|
*
|
|
|
|
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
2010-05-26 18:22:27 +08:00
|
|
|
#ifdef _MSC_VER
|
|
|
|
#pragma warning(disable:4244) // Conversion warnings
|
|
|
|
#endif
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "mfcpch.h"
|
|
|
|
#include "tfacep.h"
|
|
|
|
#include "tfacepp.h"
|
|
|
|
#include "tessbox.h"
|
|
|
|
#include "mfoutline.h"
|
2009-07-11 10:03:51 +08:00
|
|
|
#include "tesseractclass.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
#define EXTERN
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
 * tess_segment_pass1
 *
 * Segment a word using the pass1 conditions of the tess segmenter.
 * @param word bln word to do
 * @param denorm de-normaliser
 * @param matcher matcher function
 * @param raw_choice raw result
 * @param blob_choices list of blob lists
 * @param outword bln word output
 */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
namespace tesseract {
|
2010-07-28 08:38:09 +08:00
|
|
|
WERD_CHOICE *Tesseract::tess_segment_pass1(WERD *word,
                                           DENORM *denorm,
                                           POLY_MATCHER matcher,
                                           WERD_CHOICE *&raw_choice,
                                           BLOB_CHOICE_LIST_CLIST *blob_choices,
                                           WERD *&outword) {
  // Previous values of the two settings that are switched off for words
  // flagged W_DONT_CHOP; only meaningful when that flag is set.
  int prev_assoc = 0;
  int prev_chop = 0;

  if (word->flag(W_DONT_CHOP)) {
    // Remember the current settings, then disable association and chopping
    // for the duration of the recog_word call.
    prev_assoc = wordrec_enable_assoc;
    prev_chop = chop_enable;
    wordrec_enable_assoc.set_value(0);
    chop_enable.set_value(0);
    if (word->flag(W_REP_CHAR)) {
      permute_only_top = 1;
    }
  }

  set_pass1();
  // tprintf("pass1 chop on=%d, seg=%d, onlytop=%d",chop_enable,enable_assoc,permute_only_top);
  WERD_CHOICE *const best = recog_word(word, denorm, matcher, NULL, NULL,
                                       FALSE, raw_choice, blob_choices,
                                       outword);

  if (!word->flag(W_DONT_CHOP)) {
    return best;  // nothing was overridden, so nothing to restore
  }

  // Put the overridden settings back.
  wordrec_enable_assoc.set_value(prev_assoc);
  chop_enable.set_value(prev_chop);
  // NOTE(review): permute_only_top is reset to 0 here rather than restored
  // to whatever it held before the call - confirm that is intended.
  permute_only_top = 0;
  return best;
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_segment_pass2
|
|
|
|
*
|
|
|
|
* Segment a word using the pass2 conditions of the tess segmenter.
|
2010-07-28 08:38:09 +08:00
|
|
|
* @param word bln word to do
|
|
|
|
* @param denorm de-normaliser
|
|
|
|
* @param matcher matcher function
|
|
|
|
* @param raw_choice raw result
|
|
|
|
* @param blob_choices list of blob lists
|
|
|
|
* @param outword bln word output
|
|
|
|
*/
|
|
|
|
|
|
|
|
WERD_CHOICE *Tesseract::tess_segment_pass2(WERD *word,
|
|
|
|
DENORM *denorm,
|
2009-07-11 10:03:51 +08:00
|
|
|
POLY_MATCHER matcher,
|
|
|
|
WERD_CHOICE *&raw_choice,
|
|
|
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
2010-07-28 08:38:09 +08:00
|
|
|
WERD *&outword) {
|
2007-03-08 04:03:40 +08:00
|
|
|
WERD_CHOICE *result; //return value
|
|
|
|
int saved_enable_assoc = 0;
|
|
|
|
int saved_chop_enable = 0;
|
|
|
|
|
|
|
|
if (word->flag (W_DONT_CHOP)) {
|
2009-07-11 10:03:51 +08:00
|
|
|
saved_enable_assoc = wordrec_enable_assoc;
|
2007-03-08 04:03:40 +08:00
|
|
|
saved_chop_enable = chop_enable;
|
2009-07-11 10:03:51 +08:00
|
|
|
wordrec_enable_assoc.set_value(0);
|
|
|
|
chop_enable.set_value(0);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (word->flag (W_REP_CHAR))
|
|
|
|
permute_only_top = 1;
|
|
|
|
}
|
|
|
|
set_pass2();
|
|
|
|
result = recog_word (word, denorm, matcher, NULL, NULL, FALSE,
|
|
|
|
raw_choice, blob_choices, outword);
|
|
|
|
if (word->flag (W_DONT_CHOP)) {
|
2009-07-11 10:03:51 +08:00
|
|
|
wordrec_enable_assoc.set_value(saved_enable_assoc);
|
|
|
|
chop_enable.set_value(saved_chop_enable);
|
2007-03-08 04:03:40 +08:00
|
|
|
permute_only_top = 0;
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* correct_segment_pass2
|
|
|
|
*
|
|
|
|
* Segment a word correctly using the pass2 conditions of the tess segmenter.
|
|
|
|
* Then call the tester with all the correctly segmented blobs.
|
|
|
|
* If the correct segmentation cannot be found, the tester is called
|
|
|
|
* with the segmentation found by tess and all the correct flags set to
|
|
|
|
* false and all strings are NULL.
|
2010-07-28 08:38:09 +08:00
|
|
|
* @param word bln word to do
|
|
|
|
* @param denorm de-normaliser
|
|
|
|
* @param matcher matcher function
|
|
|
|
* @param tester tester function
|
|
|
|
* @param raw_choice raw result
|
|
|
|
* @param blob_choices list of blob lists
|
|
|
|
* @param outword bln word output
|
|
|
|
*/
|
|
|
|
|
|
|
|
WERD_CHOICE *Tesseract::correct_segment_pass2(WERD *word,
|
2009-07-11 10:03:51 +08:00
|
|
|
DENORM *denorm,
|
|
|
|
POLY_MATCHER matcher,
|
|
|
|
POLY_TESTER tester,
|
|
|
|
WERD_CHOICE *&raw_choice,
|
|
|
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
2010-07-28 08:38:09 +08:00
|
|
|
WERD *&outword) {
|
2007-03-08 04:03:40 +08:00
|
|
|
set_pass2();
|
|
|
|
return recog_word (word, denorm, matcher, NULL, tester, TRUE,
|
|
|
|
raw_choice, blob_choices, outword);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* test_segment_pass2
|
|
|
|
*
|
|
|
|
* Segment a word correctly using the pass2 conditions of the tess segmenter.
|
|
|
|
* Then call the tester on all words used by tess in its search.
|
|
|
|
* Do this only on words where the correct segmentation could be found.
|
2010-07-28 08:38:09 +08:00
|
|
|
* @param word bln word to do
|
|
|
|
* @param denorm de-normaliser
|
|
|
|
* @param matcher matcher function
|
|
|
|
* @param tester tester function
|
|
|
|
* @param raw_choice raw result
|
|
|
|
* @param blob_choices list of blob lists
|
|
|
|
* @param outword bln word output
|
|
|
|
*/
|
|
|
|
WERD_CHOICE *Tesseract::test_segment_pass2(WERD *word,
|
2009-07-11 10:03:51 +08:00
|
|
|
DENORM *denorm,
|
|
|
|
POLY_MATCHER matcher,
|
|
|
|
POLY_TESTER tester,
|
|
|
|
WERD_CHOICE *&raw_choice,
|
|
|
|
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
2010-07-28 08:38:09 +08:00
|
|
|
WERD *&outword) {
|
2007-03-08 04:03:40 +08:00
|
|
|
set_pass2();
|
|
|
|
return recog_word (word, denorm, matcher, tester, NULL, TRUE,
|
|
|
|
raw_choice, blob_choices, outword);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_acceptable_word
|
|
|
|
*
|
|
|
|
* Return true if the word is regarded as "good enough".
|
2010-07-28 08:38:09 +08:00
|
|
|
*/
|
2009-07-11 10:03:51 +08:00
|
|
|
BOOL8 Tesseract::tess_acceptable_word(
|
|
|
|
WERD_CHOICE *word_choice, // after context
|
|
|
|
WERD_CHOICE *raw_choice) { // before context
|
|
|
|
return getDict().AcceptableResult(*word_choice, *raw_choice);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_adaptable_word
|
|
|
|
*
|
|
|
|
* Return true if the word is regarded as "good enough".
|
2010-07-28 08:38:09 +08:00
|
|
|
*/
|
|
|
|
BOOL8 Tesseract::tess_adaptable_word( //< test adaptability
|
|
|
|
WERD *word, //< word to test
|
|
|
|
WERD_CHOICE *best_choice, //< after context
|
|
|
|
WERD_CHOICE *raw_choice //< before context
|
2009-07-11 10:03:51 +08:00
|
|
|
) {
|
|
|
|
TWERD *tessword = make_tess_word(word, NULL);
|
|
|
|
int result = (tessword && best_choice && raw_choice &&
|
|
|
|
AdaptableWord(tessword, *best_choice, *raw_choice));
|
2007-03-08 04:03:40 +08:00
|
|
|
delete_word(tessword);
|
|
|
|
return result != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_cn_matcher
|
|
|
|
*
|
|
|
|
* Match a blob using the Tess Char Normalized (non-adaptive) matcher
|
|
|
|
* only.
|
2010-07-28 08:38:09 +08:00
|
|
|
*/
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
void Tesseract::tess_cn_matcher( //call tess
|
|
|
|
PBLOB *pblob, //previous blob
|
|
|
|
PBLOB *blob, //blob to match
|
|
|
|
PBLOB *nblob, //next blob
|
|
|
|
WERD *word, //word it came from
|
|
|
|
DENORM *denorm, //de-normaliser
|
|
|
|
BLOB_CHOICE_LIST *ratings, //list of results
|
|
|
|
CLASS_PRUNER_RESULTS cpresults // may be null.
|
2007-03-08 04:03:40 +08:00
|
|
|
) {
|
|
|
|
TBLOB *tessblob; //converted blob
|
|
|
|
TEXTROW tessrow; //dummy row
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
tess_cn_matching.set_value(true); //turn it on
|
|
|
|
tess_bn_matching.set_value(false);
|
2007-03-08 04:03:40 +08:00
|
|
|
//convert blob
|
2009-07-11 10:03:51 +08:00
|
|
|
tessblob = make_rotated_tess_blob(denorm, blob, true);
|
2007-03-08 04:03:40 +08:00
|
|
|
//make dummy row
|
|
|
|
make_tess_row(denorm, &tessrow);
|
|
|
|
//classify
|
2009-07-11 10:03:51 +08:00
|
|
|
AdaptiveClassifier(tessblob, NULL, &tessrow, ratings, cpresults);
|
2007-03-08 04:03:40 +08:00
|
|
|
free_blob(tessblob);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_bn_matcher
|
|
|
|
*
|
|
|
|
* Match a blob using the Tess Baseline Normalized (adaptive) matcher
|
|
|
|
* only.
|
2010-07-28 08:38:09 +08:00
|
|
|
*/
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
void Tesseract::tess_bn_matcher( //call tess
|
|
|
|
PBLOB *pblob, //previous blob
|
|
|
|
PBLOB *blob, //blob to match
|
|
|
|
PBLOB *nblob, //next blob
|
|
|
|
WERD *word, //word it came from
|
|
|
|
DENORM *denorm, //de-normaliser
|
|
|
|
BLOB_CHOICE_LIST *ratings //list of results
|
|
|
|
) {
|
2007-03-08 04:03:40 +08:00
|
|
|
TBLOB *tessblob; //converted blob
|
|
|
|
TEXTROW tessrow; //dummy row
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
tess_bn_matching.set_value(true); //turn it on
|
|
|
|
tess_cn_matching.set_value(false);
|
2007-03-08 04:03:40 +08:00
|
|
|
//convert blob
|
2009-07-11 10:03:51 +08:00
|
|
|
tessblob = make_rotated_tess_blob(denorm, blob, true);
|
2007-03-08 04:03:40 +08:00
|
|
|
//make dummy row
|
|
|
|
make_tess_row(denorm, &tessrow);
|
|
|
|
//classify
|
2009-07-11 10:03:51 +08:00
|
|
|
AdaptiveClassifier(tessblob, NULL, &tessrow, ratings, NULL);
|
2007-03-08 04:03:40 +08:00
|
|
|
free_blob(tessblob);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_default_matcher
|
|
|
|
*
|
|
|
|
* Match a blob using the default functionality of the Tess matcher.
|
2010-07-28 08:38:09 +08:00
|
|
|
*/
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
void Tesseract::tess_default_matcher( //call tess
|
|
|
|
PBLOB *pblob, //previous blob
|
|
|
|
PBLOB *blob, //blob to match
|
|
|
|
PBLOB *nblob, //next blob
|
|
|
|
WERD *word, //word it came from
|
|
|
|
DENORM *denorm, //de-normaliser
|
|
|
|
//list of results
|
|
|
|
BLOB_CHOICE_LIST *ratings,
|
|
|
|
const char* script
|
|
|
|
) {
|
|
|
|
assert(ratings != NULL);
|
2007-03-08 04:03:40 +08:00
|
|
|
TBLOB *tessblob; //converted blob
|
|
|
|
TEXTROW tessrow; //dummy row
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
tess_bn_matching.set_value(false); //turn it off
|
|
|
|
tess_cn_matching.set_value(false);
|
2007-03-08 04:03:40 +08:00
|
|
|
//convert blob
|
2009-07-11 10:03:51 +08:00
|
|
|
tessblob = make_rotated_tess_blob(denorm, blob, true);
|
2007-03-08 04:03:40 +08:00
|
|
|
//make dummy row
|
|
|
|
make_tess_row(denorm, &tessrow);
|
|
|
|
//classify
|
2009-07-11 10:03:51 +08:00
|
|
|
AdaptiveClassifier (tessblob, NULL, &tessrow, ratings, NULL);
|
2007-03-08 04:03:40 +08:00
|
|
|
free_blob(tessblob);
|
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_training_tester
|
|
|
|
*
|
|
|
|
* Matcher tester function which actually trains tess.
|
2010-07-28 08:38:09 +08:00
|
|
|
*/
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
void tess_training_tester( //call tess
|
2009-07-11 10:03:51 +08:00
|
|
|
const STRING& filename, //filename to output
|
2007-03-08 04:03:40 +08:00
|
|
|
PBLOB *blob, //blob to match
|
|
|
|
DENORM *denorm, //de-normaliser
|
|
|
|
BOOL8 correct, //ly segmented
|
|
|
|
char *text, //correct text
|
2008-04-22 08:32:14 +08:00
|
|
|
inT32 count, //chars in text
|
2007-03-08 04:03:40 +08:00
|
|
|
BLOB_CHOICE_LIST *ratings //list of results
|
|
|
|
) {
|
|
|
|
TBLOB *tessblob; //converted blob
|
|
|
|
TEXTROW tessrow; //dummy row
|
|
|
|
|
|
|
|
if (correct) {
|
2009-07-11 10:03:51 +08:00
|
|
|
classify_norm_method.set_value(character); // force char norm spc 30/11/93
|
|
|
|
tess_bn_matching.set_value(false); //turn it off
|
|
|
|
tess_cn_matching.set_value(false);
|
2007-03-08 04:03:40 +08:00
|
|
|
//convert blob
|
|
|
|
tessblob = make_tess_blob (blob, TRUE);
|
|
|
|
//make dummy row
|
|
|
|
make_tess_row(denorm, &tessrow);
|
|
|
|
//learn it
|
2009-07-11 10:03:51 +08:00
|
|
|
LearnBlob(filename, tessblob, &tessrow, text);
|
2007-03-08 04:03:40 +08:00
|
|
|
free_blob(tessblob);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
 * tess_adapter
 *
 * Adapt to the word using the Tesseract mechanism.
 */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
namespace tesseract {
|
|
|
|
void Tesseract::tess_adapter( //adapt to word
|
|
|
|
WERD *word, //bln word
|
|
|
|
DENORM *denorm, //de-normalise
|
|
|
|
const WERD_CHOICE& choice, //string for word
|
|
|
|
const WERD_CHOICE& raw_choice, //before context
|
|
|
|
const char *rejmap //reject map
|
|
|
|
) {
|
2007-03-08 04:03:40 +08:00
|
|
|
TWERD *tessword; //converted word
|
|
|
|
static TEXTROW tessrow; //dummy row
|
|
|
|
|
|
|
|
//make dummy row
|
|
|
|
make_tess_row(denorm, &tessrow);
|
|
|
|
//make a word
|
|
|
|
tessword = make_tess_word (word, &tessrow);
|
2007-07-18 09:15:07 +08:00
|
|
|
AdaptToWord(tessword, &tessrow, choice, raw_choice, rejmap);
|
2007-03-08 04:03:40 +08:00
|
|
|
//adapt to it
|
|
|
|
delete_word(tessword); //free it
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-07-28 08:38:09 +08:00
|
|
|
/**
|
2007-03-08 04:03:40 +08:00
|
|
|
* tess_add_doc_word
|
|
|
|
*
|
|
|
|
* Add the given word to the document dictionary
|
2010-07-28 08:38:09 +08:00
|
|
|
*/
|
2009-07-11 10:03:51 +08:00
|
|
|
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
|
|
|
|
getDict().add_document_word(*word_choice);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:03:51 +08:00
|
|
|
} // namespace tesseract
|