2007-03-08 04:03:40 +08:00
|
|
|
/******************************************************************************
|
2010-07-27 21:23:23 +08:00
|
|
|
** Filename: adaptmatch.c
|
|
|
|
** Purpose: High level adaptive matcher.
|
|
|
|
** Author: Dan Johnson
|
|
|
|
** History: Mon Mar 11 10:00:10 1991, DSJ, Created.
|
2007-03-08 04:03:40 +08:00
|
|
|
**
|
2008-02-01 08:07:59 +08:00
|
|
|
** (c) Copyright Hewlett-Packard Company, 1988.
|
2007-03-08 04:03:40 +08:00
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
******************************************************************************/
|
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/*-----------------------------------------------------------------------------
|
2007-03-08 04:03:40 +08:00
|
|
|
Include Files and Type Defines
|
2010-07-27 21:23:23 +08:00
|
|
|
-----------------------------------------------------------------------------*/
|
2007-03-08 04:03:40 +08:00
|
|
|
#include <ctype.h>
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "ambigs.h"
|
|
|
|
#include "blobclass.h"
|
|
|
|
#include "blobs.h"
|
|
|
|
#include "helpers.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "normfeat.h"
|
|
|
|
#include "mfoutline.h"
|
|
|
|
#include "picofeat.h"
|
|
|
|
#include "float2int.h"
|
|
|
|
#include "outfeat.h"
|
|
|
|
#include "emalloc.h"
|
|
|
|
#include "intfx.h"
|
|
|
|
#include "speckle.h"
|
|
|
|
#include "efio.h"
|
|
|
|
#include "normmatch.h"
|
|
|
|
#include "permute.h"
|
|
|
|
#include "ndminx.h"
|
|
|
|
#include "intproto.h"
|
|
|
|
#include "const.h"
|
|
|
|
#include "globals.h"
|
|
|
|
#include "werd.h"
|
|
|
|
#include "callcpp.h"
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "pageres.h"
|
|
|
|
#include "params.h"
|
2009-07-11 10:17:36 +08:00
|
|
|
#include "classify.h"
|
|
|
|
#include "unicharset.h"
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "dict.h"
|
|
|
|
#include "featdefs.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <math.h>
|
|
|
|
#ifdef __UNIX__
|
|
|
|
#include <assert.h>
|
|
|
|
#endif
|
|
|
|
|
2010-09-30 23:53:40 +08:00
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
#include "config_auto.h"
|
|
|
|
#endif
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
/* Suffix appended to the image file name to build the name of the file
   that adapted templates are saved to and read from. */
#define ADAPT_TEMPLATE_SUFFIX ".a"

#define MAX_MATCHES 10
/* A blob yielding more outline features than this is not used to
   initialize a new adapted class. */
#define UNLIKELY_NUM_FEAT 200
#define NO_DEBUG 0
#define MAX_ADAPTABLE_WERD_SIZE 40

#define ADAPTABLE_WERD_ADJUSTMENT 0.05

/* Shift applied to outline-feature Y values when building protos, to move
   from the baseline-normalized Y range to the range ConvertProto expects. */
#define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)

/* Rating given to best_match when an ADAPT_RESULTS is initialized. */
#define WORST_POSSIBLE_RATING 1.0
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
struct ScoredClass {
|
|
|
|
CLASS_ID id;
|
|
|
|
FLOAT32 rating;
|
|
|
|
inT16 config;
|
|
|
|
inT16 config2;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ADAPT_RESULTS {
|
2008-04-22 08:23:04 +08:00
|
|
|
inT32 BlobLength;
|
2007-03-08 04:03:40 +08:00
|
|
|
int NumMatches;
|
2009-07-11 10:17:36 +08:00
|
|
|
bool HasNonfragment;
|
2010-11-24 02:34:14 +08:00
|
|
|
ScoredClass match[MAX_NUM_CLASSES];
|
|
|
|
ScoredClass best_match;
|
2008-02-01 08:07:59 +08:00
|
|
|
CLASS_PRUNER_RESULTS CPResults;
|
2009-07-11 10:17:36 +08:00
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/// Initializes data members to the default values. Sets the initial
|
|
|
|
/// rating of each class to be the worst possible rating (1.0).
|
2009-07-11 10:17:36 +08:00
|
|
|
inline void Initialize() {
|
|
|
|
BlobLength = MAX_INT32;
|
|
|
|
NumMatches = 0;
|
|
|
|
HasNonfragment = false;
|
2010-11-24 02:34:14 +08:00
|
|
|
best_match.id = NO_CLASS;
|
|
|
|
best_match.rating = WORST_POSSIBLE_RATING;
|
|
|
|
best_match.config = 0;
|
|
|
|
best_match.config2 = 0;
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
|
|
|
};
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
struct PROTO_KEY {
|
2007-03-08 04:03:40 +08:00
|
|
|
ADAPT_TEMPLATES Templates;
|
|
|
|
CLASS_ID ClassId;
|
|
|
|
int ConfigId;
|
2010-11-24 02:34:14 +08:00
|
|
|
};
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/*-----------------------------------------------------------------------------
|
2007-03-08 04:03:40 +08:00
|
|
|
Private Macros
|
2010-07-27 21:23:23 +08:00
|
|
|
-----------------------------------------------------------------------------*/
|
2008-02-01 08:07:59 +08:00
|
|
|
/* Non-zero when Rating is strictly greater than matcher_great_threshold. */
#define MarginalMatch(Rating) ((Rating) > matcher_great_threshold)

/* Clears the FeaturesHaveBeenExtracted flag. */
#define InitIntFX() (FeaturesHaveBeenExtracted = FALSE)
|
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/*-----------------------------------------------------------------------------
|
2007-03-08 04:03:40 +08:00
|
|
|
Private Function Prototypes
|
2010-07-27 21:23:23 +08:00
|
|
|
-----------------------------------------------------------------------------*/
|
2010-11-24 02:34:14 +08:00
|
|
|
int CompareByRating(const void *arg1, const void *arg2);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
void InitMatcherRatings(register FLOAT32 *Rating);
|
|
|
|
|
|
|
|
int MakeTempProtoPerm(void *item1, void *item2);
|
|
|
|
|
|
|
|
void SetAdaptiveThreshold(FLOAT32 Threshold);
|
|
|
|
|
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/*-----------------------------------------------------------------------------
|
2007-03-08 04:03:40 +08:00
|
|
|
Public Code
|
2010-07-27 21:23:23 +08:00
|
|
|
-----------------------------------------------------------------------------*/
|
2007-03-08 04:03:40 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2009-07-11 10:17:36 +08:00
|
|
|
namespace tesseract {
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine calls the adaptive matcher
|
|
|
|
* which returns (in an array) the class id of each
|
|
|
|
* class matched.
|
|
|
|
*
|
|
|
|
* It also returns the number of classes matched.
|
|
|
|
* For each class matched it places the best rating
|
|
|
|
* found for that class into the Ratings array.
|
|
|
|
*
|
|
|
|
* Bad matches are then removed so that they don't
|
|
|
|
* need to be sorted. The remaining good matches are
|
|
|
|
* then sorted and converted to choices.
|
|
|
|
*
|
|
|
|
* This routine also performs some simple speckle
|
|
|
|
* filtering.
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Mon Mar 11 10:00:58 1991, DSJ, Created.
|
|
|
|
*
|
|
|
|
* @param Blob blob to be classified
|
|
|
|
* @param[out] Choices List of choices found by adaptive matcher.
|
|
|
|
* @param[out] CPResults Array of CPResultStruct of size MAX_NUM_CLASSES is
|
|
|
|
* filled on return with the choices found by the
|
|
|
|
* class pruner and the ratings therefrom. Also
|
|
|
|
* contains the detailed results of the integer matcher.
|
|
|
|
*
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::AdaptiveClassifier(TBLOB *Blob,
|
|
|
|
BLOB_CHOICE_LIST *Choices,
|
|
|
|
CLASS_PRUNER_RESULTS CPResults) {
|
|
|
|
assert(Choices != NULL);
|
|
|
|
ADAPT_RESULTS *Results = new ADAPT_RESULTS();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (AdaptedTemplates == NULL)
|
2009-07-11 10:17:36 +08:00
|
|
|
AdaptedTemplates = NewAdaptedTemplates (true);
|
|
|
|
|
|
|
|
Results->Initialize();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
DoAdaptiveMatch(Blob, Results);
|
2009-07-11 10:17:36 +08:00
|
|
|
if (CPResults != NULL)
|
|
|
|
memcpy(CPResults, Results->CPResults,
|
|
|
|
sizeof(CPResults[0]) * Results->NumMatches);
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
RemoveBadMatches(Results);
|
|
|
|
qsort((void *)Results->match, Results->NumMatches,
|
|
|
|
sizeof(ScoredClass), CompareByRating);
|
2009-06-03 07:34:20 +08:00
|
|
|
RemoveExtraPuncs(Results);
|
2009-07-11 10:17:36 +08:00
|
|
|
ConvertMatchesToChoices(Results, Choices);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (matcher_debug_level >= 1) {
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("AD Matches = ");
|
2009-06-03 07:34:20 +08:00
|
|
|
PrintAdaptiveMatchResults(stdout, Results);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (LargeSpeckle(Blob))
|
2009-07-11 10:17:36 +08:00
|
|
|
AddLargeSpeckleTo(Choices);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_enable_adaptive_debugger)
|
2010-11-24 02:34:14 +08:00
|
|
|
DebugAdaptiveClassifier(Blob, Results);
|
2007-03-08 04:03:40 +08:00
|
|
|
#endif
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
NumClassesOutput += Choices->length();
|
|
|
|
if (Choices->length() == 0) {
|
2010-11-24 02:34:14 +08:00
|
|
|
if (!classify_bln_numeric_mode)
|
2009-07-11 10:17:36 +08:00
|
|
|
tprintf ("Empty classification!\n"); // Should never normally happen.
|
|
|
|
Choices = new BLOB_CHOICE_LIST();
|
|
|
|
BLOB_CHOICE_IT temp_it;
|
|
|
|
temp_it.set_to_list(Choices);
|
2010-11-24 02:34:14 +08:00
|
|
|
temp_it.add_to_end(new BLOB_CHOICE(0, 50.0f, -20.0f, -1, -1, NULL));
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2009-06-03 07:34:20 +08:00
|
|
|
delete Results;
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* AdaptiveClassifier */
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Learns the given word using its chopped_word, seam_array, denorm,
|
|
|
|
// box_word, best_state, and correct_text to learn both correctly and
|
|
|
|
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
|
|
|
|
// is called and the data will be written to a file for static training.
|
|
|
|
// Otherwise AdaptToBlob is called for adaption within a document.
|
|
|
|
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
|
|
|
|
// be learned, otherwise all chars with good correct_text are learned.
|
|
|
|
void Classify::LearnWord(const char* filename, const char *rejmap,
|
|
|
|
WERD_RES *word) {
|
|
|
|
int word_len = word->correct_text.size();
|
|
|
|
if (word_len == 0) return;
|
|
|
|
|
|
|
|
float* thresholds = NULL;
|
|
|
|
if (filename == NULL) {
|
|
|
|
// Adaption mode.
|
|
|
|
if (!EnableLearning || word->best_choice == NULL)
|
|
|
|
return; // Can't or won't adapt.
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
NumWordsAdaptedTo++;
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 1)
|
2010-11-24 02:34:14 +08:00
|
|
|
tprintf("\n\nAdapting to word = %s\n",
|
|
|
|
word->best_choice->debug_string(unicharset).string());
|
|
|
|
thresholds = new float[word_len];
|
|
|
|
GetAdaptThresholds(word->rebuild_word, *word->best_choice,
|
|
|
|
*word->raw_choice, thresholds);
|
|
|
|
}
|
|
|
|
int start_blob = 0;
|
|
|
|
char prev_map_char = '0';
|
|
|
|
for (int ch = 0; ch < word_len; ++ch) {
|
|
|
|
char rej_map_char = rejmap != NULL ? *rejmap++ : '1';
|
|
|
|
char next_map_char = ch + 1 < word_len
|
|
|
|
? (rejmap != NULL ? *rejmap : '1')
|
|
|
|
: '0';
|
|
|
|
if (word->correct_text[ch].length() > 0 && rej_map_char == '1') {
|
|
|
|
float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;
|
|
|
|
LearnPieces(filename, start_blob, word->best_state[ch],
|
|
|
|
threshold, CST_WHOLE, word->correct_text[ch].string(), word);
|
|
|
|
if (word->best_state[ch] > 1) {
|
|
|
|
// Blob includes fragments, so learn them.
|
|
|
|
for (int frag = 0; frag < word->best_state[ch]; ++frag) {
|
|
|
|
STRING frag_str = CHAR_FRAGMENT::to_string(
|
|
|
|
word->correct_text[ch].string(), frag, word->best_state[ch]);
|
|
|
|
LearnPieces(filename, start_blob + frag, 1,
|
|
|
|
threshold, CST_FRAGMENT, frag_str.string(), word);
|
|
|
|
}
|
|
|
|
// If the next blob is good, make junk with the rightmost fragment.
|
|
|
|
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
|
|
|
|
next_map_char == '1') {
|
|
|
|
LearnPieces(filename, start_blob + word->best_state[ch] - 1,
|
|
|
|
word->best_state[ch + 1] + 1,
|
|
|
|
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
|
|
|
|
}
|
|
|
|
// If the previous blob is good, make junk with the leftmost fragment.
|
|
|
|
if (ch > 0 && word->correct_text[ch - 1].length() > 0 &&
|
|
|
|
prev_map_char == '1') {
|
|
|
|
LearnPieces(filename, start_blob - word->best_state[ch - 1],
|
|
|
|
word->best_state[ch - 1] + 1,
|
|
|
|
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// If the next blob is good, make a join with it.
|
|
|
|
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
|
|
|
|
next_map_char == '1') {
|
|
|
|
STRING joined_text = word->correct_text[ch];
|
|
|
|
joined_text += word->correct_text[ch + 1];
|
|
|
|
LearnPieces(filename, start_blob,
|
|
|
|
word->best_state[ch] + word->best_state[ch + 1],
|
|
|
|
threshold, CST_NGRAM, joined_text.string(), word);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
start_blob += word->best_state[ch];
|
|
|
|
prev_map_char = rej_map_char;
|
|
|
|
}
|
|
|
|
delete [] thresholds;
|
|
|
|
} // LearnWord.
|
|
|
|
|
|
|
|
// Builds a blob of length fragments, from the word, starting at start,
|
|
|
|
// and then learns it, as having the given correct_text.
|
|
|
|
// If filename is not NULL, then LearnBlob
|
|
|
|
// is called and the data will be written to a file for static training.
|
|
|
|
// Otherwise AdaptToBlob is called for adaption within a document.
|
|
|
|
// threshold is a magic number required by AdaptToChar and generated by
|
|
|
|
// GetAdaptThresholds.
|
|
|
|
// Although it can be partly inferred from the string, segmentation is
|
|
|
|
// provided to explicitly clarify the character segmentation.
|
|
|
|
void Classify::LearnPieces(const char* filename, int start, int length,
|
|
|
|
float threshold, CharSegmentationType segmentation,
|
|
|
|
const char* correct_text, WERD_RES *word) {
|
|
|
|
// TODO(daria) Remove/modify this if when we want to train and/or adapt
|
|
|
|
// on fragments and/or n-grams.
|
|
|
|
if (segmentation != CST_WHOLE)
|
|
|
|
return;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
join_pieces(word->chopped_word->blobs, word->seam_array,
|
|
|
|
start, start + length - 1);
|
|
|
|
TBLOB* blob = word->chopped_word->blobs;
|
|
|
|
for (int i = 0; i < start; ++i)
|
|
|
|
blob = blob->next;
|
|
|
|
if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
|
|
|
|
const int kSampleSpaceWidth = 500;
|
|
|
|
if (learn_debug_win_ == NULL) {
|
|
|
|
learn_debug_win_ = new ScrollView(classify_learn_debug_str.string(),
|
|
|
|
100, 400, kSampleSpaceWidth * 2, 200,
|
|
|
|
kSampleSpaceWidth* 2, 200, true);
|
|
|
|
}
|
|
|
|
learn_debug_win_->Clear();
|
|
|
|
learn_debug_win_->Pen(64, 64, 64);
|
|
|
|
learn_debug_win_->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
|
|
|
|
kSampleSpaceWidth, kBlnBaselineOffset);
|
|
|
|
learn_debug_win_->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
|
|
|
|
kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
|
|
|
|
blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
|
|
|
|
learn_debug_win_->Update();
|
|
|
|
window_wait(learn_debug_win_);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (filename != NULL) {
|
|
|
|
classify_norm_method.set_value(character); // force char norm spc 30/11/93
|
|
|
|
tess_bn_matching.set_value(false); // turn it off
|
|
|
|
tess_cn_matching.set_value(false);
|
|
|
|
LearnBlob(feature_defs_, filename, blob, word->denorm, correct_text);
|
|
|
|
} else {
|
|
|
|
if (!unicharset.contains_unichar(correct_text)) {
|
|
|
|
unicharset.unichar_insert(correct_text);
|
|
|
|
// TODO(rays) We may need to worry about exceeding MAX_NUM_CLASSES.
|
|
|
|
// if (unicharset_boxes->size() > MAX_NUM_CLASSES) ...
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 1)
|
2010-11-24 02:34:14 +08:00
|
|
|
tprintf("Adapting to char = %s, thr= %g\n",
|
|
|
|
unicharset.id_to_unichar(class_id), threshold);
|
|
|
|
AdaptToChar(blob, class_id, threshold);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
break_pieces(blob, word->seam_array, start, start + length - 1);
|
|
|
|
} // LearnPieces.
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine performs cleanup operations
|
|
|
|
* on the adaptive classifier. It should be called
|
|
|
|
* before the program is terminated. Its main function
|
|
|
|
* is to save the adapted templates to a file.
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - #AdaptedTemplates current set of adapted templates
|
|
|
|
* - #classify_save_adapted_templates TRUE if templates should be saved
|
|
|
|
* - #classify_enable_adaptive_matcher TRUE if adaptive matcher is enabled
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 19 14:37:06 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::EndAdaptiveClassifier() {
|
|
|
|
STRING Filename;
|
2007-03-08 04:03:40 +08:00
|
|
|
FILE *File;
|
|
|
|
|
|
|
|
#ifndef SECURE_NAMES
|
2009-07-11 10:17:36 +08:00
|
|
|
if (AdaptedTemplates != NULL &&
|
|
|
|
classify_enable_adaptive_matcher && classify_save_adapted_templates) {
|
|
|
|
Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
|
|
|
|
File = fopen (Filename.string(), "wb");
|
2007-03-08 04:03:40 +08:00
|
|
|
if (File == NULL)
|
2009-07-11 10:17:36 +08:00
|
|
|
cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
|
2007-03-08 04:03:40 +08:00
|
|
|
else {
|
2009-07-11 10:17:36 +08:00
|
|
|
cprintf ("\nSaving adapted templates to %s ...", Filename.string());
|
2007-03-08 04:03:40 +08:00
|
|
|
fflush(stdout);
|
|
|
|
WriteAdaptedTemplates(File, AdaptedTemplates);
|
|
|
|
cprintf ("\n");
|
|
|
|
fclose(File);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
if (AdaptedTemplates != NULL) {
|
|
|
|
free_adapted_templates(AdaptedTemplates);
|
|
|
|
AdaptedTemplates = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PreTrainedTemplates != NULL) {
|
|
|
|
free_int_templates(PreTrainedTemplates);
|
|
|
|
PreTrainedTemplates = NULL;
|
|
|
|
}
|
|
|
|
getDict().EndDangerousAmbigs();
|
2007-03-08 04:03:40 +08:00
|
|
|
FreeNormProtos();
|
2009-07-11 10:17:36 +08:00
|
|
|
if (AllProtosOn != NULL) {
|
|
|
|
FreeBitVector(AllProtosOn);
|
|
|
|
FreeBitVector(PrunedProtos);
|
|
|
|
FreeBitVector(AllConfigsOn);
|
|
|
|
FreeBitVector(AllProtosOff);
|
|
|
|
FreeBitVector(AllConfigsOff);
|
|
|
|
FreeBitVector(TempProtoMask);
|
|
|
|
AllProtosOn = NULL;
|
|
|
|
PrunedProtos = NULL;
|
|
|
|
AllConfigsOn = NULL;
|
|
|
|
AllProtosOff = NULL;
|
|
|
|
AllConfigsOff = NULL;
|
|
|
|
TempProtoMask = NULL;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* EndAdaptiveClassifier */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine reads in the training
|
|
|
|
* information needed by the adaptive classifier
|
|
|
|
* and saves it into global variables.
|
2010-11-24 02:34:14 +08:00
|
|
|
* Parameters:
|
|
|
|
* load_pre_trained_templates Indicates whether the pre-trained
|
|
|
|
* templates (inttemp, normproto and pffmtable components)
|
|
|
|
* should be lodaded. Should only be set to true if the
|
|
|
|
* necesary classifier components are present in the
|
|
|
|
* [lang].traineddata file.
|
|
|
|
* Globals:
|
|
|
|
* BuiltInTemplatesFile file to get built-in temps from
|
|
|
|
* BuiltInCutoffsFile file to get avg. feat per class from
|
|
|
|
* classify_use_pre_adapted_templates
|
|
|
|
* enables use of pre-adapted templates
|
|
|
|
* @note History: Mon Mar 11 12:49:34 1991, DSJ, Created.
|
2010-07-27 21:23:23 +08:00
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
|
2009-07-11 10:17:36 +08:00
|
|
|
if (!classify_enable_adaptive_matcher)
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
2009-07-11 10:17:36 +08:00
|
|
|
if (AllProtosOn != NULL)
|
2008-02-01 08:07:59 +08:00
|
|
|
EndAdaptiveClassifier(); // Don't leak with multiple inits.
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
// If there is no language_data_path_prefix, the classifier will be
|
|
|
|
// adaptive only.
|
2010-11-24 02:34:14 +08:00
|
|
|
if (language_data_path_prefix.length() > 0 &&
|
|
|
|
load_pre_trained_templates) {
|
|
|
|
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_INTTEMP));
|
|
|
|
PreTrainedTemplates =
|
|
|
|
ReadIntTemplates(tessdata_manager.GetDataFilePtr());
|
|
|
|
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n");
|
|
|
|
|
|
|
|
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE));
|
|
|
|
ReadNewCutoffs(tessdata_manager.GetDataFilePtr(),
|
|
|
|
tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE),
|
|
|
|
CharNormCutoffs);
|
|
|
|
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n");
|
|
|
|
|
|
|
|
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO));
|
|
|
|
NormProtos =
|
|
|
|
ReadNormProtos(tessdata_manager.GetDataFilePtr(),
|
|
|
|
tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO));
|
|
|
|
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n");
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.Init(&classify_debug_level, classify_integer_matcher_multiplier);
|
2007-03-08 04:03:40 +08:00
|
|
|
InitIntegerFX();
|
|
|
|
|
2009-03-11 03:03:06 +08:00
|
|
|
AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
|
|
|
|
PrunedProtos = NewBitVector(MAX_NUM_PROTOS);
|
|
|
|
AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
|
|
|
|
AllProtosOff = NewBitVector(MAX_NUM_PROTOS);
|
|
|
|
AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
|
|
|
|
TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
|
|
|
|
set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
|
|
|
|
set_all_bits(PrunedProtos, WordsInVectorOfSize(MAX_NUM_PROTOS));
|
|
|
|
set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
|
|
|
|
zero_all_bits(AllProtosOff, WordsInVectorOfSize(MAX_NUM_PROTOS));
|
|
|
|
zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
for (int i = 0; i < MAX_NUM_CLASSES; i++) {
|
|
|
|
BaselineCutoffs[i] = 0;
|
|
|
|
}
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_use_pre_adapted_templates) {
|
|
|
|
FILE *File;
|
|
|
|
STRING Filename;
|
|
|
|
|
2007-05-16 09:18:59 +08:00
|
|
|
Filename = imagefile;
|
|
|
|
Filename += ADAPT_TEMPLATE_SUFFIX;
|
2009-07-11 10:17:36 +08:00
|
|
|
File = fopen(Filename.string(), "rb");
|
|
|
|
if (File == NULL) {
|
|
|
|
AdaptedTemplates = NewAdaptedTemplates(true);
|
|
|
|
} else {
|
2007-03-08 04:03:40 +08:00
|
|
|
#ifndef SECURE_NAMES
|
2009-07-11 10:17:36 +08:00
|
|
|
cprintf("\nReading pre-adapted templates from %s ...\n",
|
|
|
|
Filename.string());
|
2007-03-08 04:03:40 +08:00
|
|
|
fflush(stdout);
|
|
|
|
#endif
|
2009-07-11 10:17:36 +08:00
|
|
|
AdaptedTemplates = ReadAdaptedTemplates(File);
|
|
|
|
cprintf("\n");
|
2007-03-08 04:03:40 +08:00
|
|
|
fclose(File);
|
|
|
|
PrintAdaptedTemplates(stdout, AdaptedTemplates);
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
|
|
|
|
BaselineCutoffs[i] = CharNormCutoffs[i];
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
2008-02-01 08:07:59 +08:00
|
|
|
} else {
|
|
|
|
if (AdaptedTemplates != NULL)
|
|
|
|
free_adapted_templates(AdaptedTemplates);
|
2009-07-11 10:17:36 +08:00
|
|
|
AdaptedTemplates = NewAdaptedTemplates(true);
|
2008-02-01 08:07:59 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* InitAdaptiveClassifier */
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::ResetAdaptiveClassifier() {
|
2010-11-24 02:34:14 +08:00
|
|
|
if (classify_learning_debug_level > 0) {
|
|
|
|
tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
|
|
|
|
NumAdaptationsFailed);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
free_adapted_templates(AdaptedTemplates);
|
|
|
|
AdaptedTemplates = NULL;
|
2010-11-24 02:34:14 +08:00
|
|
|
NumAdaptationsFailed = 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* Print to File the statistics which have
|
|
|
|
* been gathered for the adaptive matcher.
|
|
|
|
*
|
|
|
|
* @param File open text file to print adaptive statistics to
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Thu Apr 18 14:37:37 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::PrintAdaptiveStatistics(FILE *File) {
|
2007-03-08 04:03:40 +08:00
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
|
|
|
|
fprintf (File, "\nADAPTIVE MATCHER STATISTICS:\n");
|
|
|
|
fprintf (File, "\tNum blobs classified = %d\n", AdaptiveMatcherCalls);
|
|
|
|
fprintf (File, "\tNum classes output = %d (Avg = %4.2f)\n",
|
|
|
|
NumClassesOutput,
|
|
|
|
((AdaptiveMatcherCalls == 0) ? (0.0) :
|
|
|
|
((float) NumClassesOutput / AdaptiveMatcherCalls)));
|
|
|
|
fprintf (File, "\t\tBaseline Classifier: %4d calls (%4.2f classes/call)\n",
|
|
|
|
BaselineClassifierCalls,
|
|
|
|
((BaselineClassifierCalls == 0) ? (0.0) :
|
|
|
|
((float) NumBaselineClassesTried / BaselineClassifierCalls)));
|
|
|
|
fprintf (File, "\t\tCharNorm Classifier: %4d calls (%4.2f classes/call)\n",
|
|
|
|
CharNormClassifierCalls,
|
|
|
|
((CharNormClassifierCalls == 0) ? (0.0) :
|
|
|
|
((float) NumCharNormClassesTried / CharNormClassifierCalls)));
|
|
|
|
fprintf (File, "\t\tAmbig Classifier: %4d calls (%4.2f classes/call)\n",
|
|
|
|
AmbigClassifierCalls,
|
|
|
|
((AmbigClassifierCalls == 0) ? (0.0) :
|
|
|
|
((float) NumAmbigClassesTried / AmbigClassifierCalls)));
|
|
|
|
|
|
|
|
fprintf (File, "\nADAPTIVE LEARNER STATISTICS:\n");
|
|
|
|
fprintf (File, "\tNumber of words adapted to: %d\n", NumWordsAdaptedTo);
|
|
|
|
fprintf (File, "\tNumber of chars adapted to: %d\n", NumCharsAdaptedTo);
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
PrintAdaptedTemplates(File, AdaptedTemplates);
|
2007-03-08 04:03:40 +08:00
|
|
|
#endif
|
|
|
|
} /* PrintAdaptiveStatistics */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine prepares the adaptive
|
|
|
|
* matcher for the start
|
|
|
|
* of the first pass. Learning is enabled (unless it
|
|
|
|
* is disabled for the whole program).
|
|
|
|
*
|
2010-08-11 03:20:11 +08:00
|
|
|
* @note this is somewhat redundant, it simply says that if learning is
|
|
|
|
* enabled then it will remain enabled on the first pass. If it is
|
|
|
|
* disabled, then it will remain disabled. This is only put here to
|
|
|
|
* make it very clear that learning is controlled directly by the global
|
|
|
|
* setting of EnableLearning.
|
|
|
|
*
|
2010-07-27 21:23:23 +08:00
|
|
|
* Globals:
|
|
|
|
* - #EnableLearning
|
|
|
|
* set to TRUE by this routine
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Mon Apr 15 16:39:29 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::SettupPass1() {
|
|
|
|
EnableLearning = classify_enable_learning;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
getDict().SettupStopperPass1();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
} /* SettupPass1 */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine prepares the adaptive
|
|
|
|
* matcher for the start of the second pass. Further
|
|
|
|
* learning is disabled.
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - #EnableLearning set to FALSE by this routine
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Mon Apr 15 16:39:29 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::SettupPass2() {
|
2007-03-08 04:03:40 +08:00
|
|
|
EnableLearning = FALSE;
|
2009-07-11 10:17:36 +08:00
|
|
|
getDict().SettupStopperPass2();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
} /* SettupPass2 */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine creates a new adapted
|
|
|
|
* class and uses Blob as the model for the first
|
|
|
|
* config in that class.
|
|
|
|
*
|
|
|
|
* @param Blob blob to model new class after
|
|
|
|
* @param ClassId id of the class to be initialized
|
|
|
|
* @param Class adapted class to be initialized
|
|
|
|
* @param Templates adapted templates to add new class to
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - #AllProtosOn dummy mask with all 1's
|
|
|
|
* - BaselineCutoffs kludge needed to get cutoffs
|
|
|
|
* - #PreTrainedTemplates kludge needed to get cutoffs
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Thu Mar 14 12:49:39 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::InitAdaptedClass(TBLOB *Blob,
|
|
|
|
CLASS_ID ClassId,
|
|
|
|
ADAPT_CLASS Class,
|
|
|
|
ADAPT_TEMPLATES Templates) {
|
2007-03-08 04:03:40 +08:00
|
|
|
FEATURE_SET Features;
|
|
|
|
int Fid, Pid;
|
|
|
|
FEATURE Feature;
|
|
|
|
int NumFeatures;
|
|
|
|
TEMP_PROTO TempProto;
|
|
|
|
PROTO Proto;
|
|
|
|
INT_CLASS IClass;
|
|
|
|
TEMP_CONFIG Config;
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
classify_norm_method.set_value(baseline);
|
2010-11-24 02:34:14 +08:00
|
|
|
Features = ExtractOutlineFeatures(Blob);
|
2009-03-11 03:03:06 +08:00
|
|
|
NumFeatures = Features->NumFeatures;
|
2009-07-11 10:17:36 +08:00
|
|
|
if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
|
2007-03-08 04:03:40 +08:00
|
|
|
FreeFeatureSet(Features);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
Config = NewTempConfig (NumFeatures - 1);
|
|
|
|
TempConfigFor (Class, 0) = Config;
|
|
|
|
|
|
|
|
/* this is a kludge to construct cutoffs for adapted templates */
|
2007-05-16 09:18:59 +08:00
|
|
|
if (Templates == AdaptedTemplates)
|
2009-07-11 10:17:36 +08:00
|
|
|
BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
IClass = ClassForClassId (Templates->Templates, ClassId);
|
|
|
|
|
2009-03-11 03:03:06 +08:00
|
|
|
for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
|
2007-03-08 04:03:40 +08:00
|
|
|
Pid = AddIntProto (IClass);
|
|
|
|
assert (Pid != NO_PROTO);
|
|
|
|
|
2009-03-11 03:03:06 +08:00
|
|
|
Feature = Features->Features[Fid];
|
2007-03-08 04:03:40 +08:00
|
|
|
TempProto = NewTempProto ();
|
|
|
|
Proto = &(TempProto->Proto);
|
|
|
|
|
|
|
|
/* compute proto params - NOTE that Y_DIM_OFFSET must be used because
|
|
|
|
ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
|
|
|
|
instead of the -0.25 to 0.75 used in baseline normalization */
|
2009-03-11 03:03:06 +08:00
|
|
|
Proto->Angle = Feature->Params[OutlineFeatDir];
|
|
|
|
Proto->X = Feature->Params[OutlineFeatX];
|
|
|
|
Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
|
|
|
|
Proto->Length = Feature->Params[OutlineFeatLength];
|
2007-03-08 04:03:40 +08:00
|
|
|
FillABC(Proto);
|
|
|
|
|
|
|
|
TempProto->ProtoId = Pid;
|
|
|
|
SET_BIT (Config->Protos, Pid);
|
|
|
|
|
|
|
|
ConvertProto(Proto, Pid, IClass);
|
2010-11-24 02:34:14 +08:00
|
|
|
AddProtoToProtoPruner(Proto, Pid, IClass,
|
|
|
|
classify_learning_debug_level >= 2);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
Class->TempProtos = push (Class->TempProtos, TempProto);
|
|
|
|
}
|
|
|
|
FreeFeatureSet(Features);
|
|
|
|
|
|
|
|
AddIntConfig(IClass);
|
|
|
|
ConvertConfig (AllProtosOn, 0, IClass);
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 1) {
|
|
|
|
cprintf ("Added new class '%s' with class id %d and %d protos.\n",
|
|
|
|
unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
if (IsEmptyAdaptedClass(Class))
|
|
|
|
(Templates->NumNonEmptyClasses)++;
|
|
|
|
} /* InitAdaptedClass */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine sets up the feature
|
|
|
|
* extractor to extract baseline normalized
|
|
|
|
* pico-features.
|
|
|
|
*
|
|
|
|
* The extracted pico-features are converted
|
|
|
|
* to integer form and placed in IntFeatures. The
|
|
|
|
* original floating-pt. features are returned in
|
|
|
|
* FloatFeatures.
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
* @param Blob blob to extract features from
|
|
|
|
* @param LineStats statistics about text row blob is in
|
|
|
|
* @param[out] IntFeatures array to fill with integer features
|
|
|
|
* @param[out] FloatFeatures place to return actual floating-pt features
|
|
|
|
*
|
|
|
|
* @return Number of pico-features returned (0 if
|
|
|
|
* an error occurred)
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 17:55:18 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
int Classify::GetAdaptiveFeatures(TBLOB *Blob,
|
|
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
|
|
FEATURE_SET *FloatFeatures) {
|
2007-03-08 04:03:40 +08:00
|
|
|
FEATURE_SET Features;
|
|
|
|
int NumFeatures;
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
classify_norm_method.set_value(baseline);
|
2010-11-24 02:34:14 +08:00
|
|
|
Features = ExtractPicoFeatures(Blob);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-03-11 03:03:06 +08:00
|
|
|
NumFeatures = Features->NumFeatures;
|
2007-03-08 04:03:40 +08:00
|
|
|
if (NumFeatures > UNLIKELY_NUM_FEAT) {
|
|
|
|
FreeFeatureSet(Features);
|
2010-11-24 02:34:14 +08:00
|
|
|
return 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ComputeIntFeatures(Features, IntFeatures);
|
|
|
|
*FloatFeatures = Features;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
return NumFeatures;
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* GetAdaptiveFeatures */
|
|
|
|
|
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/*-----------------------------------------------------------------------------
|
2007-03-08 04:03:40 +08:00
|
|
|
Private Code
|
2010-07-27 21:23:23 +08:00
|
|
|
-----------------------------------------------------------------------------*/
|
2007-03-08 04:03:40 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* Return TRUE if the specified word is
|
|
|
|
* acceptable for adaptation.
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @param Word current word
|
|
|
|
* @param BestChoiceWord best overall choice for word with context
|
|
|
|
* @param RawChoiceWord best choice for word without context
|
|
|
|
*
|
|
|
|
* @return TRUE or FALSE
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Thu May 30 14:25:06 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
int Classify::AdaptableWord(TWERD *Word,
|
|
|
|
const WERD_CHOICE &BestChoiceWord,
|
|
|
|
const WERD_CHOICE &RawChoiceWord) {
|
|
|
|
int BestChoiceLength = BestChoiceWord.length();
|
2010-11-24 02:34:14 +08:00
|
|
|
float adaptable_score =
|
|
|
|
getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
|
2009-07-11 10:17:36 +08:00
|
|
|
return ( // rules that apply in general - simplest to compute first
|
|
|
|
BestChoiceLength > 0 &&
|
2010-11-24 02:34:14 +08:00
|
|
|
BestChoiceLength == Word->NumBlobs() &&
|
2007-07-18 08:55:02 +08:00
|
|
|
BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE && (
|
2009-07-11 10:17:36 +08:00
|
|
|
(classify_enable_new_adapt_rules &&
|
2010-11-24 02:34:14 +08:00
|
|
|
getDict().CurrentBestChoiceAdjustFactor() <= adaptable_score &&
|
|
|
|
getDict().AlternativeChoicesWorseThan(adaptable_score) &&
|
2009-07-11 10:17:36 +08:00
|
|
|
getDict().CurrentBestChoiceIs(BestChoiceWord)) ||
|
|
|
|
(!classify_enable_new_adapt_rules && // old rules
|
|
|
|
BestChoiceLength == RawChoiceWord.length() &&
|
|
|
|
((getDict().valid_word_or_number(BestChoiceWord) &&
|
2010-11-24 02:34:14 +08:00
|
|
|
getDict().case_ok(BestChoiceWord, getDict().getUnicharset()))))));
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* @param Blob blob to add to templates for ClassId
|
|
|
|
* @param LineStats statistics about text line blob is in
|
|
|
|
* @param ClassId class to add blob to
|
|
|
|
* @param Threshold minimum match rating to existing template
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - AdaptedTemplates current set of adapted templates
|
|
|
|
* - AllProtosOn dummy mask to match against all protos
|
|
|
|
* - AllConfigsOn dummy mask to match against all configs
|
|
|
|
*
|
|
|
|
* @return none
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Thu Mar 14 09:36:03 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
void Classify::AdaptToChar(TBLOB *Blob,
|
|
|
|
CLASS_ID ClassId,
|
|
|
|
FLOAT32 Threshold) {
|
2007-03-08 04:03:40 +08:00
|
|
|
int NumFeatures;
|
|
|
|
INT_FEATURE_ARRAY IntFeatures;
|
|
|
|
INT_RESULT_STRUCT IntResult;
|
|
|
|
INT_CLASS IClass;
|
|
|
|
ADAPT_CLASS Class;
|
|
|
|
TEMP_CONFIG TempConfig;
|
|
|
|
FEATURE_SET FloatFeatures;
|
2007-05-16 09:18:59 +08:00
|
|
|
int NewTempConfigId;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
NumCharsAdaptedTo++;
|
|
|
|
if (!LegalClassId (ClassId))
|
|
|
|
return;
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
Class = AdaptedTemplates->Class[ClassId];
|
|
|
|
assert(Class != NULL);
|
|
|
|
if (IsEmptyAdaptedClass(Class)) {
|
2010-11-24 02:34:14 +08:00
|
|
|
InitAdaptedClass(Blob, ClassId, Class, AdaptedTemplates);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
IClass = ClassForClassId (AdaptedTemplates->Templates, ClassId);
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (NumFeatures <= 0)
|
|
|
|
return;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.SetBaseLineMatch();
|
|
|
|
im_.Match(IClass, AllProtosOn, AllConfigsOn,
|
|
|
|
NumFeatures, NumFeatures, IntFeatures, 0,
|
|
|
|
&IntResult, classify_adapt_feature_threshold,
|
|
|
|
NO_DEBUG, matcher_debug_separate_windows);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
SetAdaptiveThreshold(Threshold);
|
|
|
|
|
|
|
|
if (IntResult.Rating <= Threshold) {
|
|
|
|
if (ConfigIsPermanent (Class, IntResult.Config)) {
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 1)
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("Found good match to perm config %d = %4.1f%%.\n",
|
|
|
|
IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
|
|
|
|
FreeFeatureSet(FloatFeatures);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
TempConfig = TempConfigFor (Class, IntResult.Config);
|
|
|
|
IncreaseConfidence(TempConfig);
|
2010-11-24 02:34:14 +08:00
|
|
|
if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
|
|
|
|
Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
|
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 1)
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("Increasing reliability of temp config %d to %d.\n",
|
|
|
|
IntResult.Config, TempConfig->NumTimesSeen);
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (TempConfigReliable(ClassId, TempConfig)) {
|
|
|
|
MakePermanent(AdaptedTemplates, ClassId, IntResult.Config, Blob);
|
|
|
|
UpdateAmbigsGroup(ClassId, Blob);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
else {
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 1)
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("Found poor match to temp config %d = %4.1f%%.\n",
|
|
|
|
IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
|
2007-05-16 09:18:59 +08:00
|
|
|
NewTempConfigId = MakeNewTemporaryConfig(AdaptedTemplates,
|
|
|
|
ClassId,
|
|
|
|
NumFeatures,
|
|
|
|
IntFeatures,
|
|
|
|
FloatFeatures);
|
|
|
|
|
|
|
|
if (NewTempConfigId >= 0 &&
|
2010-11-24 02:34:14 +08:00
|
|
|
TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
|
|
|
|
MakePermanent(AdaptedTemplates, ClassId, NewTempConfigId, Blob);
|
|
|
|
UpdateAmbigsGroup(ClassId, Blob);
|
|
|
|
}
|
2007-05-16 09:18:59 +08:00
|
|
|
|
2009-06-04 01:24:08 +08:00
|
|
|
#ifndef GRAPHICS_DISABLED
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 1) {
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.Match(IClass, AllProtosOn, AllConfigsOn,
|
|
|
|
NumFeatures, NumFeatures, IntFeatures, 0,
|
|
|
|
&IntResult, classify_adapt_feature_threshold,
|
|
|
|
NO_DEBUG, matcher_debug_separate_windows);
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("Best match to temp config %d = %4.1f%%.\n",
|
|
|
|
IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
|
2009-07-11 10:17:36 +08:00
|
|
|
if (classify_learning_debug_level >= 2) {
|
2008-04-22 08:23:04 +08:00
|
|
|
uinT32 ConfigMask;
|
2007-03-08 04:03:40 +08:00
|
|
|
ConfigMask = 1 << IntResult.Config;
|
|
|
|
ShowMatchDisplay();
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.Match(IClass, AllProtosOn, (BIT_VECTOR)&ConfigMask,
|
|
|
|
NumFeatures, NumFeatures, IntFeatures, 0,
|
|
|
|
&IntResult, classify_adapt_feature_threshold,
|
|
|
|
6 | 0x19, matcher_debug_separate_windows);
|
2007-03-08 04:03:40 +08:00
|
|
|
UpdateMatchDisplay();
|
|
|
|
GetClassToDebug ("Adapting");
|
|
|
|
}
|
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
#endif
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
FreeFeatureSet(FloatFeatures);
|
|
|
|
}
|
|
|
|
} /* AdaptToChar */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* @param Blob blob to add to templates for ClassId
|
|
|
|
* @param LineStats statistics about text line blob is in
|
|
|
|
* @param ClassId class to add blob to
|
|
|
|
* @param Threshold minimum match rating to existing template
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - PreTrainedTemplates current set of built-in templates
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Thu Mar 14 09:36:03 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::AdaptToPunc(TBLOB *Blob,
|
|
|
|
CLASS_ID ClassId,
|
|
|
|
FLOAT32 Threshold) {
|
|
|
|
ADAPT_RESULTS *Results = new ADAPT_RESULTS();
|
2007-03-08 04:03:40 +08:00
|
|
|
int i;
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
Results->Initialize();
|
2010-11-24 02:34:14 +08:00
|
|
|
CharNormClassifier(Blob, PreTrainedTemplates, Results);
|
2009-07-11 10:17:36 +08:00
|
|
|
RemoveBadMatches(Results);
|
|
|
|
|
|
|
|
if (Results->NumMatches != 1) {
|
|
|
|
if (classify_learning_debug_level >= 1) {
|
2007-07-18 08:55:02 +08:00
|
|
|
cprintf ("Rejecting punc = %s (Alternatives = ",
|
|
|
|
unicharset.id_to_unichar(ClassId));
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
for (i = 0; i < Results->NumMatches; i++)
|
2010-11-24 02:34:14 +08:00
|
|
|
cprintf("%s", unicharset.id_to_unichar(Results->match[i].id));
|
|
|
|
cprintf(")\n");
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
} else {
|
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
if (classify_learning_debug_level >= 1)
|
|
|
|
cprintf ("Adapting to punc = %s, thr= %g\n",
|
|
|
|
unicharset.id_to_unichar(ClassId), Threshold);
|
|
|
|
#endif
|
2010-11-24 02:34:14 +08:00
|
|
|
AdaptToChar(Blob, ClassId, Threshold);
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
|
|
|
delete Results;
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* AdaptToPunc */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine adds the result of a classification into
|
|
|
|
* Results. If the new rating is much worse than the current
|
|
|
|
* best rating, it is not entered into results because it
|
|
|
|
* would end up being stripped later anyway. If the new rating
|
|
|
|
* is better than the old rating for the class, it replaces the
|
|
|
|
* old rating. If this is the first rating for the class, the
|
|
|
|
* class is added to the list of matched classes in Results.
|
|
|
|
* If the new rating is better than the best so far, it
|
|
|
|
* becomes the best so far.
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - #matcher_bad_match_pad defines limits of an acceptable match
|
|
|
|
*
|
2010-11-24 02:34:14 +08:00
|
|
|
* @param[out] results results to add new result to
|
|
|
|
* @param class_id class of new result
|
|
|
|
* @param rating rating of new result
|
|
|
|
* @param config_id config id of new result
|
|
|
|
* @param config2_id config id of 2nd choice result
|
2010-07-27 21:23:23 +08:00
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 18:19:29 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
void Classify::AddNewResult(ADAPT_RESULTS *results,
|
|
|
|
CLASS_ID class_id,
|
|
|
|
FLOAT32 rating,
|
|
|
|
int config_id,
|
|
|
|
int config2_id) {
|
|
|
|
ScoredClass *old_match = FindScoredUnichar(results, class_id);
|
|
|
|
ScoredClass match = {class_id, rating, config_id, config2_id};
|
|
|
|
|
|
|
|
if (rating > results->best_match.rating + matcher_bad_match_pad ||
|
|
|
|
(old_match && rating >= old_match->rating))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!unicharset.get_fragment(class_id))
|
|
|
|
results->HasNonfragment = true;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (class_id == NO_CLASS ||
|
|
|
|
!ClassForClassId(PreTrainedTemplates, class_id))
|
|
|
|
match.config = ~0;
|
|
|
|
|
|
|
|
if (old_match)
|
|
|
|
old_match->rating = rating;
|
|
|
|
else
|
|
|
|
results->match[results->NumMatches++] = match;
|
|
|
|
|
|
|
|
if (rating < results->best_match.rating &&
|
|
|
|
// Ensure that fragments do not affect best rating, class and config.
|
|
|
|
// This is needed so that at least one non-fragmented character is
|
|
|
|
// always present in the results.
|
|
|
|
// TODO(daria): verify that this helps accuracy and does not
|
|
|
|
// hurt performance.
|
|
|
|
!unicharset.get_fragment(class_id)) {
|
|
|
|
results->best_match = match;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
} /* AddNewResult */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine is identical to CharNormClassifier()
|
|
|
|
* except that it does no class pruning. It simply matches
|
|
|
|
* the unknown blob against the classes listed in
|
|
|
|
* Ambiguities.
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - #AllProtosOn mask that enables all protos
|
|
|
|
* - #AllConfigsOn mask that enables all configs
|
|
|
|
*
|
|
|
|
* @param Blob blob to be classified
|
|
|
|
* @param Templates built-in templates to classify against
|
|
|
|
* @param Ambiguities array of class id's to match against
|
|
|
|
* @param[out] Results place to put match results
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 19:40:36 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::AmbigClassifier(TBLOB *Blob,
|
|
|
|
INT_TEMPLATES Templates,
|
|
|
|
UNICHAR_ID *Ambiguities,
|
|
|
|
ADAPT_RESULTS *Results) {
|
2007-03-08 04:03:40 +08:00
|
|
|
int NumFeatures;
|
|
|
|
INT_FEATURE_ARRAY IntFeatures;
|
|
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray;
|
|
|
|
INT_RESULT_STRUCT IntResult;
|
|
|
|
CLASS_ID ClassId;
|
|
|
|
|
|
|
|
AmbigClassifierCalls++;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumFeatures = GetCharNormFeatures(Blob, Templates, IntFeatures, CharNormArray,
|
|
|
|
&(Results->BlobLength), NULL);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (NumFeatures <= 0)
|
|
|
|
return;
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (matcher_debug_level >= 2)
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("AM Matches = ");
|
|
|
|
|
2007-07-18 08:55:02 +08:00
|
|
|
while (*Ambiguities >= 0) {
|
2007-03-08 04:03:40 +08:00
|
|
|
ClassId = *Ambiguities;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
|
|
|
|
im_.Match(ClassForClassId(Templates, ClassId),
|
|
|
|
AllProtosOn, AllConfigsOn,
|
|
|
|
Results->BlobLength, NumFeatures, IntFeatures,
|
|
|
|
CharNormArray[ClassId], &IntResult,
|
|
|
|
classify_adapt_feature_threshold, NO_DEBUG,
|
|
|
|
matcher_debug_separate_windows);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (matcher_debug_level >= 2)
|
2007-07-18 08:55:02 +08:00
|
|
|
cprintf ("%s-%-2d %2.0f ", unicharset.id_to_unichar(ClassId),
|
|
|
|
IntResult.Config,
|
|
|
|
IntResult.Rating * 100.0);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
AddNewResult(Results, ClassId, IntResult.Rating,
|
|
|
|
IntResult.Config, IntResult.Config2);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
Ambiguities++;
|
|
|
|
|
|
|
|
NumAmbigClassesTried++;
|
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
if (matcher_debug_level >= 2)
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("\n");
|
|
|
|
|
|
|
|
} /* AmbigClassifier */
|
|
|
|
|
2008-02-01 08:07:59 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/// Factored-out calls to IntegerMatcher based on class pruner results.
|
|
|
|
/// Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::MasterMatcher(INT_TEMPLATES templates,
|
|
|
|
inT16 num_features,
|
|
|
|
INT_FEATURE_ARRAY features,
|
|
|
|
CLASS_NORMALIZATION_ARRAY norm_factors,
|
|
|
|
ADAPT_CLASS* classes,
|
|
|
|
int debug,
|
|
|
|
int num_classes,
|
2010-11-24 02:34:14 +08:00
|
|
|
const TBOX& blob_box,
|
2009-07-11 10:17:36 +08:00
|
|
|
CLASS_PRUNER_RESULTS results,
|
|
|
|
ADAPT_RESULTS* final_results) {
|
2010-11-24 02:34:14 +08:00
|
|
|
int top = blob_box.top();
|
|
|
|
int bottom = blob_box.bottom();
|
2008-02-01 08:07:59 +08:00
|
|
|
for (int c = 0; c < num_classes; c++) {
|
|
|
|
CLASS_ID class_id = results[c].Class;
|
|
|
|
INT_RESULT_STRUCT& int_result = results[c].IMResult;
|
2009-07-11 10:17:36 +08:00
|
|
|
BIT_VECTOR protos = classes != NULL ? classes[class_id]->PermProtos
|
2008-02-01 08:07:59 +08:00
|
|
|
: AllProtosOn;
|
2009-07-11 10:17:36 +08:00
|
|
|
BIT_VECTOR configs = classes != NULL ? classes[class_id]->PermConfigs
|
2008-02-01 08:07:59 +08:00
|
|
|
: AllConfigsOn;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.Match(ClassForClassId(templates, class_id),
|
|
|
|
protos, configs, final_results->BlobLength,
|
|
|
|
num_features, features, norm_factors[class_id],
|
|
|
|
&int_result, classify_adapt_feature_threshold, debug,
|
|
|
|
matcher_debug_separate_windows);
|
2008-02-01 08:07:59 +08:00
|
|
|
// Compute class feature corrections.
|
|
|
|
double miss_penalty = tessedit_class_miss_scale *
|
|
|
|
int_result.FeatureMisses;
|
2010-11-24 02:34:14 +08:00
|
|
|
if (matcher_debug_level >= 2 || classify_debug_level > 1) {
|
2008-02-01 08:07:59 +08:00
|
|
|
cprintf("%s-%-2d %2.1f(CP%2.1f, IM%2.1f + MP%2.1f) ",
|
|
|
|
unicharset.id_to_unichar(class_id), int_result.Config,
|
|
|
|
(int_result.Rating + miss_penalty) * 100.0,
|
|
|
|
results[c].Rating * 100.0,
|
|
|
|
int_result.Rating * 100.0, miss_penalty * 100.0);
|
|
|
|
if (c % 4 == 3)
|
|
|
|
cprintf ("\n");
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
// Penalize non-alnums for being vertical misfits.
|
|
|
|
if (!unicharset.get_isalpha(class_id) &&
|
|
|
|
!unicharset.get_isdigit(class_id) &&
|
|
|
|
norm_factors[class_id] != 0 && classify_misfit_junk_penalty > 0.0) {
|
|
|
|
int min_bottom, max_bottom, min_top, max_top;
|
|
|
|
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
|
|
|
|
&min_top, &max_top);
|
|
|
|
if (classify_debug_level > 1) {
|
|
|
|
tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
|
|
|
|
top, min_top, max_top, bottom, min_bottom, max_bottom);
|
|
|
|
}
|
|
|
|
if (top < min_top || top > max_top ||
|
|
|
|
bottom < min_bottom || bottom > max_bottom) {
|
|
|
|
miss_penalty += classify_misfit_junk_penalty;
|
|
|
|
}
|
|
|
|
}
|
2008-02-01 08:07:59 +08:00
|
|
|
int_result.Rating += miss_penalty;
|
|
|
|
if (int_result.Rating > WORST_POSSIBLE_RATING)
|
|
|
|
int_result.Rating = WORST_POSSIBLE_RATING;
|
2010-11-24 02:34:14 +08:00
|
|
|
AddNewResult(final_results, class_id, int_result.Rating,
|
|
|
|
int_result.Config, int_result.Config2);
|
2009-07-11 10:17:36 +08:00
|
|
|
// Add unichars ambiguous with class_id with the same rating as class_id.
|
|
|
|
if (use_definite_ambigs_for_classifier) {
|
|
|
|
const UnicharIdVector *definite_ambigs =
|
|
|
|
getDict().getUnicharAmbigs().OneToOneDefiniteAmbigs(class_id);
|
|
|
|
int ambigs_size = (definite_ambigs == NULL) ? 0 : definite_ambigs->size();
|
|
|
|
for (int ambig = 0; ambig < ambigs_size; ++ambig) {
|
|
|
|
UNICHAR_ID ambig_class_id = (*definite_ambigs)[ambig];
|
2010-11-24 02:34:14 +08:00
|
|
|
// Do not include ambig_class_id if it has permanent adapted templates.
|
|
|
|
if (classes[class_id]->NumPermConfigs > 0) continue;
|
|
|
|
ScoredClass* ambig_match =
|
|
|
|
FindScoredUnichar(final_results, ambig_class_id);
|
2009-07-11 10:17:36 +08:00
|
|
|
if (matcher_debug_level >= 3) {
|
|
|
|
tprintf("class: %d definite ambig: %d rating: old %.4f new %.4f\n",
|
|
|
|
class_id, ambig_class_id,
|
2010-11-24 02:34:14 +08:00
|
|
|
ambig_match ? ambig_match->rating : WORST_POSSIBLE_RATING,
|
|
|
|
int_result.Rating);
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (ambig_match) {
|
2009-07-11 10:17:36 +08:00
|
|
|
// ambig_class_id was already added to final_results,
|
|
|
|
// so just need to modify the rating.
|
2010-11-24 02:34:14 +08:00
|
|
|
if (int_result.Rating < ambig_match->rating) {
|
|
|
|
ambig_match->rating = int_result.Rating;
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
|
|
|
} else {
|
2010-11-24 02:34:14 +08:00
|
|
|
AddNewResult(final_results, ambig_class_id, int_result.Rating,
|
|
|
|
int_result.Config, int_result.Config2);
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2008-02-01 08:07:59 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
if (matcher_debug_level >= 2 || classify_debug_level > 1)
|
2008-02-01 08:07:59 +08:00
|
|
|
cprintf("\n");
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-07-27 21:23:23 +08:00
|
|
|
/**
|
|
|
|
* This routine extracts baseline normalized features
|
|
|
|
* from the unknown character and matches them against the
|
|
|
|
* specified set of templates. The classes which match
|
|
|
|
* are added to Results.
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - BaselineCutoffs expected num features for each class
|
|
|
|
*
|
|
|
|
* @param Blob blob to be classified
|
|
|
|
* @param Templates current set of adapted templates
|
|
|
|
* @param Results place to put match results
|
|
|
|
*
|
|
|
|
* @return Array of possible ambiguous chars that should be checked.
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 19:38:03 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob,
|
|
|
|
ADAPT_TEMPLATES Templates,
|
|
|
|
ADAPT_RESULTS *Results) {
|
2007-03-08 04:03:40 +08:00
|
|
|
int NumFeatures;
|
|
|
|
int NumClasses;
|
|
|
|
INT_FEATURE_ARRAY IntFeatures;
|
|
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray;
|
|
|
|
CLASS_ID ClassId;
|
|
|
|
|
|
|
|
BaselineClassifierCalls++;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumFeatures = GetBaselineFeatures(
|
|
|
|
Blob, Templates->Templates, IntFeatures, CharNormArray,
|
|
|
|
&(Results->BlobLength));
|
2007-03-08 04:03:40 +08:00
|
|
|
if (NumFeatures <= 0)
|
|
|
|
return NULL;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumClasses = ClassPruner(Templates->Templates, NumFeatures, IntFeatures,
|
|
|
|
CharNormArray, BaselineCutoffs, Results->CPResults);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
NumBaselineClassesTried += NumClasses;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (matcher_debug_level >= 2 || classify_debug_level > 1)
|
2007-03-08 04:03:40 +08:00
|
|
|
cprintf ("BL Matches = ");
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.SetBaseLineMatch();
|
2008-02-01 08:07:59 +08:00
|
|
|
MasterMatcher(Templates->Templates, NumFeatures, IntFeatures, CharNormArray,
|
2009-07-11 10:17:36 +08:00
|
|
|
Templates->Class, matcher_debug_flags, NumClasses,
|
2010-11-24 02:34:14 +08:00
|
|
|
Blob->bounding_box(), Results->CPResults, Results);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
ClassId = Results->best_match.id;
|
2007-03-08 04:03:40 +08:00
|
|
|
if (ClassId == NO_CLASS)
|
|
|
|
return (NULL);
|
|
|
|
/* this is a bug - maybe should return "" */
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
return Templates->Class[ClassId]->Config[Results->best_match.config].Perm;
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* BaselineClassifier */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine extracts character normalized features
|
|
|
|
* from the unknown character and matches them against the
|
|
|
|
* specified set of templates. The classes which match
|
|
|
|
* are added to Results.
|
|
|
|
*
|
|
|
|
* @param Blob blob to be classified
|
|
|
|
* @param Templates templates to classify unknown against
|
|
|
|
* @param Results place to put match results
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - CharNormCutoffs expected num features for each class
|
|
|
|
* - AllProtosOn mask that enables all protos
|
|
|
|
* - AllConfigsOn mask that enables all configs
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 16:02:52 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
int Classify::CharNormClassifier(TBLOB *Blob,
|
|
|
|
INT_TEMPLATES Templates,
|
|
|
|
ADAPT_RESULTS *Results) {
|
2007-03-08 04:03:40 +08:00
|
|
|
int NumFeatures;
|
|
|
|
int NumClasses;
|
|
|
|
INT_FEATURE_ARRAY IntFeatures;
|
|
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray;
|
|
|
|
|
|
|
|
CharNormClassifierCalls++;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumFeatures = GetCharNormFeatures(Blob, Templates, IntFeatures, CharNormArray,
|
|
|
|
&(Results->BlobLength), NULL);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (NumFeatures <= 0)
|
2009-07-11 10:17:36 +08:00
|
|
|
return 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumClasses = ClassPruner(Templates, NumFeatures, IntFeatures, CharNormArray,
|
|
|
|
CharNormCutoffs, Results->CPResults);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (tessedit_single_match && NumClasses > 1)
|
|
|
|
NumClasses = 1;
|
|
|
|
NumCharNormClassesTried += NumClasses;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
|
2008-02-01 08:07:59 +08:00
|
|
|
MasterMatcher(Templates, NumFeatures, IntFeatures, CharNormArray,
|
2009-07-11 10:17:36 +08:00
|
|
|
NULL, matcher_debug_flags, NumClasses,
|
2010-11-24 02:34:14 +08:00
|
|
|
Blob->bounding_box(), Results->CPResults, Results);
|
2009-07-11 10:17:36 +08:00
|
|
|
return NumFeatures;
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* CharNormClassifier */
|
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine computes a rating which reflects the
|
|
|
|
* likelihood that the blob being classified is a noise
|
|
|
|
* blob. NOTE: assumes that the blob length has already been
|
|
|
|
* computed and placed into Results.
|
|
|
|
*
|
|
|
|
* @param Results results to add noise classification to
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - matcher_avg_noise_size avg. length of a noise blob
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 18:36:52 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::ClassifyAsNoise(ADAPT_RESULTS *Results) {
|
2007-03-08 04:03:40 +08:00
|
|
|
register FLOAT32 Rating;
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
Rating = Results->BlobLength / matcher_avg_noise_size;
|
2007-03-08 04:03:40 +08:00
|
|
|
Rating *= Rating;
|
|
|
|
Rating /= 1.0 + Rating;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
AddNewResult(Results, NO_CLASS, Rating, -1, -1);
|
2007-03-08 04:03:40 +08:00
|
|
|
} /* ClassifyAsNoise */
|
2010-08-11 03:20:11 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-11-24 02:34:14 +08:00
|
|
|
// Return a pointer to the scored unichar in results, or NULL if not present.
|
|
|
|
ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) {
|
|
|
|
for (int i = 0; i < results->NumMatches; i++) {
|
|
|
|
if (results->match[i].id == id)
|
|
|
|
return &results->match[i];
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Retrieve the current rating for a unichar id if we have rated it, defaulting
|
|
|
|
// to WORST_POSSIBLE_RATING.
|
|
|
|
ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) {
|
|
|
|
ScoredClass poor_result = {id, WORST_POSSIBLE_RATING, -1, -1};
|
|
|
|
ScoredClass *entry = FindScoredUnichar(results, id);
|
|
|
|
return (entry == NULL) ? poor_result : *entry;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compare character classes by rating as for qsort(3).
|
|
|
|
// For repeatability, use character class id as a tie-breaker.
|
|
|
|
int CompareByRating(const void *arg1, // ScoredClass *class1
|
|
|
|
const void *arg2) { // ScoredClass *class2
|
|
|
|
const ScoredClass *class1 = (const ScoredClass *)arg1;
|
|
|
|
const ScoredClass *class2 = (const ScoredClass *)arg2;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (class1->rating < class2->rating)
|
|
|
|
return -1;
|
|
|
|
else if (class1->rating > class2->rating)
|
|
|
|
return 1;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (class1->id < class2->id)
|
|
|
|
return -1;
|
|
|
|
else if (class1->id > class2->id)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2009-07-11 10:17:36 +08:00
|
|
|
namespace tesseract {
|
2010-07-27 21:23:23 +08:00
|
|
|
/// The function converts the given match ratings to the list of blob
|
|
|
|
/// choices with ratings and certainties (used by the context checkers).
|
|
|
|
/// If character fragments are present in the results, this function also makes
|
|
|
|
/// sure that there is at least one non-fragmented classification included.
|
|
|
|
/// For each classification result check the unicharset for "definite"
|
|
|
|
/// ambiguities and modify the resulting Choices accordingly.
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::ConvertMatchesToChoices(ADAPT_RESULTS *Results,
|
|
|
|
BLOB_CHOICE_LIST *Choices) {
|
|
|
|
assert(Choices != NULL);
|
2007-03-08 04:03:40 +08:00
|
|
|
FLOAT32 Rating;
|
|
|
|
FLOAT32 Certainty;
|
2009-07-11 10:17:36 +08:00
|
|
|
BLOB_CHOICE_IT temp_it;
|
|
|
|
bool contains_nonfrag = false;
|
|
|
|
temp_it.set_to_list(Choices);
|
|
|
|
int choices_length = 0;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
|
|
for (int i = 0; i < Results->NumMatches; i++) {
|
|
|
|
ScoredClass next = Results->match[i];
|
|
|
|
bool current_is_frag = (unicharset.get_fragment(next.id) != NULL);
|
2009-07-11 10:17:36 +08:00
|
|
|
if (temp_it.length()+1 == MAX_MATCHES &&
|
|
|
|
!contains_nonfrag && current_is_frag) {
|
|
|
|
continue; // look for a non-fragmented character to fill the
|
|
|
|
// last spot in Choices if only fragments are present
|
|
|
|
}
|
|
|
|
// BlobLength can never be legally 0, this means recognition failed.
|
|
|
|
// But we must return a classification result because some invoking
|
|
|
|
// functions (chopper/permuter) do not anticipate a null blob choice.
|
|
|
|
// So we need to assign a poor, but not infinitely bad score.
|
|
|
|
if (Results->BlobLength == 0) {
|
|
|
|
Certainty = -20;
|
|
|
|
Rating = 100; // should be -certainty * real_blob_length
|
|
|
|
} else {
|
2010-11-24 02:34:14 +08:00
|
|
|
Rating = Certainty = next.rating;
|
2009-07-11 10:17:36 +08:00
|
|
|
Rating *= rating_scale * Results->BlobLength;
|
2010-11-24 02:34:14 +08:00
|
|
|
Certainty *= -(getDict().certainty_scale);
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
temp_it.add_to_end(new BLOB_CHOICE(next.id, Rating, Certainty,
|
|
|
|
next.config, next.config2,
|
|
|
|
unicharset.get_script(next.id)));
|
2009-07-11 10:17:36 +08:00
|
|
|
contains_nonfrag |= !current_is_frag; // update contains_nonfrag
|
|
|
|
choices_length++;
|
|
|
|
if (choices_length >= MAX_MATCHES) break;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
Results->NumMatches = choices_length;
|
|
|
|
} // ConvertMatchesToChoices
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
* @param Blob blob whose classification is being debugged
|
|
|
|
* @param Results results of match being debugged
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Wed Mar 13 16:44:41 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::DebugAdaptiveClassifier(TBLOB *Blob,
|
|
|
|
ADAPT_RESULTS *Results) {
|
2007-03-08 04:03:40 +08:00
|
|
|
const char *Prompt =
|
2008-04-22 08:23:04 +08:00
|
|
|
"Left-click in IntegerMatch Window to continue or right click to debug...";
|
2007-03-08 04:03:40 +08:00
|
|
|
const char *DebugMode = "All Templates";
|
2010-11-24 02:34:14 +08:00
|
|
|
CLASS_ID LastClass = Results->best_match.id;
|
2007-03-08 04:03:40 +08:00
|
|
|
CLASS_ID ClassId;
|
|
|
|
BOOL8 AdaptiveOn = TRUE;
|
|
|
|
BOOL8 PreTrainedOn = TRUE;
|
|
|
|
|
|
|
|
ShowMatchDisplay();
|
2007-07-18 08:55:02 +08:00
|
|
|
cprintf ("\nDebugging class = %s (%s) ...\n",
|
|
|
|
unicharset.id_to_unichar(LastClass), DebugMode);
|
2010-11-24 02:34:14 +08:00
|
|
|
ShowBestMatchFor(Blob, LastClass, AdaptiveOn, PreTrainedOn);
|
2007-03-08 04:03:40 +08:00
|
|
|
UpdateMatchDisplay();
|
|
|
|
|
|
|
|
while ((ClassId = GetClassToDebug (Prompt)) != 0) {
|
2007-07-18 08:55:02 +08:00
|
|
|
#if 0
|
2007-03-08 04:03:40 +08:00
|
|
|
switch (ClassId) {
|
|
|
|
case 'b':
|
|
|
|
AdaptiveOn = TRUE;
|
|
|
|
PreTrainedOn = FALSE;
|
|
|
|
DebugMode = "Adaptive Templates Only";
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'c':
|
|
|
|
AdaptiveOn = FALSE;
|
|
|
|
PreTrainedOn = TRUE;
|
|
|
|
DebugMode = "PreTrained Templates Only";
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 'a':
|
|
|
|
AdaptiveOn = TRUE;
|
|
|
|
PreTrainedOn = TRUE;
|
|
|
|
DebugMode = "All Templates";
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
LastClass = ClassId;
|
|
|
|
break;
|
|
|
|
}
|
2007-07-18 08:55:02 +08:00
|
|
|
#endif
|
|
|
|
LastClass = ClassId;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
ShowMatchDisplay();
|
2008-04-22 08:23:04 +08:00
|
|
|
cprintf ("\nDebugging class = %d = %s (%s) ...\n",
|
|
|
|
LastClass, unicharset.id_to_unichar(LastClass), DebugMode);
|
2010-11-24 02:34:14 +08:00
|
|
|
ShowBestMatchFor(Blob, LastClass, AdaptiveOn, PreTrainedOn);
|
2007-03-08 04:03:40 +08:00
|
|
|
UpdateMatchDisplay();
|
|
|
|
}
|
|
|
|
} /* DebugAdaptiveClassifier */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine performs an adaptive classification.
|
|
|
|
* If we have not yet adapted to enough classes, a simple
|
|
|
|
* classification to the pre-trained templates is performed.
|
|
|
|
* Otherwise, we match the blob against the adapted templates.
|
|
|
|
* If the adapted templates do not match well, we try a
|
|
|
|
* match against the pre-trained templates. If an adapted
|
|
|
|
* template match is found, we do a match to any pre-trained
|
|
|
|
* templates which could be ambiguous. The results from all
|
|
|
|
* of these classifications are merged together into Results.
|
|
|
|
*
|
|
|
|
* @param Blob blob to be classified
|
|
|
|
* @param Results place to put match results
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - PreTrainedTemplates built-in training templates
|
|
|
|
* - AdaptedTemplates templates adapted for this page
|
|
|
|
* - matcher_great_threshold rating limit for a great match
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 08:50:11 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::DoAdaptiveMatch(TBLOB *Blob,
|
2010-11-24 02:34:14 +08:00
|
|
|
ADAPT_RESULTS *Results) {
|
2007-07-18 08:55:02 +08:00
|
|
|
UNICHAR_ID *Ambiguities;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
AdaptiveMatcherCalls++;
|
|
|
|
InitIntFX();
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min ||
|
|
|
|
tess_cn_matching) {
|
|
|
|
CharNormClassifier(Blob, PreTrainedTemplates, Results);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
else {
|
2010-11-24 02:34:14 +08:00
|
|
|
Ambiguities = BaselineClassifier(Blob, AdaptedTemplates, Results);
|
|
|
|
if ((Results->NumMatches > 0 &&
|
|
|
|
MarginalMatch (Results->best_match.rating) &&
|
|
|
|
!tess_bn_matching) ||
|
|
|
|
Results->NumMatches == 0) {
|
|
|
|
CharNormClassifier(Blob, PreTrainedTemplates, Results);
|
2009-07-11 10:17:36 +08:00
|
|
|
} else if (Ambiguities && *Ambiguities >= 0) {
|
2007-03-08 04:03:40 +08:00
|
|
|
AmbigClassifier(Blob,
|
|
|
|
PreTrainedTemplates,
|
|
|
|
Ambiguities,
|
|
|
|
Results);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
// Force the blob to be classified as noise
|
|
|
|
// if the results contain only fragments.
|
|
|
|
// TODO(daria): verify that this is better than
|
|
|
|
// just adding a NULL classificaiton.
|
|
|
|
if (!Results->HasNonfragment) {
|
|
|
|
Results->NumMatches = 0;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
if (Results->NumMatches == 0)
|
2009-07-11 10:17:36 +08:00
|
|
|
ClassifyAsNoise(Results);
|
|
|
|
} /* DoAdaptiveMatch */
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine tries to estimate how tight the adaptation
|
|
|
|
* threshold should be set for each character in the current
|
|
|
|
* word. In general, the routine tries to set tighter
|
|
|
|
* thresholds for a character when the current set of templates
|
|
|
|
* would have made an error on that character. It tries
|
|
|
|
* to set a threshold tight enough to eliminate the error.
|
|
|
|
* Two different sets of rules can be used to determine the
|
|
|
|
* desired thresholds.
|
|
|
|
*
|
|
|
|
* @param Word current word
|
|
|
|
* @param BestChoice best choice for current word with context
|
|
|
|
* @param BestRawChoice best choice for current word without context
|
|
|
|
* @param[out] Thresholds array of thresholds to be filled in
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - classify_enable_new_adapt_rules
|
|
|
|
* - matcher_good_threshold
|
|
|
|
* - matcher_perfect_threshold
|
|
|
|
* - matcher_rating_margin
|
|
|
|
*
|
|
|
|
* @return none (results are returned in Thresholds)
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Fri May 31 09:22:08 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
void Classify::GetAdaptThresholds(TWERD * Word,
|
|
|
|
const WERD_CHOICE& BestChoice,
|
|
|
|
const WERD_CHOICE& BestRawChoice,
|
|
|
|
FLOAT32 Thresholds[]) {
|
2009-07-11 10:17:36 +08:00
|
|
|
TBLOB *Blob;
|
|
|
|
const char* BestChoice_string = BestChoice.unichar_string().string();
|
|
|
|
const char* BestChoice_lengths = BestChoice.unichar_lengths().string();
|
|
|
|
const char* BestRawChoice_string = BestRawChoice.unichar_string().string();
|
|
|
|
const char* BestRawChoice_lengths = BestRawChoice.unichar_lengths().string();
|
|
|
|
|
|
|
|
if (classify_enable_new_adapt_rules && /* new rules */
|
|
|
|
getDict().CurrentBestChoiceIs(BestChoice)) {
|
|
|
|
getDict().FindClassifierErrors(matcher_perfect_threshold,
|
|
|
|
matcher_good_threshold,
|
|
|
|
matcher_rating_margin,
|
|
|
|
Thresholds);
|
2010-11-24 02:34:14 +08:00
|
|
|
} else { /* old rules */
|
2009-07-11 10:17:36 +08:00
|
|
|
for (Blob = Word->blobs;
|
|
|
|
Blob != NULL;
|
|
|
|
Blob = Blob->next, BestChoice_string += *(BestChoice_lengths++),
|
|
|
|
BestRawChoice_string += *(BestRawChoice_lengths++), Thresholds++)
|
|
|
|
if (*(BestChoice_lengths) == *(BestRawChoice_lengths) &&
|
|
|
|
strncmp(BestChoice_string, BestRawChoice_string,
|
|
|
|
*(BestChoice_lengths)) == 0)
|
|
|
|
*Thresholds = matcher_good_threshold;
|
2007-03-08 04:03:40 +08:00
|
|
|
else {
|
|
|
|
/* the blob was incorrectly classified - find the rating threshold
|
|
|
|
needed to create a template which will correct the error with
|
|
|
|
some margin. However, don't waste time trying to make
|
|
|
|
templates which are too tight. */
|
2010-11-24 02:34:14 +08:00
|
|
|
*Thresholds = GetBestRatingFor(
|
|
|
|
Blob, unicharset.unichar_to_id(BestChoice_string,
|
|
|
|
*BestChoice_lengths));
|
2009-07-11 10:17:36 +08:00
|
|
|
*Thresholds *= (1.0 - matcher_rating_margin);
|
2010-11-24 02:34:14 +08:00
|
|
|
*Thresholds = ClipToRange<double>(
|
|
|
|
*Thresholds, matcher_perfect_threshold, matcher_good_threshold);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
|
|
|
} /* GetAdaptThresholds */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine matches blob to the built-in templates
|
|
|
|
* to find out if there are any classes other than the correct
|
|
|
|
* class which are potential ambiguities.
|
|
|
|
*
|
|
|
|
* @param Blob blob to get classification ambiguities for
|
|
|
|
* @param CorrectClass correct class for Blob
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - CurrentRatings used by qsort compare routine
|
|
|
|
* - PreTrainedTemplates built-in templates
|
|
|
|
*
|
|
|
|
* @return String containing all possible ambiguous classes.
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Fri Mar 15 08:08:22 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob,
|
|
|
|
CLASS_ID CorrectClass) {
|
|
|
|
ADAPT_RESULTS *Results = new ADAPT_RESULTS();
|
|
|
|
UNICHAR_ID *Ambiguities;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
Results->Initialize();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
CharNormClassifier(Blob, PreTrainedTemplates, Results);
|
2009-07-11 10:17:36 +08:00
|
|
|
RemoveBadMatches(Results);
|
2010-11-24 02:34:14 +08:00
|
|
|
qsort((void *)Results->match, Results->NumMatches,
|
|
|
|
sizeof(ScoredClass), CompareByRating);
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
/* copy the class id's into an string of ambiguities - don't copy if
|
|
|
|
the correct class is the only class id matched */
|
|
|
|
Ambiguities = (UNICHAR_ID *) Emalloc (sizeof (UNICHAR_ID) *
|
|
|
|
(Results->NumMatches + 1));
|
|
|
|
if (Results->NumMatches > 1 ||
|
2010-11-24 02:34:14 +08:00
|
|
|
(Results->NumMatches == 1 && Results->match[0].id != CorrectClass)) {
|
2009-07-11 10:17:36 +08:00
|
|
|
for (i = 0; i < Results->NumMatches; i++)
|
2010-11-24 02:34:14 +08:00
|
|
|
Ambiguities[i] = Results->match[i].id;
|
2009-07-11 10:17:36 +08:00
|
|
|
Ambiguities[i] = -1;
|
2010-11-24 02:34:14 +08:00
|
|
|
} else {
|
2009-07-11 10:17:36 +08:00
|
|
|
Ambiguities[0] = -1;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
delete Results;
|
2010-11-24 02:34:14 +08:00
|
|
|
return Ambiguities;
|
2009-07-11 10:17:36 +08:00
|
|
|
} /* GetAmbiguities */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine sets up the feature extractor to extract
|
|
|
|
* baseline normalized pico-features.
|
|
|
|
*
|
|
|
|
* The extracted pico-features are converted
|
|
|
|
* to integer form and placed in IntFeatures. CharNormArray
|
|
|
|
* is filled with 0's to indicate to the matcher that no
|
|
|
|
* character normalization adjustment needs to be done.
|
|
|
|
*
|
|
|
|
* The total length of all blob outlines
|
|
|
|
* in baseline normalized units is also returned.
|
|
|
|
*
|
|
|
|
* @param Blob blob to extract features from
|
|
|
|
* @param Templates used to compute char norm adjustments
|
|
|
|
* @param IntFeatures array to fill with integer features
|
|
|
|
* @param CharNormArray array to fill with dummy char norm adjustments
|
|
|
|
* @param BlobLength length of blob in baseline-normalized units
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @return Number of pico-features returned (0 if an error occurred)
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 17:55:18 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
int Classify::GetBaselineFeatures(TBLOB *Blob,
|
|
|
|
INT_TEMPLATES Templates,
|
|
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
|
|
inT32 *BlobLength) {
|
2009-07-11 10:17:36 +08:00
|
|
|
FEATURE_SET Features;
|
|
|
|
int NumFeatures;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
if (classify_enable_int_fx) {
|
|
|
|
return GetIntBaselineFeatures(Blob, Templates,
|
|
|
|
IntFeatures, CharNormArray, BlobLength);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
classify_norm_method.set_value(baseline);
|
2010-11-24 02:34:14 +08:00
|
|
|
Features = ExtractPicoFeatures(Blob);
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
NumFeatures = Features->NumFeatures;
|
|
|
|
*BlobLength = NumFeatures;
|
|
|
|
if (NumFeatures > UNLIKELY_NUM_FEAT) {
|
2007-03-08 04:03:40 +08:00
|
|
|
FreeFeatureSet(Features);
|
2009-07-11 10:17:36 +08:00
|
|
|
return (0);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
ComputeIntFeatures(Features, IntFeatures);
|
|
|
|
ClearCharNormArray(Templates, CharNormArray);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
FreeFeatureSet(Features);
|
|
|
|
return NumFeatures;
|
|
|
|
} /* GetBaselineFeatures */
|
|
|
|
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine classifies Blob against both sets of
|
|
|
|
* templates for the specified class and returns the best
|
|
|
|
* rating found.
|
|
|
|
*
|
|
|
|
* @param Blob blob to get best rating for
|
|
|
|
* @param ClassId class blob is to be compared to
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - PreTrainedTemplates built-in templates
|
|
|
|
* - AdaptedTemplates current set of adapted templates
|
|
|
|
* - AllProtosOn dummy mask to enable all protos
|
|
|
|
* - AllConfigsOn dummy mask to enable all configs
|
|
|
|
*
|
|
|
|
* @return Best rating for match of Blob to ClassId.
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Apr 9 09:01:24 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
FLOAT32 Classify::GetBestRatingFor(TBLOB *Blob,
|
|
|
|
CLASS_ID ClassId) {
|
|
|
|
int NumCNFeatures, NumBLFeatures;
|
|
|
|
INT_FEATURE_ARRAY CNFeatures, BLFeatures;
|
|
|
|
INT_RESULT_STRUCT CNResult, BLResult;
|
|
|
|
inT32 BlobLength;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
CNResult.Rating = BLResult.Rating = 1.0;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (!LegalClassId(ClassId))
|
2010-11-24 02:34:14 +08:00
|
|
|
return 1.0;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES];
|
|
|
|
uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES];
|
|
|
|
|
|
|
|
if (!UnusedClassIdIn(PreTrainedTemplates, ClassId)) {
|
2010-11-24 02:34:14 +08:00
|
|
|
NumCNFeatures = GetCharNormFeatures(Blob, PreTrainedTemplates,
|
|
|
|
CNFeatures, CNAdjust, &BlobLength,
|
|
|
|
NULL);
|
2009-07-11 10:17:36 +08:00
|
|
|
if (NumCNFeatures > 0) {
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.SetCharNormMatch(classify_integer_matcher_multiplier);
|
|
|
|
im_.Match(ClassForClassId(PreTrainedTemplates, ClassId),
|
|
|
|
AllProtosOn, AllConfigsOn,
|
|
|
|
BlobLength, NumCNFeatures, CNFeatures,
|
|
|
|
CNAdjust[ClassId], &CNResult,
|
|
|
|
classify_adapt_feature_threshold, NO_DEBUG,
|
|
|
|
matcher_debug_separate_windows);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (!UnusedClassIdIn(AdaptedTemplates->Templates, ClassId)) {
|
2010-11-24 02:34:14 +08:00
|
|
|
NumBLFeatures = GetBaselineFeatures(Blob,
|
2009-07-11 10:17:36 +08:00
|
|
|
AdaptedTemplates->Templates,
|
|
|
|
BLFeatures, BLAdjust, &BlobLength);
|
|
|
|
if (NumBLFeatures > 0) {
|
2010-11-24 02:34:14 +08:00
|
|
|
im_.SetBaseLineMatch();
|
|
|
|
im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
|
|
|
|
AdaptedTemplates->Class[ClassId]->PermProtos,
|
|
|
|
AdaptedTemplates->Class[ClassId]->PermConfigs,
|
|
|
|
BlobLength, NumBLFeatures, BLFeatures,
|
|
|
|
BLAdjust[ClassId], &BLResult,
|
|
|
|
classify_adapt_feature_threshold, NO_DEBUG,
|
|
|
|
matcher_debug_separate_windows);
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
// Clean up.
|
|
|
|
delete[] CNAdjust;
|
|
|
|
delete[] BLAdjust;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
return MIN(BLResult.Rating, CNResult.Rating);
|
2009-07-11 10:17:36 +08:00
|
|
|
} /* GetBestRatingFor */
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine sets up the feature extractor to extract
|
|
|
|
* character normalization features and character normalized
|
|
|
|
* pico-features. The extracted pico-features are converted
|
|
|
|
* to integer form and placed in IntFeatures. The character
|
|
|
|
* normalization features are matched to each class in
|
|
|
|
* templates and the resulting adjustment factors are returned
|
|
|
|
* in CharNormArray. The total length of all blob outlines
|
|
|
|
* in baseline normalized units is also returned.
|
|
|
|
*
|
|
|
|
* @param Blob blob to extract features from
|
|
|
|
* @param Templates used to compute char norm adjustments
|
|
|
|
* @param IntFeatures array to fill with integer features
|
|
|
|
* @param CharNormArray array to fill with char norm adjustments
|
|
|
|
* @param BlobLength length of blob in baseline-normalized units
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @return Number of pico-features returned (0 if an error occurred)
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 17:55:18 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
int Classify::GetCharNormFeatures(TBLOB *Blob,
|
|
|
|
INT_TEMPLATES Templates,
|
|
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
2010-11-24 02:34:14 +08:00
|
|
|
inT32 *BlobLength,
|
|
|
|
inT32 *FeatureOutlineIndex) {
|
|
|
|
return GetIntCharNormFeatures(Blob, Templates, IntFeatures, CharNormArray,
|
|
|
|
BlobLength, FeatureOutlineIndex);
|
2009-07-11 10:17:36 +08:00
|
|
|
} /* GetCharNormFeatures */
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine calls the integer (Hardware) feature
|
|
|
|
* extractor if it has not been called before for this blob.
|
|
|
|
* The results from the feature extractor are placed into
|
|
|
|
* globals so that they can be used in other routines without
|
|
|
|
* re-extracting the features.
|
|
|
|
* It then copies the baseline features into the IntFeatures
|
|
|
|
* array provided by the caller.
|
|
|
|
*
|
|
|
|
* @param Blob blob to extract features from
|
|
|
|
* @param Templates used to compute char norm adjustments
|
|
|
|
* @param IntFeatures array to fill with integer features
|
|
|
|
* @param CharNormArray array to fill with dummy char norm adjustments
|
|
|
|
* @param BlobLength length of blob in baseline-normalized units
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - FeaturesHaveBeenExtracted TRUE if fx has been done
|
|
|
|
* - BaselineFeatures holds extracted baseline feat
|
|
|
|
* - CharNormFeatures holds extracted char norm feat
|
|
|
|
* - FXInfo holds misc. FX info
|
|
|
|
*
|
|
|
|
* @return Number of features extracted or 0 if an error occured.
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue May 28 10:40:52 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
int Classify::GetIntBaselineFeatures(TBLOB *Blob,
|
|
|
|
INT_TEMPLATES Templates,
|
|
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
|
|
|
inT32 *BlobLength) {
|
2009-07-11 10:17:36 +08:00
|
|
|
register INT_FEATURE Src, Dest, End;
|
|
|
|
|
|
|
|
if (!FeaturesHaveBeenExtracted) {
|
2010-11-24 02:34:14 +08:00
|
|
|
FeaturesOK = ExtractIntFeat(Blob, denorm_, BaselineFeatures,
|
|
|
|
CharNormFeatures, &FXInfo);
|
2009-07-11 10:17:36 +08:00
|
|
|
FeaturesHaveBeenExtracted = TRUE;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (!FeaturesOK) {
|
2007-07-18 08:55:02 +08:00
|
|
|
*BlobLength = FXInfo.NumBL;
|
2010-11-24 02:34:14 +08:00
|
|
|
return 0;
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
for (Src = BaselineFeatures, End = Src + FXInfo.NumBL, Dest = IntFeatures;
|
|
|
|
Src < End;
|
|
|
|
*Dest++ = *Src++);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
ClearCharNormArray(Templates, CharNormArray);
|
|
|
|
*BlobLength = FXInfo.NumBL;
|
2010-11-24 02:34:14 +08:00
|
|
|
return FXInfo.NumBL;
|
2009-07-11 10:17:36 +08:00
|
|
|
} /* GetIntBaselineFeatures */
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
/**
 * Clears the FeaturesHaveBeenExtracted flag, so that the next call to
 * GetIntBaselineFeatures() or GetIntCharNormFeatures() runs the feature
 * extractor again instead of reusing the cached features.
 */
void Classify::ResetFeaturesHaveBeenExtracted() {
  FeaturesHaveBeenExtracted = FALSE;
}
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine calls the integer (Hardware) feature
|
|
|
|
* extractor if it has not been called before for this blob.
|
|
|
|
*
|
|
|
|
* The results from the feature extractor are placed into
|
|
|
|
* globals so that they can be used in other routines without
|
|
|
|
* re-extracting the features.
|
|
|
|
*
|
|
|
|
* It then copies the char norm features into the IntFeatures
|
|
|
|
* array provided by the caller.
|
|
|
|
*
|
|
|
|
* @param Blob blob to extract features from
|
|
|
|
* @param Templates used to compute char norm adjustments
|
|
|
|
* @param IntFeatures array to fill with integer features
|
|
|
|
* @param CharNormArray array to fill with dummy char norm adjustments
|
|
|
|
* @param BlobLength length of blob in baseline-normalized units
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - FeaturesHaveBeenExtracted TRUE if fx has been done
|
|
|
|
* - BaselineFeatures holds extracted baseline feat
|
|
|
|
* - CharNormFeatures holds extracted char norm feat
|
|
|
|
* - FXInfo holds misc. FX info
|
|
|
|
*
|
|
|
|
* @return Number of features extracted or 0 if an error occured.
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue May 28 10:40:52 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
int Classify::GetIntCharNormFeatures(TBLOB *Blob,
|
|
|
|
INT_TEMPLATES Templates,
|
|
|
|
INT_FEATURE_ARRAY IntFeatures,
|
|
|
|
CLASS_NORMALIZATION_ARRAY CharNormArray,
|
2010-11-24 02:34:14 +08:00
|
|
|
inT32 *BlobLength,
|
|
|
|
inT32 *FeatureOutlineArray) {
|
2009-07-11 10:17:36 +08:00
|
|
|
register INT_FEATURE Src, Dest, End;
|
|
|
|
FEATURE NormFeature;
|
|
|
|
FLOAT32 Baseline, Scale;
|
2010-11-24 02:34:14 +08:00
|
|
|
inT32 FeatureOutlineIndex[MAX_NUM_INT_FEATURES];
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
if (!FeaturesHaveBeenExtracted) {
|
2010-11-24 02:34:14 +08:00
|
|
|
FeaturesOK = ExtractIntFeat(Blob, denorm_, BaselineFeatures,
|
|
|
|
CharNormFeatures, &FXInfo,
|
|
|
|
FeatureOutlineIndex);
|
2009-07-11 10:17:36 +08:00
|
|
|
FeaturesHaveBeenExtracted = TRUE;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (!FeaturesOK) {
|
2007-07-18 08:55:02 +08:00
|
|
|
*BlobLength = FXInfo.NumBL;
|
2009-07-11 10:17:36 +08:00
|
|
|
return (0);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
for (Src = CharNormFeatures, End = Src + FXInfo.NumCN, Dest = IntFeatures;
|
|
|
|
Src < End;
|
|
|
|
*Dest++ = *Src++);
|
2010-11-24 02:34:14 +08:00
|
|
|
for (int i = 0; FeatureOutlineArray && i < FXInfo.NumCN; ++i) {
|
|
|
|
FeatureOutlineArray[i] = FeatureOutlineIndex[i];
|
|
|
|
}
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
NormFeature = NewFeature(&CharNormDesc);
|
2010-11-24 02:34:14 +08:00
|
|
|
Baseline = BASELINE_OFFSET;
|
|
|
|
Scale = MF_SCALE_FACTOR;
|
2009-07-11 10:17:36 +08:00
|
|
|
NormFeature->Params[CharNormY] = (FXInfo.Ymean - Baseline) * Scale;
|
|
|
|
NormFeature->Params[CharNormLength] =
|
|
|
|
FXInfo.Length * Scale / LENGTH_COMPRESSION;
|
|
|
|
NormFeature->Params[CharNormRx] = FXInfo.Rx * Scale;
|
|
|
|
NormFeature->Params[CharNormRy] = FXInfo.Ry * Scale;
|
|
|
|
ComputeIntCharNormArray(NormFeature, Templates, CharNormArray);
|
|
|
|
FreeFeature(NormFeature);
|
|
|
|
|
|
|
|
*BlobLength = FXInfo.NumBL;
|
|
|
|
return (FXInfo.NumCN);
|
|
|
|
} /* GetIntCharNormFeatures */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
*
|
|
|
|
* @param Templates adapted templates to add new config to
|
|
|
|
* @param ClassId class id to associate with new config
|
|
|
|
* @param NumFeatures number of features in IntFeatures
|
|
|
|
* @param Features features describing model for new config
|
|
|
|
* @param FloatFeatures floating-pt representation of features
|
|
|
|
*
|
|
|
|
* @return The id of the new config created, a negative integer in
|
|
|
|
* case of error.
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Fri Mar 15 08:49:46 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
|
2010-11-24 02:34:14 +08:00
|
|
|
CLASS_ID ClassId,
|
|
|
|
int NumFeatures,
|
|
|
|
INT_FEATURE_ARRAY Features,
|
|
|
|
FEATURE_SET FloatFeatures) {
|
2009-07-11 10:17:36 +08:00
|
|
|
INT_CLASS IClass;
|
|
|
|
ADAPT_CLASS Class;
|
|
|
|
PROTO_ID OldProtos[MAX_NUM_PROTOS];
|
|
|
|
FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
|
|
|
|
int NumOldProtos;
|
|
|
|
int NumBadFeatures;
|
|
|
|
int MaxProtoId, OldMaxProtoId;
|
|
|
|
int BlobLength = 0;
|
|
|
|
int MaskSize;
|
|
|
|
int ConfigId;
|
|
|
|
TEMP_CONFIG Config;
|
|
|
|
int i;
|
|
|
|
int debug_level = NO_DEBUG;
|
|
|
|
|
|
|
|
if (classify_learning_debug_level >= 3)
|
|
|
|
debug_level =
|
2007-03-08 04:03:40 +08:00
|
|
|
PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
IClass = ClassForClassId(Templates->Templates, ClassId);
|
|
|
|
Class = Templates->Class[ClassId];
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
|
|
|
|
++NumAdaptationsFailed;
|
|
|
|
if (classify_learning_debug_level >= 1)
|
|
|
|
cprintf("Cannot make new temporary config: maximum number exceeded.\n");
|
|
|
|
return -1;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
OldMaxProtoId = IClass->NumProtos - 1;
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
|
|
|
|
BlobLength, NumFeatures, Features,
|
|
|
|
OldProtos, classify_adapt_proto_threshold,
|
|
|
|
debug_level);
|
2009-07-11 10:17:36 +08:00
|
|
|
|
|
|
|
MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
|
|
|
|
zero_all_bits(TempProtoMask, MaskSize);
|
|
|
|
for (i = 0; i < NumOldProtos; i++)
|
|
|
|
SET_BIT(TempProtoMask, OldProtos[i]);
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
|
|
|
|
BlobLength, NumFeatures, Features,
|
|
|
|
BadFeatures,
|
|
|
|
classify_adapt_feature_threshold,
|
|
|
|
debug_level);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
|
|
|
|
IClass, Class, TempProtoMask);
|
|
|
|
if (MaxProtoId == NO_PROTO) {
|
|
|
|
++NumAdaptationsFailed;
|
|
|
|
if (classify_learning_debug_level >= 1)
|
|
|
|
cprintf("Cannot make new temp protos: maximum number exceeded.\n");
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
ConfigId = AddIntConfig(IClass);
|
|
|
|
ConvertConfig(TempProtoMask, ConfigId, IClass);
|
|
|
|
Config = NewTempConfig(MaxProtoId);
|
|
|
|
TempConfigFor(Class, ConfigId) = Config;
|
|
|
|
copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
|
|
|
|
|
|
|
|
if (classify_learning_debug_level >= 1)
|
|
|
|
cprintf("Making new temp config %d using %d old and %d new protos.\n",
|
|
|
|
ConfigId, NumOldProtos, MaxProtoId - OldMaxProtoId);
|
|
|
|
|
|
|
|
return ConfigId;
|
|
|
|
} /* MakeNewTemporaryConfig */
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine finds sets of sequential bad features
|
|
|
|
* that all have the same angle and converts each set into
|
|
|
|
* a new temporary proto. The temp proto is added to the
|
|
|
|
* proto pruner for IClass, pushed onto the list of temp
|
|
|
|
* protos in Class, and added to TempProtoMask.
|
|
|
|
*
|
|
|
|
* @param Features floating-pt features describing new character
|
|
|
|
* @param NumBadFeat number of bad features to turn into protos
|
|
|
|
* @param BadFeat feature id's of bad features
|
|
|
|
* @param IClass integer class templates to add new protos to
|
|
|
|
* @param Class adapted class templates to add new protos to
|
|
|
|
* @param TempProtoMask proto mask to add new protos to
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @return Max proto id in class after all protos have been added.
|
|
|
|
* Exceptions: none
|
|
|
|
* History: Fri Mar 15 11:39:38 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features,
                                     int NumBadFeat,
                                     FEATURE_ID BadFeat[],
                                     INT_CLASS IClass,
                                     ADAPT_CLASS Class,
                                     BIT_VECTOR TempProtoMask) {
  // Consume the bad-feature list one run at a time. Each run of
  // consecutive, mutually compatible features becomes one temp proto.
  FEATURE_ID *const EndOfBad = BadFeat + NumBadFeat;
  FEATURE_ID *RunStart = BadFeat;

  while (RunStart < EndOfBad) {
    // The first feature of the run is the reference for all comparisons.
    FEATURE First = Features->Features[*RunStart];
    FLOAT32 StartX = First->Params[PicoFeatX];
    FLOAT32 StartY = First->Params[PicoFeatY];
    FLOAT32 StartAngle = First->Params[PicoFeatDir];

    // Grow the run while the next bad feature stays close to the first
    // one in direction and position. RunLength gains one pico-feature
    // length for every feature accepted into the run.
    FLOAT32 RunLength = GetPicoFeatureLength();
    FEATURE_ID *RunEnd = RunStart + 1;
    while (RunEnd < EndOfBad) {
      FEATURE Candidate = Features->Features[*RunEnd];
      FLOAT32 CandX = Candidate->Params[PicoFeatX];
      FLOAT32 CandY = Candidate->Params[PicoFeatY];
      FLOAT32 CandAngle = Candidate->Params[PicoFeatDir];

      // A difference above 0.5 is folded back as 1.0 - difference, so
      // the comparison uses the smaller of the two angular distances.
      FLOAT32 AngleGap = fabs(StartAngle - CandAngle);
      if (AngleGap > 0.5)
        AngleGap = 1.0 - AngleGap;

      if (AngleGap > matcher_clustering_max_angle_delta ||
          fabs(StartX - CandX) > RunLength ||
          fabs(StartY - CandY) > RunLength)
        break;

      RunEnd++;
      RunLength += GetPicoFeatureLength();
    }

    // The last feature accepted into the run supplies the far end point.
    FEATURE Last = Features->Features[*(RunEnd - 1)];
    FLOAT32 EndX = Last->Params[PicoFeatX];
    FLOAT32 EndY = Last->Params[PicoFeatY];

    PROTO_ID Pid = AddIntProto(IClass);
    if (Pid == NO_PROTO)
      return NO_PROTO;

    TEMP_PROTO TempProto = NewTempProto();
    PROTO Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
       instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Length = RunLength;
    Proto->Angle = StartAngle;
    Proto->X = (StartX + EndX) / 2.0;
    Proto->Y = (StartY + EndY) / 2.0 - Y_DIM_OFFSET;
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT(TempProtoMask, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass,
                          classify_learning_debug_level >= 2);

    Class->TempProtos = push(Class->TempProtos, TempProto);

    // The next run starts where this one stopped.
    RunStart = RunEnd;
  }
  return IClass->NumProtos - 1;
}                                /* MakeNewTempProtos */
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
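* This routine makes the temporary config ConfigId of class ClassId
* permanent. The temporary protos used by that config are converted
* to permanent protos, the temporary config is freed, and the
* ambiguities found for Blob are stored as the permanent config.
*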
|
|
|
|
* @param Templates current set of adaptive templates
|
|
|
|
* @param ClassId class containing config to be made permanent
|
|
|
|
* @param ConfigId config to be made permanent
|
|
|
|
* @param Blob current blob being adapted to
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Thu Mar 14 15:54:08 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::MakePermanent(ADAPT_TEMPLATES Templates,
|
|
|
|
CLASS_ID ClassId,
|
|
|
|
int ConfigId,
|
2010-11-24 02:34:14 +08:00
|
|
|
TBLOB *Blob) {
|
2009-07-11 10:17:36 +08:00
|
|
|
UNICHAR_ID *Ambigs;
|
|
|
|
TEMP_CONFIG Config;
|
|
|
|
ADAPT_CLASS Class;
|
|
|
|
PROTO_KEY ProtoKey;
|
|
|
|
|
|
|
|
Class = Templates->Class[ClassId];
|
|
|
|
Config = TempConfigFor(Class, ConfigId);
|
|
|
|
|
|
|
|
MakeConfigPermanent(Class, ConfigId);
|
|
|
|
if (Class->NumPermConfigs == 0)
|
|
|
|
Templates->NumPermClasses++;
|
|
|
|
Class->NumPermConfigs++;
|
|
|
|
|
|
|
|
ProtoKey.Templates = Templates;
|
|
|
|
ProtoKey.ClassId = ClassId;
|
|
|
|
ProtoKey.ConfigId = ConfigId;
|
|
|
|
Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey,
|
|
|
|
MakeTempProtoPerm);
|
|
|
|
FreeTempConfig(Config);
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
Ambigs = GetAmbiguities(Blob, ClassId);
|
2009-07-11 10:17:36 +08:00
|
|
|
PermConfigFor(Class, ConfigId) = Ambigs;
|
|
|
|
|
|
|
|
if (classify_learning_debug_level >= 1) {
|
|
|
|
cprintf("Making config %d permanent with ambiguities '",
|
|
|
|
ConfigId, Ambigs);
|
|
|
|
for (UNICHAR_ID *AmbigsPointer = Ambigs;
|
|
|
|
*AmbigsPointer >= 0; ++AmbigsPointer)
|
|
|
|
cprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
|
|
|
|
cprintf("'.\n");
|
|
|
|
}
|
|
|
|
} /* MakePermanent */
|
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine converts TempProto to be permanent if
|
|
|
|
* its proto id is used by the configuration specified in
|
|
|
|
* ProtoKey.
|
|
|
|
*
|
|
|
|
* @param TempProto temporary proto to compare to key
|
|
|
|
* @param ProtoKey defines which protos to make permanent
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @return TRUE if TempProto is converted, FALSE otherwise
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Thu Mar 14 18:49:54 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
int MakeTempProtoPerm(void *item1, void *item2) {
|
|
|
|
ADAPT_CLASS Class;
|
|
|
|
TEMP_CONFIG Config;
|
|
|
|
TEMP_PROTO TempProto;
|
|
|
|
PROTO_KEY *ProtoKey;
|
|
|
|
|
|
|
|
TempProto = (TEMP_PROTO) item1;
|
|
|
|
ProtoKey = (PROTO_KEY *) item2;
|
|
|
|
|
|
|
|
Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
|
|
|
|
Config = TempConfigFor(Class, ProtoKey->ConfigId);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
if (TempProto->ProtoId > Config->MaxProtoId ||
|
|
|
|
!test_bit (Config->Protos, TempProto->ProtoId))
|
|
|
|
return FALSE;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
MakeProtoPermanent(Class, TempProto->ProtoId);
|
|
|
|
AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId,
|
|
|
|
ProtoKey->Templates->Templates);
|
|
|
|
FreeTempProto(TempProto);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
return TRUE;
|
|
|
|
} /* MakeTempProtoPerm */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
|
|
|
namespace tesseract {
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine writes the matches in Results to File.
|
|
|
|
*
|
|
|
|
* @param File open text file to write Results to
|
|
|
|
* @param Results match results to write to File
|
|
|
|
*
|
|
|
|
* Globals: none
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Mon Mar 18 09:24:53 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results) {
|
|
|
|
for (int i = 0; i < Results->NumMatches; ++i) {
|
|
|
|
cprintf("%s(%d) %.2f ",
|
2010-11-24 02:34:14 +08:00
|
|
|
unicharset.debug_str(Results->match[i].id).string(),
|
|
|
|
Results->match[i].id,
|
|
|
|
Results->match[i].rating * 100.0);
|
2009-07-11 10:17:36 +08:00
|
|
|
}
|
|
|
|
printf("\n");
|
|
|
|
} /* PrintAdaptiveMatchResults */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine steps thru each matching class in Results
|
|
|
|
* and removes it from the match list if its rating
|
|
|
|
* is worse than the BestRating plus a pad. In other words,
|
|
|
|
* all good matches get moved to the front of the classes
|
|
|
|
* array.
|
|
|
|
*
|
|
|
|
* @param Results contains matches to be filtered
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - matcher_bad_match_pad defines a "bad match"
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 13:51:03 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {
  static const char* romans = "i v x I V X";
  // Matches rated worse than this are dropped.
  FLOAT32 Cutoff = Results->best_match.rating + matcher_bad_match_pad;
  int Kept = 0;  // number of surviving matches == next slot to fill

  if (classify_bln_numeric_mode) {
    // Numeric mode: look up the entries for "1" and "0" so that "l" and
    // "O" can be replaced by them.
    UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
        unicharset.unichar_to_id("1") : -1;
    UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
        unicharset.unichar_to_id("0") : -1;
    ScoredClass scored_one = ScoredUnichar(Results, unichar_id_one);
    ScoredClass scored_zero = ScoredUnichar(Results, unichar_id_zero);

    for (int Idx = 0; Idx < Results->NumMatches; Idx++) {
      if (!(Results->match[Idx].rating <= Cutoff))
        continue;

      ScoredClass current = Results->match[Idx];
      ScoredClass output = current;
      bool keep = true;

      if (!unicharset.get_isalpha(current.id) ||
          strstr(romans, unicharset.id_to_unichar(current.id)) != NULL) {
        // Non-alphabetic characters and roman-numeral letters are kept
        // unchanged.
      } else if (unicharset.eq(current.id, "l") &&
                 scored_one.rating >= Cutoff) {
        // "l" becomes the "1" entry, but keeps the rating of "l".
        output = scored_one;
        output.rating = current.rating;
      } else if (unicharset.eq(current.id, "O") &&
                 scored_zero.rating >= Cutoff) {
        // "O" becomes the "0" entry, but keeps the rating of "O".
        output = scored_zero;
        output.rating = current.rating;
      } else {
        // Any other alphabetic match is dropped in numeric mode.
        keep = false;
      }

      if (keep)
        Results->match[Kept++] = output;
    }
  } else {
    // Normal mode: the rating threshold is the only filter.
    for (int Idx = 0; Idx < Results->NumMatches; Idx++) {
      if (Results->match[Idx].rating <= Cutoff)
        Results->match[Kept++] = Results->match[Idx];
    }
  }
  Results->NumMatches = Kept;
}                                /* RemoveBadMatches */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*----------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine steps thru each matching class in Results
* and discards surplus punctuation and digit matches: only
* the first two punctuation matches and the first digit match
* are kept. All other matches are kept, and the surviving
* matches are moved to the front of the match array.
|
|
|
|
*
|
|
|
|
* @param Results contains matches to be filtered
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - matcher_bad_match_pad defines a "bad match"
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Mar 12 13:51:03 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {
  /*garbage characters */
  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";

  int puncs_seen = 0;   // punctuation matches encountered so far
  int digits_seen = 0;  // digit matches encountered so far
  int Kept = 0;         // next free slot at the front of the match array

  for (int Idx = 0; Idx < Results->NumMatches; Idx++) {
    ScoredClass match = Results->match[Idx];
    bool keep;

    if (strstr(punc_chars, unicharset.id_to_unichar(match.id)) != NULL) {
      // Only the first two punctuation matches survive.
      keep = puncs_seen < 2;
      puncs_seen++;
    } else if (strstr(digit_chars,
                      unicharset.id_to_unichar(match.id)) != NULL) {
      // Only the first digit match survives.
      keep = digits_seen < 1;
      digits_seen++;
    } else {
      // Everything else is always kept.
      keep = true;
    }

    if (keep)
      Results->match[Kept++] = match;
  }
  Results->NumMatches = Kept;
}                                /* RemoveExtraPuncs */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine resets the internal thresholds inside
|
|
|
|
* the integer matcher to correspond to the specified
|
|
|
|
* threshold.
|
|
|
|
*
|
|
|
|
* @param Threshold threshold for creating new templates
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - matcher_good_threshold default good match rating
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Tue Apr 9 08:33:13 1991, DSJ, Created.
|
|
|
|
*/
|
2010-11-24 02:34:14 +08:00
|
|
|
void Classify::SetAdaptiveThreshold(FLOAT32 Threshold) {
  // The default good-match threshold maps to a fixed 0.9; any other
  // value is inverted (1.0 - Threshold).
  if (Threshold == matcher_good_threshold)
    Threshold = 0.9;
  else
    Threshold = 1.0 - Threshold;

  // Scale to 0..255 and use the same clipped value for both the proto
  // and the feature threshold.
  const int ClippedLevel = ClipToRange<int>(255 * Threshold, 0, 255);
  classify_adapt_proto_threshold.set_value(ClippedLevel);
  classify_adapt_feature_threshold.set_value(ClippedLevel);
}                                /* SetAdaptiveThreshold */
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
/*---------------------------------------------------------------------------*/
|
2010-08-11 03:20:11 +08:00
|
|
|
/**
|
|
|
|
* This routine compares Blob to both sets of templates
|
|
|
|
* (adaptive and pre-trained) and then displays debug
|
|
|
|
* information for the config which matched best.
|
|
|
|
*
|
|
|
|
* @param Blob blob to show best matching config for
|
|
|
|
* @param ClassId class whose configs are to be searched
|
|
|
|
* @param AdaptiveOn TRUE if adaptive configs are enabled
|
|
|
|
* @param PreTrainedOn TRUE if pretrained configs are enabled
|
|
|
|
*
|
|
|
|
* Globals:
|
|
|
|
* - PreTrainedTemplates built-in training
|
|
|
|
* - AdaptedTemplates adaptive templates
|
|
|
|
* - AllProtosOn dummy proto mask
|
|
|
|
* - AllConfigsOn dummy config mask
|
|
|
|
*
|
|
|
|
* @note Exceptions: none
|
|
|
|
* @note History: Fri Mar 22 08:43:52 1991, DSJ, Created.
|
|
|
|
*/
|
2009-07-11 10:17:36 +08:00
|
|
|
void Classify::ShowBestMatchFor(TBLOB *Blob,
                                CLASS_ID ClassId,
                                BOOL8 AdaptiveOn,
                                BOOL8 PreTrainedOn) {
  // Matches Blob against the pre-trained and/or adaptive templates of
  // ClassId, reports the best config of each, and then re-runs the
  // better of the two matches with the matcher debug flags enabled.
  // If neither matcher produced a result (class unused, illegal blob,
  // or both flags off), only the messages are printed and no debug
  // match is attempted.
  int NumCNFeatures = 0, NumBLFeatures = 0;
  INT_FEATURE_ARRAY CNFeatures, BLFeatures;
  INT_RESULT_STRUCT CNResult, BLResult;
  inT32 BlobLength;
  uinT32 ConfigMask;
  static int next_config = -1;
  // Set once the corresponding matcher has filled in CNResult/BLResult.
  bool CNMatched = false;
  bool BLMatched = false;

  if (PreTrainedOn) next_config = -1;

  // Sentinel rating used when a matcher did not run.
  CNResult.Rating = BLResult.Rating = 2.0;

  if (!LegalClassId (ClassId)) {
    cprintf ("%d is not a legal class id!!\n", ClassId);
    return;
  }

  uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES];
  uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES];

  if (PreTrainedOn) {
    if (UnusedClassIdIn (PreTrainedTemplates, ClassId))
      cprintf ("No built-in templates for class %d = %s\n",
               ClassId, unicharset.id_to_unichar(ClassId));
    else {
      NumCNFeatures = GetCharNormFeatures(
          Blob, PreTrainedTemplates, CNFeatures, CNAdjust, &BlobLength, NULL);
      if (NumCNFeatures <= 0)
        cprintf ("Illegal blob (char norm features)!\n");
      else {
        im_.SetCharNormMatch(classify_integer_matcher_multiplier);
        im_.Match(ClassForClassId (PreTrainedTemplates, ClassId),
                  AllProtosOn, AllConfigsOn,
                  BlobLength, NumCNFeatures, CNFeatures,
                  CNAdjust[ClassId], &CNResult,
                  classify_adapt_feature_threshold, NO_DEBUG,
                  matcher_debug_separate_windows);
        CNMatched = true;

        cprintf ("Best built-in template match is config %2d (%4.1f) (cn=%d)\n",
                 CNResult.Config, CNResult.Rating * 100.0, CNAdjust[ClassId]);
      }
    }
  }

  if (AdaptiveOn) {
    if (UnusedClassIdIn (AdaptedTemplates->Templates, ClassId))
      cprintf ("No AD templates for class %d = %s\n",
               ClassId, unicharset.id_to_unichar(ClassId));
    else {
      NumBLFeatures = GetBaselineFeatures(Blob,
                                          AdaptedTemplates->Templates,
                                          BLFeatures, BLAdjust,
                                          &BlobLength);
      if (NumBLFeatures <= 0)
        cprintf ("Illegal blob (baseline features)!\n");
      else {
        im_.SetBaseLineMatch();
        im_.Match(ClassForClassId
                  (AdaptedTemplates->Templates, ClassId),
                  AllProtosOn, AllConfigsOn,
                  BlobLength, NumBLFeatures, BLFeatures,
                  BLAdjust[ClassId], &BLResult,
                  classify_adapt_feature_threshold, NO_DEBUG,
                  matcher_debug_separate_windows);
        BLMatched = true;

#ifndef SECURE_NAMES
        ADAPT_CLASS Class = AdaptedTemplates->Class[ClassId];
        cprintf ("Best adaptive template match is config %2d (%4.1f) %s\n",
                 BLResult.Config, BLResult.Rating * 100.0,
                 ConfigIsPermanent(Class, BLResult.Config) ? "Perm" : "Temp");
#endif
      }
    }
  }

  cprintf ("\n");
  // Re-run the better match with debugging on. Each branch is taken only
  // if its matcher really produced a result; otherwise Config, the
  // feature array and BlobLength would be read uninitialised.
  if (BLMatched && BLResult.Rating < CNResult.Rating) {
    // NOTE(review): next_config keeps growing across calls made with
    // PreTrainedOn false; presumably callers bound it before the shift
    // exceeds the width of the mask - confirm.
    if (next_config < 0) {
      ConfigMask = 1 << BLResult.Config;
      next_config = 0;
    } else {
      ConfigMask = 1 << next_config;
      ++next_config;
    }
    classify_norm_method.set_value(baseline);

    im_.SetBaseLineMatch();
    im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
              AllProtosOn, (BIT_VECTOR) &ConfigMask,
              BlobLength, NumBLFeatures, BLFeatures,
              BLAdjust[ClassId], &BLResult,
              classify_adapt_feature_threshold,
              matcher_debug_flags,
              matcher_debug_separate_windows);
    cprintf ("Adaptive template match for config %2d is %4.1f\n",
             BLResult.Config, BLResult.Rating * 100.0);
  }
  else if (CNMatched) {
    ConfigMask = 1 << CNResult.Config;
    classify_norm_method.set_value(character);

    im_.SetCharNormMatch(classify_integer_matcher_multiplier);
    im_.Match(ClassForClassId (PreTrainedTemplates, ClassId),
              AllProtosOn, (BIT_VECTOR) & ConfigMask,
              BlobLength, NumCNFeatures, CNFeatures,
              CNAdjust[ClassId], &CNResult,
              classify_adapt_feature_threshold,
              matcher_debug_flags,
              matcher_debug_separate_windows);
  }

  // Clean up.
  delete[] CNAdjust;
  delete[] BLAdjust;
}                                /* ShowBestMatchFor */
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
|
|
// Returns true if the given TEMP_CONFIG is good enough to make it
|
|
|
|
// a permanent config.
|
|
|
|
bool Classify::TempConfigReliable(CLASS_ID class_id,
                                  const TEMP_CONFIG &config) {
  if (classify_learning_debug_level >= 1) {
    tprintf("NumTimesSeen for config of %s is %d\n",
            getDict().getUnicharset().debug_str(class_id).string(),
            config->NumTimesSeen);
  }

  // Seen often enough: reliable regardless of ambiguities.
  if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping)
    return true;
  // Seen too rarely: never reliable.
  if (config->NumTimesSeen < matcher_min_examples_for_prototyping)
    return false;
  // In between: reliable unless the ambiguity check below objects.
  if (!use_ambigs_for_adaption)
    return true;

  // Every character listed as an ambiguity of class_id must either
  // have a permanent config already or have been seen at least the
  // minimum number of times.
  const UnicharIdVector *ambigs =
      getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
  if (ambigs == NULL)
    return true;

  const int ambigs_size = ambigs->size();
  for (int idx = 0; idx < ambigs_size; ++idx) {
    ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[idx]];
    assert(ambig_class != NULL);

    const bool seen_enough =
        ambig_class->NumPermConfigs != 0 ||
        ambig_class->MaxNumTimesSeen >= matcher_min_examples_for_prototyping;
    if (seen_enough)
      continue;

    if (classify_learning_debug_level >= 1) {
      tprintf("Ambig %s has not been seen enough times,"
              " not making config for %s permanent\n",
              getDict().getUnicharset().debug_str(
                  (*ambigs)[idx]).string(),
              getDict().getUnicharset().debug_str(class_id).string());
    }
    return false;
  }
  return true;
}
|
|
|
|
|
|
|
|
void Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob) {
  // For every class returned by ReverseAmbigsForAdaption(class_id),
  // promote each temporary config that has become reliable to a
  // permanent config.
  const UnicharIdVector *ambigs =
      getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);

  if (classify_learning_debug_level >= 1) {
    tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
            getDict().getUnicharset().debug_str(class_id).string(), class_id);
  }

  if (ambigs == NULL)
    return;

  const int ambigs_size = ambigs->size();
  for (int idx = 0; idx < ambigs_size; ++idx) {
    CLASS_ID ambig_class_id = (*ambigs)[idx];
    const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];

    for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
      // Only temporary configs are candidates for promotion.
      if (!ConfigIsPermanent(ambigs_class, cfg)) {
        const TEMP_CONFIG config = TempConfigFor(ambigs_class, cfg);
        if (config == NULL || !TempConfigReliable(ambig_class_id, config))
          continue;

        if (classify_learning_debug_level >= 1) {
          tprintf("Making config %d of %s permanent\n", cfg,
                  getDict().getUnicharset().debug_str(
                      ambig_class_id).string());
        }
        MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
      }
    }
  }
}
|
|
|
|
|
2009-07-11 10:17:36 +08:00
|
|
|
} // namespace tesseract
|