2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
 * File:        charsample.cpp  (Formerly charsample.c)
 * Description: Class to contain character samples and match scores
 *              to be used for adaption
 * Author:      Chris Newton
 * Created:     Thu Oct  7 13:40:37 BST 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
|
2010-05-28 20:03:45 +08:00
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
#include "config_auto.h"
|
|
|
|
#endif
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "mfcpch.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <math.h>
|
|
|
|
#ifdef __UNIX__
|
|
|
|
#include <assert.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
#include "memry.h"
|
|
|
|
#include "tessvars.h"
|
|
|
|
#include "statistc.h"
|
|
|
|
#include "charsample.h"
|
|
|
|
#include "paircmp.h"
|
|
|
|
#include "matmatch.h"
|
|
|
|
#include "adaptions.h"
|
|
|
|
#include "secname.h"
|
|
|
|
#include "notdll.h"
|
2009-07-11 10:03:51 +08:00
|
|
|
#include "tesseractclass.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
extern inT32 demo_word; // Hack for demos
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
// ELISTIZE macro expansions for the two list element classes in this file.
ELISTIZE (CHAR_SAMPLE)
ELISTIZE (CHAR_SAMPLES)

/**
 * Default constructor.
 * Produces a sample with no blob, denorm or image attached, a NUL
 * character label and zeroed match statistics.
 */
CHAR_SAMPLE::CHAR_SAMPLE () {
  ch = '\0';
  sample_image = NULL;
  sample_denorm = NULL;
  sample_blob = NULL;
  // Zeroes n_samples_matched, total_match_scores and sumsq_match_scores.
  reset_match_statistics ();
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Construct a blob-based sample.
 * Stores the given blob and denorm pointers and the character label c.
 * No image is attached and the match statistics start at zero.
 */
CHAR_SAMPLE::CHAR_SAMPLE(PBLOB *blob, DENORM *denorm, char c) {
  ch = c;
  sample_image = NULL;
  sample_denorm = denorm;
  sample_blob = blob;
  // Zeroes n_samples_matched, total_match_scores and sumsq_match_scores.
  reset_match_statistics ();
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Construct an image-based sample.
 * Stores the given image pointer and the character label c.
 * No blob or denorm is attached and the match statistics start at zero.
 */
CHAR_SAMPLE::CHAR_SAMPLE(IMAGE *image, char c) {
  ch = c;
  sample_image = image;
  sample_denorm = NULL;
  sample_blob = NULL;
  // Zeroes n_samples_matched, total_match_scores and sumsq_match_scores.
  reset_match_statistics ();
}
|
|
|
|
|
|
|
|
|
|
|
|
float CHAR_SAMPLE::match_sample( // Update match scores
|
|
|
|
CHAR_SAMPLE *test_sample,
|
2009-07-11 10:03:51 +08:00
|
|
|
BOOL8 updating,
|
|
|
|
tesseract::Tesseract* tess) {
|
2007-03-08 04:03:40 +08:00
|
|
|
float score1;
|
|
|
|
float score2;
|
|
|
|
IMAGE *image = test_sample->image ();
|
|
|
|
|
|
|
|
if (sample_blob != NULL && test_sample->blob () != NULL) {
|
|
|
|
PBLOB *blob = test_sample->blob ();
|
|
|
|
DENORM *denorm = test_sample->denorm ();
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
score1 = tess->compare_bln_blobs (sample_blob, sample_denorm, blob, denorm);
|
|
|
|
score2 = tess->compare_bln_blobs (blob, denorm, sample_blob, sample_denorm);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
score1 = (score1 > score2) ? score1 : score2;
|
|
|
|
}
|
|
|
|
else if (sample_image != NULL && image != NULL) {
|
|
|
|
CHAR_PROTO *sample = new CHAR_PROTO (this);
|
|
|
|
|
|
|
|
score1 = matrix_match (sample_image, image);
|
|
|
|
delete sample;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return BAD_SCORE;
|
|
|
|
|
|
|
|
if ((tessedit_use_best_sample || tessedit_cluster_debug) && updating) {
|
|
|
|
n_samples_matched++;
|
|
|
|
total_match_scores += score1;
|
|
|
|
sumsq_match_scores += score1 * score1;
|
|
|
|
}
|
|
|
|
return score1;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
double CHAR_SAMPLE::mean_score() {
|
2007-03-08 04:03:40 +08:00
|
|
|
if (n_samples_matched > 0)
|
|
|
|
return (total_match_scores / n_samples_matched);
|
|
|
|
else
|
|
|
|
return BAD_SCORE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
double CHAR_SAMPLE::variance() {
|
2007-03-08 04:03:40 +08:00
|
|
|
double mean = mean_score ();
|
|
|
|
|
|
|
|
if (n_samples_matched > 0) {
|
|
|
|
return (sumsq_match_scores / n_samples_matched) - mean * mean;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
return BAD_SCORE;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
void CHAR_SAMPLE::print(FILE *f) {
|
2007-03-08 04:03:40 +08:00
|
|
|
if (!tessedit_cluster_debug)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (n_samples_matched > 0)
|
|
|
|
fprintf (f,
|
|
|
|
"%c - sample matched against " INT32FORMAT
|
|
|
|
" blobs, mean: %f, var: %f\n", ch, n_samples_matched,
|
|
|
|
mean_score (), variance ());
|
|
|
|
else
|
|
|
|
fprintf (f, "No matches for this sample (%c)\n", ch);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
void CHAR_SAMPLE::reset_match_statistics() {
|
2007-03-08 04:03:40 +08:00
|
|
|
n_samples_matched = 0;
|
|
|
|
total_match_scores = 0.0;
|
|
|
|
sumsq_match_scores = 0.0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Default constructor.
 * Produces an empty cluster of UNKNOWN type with a NUL character label,
 * no best sample and no prototype.
 */
CHAR_SAMPLES::CHAR_SAMPLES() {
  samples.clear ();
  type = UNKNOWN;
  ch = '\0';
  proto = NULL;
  best_sample = NULL;
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Construct a cluster seeded with a single sample.
 * The sample must carry an image or a blob (checked with ASSERT_HOST).
 * A sample with an image makes this an IMAGE_CLUSTER; otherwise a sample
 * with a blob makes it a BLOB_CLUSTER. The cluster takes the sample's
 * character as its label only when tessedit_mm_only_match_same_char is
 * set; otherwise the label is NUL.
 * @param sample first member of the cluster
 */
CHAR_SAMPLES::CHAR_SAMPLES(CHAR_SAMPLE *sample) {
  CHAR_SAMPLE_IT sample_it = &samples;

  ASSERT_HOST (sample->image () != NULL || sample->blob () != NULL);

  // An image takes precedence over a blob when choosing the cluster type.
  if (sample->image () != NULL) {
    type = IMAGE_CLUSTER;
  }
  else if (sample->blob () != NULL) {
    type = BLOB_CLUSTER;
  }

  proto = NULL;
  best_sample = NULL;

  samples.clear ();
  sample_it.add_to_end (sample);

  ch = tessedit_mm_only_match_same_char ? sample->character () : '\0';
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
/**
 * Add a sample to the end of this cluster.
 *
 * When tessedit_use_best_sample or tessedit_cluster_debug is enabled, the
 * new sample is first matched against every existing member, in both
 * directions, with statistics updating requested.
 *
 * For an IMAGE_CLUSTER with tessedit_mm_use_prototypes enabled, the
 * prototype is built once the cluster size reaches
 * tessedit_mm_prototype_min_size, and each later sample is merged into it.
 *
 * @param sample sample to add
 * @param tess   Tesseract instance passed through to match_sample
 */
void CHAR_SAMPLES::add_sample(CHAR_SAMPLE *sample, tesseract::Tesseract* tess) {
  CHAR_SAMPLE_IT sample_it = &samples;

  if (tessedit_use_best_sample || tessedit_cluster_debug) {
    for (sample_it.mark_cycle_pt ();
         !sample_it.cycled_list (); sample_it.forward ()) {
      // Existing member scores the newcomer, then the newcomer scores it.
      sample_it.data ()->match_sample (sample, TRUE, tess);
      sample->match_sample (sample_it.data (), TRUE, tess);
    }
  }

  sample_it.add_to_end (sample);

  if (!(tessedit_mm_use_prototypes && type == IMAGE_CLUSTER))
    return;

  if (samples.length () == tessedit_mm_prototype_min_size) {
    this->build_prototype ();
  }
  else if (samples.length () > tessedit_mm_prototype_min_size) {
    this->add_sample_to_prototype (sample);
  }
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Merge one sample's image into the cluster prototype.
 * If the sample image is wider or taller than the prototype, the
 * prototype is enlarged first so that it covers the sample in both
 * dimensions.
 * @param sample sample whose image is added; proto and the sample's
 *               image are dereferenced without a NULL check
 */
void CHAR_SAMPLES::add_sample_to_prototype(CHAR_SAMPLE *sample) {
  const inT32 sample_xsize = sample->image ()->get_xsize ();
  const inT32 sample_ysize = sample->image ()->get_ysize ();
  inT32 target_xsize = proto->x_size ();
  inT32 target_ysize = proto->y_size ();
  BOOL8 must_grow = FALSE;

  if (target_xsize < sample_xsize) {
    target_xsize = sample_xsize;
    must_grow = TRUE;
  }
  if (target_ysize < sample_ysize) {
    target_ysize = sample_ysize;
    must_grow = TRUE;
  }

  if (must_grow) {
    proto->enlarge_prototype (target_xsize, target_ysize);
  }

  proto->add_sample (sample);
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
void CHAR_SAMPLES::build_prototype() {
|
2007-03-08 04:03:40 +08:00
|
|
|
CHAR_SAMPLE_IT sample_it = &samples;
|
|
|
|
CHAR_SAMPLE *sample;
|
2008-04-22 08:32:14 +08:00
|
|
|
inT32 proto_xsize = 0;
|
|
|
|
inT32 proto_ysize = 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (type != IMAGE_CLUSTER
|
|
|
|
|| samples.length () < tessedit_mm_prototype_min_size)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (sample_it.mark_cycle_pt ();
|
|
|
|
!sample_it.cycled_list (); sample_it.forward ()) {
|
|
|
|
sample = sample_it.data ();
|
|
|
|
if (sample->image ()->get_xsize () > proto_xsize)
|
|
|
|
proto_xsize = sample->image ()->get_xsize ();
|
|
|
|
if (sample->image ()->get_ysize () > proto_ysize)
|
|
|
|
proto_ysize = sample->image ()->get_ysize ();
|
|
|
|
}
|
|
|
|
|
|
|
|
proto = new CHAR_PROTO (proto_xsize, proto_ysize, 0, 0, '\0');
|
|
|
|
|
|
|
|
for (sample_it.mark_cycle_pt ();
|
|
|
|
!sample_it.cycled_list (); sample_it.forward ())
|
|
|
|
this->add_sample_to_prototype (sample_it.data ());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
void CHAR_SAMPLES::find_best_sample() {
|
2007-03-08 04:03:40 +08:00
|
|
|
CHAR_SAMPLE_IT sample_it = &samples;
|
|
|
|
double score;
|
|
|
|
double best_score = MAX_INT32;
|
|
|
|
|
|
|
|
if (ch == '\0' || samples.length () < tessedit_mm_prototype_min_size)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (sample_it.mark_cycle_pt ();
|
|
|
|
!sample_it.cycled_list (); sample_it.forward ()) {
|
|
|
|
score = sample_it.data ()->mean_score ();
|
|
|
|
if (score < best_score) {
|
|
|
|
best_score = score;
|
|
|
|
best_sample = sample_it.data ();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
if (tessedit_cluster_debug) {
|
|
|
|
tprintf ("Best sample for this %c cluster:\n", ch);
|
|
|
|
best_sample->print (debug_fp);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
/**
 * Score a sample against this cluster.
 * The first applicable rule wins:
 *  1. tessedit_mm_only_match_same_char is set and the sample's character
 *     differs from the cluster's: BAD_SCORE.
 *  2. tessedit_use_best_sample is set and a best sample exists: match
 *     against the best sample, without updating its statistics.
 *  3. tessedit_mm_use_prototypes or tessedit_mm_adapt_using_prototypes is
 *     set and a prototype exists: match against the prototype.
 *  4. Otherwise: the result of nn_match_score.
 * @param sample sample to score
 * @param tess   Tesseract instance passed through to the matchers
 */
float CHAR_SAMPLES::match_score(CHAR_SAMPLE *sample,
                                tesseract::Tesseract* tess) {
  if (tessedit_mm_only_match_same_char && sample->character () != ch) {
    return BAD_SCORE;
  }

  if (tessedit_use_best_sample && best_sample != NULL) {
    return best_sample->match_sample (sample, FALSE, tess);
  }

  if ((tessedit_mm_use_prototypes
       || tessedit_mm_adapt_using_prototypes) && proto != NULL) {
    return proto->match_sample (sample);
  }

  return this->nn_match_score (sample, tess);
}
|
|
|
|
|
|
|
|
|
2009-07-11 10:03:51 +08:00
|
|
|
/**
 * Match a sample against every member of the cluster, without updating
 * any statistics, and return the lowest score found.
 * @param sample sample to score
 * @param tess   Tesseract instance passed through to match_sample
 * @return the minimum score, or MAX_INT32 (as a float) if no member
 *         scored below it
 */
float CHAR_SAMPLES::nn_match_score(CHAR_SAMPLE *sample,
                                   tesseract::Tesseract* tess) {
  CHAR_SAMPLE_IT sample_it = &samples;
  float lowest = MAX_INT32;

  for (sample_it.mark_cycle_pt ();
       !sample_it.cycled_list (); sample_it.forward ()) {
    float candidate = sample_it.data ()->match_sample (sample, FALSE, tess);
    if (candidate < lowest) {
      lowest = candidate;
    }
  }

  return lowest;
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
void CHAR_SAMPLES::assign_to_char() {
|
|
|
|
STATS char_frequency(FIRST_CHAR, LAST_CHAR);
|
2007-03-08 04:03:40 +08:00
|
|
|
CHAR_SAMPLE_IT sample_it = &samples;
|
2008-04-22 08:32:14 +08:00
|
|
|
inT32 i;
|
|
|
|
inT32 max_index = 0;
|
|
|
|
inT32 max_freq = 0;
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (samples.length () == 0 || tessedit_mm_only_match_same_char)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (sample_it.mark_cycle_pt ();
|
|
|
|
!sample_it.cycled_list (); sample_it.forward ())
|
2008-04-22 08:32:14 +08:00
|
|
|
char_frequency.add ((inT32) sample_it.data ()->character (), 1);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
for (i = FIRST_CHAR; i <= LAST_CHAR; i++)
|
|
|
|
if (char_frequency.pile_count (i) > max_freq) {
|
|
|
|
max_index = i;
|
|
|
|
max_freq = char_frequency.pile_count (i);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (samples.length () >= tessedit_cluster_min_size
|
|
|
|
&& max_freq > samples.length () * tessedit_cluster_accept_fraction)
|
|
|
|
ch = (char) max_index;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
void CHAR_SAMPLES::print(FILE *f) {
|
2007-03-08 04:03:40 +08:00
|
|
|
CHAR_SAMPLE_IT sample_it = &samples;
|
|
|
|
|
|
|
|
fprintf (f, "Collected " INT32FORMAT " samples\n", samples.length ());
|
|
|
|
|
|
|
|
#ifndef SECURE_NAMES
|
|
|
|
if (tessedit_cluster_debug)
|
|
|
|
for (sample_it.mark_cycle_pt ();
|
|
|
|
!sample_it.cycled_list (); sample_it.forward ())
|
|
|
|
sample_it.data ()->print (f);
|
|
|
|
|
|
|
|
if (ch == '\0')
|
|
|
|
fprintf (f, "\nCluster not used for adaption\n");
|
|
|
|
else
|
|
|
|
fprintf (f, "\nCluster used to adapt to '%c's\n", ch);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Default constructor.
 * Produces an empty prototype: zero size, zero samples, a NUL character
 * and no allocated arrays.
 */
CHAR_PROTO::CHAR_PROTO() {
  proto = NULL;
  proto_data = NULL;
  nsamples = 0;
  ch = '\0';
  ysize = 0;
  xsize = 0;
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Construct a prototype of the given size with every cell set to
 * initial_value.
 * @param x_size        width of the prototype
 * @param y_size        height of the prototype
 * @param n_samples     sample count to record
 * @param initial_value value stored in every cell
 * @param c             character the prototype represents
 */
CHAR_PROTO::CHAR_PROTO(inT32 x_size,
                       inT32 y_size,
                       inT32 n_samples,
                       float initial_value,
                       char c) {
  inT32 col;
  inT32 row;

  nsamples = n_samples;
  ch = c;
  xsize = x_size;
  ysize = y_size;

  // Allocates the float storage and sets proto up for proto[x][y] access.
  ALLOC_2D_ARRAY(xsize, ysize, proto_data, proto, float);

  for (col = 0; col < xsize; col++) {
    for (row = 0; row < ysize; row++) {
      proto[col][row] = initial_value;
    }
  }
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Construct a single-sample prototype from a sample's image.
 * If the sample has no image the prototype is left empty (zero size,
 * zero samples, NUL character, no arrays). Otherwise the prototype takes
 * the image's size and the sample's character, and each cell is set to
 * 1.0 where the image pixel equals BINIM_WHITE and -1.0 elsewhere.
 * @param sample sample to convert
 */
CHAR_PROTO::CHAR_PROTO(CHAR_SAMPLE *sample) {
  inT32 col;
  inT32 row;
  IMAGELINE imline_s;

  if (sample->image () == NULL) {
    proto = NULL;
    proto_data = NULL;
    nsamples = 0;
    ch = '\0';
    ysize = 0;
    xsize = 0;
    return;
  }

  nsamples = 1;
  ch = sample->character ();
  xsize = sample->image ()->get_xsize ();
  ysize = sample->image ()->get_ysize ();

  ALLOC_2D_ARRAY(xsize, ysize, proto_data, proto, float);

  for (row = 0; row < ysize; row++) {
    sample->image ()->fast_get_line (0, row, xsize, &imline_s);
    for (col = 0; col < xsize; col++) {
      proto[col][row] = (imline_s.pixels[col] == BINIM_WHITE) ? 1.0 : -1.0;
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Destructor.
 * Frees the prototype arrays if they were allocated.
 */
CHAR_PROTO::~CHAR_PROTO () {
  if (proto_data == NULL)
    return;

  FREE_2D_ARRAY(proto_data, proto);
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Match a sample's image against this prototype.
 * The sample is converted to a temporary single-sample prototype. match ()
 * is always invoked on the wider of the two prototypes. When the roles
 * are swapped (this prototype is not strictly wider), the sign of the
 * global demo_word is flipped first to flag the different call.
 * @param test_sample sample to match
 * @return the match score, or BAD_SCORE if the sample has no image
 */
float CHAR_PROTO::match_sample(CHAR_SAMPLE *test_sample) {
  if (test_sample->image () == NULL)
    return BAD_SCORE;

  float result;
  CHAR_PROTO *candidate = new CHAR_PROTO (test_sample);

  if (xsize > candidate->x_size ()) {
    result = this->match (candidate);
  }
  else {
    demo_word = -demo_word;      // Flag different call
    result = candidate->match (this);
  }

  delete candidate;

  return result;
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Match test_proto against this prototype and return a score.
 *
 * This prototype must be at least as wide as test_proto (checked with
 * ASSERT_HOST). The narrower prototype is centred horizontally in the
 * wider one, and the shorter is centred vertically in the taller one.
 * A temporary match prototype of size xsize by max-height is filled as
 * follows:
 *  - where both prototypes have a cell, the product of the two cells;
 *  - where only this prototype has a cell, that cell times
 *    test_proto->n_samples ();
 *  - where only test_proto has a cell (only possible when it is taller),
 *    that cell times nsamples;
 *  - cells covered by neither keep the initial value 0.
 *
 * The score is 1 - sum / (xsize * y_size * nsamples * n_test), where sum
 * is the total of all assigned cells.
 *
 * With tessedit_mm_debug set, a negative score triggers a dump of the
 * match array. Unless GRAPHICS_DISABLED is defined, the prototypes may
 * also be displayed, depending on tessedit_display_mm and demo_word.
 *
 * @param test_proto prototype to match; must not be wider than this one
 * @return the match score
 */
float CHAR_PROTO::match(CHAR_PROTO *test_proto) {
  inT32 xsize2 = test_proto->x_size ();
  inT32 y_size;
  inT32 y_size2;
  inT32 x_offset;
  inT32 y_offset;
  inT32 x;
  inT32 y;
  CHAR_PROTO *match_proto;
  float score;
  float sum = 0.0;

  ASSERT_HOST (xsize >= xsize2);

  x_offset = (xsize - xsize2) / 2;

  if (ysize < test_proto->y_size ()) {
    // test_proto is taller: rows are indexed in test_proto's coordinates,
    // and this prototype's rows are shifted by y_offset.
    y_size = test_proto->y_size ();
    y_size2 = ysize;
    y_offset = (y_size - y_size2) / 2;

    match_proto = new CHAR_PROTO (xsize,
                                  y_size,
                                  nsamples * test_proto->n_samples (),
                                  0, '\0');

    // Rows above this prototype: only test_proto contributes.
    for (y = 0; y < y_offset; y++) {
      for (x = 0; x < xsize2; x++) {
        match_proto->data ()[x + x_offset][y] =
          test_proto->data ()[x][y] * nsamples;
        sum += match_proto->data ()[x + x_offset][y];
      }
    }

    // Rows below this prototype: only test_proto contributes.
    for (y = y_offset + y_size2; y < y_size; y++) {
      for (x = 0; x < xsize2; x++) {
        match_proto->data ()[x + x_offset][y] =
          test_proto->data ()[x][y] * nsamples;
        sum += match_proto->data ()[x + x_offset][y];
      }
    }

    // Rows shared by both prototypes.
    for (y = y_offset; y < y_offset + y_size2; y++) {
      // Columns left of test_proto: only this prototype contributes.
      for (x = 0; x < x_offset; x++) {
        match_proto->data ()[x][y] = proto[x][y - y_offset] *
          test_proto->n_samples ();
        sum += match_proto->data ()[x][y];
      }

      // Columns right of test_proto: only this prototype contributes.
      for (x = x_offset + xsize2; x < xsize; x++) {
        match_proto->data ()[x][y] = proto[x][y - y_offset] *
          test_proto->n_samples ();
        sum += match_proto->data ()[x][y];
      }

      // Overlap: product of the two prototypes.
      for (x = x_offset; x < x_offset + xsize2; x++) {
        match_proto->data ()[x][y] =
          proto[x][y - y_offset] * test_proto->data ()[x - x_offset][y];
        sum += match_proto->data ()[x][y];
      }
    }
  }
  else {
    // This prototype is at least as tall: rows are indexed in this
    // prototype's coordinates, and test_proto's rows are shifted.
    y_size = ysize;
    y_size2 = test_proto->y_size ();
    y_offset = (y_size - y_size2) / 2;

    match_proto = new CHAR_PROTO (xsize,
                                  y_size,
                                  nsamples * test_proto->n_samples (),
                                  0, '\0');

    // Rows above test_proto: only this prototype contributes.
    for (y = 0; y < y_offset; y++)
      for (x = 0; x < xsize; x++) {
        match_proto->data ()[x][y] =
          proto[x][y] * test_proto->n_samples ();
        sum += match_proto->data ()[x][y];
      }

    // Rows below test_proto: only this prototype contributes.
    for (y = y_offset + y_size2; y < y_size; y++)
      for (x = 0; x < xsize; x++) {
        match_proto->data ()[x][y] =
          proto[x][y] * test_proto->n_samples ();
        sum += match_proto->data ()[x][y];
      }

    // Rows shared by both prototypes.
    for (y = y_offset; y < y_offset + y_size2; y++) {
      // Columns left of test_proto.
      for (x = 0; x < x_offset; x++) {
        match_proto->data ()[x][y] =
          proto[x][y] * test_proto->n_samples ();
        sum += match_proto->data ()[x][y];
      }

      // Columns right of test_proto.
      for (x = x_offset + xsize2; x < xsize; x++) {
        match_proto->data ()[x][y] =
          proto[x][y] * test_proto->n_samples ();
        sum += match_proto->data ()[x][y];
      }

      // Overlap: product of the two prototypes.
      for (x = x_offset; x < x_offset + xsize2; x++) {
        match_proto->data ()[x][y] = proto[x][y] *
          test_proto->data ()[x - x_offset][y - y_offset];
        sum += match_proto->data ()[x][y];
      }
    }
  }

  score = (1.0 - sum /
           (xsize * y_size * nsamples * test_proto->n_samples ()));

  if (tessedit_mm_debug) {
    if (score < 0) {
      tprintf ("Match score %f\n", score);
      tprintf ("x: %d, y: %d, ns: %d, nt: %d, dx %d, dy: %d\n",
               xsize, y_size, nsamples, test_proto->n_samples (),
               x_offset, y_offset);
      for (y = 0; y < y_size; y++) {
        tprintf ("\n%d", y);
        for (x = 0; x < xsize; x++)
          // The cell is converted to int so the argument matches "%d";
          // passing a floating-point cell to "%d" is undefined behaviour.
          // NOTE(review): data () is presumed to expose the float proto
          // array - confirm against the class declaration.
          tprintf ("\t%d", static_cast<int> (match_proto->data ()[x][y]));
      }
      tprintf ("\n");
      fflush(debug_fp);
    }
  }

#ifndef GRAPHICS_DISABLED
  if (tessedit_display_mm) {
    tprintf ("Match score %f\n", score);
    display_images (this->make_image (),
                    test_proto->make_image (), match_proto->make_image ());
  }
  else if (demo_word != 0) {
    // The sign of demo_word selects which prototype is shown as the
    // test sample.
    if (demo_word > 0)
      display_image (test_proto->make_image (), "Test sample",
                     300, 400, FALSE);
    else
      display_image (this->make_image (), "Test sample", 300, 400, FALSE);

    display_image (match_proto->make_image (), "Best match",
                   700, 400, TRUE);
  }
#endif

  delete match_proto;

  return score;
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Grow the prototype to new_xsize by new_ysize, keeping the existing
 * contents centred in the enlarged array.
 * Every cell outside the old area is set to nsamples. The old arrays are
 * released afterwards.
 * @param new_xsize new width; must be >= the current width
 * @param new_ysize new height; must be >= the current height
 */
void CHAR_PROTO::enlarge_prototype(inT32 new_xsize, inT32 new_ysize) {
  float *old_proto_data = proto_data;
  float **old_proto = proto;
  inT32 old_xsize = xsize;
  inT32 old_ysize = ysize;
  inT32 x_offset;
  inT32 y_offset;
  inT32 x;
  inT32 y;

  ASSERT_HOST (new_xsize >= xsize && new_ysize >= ysize);

  xsize = new_xsize;
  ysize = new_ysize;
  ALLOC_2D_ARRAY(xsize, ysize, proto_data, proto, float);
  x_offset = (xsize - old_xsize) / 2;
  y_offset = (ysize - old_ysize) / 2;

  // Rows above the old area.
  for (y = 0; y < y_offset; y++)
    for (x = 0; x < xsize; x++)
      proto[x][y] = nsamples;

  // Rows below the old area.
  for (y = y_offset + old_ysize; y < ysize; y++)
    for (x = 0; x < xsize; x++)
      proto[x][y] = nsamples;

  // Rows that intersect the old area.
  for (y = y_offset; y < y_offset + old_ysize; y++) {
    for (x = 0; x < x_offset; x++)
      proto[x][y] = nsamples;

    for (x = x_offset + old_xsize; x < xsize; x++)
      proto[x][y] = nsamples;

    for (x = x_offset; x < x_offset + old_xsize; x++)
      proto[x][y] = old_proto[x - x_offset][y - y_offset];
  }

  // Only free arrays that were actually allocated, matching the check
  // made in the destructor. A default-constructed prototype has none.
  if (old_proto_data != NULL) {
    FREE_2D_ARRAY(old_proto_data, old_proto);
  }
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Accumulate one sample's image into the prototype.
 * The sample image is centred in the prototype. Each prototype cell is
 * incremented where the sample pixel equals BINIM_WHITE or where the
 * cell lies outside the sample image, and decremented otherwise. The
 * sample count is then increased by one.
 * @param sample sample to add; the centring offsets computed from its
 *               image size must be non-negative (checked with ASSERT_HOST)
 */
void CHAR_PROTO::add_sample(CHAR_SAMPLE *sample) {
  inT32 col;
  inT32 row;
  IMAGELINE imline_s;
  inT32 sample_xsize = sample->image ()->get_xsize ();
  inT32 sample_ysize = sample->image ()->get_ysize ();
  inT32 x_offset = (xsize - sample_xsize) / 2;
  inT32 y_offset = (ysize - sample_ysize) / 2;

  ASSERT_HOST (x_offset >= 0 && y_offset >= 0);

  // Treat pixels outside the range as white: rows above the sample...
  for (row = 0; row < y_offset; row++) {
    for (col = 0; col < xsize; col++) {
      proto[col][row]++;
    }
  }

  // ...and rows below it.
  for (row = y_offset + sample_ysize; row < ysize; row++) {
    for (col = 0; col < xsize; col++) {
      proto[col][row]++;
    }
  }

  // Rows that intersect the sample image.
  for (row = y_offset; row < y_offset + sample_ysize; row++) {
    sample->image ()->fast_get_line (0,
                                     row - y_offset, sample_xsize, &imline_s);

    // Margin to the left of the sample.
    for (col = 0; col < x_offset; col++) {
      proto[col][row]++;
    }

    // The sample's own pixels.
    for (col = x_offset; col < x_offset + sample_xsize; col++) {
      if (imline_s.pixels[col - x_offset] == BINIM_WHITE) {
        proto[col][row]++;
      }
      else {
        proto[col][row]--;
      }
    }

    // Margin to the right of the sample.
    for (col = x_offset + sample_xsize; col < xsize; col++) {
      proto[col][row]++;
    }
  }

  nsamples++;
}
|
|
|
|
|
|
|
|
|
2008-04-22 08:32:14 +08:00
|
|
|
/**
 * Render the prototype as a newly allocated IMAGE.
 * The image is created with size xsize by ysize and a third create ()
 * argument of 8 (presumably bits per pixel - TODO confirm). Each pixel is
 * 128 plus the prototype cell scaled by 128 / nsamples and truncated
 * towards zero.
 *
 * nsamples must be non-zero (checked with ASSERT_HOST).
 * @return the new image, allocated with new
 */
IMAGE *CHAR_PROTO::make_image() {
  IMAGE *image;
  IMAGELINE imline_p;
  inT32 x;
  inT32 y;

  ASSERT_HOST (nsamples != 0);

  image = new (IMAGE);
  image->create (xsize, ysize, 8);

  for (y = 0; y < ysize; y++) {
    image->fast_get_line (0, y, xsize, &imline_p);

    for (x = 0; x < xsize; x++) {
      // Truncate to a signed integer before adding the 128 offset.
      // Cells can be negative, and converting a negative floating value
      // directly to an unsigned type is undefined behaviour.
      imline_p.pixels[x] = (uinT8) (128 +
        (inT32) ((proto[x][y] * 128.0) / (0.00001 + nsamples)));
    }

    image->fast_put_line (0, y, xsize, &imline_p);
  }
  return image;
}
|