mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-05 02:47:00 +08:00
fea38ee706
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@309 d0cd1f9f-072b-0410-8dd7-cf729c803f20
365 lines
11 KiB
C++
365 lines
11 KiB
C++
/* -*-C-*-
|
|
********************************************************************************
|
|
*
|
|
* File: metrics.c (Formerly metrics.c)
|
|
 * Description:  Counters, tallies and summary output for OCR run statistics
|
|
* Author: Mark Seaman, OCR Technology
|
|
* Created: Fri Oct 16 14:37:00 1987
|
|
* Modified: Tue Jul 30 17:02:07 1991 (Mark Seaman) marks@hpgrlt
|
|
* Language: C
|
|
* Package: N/A
|
|
* Status: Reusable Software Component
|
|
*
|
|
* (c) Copyright 1987, Hewlett-Packard Company.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
*********************************************************************************/
|
|
/*----------------------------------------------------------------------
|
|
I n c l u d e s
|
|
----------------------------------------------------------------------*/
|
|
#include "metrics.h"
|
|
#include "bestfirst.h"
|
|
#include "associate.h"
|
|
#include "tally.h"
|
|
#include "plotseg.h"
|
|
#include "globals.h"
|
|
#include "wordclass.h"
|
|
#include "intmatcher.h"
|
|
#include "freelist.h"
|
|
#include "callcpp.h"
|
|
#include "ndminx.h"
|
|
#include "wordrec.h"
|
|
|
|
/*----------------------------------------------------------------------
|
|
V a r i a b l e s
|
|
----------------------------------------------------------------------*/
|
|
static int states_timed_out1; /* Counters */
|
|
static int states_timed_out2;
|
|
static int words_segmented1;
|
|
static int words_segmented2;
|
|
static int segmentation_states1;
|
|
static int segmentation_states2;
|
|
static int save_priorities;
|
|
|
|
int words_chopped1;
|
|
int words_chopped2;
|
|
int chops_attempted1;
|
|
int chops_performed1;
|
|
int chops_attempted2;
|
|
int chops_performed2;
|
|
|
|
int character_count;
|
|
int word_count;
|
|
int chars_classified;
|
|
|
|
MEASUREMENT num_pieces;
|
|
MEASUREMENT width_measure;
|
|
|
|
MEASUREMENT width_priority_range;/* Help to normalize */
|
|
MEASUREMENT match_priority_range;
|
|
|
|
TALLY states_before_best;
|
|
TALLY best_certainties[2];
|
|
TALLY character_widths; /* Width histogram */
|
|
|
|
FILE *priority_file_1; /* Output to cluster */
|
|
FILE *priority_file_2;
|
|
FILE *priority_file_3;
|
|
|
|
STATE *known_best_state = NULL; /* The right answer */
|
|
|
|
/*----------------------------------------------------------------------
|
|
M a c r o s
|
|
----------------------------------------------------------------------*/
|
|
/* Width of one certainty histogram bucket.  The value is negative, so
   dividing a negative certainty by it yields a positive bucket number.
   The expansion is parenthesized so that the sign always stays attached
   to the literal wherever the macro is used. */
#define CERTAINTY_BUCKET_SIZE (-0.5)
/* Number of buckets requested for each best_certainties tally. */
#define CERTAINTY_BUCKETS 40
|
|
|
|
/*----------------------------------------------------------------------
|
|
F u n c t i o n s
|
|
----------------------------------------------------------------------*/
|
|
/**********************************************************************
|
|
* init_metrics
|
|
*
|
|
* Set up the appropriate variables to record information about the
|
|
* OCR process. Later calls will log the data and save a summary.
|
|
**********************************************************************/
|
|
void init_metrics() {
|
|
words_chopped1 = 0;
|
|
words_chopped2 = 0;
|
|
chops_performed1 = 0;
|
|
chops_performed2 = 0;
|
|
chops_attempted1 = 0;
|
|
chops_attempted2 = 0;
|
|
|
|
words_segmented1 = 0;
|
|
words_segmented2 = 0;
|
|
states_timed_out1 = 0;
|
|
states_timed_out2 = 0;
|
|
segmentation_states1 = 0;
|
|
segmentation_states2 = 0;
|
|
|
|
save_priorities = 0;
|
|
|
|
character_count = 0;
|
|
word_count = 0;
|
|
chars_classified = 0;
|
|
permutation_count = 0;
|
|
|
|
end_metrics();
|
|
|
|
states_before_best = new_tally (MIN (100, wordrec_num_seg_states));
|
|
|
|
best_certainties[0] = new_tally (CERTAINTY_BUCKETS);
|
|
best_certainties[1] = new_tally (CERTAINTY_BUCKETS);
|
|
reset_width_tally();
|
|
}
|
|
|
|
void end_metrics() {
|
|
if (states_before_best != NULL) {
|
|
memfree(states_before_best);
|
|
memfree(best_certainties[0]);
|
|
memfree(best_certainties[1]);
|
|
memfree(character_widths);
|
|
states_before_best = NULL;
|
|
best_certainties[0] = NULL;
|
|
best_certainties[1] = NULL;
|
|
character_widths = NULL;
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* record_certainty
|
|
*
|
|
* Maintain a record of the best certainty values achieved on each
|
|
* word recognition.
|
|
**********************************************************************/
|
|
void record_certainty(float certainty, int pass) {
|
|
int bucket;
|
|
|
|
if (certainty / CERTAINTY_BUCKET_SIZE < MAX_INT32)
|
|
bucket = (int) (certainty / CERTAINTY_BUCKET_SIZE);
|
|
else
|
|
bucket = MAX_INT32;
|
|
|
|
inc_tally_bucket (best_certainties[pass - 1], bucket);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* record_search_status
|
|
*
|
|
* Record information about each iteration of the search. This data
|
|
* is kept in global memory and accumulated over multiple segmenter
|
|
* searches.
|
|
**********************************************************************/
|
|
void record_search_status(int num_states, int before_best, float closeness) {
|
|
inc_tally_bucket(states_before_best, before_best);
|
|
|
|
if (first_pass) {
|
|
if (num_states == wordrec_num_seg_states + 1)
|
|
states_timed_out1++;
|
|
segmentation_states1 += num_states;
|
|
words_segmented1++;
|
|
}
|
|
else {
|
|
if (num_states == wordrec_num_seg_states + 1)
|
|
states_timed_out2++;
|
|
segmentation_states2 += num_states;
|
|
words_segmented2++;
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* save_summary
|
|
*
|
|
* Save the summary information into the file "file.sta".
|
|
**********************************************************************/
|
|
namespace tesseract {
|
|
void Wordrec::save_summary(inT32 elapsed_time) {
|
|
#ifndef SECURE_NAMES
|
|
STRING outfilename;
|
|
FILE *f;
|
|
int x;
|
|
int total;
|
|
|
|
outfilename = imagefile + ".sta";
|
|
f = open_file (outfilename.string(), "w");
|
|
|
|
fprintf (f, INT32FORMAT " seconds elapsed\n", elapsed_time);
|
|
fprintf (f, "\n");
|
|
|
|
fprintf (f, "%d characters\n", character_count);
|
|
fprintf (f, "%d words\n", word_count);
|
|
fprintf (f, "\n");
|
|
|
|
fprintf (f, "%d permutations performed\n", permutation_count);
|
|
fprintf (f, "%d characters classified\n", chars_classified);
|
|
fprintf (f, "%4.0f%% classification overhead\n",
|
|
(float) chars_classified / character_count * 100.0 - 100.0);
|
|
fprintf (f, "\n");
|
|
|
|
fprintf (f, "%d words chopped (pass 1) ", words_chopped1);
|
|
fprintf (f, " (%0.0f%%)\n", (float) words_chopped1 / word_count * 100);
|
|
fprintf (f, "%d chops performed\n", chops_performed1);
|
|
fprintf (f, "%d chops attempted\n", chops_attempted1);
|
|
fprintf (f, "\n");
|
|
|
|
fprintf (f, "%d words joined (pass 1)", words_segmented1);
|
|
fprintf (f, " (%0.0f%%)\n", (float) words_segmented1 / word_count * 100);
|
|
fprintf (f, "%d segmentation states\n", segmentation_states1);
|
|
fprintf (f, "%d segmentations timed out\n", states_timed_out1);
|
|
fprintf (f, "\n");
|
|
|
|
fprintf (f, "%d words chopped (pass 2) ", words_chopped2);
|
|
fprintf (f, " (%0.0f%%)\n", (float) words_chopped2 / word_count * 100);
|
|
fprintf (f, "%d chops performed\n", chops_performed2);
|
|
fprintf (f, "%d chops attempted\n", chops_attempted2);
|
|
fprintf (f, "\n");
|
|
|
|
fprintf (f, "%d words joined (pass 2)", words_segmented2);
|
|
fprintf (f, " (%0.0f%%)\n", (float) words_segmented2 / word_count * 100);
|
|
fprintf (f, "%d segmentation states\n", segmentation_states2);
|
|
fprintf (f, "%d segmentations timed out\n", states_timed_out2);
|
|
fprintf (f, "\n");
|
|
|
|
total = 0;
|
|
iterate_tally (states_before_best, x)
|
|
total += (tally_entry (states_before_best, x) * x);
|
|
fprintf (f, "segmentations (before best) = %d\n", total);
|
|
if (total != 0.0)
|
|
fprintf (f, "%4.0f%% segmentation overhead\n",
|
|
(float) (segmentation_states1 + segmentation_states2) /
|
|
total * 100.0 - 100.0);
|
|
fprintf (f, "\n");
|
|
|
|
print_tally (f, "segmentations (before best)", states_before_best);
|
|
|
|
iterate_tally (best_certainties[0], x)
|
|
cprintf ("best certainty of %8.4f = %4d %4d\n",
|
|
x * CERTAINTY_BUCKET_SIZE,
|
|
tally_entry (best_certainties[0], x),
|
|
tally_entry (best_certainties[1], x));
|
|
|
|
PrintIntMatcherStats(f);
|
|
dj_statistics(f);
|
|
fclose(f);
|
|
#endif
|
|
}
|
|
} // namespace tesseract
|
|
|
|
|
|
/**********************************************************************
|
|
* record_priorities
|
|
*
|
|
* If the record mode is set then record the priorities returned by
|
|
* each of the priority voters. Save them in a file that is set up for
|
|
* doing clustering.
|
|
**********************************************************************/
|
|
void record_priorities(SEARCH_RECORD *the_search,
|
|
FLOAT32 priority_1,
|
|
FLOAT32 priority_2) {
|
|
record_samples(priority_1, priority_2);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* record_samples
|
|
*
|
|
* Remember the priority samples to summarize them later.
|
|
**********************************************************************/
|
|
void record_samples(FLOAT32 match_pri, FLOAT32 width_pri) {
|
|
ADD_SAMPLE(match_priority_range, match_pri);
|
|
ADD_SAMPLE(width_priority_range, width_pri);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* reset_width_tally
|
|
*
|
|
* Create a tally record and initialize it.
|
|
**********************************************************************/
|
|
void reset_width_tally() {
|
|
character_widths = new_tally (20);
|
|
new_measurement(width_measure);
|
|
width_measure.num_samples = 158;
|
|
width_measure.sum_of_samples = 125.0;
|
|
width_measure.sum_of_squares = 118.0;
|
|
}
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
/**********************************************************************
|
|
* save_best_state
|
|
*
|
|
* Save this state away to be compared later.
|
|
**********************************************************************/
|
|
void save_best_state(CHUNKS_RECORD *chunks_record) {
|
|
STATE state;
|
|
SEARCH_STATE chunk_groups;
|
|
int num_joints;
|
|
|
|
if (save_priorities) {
|
|
num_joints = chunks_record->ratings->dimension() - 1;
|
|
|
|
state.part1 = 0xffffffff;
|
|
state.part2 = 0xffffffff;
|
|
|
|
chunk_groups = bin_to_chunks (&state, num_joints);
|
|
display_segmentation (chunks_record->chunks, chunk_groups);
|
|
memfree(chunk_groups);
|
|
|
|
cprintf ("Enter the correct segmentation > ");
|
|
fflush(stdout);
|
|
state.part1 = 0;
|
|
scanf ("%x", &state.part2);
|
|
|
|
chunk_groups = bin_to_chunks (&state, num_joints);
|
|
display_segmentation (chunks_record->chunks, chunk_groups);
|
|
memfree(chunk_groups);
|
|
window_wait(segm_window); /* == 'n') */
|
|
|
|
if (known_best_state)
|
|
free_state(known_best_state);
|
|
known_best_state = new_state (&state);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
|
|
/**********************************************************************
|
|
* start_record
|
|
*
|
|
* Set up everything needed to record the priority voters.
|
|
**********************************************************************/
|
|
void start_recording() {
|
|
if (save_priorities) {
|
|
priority_file_1 = open_file ("Priorities1", "w");
|
|
priority_file_2 = open_file ("Priorities2", "w");
|
|
priority_file_3 = open_file ("Priorities3", "w");
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* stop_recording
|
|
*
|
|
* Put an end to the priority recording mechanism.
|
|
**********************************************************************/
|
|
void stop_recording() {
|
|
if (save_priorities) {
|
|
fclose(priority_file_1);
|
|
fclose(priority_file_2);
|
|
fclose(priority_file_3);
|
|
}
|
|
}
|